Ask Your Question

Revision history [back]

click to hide/show revision 1
initial version

OpenCV CUDA SURF Performance vs CPU version

I am trying to process a number of images to find out whether they are side-by-side stereo pairs. The first step uses SURF to find matches between the left and right halves of an image. The difference in speed between the CPU and GPU implementations seems to be rather small. What might be the reason? Here is the code for testing and a test image:

image description

import cv2 as cv
import time

# Report the OpenCV build in use so benchmark numbers can be tied to a version.
print('OpenCV Version', cv.__version__)

# Width, in pixels, that every input image is scaled to before detection.
resize_width = 1024

# CPU pipeline: SURF detector (threshold 300, two octave layers) paired with
# a brute-force matcher using the L2 norm.
surfCPU = cv.xfeatures2d.SURF_create(300, nOctaveLayers=2)
matcherCPU = cv.BFMatcher(cv.NORM_L2)

# GPU pipeline: the CUDA counterparts, configured with the same values.
# The CUDA factory is called with an underscore-prefixed keyword name.
surfGPU = cv.cuda.SURF_CUDA_create(300, _nOctaveLayers=2)
matcherGPU = cv.cuda.DescriptorMatcher_createBFMatcher(cv.NORM_L2)

def getMatches(img, useCPU):
    """Match SURF features between the left and right halves of an image.

    The image is scaled to ``resize_width`` pixels wide (height scaled by the
    same factor), converted to grayscale and split down the middle. SURF
    keypoints and descriptors are computed for each half and matched with a
    brute-force 2-nearest-neighbour search.

    Args:
        img: Image array as returned by ``cv.imread``; it is converted with
            ``COLOR_BGR2GRAY``, so a BGR colour image is expected.
        useCPU: If true, run the whole pipeline with the CPU detector and
            matcher; otherwise upload the image and run it with the CUDA
            detector and matcher.

    Returns:
        A tuple ``(kp1, kp2, knn_matches)``: keypoints of the left half,
        keypoints of the right half, and the result of ``knnMatch`` with
        ``k=2`` from left to right descriptors.
    """
    height, width = img.shape[0], img.shape[1]

    # Scale to the common working width, preserving the aspect ratio.
    w_percent = resize_width / width
    height = int(height * w_percent)
    width = resize_width

    if useCPU:
        img = cv.cvtColor(img, cv.COLOR_BGR2GRAY)
        img = cv.resize(img, (width, height),
                        interpolation=cv.INTER_LINEAR)

        # Left and right halves of the resized grayscale image.
        gray_left = img[:, :width // 2]
        gray_right = img[:, width // 2:]

        kp1, des1 = surfCPU.detectAndCompute(gray_left, None)
        kp2, des2 = surfCPU.detectAndCompute(gray_right, None)
        knn_matches = matcherCPU.knnMatch(des1, des2, k=2)
    else:
        # Upload the full-size image, then resize and convert on the device.
        gpu_img = cv.cuda_GpuMat(img)
        gpu_img = cv.cuda.resize(gpu_img, (width, height),
                                 interpolation=cv.INTER_LINEAR)

        gpu_gray = cv.cuda.cvtColor(gpu_img, cv.COLOR_BGR2GRAY)

        # Regions of interest (x, y, w, h) selecting each half of the image.
        gpu_gray_left = cv.cuda_GpuMat(gpu_gray, (0, 0, width // 2, height))
        gpu_gray_right = cv.cuda_GpuMat(
            gpu_gray, (width // 2, 0, width // 2, height))

        kp1GPU, des1 = surfGPU.detectWithDescriptors(gpu_gray_left, None)
        kp2GPU, des2 = surfGPU.detectWithDescriptors(gpu_gray_right, None)
        knn_matches = matcherGPU.knnMatch(des1, des2, k=2)
        # Bring the keypoints back to the host so both branches produce the
        # same kind of result.
        kp1 = cv.cuda_SURF_CUDA.downloadKeypoints(surfGPU, kp1GPU)
        kp2 = cv.cuda_SURF_CUDA.downloadKeypoints(surfGPU, kp2GPU)

    # The original fell off the end here and implicitly returned None,
    # discarding everything it had computed.
    return kp1, kp2, knn_matches

# Benchmark driver: time the GPU and CPU pipelines on the same test image.
img = cv.imread('photo.jpg')
if img is None:
    # cv.imread reports failure by returning None instead of raising; stop
    # here with a clear message rather than an AttributeError on img.shape.
    raise OSError("could not read image 'photo.jpg'")

# One untimed call on the GPU path before measuring (presumably a warm-up so
# one-time initialisation cost is not counted in the timed loop).
res = getMatches(img, False)
samples = 100

for testName in ['GPU', 'CPU']:
    # perf_counter is monotonic and high-resolution, unlike time.time(),
    # which can jump if the system clock is adjusted mid-run.
    start_time = time.perf_counter()
    for _ in range(samples):
        getMatches(img, testName == 'CPU')
    elapsed_ms = (time.perf_counter() - start_time) / samples * 1000
    print(testName,
          "time",
          str(int(elapsed_ms)) + 'ms')

Output on my machine (Core i5 6500, GTX 750 Ti)

OpenCV Version 4.3.0
GPU time 32ms
CPU time 52ms

Here is an attempt at measuring the time spent in each step. Resizing the image on the CPU before uploading makes some difference, though not a dramatic one:

- GPU (35ms)

upload            23%      8ms  |||||||||||
grayscale          6%      2ms  |||
resize (gpu)       1%      0ms  
crop               0%      0ms  
detect            65%     23ms  ||||||||||||||||||||||||||||||||
matcher            1%      0ms  
download           0%      0ms  



- GPU (26ms)

resize (cpu)       4%      1ms  ||
upload             1%      0ms  
grayscale          2%      0ms  |
crop               0%      0ms  
detect            88%     23ms  ||||||||||||||||||||||||||||||||||||||||||||
matcher            1%      0ms  
download           0%      0ms



- CPU (50ms)

grayscale          6%      3ms  |||
resize             2%      1ms  |
crop               0%      0ms  
detect            89%     45ms  ||||||||||||||||||||||||||||||||||||||||||||
matcher            2%      1ms  |