Ask Your Question

Revision history [back]

click to hide/show revision 1
initial version

cuda::dft speed issues (too slow)

I am tying to do some image Fourier transforms (FFT) in OpenCV 3.0 RC1. In order to speed up the process, I decided to use the cuda module in OpenCV. However, the results is disappointing.

To test the speed, I did DFT to a 512x512 random complex matrix using CPU and GPU respectively. On my computer, the CUP takes 2.1 milliseconds (ms) to do it, while GPU takes 2.4 ms. I understand that copying data from memory to video memory is time consuming, so the data transferring time was excluded from the test results.

Since MATLAB also supports cuda acceleration, I ran a similar test in MATLAB 2014b. The gpu version of FFT in MATLAB was surprisingly faster. The CUP takes 5 ms, GPU only takes 0.007 ms.

So the question is, if both OpenCV and MATLAB are using the same cuda dft function (I assume), why is OpenCV so much slower?

OpenCV code I used is here:

#include <opencv2/core/core.hpp>
#include <opencv2/core/utility.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>

// CUDA structures and methods
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>

#include <iostream>

using namespace cv;
using namespace std;


int main(int argc, char ** argv)
{
    // create a random complex image that is to be FFTed
    Mat complexImg = Mat(512, 512, CV_32FC2);
    randu(complexImg, Scalar::all(0), Scalar::all(255)); 

    Mat imgFFT;

    // DFT speed test on CPU
    double t = getTickCount();
    int NN = 100; //iteration number
    for (int i = 0; i < NN; i++)
    {
        dft(complexImg, imgFFT, DFT_COMPLEX_OUTPUT);
    }
    t = 1000 * ((double)getTickCount() - t) / getTickFrequency() / NN;
    cout << "CPU TIME: " << t << " ms" << endl;

    // DFT speed test on GPU
    cuda::GpuMat imageG, imgFFTG;
    imageG.upload(complexImg);
    cuda::dft(complexImg, imgFFTG, imageG.size());  
    t = getTickCount();
    for (int i = 0; i < NN; i++)
    {
        cuda::dft(complexImg, imgFFTG, imageG.size());
    }
    t = 1000 * ((double)getTickCount() - t) / getTickFrequency() / NN;
    cout << "GPU TIME: " << t << " ms" << endl;

    return 0;
}

MATLAB code I used is here:

M = double(rand(512,512,2));
N = zeros(size(M));

NN = 100; % iteration number
% CPU speed test
tic;
for i = 1:NN
    N = fft2(M);
end
elapsedTime = toc/NN;
disp(elapsedTime);

A = gpuArray(M);
B = fft2(A);

% GPU speed test
tic;
for i = 1:NN
    B = fft2(A);
end
elapsedTime = toc/NN;
disp(elapsedTime);

cuda::dft speed issues (too slow)

I am tying to do some image Fourier transforms (FFT) in OpenCV 3.0 RC1. In order to speed up the process, I decided to use the cuda module in OpenCV. However, the results is disappointing.

To test the speed, I did DFT to a 512x512 random complex matrix using CPU and GPU respectively. On my computer, the CUP takes 2.1 milliseconds (ms) to do it, while GPU takes 2.4 1.5 ms. I understand that copying data from memory to video memory is time consuming, so the data transferring time was excluded from the test results.

Since MATLAB also supports cuda acceleration, I ran a similar test in MATLAB 2014b. The gpu version of FFT in MATLAB was surprisingly faster. The CUP takes 5 ms, GPU only takes 0.007 ms.

So the question is, if both OpenCV and MATLAB are using the same cuda dft function (I assume), why is OpenCV so much slower?

OpenCV code I used is here:

#include <opencv2/core/core.hpp>
#include <opencv2/core/utility.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui/highgui.hpp>

// CUDA structures and methods
#include <opencv2/core/cuda.hpp>
#include <opencv2/cudaarithm.hpp>

#include <iostream>

using namespace cv;
using namespace std;


int main(int argc, char ** argv)
{
    // create a random complex image that is to be FFTed
    Mat complexImg = Mat(512, 512, CV_32FC2);
    randu(complexImg, Scalar::all(0), Scalar::all(255)); 

    Mat imgFFT;

    // DFT speed test on CPU
    double t = getTickCount();
    int NN = 100; //iteration number
    for (int i = 0; i < NN; i++)
    {
        dft(complexImg, imgFFT, DFT_COMPLEX_OUTPUT);
    }
    t = 1000 * ((double)getTickCount() - t) / getTickFrequency() / NN;
    cout << "CPU TIME: " << t << " ms" << endl;

    // DFT speed test on GPU
    cuda::GpuMat imageG, imgFFTG;
    imageG.upload(complexImg);
    cuda::dft(complexImg, cuda::dft(imageG, imgFFTG, imageG.size());  
    t = getTickCount();
    for (int i = 0; i < NN; i++)
    {
        cuda::dft(complexImg, cuda::dft(imageG, imgFFTG, imageG.size());
    }
    t = 1000 * ((double)getTickCount() - t) / getTickFrequency() / NN;
    cout << "GPU TIME: " << t << " ms" << endl;

    return 0;
}

MATLAB code I used is here:

M = double(rand(512,512,2));
N = zeros(size(M));

NN = 100; % iteration number
% CPU speed test
tic;
for i = 1:NN
    N = fft2(M);
end
elapsedTime = toc/NN;
disp(elapsedTime);

A = gpuArray(M);
B = fft2(A);

% GPU speed test
tic;
for i = 1:NN
    B = fft2(A);
end
elapsedTime = toc/NN;
disp(elapsedTime);