Revision history [back]

Android OpenCL DFT vs. CPP Version is very slow!

Hey, I've started to learn a bit about Android GPU programming and wanted to implement the DFT with the new T-API in OpenCV 3.0. My device is a Sony XPERIA Z1, which runs OpenCL 1.1 (on Lollipop - I hope that doesn't cause problems? The Khronos website says that the Adreno 330 supports KitKat.)

When comparing the two versions, the GPU version takes ~3200 ms and the CPU version ~2800 ms. What could be the issue? Any ideas?

Another problem: when I run the GPU version a second time, I get the following error:

E/cv::error(): OpenCV Error: Assertion failed (u->refcount == 0 || u->tempUMat()) in virtual void cv::ocl::OpenCLAllocator::upload(cv::UMatData*, const void*, int, const size_t*, const size_t*, const size_t*, const size_t*) const, file C:/opencv3cl/modules/core/src/ocl.cpp, line 5025
A/libc: Fatal signal 6 (SIGABRT), code -6 in tid 28911 (.openclexample1)

Any ideas?

CPU-Version

void refNR (unsigned char* bufIn, unsigned char* bufOut, int* info)
{
    clock_t startTimer, stopTimer;

    String path = "/storage/emulated/0/DCIM/";
    String filename = "TEST.JPG";
    String filename_result = "TEST_CPU.png";

    startTimer=clock();

    Mat I = cv::imread(path+filename);
    cvtColor(I, I, COLOR_BGR2GRAY);
    LOGI("Imread took %g ms\n\n", 1000.0* (double)(clock() - startTimer)/(double)CLOCKS_PER_SEC) ;

    Mat padded;                            //expand input image to optimal size
    int m = getOptimalDFTSize( I.rows );
    int n = getOptimalDFTSize( I.cols ); // on the border add zero values
    copyMakeBorder(I, padded, 0, m - I.rows, 0, n - I.cols, BORDER_CONSTANT, Scalar::all(0));

    Mat complexI;
    Mat planes[] = {Mat_<float>(padded), Mat::zeros(padded.size(), CV_32F)};
    merge(planes, 2, complexI);

    dft(complexI, complexI);            // this way the result may fit in the source matrix

    split(complexI, planes);                   // planes[0] = Re(DFT(I), planes[1] = Im(DFT(I))
    magnitude(planes[0], planes[1], planes[0]);// planes[0] = magnitude
    Mat magI = planes[0];
    log(magI, magI);

    magI = magI(Rect(0, 0, magI.cols & -2, magI.rows & -2));

    // rearrange the quadrants of Fourier image  so that the origin is at the image center
    int cx = magI.cols/2;
    int cy = magI.rows/2;

    Mat q0(magI, Rect(0, 0, cx, cy));   // Top-Left - Create a ROI per quadrant
    Mat q1(magI, Rect(cx, 0, cx, cy));  // Top-Right
    Mat q2(magI, Rect(0, cy, cx, cy));  // Bottom-Left
    Mat q3(magI, Rect(cx, cy, cx, cy)); // Bottom-Right

    Mat tmp;                           // swap quadrants (Top-Left with Bottom-Right)
    q0.copyTo(tmp);
    q3.copyTo(q0);
    tmp.copyTo(q3);

    q1.copyTo(tmp);                    // swap quadrant (Top-Right with Bottom-Left)
    q2.copyTo(q1);
    tmp.copyTo(q2);

    normalize(magI, magI, 0, 255, CV_MINMAX); // Transform the matrix with float values into a
    // viewable image form (float between values 0 and 1).
    imwrite(path+filename_result, magI);

    stopTimer=clock();

    double elapse = 1000.0* (double)(stopTimer - startTimer)/(double)CLOCKS_PER_SEC;
    LOGI("OpenCL code on the GPU took %g ms\n\n", 1000.0* (double)(stopTimer - startTimer)/(double)CLOCKS_PER_SEC) ;

    info[2] = (int)elapse;

    return;
}

GPU-Version

void openCLNR (unsigned char* bufIn, unsigned char* bufOut, int* info)
{


//get all platforms (drivers)
    std::vector<cl::Platform> all_platforms;
    cl::Platform::get(&all_platforms);
    if(all_platforms.size()==0){
        LOGE(" No platforms found. Check OpenCL installation!");
        return;
    }
    cl::Platform default_platform=all_platforms[0];
    cl_context_properties props[] =
            {   CL_GL_CONTEXT_KHR, 0,
                CL_EGL_DISPLAY_KHR, 0,
                CL_CONTEXT_PLATFORM, 0,
                0 };

    props[5] = (cl_context_properties) default_platform();

    //get default device of the default platform
    std::vector<cl::Device> all_devices;
    default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
    if(all_devices.size()==0){
        LOGE(" No devices found. Check OpenCL installation!");
        return;
    }
    cl::Device default_device=all_devices[0];

    cl::Context context = cl::Context(CL_DEVICE_TYPE_GPU, props);

    cv::ocl::attachContext(default_platform.getInfo<CL_PLATFORM_NAME>(), default_platform(), context(), default_device());

    cv::ocl::setUseOpenCL(true);

    clock_t startTimer, stopTimer;

    String path = "/storage/emulated/0/DCIM/";
    String filename = "TEST.JPG";
    String filename_result = "TEST_GPU.png";

    cv::ocl::setUseOpenCL(true);
    startTimer=clock();

    Mat mat = cv::imread(path+filename);
    cvtColor(mat, mat, COLOR_BGR2GRAY);
    LOGI("Imread took %g ms\n\n", 1000.0* (double)(clock() - startTimer)/(double)CLOCKS_PER_SEC) ;
    UMat I = mat.getUMat(cv::ACCESS_READ);

    UMat padded;                            //expand input image to optimal size
    int m = getOptimalDFTSize( I.rows );
    int n = getOptimalDFTSize( I.cols ); // on the border add zero values
    copyMakeBorder(I, padded, 0, m - I.rows, 0, n - I.cols, BORDER_CONSTANT, Scalar::all(0));

    UMat complexI;
    std::vector<UMat> planes(2);
    padded.convertTo(padded,CV_32FC1);
    planes[0] = padded;
    planes[1] = UMat::zeros(padded.size(), CV_32FC1);

    merge(planes, complexI);
    dft(complexI, complexI);            // this way the result may fit in the source matrix
    split(complexI, planes);                   // planes[0] = Re(DFT(I), planes[1] = Im(DFT(I))

    magnitude(planes[0], planes[1], planes[0]);// planes[0] = magnitude
    UMat magI = planes[0];
    log(magI, magI);
    magI = magI(Rect(0, 0, magI.cols & -2, magI.rows & -2));

    // rearrange the quadrants of Fourier image  so that the origin is at the image center
    int cx = magI.cols/2;
    int cy = magI.rows/2;

    UMat q0(magI, Rect(0, 0, cx, cy));   // Top-Left - Create a ROI per quadrant
    UMat q1(magI, Rect(cx, 0, cx, cy));  // Top-Right
    UMat q2(magI, Rect(0, cy, cx, cy));  // Bottom-Left
    UMat q3(magI, Rect(cx, cy, cx, cy)); // Bottom-Right

    UMat tmp;                           // swap quadrants (Top-Left with Bottom-Right)
    q0.copyTo(tmp);
    q3.copyTo(q0);
    tmp.copyTo(q3);

    q1.copyTo(tmp);                    // swap quadrant (Top-Right with Bottom-Left)
    q2.copyTo(q1);
    tmp.copyTo(q2);

    normalize(magI, magI, 0, 255, CV_MINMAX); // Transform the matrix with float values into a
    // viewable image form (float between values 0 and 1).
    imwrite(path+filename_result, magI);

    cv::ocl::finish();
    stopTimer=clock();

    double elapse = 1000.0* (double)(stopTimer - startTimer)/(double)CLOCKS_PER_SEC;
    LOGI("OpenCL code on the GPU took %g ms\n\n", 1000.0* (double)(stopTimer - startTimer)/(double)CLOCKS_PER_SEC) ;

    info[2] = (int)elapse;

    return;
}

Android OpenCL DFT vs. CPP Version is very slow!

When comparing the two versions, the GPU version takes ~3200 ms and the CPU version ~2800 ms. What could be the issue? Any ideas?

~~Another problem, when I run~~ UPDATE

I've changed the ~~GPU-Version a second time I get the following error:~~ code to something easier:

E/cv::error(): OpenCV Error: Assertion failed (u->refcount == 0 || u->tempUMat()) in virtual void cv::ocl::OpenCLAllocator::upload(cv::UMatData*, const void*, int, const size_t*, const size_t*, const size_t*, const size_t*) const, file C:/opencv3cl/modules/core/src/ocl.cpp, line 5025
A/libc: Fatal signal 6 (SIGABRT), code -6 in tid 28911 (.openclexample1)

Any ideas?

CPU-Version

// NOTE(review): this copy of refNR appears to interleave statements from two revisions
// of the post - the original DFT benchmark and a shorter GaussianBlur/Canny benchmark on
// UMat objects. As written it does not parse (see the declaration of `path` below), and
// which statements belong to which revision cannot be determined from this text alone.
// The code is left untouched; the comments only mark the suspicious places.
//
// Parameters: bufIn and bufOut are not referenced in the body; info[2] is assigned the
// elapsed time in milliseconds (truncated to int) at the end.
void refNR (unsigned char* bufIn, unsigned char* bufOut, int* info)
{
    clock_t startTimer, stopTimer;

    // NOTE(review): the next two lines look like `String path = "/storage/...";` with a
    // UMat declaration and the start of `Mat input = imread(...)` spliced into it.
    String path UMat uIn, uOut, uTmp, uEdges, uBlur;
Mat input = "/storage/emulated/0/DCIM/";
    String filename = "TEST.JPG";
    String filename_result = "TEST_CPU.png";

    // NOTE(review): presumably the right-hand side of `Mat input = ...` above; as it
    // stands the result of imread() is discarded.
    imread( path+filename, IMREAD_GRAYSCALE );//.getUMat( ACCESS_FAST );
input.copyTo(uIn);
startTimer=clock();

    Mat I = cv::imread(path+filename);
    cvtColor(I, I, COLOR_BGR2GRAY);
    LOGI("Imread took %g ms\n\n", 1000.0* (double)(clock() - startTimer)/(double)CLOCKS_PER_SEC) ;
// NOTE(review): Size(1, 1) is a 1x1 kernel, so this blur presumably leaves the image
// unchanged - confirm the intended kernel size.
GaussianBlur(uIn, uBlur, Size(1, 1), 1.5, 1.5);
Canny(uBlur, uEdges, 0, 30, 3);
// First of two assignments to stopTimer; the one near the end of the function overwrites it.
stopTimer=clock();
// Written to the same file name that the imwrite() of magI further down uses.
imwrite(path+filename_result, uEdges);
     Mat padded;                            // expand input image to optimal DFT size
    int m = getOptimalDFTSize( I.rows );
    int n = getOptimalDFTSize( I.cols ); // the added border is filled with zeros
    copyMakeBorder(I, padded, 0, m - I.rows, 0, n - I.cols, BORDER_CONSTANT, Scalar::all(0));

    // Two-channel image: real part = padded input as float, imaginary part = 0.
    Mat complexI;
    Mat planes[] = {Mat_<float>(padded), Mat::zeros(padded.size(), CV_32F)};
    merge(planes, 2, complexI);

    dft(complexI, complexI);            // in place: the result fits in the source matrix

    split(complexI, planes);                   // planes[0] = Re(DFT(I)), planes[1] = Im(DFT(I))
    magnitude(planes[0], planes[1], planes[0]);// planes[0] = magnitude
    Mat magI = planes[0];
    log(magI, magI);

    // Crop to an even number of rows and columns (clears the lowest bit of each size).
    magI = magI(Rect(0, 0, magI.cols & -2, magI.rows & -2));

    // rearrange the quadrants of the Fourier image so that the origin is at the image center
    int cx = magI.cols/2;
    int cy = magI.rows/2;

    Mat q0(magI, Rect(0, 0, cx, cy));   // Top-Left - Create a ROI per quadrant
    Mat q1(magI, Rect(cx, 0, cx, cy));  // Top-Right
    Mat q2(magI, Rect(0, cy, cx, cy));  // Bottom-Left
    Mat q3(magI, Rect(cx, cy, cx, cy)); // Bottom-Right

    Mat tmp;                           // swap quadrants (Top-Left with Bottom-Right)
    q0.copyTo(tmp);
    q3.copyTo(q0);
    tmp.copyTo(q3);

    q1.copyTo(tmp);                    // swap quadrants (Top-Right with Bottom-Left)
    q2.copyTo(q1);
    tmp.copyTo(q2);

    normalize(magI, magI, 0, 255, CV_MINMAX); // Scale the float values into the 0..255
    // range before the image is written.
    imwrite(path+filename_result, magI);

    stopTimer=clock();

    // NOTE(review): finish() runs after the stop time has already been taken, so any
    // work still queued at that point is not part of the measurement - confirm intent.
    cv::ocl::finish();
double elapse = 1000.0* (double)(stopTimer - startTimer)/(double)CLOCKS_PER_SEC;
    // NOTE(review): the message says "OpenCL code on the GPU" although this function is
    // listed under the CPU-Version heading - confirm which wording is intended.
    LOGI("OpenCL code on the GPU took %g ms\n\n", 1000.0* (double)(stopTimer - startTimer)/(double)CLOCKS_PER_SEC) ;

    info[2] = (int)elapse;

    return;
}

~~GPU-Version~~ Running the code the first time is slower than the second time, but it takes exactly the same time as the CPU implementation.

void openCLNR (unsigned char* bufIn, unsigned char* bufOut, int* info) { //get all platforms (drivers) std::vector<cl::Platform> all_platforms; cl::Platform::get(&all_platforms); if(all_platforms.size()==0){ LOGE(" No platforms found. Check OpenCL installation!"); return; } cl::Platform default_platform=all_platforms[0]; cl_context_properties props[] = { CL_GL_CONTEXT_KHR, 0, CL_EGL_DISPLAY_KHR, 0, CL_CONTEXT_PLATFORM, 0, 0 }; props[5] = (cl_context_properties) default_platform(); //get default device of the default platform std::vector<cl::Device> all_devices; default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices); if(all_devices.size()==0){ LOGE(" No devices found. Check OpenCL installation!"); return; } cl::Device default_device=all_devices[0]; cl::Context context = cl::Context(CL_DEVICE_TYPE_GPU, props); cv::ocl::attachContext(default_platform.getInfo<CL_PLATFORM_NAME>(), default_platform(), context(), default_device()); cv::ocl::setUseOpenCL(true); clock_t startTimer, stopTimer; String path = "/storage/emulated/0/DCIM/"; String filename = "TEST.JPG"; String filename_result = "TEST_GPU.png"; cv::ocl::setUseOpenCL(true); startTimer=clock(); Mat mat = cv::imread(path+filename); cvtColor(mat, mat, COLOR_BGR2GRAY); LOGI("Imread took %g ms\n\n", 1000.0* (double)(clock() - startTimer)/(double)CLOCKS_PER_SEC) ; UMat I = mat.getUMat(cv::ACCESS_READ); UMat padded; //expand input image to optimal size int m = getOptimalDFTSize( I.rows ); int n = getOptimalDFTSize( I.cols ); // on the border add zero values copyMakeBorder(I, padded, 0, m - I.rows, 0, n - I.cols, BORDER_CONSTANT, Scalar::all(0)); UMat complexI; std::vector<UMat> planes(2); padded.convertTo(padded,CV_32FC1); planes[0] = padded; planes[1] = UMat::zeros(padded.size(), CV_32FC1); merge(planes, complexI); dft(complexI, complexI); // this way the result may fit in the source matrix split(complexI, planes); // planes[0] = Re(DFT(I), planes[1] = Im(DFT(I)) magnitude(planes[0], planes[1], planes[0]);// 
planes[0] = magnitude UMat magI = planes[0]; log(magI, magI); magI = magI(Rect(0, 0, magI.cols & -2, magI.rows & -2)); // rearrange the quadrants of Fourier image so that the origin is at the image center int cx = magI.cols/2; int cy = magI.rows/2; UMat q0(magI, Rect(0, 0, cx, cy)); // Top-Left - Create a ROI per quadrant UMat q1(magI, Rect(cx, 0, cx, cy)); // Top-Right UMat q2(magI, Rect(0, cy, cx, cy)); // Bottom-Left UMat q3(magI, Rect(cx, cy, cx, cy)); // Bottom-Right UMat tmp; // swap quadrants (Top-Left with Bottom-Right) q0.copyTo(tmp); q3.copyTo(q0); tmp.copyTo(q3); q1.copyTo(tmp); // swap quadrant (Top-Right with Bottom-Left) q2.copyTo(q1); tmp.copyTo(q2); normalize(magI, magI, 0, 255, CV_MINMAX); // Transform the matrix with float values into a // viewable image form (float between values 0 and 1). imwrite(path+filename_result, magI); cv::ocl::finish(); stopTimer=clock(); double elapse = 1000.0* (double)(stopTimer - startTimer)/(double)CLOCKS_PER_SEC; LOGI("OpenCL code on the GPU took %g ms\n\n", 1000.0* (double)(stopTimer - startTimer)/(double)CLOCKS_PER_SEC) ; info[2] = (int)elapse; return; }

Any ideas?