Ask Your Question

Revision history [back]

click to hide/show revision 1
initial version

cv::gpu::remap comparatively slow

Hello OpenCV-CUDA community,

because processing speed is very important for my application, I moved my LensUndistortion function from CPU to GPU processing. But the expected performance gain did not show, the opposite is the case, gpu::remap is slower than the cpu remap.

My Measurements for an 1280x720 image remap with linear interpolation:

i5: 9,7ms
i7m: 5,8ms
gts250: 23ms (upload: 2ms, remap: 17ms, download: 4ms)
gtx560m: 40ms ??

The question: am I doing something wrong or is this the expected behavior?

I'm Using OpenCV 2.4.2 with CUDA 4.2.


Here's my code: CPU:

double LensUndistort(CIdarBaseFrame* frameHelper, IplImage* mapX, IplImage* mapY, CvMat* efficientMatX, CvMat* efficientMatY, int quality)
{
    if(frameHelper == NULL || mapX == NULL || mapY == NULL || efficientMatX == NULL || efficientMatY == NULL)
        return -1;

    CHighPerformanceCounter calculations;

    try
    {
        calculations.Tick();

        if(m_pInput == NULL)
        {
            CvSize imgSize = cvSize(frameHelper->GetWidth(), frameHelper->GetHeight());
            m_pInput = cvCreateImage(imgSize, IPL_DEPTH_8U, 3);
        }
        if(m_pOutput == NULL)
        {
            CvSize imgSize = cvSize(frameHelper->GetWidth(), frameHelper->GetHeight());
            m_pOutput = cvCreateImage(imgSize, IPL_DEPTH_8U, 3);
        }

        //write our data into an iplImage container
        //IplImage's imageData field looks like this... BGRBGR : imageData[0] = B; imageData[1] = G; and so on..
        int imageLen = 0;
        m_pInput->imageData = (char*)frameHelper->GetReversedImageBytes(imageLen, timeToDecode);
        switch (quality)
        {
            case 1:
                cvRemap(m_pInput, m_pOutput, efficientMatX, efficientMatY, CV_INTER_LINEAR + cv::BORDER_CONSTANT, cvScalarAll(0)); //9.7ms //this looks good, but takes a little longer than nearest neighbour interpolation
                break;
            case 2: 
                cvRemap(m_pInput, m_pOutput, efficientMatX, efficientMatY, CV_INTER_CUBIC + cv::BORDER_CONSTANT, cvScalarAll(0)); //65ms //this looks good, but takes a lot longer than nearest neighbour interpolation
                break;
            default: //or 0
                cvRemap(m_pInput, m_pOutput, efficientMatX, efficientMatY, CV_INTER_NN + cv::BORDER_CONSTANT); //8.5ms //nearest neighbour interpolation is fastest! But looks shitty :(
                break;
        }

        //set image
        frameHelper->SetReversedImageBytes((BYTE*)(m_pOutput->imageData), imageLen);
    }
    catch( cv::Exception& e )
    {
        const char* err_msg = e.what();
        CString err;
        err.Format(_T("Error while LensUndistort(). Description: %s"), err_msg);
        theLog.Log(err, EVENTLOG_ERROR_TYPE, 0, 0, SERIOUS);
        return -1;
    }
    catch(CException* p_Ex)
    {
        TCHAR lpszError[MAX_TEMP_BUFFER];
        p_Ex->GetErrorMessage(lpszError, MAX_TEMP_BUFFER);

        CString err;
        err.Format(_T("Error while LensUndistort(). Description: %s"), lpszError);
        theLog.Log(err, EVENTLOG_ERROR_TYPE, 0, 0, SERIOUS);
        p_Ex->Delete();
        return -1;
    }

    return calculations.GetDeltaInMS();
}

GPU:

double LensUndistortGPU(CIdarBaseFrame* frameHelper, IplImage* mapX, IplImage* mapY, int quality)
{
    if(frameHelper == NULL || mapX == NULL || mapY == NULL)
        return -1;

    CHighPerformanceCounter calculations;

    try
    {
        calculations.Tick();

        cv::Size imgSize = cv::Size(frameHelper->GetWidth(), frameHelper->GetHeight());

        if(m_inputGPU.data == NULL)
        {
            m_inputGPU = cv::gpu::GpuMat(imgSize, CV_8UC3);
        }
        if(m_outputGPU.data == NULL)
        {
            m_outputGPU = cv::gpu::GpuMat(imgSize, CV_8UC3);
        }

        int imageLen = 0;
        double timeToDecode = 0.0;

        cv::Mat input = cv::Mat(imgSize, CV_8UC3);
        input.data = (uchar*)frameHelper->GetReversedImageBytes(imageLen, timeToDecode); //gets the decoded image bytes (should take no time at all, since the image is already decoded)

        m_inputGPU.upload(input);

        cv::gpu::GpuMat matXGPU(mapX);
        cv::gpu::GpuMat matYGPU(mapY);

        switch (quality)
        {
            case 1:
                cv::gpu::remap(m_inputGPU, m_outputGPU, matXGPU, matYGPU, CV_INTER_LINEAR, cv::BORDER_CONSTANT); //22ms
                break;
            case 2:
                cv::gpu::remap(m_inputGPU, m_outputGPU, matXGPU, matYGPU, CV_INTER_CUBIC, cv::BORDER_CONSTANT); //45ms
                break;
            default: //or 0
                cv::gpu::remap(m_inputGPU, m_outputGPU, matXGPU, matYGPU, CV_INTER_NN, cv::BORDER_CONSTANT); //15ms
                break;
        }

        cv::Mat output = cv::Mat(imgSize, CV_8UC3);
        m_outputGPU.download(output);

        //set image
        frameHelper->SetReversedImageBytes((BYTE*)(output.data), imageLen);

        input.release();
        output.release();
        matXGPU.release();
        matYGPU.release();
    }
    catch( cv::Exception& e )
    {
        const char* err_msg = e.what();
        CString err;
        err.Format(_T("Error while LensUndistort(). Description: %s"), err_msg);
        theLog.Log(err, EVENTLOG_ERROR_TYPE, 0, 0, SERIOUS);
        return -1;
    }
    catch(CException* p_Ex)
    {
        TCHAR lpszError[MAX_TEMP_BUFFER];
        p_Ex->GetErrorMessage(lpszError, MAX_TEMP_BUFFER);

        CString err;
        err.Format(_T("Error while LensUndistort(). Description: %s"), lpszError);
        theLog.Log(err, EVENTLOG_ERROR_TYPE, 0, 0, SERIOUS);
        p_Ex->Delete();
        return -1;
    }

    return calculations.GetDeltaInMS();
}