cv::gpu::remap comparatively slow
Hello OpenCV-CUDA community,
Because processing speed is very important for my application, I moved my LensUndistortion function from the CPU to the GPU. But the expected performance gain did not materialize; quite the opposite: gpu::remap is slower than the CPU remap.
My measurements for a 1280x720 image remap with linear interpolation:
i5: 9.7 ms
i7m: 5.8 ms
GTS 250: 23 ms (upload: 2 ms, remap: 17 ms, download: 4 ms)
GTX 560M: 40 ms ??
The question: am I doing something wrong, or is this the expected behavior?
I'm using OpenCV 2.4.2 with CUDA 4.2.
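For reference, here is a minimal self-contained benchmark sketch of the comparison (identity maps stand in for my real undistortion maps, cv::getTickCount stands in for my CHighPerformanceCounter class, and the maps are uploaded once outside the timed loop):

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <iostream>

int main()
{
    cv::Size size(1280, 720);
    cv::Mat src(size, CV_8UC3, cv::Scalar::all(128));
    cv::Mat dst;

    //identity maps, standing in for the real undistortion maps
    cv::Mat mapX(size, CV_32FC1), mapY(size, CV_32FC1);
    for (int y = 0; y < size.height; ++y)
        for (int x = 0; x < size.width; ++x)
        {
            mapX.at<float>(y, x) = (float)x;
            mapY.at<float>(y, x) = (float)y;
        }

    //CPU timing: one warm-up call, then average over 100 runs
    cv::remap(src, dst, mapX, mapY, CV_INTER_LINEAR);
    int64 t0 = cv::getTickCount();
    for (int i = 0; i < 100; ++i)
        cv::remap(src, dst, mapX, mapY, CV_INTER_LINEAR);
    double cpuMs = (cv::getTickCount() - t0) * 1000.0 / cv::getTickFrequency() / 100;

    //GPU timing: data and maps uploaded once, outside the timed loop
    cv::gpu::GpuMat d_src(src), d_dst, d_mapX(mapX), d_mapY(mapY);
    cv::gpu::remap(d_src, d_dst, d_mapX, d_mapY, CV_INTER_LINEAR); //warm-up
    t0 = cv::getTickCount();
    for (int i = 0; i < 100; ++i)
        cv::gpu::remap(d_src, d_dst, d_mapX, d_mapY, CV_INTER_LINEAR);
    double gpuMs = (cv::getTickCount() - t0) * 1000.0 / cv::getTickFrequency() / 100;

    std::cout << "cpu: " << cpuMs << " ms, gpu: " << gpuMs << " ms" << std::endl;
    return 0;
}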
Here's my code:

CPU:
double LensUndistort(CIdarBaseFrame* frameHelper, IplImage* mapX, IplImage* mapY, CvMat* efficientMatX, CvMat* efficientMatY, int quality)
{
    if(frameHelper == NULL || mapX == NULL || mapY == NULL || efficientMatX == NULL || efficientMatY == NULL)
        return -1;

    CHighPerformanceCounter calculations;
    try
    {
        calculations.Tick();
        if(m_pInput == NULL)
        {
            CvSize imgSize = cvSize(frameHelper->GetWidth(), frameHelper->GetHeight());
            m_pInput = cvCreateImage(imgSize, IPL_DEPTH_8U, 3);
        }
        if(m_pOutput == NULL)
        {
            CvSize imgSize = cvSize(frameHelper->GetWidth(), frameHelper->GetHeight());
            m_pOutput = cvCreateImage(imgSize, IPL_DEPTH_8U, 3);
        }

        //write our data into an IplImage container
        //IplImage's imageData is interleaved BGRBGR...: imageData[0] = B, imageData[1] = G, and so on
        int imageLen = 0;
        double timeToDecode = 0.0;
        m_pInput->imageData = (char*)frameHelper->GetReversedImageBytes(imageLen, timeToDecode);

        switch (quality)
        {
        case 1:
            cvRemap(m_pInput, m_pOutput, efficientMatX, efficientMatY, CV_INTER_LINEAR + CV_WARP_FILL_OUTLIERS, cvScalarAll(0)); //9.7ms - looks good, but a little slower than nearest-neighbour interpolation
            break;
        case 2:
            cvRemap(m_pInput, m_pOutput, efficientMatX, efficientMatY, CV_INTER_CUBIC + CV_WARP_FILL_OUTLIERS, cvScalarAll(0)); //65ms - looks good, but a lot slower than nearest-neighbour interpolation
            break;
        default: //or 0
            cvRemap(m_pInput, m_pOutput, efficientMatX, efficientMatY, CV_INTER_NN + CV_WARP_FILL_OUTLIERS); //8.5ms - nearest-neighbour interpolation is fastest, but looks ugly :(
            break;
        }

        //write the result back
        frameHelper->SetReversedImageBytes((BYTE*)(m_pOutput->imageData), imageLen);
    }
    catch( cv::Exception& e )
    {
        const char* err_msg = e.what();
        CString err;
        err.Format(_T("Error while LensUndistort(). Description: %s"), err_msg);
        theLog.Log(err, EVENTLOG_ERROR_TYPE, 0, 0, SERIOUS);
        return -1;
    }
    catch(CException* p_Ex)
    {
        TCHAR lpszError[MAX_TEMP_BUFFER];
        p_Ex->GetErrorMessage(lpszError, MAX_TEMP_BUFFER);
        CString err;
        err.Format(_T("Error while LensUndistort(). Description: %s"), lpszError);
        theLog.Log(err, EVENTLOG_ERROR_TYPE, 0, 0, SERIOUS);
        p_Ex->Delete();
        return -1;
    }
    return calculations.GetDeltaInMS();
}
GPU:
double LensUndistortGPU(CIdarBaseFrame* frameHelper, IplImage* mapX, IplImage* mapY, int quality)
{
    if(frameHelper == NULL || mapX == NULL || mapY == NULL)
        return -1;

    CHighPerformanceCounter calculations;
    try
    {
        calculations.Tick();
        cv::Size imgSize = cv::Size(frameHelper->GetWidth(), frameHelper->GetHeight());
        if(m_inputGPU.data == NULL)
        {
            m_inputGPU = cv::gpu::GpuMat(imgSize, CV_8UC3);
        }
        if(m_outputGPU.data == NULL)
        {
            m_outputGPU = cv::gpu::GpuMat(imgSize, CV_8UC3);
        }

        int imageLen = 0;
        double timeToDecode = 0.0;
        cv::Mat input = cv::Mat(imgSize, CV_8UC3);
        input.data = (uchar*)frameHelper->GetReversedImageBytes(imageLen, timeToDecode); //gets the decoded image bytes (should take no time at all, since the image is already decoded)
        m_inputGPU.upload(input); //2ms

        //upload the undistortion maps (note: this happens on every call)
        cv::gpu::GpuMat matXGPU(mapX);
        cv::gpu::GpuMat matYGPU(mapY);

        switch (quality)
        {
        case 1:
            cv::gpu::remap(m_inputGPU, m_outputGPU, matXGPU, matYGPU, CV_INTER_LINEAR, cv::BORDER_CONSTANT); //22ms
            break;
        case 2:
            cv::gpu::remap(m_inputGPU, m_outputGPU, matXGPU, matYGPU, CV_INTER_CUBIC, cv::BORDER_CONSTANT); //45ms
            break;
        default: //or 0
            cv::gpu::remap(m_inputGPU, m_outputGPU, matXGPU, matYGPU, CV_INTER_NN, cv::BORDER_CONSTANT); //15ms
            break;
        }

        cv::Mat output = cv::Mat(imgSize, CV_8UC3);
        m_outputGPU.download(output); //4ms
        frameHelper->SetReversedImageBytes((BYTE*)output.data, imageLen);
    }
    //... (catch blocks and return as in the CPU version)
    return calculations.GetDeltaInMS();
}
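For completeness, this is roughly how I get the upload/remap/download split quoted above: a sketch using cv::getTickCount around each stage inside LensUndistortGPU (the non-stream gpu calls block until completion, so timing them this way should be meaningful):

    //inside LensUndistortGPU, reusing input, output, m_inputGPU, m_outputGPU, matXGPU, matYGPU
    int64 t = cv::getTickCount();
    m_inputGPU.upload(input);
    double uploadMs = (cv::getTickCount() - t) * 1000.0 / cv::getTickFrequency();

    t = cv::getTickCount();
    cv::gpu::remap(m_inputGPU, m_outputGPU, matXGPU, matYGPU, CV_INTER_LINEAR, cv::BORDER_CONSTANT);
    double remapMs = (cv::getTickCount() - t) * 1000.0 / cv::getTickFrequency();

    t = cv::getTickCount();
    m_outputGPU.download(output);
    double downloadMs = (cv::getTickCount() - t) * 1000.0 / cv::getTickFrequency();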
Try calling the function more than once and discard the results of the first call. GPUs need some time to warm up on the first call from an app.
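Something like this (just a sketch; the dummy sizes and types only need to match what you use later):

//one throwaway call on dummy data forces the CUDA context creation and
//module loading, so that cost isn't billed to the first real measurement
cv::Mat dummyImg = cv::Mat::zeros(720, 1280, CV_8UC3);
cv::Mat dummyMap = cv::Mat::zeros(720, 1280, CV_32FC1);
cv::gpu::GpuMat d_img(dummyImg), d_map(dummyMap), d_dst;
cv::gpu::remap(d_img, d_dst, d_map, d_map, CV_INTER_LINEAR);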
Yeah, that's what I'm doing already; the measurements are from actual runtime, so the first few calls don't factor in.