cv::gpu::remap comparatively slow
Hello OpenCV-CUDA community,
Because processing speed is very important for my application, I moved my LensUndistortion function from the CPU to the GPU. But the expected performance gain did not materialize; quite the opposite: gpu::remap is slower than the CPU remap.
My measurements for a 1280x720 image remap with linear interpolation:
i5: 9.7 ms
i7m: 5.8 ms
GTS 250: 23 ms (upload: 2 ms, remap: 17 ms, download: 4 ms)
GTX 560M: 40 ms ??
The question: am I doing something wrong, or is this the expected behavior?
I'm using OpenCV 2.4.2 with CUDA 4.2.
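For reference, here is a minimal self-contained benchmark sketch of the comparison (identity maps stand in for my real undistortion maps, cv::getTickCount stands in for my CHighPerformanceCounter class, and the maps are uploaded once outside the timed loop):

#include <opencv2/core/core.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/gpu/gpu.hpp>
#include <iostream>

int main()
{
    cv::Size size(1280, 720);
    cv::Mat src(size, CV_8UC3, cv::Scalar::all(128));
    cv::Mat dst;

    //identity maps, standing in for the real undistortion maps
    cv::Mat mapX(size, CV_32FC1), mapY(size, CV_32FC1);
    for (int y = 0; y < size.height; ++y)
        for (int x = 0; x < size.width; ++x)
        {
            mapX.at<float>(y, x) = (float)x;
            mapY.at<float>(y, x) = (float)y;
        }

    //CPU timing: one warm-up call, then average over 100 runs
    cv::remap(src, dst, mapX, mapY, CV_INTER_LINEAR);
    int64 t0 = cv::getTickCount();
    for (int i = 0; i < 100; ++i)
        cv::remap(src, dst, mapX, mapY, CV_INTER_LINEAR);
    double cpuMs = (cv::getTickCount() - t0) * 1000.0 / cv::getTickFrequency() / 100;

    //GPU timing: data and maps uploaded once, outside the timed loop
    cv::gpu::GpuMat d_src(src), d_dst, d_mapX(mapX), d_mapY(mapY);
    cv::gpu::remap(d_src, d_dst, d_mapX, d_mapY, CV_INTER_LINEAR); //warm-up
    t0 = cv::getTickCount();
    for (int i = 0; i < 100; ++i)
        cv::gpu::remap(d_src, d_dst, d_mapX, d_mapY, CV_INTER_LINEAR);
    double gpuMs = (cv::getTickCount() - t0) * 1000.0 / cv::getTickFrequency() / 100;

    std::cout << "cpu: " << cpuMs << " ms, gpu: " << gpuMs << " ms" << std::endl;
    return 0;
}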
Here's my code:

CPU:
double LensUndistort(CIdarBaseFrame* frameHelper, IplImage* mapX, IplImage* mapY, CvMat* efficientMatX, CvMat* efficientMatY, int quality)
{
    if(frameHelper == NULL || mapX == NULL || mapY == NULL || efficientMatX == NULL || efficientMatY == NULL)
        return -1;

    CHighPerformanceCounter calculations;
    try
    {
        calculations.Tick();
        if(m_pInput == NULL)
        {
            CvSize imgSize = cvSize(frameHelper->GetWidth(), frameHelper->GetHeight());
            m_pInput = cvCreateImage(imgSize, IPL_DEPTH_8U, 3);
        }
        if(m_pOutput == NULL)
        {
            CvSize imgSize = cvSize(frameHelper->GetWidth(), frameHelper->GetHeight());
            m_pOutput = cvCreateImage(imgSize, IPL_DEPTH_8U, 3);
        }

        //write our data into an IplImage container
        //IplImage's imageData is interleaved BGRBGR...: imageData[0] = B, imageData[1] = G, and so on
        int imageLen = 0;
        double timeToDecode = 0.0;
        m_pInput->imageData = (char*)frameHelper->GetReversedImageBytes(imageLen, timeToDecode);

        switch (quality)
        {
        case 1:
            cvRemap(m_pInput, m_pOutput, efficientMatX, efficientMatY, CV_INTER_LINEAR + CV_WARP_FILL_OUTLIERS, cvScalarAll(0)); //9.7ms - looks good, but a little slower than nearest-neighbour interpolation
            break;
        case 2:
            cvRemap(m_pInput, m_pOutput, efficientMatX, efficientMatY, CV_INTER_CUBIC + CV_WARP_FILL_OUTLIERS, cvScalarAll(0)); //65ms - looks good, but a lot slower than nearest-neighbour interpolation
            break;
        default: //or 0
            cvRemap(m_pInput, m_pOutput, efficientMatX, efficientMatY, CV_INTER_NN + CV_WARP_FILL_OUTLIERS); //8.5ms - nearest-neighbour interpolation is fastest, but looks ugly :(
            break;
        }

        //write the result back
        frameHelper->SetReversedImageBytes((BYTE*)(m_pOutput->imageData), imageLen);
    }
    catch( cv::Exception& e )
    {
        const char* err_msg = e.what();
        CString err;
        err.Format(_T("Error while LensUndistort(). Description: %s"), err_msg);
        theLog.Log(err, EVENTLOG_ERROR_TYPE, 0, 0, SERIOUS);
        return -1;
    }
    catch(CException* p_Ex)
    {
        TCHAR lpszError[MAX_TEMP_BUFFER];
        p_Ex->GetErrorMessage(lpszError, MAX_TEMP_BUFFER);
        CString err;
        err.Format(_T("Error while LensUndistort(). Description: %s"), lpszError);
        theLog.Log(err, EVENTLOG_ERROR_TYPE, 0, 0, SERIOUS);
        p_Ex->Delete();
        return -1;
    }
    return calculations.GetDeltaInMS();
}
GPU:
double LensUndistortGPU(CIdarBaseFrame* frameHelper, IplImage* mapX, IplImage* mapY, int quality)
{
    if(frameHelper == NULL || mapX == NULL || mapY == NULL)
        return -1;

    CHighPerformanceCounter calculations;
    try
    {
        calculations.Tick();
        cv::Size imgSize = cv::Size(frameHelper->GetWidth(), frameHelper->GetHeight());
        if(m_inputGPU.data == NULL)
        {
            m_inputGPU = cv::gpu::GpuMat(imgSize, CV_8UC3);
        }
        if(m_outputGPU.data == NULL)
        {
            m_outputGPU = cv::gpu::GpuMat(imgSize, CV_8UC3);
        }

        int imageLen = 0;
        double timeToDecode = 0.0;
        cv::Mat input = cv::Mat(imgSize, CV_8UC3);
        input.data = (uchar*)frameHelper->GetReversedImageBytes(imageLen, timeToDecode); //gets the decoded image bytes (should take no time at all, since the image is already decoded)
        m_inputGPU.upload(input); //2ms

        //upload the undistortion maps (note: this happens on every call)
        cv::gpu::GpuMat matXGPU(mapX);
        cv::gpu::GpuMat matYGPU(mapY);

        switch (quality)
        {
        case 1:
            cv::gpu::remap(m_inputGPU, m_outputGPU, matXGPU, matYGPU, CV_INTER_LINEAR, cv::BORDER_CONSTANT); //22ms
            break;
        case 2:
            cv::gpu::remap(m_inputGPU, m_outputGPU, matXGPU, matYGPU, CV_INTER_CUBIC, cv::BORDER_CONSTANT); //45ms
            break;
        default: //or 0
            cv::gpu::remap(m_inputGPU, m_outputGPU, matXGPU, matYGPU, CV_INTER_NN, cv::BORDER_CONSTANT); //15ms
            break;
        }

        cv::Mat output = cv::Mat(imgSize, CV_8UC3);
        m_outputGPU.download(output); //4ms
        frameHelper->SetReversedImageBytes((BYTE*)output.data, imageLen);
    }
    //... (catch blocks and return as in the CPU version)
    return calculations.GetDeltaInMS();
}
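For completeness, this is roughly how I get the upload/remap/download split quoted above: a sketch using cv::getTickCount around each stage inside LensUndistortGPU (the non-stream gpu calls block until completion, so timing them this way should be meaningful):

    //inside LensUndistortGPU, reusing input, output, m_inputGPU, m_outputGPU, matXGPU, matYGPU
    int64 t = cv::getTickCount();
    m_inputGPU.upload(input);
    double uploadMs = (cv::getTickCount() - t) * 1000.0 / cv::getTickFrequency();

    t = cv::getTickCount();
    cv::gpu::remap(m_inputGPU, m_outputGPU, matXGPU, matYGPU, CV_INTER_LINEAR, cv::BORDER_CONSTANT);
    double remapMs = (cv::getTickCount() - t) * 1000.0 / cv::getTickFrequency();

    t = cv::getTickCount();
    m_outputGPU.download(output);
    double downloadMs = (cv::getTickCount() - t) * 1000.0 / cv::getTickFrequency();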
Try calling the function more than once and discard the results of the first call. GPUs need some time to warm up on the first call from an app.
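Something like this (just a sketch; the dummy sizes and types only need to match what you use later):

//one throwaway call on dummy data forces the CUDA context creation and
//module loading, so that cost isn't billed to the first real measurement
cv::Mat dummyImg = cv::Mat::zeros(720, 1280, CV_8UC3);
cv::Mat dummyMap = cv::Mat::zeros(720, 1280, CV_32FC1);
cv::gpu::GpuMat d_img(dummyImg), d_map(dummyMap), d_dst;
cv::gpu::remap(d_img, d_dst, d_map, d_map, CV_INTER_LINEAR);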
Yeah, that's what I'm doing already; the measurements are from actual runtime, so the first few calls don't factor in.