Hello OpenCV-CUDA community,
because processing speed is very important for my application, I moved my LensUndistortion function from CPU to GPU processing. However, the expected performance gain did not materialize; on the contrary, gpu::remap is slower than the CPU remap.
My measurements for a 1280x720 image remap with linear interpolation:
i5: 9,7ms
i7m: 5,8ms
gts250: 23ms (upload: 2ms, remap: 17ms, download: 4ms)
gtx560m: 40ms ??
The question: am I doing something wrong or is this the expected behavior?
I'm using OpenCV 2.4.2 with CUDA 4.2.
Here's my code: CPU:
/**
 * Undistorts the frame held by frameHelper on the CPU via cvRemap and writes
 * the remapped pixels back into frameHelper.
 *
 * @param frameHelper    Frame to process; its bytes are read with
 *                       GetReversedImageBytes() and replaced with
 *                       SetReversedImageBytes().
 * @param mapX           Only null-checked here; not otherwise used.
 * @param mapY           Only null-checked here; not otherwise used.
 * @param efficientMatX  X map handed to cvRemap.
 * @param efficientMatY  Y map handed to cvRemap.
 * @param quality        1 = linear, 2 = cubic, anything else = nearest neighbour.
 * @return Value of CHighPerformanceCounter::GetDeltaInMS() taken after the
 *         remap, or -1 if an argument is NULL or an exception was caught.
 */
double LensUndistort(CIdarBaseFrame* frameHelper, IplImage* mapX, IplImage* mapY, CvMat* efficientMatX, CvMat* efficientMatY, int quality)
{
    if(frameHelper == NULL || mapX == NULL || mapY == NULL || efficientMatX == NULL || efficientMatY == NULL)
        return -1;

    CHighPerformanceCounter calculations;
    try
    {
        calculations.Tick();

        // The working images are created once and reused on later calls.
        if(m_pInput == NULL)
        {
            m_pInput = cvCreateImage(cvSize(frameHelper->GetWidth(), frameHelper->GetHeight()), IPL_DEPTH_8U, 3);
        }
        if(m_pOutput == NULL)
        {
            m_pOutput = cvCreateImage(cvSize(frameHelper->GetWidth(), frameHelper->GetHeight()), IPL_DEPTH_8U, 3);
        }

        // Point the input image at the frame's own pixel buffer (interleaved BGRBGR...).
        // NOTE(review): timeToDecode is not declared in this function; presumably a class member - confirm.
        // NOTE(review): m_pInput was allocated by cvCreateImage, so its own pixel buffer is never used and
        // widthStep keeps cvCreateImage's row alignment; this assumes the frame rows have the same stride - confirm.
        int imageLen = 0;
        m_pInput->imageData = (char*)frameHelper->GetReversedImageBytes(imageLen, timeToDecode);

        // Pick the interpolation for the requested quality level.
        int interpolation = CV_INTER_NN;        // default (or 0): fastest, lowest quality
        switch (quality)
        {
        case 1:
            interpolation = CV_INTER_LINEAR;    // good quality, slightly slower than nearest neighbour
            break;
        case 2:
            interpolation = CV_INTER_CUBIC;     // good quality, much slower than nearest neighbour
            break;
        default:
            break;
        }

        // cvRemap only honours the fill value when CV_WARP_FILL_OUTLIERS is set. cv::BORDER_CONSTANT is a
        // border mode, not a cvRemap flag (it is 0), so adding it left destination pixels whose source lies
        // outside the image untouched instead of filling them with black.
        cvRemap(m_pInput, m_pOutput, efficientMatX, efficientMatY, interpolation | CV_WARP_FILL_OUTLIERS, cvScalarAll(0));

        // Hand the remapped pixels back to the frame.
        frameHelper->SetReversedImageBytes((BYTE*)(m_pOutput->imageData), imageLen);
    }
    catch( const cv::Exception& e )
    {
        // e.what() is a narrow string; convert it through CString so that the TCHAR-based "%s"
        // receives the right character type in both MBCS and Unicode builds.
        const CString description(e.what());
        CString err;
        err.Format(_T("Error while LensUndistort(). Description: %s"), static_cast<LPCTSTR>(description));
        theLog.Log(err, EVENTLOG_ERROR_TYPE, 0, 0, SERIOUS);
        return -1;
    }
    catch(CException* p_Ex)
    {
        TCHAR lpszError[MAX_TEMP_BUFFER];
        p_Ex->GetErrorMessage(lpszError, MAX_TEMP_BUFFER);
        CString err;
        err.Format(_T("Error while LensUndistort(). Description: %s"), lpszError);
        theLog.Log(err, EVENTLOG_ERROR_TYPE, 0, 0, SERIOUS);
        p_Ex->Delete();     // MFC exceptions caught by pointer must be deleted by the handler
        return -1;
    }
    return calculations.GetDeltaInMS();
}
GPU:
/**
 * Undistorts the frame held by frameHelper on the GPU via cv::gpu::remap and
 * writes the remapped pixels back into frameHelper.
 *
 * The returned time covers everything between Tick() and the end of the try
 * block: uploading the frame and both maps, the remap itself, and the download.
 *
 * @param frameHelper  Frame to process; its bytes are read with
 *                     GetReversedImageBytes() and replaced with
 *                     SetReversedImageBytes().
 * @param mapX         Host-side X map; passed to the GpuMat constructor.
 * @param mapY         Host-side Y map; passed to the GpuMat constructor.
 * @param quality      1 = linear, 2 = cubic, anything else = nearest neighbour.
 * @return Value of CHighPerformanceCounter::GetDeltaInMS() taken after the
 *         download, or -1 if an argument is NULL or an exception was caught.
 */
double LensUndistortGPU(CIdarBaseFrame* frameHelper, IplImage* mapX, IplImage* mapY, int quality)
{
    if(frameHelper == NULL || mapX == NULL || mapY == NULL)
        return -1;

    CHighPerformanceCounter calculations;
    try
    {
        calculations.Tick();

        const cv::Size imgSize(frameHelper->GetWidth(), frameHelper->GetHeight());

        // Device buffers are members so that they are allocated once and reused on later calls.
        if(m_inputGPU.data == NULL)
        {
            m_inputGPU = cv::gpu::GpuMat(imgSize, CV_8UC3);
        }
        if(m_outputGPU.data == NULL)
        {
            m_outputGPU = cv::gpu::GpuMat(imgSize, CV_8UC3);
        }

        // Wrap the already-decoded frame bytes in a Mat header. Using the external-data constructor avoids
        // allocating a full-size host image on every call only to overwrite its data pointer.
        int imageLen = 0;
        double timeToDecode = 0.0;
        uchar* const frameBytes = (uchar*)frameHelper->GetReversedImageBytes(imageLen, timeToDecode);
        const cv::Mat input(imgSize, CV_8UC3, frameBytes);
        m_inputGPU.upload(input);

        // NOTE(review): both maps are turned into GpuMats from host images on every call, i.e. they are
        // copied to the device for each frame. If the maps do not change between frames, keeping them in
        // member GpuMats (like m_inputGPU) would remove this transfer from the per-frame cost - confirm.
        cv::gpu::GpuMat matXGPU(mapX);
        cv::gpu::GpuMat matYGPU(mapY);

        // Pick the interpolation for the requested quality level.
        int interpolation = CV_INTER_NN;        // default (or 0)
        switch (quality)
        {
        case 1:
            interpolation = CV_INTER_LINEAR;
            break;
        case 2:
            interpolation = CV_INTER_CUBIC;
            break;
        default:
            break;
        }
        cv::gpu::remap(m_inputGPU, m_outputGPU, matXGPU, matYGPU, interpolation, cv::BORDER_CONSTANT);

        // Copy the result back to host memory and hand it to the frame.
        cv::Mat output(imgSize, CV_8UC3);
        m_outputGPU.download(output);
        frameHelper->SetReversedImageBytes((BYTE*)(output.data), imageLen);

        // input, output and the map GpuMats release their storage when they go out of scope.
    }
    catch( const cv::Exception& e )
    {
        // e.what() is a narrow string; convert it through CString so that the TCHAR-based "%s"
        // receives the right character type in both MBCS and Unicode builds.
        const CString description(e.what());
        CString err;
        err.Format(_T("Error while LensUndistortGPU(). Description: %s"), static_cast<LPCTSTR>(description));
        theLog.Log(err, EVENTLOG_ERROR_TYPE, 0, 0, SERIOUS);
        return -1;
    }
    catch(CException* p_Ex)
    {
        TCHAR lpszError[MAX_TEMP_BUFFER];
        p_Ex->GetErrorMessage(lpszError, MAX_TEMP_BUFFER);
        CString err;
        err.Format(_T("Error while LensUndistortGPU(). Description: %s"), lpszError);
        theLog.Log(err, EVENTLOG_ERROR_TYPE, 0, 0, SERIOUS);
        p_Ex->Delete();     // MFC exceptions caught by pointer must be deleted by the handler
        return -1;
    }
    return calculations.GetDeltaInMS();
}