OpenCV matchTemplate CUDA large images & templates
Dear All,
I am interested in using template matching on large (satellite) images (at least 8192 by 8192 pixels), using templates from reference image sets that are typically 256 by 256 or 512 by 512 pixels in size. A normal use case is matching N by N templates against the image (N=5,7,9...).
I am using OpenCV 2.4.6 with CUDA 4.2. I managed to get the GPU version of matchTemplate working, but ran into the initialization timing issue. This makes the GPU version slower than the CPU version when it is used for a single image/single template match. I have done a careful timing analysis (see code below) and find that the code spends 98% of its time on initialization. I know that this has to do with the JIT compilation of the CUDA-related code, but the pointers in the documentation to the nvcc compiler and the CUDA_DEVCODE_CACHE environment variable have not led me to a solution (I set the environment variable, but nothing improves).
This should be a compile-once, run-often case, so if anyone has got the code caching working correctly, I would appreciate it if they could share how.
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include "opencv2/gpu/gpu.hpp"
#include <iostream>
#include <stdio.h>
using namespace std;
using namespace cv;
/// Global Variables
// Source image to search in; main loads it with imread(argv[1], 1).
Mat img;
// Template to search for; main loads it with imread(argv[2], 1).
Mat templ;
// Host-side CV_32F result matrix; main sizes it from the image/template
// dimensions and uploads it to the GPU destination matrix.
// NOTE(review): main calls result.create(result_cols, result_rows, ...),
// whereas cv::Mat::create takes (rows, cols, type) -- confirm the order.
Mat result;
// Matching method passed to gpu::matchTemplate.
// NOTE(review): main parses this with atoi(argv[2]), the same argument that
// is used as the template path -- confirm which argv index is intended.
int match_method;
/** @function main
Stripped down version, without GUI functionality
*/
int main( int argc, char** argv )
{
/// Load image and template
img = imread( argv[1], 1 );
templ = imread( argv[2], 1 );
match_method = atoi(argv[2]);
int result_cols = img.cols - templ.cols + 1;
int result_rows = img.rows - templ.rows + 1;
result.create( result_cols, result_rows, CV_32F);
size_t t0 = clock();
try
{
gpu::printCudaDeviceInfo(gpu::getDevice());
gpu::resetDevice();
}
catch (const std::exception& e)
{
//no GPU, DLL not compiled with GPU
printf("Exception thrown: %s\n", e.what());
return 0;
}
size_t t1 = clock();
printf("GPU initialize: %f ms\n", (double(t1 - t0)/CLOCKS_PER_SEC*1000.0));
gpu::GpuMat d_src, d_templ, d_dst;
d_templ.upload(templ);
printf("GPU load templ: %f ms\n", (double(clock() - t1)/CLOCKS_PER_SEC*1000.0));
d_src.upload(img);
printf("GPU load img: %f ms\n", (double(clock() - t1)/CLOCKS_PER_SEC*1000.0));
//d_templ.upload(templ);
//printf("GPU load templ: %f ms\n", (double(clock() - t1)/CLOCKS_PER_SEC*1000.0));
d_dst.upload(result);
printf("GPU load result: %f ms\n", (double(clock() - t1)/CLOCKS_PER_SEC*1000.0));
/// Do the Matching
size_t t2 = clock();
printf("GPU memory set-up: %f ms\n", (double(t2 - t1)/CLOCKS_PER_SEC*1000.0));
gpu::matchTemplate( d_src, d_templ, d_dst, match_method );
size_t t3 = clock();
printf("GPU template match: %f ms\n", (double(t3 - t2)/CLOCKS_PER_SEC*1000.0));
/// Localizing the best match with minMaxLoc
double minVal; double maxVal; Point minLoc; Point maxLoc;
Point matchLoc;
gpu::minMaxLoc( d_dst, &minVal, &maxVal, &minLoc, &maxLoc);
size_t t4 = clock();
printf("GPU minMaxLoc: %f ms\n", (double(t4 - t3)/CLOCKS_PER_SEC*1000.0));
/// For SQDIFF and SQDIFF_NORMED, the best matches are lower values. For all the other methods, the ...