Hey, I've started to learn a bit about Android GPU programming and wanted to implement the DFT with the new T-API in OpenCV 3.0. My device is a Sony XPERIA Z1, which runs OpenCL 1.1 (on Lollipop - I hope that doesn't cause problems? The Khronos website says that the Adreno 330 supports KitKat).
When comparing the two versions below, the GPU version takes ~3200 ms and the CPU version ~2800 ms. What could be the issue? Any ideas?
Another problem: when I run the GPU version a second time, I get the following error:
E/cv::error(): OpenCV Error: Assertion failed (u->refcount == 0 || u->tempUMat()) in virtual void cv::ocl::OpenCLAllocator::upload(cv::UMatData*, const void*, int, const size_t*, const size_t*, const size_t*, const size_t*) const, file C:/opencv3cl/modules/core/src/ocl.cpp, line 5025
A/libc: Fatal signal 6 (SIGABRT), code -6 in tid 28911 (.openclexample1)
Any ideas?
CPU-Version
void refNR (unsigned char* bufIn, unsigned char* bufOut, int* info)
{
clock_t startTimer, stopTimer;
String path = "/storage/emulated/0/DCIM/";
String filename = "TEST.JPG";
String filename_result = "TEST_CPU.png";
startTimer=clock();
Mat I = cv::imread(path+filename);
cvtColor(I, I, COLOR_BGR2GRAY);
LOGI("Imread took %g ms\n\n", 1000.0* (double)(clock() - startTimer)/(double)CLOCKS_PER_SEC) ;
Mat padded; //expand input image to optimal size
int m = getOptimalDFTSize( I.rows );
int n = getOptimalDFTSize( I.cols ); // on the border add zero values
copyMakeBorder(I, padded, 0, m - I.rows, 0, n - I.cols, BORDER_CONSTANT, Scalar::all(0));
Mat complexI;
Mat planes[] = {Mat_<float>(padded), Mat::zeros(padded.size(), CV_32F)};
merge(planes, 2, complexI);
dft(complexI, complexI); // this way the result may fit in the source matrix
split(complexI, planes); // planes[0] = Re(DFT(I), planes[1] = Im(DFT(I))
magnitude(planes[0], planes[1], planes[0]);// planes[0] = magnitude
Mat magI = planes[0];
log(magI, magI);
magI = magI(Rect(0, 0, magI.cols & -2, magI.rows & -2));
// rearrange the quadrants of Fourier image so that the origin is at the image center
int cx = magI.cols/2;
int cy = magI.rows/2;
Mat q0(magI, Rect(0, 0, cx, cy)); // Top-Left - Create a ROI per quadrant
Mat q1(magI, Rect(cx, 0, cx, cy)); // Top-Right
Mat q2(magI, Rect(0, cy, cx, cy)); // Bottom-Left
Mat q3(magI, Rect(cx, cy, cx, cy)); // Bottom-Right
Mat tmp; // swap quadrants (Top-Left with Bottom-Right)
q0.copyTo(tmp);
q3.copyTo(q0);
tmp.copyTo(q3);
q1.copyTo(tmp); // swap quadrant (Top-Right with Bottom-Left)
q2.copyTo(q1);
tmp.copyTo(q2);
normalize(magI, magI, 0, 255, CV_MINMAX); // Transform the matrix with float values into a
// viewable image form (float between values 0 and 1).
imwrite(path+filename_result, magI);
stopTimer=clock();
double elapse = 1000.0* (double)(stopTimer - startTimer)/(double)CLOCKS_PER_SEC;
LOGI("OpenCL code on the GPU took %g ms\n\n", 1000.0* (double)(stopTimer - startTimer)/(double)CLOCKS_PER_SEC) ;
info[2] = (int)elapse;
return;
}
GPU-Version
void openCLNR (unsigned char* bufIn, unsigned char* bufOut, int* info)
{
//get all platforms (drivers)
std::vector<cl::Platform> all_platforms;
cl::Platform::get(&all_platforms);
if(all_platforms.size()==0){
LOGE(" No platforms found. Check OpenCL installation!");
return;
}
cl::Platform default_platform=all_platforms[0];
cl_context_properties props[] =
{ CL_GL_CONTEXT_KHR, 0,
CL_EGL_DISPLAY_KHR, 0,
CL_CONTEXT_PLATFORM, 0,
0 };
props[5] = (cl_context_properties) default_platform();
//get default device of the default platform
std::vector<cl::Device> all_devices;
default_platform.getDevices(CL_DEVICE_TYPE_ALL, &all_devices);
if(all_devices.size()==0){
LOGE(" No devices found. Check OpenCL installation!");
return;
}
cl::Device default_device=all_devices[0];
cl::Context context = cl::Context(CL_DEVICE_TYPE_GPU, props);
cv::ocl::attachContext(default_platform.getInfo<CL_PLATFORM_NAME>(), default_platform(), context(), default_device());
cv::ocl::setUseOpenCL(true);
clock_t startTimer, stopTimer;
String path = "/storage/emulated/0/DCIM/";
String filename = "TEST.JPG";
String filename_result = "TEST_GPU.png";
cv::ocl::setUseOpenCL(true);
startTimer=clock();
Mat mat = cv::imread(path+filename);
cvtColor(mat, mat, COLOR_BGR2GRAY);
LOGI("Imread took %g ms\n\n", 1000.0* (double)(clock() - startTimer)/(double)CLOCKS_PER_SEC) ;
UMat I = mat.getUMat(cv::ACCESS_READ);
UMat padded; //expand input image to optimal size
int m = getOptimalDFTSize( I.rows );
int n = getOptimalDFTSize( I.cols ); // on the border add zero values
copyMakeBorder(I, padded, 0, m - I.rows, 0, n - I.cols, BORDER_CONSTANT, Scalar::all(0));
UMat complexI;
std::vector<UMat> planes(2);
padded.convertTo(padded,CV_32FC1);
planes[0] = padded;
planes[1] = UMat::zeros(padded.size(), CV_32FC1);
merge(planes, complexI);
dft(complexI, complexI); // this way the result may fit in the source matrix
split(complexI, planes); // planes[0] = Re(DFT(I), planes[1] = Im(DFT(I))
magnitude(planes[0], planes[1], planes[0]);// planes[0] = magnitude
UMat magI = planes[0];
log(magI, magI);
magI = magI(Rect(0, 0, magI.cols & -2, magI.rows & -2));
// rearrange the quadrants of Fourier image so that the origin is at the image center
int cx = magI.cols/2;
int cy = magI.rows/2;
UMat q0(magI, Rect(0, 0, cx, cy)); // Top-Left - Create a ROI per quadrant
UMat q1(magI, Rect(cx, 0, cx, cy)); // Top-Right
UMat q2(magI, Rect(0, cy, cx, cy)); // Bottom-Left
UMat q3(magI, Rect(cx, cy, cx, cy)); // Bottom-Right
UMat tmp; // swap quadrants (Top-Left with Bottom-Right)
q0.copyTo(tmp);
q3.copyTo(q0);
tmp.copyTo(q3);
q1.copyTo(tmp); // swap quadrant (Top-Right with Bottom-Left)
q2.copyTo(q1);
tmp.copyTo(q2);
normalize(magI, magI, 0, 255, CV_MINMAX); // Transform the matrix with float values into a
// viewable image form (float between values 0 and 1).
imwrite(path+filename_result, magI);
cv::ocl::finish();
stopTimer=clock();
double elapse = 1000.0* (double)(stopTimer - startTimer)/(double)CLOCKS_PER_SEC;
LOGI("OpenCL code on the GPU took %g ms\n\n", 1000.0* (double)(stopTimer - startTimer)/(double)CLOCKS_PER_SEC) ;
info[2] = (int)elapse;
return;
}