Slow matrix multiplication when using OpenCL enabled OpenCV
I made a simple program to test the performance of OpenCV with and without the help of the GPU. The GPU code is implemented using the OpenCV/OpenCL Transparent API (T-API).
Overall, the GPU-enabled process always runs faster than the CPU one, except for matrix multiplication, which is surprising considering that multiplication is the bread and butter of good GPU performance.
Here is the code used to run my simple test:
#include <opencv2\opencv.hpp>
#include <opencv2\opencv_modules.hpp>
#include <opencv2\core\ocl.hpp>
#include <iostream>
#include <fstream>
// Timing helpers.
// m_dTime holds the tick count captured at the most recent measurement point
// (set by perfMeasure_start() and refreshed by every perfMeasure_end()).
double m_dTime = 0;

/// Marks "now" as the beginning of the next timed interval.
void perfMeasure_start()
{
    const double startTicks = cv::getTickCount();
    m_dTime = startTicks;
}
/// Prints the time elapsed since the previous measurement point and makes the
/// current instant the start of the next interval.
///
/// @param strLabel  Text printed next to the measured time (right-aligned in a
///                  25-character column).
void perfMeasure_end(std::string strLabel)
{
    // T-API (cv::UMat) calls enqueue work on the OpenCL queue and may return
    // before the device has finished executing it. Drain the queue first so
    // that the interval covers the real execution time rather than only the
    // enqueue cost; without this the GPU numbers are not comparable to the
    // CPU numbers.
    cv::ocl::finish();

    const double currentTime = cv::getTickCount();

    // Ticks -> seconds; the printf below scales to milliseconds.
    const double dTimeTaken = (currentTime - m_dTime) / cv::getTickFrequency();
    printf("[TIME]%25s : %.3lf ms\n", strLabel.c_str(), dTimeTaken * 1000);

    // The end of this interval is the start of the next one, so consecutive
    // perfMeasure_end() calls time back-to-back sections.
    m_dTime = currentTime;
}
/// Benchmarks a fixed sequence of OpenCV operations.
///
/// Every operation is executed 50 times in a row and the time for each group
/// is printed through perfMeasure_end(). The function is written against
/// cv::InputArray / cv::OutputArray, so callers can pass either cv::Mat or
/// cv::UMat objects.
///
/// @param matSrc  Source image; it is converted with COLOR_BGR2GRAY, so a BGR
///                image is expected.
/// @param matDst  Destination array; it is overwritten repeatedly and is also
///                used as an input by several of the steps.
void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
{
    const int kRepeats = 50;  // number of back-to-back calls per operation
    int pass = 0;

    perfMeasure_start();

    // --- Filtering chain: each step works in place on matDst. ---
    for (pass = 0; pass != kRepeats; ++pass)
        cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
    perfMeasure_end("cvtColor");

    for (pass = 0; pass != kRepeats; ++pass)
        cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
    perfMeasure_end("GaussianBlur");

    for (pass = 0; pass != kRepeats; ++pass)
        cv::Canny(matDst, matDst, 0, 50);
    perfMeasure_end("Canny");

    for (pass = 0; pass != kRepeats; ++pass)
        cv::dilate(matDst, matDst, cv::noArray());
    perfMeasure_end("Dilate");

    // --- Arithmetic: matDst is replaced by a result derived from matSrc. ---
    for (pass = 0; pass != kRepeats; ++pass)
        cv::add(matSrc, matSrc, matDst);
    perfMeasure_end("Add");

    // Per-element product; matDst is both the second operand and the output.
    for (pass = 0; pass != kRepeats; ++pass)
        cv::multiply(matSrc, matDst, matDst);
    perfMeasure_end("multiply");

    for (pass = 0; pass != kRepeats; ++pass)
        cv::multiply(matSrc, 2.5, matDst);
    perfMeasure_end("multiply_scalar");

    // Per-element quotient; matDst is both the second operand and the output.
    for (pass = 0; pass != kRepeats; ++pass)
        cv::divide(matSrc, matDst, matDst);
    perfMeasure_end("divide");

    for (pass = 0; pass != kRepeats; ++pass)
        cv::divide(matSrc, 2.5, matDst);
    perfMeasure_end("divide_Scalar");

    // matDst = 2.5 * matSrc + 0.6 * matDst + 0
    for (pass = 0; pass != kRepeats; ++pass)
        cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
    perfMeasure_end("addWeighted");
}
/// Runs the OpenCLProc() benchmark twice on the same image: first with
/// cv::Mat buffers, then with cv::UMat buffers, printing the timing of every
/// stage (read, upload, processing, download).
///
/// @param matSrc  Image to benchmark with; its size is printed in the header.
void TestOpenCL(cv::InputArray matSrc)
{
    printf("[PERF] -= Performance Check =-\n");
    printf("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

    // ===================== cv::Mat (CPU) pass =====================
    printf("[PERF] CPU Process\n");
    perfMeasure_start();

    cv::Mat cpuInput, cpuOutput;   // buffers handed to OpenCLProc()
    cv::Mat hostSrc, hostDst;      // caller-side source / final result

    // Read
    hostSrc = matSrc.getMat();
    perfMeasure_end("Read");

    // Transfer (Mat -> Mat copy, mirrors the upload step of the UMat pass)
    hostSrc.copyTo(cpuInput);
    perfMeasure_end("Transfer CPU->CPU");

    // Process
    OpenCLProc(cpuInput, cpuOutput);

    // Transfer (Mat -> Mat copy, mirrors the download step of the UMat pass)
    cpuOutput.copyTo(hostDst);
    perfMeasure_end("Transfer CPU->CPU");
    std::cout << "\n";

    // ===================== cv::UMat pass (copyTo) =====================
    // No perfMeasure_start() here: the previous perfMeasure_end() already
    // reset the reference time.
    printf("[PERF] GPU Process (copyTo)\n");

    cv::UMat gpuInput, gpuOutput;  // buffers handed to OpenCLProc()
    cv::Mat hostSrc1, hostDst1;    // caller-side source / final result

    // Read
    hostSrc1 = matSrc.getMat();
    perfMeasure_end("Read");

    // Transfer (Mat -> UMat)
    hostSrc1.copyTo(gpuInput);
    perfMeasure_end("Transfer CPU->GPU");

    // Process
    OpenCLProc(gpuInput, gpuOutput);

    // Transfer (UMat -> Mat)
    gpuOutput.copyTo(hostDst1);
    perfMeasure_end("Transfer GPU->CPU");
    std::cout << "\n";
}
/// Entry point: loads the image given as the first command-line argument and
/// runs the benchmark twice on it.
///
/// @param argc  Number of command-line arguments.
/// @param argv  argv[1] must be the path of the image to load.
/// @return 0 on success, 1 if no image path was given or the image could not
///         be loaded.
int main(int argc, const char** argv)
{
    // argv[1] only exists when argc >= 2. Comparing argv[1] with "" would
    // compare pointers, not characters, so test the first character instead.
    if (argc < 2 || argv[1][0] == '\0')
    {
        std::cerr << "Please insert image path" << std::endl;
        return 1;
    }

    cv::Mat matSrc = cv::imread(argv[1]);

    // imread reports failure by returning an empty matrix; stop here instead
    // of letting the first OpenCV call in the benchmark fail on it.
    if (matSrc.empty())
    {
        std::cerr << "Failed to load image: " << argv[1] << std::endl;
        return 1;
    }

    // First round of process to let the GPU initialize
    TestOpenCL(matSrc);
    // The code performance is made based on result from this function
    TestOpenCL(matSrc);

    return 0;
}
And here are the results, which I plotted in a bar chart. Notice that the GPU-enabled processes always outperformed the CPU implementation except for the multiplication and addWeighted functions.
Is there a problem with the OpenCV ...
What is the image size? What is m_util? To test your example we need the full code...
Image size is 4252x2835
m_util is just a process-time measurement class. I will modify the code to be compile-friendly.
You can use the TickMeter class.
Done, I think I have provided all the necessary code for testing.
I don't think that the results are the same as yours.