Ask Your Question

Revision history [back]

click to hide/show revision 1
initial version

Slow matrix multiplication when using OpenCL enabled OpenCV

I made a simple program to test the performance of OpenCV with and without the help of GPU. GPU code is implemented using OpenCV/OpenCL Transparent API method.

Overall, GPU enabled process always runs faster than CPU except for matrix multiplication which is surprising considering multiplication process is bread and butter for a good GPU performance.

Here is the code used to run my simple test:

/**
 * Runs a fixed sequence of OpenCV operations, repeating each one 50 times and
 * reporting a timing checkpoint through m_util after every operation.
 *
 * The stages chain through matDst: cvtColor writes it, GaussianBlur / Canny /
 * dilate update it in place, add overwrites it with matSrc + matSrc, and the
 * array multiply / divide / addWeighted stages use it both as an operand and
 * as the destination, so each repetition consumes the previous one's result.
 *
 * NOTE(review): when called with cv::UMat arguments the operations may be
 * queued asynchronously -- confirm whether a synchronisation point is needed
 * before each checkpoint for the numbers to be comparable.
 *
 * @param matSrc Input image; it is converted with COLOR_BGR2GRAY, so
 *               presumably a BGR image -- verify against the caller.
 * @param matDst Receives the output of every stage; on return it holds the
 *               result of the last addWeighted pass.
 */
void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
{
    const int kRepeats = 50;  // repetitions of every operation

    m_util.perfMeasure_start();

    for (int pass = 0; pass < kRepeats; ++pass)
        cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
    m_util.perfMeasure_end("cvtColor");

    for (int pass = 0; pass < kRepeats; ++pass)
        cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
    m_util.perfMeasure_end("GaussianBlur");

    for (int pass = 0; pass < kRepeats; ++pass)
        cv::Canny(matDst, matDst, 0, 50);
    m_util.perfMeasure_end("Canny");

    for (int pass = 0; pass < kRepeats; ++pass)
        cv::dilate(matDst, matDst, cv::noArray());
    m_util.perfMeasure_end("Dilate");

    // From here on the source image is used again; matDst is overwritten.
    for (int pass = 0; pass < kRepeats; ++pass)
        cv::add(matSrc, matSrc, matDst);
    m_util.perfMeasure_end("Add");

    for (int pass = 0; pass < kRepeats; ++pass)
        cv::multiply(matSrc, matDst, matDst);
    m_util.perfMeasure_end("multiply");

    for (int pass = 0; pass < kRepeats; ++pass)
        cv::multiply(matSrc, 2.5, matDst);
    m_util.perfMeasure_end("multiply_scalar");

    for (int pass = 0; pass < kRepeats; ++pass)
        cv::divide(matSrc, matDst, matDst);
    m_util.perfMeasure_end("divide");

    for (int pass = 0; pass < kRepeats; ++pass)
        cv::divide(matSrc, 2.5, matDst);
    m_util.perfMeasure_end("divide_Scalar");

    for (int pass = 0; pass < kRepeats; ++pass)
        cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
    m_util.perfMeasure_end("addWeighted");
}

/**
 * Runs the OpenCLProc pipeline twice on the same image -- first with cv::Mat
 * buffers (reported as "CPU"), then with cv::UMat buffers (reported as
 * "GPU") -- and emits a timing checkpoint after every step.
 *
 * Each pass obtains the image with getMat(), copies it into the working
 * buffer, runs OpenCLProc and copies the result back into a cv::Mat.
 *
 * @param matSrc Image to benchmark with; it is only read.
 */
void TestOpenCL(cv::InputArray matSrc)
{
    m_util.printConsole("[PERF] -= Performance Check =-\n");
    m_util.printConsole("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

    // ---------------- Pass 1: cv::Mat buffers ----------------
    m_util.printConsole("[PERF] CPU Process\n");
    m_util.perfMeasure_start();
    cv::Mat cpuWork;
    cv::Mat cpuResult;
    cv::Mat cpuHostIn;
    cv::Mat cpuHostOut;

    // Read
    cpuHostIn = matSrc.getMat();
    m_util.perfMeasure_end("Read");

    // Copy into the working buffer
    cpuHostIn.copyTo(cpuWork);
    m_util.perfMeasure_end("Transfer CPU->CPU");

    // Process
    OpenCLProc(cpuWork, cpuResult);

    // Copy the result back out
    cpuResult.copyTo(cpuHostOut);
    m_util.perfMeasure_end("Transfer CPU->CPU");

    std::cout << "\n";

    // ---------------- Pass 2: cv::UMat buffers ----------------
    m_util.printConsole("[PERF] GPU Process (copyTo)\n");
    cv::UMat devWork;
    cv::UMat devResult;
    cv::Mat devHostIn;
    cv::Mat devHostOut;

    // Read
    devHostIn = matSrc.getMat();
    m_util.perfMeasure_end("Read");

    // Mat -> UMat
    devHostIn.copyTo(devWork);
    m_util.perfMeasure_end("Transfer CPU->GPU");

    // Process
    OpenCLProc(devWork, devResult);

    // UMat -> Mat
    devResult.copyTo(devHostOut);
    m_util.perfMeasure_end("Transfer GPU->CPU");
    std::cout << "\n";
}

And here are the results, which I plotted in a bar chart. Notice that the GPU-enabled processes always outperformed the CPU implementation, except for the multiplication and addWeighted functions.

Is there a problem with the OpenCV function? How can the multiplication function take longer than division?

enter image description here

This process is run on - Library : OpenCV3.3.1 build with OpenCL and TBB - CPU : Intel i7-6500U - GPU : Intel HD Graphics 520

Slow matrix multiplication when using OpenCL enabled OpenCV

I made a simple program to test the performance of OpenCV with and without the help of GPU. GPU code is implemented using OpenCV/OpenCL Transparent API method.

Overall, GPU enabled process always runs faster than CPU except for matrix multiplication which is surprising considering multiplication process is bread and butter for a good GPU performance.

Here is the code used to run my simple test:

    // Here are the functions used to measure time
    // Tick count of the most recent timing checkpoint; updated by the
    // perfMeasure_* helpers.
    double m_dTime = 0;

    /** Stores the current tick count as the start of the next timed section. */
    void perfMeasure_start()
    {
        m_dTime = static_cast<double>(cv::getTickCount());
    }

    /**
     * Prints the time elapsed since the previous checkpoint and starts a new one.
     *
     * The difference between the current tick count and m_dTime is divided by
     * cv::getTickFrequency() and printed, multiplied by 1000, as milliseconds
     * next to the label. m_dTime is then moved to the current tick count, so
     * consecutive calls time back-to-back sections.
     *
     * @param strLabel Name printed for the section that just finished.
     */
    void perfMeasure_end(std::string strLabel)
    {
        const double dNow = cv::getTickCount();
        const double dElapsedSec = (dNow - m_dTime) / cv::getTickFrequency();

        printf("[TIME]%25s : %.3lf ms\n", strLabel.c_str(), dElapsedSec * 1000);
        m_dTime = dNow;
    }

    /**
     * Runs a fixed sequence of OpenCV operations, repeating each one 50 times
     * and reporting a timing checkpoint through m_util after every operation.
     *
     * The stages chain through matDst: cvtColor writes it, GaussianBlur /
     * Canny / dilate update it in place, add overwrites it with
     * matSrc + matSrc, and the array multiply / divide / addWeighted stages
     * use it both as an operand and as the destination, so each repetition
     * consumes the previous one's result.
     *
     * NOTE(review): when called with cv::UMat arguments the operations may be
     * queued asynchronously -- confirm whether a synchronisation point is
     * needed before each checkpoint for the numbers to be comparable.
     *
     * @param matSrc Input image; it is converted with COLOR_BGR2GRAY, so
     *               presumably a BGR image -- verify against the caller.
     * @param matDst Receives the output of every stage; on return it holds
     *               the result of the last addWeighted pass.
     */
    void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
    {
        const int kRepeats = 50;  // repetitions of every operation

        // The timer is started through m_util and the global m_dTime is
        // reset directly as well.
        m_util.perfMeasure_start();
        m_dTime = cv::getTickCount();

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
        m_util.perfMeasure_end("cvtColor");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
        m_util.perfMeasure_end("GaussianBlur");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::Canny(matDst, matDst, 0, 50);
        m_util.perfMeasure_end("Canny");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::dilate(matDst, matDst, cv::noArray());
        m_util.perfMeasure_end("Dilate");

        // From here on the source image is used again; matDst is overwritten.
        for (int pass = 0; pass < kRepeats; ++pass)
            cv::add(matSrc, matSrc, matDst);
        m_util.perfMeasure_end("Add");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::multiply(matSrc, matDst, matDst);
        m_util.perfMeasure_end("multiply");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::multiply(matSrc, 2.5, matDst);
        m_util.perfMeasure_end("multiply_scalar");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::divide(matSrc, matDst, matDst);
        m_util.perfMeasure_end("divide");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::divide(matSrc, 2.5, matDst);
        m_util.perfMeasure_end("divide_Scalar");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
        m_util.perfMeasure_end("addWeighted");
    }

 /**
  * Runs the OpenCLProc pipeline twice on the same image -- first with cv::Mat
  * buffers (reported as "CPU"), then with cv::UMat buffers (reported as
  * "GPU") -- and emits a timing checkpoint after every step.
  *
  * Each pass obtains the image with getMat(), copies it into the working
  * buffer, runs OpenCLProc and copies the result back into a cv::Mat.
  *
  * @param matSrc Image to benchmark with; it is only read.
  */
 void TestOpenCL(cv::InputArray matSrc)
 {
     m_util.printConsole("[PERF] -= Performance Check =-\n");
     m_util.printConsole("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

     // ---------------- Pass 1: cv::Mat buffers ----------------
     m_util.printConsole("[PERF] CPU Process\n");
     m_util.perfMeasure_start();
     cv::Mat cpuWork;
     cv::Mat cpuResult;
     cv::Mat cpuHostIn;
     cv::Mat cpuHostOut;

     // Read
     cpuHostIn = matSrc.getMat();
     m_util.perfMeasure_end("Read");

     // Copy into the working buffer
     cpuHostIn.copyTo(cpuWork);
     m_util.perfMeasure_end("Transfer CPU->CPU");

     // Process
     OpenCLProc(cpuWork, cpuResult);

     // Copy the result back out
     cpuResult.copyTo(cpuHostOut);
     m_util.perfMeasure_end("Transfer CPU->CPU");

     std::cout << "\n";

     // ---------------- Pass 2: cv::UMat buffers ----------------
     m_util.printConsole("[PERF] GPU Process (copyTo)\n");
     cv::UMat devWork;
     cv::UMat devResult;
     cv::Mat devHostIn;
     cv::Mat devHostOut;

     // Read
     devHostIn = matSrc.getMat();
     m_util.perfMeasure_end("Read");

     // Mat -> UMat
     devHostIn.copyTo(devWork);
     m_util.perfMeasure_end("Transfer CPU->GPU");

     // Process
     OpenCLProc(devWork, devResult);

     // UMat -> Mat
     devResult.copyTo(devHostOut);
     m_util.perfMeasure_end("Transfer GPU->CPU");
     std::cout << "\n";
 }


int main(int argc, const char** argv)
{
    if (argv[1] == "")
        std::cout << "Please insert image path" << std::endl;
    else {

        cv::Mat matSrc = cv::imread(argv[1]);

            // First round of process to let the GPU initialize
        TestOpenCL(matSrc);

            // The code performance is made based on result from this function
        TestOpenCL(matSrc);

    }


    return 0;
}

And here are the results, which I plotted in a bar chart. Notice that the GPU-enabled processes always outperformed the CPU implementation, except for the multiplication and addWeighted functions.

Is there a problem with the OpenCV function? How can the multiplication function take longer than division?

enter image description here

This process is run on - Library : OpenCV3.3.1 build with OpenCL and TBB - CPU : Intel i7-6500U - GPU : Intel HD Graphics 520

Slow matrix multiplication when using OpenCL enabled OpenCV

I made a simple program to test the performance of OpenCV with and without the help of GPU. GPU code is implemented using OpenCV/OpenCL Transparent API method.

Overall, GPU enabled process always runs faster than CPU except for matrix multiplication which is surprising considering multiplication process is bread and butter for a good GPU performance.

Here is the code used to run my simple test:

    // Here are the functions used to measure time
    // Tick count of the most recent timing checkpoint; updated by the
    // perfMeasure_* helpers.
    double m_dTime = 0;

    /** Stores the current tick count as the start of the next timed section. */
    void perfMeasure_start()
    {
        m_dTime = static_cast<double>(cv::getTickCount());
    }

    /**
     * Prints the time elapsed since the previous checkpoint and starts a new one.
     *
     * The difference between the current tick count and m_dTime is divided by
     * cv::getTickFrequency() and printed, multiplied by 1000, as milliseconds
     * next to the label. m_dTime is then moved to the current tick count, so
     * consecutive calls time back-to-back sections.
     *
     * @param strLabel Name printed for the section that just finished.
     */
    void perfMeasure_end(std::string strLabel)
    {
        const double dNow = cv::getTickCount();
        const double dElapsedSec = (dNow - m_dTime) / cv::getTickFrequency();

        printf("[TIME]%25s : %.3lf ms\n", strLabel.c_str(), dElapsedSec * 1000);
        m_dTime = dNow;
    }

    /**
     * Runs a fixed sequence of OpenCV operations, repeating each one 50 times
     * and printing one timing line per operation via perfMeasure_end().
     *
     * Each checkpoint is reported exactly once through the free
     * perfMeasure_* helpers. The leftover m_util.perfMeasure_end() calls that
     * preceded them were removed: with both in place every stage was reported
     * twice, the second line covering only the time between the two calls.
     *
     * The stages chain through matDst: cvtColor writes it, GaussianBlur /
     * Canny / dilate update it in place, add overwrites it with
     * matSrc + matSrc, and the array multiply / divide / addWeighted stages
     * use it both as an operand and as the destination, so each repetition
     * consumes the previous one's result.
     *
     * NOTE(review): when called with cv::UMat arguments the operations may be
     * queued asynchronously -- confirm whether a synchronisation point is
     * needed before each checkpoint for the numbers to be comparable.
     *
     * @param matSrc Input image; it is converted with COLOR_BGR2GRAY, so
     *               presumably a BGR image -- verify against the caller.
     * @param matDst Receives the output of every stage; on return it holds
     *               the result of the last addWeighted pass.
     */
    void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
    {
        const int kRepeats = 50;  // repetitions of every operation

        // perfMeasure_start() already resets m_dTime; no separate write needed.
        perfMeasure_start();

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
        perfMeasure_end("cvtColor");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
        perfMeasure_end("GaussianBlur");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::Canny(matDst, matDst, 0, 50);
        perfMeasure_end("Canny");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::dilate(matDst, matDst, cv::noArray());
        perfMeasure_end("Dilate");

        // From here on the source image is used again; matDst is overwritten.
        for (int pass = 0; pass < kRepeats; ++pass)
            cv::add(matSrc, matSrc, matDst);
        perfMeasure_end("Add");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::multiply(matSrc, matDst, matDst);
        perfMeasure_end("multiply");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::multiply(matSrc, 2.5, matDst);
        perfMeasure_end("multiply_scalar");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::divide(matSrc, matDst, matDst);
        perfMeasure_end("divide");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::divide(matSrc, 2.5, matDst);
        perfMeasure_end("divide_Scalar");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
        perfMeasure_end("addWeighted");
    }

    /**
     * Runs the OpenCLProc pipeline twice on the same image -- first with
     * cv::Mat buffers (reported as "CPU"), then with cv::UMat buffers
     * (reported as "GPU") -- and prints a timing line after every step.
     *
     * Each pass obtains the image with getMat(), copies it into the working
     * buffer, runs OpenCLProc and copies the result back into a cv::Mat.
     * All output and timing goes through printf and the free perfMeasure_*
     * helpers; the half-replaced m_util.* fragments that had been fused into
     * those statements (and left them unparseable) are gone.
     *
     * The timer is not restarted before the UMat pass, so that pass's "Read"
     * line also covers the time since the preceding checkpoint.
     *
     * @param matSrc Image to benchmark with; it is only read.
     */
    void TestOpenCL(cv::InputArray matSrc)
    {
        printf("[PERF] -= Performance Check =-\n");
        printf("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

        //////////// CPU MEASUREMENT CODE //////////////////////
        printf("[PERF] CPU Process\n");
        perfMeasure_start();
        cv::Mat img, gray;
        cv::Mat matImg, matDst;

        // Read
        matImg = matSrc.getMat();
        perfMeasure_end("Read");

        // Transfer
        matImg.copyTo(img);
        perfMeasure_end("Transfer CPU->CPU");

        // Process
        OpenCLProc(img, gray);

        // Transfer
        gray.copyTo(matDst);
        perfMeasure_end("Transfer CPU->CPU");

        std::cout << "\n";

        //////////////////// GPU MEASUREMENT CODE 1 //////////////////////
        printf("[PERF] GPU Process (copyTo)\n");
        cv::UMat img1, gray1;
        cv::Mat matImg1, matDst1;

        // Read
        matImg1 = matSrc.getMat();
        perfMeasure_end("Read");

        // Transfer
        matImg1.copyTo(img1);
        perfMeasure_end("Transfer CPU->GPU");

        // Process
        OpenCLProc(img1, gray1);

        // Transfer
        gray1.copyTo(matDst1);
        perfMeasure_end("Transfer GPU->CPU");
        std::cout << "\n";
    }


int main(int argc, const char** argv)
{
    if (argv[1] == "")
        std::cout << "Please insert image path" << std::endl;
    else {

        cv::Mat matSrc = cv::imread(argv[1]);

            // First round of process to let the GPU initialize
        TestOpenCL(matSrc);

            // The code performance is made based on result from this function
        TestOpenCL(matSrc);

    }


    return 0;
}

And here are the results, which I plotted in a bar chart. Notice that the GPU-enabled processes always outperformed the CPU implementation, except for the multiplication and addWeighted functions.

Is there a problem with the OpenCV function? How can the multiplication function take longer than division?

enter image description here

This process is run on - Library : OpenCV3.3.1 build with OpenCL and TBB - CPU : Intel i7-6500U - GPU : Intel HD Graphics 520

Slow matrix multiplication when using OpenCL enabled OpenCV

I made a simple program to test the performance of OpenCV with and without the help of GPU. GPU code is implemented using OpenCV/OpenCL Transparent API method.

Overall, GPU enabled process always runs faster than CPU except for matrix multiplication which is surprising considering multiplication process is bread and butter for a good GPU performance.

Here is the code used to run my simple test:

 #include <opencv2\opencv.hpp>
    #include <opencv2\opencv_modules.hpp>
    #include <opencv2\core\ocl.hpp>
    #include <iostream>
    #include <fstream>

    // Here are the functions used to measure time
    // Tick count of the most recent timing checkpoint; updated by the
    // perfMeasure_* helpers.
    double m_dTime = 0;

    /** Stores the current tick count as the start of the next timed section. */
    void perfMeasure_start()
    {
        m_dTime = static_cast<double>(cv::getTickCount());
    }

    /**
     * Prints the time elapsed since the previous checkpoint and starts a new one.
     *
     * The difference between the current tick count and m_dTime is divided by
     * cv::getTickFrequency() and printed, multiplied by 1000, as milliseconds
     * next to the label. m_dTime is then moved to the current tick count, so
     * consecutive calls time back-to-back sections.
     *
     * @param strLabel Name printed for the section that just finished.
     */
    void perfMeasure_end(std::string strLabel)
    {
        const double dNow = cv::getTickCount();
        const double dElapsedSec = (dNow - m_dTime) / cv::getTickFrequency();

        printf("[TIME]%25s : %.3lf ms\n", strLabel.c_str(), dElapsedSec * 1000);
        m_dTime = dNow;
    }

    /**
     * Runs a fixed sequence of OpenCV operations, repeating each one 50 times
     * and printing one timing line per operation via perfMeasure_end().
     *
     * The stages chain through matDst: cvtColor writes it, GaussianBlur /
     * Canny / dilate update it in place, add overwrites it with
     * matSrc + matSrc, and the array multiply / divide / addWeighted stages
     * use it both as an operand and as the destination, so each repetition
     * consumes the previous one's result.
     *
     * NOTE(review): when called with cv::UMat arguments the operations may be
     * queued asynchronously -- confirm whether a synchronisation point is
     * needed before each checkpoint for the numbers to be comparable.
     *
     * @param matSrc Input image; it is converted with COLOR_BGR2GRAY, so
     *               presumably a BGR image -- verify against the caller.
     * @param matDst Receives the output of every stage; on return it holds
     *               the result of the last addWeighted pass.
     */
    void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
    {
        const int kRepeats = 50;  // repetitions of every operation

        perfMeasure_start();

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
        perfMeasure_end("cvtColor");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
        perfMeasure_end("GaussianBlur");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::Canny(matDst, matDst, 0, 50);
        perfMeasure_end("Canny");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::dilate(matDst, matDst, cv::noArray());
        perfMeasure_end("Dilate");

        // From here on the source image is used again; matDst is overwritten.
        for (int pass = 0; pass < kRepeats; ++pass)
            cv::add(matSrc, matSrc, matDst);
        perfMeasure_end("Add");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::multiply(matSrc, matDst, matDst);
        perfMeasure_end("multiply");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::multiply(matSrc, 2.5, matDst);
        perfMeasure_end("multiply_scalar");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::divide(matSrc, matDst, matDst);
        perfMeasure_end("divide");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::divide(matSrc, 2.5, matDst);
        perfMeasure_end("divide_Scalar");

        for (int pass = 0; pass < kRepeats; ++pass)
            cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
        perfMeasure_end("addWeighted");
    }

    /**
     * Runs the OpenCLProc pipeline twice on the same image -- first with
     * cv::Mat buffers (reported as "CPU"), then with cv::UMat buffers
     * (reported as "GPU") -- and prints a timing line after every step.
     *
     * Each pass obtains the image with getMat(), copies it into the working
     * buffer, runs OpenCLProc and copies the result back into a cv::Mat.
     * The timer is started once, through the free perfMeasure_start(); the
     * stale m_util.perfMeasure_start() call that preceded it referred to an
     * object this listing never declares and has been removed.
     *
     * The timer is not restarted before the UMat pass, so that pass's "Read"
     * line also covers the time since the preceding checkpoint.
     *
     * @param matSrc Image to benchmark with; it is only read.
     */
    void TestOpenCL(cv::InputArray matSrc)
    {
        printf("[PERF] -= Performance Check =-\n");
        printf("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

        //////////// CPU MEASUREMENT CODE //////////////////////
        printf("[PERF] CPU Process\n");
        perfMeasure_start();
        cv::Mat img, gray;
        cv::Mat matImg, matDst;

        // Read
        matImg = matSrc.getMat();
        perfMeasure_end("Read");

        // Transfer
        matImg.copyTo(img);
        perfMeasure_end("Transfer CPU->CPU");

        // Process
        OpenCLProc(img, gray);

        // Transfer
        gray.copyTo(matDst);
        perfMeasure_end("Transfer CPU->CPU");

        std::cout << "\n";

        //////////////////// GPU MEASUREMENT CODE 1 //////////////////////
        printf("[PERF] GPU Process (copyTo)\n");
        cv::UMat img1, gray1;
        cv::Mat matImg1, matDst1;

        // Read
        matImg1 = matSrc.getMat();
        perfMeasure_end("Read");

        // Transfer
        matImg1.copyTo(img1);
        perfMeasure_end("Transfer CPU->GPU");

        // Process
        OpenCLProc(img1, gray1);

        // Transfer
        gray1.copyTo(matDst1);
        perfMeasure_end("Transfer GPU->CPU");
        std::cout << "\n";
    }


int main(int argc, const char** argv)
{
    if (argv[1] == "")
        std::cout << "Please insert image path" << std::endl;
    else {

        cv::Mat matSrc = cv::imread(argv[1]);

            // First round of process to let the GPU initialize
        TestOpenCL(matSrc);

            // The code performance is made based on result from this function
        TestOpenCL(matSrc);

    }


    return 0;
}

And here are the results, which I plotted in a bar chart. Notice that the GPU-enabled processes always outperformed the CPU implementation, except for the multiplication and addWeighted functions.

Is there a problem with the OpenCV function? How can the multiplication function take longer than division?

enter image description here

This process is run on - Library : OpenCV3.3.1 build with OpenCL and TBB - CPU : Intel i7-6500U - GPU : Intel HD Graphics 520