Ask Your Question

Revision history [back]

That's not an answer, but it is easier to insert code and results here:

only to compile kernels *******

[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME]                     Read : 0.001 ms
[TIME]        Transfer CPU->CPU : 4.584 ms
[TIME]                 cvtColor : 59.819 ms
[TIME]             GaussianBlur : 584.149 ms
[TIME]                    Canny : 492.338 ms
[TIME]                   Dilate : 43.844 ms
[TIME]                      Add : 110.266 ms
[TIME]                 multiply : 88.820 ms
[TIME]          multiply_scalar : 767.707 ms
[TIME]                   divide : 354.996 ms
[TIME]            divide_Scalar : 1784.366 ms
[TIME]              addWeighted : 318.085 ms
[TIME]        Transfer CPU->CPU : 3.922 ms

[PERF] GPU Process (copyTo)
[TIME]                     Read : 0.052 ms
[ INFO:0] Initialize OpenCL runtime...
[TIME]        Transfer CPU->GPU : 711.447 ms
[ INFO:0] Successfully initialized OpenCL cache directory: C:\Users\LAUREN~1.PC-\AppData\Local\Temp\opencv\3.4.0-dev\opencl_cache\
[ INFO:0] Preparing OpenCL cache configuration for context: NVIDIA_Corporation--GeForce_GTX_970--376_53
[TIME]                 cvtColor : 72.888 ms
[TIME]             GaussianBlur : 343.475 ms
[TIME]                    Canny : 195.293 ms
[TIME]                   Dilate : 85.200 ms
[TIME]                      Add : 9.628 ms
[TIME]                 multiply : 9.469 ms
[TIME]          multiply_scalar : 13.250 ms
[TIME]                   divide : 29.945 ms
[TIME]            divide_Scalar : 32.137 ms
[TIME]              addWeighted : 94.774 ms
[TIME]        Transfer GPU->CPU : 19.806 ms

Real Test *************************
[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME]                     Read : 0.001 ms
[TIME]        Transfer CPU->CPU : 5.736 ms
[TIME]                 cvtColor : 65.285 ms
[TIME]             GaussianBlur : 582.992 ms
[TIME]                    Canny : 428.836 ms
[TIME]                   Dilate : 44.551 ms
[TIME]                      Add : 93.305 ms
[TIME]                 multiply : 95.427 ms
[TIME]          multiply_scalar : 781.954 ms
[TIME]                   divide : 366.936 ms
[TIME]            divide_Scalar : 1782.807 ms
[TIME]              addWeighted : 336.931 ms
[TIME]        Transfer CPU->CPU : 8.400 ms

[PERF] GPU Process (copyTo)
[TIME]                     Read : 0.063 ms
[TIME]        Transfer CPU->GPU : 11.910 ms
[TIME]                 cvtColor : 1.609 ms
[TIME]             GaussianBlur : 328.819 ms
[TIME]                    Canny : 168.397 ms
[TIME]                   Dilate : 77.406 ms
[TIME]                      Add : 3.617 ms
[TIME]                 multiply : 1.209 ms
[TIME]          multiply_scalar : 1.350 ms
[TIME]                   divide : 1.892 ms
[TIME]            divide_Scalar : 2.366 ms
[TIME]              addWeighted : 2.807 ms
[TIME]        Transfer GPU->CPU : 168.320 ms


#include <iostream>
#include <string>

#include <opencv2/core/ocl.hpp>
#include <opencv2/opencv.hpp>

using namespace cv;
using namespace std;

// Here are the functions used to measure time
double m_dTime = 0;
TickMeter aChrono;

/// @brief Begin a fresh timing interval on the shared stopwatch.
///
/// Clears whatever the global `aChrono` has accumulated so far and starts it
/// counting again.
void perfMeasure_start()
{
    aChrono.reset();  // discard previously accumulated time
    aChrono.start();  // measure from this point on
}

/// @brief Close the current timing interval, print it, and open the next one.
///
/// Stops the shared `aChrono`, prints one "[TIME]" line with the label
/// right-aligned in a 25-character column followed by the elapsed
/// milliseconds, then resets and restarts the stopwatch so that consecutive
/// calls time back-to-back stages.
///
/// @param strLabel  Text printed in the label column of the "[TIME]" line.
void perfMeasure_end(std::string strLabel)
{
    aChrono.stop();
    const double elapsedMs = aChrono.getTimeMilli();
    const char* const label = strLabel.c_str();
    printf("[TIME]%25s : %.3lf ms\n", label, elapsedMs);

    // Re-arm immediately: the next stage is measured from here.
    aChrono.reset();
    aChrono.start();
}

void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
{
    perfMeasure_start();
    for (int i = 0; i < 50; i++)
    {
        cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
    }
    perfMeasure_end("cvtColor");

    for (int i = 0; i < 50; i++)
    {
        cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
    }
    perfMeasure_end("GaussianBlur");

    for (int i = 0; i < 50; i++)
    {
        cv::Canny(matDst, matDst, 0, 50);
    }
    perfMeasure_end("Canny");

    for (int i = 0; i < 50; i++)
    {
        cv::dilate(matDst, matDst, cv::noArray());
    }
    perfMeasure_end("Dilate");

    for (int i = 0; i < 50; i++)
    {
        cv::add(matSrc, matSrc, matDst);
    }
    perfMeasure_end("Add");

    for (int i = 0; i < 50; i++)
    {
        cv::multiply(matSrc, matDst, matDst);
    }
    perfMeasure_end("multiply");

    for (int i = 0; i < 50; i++)
    {
        cv::multiply(matSrc, 2.5, matDst);
    }
    perfMeasure_end("multiply_scalar");

    for (int i = 0; i < 50; i++)
    {
        cv::divide(matSrc, matDst, matDst);
    }
    perfMeasure_end("divide");

    for (int i = 0; i < 50; i++)
    {
        cv::divide(matSrc, 2.5, matDst);
    }
    perfMeasure_end("divide_Scalar");

    for (int i = 0; i < 50; i++)
    {
        cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
    }
    perfMeasure_end("addWeighted");

}

/// @brief Benchmark the pipeline once on cv::Mat and once on cv::UMat.
///
/// Prints a header with the image size, then runs two passes over the same
/// input. Each pass reports a "Read" step (getMat on the input), a copy into
/// the working container, the stages printed by OpenCLProc(), and a copy of
/// the result back into a cv::Mat.
///
/// @param matSrc  Image to benchmark with; only read.
void TestOpenCL(cv::InputArray matSrc)
{
    printf("[PERF] -= Performance Check =-\n");
    printf("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

    // ---------------- Pass 1: cv::Mat containers ----------------
    printf("[PERF] CPU Process\n");
    perfMeasure_start();

    // Read
    cv::Mat hostView = matSrc.getMat();
    perfMeasure_end("Read");

    // Transfer: copy the input into the working matrix
    cv::Mat hostInput;
    hostView.copyTo(hostInput);
    perfMeasure_end("Transfer CPU->CPU");

    // Process
    cv::Mat hostResult;
    OpenCLProc(hostInput, hostResult);

    // Transfer: copy the result out again
    cv::Mat hostCopy;
    hostResult.copyTo(hostCopy);
    perfMeasure_end("Transfer CPU->CPU");

    std::cout << "\n";

    // ---------------- Pass 2: cv::UMat containers ----------------
    // The stopwatch was restarted by the previous perfMeasure_end(), so the
    // "Read" figure of this pass also covers the two prints just above/below.
    printf("[PERF] GPU Process (copyTo)\n");

    // Read
    cv::Mat deviceSrcView = matSrc.getMat();
    perfMeasure_end("Read");

    // Transfer: Mat -> UMat
    cv::UMat deviceInput;
    deviceSrcView.copyTo(deviceInput);
    perfMeasure_end("Transfer CPU->GPU");

    // Process
    cv::UMat deviceResult;
    OpenCLProc(deviceInput, deviceResult);

    // Transfer: UMat -> Mat
    cv::Mat downloaded;
    deviceResult.copyTo(downloaded);
    perfMeasure_end("Transfer GPU->CPU");

    std::cout << "\n";
}


/// @brief Load the test image, enlarge it 4x, and run the benchmark twice.
///
/// The first TestOpenCL() run is a warm-up (its banner says it is only there
/// to compile kernels); the second run is the one whose numbers are meant to
/// be read.
///
/// @param argc  Argument count.
/// @param argv  Optional argv[1]: path of the image to load. When absent the
///              original hard-coded sample path is used.
/// @return 0 on success, 1 when the image could not be read.
int main(int argc, const char** argv)
{
    const std::string imagePath =
        (argc > 1) ? argv[1] : "g:/lib/opencv/samples/data/lena.jpg";

    cv::Mat matSrc = cv::imread(imagePath);
    if (matSrc.empty())
    {
        // imread signals failure with an empty matrix; resizing it would abort.
        std::cerr << "Could not read image: " << imagePath << "\n";
        return 1;
    }
    cv::resize(matSrc, matSrc, cv::Size(), 4, 4, cv::INTER_LINEAR);

    // First round of process to let the GPU initialize
    std::cout << "only to compile kernels *************************\n";
    TestOpenCL(matSrc);

    // The code performance is made based on result from this function
    std::cout << "Real Test *************************\n";
    TestOpenCL(matSrc);
    return 0;
}

That's not an answer, but it is easier to insert code and results here:

without image description or image description

only to compile kernels *******

[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME]                     Read : 0.001 ms
[TIME]        Transfer CPU->CPU : 4.584 ms
[TIME]                 cvtColor : 59.819 ms
[TIME]             GaussianBlur : 584.149 ms
[TIME]                    Canny : 492.338 ms
[TIME]                   Dilate : 43.844 ms
[TIME]                      Add : 110.266 ms
[TIME]                 multiply : 88.820 ms
[TIME]          multiply_scalar : 767.707 ms
[TIME]                   divide : 354.996 ms
[TIME]            divide_Scalar : 1784.366 ms
[TIME]              addWeighted : 318.085 ms
[TIME]        Transfer CPU->CPU : 3.922 ms

[PERF] GPU Process (copyTo)
[TIME]                     Read : 0.052 ms
[ INFO:0] Initialize OpenCL runtime...
[TIME]        Transfer CPU->GPU : 711.447 ms
[ INFO:0] Successfully initialized OpenCL cache directory: C:\Users\LAUREN~1.PC-\AppData\Local\Temp\opencv\3.4.0-dev\opencl_cache\
[ INFO:0] Preparing OpenCL cache configuration for context: NVIDIA_Corporation--GeForce_GTX_970--376_53
[TIME]                 cvtColor : 72.888 ms
[TIME]             GaussianBlur : 343.475 ms
[TIME]                    Canny : 195.293 ms
[TIME]                   Dilate : 85.200 ms
[TIME]                      Add : 9.628 ms
[TIME]                 multiply : 9.469 ms
[TIME]          multiply_scalar : 13.250 ms
[TIME]                   divide : 29.945 ms
[TIME]            divide_Scalar : 32.137 ms
[TIME]              addWeighted : 94.774 ms
[TIME]        Transfer GPU->CPU : 19.806 ms

Real Test *************************
[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME]                     Read : 0.001 ms
[TIME]        Transfer CPU->CPU : 5.736 ms
[TIME]                 cvtColor : 65.285 ms
[TIME]             GaussianBlur : 582.992 ms
[TIME]                    Canny : 428.836 ms
[TIME]                   Dilate : 44.551 ms
[TIME]                      Add : 93.305 ms
[TIME]                 multiply : 95.427 ms
[TIME]          multiply_scalar : 781.954 ms
[TIME]                   divide : 366.936 ms
[TIME]            divide_Scalar : 1782.807 ms
[TIME]              addWeighted : 336.931 ms
[TIME]        Transfer CPU->CPU : 8.400 ms

[PERF] GPU Process (copyTo)
[TIME]                     Read : 0.063 ms
[TIME]        Transfer CPU->GPU : 11.910 ms
[TIME]                 cvtColor : 1.609 ms
[TIME]             GaussianBlur : 328.819 ms
[TIME]                    Canny : 168.397 ms
[TIME]                   Dilate : 77.406 ms
[TIME]                      Add : 3.617 ms
[TIME]                 multiply : 1.209 ms
[TIME]          multiply_scalar : 1.350 ms
[TIME]                   divide : 1.892 ms
[TIME]            divide_Scalar : 2.366 ms
[TIME]              addWeighted : 2.807 ms
[TIME]        Transfer GPU->CPU : 168.320 ms


#include <opencv2/opencv.hpp>
#include <iostream>

using namespace cv;
using namespace std;

// Here are the functions used to measure time
double m_dTime = 0;
TickMeter aChrono;

/// @brief Begin a fresh timing interval on the shared stopwatch.
///
/// Clears whatever the global `aChrono` has accumulated so far and starts it
/// counting again.
void perfMeasure_start()
{
    aChrono.reset();  // discard previously accumulated time
    aChrono.start();  // measure from this point on
}

/// @brief Close the current timing interval, print it, and open the next one.
///
/// Stops the shared `aChrono`, prints one "[TIME]" line with the label
/// right-aligned in a 25-character column followed by the elapsed
/// milliseconds, then resets and restarts the stopwatch so that consecutive
/// calls time back-to-back stages.
///
/// @param strLabel  Text printed in the label column of the "[TIME]" line.
void perfMeasure_end(std::string strLabel)
{
    aChrono.stop();
    const double elapsedMs = aChrono.getTimeMilli();
    const char* const label = strLabel.c_str();
    printf("[TIME]%25s : %.3lf ms\n", label, elapsedMs);

    // Re-arm immediately: the next stage is measured from here.
    aChrono.reset();
    aChrono.start();
}

void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
{
    perfMeasure_start();
    for (int i = 0; i < 50; i++)
    {
        cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
    }
    perfMeasure_end("cvtColor");

    for (int i = 0; i < 50; i++)
    {
        cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
    }
    perfMeasure_end("GaussianBlur");

    for (int i = 0; i < 50; i++)
    {
        cv::Canny(matDst, matDst, 0, 50);
    }
    perfMeasure_end("Canny");

    for (int i = 0; i < 50; i++)
    {
        cv::dilate(matDst, matDst, cv::noArray());
    }
    perfMeasure_end("Dilate");

    for (int i = 0; i < 50; i++)
    {
        cv::add(matSrc, matSrc, matDst);
    }
    perfMeasure_end("Add");

    for (int i = 0; i < 50; i++)
    {
        cv::multiply(matSrc, matDst, matDst);
    }
    perfMeasure_end("multiply");

    for (int i = 0; i < 50; i++)
    {
        cv::multiply(matSrc, 2.5, matDst);
    }
    perfMeasure_end("multiply_scalar");

    for (int i = 0; i < 50; i++)
    {
        cv::divide(matSrc, matDst, matDst);
    }
    perfMeasure_end("divide");

    for (int i = 0; i < 50; i++)
    {
        cv::divide(matSrc, 2.5, matDst);
    }
    perfMeasure_end("divide_Scalar");

    for (int i = 0; i < 50; i++)
    {
        cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
    }
    perfMeasure_end("addWeighted");

}

/// @brief Benchmark the pipeline once on cv::Mat and once on cv::UMat.
///
/// Prints a header with the image size, then runs two passes over the same
/// input. Each pass reports a "Read" step (getMat on the input), a copy into
/// the working container, the stages printed by OpenCLProc(), and a copy of
/// the result back into a cv::Mat.
///
/// @param matSrc  Image to benchmark with; only read.
void TestOpenCL(cv::InputArray matSrc)
{
    printf("[PERF] -= Performance Check =-\n");
    printf("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

    // ---------------- Pass 1: cv::Mat containers ----------------
    printf("[PERF] CPU Process\n");
    perfMeasure_start();

    // Read
    cv::Mat hostView = matSrc.getMat();
    perfMeasure_end("Read");

    // Transfer: copy the input into the working matrix
    cv::Mat hostInput;
    hostView.copyTo(hostInput);
    perfMeasure_end("Transfer CPU->CPU");

    // Process
    cv::Mat hostResult;
    OpenCLProc(hostInput, hostResult);

    // Transfer: copy the result out again
    cv::Mat hostCopy;
    hostResult.copyTo(hostCopy);
    perfMeasure_end("Transfer CPU->CPU");

    std::cout << "\n";

    // ---------------- Pass 2: cv::UMat containers ----------------
    // The stopwatch was restarted by the previous perfMeasure_end(), so the
    // "Read" figure of this pass also covers the two prints just above/below.
    printf("[PERF] GPU Process (copyTo)\n");

    // Read
    cv::Mat deviceSrcView = matSrc.getMat();
    perfMeasure_end("Read");

    // Transfer: Mat -> UMat
    cv::UMat deviceInput;
    deviceSrcView.copyTo(deviceInput);
    perfMeasure_end("Transfer CPU->GPU");

    // Process
    cv::UMat deviceResult;
    OpenCLProc(deviceInput, deviceResult);

    // Transfer: UMat -> Mat
    cv::Mat downloaded;
    deviceResult.copyTo(downloaded);
    perfMeasure_end("Transfer GPU->CPU");

    std::cout << "\n";
}


/// @brief Load the test image, enlarge it 4x, and run the benchmark twice.
///
/// The first TestOpenCL() run is a warm-up (its banner says it is only there
/// to compile kernels); the second run is the one whose numbers are meant to
/// be read.
///
/// @param argc  Argument count.
/// @param argv  Optional argv[1]: path of the image to load. When absent the
///              original hard-coded sample path is used.
/// @return 0 on success, 1 when the image could not be read.
int main(int argc, const char** argv)
{
    const std::string imagePath =
        (argc > 1) ? argv[1] : "g:/lib/opencv/samples/data/lena.jpg";

    cv::Mat matSrc = cv::imread(imagePath);
    if (matSrc.empty())
    {
        // imread signals failure with an empty matrix; resizing it would abort.
        std::cerr << "Could not read image: " << imagePath << "\n";
        return 1;
    }
    cv::resize(matSrc, matSrc, cv::Size(), 4, 4, cv::INTER_LINEAR);

    // First round of process to let the GPU initialize
    std::cout << "only to compile kernels *************************\n";
    TestOpenCL(matSrc);

    // The code performance is made based on result from this function
    std::cout << "Real Test *************************\n";
    TestOpenCL(matSrc);
    return 0;
}

That's not an answer, but it is easier to insert code and results here

without patch for image description or for image description

only to compile kernels *******

[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME]                     Read : 0.001 ms
[TIME]        Transfer CPU->CPU : 4.584 ms
[TIME]                 cvtColor : 59.819 ms
[TIME]             GaussianBlur : 584.149 ms
[TIME]                    Canny : 492.338 ms
[TIME]                   Dilate : 43.844 ms
[TIME]                      Add : 110.266 ms
[TIME]                 multiply : 88.820 ms
[TIME]          multiply_scalar : 767.707 ms
[TIME]                   divide : 354.996 ms
[TIME]            divide_Scalar : 1784.366 ms
[TIME]              addWeighted : 318.085 ms
[TIME]        Transfer CPU->CPU : 3.922 ms

[PERF] GPU Process (copyTo)
[TIME]                     Read : 0.052 ms
[ INFO:0] Initialize OpenCL runtime...
[TIME]        Transfer CPU->GPU : 711.447 ms
[ INFO:0] Successfully initialized OpenCL cache directory: C:\Users\LAUREN~1.PC-\AppData\Local\Temp\opencv\3.4.0-dev\opencl_cache\
[ INFO:0] Preparing OpenCL cache configuration for context: NVIDIA_Corporation--GeForce_GTX_970--376_53
[TIME]                 cvtColor : 72.888 ms
[TIME]             GaussianBlur : 343.475 ms
[TIME]                    Canny : 195.293 ms
[TIME]                   Dilate : 85.200 ms
[TIME]                      Add : 9.628 ms
[TIME]                 multiply : 9.469 ms
[TIME]          multiply_scalar : 13.250 ms
[TIME]                   divide : 29.945 ms
[TIME]            divide_Scalar : 32.137 ms
[TIME]              addWeighted : 94.774 ms
[TIME]        Transfer GPU->CPU : 19.806 ms

Real Test *************************
[PERF] -= Performance Check =-
[PERF] ImageSize = 2048 x 2048
[PERF] CPU Process
[TIME]                     Read : 0.001 ms
[TIME]        Transfer CPU->CPU : 5.736 ms
[TIME]                 cvtColor : 65.285 ms
[TIME]             GaussianBlur : 582.992 ms
[TIME]                    Canny : 428.836 ms
[TIME]                   Dilate : 44.551 ms
[TIME]                      Add : 93.305 ms
[TIME]                 multiply : 95.427 ms
[TIME]          multiply_scalar : 781.954 ms
[TIME]                   divide : 366.936 ms
[TIME]            divide_Scalar : 1782.807 ms
[TIME]              addWeighted : 336.931 ms
[TIME]        Transfer CPU->CPU : 8.400 ms

[PERF] GPU Process (copyTo)
[TIME]                     Read : 0.063 ms
[TIME]        Transfer CPU->GPU : 11.910 ms
[TIME]                 cvtColor : 1.609 ms
[TIME]             GaussianBlur : 328.819 ms
[TIME]                    Canny : 168.397 ms
[TIME]                   Dilate : 77.406 ms
[TIME]                      Add : 3.617 ms
[TIME]                 multiply : 1.209 ms
[TIME]          multiply_scalar : 1.350 ms
[TIME]                   divide : 1.892 ms
[TIME]            divide_Scalar : 2.366 ms
[TIME]              addWeighted : 2.807 ms
[TIME]        Transfer GPU->CPU : 168.320 ms


#include <opencv2/opencv.hpp>
#include <iostream>

using namespace cv;
using namespace std;

// Here are the functions used to measure time
double m_dTime = 0;
TickMeter aChrono;

/// @brief Begin a fresh timing interval on the shared stopwatch.
///
/// Clears whatever the global `aChrono` has accumulated so far and starts it
/// counting again.
void perfMeasure_start()
{
    aChrono.reset();  // discard previously accumulated time
    aChrono.start();  // measure from this point on
}

/// @brief Close the current timing interval, print it, and open the next one.
///
/// Stops the shared `aChrono`, prints one "[TIME]" line with the label
/// right-aligned in a 25-character column followed by the elapsed
/// milliseconds, then resets and restarts the stopwatch so that consecutive
/// calls time back-to-back stages.
///
/// @param strLabel  Text printed in the label column of the "[TIME]" line.
void perfMeasure_end(std::string strLabel)
{
    aChrono.stop();
    const double elapsedMs = aChrono.getTimeMilli();
    const char* const label = strLabel.c_str();
    printf("[TIME]%25s : %.3lf ms\n", label, elapsedMs);

    // Re-arm immediately: the next stage is measured from here.
    aChrono.reset();
    aChrono.start();
}

void OpenCLProc(cv::InputArray matSrc, cv::OutputArray matDst)
{
    perfMeasure_start();
    for (int i = 0; i < 50; i++)
    {
        cv::cvtColor(matSrc, matDst, cv::COLOR_BGR2GRAY);
    }
    perfMeasure_end("cvtColor");

    for (int i = 0; i < 50; i++)
    {
        cv::GaussianBlur(matDst, matDst, cv::Size(9, 9), 1.5);
    }
    perfMeasure_end("GaussianBlur");

    for (int i = 0; i < 50; i++)
    {
        cv::Canny(matDst, matDst, 0, 50);
    }
    perfMeasure_end("Canny");

    for (int i = 0; i < 50; i++)
    {
        cv::dilate(matDst, matDst, cv::noArray());
    }
    perfMeasure_end("Dilate");

    for (int i = 0; i < 50; i++)
    {
        cv::add(matSrc, matSrc, matDst);
    }
    perfMeasure_end("Add");

    for (int i = 0; i < 50; i++)
    {
        cv::multiply(matSrc, matDst, matDst);
    }
    perfMeasure_end("multiply");

    for (int i = 0; i < 50; i++)
    {
        cv::multiply(matSrc, 2.5, matDst);
    }
    perfMeasure_end("multiply_scalar");

    for (int i = 0; i < 50; i++)
    {
        cv::divide(matSrc, matDst, matDst);
    }
    perfMeasure_end("divide");

    for (int i = 0; i < 50; i++)
    {
        cv::divide(matSrc, 2.5, matDst);
    }
    perfMeasure_end("divide_Scalar");

    for (int i = 0; i < 50; i++)
    {
        cv::addWeighted(matSrc, 2.5, matDst, 0.6, 0, matDst);
    }
    perfMeasure_end("addWeighted");

}

/// @brief Benchmark the pipeline once on cv::Mat and once on cv::UMat.
///
/// Prints a header with the image size, then runs two passes over the same
/// input. Each pass reports a "Read" step (getMat on the input), a copy into
/// the working container, the stages printed by OpenCLProc(), and a copy of
/// the result back into a cv::Mat.
///
/// @param matSrc  Image to benchmark with; only read.
void TestOpenCL(cv::InputArray matSrc)
{
    printf("[PERF] -= Performance Check =-\n");
    printf("[PERF] ImageSize = %d x %d\n", matSrc.cols(), matSrc.rows());

    // ---------------- Pass 1: cv::Mat containers ----------------
    printf("[PERF] CPU Process\n");
    perfMeasure_start();

    // Read
    cv::Mat hostView = matSrc.getMat();
    perfMeasure_end("Read");

    // Transfer: copy the input into the working matrix
    cv::Mat hostInput;
    hostView.copyTo(hostInput);
    perfMeasure_end("Transfer CPU->CPU");

    // Process
    cv::Mat hostResult;
    OpenCLProc(hostInput, hostResult);

    // Transfer: copy the result out again
    cv::Mat hostCopy;
    hostResult.copyTo(hostCopy);
    perfMeasure_end("Transfer CPU->CPU");

    std::cout << "\n";

    // ---------------- Pass 2: cv::UMat containers ----------------
    // The stopwatch was restarted by the previous perfMeasure_end(), so the
    // "Read" figure of this pass also covers the two prints just above/below.
    printf("[PERF] GPU Process (copyTo)\n");

    // Read
    cv::Mat deviceSrcView = matSrc.getMat();
    perfMeasure_end("Read");

    // Transfer: Mat -> UMat
    cv::UMat deviceInput;
    deviceSrcView.copyTo(deviceInput);
    perfMeasure_end("Transfer CPU->GPU");

    // Process
    cv::UMat deviceResult;
    OpenCLProc(deviceInput, deviceResult);

    // Transfer: UMat -> Mat
    cv::Mat downloaded;
    deviceResult.copyTo(downloaded);
    perfMeasure_end("Transfer GPU->CPU");

    std::cout << "\n";
}


/// @brief Load the test image, enlarge it 4x, and run the benchmark twice.
///
/// The first TestOpenCL() run is a warm-up (its banner says it is only there
/// to compile kernels); the second run is the one whose numbers are meant to
/// be read.
///
/// @param argc  Argument count.
/// @param argv  Optional argv[1]: path of the image to load. When absent the
///              original hard-coded sample path is used.
/// @return 0 on success, 1 when the image could not be read.
int main(int argc, const char** argv)
{
    const std::string imagePath =
        (argc > 1) ? argv[1] : "g:/lib/opencv/samples/data/lena.jpg";

    cv::Mat matSrc = cv::imread(imagePath);
    if (matSrc.empty())
    {
        // imread signals failure with an empty matrix; resizing it would abort.
        std::cerr << "Could not read image: " << imagePath << "\n";
        return 1;
    }
    cv::resize(matSrc, matSrc, cv::Size(), 4, 4, cv::INTER_LINEAR);

    // First round of process to let the GPU initialize
    std::cout << "only to compile kernels *************************\n";
    TestOpenCL(matSrc);

    // The code performance is made based on result from this function
    std::cout << "Real Test *************************\n";
    TestOpenCL(matSrc);
    return 0;
}