Ask Your Question

Revision history [back]

To demonstrate TBB, I simply apply cos() to every pixel of a uchar Mat.

Test result for a 1920x1080 image: TBB is 2.7 times faster than the normal way.

I left out the timing code, but this should run as is.

#include <iostream>
#include <cmath>
#include <tbb/tbb.h>
#include <opencv2/highgui/highgui.hpp>

using namespace std ;
using namespace tbb ;
using namespace cv ;

class parallel_pixel
{
    uchar *p_row ;

public:
    parallel_pixel(uchar *row_ptr ) : p_row(row_ptr) { }

    void operator() ( const blocked_range<int>& r ) const
    {
        for ( int i = r.begin(); i != r.end(); i++ ) {
            p_row[i] = (uchar)cos( p_row[i] )  ;    // I just use cos()
        }
    }
} ;

int main()
{
    int width = 1920 ;
    int height = 1080 ;

    // If too small nElements the tbb will take longer time, since tbb need to be started and copy
    int nElements = width*height ;

    Mat src( Size(width,height) , CV_8UC1 ) ;       // for one_by_one run
    Mat old ;                                       // clone for tbb

    // just put some initial value
    int v = 0 ;
    for( int w = 0 ; w < src.rows ; ++w )
    {
        for( int h = 0 ; h < src.cols ; ++h )
        {
            src.at<uchar>(w,h) = saturate_cast<uchar>(v) ;
            v++ ;
        }
    }
    // initial end
    old = src.clone() ;    // save a copy



    // --------- normal way ----------- 
    uchar* p1 = src.data ;    // p1 for normal way

    // normal way : one_by_one iteration
    // timing start
    for( int i = 0 ; i < nElements ; i++ )
    {
        p1[i] = cos( p1[i] ) ;
    }
    // timing stop



    // --------- TBB way -----------
    task_scheduler_init init ;    // start tbb
    uchar* p2 = old.data ;    // p2 for tbb way

    // timing tbb start, 
    // parameter = 800 is testing on my computer has best performance

    parallel_for(blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;

    // timing tbb stop



    // checking if normal way has the same result as tbb way
    for( int i = 0 ; i < nElements ; ++i ) {
        if( p1[i] != p2[i] ) {
            cout << "answer not match" <<  endl;
        }
    }

    return 0;
}

To demonstrate TBB, I simply apply cos() to every pixel of a uchar Mat.

Test result for a 1920x1080 image: TBB is 2.7 times faster than the normal way.

I left out the timing code, but this should run as is.

#include <iostream>
#include <cmath>
#include <tbb/tbb.h>
#include <opencv2/highgui/highgui.hpp>

using namespace std ;
using namespace tbb ;
using namespace cv ;

class parallel_pixel
{
    uchar *p_row *p ;

public:
    parallel_pixel(uchar *row_ptr *ptr ) : p_row(row_ptr) p(ptr) { }

    void operator() ( const blocked_range<int>& r ) const
    {
        for ( int i = r.begin(); i != r.end(); i++ ) {
            p_row[i] p[i] = (uchar)cos( p_row[i] p[i] )  ;    // I just use cos()
        }
    }
} ;

// Benchmark driver: fills a 1920x1080 single-channel image, applies cos() to
// every pixel once sequentially and once through tbb::parallel_for on a
// clone, and prints a line for each position where the outputs disagree.
int main()
{
    const int width = 1920 ;
    const int height = 1080 ;

    // Author's note: if nElements is too small TBB takes longer, since it
    // needs to be started and copy.
    const int nElements = width * height ;

    Mat src( Size(width, height), CV_8UC1 ) ;    // input of the sequential run

    // Initial values: an increasing counter, saturated to uchar.
    int value = 0 ;
    for ( int y = 0 ; y < src.rows ; ++y )
    {
        for ( int x = 0 ; x < src.cols ; ++x )
        {
            src.at<uchar>(y, x) = saturate_cast<uchar>(value) ;
            ++value ;
        }
    }

    Mat old = src.clone() ;    // input of the TBB run


    // --------- sequential run ---------
    uchar* p1 = src.data ;

    // timing would start here
    int k = 0 ;
    while ( k < nElements )
    {
        p1[k] = cos( p1[k] ) ;
        ++k ;
    }
    // timing would stop here


    // --------- TBB run ---------
    task_scheduler_init init ;    // start tbb
    uchar* p2 = old.data ;

    // timing would start here
    // 800 is the grain size that performed best on the author's computer.
    parallel_for( blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;
    // timing would stop here


    // Compare the sequential output with the TBB output element by element.
    for ( int n = 0 ; n < nElements ; ++n ) {
        const bool same = ( p1[n] == p2[n] ) ;
        if ( !same ) {
            cout << "answer not match" << endl ;
        }
    }

    return 0 ;
}

To demonstrate TBB, I simply apply cos() to every pixel of a uchar Mat.

Test result for a 1920x1080 image: TBB is 2.7 times faster than the normal way.

I left out the timing code, but this should run as is.

#include <iostream>
#include <cmath>
#include <tbb/tbb.h>
#include <opencv2/highgui/highgui.hpp>

using namespace std ;
using namespace tbb ;
using namespace cv ;

// Body object for tbb::parallel_for. Each call overwrites one slice of the
// pixel buffer in place with the cosine of its current value, cast to uchar.
class parallel_pixel
{
    uchar *p ;    // base address of the pixel buffer, as passed to the constructor

public:
    parallel_pixel(uchar *ptr ) : p(ptr) { }

    // Handles the half-open index range [r.begin(), r.end()).
    void operator() ( const blocked_range<int>& r ) const
    {
        int idx = r.begin() ;
        while ( idx != r.end() ) {
            const double c = cos( p[idx] ) ;
            p[idx] = (uchar)c ;    // cos() is only a stand-in workload
            ++idx ;
        }
    }
} ;

// Runs the same per-pixel cos() over two identical single-channel images,
// once with a plain loop and once with tbb::parallel_for, then prints a
// message for every element where the two results differ.
int main()
{
    int width = 1920 ;
    int height = 1080 ;

    // Author's note: if nElements is too small TBB takes longer, since it
    // needs to be started and copy.
    int nElements = width*height ;

    Mat src( Size(width,height) , CV_8UC1 ) ;       // for the one-by-one run
    Mat old ;                                       // clone for tbb

    // Just put in some initial values: a running counter saturated to uchar.
    int v = 0 ;
    for( int row = 0 ; row < src.rows ; ++row )
    {
        for( int col = 0 ; col < src.cols ; ++col )
        {
            src.at<uchar>(row,col) = saturate_cast<uchar>(v) ;
            v++ ;
        }
    }
    // initialisation done
    old = src.clone() ;    // save a copy



    // --------- normal way -----------
    uchar* p1 = src.data ;    // p1 for the normal way

    // normal way : one-by-one iteration
    // timing start
    for( int i = 0 ; i < nElements ; i++ )
    {
        // The revision diff had fused "cos(" and "(uchar)cos(" here, leaving
        // unbalanced parentheses; this is the resolved new form.
        p1[i] = (uchar)cos( p1[i] ) ;
    }
    // timing stop



    // --------- TBB way -----------
    task_scheduler_init init ;    // start tbb
    uchar* p2 = old.data ;    // p2 for the tbb way

    // timing tbb start
    // A grain size of 800 gave the best performance on the author's computer.

    parallel_for(blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;

    // timing tbb stop



    // Check that the normal way produced the same result as the tbb way.
    for( int i = 0 ; i < nElements ; ++i ) {
        if( p1[i] != p2[i] ) {
            cout << "answer not match" <<  endl;
        }
    }

    return 0;
}

For using TBB ( in this example, I just perform cos() on a single-channel uchar Mat ).

Test result for a 1920x1080 image: TBB is 2.7 times faster than the normal way.

I left out the timing code, but this should run as is.

#include <iostream>
#include <cmath>
#include <tbb/tbb.h>
#include <opencv2/highgui/highgui.hpp>

using namespace std ;
using namespace tbb ;
using namespace cv ;

class parallel_pixel
{
    uchar *p ;

public:
    parallel_pixel(uchar *ptr ) : p(ptr) { }

    void operator() ( const blocked_range<int>& r ) const
    {
        for ( int i = r.begin(); i != r.end(); i++ ) {
            p[i] = (uchar)cos( p[i] )  ;    // I just use cos()
        }
    }
} ;

// Runs the same per-pixel cos() over two identical single-channel images,
// once with a plain loop and once with tbb::parallel_for, then prints a
// message for every element where the two results differ.
int main()
{
    int width = 1920 ;
    int height = 1080 ;

    // Author's note: if nElements is too small TBB takes longer, since it
    // needs to be started and copy.
    // (A stray ";" left on its own line by the revision diff was removed.)
    int nElements = width*height ;     // only for single channel

    Mat src( Size(width,height) , CV_8UC1 ) ;       // for the one-by-one run
    Mat old ;                                       // clone for tbb

    // Just put in some initial values: a running counter saturated to uchar.
    int v = 0 ;
    for( int row = 0 ; row < src.rows ; ++row )
    {
        for( int col = 0 ; col < src.cols ; ++col )
        {
            src.at<uchar>(row,col) = saturate_cast<uchar>(v) ;
            v++ ;
        }
    }
    // initialisation done
    old = src.clone() ;    // save a copy



    // --------- normal way -----------
    uchar* p1 = src.data ;    // p1 for the normal way

    // normal way : one-by-one iteration
    // timing start
    // The revision diff had fused "i++" and "++i" in the loop header, which
    // does not compile; this is the resolved new form.
    for( int i = 0 ; i < nElements ; ++i )
    {
        p1[i] = (uchar)cos( p1[i] ) ;
    }
    // timing stop



    // --------- TBB way -----------
    task_scheduler_init init ;    // start tbb
    uchar* p2 = old.data ;    // p2 for the tbb way

    // timing tbb start
    // A grain size of 800 gave the best performance on the author's computer.

    parallel_for(blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;

    // timing tbb stop



    // Check that the normal way produced the same result as the tbb way.
    for( int i = 0 ; i < nElements ; ++i ) {
        if( p1[i] != p2[i] ) {
            cout << "answer not match" <<  endl;
        }
    }

    return 0;
}

As suggested by Guanta below and http://answers.opencv.org/question/3730/how-to-use-parallel_for/

I compare 3 ways which are normal, TBB, and opencv_parallel.

( with 1920x1080 to 19200x10800, uchar, single-channel Mat, on a 2.3GHz Core i5 MBP )

Amazingly, the built-in OpenCV ParallelLoopBody wins!

Here is the code,

#include <iostream>
#include <cmath>
#include <tbb/tbb.h>                                    // for tbb
#include <opencv2/highgui/highgui.hpp>    // ParallelLoopBody is included (core.hpp)

using namespace std ;
using namespace tbb ;
using namespace cv ;

// this class is for tbb, delete it if you don't needed it 
class parallel_pixel
{
private:
    uchar *p ;
 public:
    parallel_pixel(uchar *ptr ) : p(ptr) { }

    void operator() ( const blocked_range<int>& r ) const
    {
        for ( int i = r.begin(); i != r.end(); i++ ) {
            p[i] = (uchar)cos( p[i] )  ;    // I just use cos()
        }
    }
} ;

// This class is for OpenCV's ParallelLoopBody.
// An instance is passed to parallel_for_ in main(); operator() replaces p[i]
// by cos(p[i]) cast to uchar for every index i of the range it is given.
class Parallel_pixel_opencv : public ParallelLoopBody
{
private:
    uchar *p ;    // start of the pixel buffer, exactly as given to the constructor
public:
    Parallel_pixel_opencv(uchar* ptr ) : p(ptr) {}

    // Processes indices [r.start, r.end).
    virtual void operator()( const Range &r ) const
    {
        // 'register' was dropped from the loop index: it is deprecated since
        // C++11 and no longer a valid storage class specifier in C++17.
        for ( int i = r.start; i != r.end; ++i)
        {
            p[i] = (uchar)cos( p[i] )  ;
        }
    }
};


// Applies the same per-pixel cos() to three identical single-channel images
// (plain loop, tbb::parallel_for and cv::parallel_for_) and prints the index
// of every element where a parallel result differs from the plain-loop one.
int main()
{
    // The revision diff had pushed the "*3" factors onto their own lines,
    // which does not compile; they are rejoined with their declarations here.
    int width = 1920 *3;
    int height = 1080 *3;

    // Author's note: if nElements is too small TBB takes longer, since it
    // needs to be started and copy.
    int nElements = width*height ;     // only for single channel

    Mat src( Size(width,height) , CV_8UC1 ) ;       // for the one-by-one run
    Mat old ;                                       // clone for tbb
    Mat old2 ;                                      // clone for ParallelLoopBody

    // Just put in some initial values: a running counter saturated to uchar.
    int v = 0 ;
    for( int row = 0 ; row < src.rows ; ++row )
    {
        for( int col = 0 ; col < src.cols ; ++col )
        {
            src.at<uchar>(row,col) = saturate_cast<uchar>(v) ;
            v++ ;
        }
    }
    // initialisation done
    old = src.clone() ;    // save a copy
    old2 = src.clone() ;

    // --------- normal way -----------
    uchar* p1 = src.data ;    // p1 for the normal way

    // normal way : one-by-one iteration
    // timing start
    for( int i = 0 ; i < nElements ; ++i )
    {
        p1[i] = (uchar)cos( p1[i] ) ;
    }
    // timing stop

    // --------- TBB way -----------
    task_scheduler_init init ;    // start tbb
    uchar* p2 = old.data ;    // p2 for the tbb way

    // timing tbb start
    // A grain size of 800 gave the best performance on the author's computer.

    parallel_for(blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;

    // timing tbb stop


    // --------- opencv way ----------
    uchar* p3 = old2.data ;

    // timing ParallelLoopBody start

    parallel_for_( Range(0,nElements) , Parallel_pixel_opencv(p3)) ;

    // timing ParallelLoopBody stop



    // Check that the normal way produced the same result as both parallel ways.
    for( int i = 0 ; i < nElements ; ++i ) {
        if( p1[i] != p2[i] ) {
            // The diff had fused the old and new message into a broken string
            // literal; this is the resolved new form.
            cout << i << " tbb answer not match" <<  endl;
        }
        if( p1[i] != p3[i] )  {
            cout << i << " opencv answer not match" <<  endl;
        }
    }

    return 0;
}

The result is :

normal time: 754.778 ms

TBB time: 223.938 ms

opencv time: 200.656 ms

normal/tbb = 3.37048 (sorry, in my last post I reported 2.7 because my CPU was busy with something else)

normal/opencv = 3.76155

click to hide/show revision 6
including the OpenCV cv::ParallelLoopBody method suggested by Guanta

As suggested by Guanta below and http://answers.opencv.org/question/3730/how-to-use-parallel_for/

I compare 3 ways which are normal, TBB, and opencv_parallel.

( with 1920x1080 to 19200x10800 , uchar, single channel Mat under 2.3GHz Core i5 MBP )

Amazingly, the built-in OpenCV ParallelLoopBody wins!

Here is the code,

#include <iostream>
#include <cmath>
#include <tbb/tbb.h>                                    // for tbb
#include <opencv2/highgui/highgui.hpp>    // ParallelLoopBody is include (core.hpp)

using namespace std ;
using namespace tbb ;
using namespace cv ;

// Only needed for the TBB comparison; remove it if you do not use TBB.
// Body object for tbb::parallel_for. Each call overwrites one slice of the
// pixel buffer in place with the cosine of its current value, cast to uchar.
class parallel_pixel
{
private:
    uchar *p ;    // base address of the pixel buffer, as passed to the constructor
public:
    parallel_pixel(uchar *ptr ) : p(ptr) { }

    // Handles the half-open index range [r.begin(), r.end()).
    void operator() ( const blocked_range<int>& r ) const
    {
        int idx = r.begin() ;
        while ( idx != r.end() ) {
            const double c = cos( p[idx] ) ;
            p[idx] = (uchar)c ;    // cos() is only a stand-in workload
            ++idx ;
        }
    }
} ;

// This class is for OpenCV's ParallelLoopBody.
// An instance is passed to parallel_for_ in main(); operator() replaces p[i]
// by cos(p[i]) cast to uchar for every index i of the range it is given.
class Parallel_pixel_opencv : public ParallelLoopBody
{
private:
    uchar *p ;    // start of the pixel buffer, exactly as given to the constructor
public:
    Parallel_pixel_opencv(uchar* ptr ) : p(ptr) {}

    // Processes indices [r.start, r.end).
    virtual void operator()( const Range &r ) const
    {
        // 'register' was dropped from the loop index: it is deprecated since
        // C++11 and no longer a valid storage class specifier in C++17.
        for ( int i = r.start; i != r.end; ++i)
        {
            p[i] = (uchar)cos( p[i] )  ;
        }
    }
};


int main()
{
    int width = 1920 *3;
    int height = 1080 *3;

    // If too small nElements the tbb will take longer time, since tbb need to be started and copy
    int nElements = width*height ;     // only for single channel

    Mat src( Size(width,height) , CV_8UC1 ) ;       // for one_by_one run
    Mat old ;                                       // clone for tbb
    Mat old2 ;                                     // clone for ParallelLoopBody

    // just put some initial value
    int v = 0 ;
    for( int w = 0 ; w < src.rows ; ++w )
    {
        for( int h = 0 ; h < src.cols ; ++h )
        {
            src.at<uchar>(w,h) = saturate_cast<uchar>(v) ;
            v++ ;
        }
    }
    // initial end
    old = src.clone() ;    // save a copy
    old2 = src.clone() ;

    // --------- normal way ----------- 
    uchar* p1 = src.data ;    // p1 for normal way

    // normal way : one_by_one iteration
    // timing start
    for( int i = 0 ; i < nElements ; ++i )
    {
        p1[i] = (uchar)cos( p1[i] ) ;
    }
    // timing stop

    // --------- TBB way -----------
    task_scheduler_init init ;    // start tbb
    uchar* p2 = old.data ;    // p2 for tbb way

    // timing tbb start, 
    // parameter = 800 is testing on my computer has best performance

    parallel_for(blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;

    // timing tbb stop


    // --------- opencv way ----------
    uchar* p3 = old2.data ;

    // timing ParallelLoopBody start

    parallel_for_( Range(0,nElements) , Parallel_pixel_opencv(p3)) ;

    // timing ParallelLoopBody stop



    // checking if normal way has the same result as tbb way
    for( int i = 0 ; i < nElements ; ++i ) {
        if( p1[i] != p2[i] ) {
            cout << i << " tbb answer not match" <<  endl;
        }
        if( p1[i] != p3[i] )  {
            cout << i << " opencv answer not match" <<  endl;
        }
    }

    return 0;
}

The result is :

normal time: 754.778 ms

TBB time: 223.938 ms

opencv time: 200.656 ms

normal/tbb = 3.37048 (sorry, in my last post I reported 2.7 because my CPU was busy with something else)

normal/opencv = 3.76155

As suggested by Guanta below and http://answers.opencv.org/question/3730/how-to-use-parallel_for/

I compare 3 ways which are normal, TBB, and opencv_parallel.

( with 1920x1080 to 19200x10800 , uchar, single channel Mat under 2.3GHz Core i5 MBP )

The built-in OpenCV ParallelLoopBody wins!

Here is the code,

#include <iostream>
#include <cmath>
#include <tbb/tbb.h>                                    // for tbb
#include <opencv2/highgui/highgui.hpp>    // ParallelLoopBody is included (core.hpp)

using namespace std ;
using namespace tbb ;
using namespace cv ;

// this class is for tbb, delete it if you don't needed it 
class parallel_pixel
{
private:
    uchar *p ;
public:
    parallel_pixel(uchar *ptr ) : p(ptr) { }

    void operator() ( const blocked_range<int>& r ) const
    {
        for ( int i = r.begin(); i != r.end(); i++ ) {
            p[i] = (uchar)cos( p[i] )  ;    // I just use cos()
        }
    }
} ;

// This class is for OpenCV's ParallelLoopBody.
// An instance is passed to parallel_for_ in main(); operator() replaces p[i]
// by cos(p[i]) cast to uchar for every index i of the range it is given.
class Parallel_pixel_opencv : public ParallelLoopBody
{
private:
    uchar *p ;    // start of the pixel buffer, exactly as given to the constructor
public:
    Parallel_pixel_opencv(uchar* ptr ) : p(ptr) {}

    // Processes indices [r.start, r.end).
    virtual void operator()( const Range &r ) const
    {
        // 'register' was dropped from the loop index: it is deprecated since
        // C++11 and no longer a valid storage class specifier in C++17.
        for ( int i = r.start; i != r.end; ++i)
        {
            p[i] = (uchar)cos( p[i] )  ;
        }
    }
};


// Benchmark driver: fills one single-channel image, clones it twice, applies
// cos() to every pixel sequentially, via tbb::parallel_for and via
// cv::parallel_for_, and prints each index where a parallel output disagrees
// with the sequential one.
int main()
{
    const int width = 1920 * 3 ;
    const int height = 1080 * 3 ;

    // Author's note: if nElements is too small TBB takes longer, since it
    // needs to be started and copy.
    const int nElements = width * height ;    // single channel only

    Mat src( Size(width, height), CV_8UC1 ) ;    // input of the sequential run

    // Initial values: an increasing counter, saturated to uchar.
    int value = 0 ;
    for ( int y = 0 ; y < src.rows ; ++y )
    {
        for ( int x = 0 ; x < src.cols ; ++x )
        {
            src.at<uchar>(y, x) = saturate_cast<uchar>(value) ;
            ++value ;
        }
    }

    Mat old = src.clone() ;     // input of the TBB run
    Mat old2 = src.clone() ;    // input of the ParallelLoopBody run

    // --------- sequential run ---------
    uchar* p1 = src.data ;

    // timing would start here
    int k = 0 ;
    while ( k < nElements )
    {
        p1[k] = (uchar)cos( p1[k] ) ;
        ++k ;
    }
    // timing would stop here

    // --------- TBB run ---------
    task_scheduler_init init ;    // start tbb
    uchar* p2 = old.data ;

    // timing would start here
    // 800 is the grain size that performed best on the author's computer.
    parallel_for( blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;
    // timing would stop here

    // --------- OpenCV run ---------
    uchar* p3 = old2.data ;

    // timing would start here
    parallel_for_( Range(0, nElements), Parallel_pixel_opencv(p3) ) ;
    // timing would stop here

    // Check the sequential output against the TBB and the OpenCV output.
    for ( int n = 0 ; n < nElements ; ++n ) {
        const bool tbbSame = ( p1[n] == p2[n] ) ;
        if ( !tbbSame ) {
            cout << n << " tbb answer not match" << endl ;
        }
        const bool cvSame = ( p1[n] == p3[n] ) ;
        if ( !cvSame ) {
            cout << n << " opencv answer not match" << endl ;
        }
    }

    return 0 ;
}

The result is :

normal time: 754.778 ms

TBB time: 223.938 ms

opencv time: 200.656 ms

normal/tbb = 3.37048 (sorry, in my last post I reported 2.7 because my CPU was busy with something else)

normal/opencv = 3.76155