1 | initial version |
To demonstrate TBB, this example just applies cos() to every pixel of a uchar Mat.
Test result for a 1920x1080 image: TBB is 2.7 times faster than the normal (serial) way.
I omitted the timing code, but the program should still be OK to run.
#include <iostream>
#include <cmath>
#include <tbb/tbb.h>
#include <opencv2/highgui/highgui.hpp>
using namespace std ;
using namespace tbb ;
using namespace cv ;
class parallel_pixel
{
uchar *p_row ;
public:
parallel_pixel(uchar *row_ptr ) : p_row(row_ptr) { }
void operator() ( const blocked_range<int>& r ) const
{
for ( int i = r.begin(); i != r.end(); i++ ) {
p_row[i] = (uchar)cos( p_row[i] ) ; // I just use cos()
}
}
} ;
int main()
{
int width = 1920 ;
int height = 1080 ;
// If too small nElements the tbb will take longer time, since tbb need to be started and copy
int nElements = width*height ;
Mat src( Size(width,height) , CV_8UC1 ) ; // for one_by_one run
Mat old ; // clone for tbb
// just put some initial value
int v = 0 ;
for( int w = 0 ; w < src.rows ; ++w )
{
for( int h = 0 ; h < src.cols ; ++h )
{
src.at<uchar>(w,h) = saturate_cast<uchar>(v) ;
v++ ;
}
}
// initial end
old = src.clone() ; // save a copy
// --------- normal way -----------
uchar* p1 = src.data ; // p1 for normal way
// normal way : one_by_one iteration
// timing start
for( int i = 0 ; i < nElements ; i++ )
{
p1[i] = cos( p1[i] ) ;
}
// timing stop
// --------- TBB way -----------
task_scheduler_init init ; // start tbb
uchar* p2 = old.data ; // p2 for tbb way
// timing tbb start,
// parameter = 800 is testing on my computer has best performance
parallel_for(blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;
// timing tbb stop
// checking if normal way has the same result as tbb way
for( int i = 0 ; i < nElements ; ++i ) {
if( p1[i] != p2[i] ) {
cout << "answer not match" << endl;
}
}
return 0;
}
2 | No.2 Revision |
To demonstrate TBB, this example just applies cos() to every pixel of a uchar Mat.
Test result for a 1920x1080 image: TBB is 2.7 times faster than the normal (serial) way.
I omitted the timing code, but the program should still be OK to run.
#include <iostream>
#include <cmath>
#include <tbb/tbb.h>
#include <opencv2/highgui/highgui.hpp>
using namespace std ;
using namespace tbb ;
using namespace cv ;
class parallel_pixel
{
uchar *p_row *p ;
public:
parallel_pixel(uchar *row_ptr *ptr ) : p_row(row_ptr) p(ptr) { }
void operator() ( const blocked_range<int>& r ) const
{
for ( int i = r.begin(); i != r.end(); i++ ) {
p_row[i] p[i] = (uchar)cos( p_row[i] p[i] ) ; // I just use cos()
}
}
} ;
int main()
{
int width = 1920 ;
int height = 1080 ;
// If too small nElements the tbb will take longer time, since tbb need to be started and copy
int nElements = width*height ;
Mat src( Size(width,height) , CV_8UC1 ) ; // for one_by_one run
Mat old ; // clone for tbb
// just put some initial value
int v = 0 ;
for( int w = 0 ; w < src.rows ; ++w )
{
for( int h = 0 ; h < src.cols ; ++h )
{
src.at<uchar>(w,h) = saturate_cast<uchar>(v) ;
v++ ;
}
}
// initial end
old = src.clone() ; // save a copy
// --------- normal way -----------
uchar* p1 = src.data ; // p1 for normal way
// normal way : one_by_one iteration
// timing start
for( int i = 0 ; i < nElements ; i++ )
{
p1[i] = cos( p1[i] ) ;
}
// timing stop
// --------- TBB way -----------
task_scheduler_init init ; // start tbb
uchar* p2 = old.data ; // p2 for tbb way
// timing tbb start,
// parameter = 800 is testing on my computer has best performance
parallel_for(blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;
// timing tbb stop
// checking if normal way has the same result as tbb way
for( int i = 0 ; i < nElements ; ++i ) {
if( p1[i] != p2[i] ) {
cout << "answer not match" << endl;
}
}
return 0;
}
3 | No.3 Revision |
To demonstrate TBB, this example just applies cos() to every pixel of a uchar Mat.
Test result for a 1920x1080 image: TBB is 2.7 times faster than the normal (serial) way.
I omitted the timing code, but the program should still be OK to run.
#include <iostream>
#include <cmath>
#include <tbb/tbb.h>
#include <opencv2/highgui/highgui.hpp>
using namespace std ;
using namespace tbb ;
using namespace cv ;
class parallel_pixel
{
uchar *p ;
public:
parallel_pixel(uchar *ptr ) : p(ptr) { }
void operator() ( const blocked_range<int>& r ) const
{
for ( int i = r.begin(); i != r.end(); i++ ) {
p[i] = (uchar)cos( p[i] ) ; // I just use cos()
}
}
} ;
// Benchmark driver: applies the same per-pixel cos() to two identical
// single-channel images, once with a plain loop and once with
// tbb::parallel_for, then reports every pixel where the results differ.
int main()
{
    int width = 1920 ;
    int height = 1080 ;
    // Author's note: with too few elements TBB takes longer, because of its
    // start-up and copy overhead.
    int nElements = width*height ;
    Mat src( Size(width,height) , CV_8UC1 ) ; // for one_by_one run
    Mat old ; // clone for tbb

    // Fill the image with a running counter, saturated to the uchar range.
    int v = 0 ;
    for( int row = 0 ; row < src.rows ; ++row )
    {
        for( int col = 0 ; col < src.cols ; ++col )
        {
            src.at<uchar>(row,col) = saturate_cast<uchar>(v) ;
            v++ ;
        }
    }
    old = src.clone() ; // save a copy

    // --------- normal way -----------
    uchar* p1 = src.data ; // p1 for normal way
    // normal way : one_by_one iteration
    // timing start
    for( int i = 0 ; i < nElements ; i++ )
    {
        // Same expression as parallel_pixel::operator(), so results are comparable.
        p1[i] = (uchar)cos( p1[i] ) ;
    }
    // timing stop

    // --------- TBB way -----------
    task_scheduler_init init ; // start tbb
    uchar* p2 = old.data ; // p2 for tbb way
    // timing tbb start
    // grain size 800 performed best on the author's computer
    parallel_for(blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;
    // timing tbb stop

    // Check that the normal way and the TBB way produced the same bytes.
    for( int i = 0 ; i < nElements ; ++i ) {
        if( p1[i] != p2[i] ) {
            cout << "answer not match" << endl;
        }
    }
    return 0;
}
4 | No.4 Revision |
Using TBB (in this example I just apply cos() to every pixel of a single-channel uchar Mat).
Test result for a 1920x1080 image: TBB is 2.7 times faster than the normal way.
I omitted the timing code, but the program should still be OK to run.
#include <iostream>
#include <cmath>
#include <tbb/tbb.h>
#include <opencv2/highgui/highgui.hpp>
using namespace std ;
using namespace tbb ;
using namespace cv ;
class parallel_pixel
{
uchar *p ;
public:
parallel_pixel(uchar *ptr ) : p(ptr) { }
void operator() ( const blocked_range<int>& r ) const
{
for ( int i = r.begin(); i != r.end(); i++ ) {
p[i] = (uchar)cos( p[i] ) ; // I just use cos()
}
}
} ;
// Benchmark driver: applies the same per-pixel cos() to two identical
// single-channel images, once with a plain loop and once with
// tbb::parallel_for, then reports every pixel where the results differ.
int main()
{
    int width = 1920 ;
    int height = 1080 ;
    // Author's note: with too few elements TBB takes longer, because of its
    // start-up and copy overhead.
    int nElements = width*height ; // only for single channel
    Mat src( Size(width,height) , CV_8UC1 ) ; // for one_by_one run
    Mat old ; // clone for tbb

    // Fill the image with a running counter, saturated to the uchar range.
    int v = 0 ;
    for( int row = 0 ; row < src.rows ; ++row )
    {
        for( int col = 0 ; col < src.cols ; ++col )
        {
            src.at<uchar>(row,col) = saturate_cast<uchar>(v) ;
            v++ ;
        }
    }
    old = src.clone() ; // save a copy

    // --------- normal way -----------
    uchar* p1 = src.data ; // p1 for normal way
    // normal way : one_by_one iteration
    // timing start
    for( int i = 0 ; i < nElements ; ++i )
    {
        p1[i] = (uchar)cos( p1[i] ) ;
    }
    // timing stop

    // --------- TBB way -----------
    task_scheduler_init init ; // start tbb
    uchar* p2 = old.data ; // p2 for tbb way
    // timing tbb start
    // grain size 800 performed best on the author's computer
    parallel_for(blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;
    // timing tbb stop

    // Check that the normal way and the TBB way produced the same bytes.
    for( int i = 0 ; i < nElements ; ++i ) {
        if( p1[i] != p2[i] ) {
            cout << "answer not match" << endl;
        }
    }
    return 0;
}
5 | No.5 Revision |
As suggested by Guanta below and in http://answers.opencv.org/question/3730/how-to-use-parallel_for/
I compare 3 ways: normal (serial), TBB, and OpenCV's parallel_for_ (ParallelLoopBody).
(In this example I just apply cos() to every pixel, with images from 1920x1080 to 19200x10800, uchar, single-channel Mat, on a 2.3GHz Core i5 MBP.)
Amazingly, the built-in OpenCV ParallelLoopBody wins!
(The earlier test on a 1920x1080 image showed TBB 2.7 times faster than the normal way.)
Here is the code (I omitted the timing code, but it should be OK to run):
#include <iostream>
#include <cmath>
#include <tbb/tbb.h>
<tbb/tbb.h> // for tbb
#include <opencv2/highgui/highgui.hpp>
<opencv2/highgui/highgui.hpp> // ParallelLoopBody is include (core.hpp)
using namespace std ;
using namespace tbb ;
using namespace cv ;
// this class is for TBB; delete it if you don't need it
class parallel_pixel
{
private:
uchar *p ;
public:
parallel_pixel(uchar *ptr ) : p(ptr) { }
void operator() ( const blocked_range<int>& r ) const
{
for ( int i = r.begin(); i != r.end(); i++ ) {
p[i] = (uchar)cos( p[i] ) ; // I just use cos()
}
}
} ;
// this class is for OpenCV ParallelLoopBody
// Loop body for cv::parallel_for_: overwrites every byte in the given index
// range of a buffer with cos(byte), truncated back to uchar.
class Parallel_pixel_opencv : public ParallelLoopBody
{
private:
    uchar *p ;   // start of the pixel buffer; only stored, never freed here
public:
    Parallel_pixel_opencv(uchar* ptr ) : p(ptr) {}

    // Processes indices r.start .. r.end-1 of the buffer in place.
    virtual void operator()( const Range &r ) const
    {
        // No `register` on the index: the specifier is deprecated in C++11,
        // ill-formed since C++17, and never affected the result.
        for ( int i = r.start; i != r.end; ++i)
        {
            p[i] = (uchar)cos( p[i] ) ;
        }
    }
};
// Benchmark driver: applies the same per-pixel cos() to three identical
// single-channel images (plain loop, tbb::parallel_for, cv::parallel_for_)
// and reports the index of every pixel where a parallel result differs
// from the serial one.
int main()
{
    int width = 1920 * 3 ;
    int height = 1080 * 3 ;
    // Author's note: with too few elements TBB takes longer, because of its
    // start-up and copy overhead.
    int nElements = width*height ; // only for single channel
    Mat src( Size(width,height) , CV_8UC1 ) ; // for one_by_one run
    Mat old ; // clone for tbb
    Mat old2 ; // clone for ParallelLoopBody

    // Fill the image with a running counter, saturated to the uchar range.
    int v = 0 ;
    for( int row = 0 ; row < src.rows ; ++row )
    {
        for( int col = 0 ; col < src.cols ; ++col )
        {
            src.at<uchar>(row,col) = saturate_cast<uchar>(v) ;
            v++ ;
        }
    }
    old = src.clone() ; // save a copy
    old2 = src.clone() ;

    // --------- normal way -----------
    uchar* p1 = src.data ; // p1 for normal way
    // normal way : one_by_one iteration
    // timing start
    for( int i = 0 ; i < nElements ; ++i )
    {
        p1[i] = (uchar)cos( p1[i] ) ;
    }
    // timing stop

    // --------- TBB way -----------
    task_scheduler_init init ; // start tbb
    uchar* p2 = old.data ; // p2 for tbb way
    // timing tbb start
    // grain size 800 performed best on the author's computer
    parallel_for(blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;
    // timing tbb stop

    // --------- opencv way ----------
    uchar* p3 = old2.data ;
    // timing ParallelLoopBody start
    parallel_for_( Range(0,nElements) , Parallel_pixel_opencv(p3)) ;
    // timing ParallelLoopBody stop

    // Check that both parallel runs match the normal way, pixel by pixel.
    for( int i = 0 ; i < nElements ; ++i ) {
        if( p1[i] != p2[i] ) {
            cout << i << " tbb answer not match" << endl;
        }
        if( p1[i] != p3[i] ) {
            cout << i << " opencv answer not match" << endl;
        }
    }
    return 0;
}
The result is :
normal time: 754.778 ms
TBB time: 223.938 ms
opencv time: 200.656 ms
normal/tbb = 3.37048 (sorry, in my last post I reported 2.7 because my CPU was busy doing something else)
normal/opencv = 3.76155
6 | including OpenCV cv::ParallelLoopBody method suggest by Guanta |
As suggested by Guanta below and in http://answers.opencv.org/question/3730/how-to-use-parallel_for/
I compare 3 ways: normal (serial), TBB, and OpenCV's parallel_for_ (ParallelLoopBody).
(Image sizes from 1920x1080 to 19200x10800, uchar, single-channel Mat, on a 2.3GHz Core i5 MBP.)
Amazingly, the built-in OpenCV ParallelLoopBody wins!
Here is the code:
#include <iostream>
#include <cmath>
#include <tbb/tbb.h> // for tbb
#include <opencv2/highgui/highgui.hpp> // ParallelLoopBody is include (core.hpp)
using namespace std ;
using namespace tbb ;
using namespace cv ;
// this class is for TBB; delete it if you don't need it
class parallel_pixel
{
private:
uchar *p ;
public:
parallel_pixel(uchar *ptr ) : p(ptr) { }
void operator() ( const blocked_range<int>& r ) const
{
for ( int i = r.begin(); i != r.end(); i++ ) {
p[i] = (uchar)cos( p[i] ) ; // I just use cos()
}
}
} ;
// this class is for OpenCV ParallelLoopBody
// Loop body for cv::parallel_for_: overwrites every byte in the given index
// range of a buffer with cos(byte), truncated back to uchar.
class Parallel_pixel_opencv : public ParallelLoopBody
{
private:
    uchar *p ;   // start of the pixel buffer; only stored, never freed here
public:
    Parallel_pixel_opencv(uchar* ptr ) : p(ptr) {}

    // Processes indices r.start .. r.end-1 of the buffer in place.
    virtual void operator()( const Range &r ) const
    {
        // No `register` on the index: the specifier is deprecated in C++11,
        // ill-formed since C++17, and never affected the result.
        for ( int i = r.start; i != r.end; ++i)
        {
            p[i] = (uchar)cos( p[i] ) ;
        }
    }
};
int main()
{
int width = 1920 *3;
int height = 1080 *3;
// If too small nElements the tbb will take longer time, since tbb need to be started and copy
int nElements = width*height ; // only for single channel
Mat src( Size(width,height) , CV_8UC1 ) ; // for one_by_one run
Mat old ; // clone for tbb
Mat old2 ; // clone for ParallelLoopBody
// just put some initial value
int v = 0 ;
for( int w = 0 ; w < src.rows ; ++w )
{
for( int h = 0 ; h < src.cols ; ++h )
{
src.at<uchar>(w,h) = saturate_cast<uchar>(v) ;
v++ ;
}
}
// initial end
old = src.clone() ; // save a copy
old2 = src.clone() ;
// --------- normal way -----------
uchar* p1 = src.data ; // p1 for normal way
// normal way : one_by_one iteration
// timing start
for( int i = 0 ; i < nElements ; ++i )
{
p1[i] = (uchar)cos( p1[i] ) ;
}
// timing stop
// --------- TBB way -----------
task_scheduler_init init ; // start tbb
uchar* p2 = old.data ; // p2 for tbb way
// timing tbb start,
// parameter = 800 is testing on my computer has best performance
parallel_for(blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;
// timing tbb stop
// --------- opencv way ----------
uchar* p3 = old2.data ;
// timing ParallelLoopBody start
parallel_for_( Range(0,nElements) , Parallel_pixel_opencv(p3)) ;
// timing ParallelLoopBody stop
// checking if normal way has the same result as tbb way
for( int i = 0 ; i < nElements ; ++i ) {
if( p1[i] != p2[i] ) {
cout << i << " tbb answer not match" << endl;
}
if( p1[i] != p3[i] ) {
cout << i << " opencv answer not match" << endl;
}
}
return 0;
}
The result is :
normal time: 754.778 ms
TBB time: 223.938 ms
opencv time: 200.656 ms
normal/tbb = 3.37048 (sorry, in my last post I reported 2.7 because my CPU was busy doing something else)
normal/opencv = 3.76155
7 | No.7 Revision |
As suggested by Guanta below and in http://answers.opencv.org/question/3730/how-to-use-parallel_for/
I compare 3 ways: normal (serial), TBB, and OpenCV's parallel_for_ (ParallelLoopBody).
(Image sizes from 1920x1080 to 19200x10800, uchar, single-channel Mat, on a 2.3GHz Core i5 MBP.)
The built-in OpenCV ParallelLoopBody wins!
Here is the code:
#include <iostream>
#include <cmath>
#include <tbb/tbb.h> // for tbb
#include <opencv2/highgui/highgui.hpp> // ParallelLoopBody is include included (core.hpp)
using namespace std ;
using namespace tbb ;
using namespace cv ;
// this class is for TBB; delete it if you don't need it
class parallel_pixel
{
private:
uchar *p ;
public:
parallel_pixel(uchar *ptr ) : p(ptr) { }
void operator() ( const blocked_range<int>& r ) const
{
for ( int i = r.begin(); i != r.end(); i++ ) {
p[i] = (uchar)cos( p[i] ) ; // I just use cos()
}
}
} ;
// this class is for OpenCV ParallelLoopBody
// Loop body for cv::parallel_for_: overwrites every byte in the given index
// range of a buffer with cos(byte), truncated back to uchar.
class Parallel_pixel_opencv : public ParallelLoopBody
{
private:
    uchar *p ;   // start of the pixel buffer; only stored, never freed here
public:
    Parallel_pixel_opencv(uchar* ptr ) : p(ptr) {}

    // Processes indices r.start .. r.end-1 of the buffer in place.
    virtual void operator()( const Range &r ) const
    {
        // No `register` on the index: the specifier is deprecated in C++11,
        // ill-formed since C++17, and never affected the result.
        for ( int i = r.start; i != r.end; ++i)
        {
            p[i] = (uchar)cos( p[i] ) ;
        }
    }
};
int main()
{
int width = 1920 *3;
int height = 1080 *3;
// If too small nElements the tbb will take longer time, since tbb need to be started and copy
int nElements = width*height ; // only for single channel
Mat src( Size(width,height) , CV_8UC1 ) ; // for one_by_one run
Mat old ; // clone for tbb
Mat old2 ; // clone for ParallelLoopBody
// just put some initial value
int v = 0 ;
for( int w = 0 ; w < src.rows ; ++w )
{
for( int h = 0 ; h < src.cols ; ++h )
{
src.at<uchar>(w,h) = saturate_cast<uchar>(v) ;
v++ ;
}
}
// initial end
old = src.clone() ; // save a copy
old2 = src.clone() ;
// --------- normal way -----------
uchar* p1 = src.data ; // p1 for normal way
// normal way : one_by_one iteration
// timing start
for( int i = 0 ; i < nElements ; ++i )
{
p1[i] = (uchar)cos( p1[i] ) ;
}
// timing stop
// --------- TBB way -----------
task_scheduler_init init ; // start tbb
uchar* p2 = old.data ; // p2 for tbb way
// timing tbb start,
// parameter = 800 is testing on my computer has best performance
parallel_for(blocked_range<int>(0, nElements, 800), parallel_pixel(p2) ) ;
// timing tbb stop
// --------- opencv way ----------
uchar* p3 = old2.data ;
// timing ParallelLoopBody start
parallel_for_( Range(0,nElements) , Parallel_pixel_opencv(p3)) ;
// timing ParallelLoopBody stop
// checking if normal way has the same result as tbb way
for( int i = 0 ; i < nElements ; ++i ) {
if( p1[i] != p2[i] ) {
cout << i << " tbb answer not match" << endl;
}
if( p1[i] != p3[i] ) {
cout << i << " opencv answer not match" << endl;
}
}
return 0;
}
The result is :
normal time: 754.778 ms
TBB time: 223.938 ms
opencv time: 200.656 ms
normal/tbb = 3.37048 (sorry, in my last post I reported 2.7 because my CPU was busy doing something else)
normal/opencv = 3.76155