1 | initial version |
The following are my guesses as a non expert.
I think that in the first case, the product val * mats[m]
will produce a temporary mat variable with the appropriate size whereas mats.at< Type >( r, c ) * val
needs only a temporary Type variable.
What library did you use for multithreading your second code? Is it 8 times faster when you compare the second code multi-threaded vs your first code multi-threaded or vs your first code single-threaded? Also, what is the size of your matrix as some optimization methods are effective only on large data?
Here some comparisons / attempts to improve the performance:
at
: What is the most effective way to access cv::Mat elements in a loop?parallel_for_
in addition: How to use parallel_for?The results are on my computer (50 iterations, 100 matrices of size: 1000x800):
Original method: sum1=[5.0812e+009, 0, 0, 0] ; t1=8.87578 s
Matrix method: sum2=[5.0812e+009, 0, 0, 0] ; t2=17.1907 s
Pointer access: sum3=[5.0812e+009, 0, 0, 0] ; t3=5.40258 s
Pointer access + unroll loop: sum4=[5.0812e+009, 0, 0, 0] ; t4=5.24404 s
Pointer access + unroll loop + parallel_for_: sum5=[5.0812e+009, 0, 0, 0] ; t5=4.79474 s
The best improvment seems to be achieved when switching from matrix multiplication with a scalar to iterating and perform directly the multiplication on matrix elements (17.1907 s vs 8.87578 s).
Switching to pointer access gives also a resonable improvment (8.87578 s vs 5.40258 s).
The code I used for benchmarking:
#include <opencv2/opencv.hpp>
//Original method
template< typename Type >
void ScaleMatAndSum( cv::Mat& scale, const cv::Mat& mats, const Type val )
{
for( int r = 0; r < mats.rows; r++ )
{
for( int c = 0; c < mats.cols; c++ )
{
scale.at< Type >( r, c ) += mats.at< Type >( r, c ) * val;
}
}
}
//Pointer access
template< typename Type >
void ScaleMatAndSum_ptr( cv::Mat& scale, const cv::Mat& mats, const Type val )
{
Type *ptr_scale_rows;
const Type *ptr_mat_rows;
for( int r = 0; r < mats.rows; r++ )
{
ptr_scale_rows = scale.ptr<Type>(r);
ptr_mat_rows = mats.ptr<Type>(r);
for( int c = 0; c < mats.cols; c++ )
{
ptr_scale_rows[c] += ptr_mat_rows[c] * val;
}
}
}
//Pointer access + unroll loop
template< typename Type >
void ScaleMatAndSum_ptr_unroll_loop( cv::Mat& scale, const cv::Mat& mats, const Type val )
{
Type *ptr_scale_rows;
const Type *ptr_mat_rows;
for( int r = 0; r < mats.rows; r++ )
{
ptr_scale_rows = scale.ptr<Type>(r);
ptr_mat_rows = mats.ptr<Type>(r);
for( int c = 0; c < mats.cols; c += 4 )
{
ptr_scale_rows[c] += ptr_mat_rows[c] * val;
ptr_scale_rows[c+1] += ptr_mat_rows[c+1] * val;
ptr_scale_rows[c+2] += ptr_mat_rows[c+2] * val;
ptr_scale_rows[c+3] += ptr_mat_rows[c+3] * val;
}
}
}
//Pointer access + unroll loop + ParallelLoopBody
template <class Type>
class Parallel_ScaleAndSum: public cv::ParallelLoopBody
{
private:
cv::Mat m_mat;
Type m_mul;
cv::Mat *m_result;
public:
Parallel_ScaleAndSum(cv::Mat *result, const cv::Mat &mat, const Type &mul)
: m_mat(mat), m_mul(mul), m_result(result)
{
}
virtual void operator()( const cv::Range &range ) const
{
Type *ptr_scale_rows;
const Type *ptr_mat_rows;
for(int r = range.start; r < range.end; r++)
{
ptr_scale_rows = m_result->ptr<Type>(r);
ptr_mat_rows = m_mat.ptr<Type>(r);
for(int c = 0; c < m_mat.cols; c += 4)
{
ptr_scale_rows[c] += ptr_mat_rows[c] * m_mul;
ptr_scale_rows[c+1] += ptr_mat_rows[c+1] * m_mul;
ptr_scale_rows[c+2] += ptr_mat_rows[c+2] * m_mul;
ptr_scale_rows[c+3] += ptr_mat_rows[c+3] * m_mul;
}
}
}
};
//@url=https://stackoverflow.com/questions/9905093/how-to-check-whether-two-matrixes-are-identical-in-opencv
bool areEqual(const cv::Mat& a, const cv::Mat& b) {
cv::Mat temp;
cv::bitwise_xor(a,b,temp); //It vectorizes well with SSE/NEON
return !( cv::countNonZero(temp) );
}
int main()
{
cv::RNG rng(12345);
int nbIterations = 50;
size_t nbMatrices = 100;
std::vector<cv::Mat> vectorOfMatrices(nbMatrices);
std::vector<double> vectorOfMul(nbMatrices);
cv::Size matrixSize(1000, 800);
for(size_t i = 0; i < nbMatrices; i++)
{
//Generate random matrix
cv::Mat m(matrixSize, CV_64F);
rng.fill(m, cv::RNG::UNIFORM, -5000.0, 5000.0);
vectorOfMatrices[i] = m;
//Generate random double scalar between -20.0 and +20.0
vectorOfMul[i] = rng.uniform(-20.0, 20.0);
}
//Original method
cv::Mat result1 = cv::Mat::zeros(matrixSize, CV_64F);
double t1 = (double) cv::getTickCount();
for(int cpt = 0; cpt < nbIterations; cpt++)
{
for(size_t i = 0; i < nbMatrices; i++)
{
ScaleMatAndSum(result1, vectorOfMatrices[i], vectorOfMul[i]);
}
}
t1 = ((double) cv::getTickCount() - t1) / cv::getTickFrequency();
cv::Scalar sum1 = cv::sum(result1);
std::cout << "Original method: sum1=" << sum1 << " ; t1=" << t1 << " s" << std::endl;
//Matrix method
cv::Mat result2 = cv::Mat::zeros(matrixSize, CV_64F);
double t2 = (double) cv::getTickCount();
for(int cpt = 0; cpt < nbIterations; cpt++)
{
for(size_t i = 0; i < nbMatrices; i++)
{
result2 += vectorOfMatrices[i]*vectorOfMul[i];
}
}
t2 = ((double) cv::getTickCount() - t2) / cv::getTickFrequency();
cv::Scalar sum2 = cv::sum(result2);
std::cout << "Matrix method: sum2=" << sum2 << " ; t2=" << t2 << " s" << std::endl;
//Use ptr access
cv::Mat result3 = cv::Mat::zeros(matrixSize, CV_64F);
double t3 = (double) cv::getTickCount();
for(int cpt = 0; cpt < nbIterations; cpt++)
{
for(size_t i = 0; i < nbMatrices; i++)
{
ScaleMatAndSum_ptr(result3, vectorOfMatrices[i], vectorOfMul[i]);
}
}
t3 = ((double) cv::getTickCount() - t3) / cv::getTickFrequency();
cv::Scalar sum3 = cv::sum(result3);
std::cout << "Pointer access: sum3=" << sum3 << " ; t3=" << t3 << " s" << std::endl;
//Use ptr access + unroll loop
cv::Mat result4 = cv::Mat::zeros(matrixSize, CV_64F);
double t4 = (double) cv::getTickCount();
for(int cpt = 0; cpt < nbIterations; cpt++)
{
for(size_t i = 0; i < nbMatrices; i++)
{
ScaleMatAndSum_ptr_unroll_loop(result4, vectorOfMatrices[i], vectorOfMul[i]);
}
}
t4 = ((double) cv::getTickCount() - t4) / cv::getTickFrequency();
cv::Scalar sum4 = cv::sum(result4);
std::cout << "Pointer access + unroll loop: sum4=" << sum4 << " ; t4=" << t4 << " s" << std::endl;
//Use parallel for + ptr access + unroll loop
cv::Mat result5 = cv::Mat::zeros(matrixSize, CV_64F);
double t5 = (double) cv::getTickCount();
for(int cpt = 0; cpt < nbIterations; cpt++)
{
for(size_t i = 0; i < nbMatrices; i++)
{
cv::parallel_for_(cv::Range(0, (int) result5.rows), Parallel_ScaleAndSum<double>(&result5, vectorOfMatrices[i], vectorOfMul[i]));
}
}
t5 = ((double) cv::getTickCount() - t5) / cv::getTickFrequency();
cv::Scalar sum5 = cv::sum(result5);
std::cout << "Pointer access + unroll loop + parallel_for_: sum5=" << sum5 << " ; t5=" << t5 << " s" << std::endl;
//Check results
bool equal12 = areEqual(result1, result2);
bool equal13 = areEqual(result1, result3);
bool equal14 = areEqual(result1, result4);
bool equal15 = areEqual(result1, result5);
std::cout << "equal12=" << equal12 << " ; equal13=" << equal13 << " ; equal14=" << equal14 << " ; equal15=" << equal15 << std::endl;
return 0;
}
Other solutions could be:
2 | No.2 Revision |
The following are my guesses as a non expert.
I think that in the first case, the product val * mats[m]
will produce a temporary mat variable with the appropriate size whereas mats.at< Type >( r, c ) * val
needs only a temporary Type variable.
What library did you use for multithreading your second code? Is it 8 times faster when you compare the second code multi-threaded vs your first code multi-threaded or vs your first code single-threaded? Also, what is the size of your matrix as some optimization methods are effective only on large data?
Here some comparisons / attempts to improve the performance:
at
: What is the most effective way to access cv::Mat elements in a loop?parallel_for_
in addition: How to use parallel_for?The results are on my computer (50 iterations, 100 matrices of size: 1000x800):
Original method: sum1=[5.0812e+009, 0, 0, 0] ; t1=8.87578 s
Matrix method: sum2=[5.0812e+009, 0, 0, 0] ; t2=17.1907 s
Pointer access: sum3=[5.0812e+009, 0, 0, 0] ; t3=5.40258 s
Pointer access + unroll loop: sum4=[5.0812e+009, 0, 0, 0] ; t4=5.24404 s
Pointer access + unroll loop + parallel_for_: sum5=[5.0812e+009, 0, 0, 0] ; t5=4.79474 s
The best improvment seems to be achieved when switching from matrix multiplication with a scalar to iterating and perform directly the multiplication on matrix elements (17.1907 s vs 8.87578 s).
Switching to pointer access gives also a resonable improvment (8.87578 s vs 5.40258 s).
The code I used for benchmarking:
#include <opencv2/opencv.hpp>
//Original method
template< typename Type >
void ScaleMatAndSum( cv::Mat& scale, const cv::Mat& mats, const Type val )
{
for( int r = 0; r < mats.rows; r++ )
{
for( int c = 0; c < mats.cols; c++ )
{
scale.at< Type >( r, c ) += mats.at< Type >( r, c ) * val;
}
}
}
//Pointer access
template< typename Type >
void ScaleMatAndSum_ptr( cv::Mat& scale, const cv::Mat& mats, const Type val )
{
Type *ptr_scale_rows;
const Type *ptr_mat_rows;
for( int r = 0; r < mats.rows; r++ )
{
ptr_scale_rows = scale.ptr<Type>(r);
ptr_mat_rows = mats.ptr<Type>(r);
for( int c = 0; c < mats.cols; c++ )
{
ptr_scale_rows[c] += ptr_mat_rows[c] * val;
}
}
}
//Pointer access + unroll loop
template< typename Type >
void ScaleMatAndSum_ptr_unroll_loop( cv::Mat& scale, const cv::Mat& mats, const Type val )
{
Type *ptr_scale_rows;
const Type *ptr_mat_rows;
for( int r = 0; r < mats.rows; r++ )
{
ptr_scale_rows = scale.ptr<Type>(r);
ptr_mat_rows = mats.ptr<Type>(r);
for( int c = 0; c < mats.cols; c += 4 )
{
ptr_scale_rows[c] += ptr_mat_rows[c] * val;
ptr_scale_rows[c+1] += ptr_mat_rows[c+1] * val;
ptr_scale_rows[c+2] += ptr_mat_rows[c+2] * val;
ptr_scale_rows[c+3] += ptr_mat_rows[c+3] * val;
}
}
}
//Pointer access + unroll loop + ParallelLoopBody
template <class Type>
class Parallel_ScaleAndSum: public cv::ParallelLoopBody
{
private:
cv::Mat m_mat;
Type m_mul;
cv::Mat *m_result;
public:
Parallel_ScaleAndSum(cv::Mat *result, const cv::Mat &mat, const Type &mul)
: m_mat(mat), m_mul(mul), m_result(result)
{
}
virtual void operator()( const cv::Range &range ) const
{
Type *ptr_scale_rows;
const Type *ptr_mat_rows;
for(int r = range.start; r < range.end; r++)
{
ptr_scale_rows = m_result->ptr<Type>(r);
ptr_mat_rows = m_mat.ptr<Type>(r);
for(int c = 0; c < m_mat.cols; c += 4)
{
ptr_scale_rows[c] += ptr_mat_rows[c] * m_mul;
ptr_scale_rows[c+1] += ptr_mat_rows[c+1] * m_mul;
ptr_scale_rows[c+2] += ptr_mat_rows[c+2] * m_mul;
ptr_scale_rows[c+3] += ptr_mat_rows[c+3] * m_mul;
}
}
}
};
//@url=https://stackoverflow.com/questions/9905093/how-to-check-whether-two-matrixes-are-identical-in-opencv
bool areEqual(const cv::Mat& a, const cv::Mat& b) {
cv::Mat temp;
cv::bitwise_xor(a,b,temp); //It vectorizes well with SSE/NEON
return !( cv::countNonZero(temp) );
}
int main()
{
cv::RNG rng(12345);
int nbIterations = 50;
size_t nbMatrices = 100;
std::vector<cv::Mat> vectorOfMatrices(nbMatrices);
std::vector<double> vectorOfMul(nbMatrices);
cv::Size matrixSize(1000, 800);
for(size_t i = 0; i < nbMatrices; i++)
{
//Generate random matrix
cv::Mat m(matrixSize, CV_64F);
rng.fill(m, cv::RNG::UNIFORM, -5000.0, 5000.0);
vectorOfMatrices[i] = m;
//Generate random double scalar between -20.0 and +20.0
vectorOfMul[i] = rng.uniform(-20.0, 20.0);
}
//Original method
cv::Mat result1 = cv::Mat::zeros(matrixSize, CV_64F);
double t1 = (double) cv::getTickCount();
for(int cpt = 0; cpt < nbIterations; cpt++)
{
for(size_t i = 0; i < nbMatrices; i++)
{
ScaleMatAndSum(result1, vectorOfMatrices[i], vectorOfMul[i]);
}
}
t1 = ((double) cv::getTickCount() - t1) / cv::getTickFrequency();
cv::Scalar sum1 = cv::sum(result1);
std::cout << "Original method: sum1=" << sum1 << " ; t1=" << t1 << " s" << std::endl;
//Matrix method
cv::Mat result2 = cv::Mat::zeros(matrixSize, CV_64F);
double t2 = (double) cv::getTickCount();
for(int cpt = 0; cpt < nbIterations; cpt++)
{
for(size_t i = 0; i < nbMatrices; i++)
{
result2 += vectorOfMatrices[i]*vectorOfMul[i];
}
}
t2 = ((double) cv::getTickCount() - t2) / cv::getTickFrequency();
cv::Scalar sum2 = cv::sum(result2);
std::cout << "Matrix method: sum2=" << sum2 << " ; t2=" << t2 << " s" << std::endl;
//Use ptr access
cv::Mat result3 = cv::Mat::zeros(matrixSize, CV_64F);
double t3 = (double) cv::getTickCount();
for(int cpt = 0; cpt < nbIterations; cpt++)
{
for(size_t i = 0; i < nbMatrices; i++)
{
ScaleMatAndSum_ptr(result3, vectorOfMatrices[i], vectorOfMul[i]);
}
}
t3 = ((double) cv::getTickCount() - t3) / cv::getTickFrequency();
cv::Scalar sum3 = cv::sum(result3);
std::cout << "Pointer access: sum3=" << sum3 << " ; t3=" << t3 << " s" << std::endl;
//Use ptr access + unroll loop
cv::Mat result4 = cv::Mat::zeros(matrixSize, CV_64F);
double t4 = (double) cv::getTickCount();
for(int cpt = 0; cpt < nbIterations; cpt++)
{
for(size_t i = 0; i < nbMatrices; i++)
{
ScaleMatAndSum_ptr_unroll_loop(result4, vectorOfMatrices[i], vectorOfMul[i]);
}
}
t4 = ((double) cv::getTickCount() - t4) / cv::getTickFrequency();
cv::Scalar sum4 = cv::sum(result4);
std::cout << "Pointer access + unroll loop: sum4=" << sum4 << " ; t4=" << t4 << " s" << std::endl;
//Use parallel for + ptr access + unroll loop
cv::Mat result5 = cv::Mat::zeros(matrixSize, CV_64F);
double t5 = (double) cv::getTickCount();
for(int cpt = 0; cpt < nbIterations; cpt++)
{
for(size_t i = 0; i < nbMatrices; i++)
{
cv::parallel_for_(cv::Range(0, (int) result5.rows), Parallel_ScaleAndSum<double>(&result5, vectorOfMatrices[i], vectorOfMul[i]));
}
}
t5 = ((double) cv::getTickCount() - t5) / cv::getTickFrequency();
cv::Scalar sum5 = cv::sum(result5);
std::cout << "Pointer access + unroll loop + parallel_for_: sum5=" << sum5 << " ; t5=" << t5 << " s" << std::endl;
//Check results
bool equal12 = areEqual(result1, result2);
bool equal13 = areEqual(result1, result3);
bool equal14 = areEqual(result1, result4);
bool equal15 = areEqual(result1, result5);
std::cout << "equal12=" << equal12 << " ; equal13=" << equal13 << " ; equal14=" << equal14 << " ; equal15=" << equal15 << std::endl;
return 0;
}
Other solutions could be:
Finally, the best is to test the different methods on your real data as the performances will depend mostly on the size of your data and your hardware.