1 | initial version |
I made a quick test for Canny and GaussianBlur. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP image. The results are:
Canny:
And for GaussianBlur:
For parallel GaussianBlur I used this implementation:
/**
 * Loop body for parallel_for_ that runs cv::GaussianBlur on one band of rows.
 *
 * The object stores references to the source and destination matrices, so
 * both must outlive it. Each invocation blurs rows [r.start, r.end) of src
 * into the same rows of dst.
 */
class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    /**
     * @param _src        source image (held by reference)
     * @param _dst        destination image (held by reference)
     * @param _kSize      Gaussian kernel size, forwarded to cv::GaussianBlur
     * @param _sigmaX     standard deviation in X, forwarded to cv::GaussianBlur
     * @param _sigmaY     standard deviation in Y, forwarded to cv::GaussianBlur
     * @param _borderType border mode, forwarded to cv::GaussianBlur
     */
    ParallelGaussianBlurImpl_(const Mat &_src, Mat &_dst, Size _kSize, double _sigmaX, double _sigmaY, int _borderType) :
        src(_src), dst(_dst), kSize(_kSize), sigmaX(_sigmaX), sigmaY(_sigmaY), borderType(_borderType)
    {}

    // Assignment is a deliberate no-op: the reference members cannot be reseated.
    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&) { return *this; }

    // Blurs the row band described by r; called once per stripe by parallel_for_.
    inline void operator()(const Range &r) const {
        cv::GaussianBlur(src.rowRange(r.start, r.end), dst.rowRange(r.start, r.end), kSize, sigmaX, sigmaY, borderType);
    }

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};
/**
 * Gaussian-blurs _src into _dst, splitting the image into row bands that are
 * processed through parallel_for_.
 *
 * A single cv::GaussianBlur call is used instead when only one thread is
 * available (or OpenCL is in use), when BORDER_ISOLATED is set in borderType,
 * or when _src and _dst refer to the same pixel buffer.
 *
 * @param _src       source image
 * @param _dst       destination image; created with the size and type of _src
 * @param kSize      Gaussian kernel size
 * @param sigmaX     standard deviation in X
 * @param sigmaY     standard deviation in Y
 * @param borderType border mode, forwarded to cv::GaussianBlur
 */
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));
    bool runSerial = numThreads == 1 || (borderType & BORDER_ISOLATED) != 0;

    Mat src, dst;
    if(!runSerial) {
        _dst.create(_src.size(), _src.type());
        src = _src.getMat();
        dst = _dst.getMat();
        // In-place call: a band's filter may read rows of neighbouring bands,
        // which other threads would be overwriting at the same time. Only the
        // exact-alias case is detected here; partially overlapping ROIs are not.
        runSerial = src.data == dst.data;
    }

    if(runSerial) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}
It seems that Canny does not scale that well with hyperthreading, but GaussianBlur does. It's possible that other processes skewed the test a bit, but the trend is unambiguous. Did you run your tests with setNumThreads(), or just with different builds?
2 | No.2 Revision |
I made a quick test for Canny and GaussianBlur. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP image. The results are:
Canny:
And for GaussianBlur:
For parallel GaussianBlur I used this implementation:
/**
 * Loop body for parallel_for_ that runs cv::GaussianBlur on one band of rows.
 *
 * The object stores references to the source and destination matrices, so
 * both must outlive it. Each invocation blurs rows [r.start, r.end) of src
 * into the same rows of dst.
 */
class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    /**
     * @param _src        source image (held by reference)
     * @param _dst        destination image (held by reference)
     * @param _kSize      Gaussian kernel size, forwarded to cv::GaussianBlur
     * @param _sigmaX     standard deviation in X, forwarded to cv::GaussianBlur
     * @param _sigmaY     standard deviation in Y, forwarded to cv::GaussianBlur
     * @param _borderType border mode, forwarded to cv::GaussianBlur
     */
    ParallelGaussianBlurImpl_(const Mat &_src, Mat &_dst, Size _kSize, double _sigmaX, double _sigmaY, int _borderType) :
        src(_src), dst(_dst), kSize(_kSize), sigmaX(_sigmaX), sigmaY(_sigmaY), borderType(_borderType)
    {}

    // Assignment is a deliberate no-op: the reference members cannot be reseated.
    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&) { return *this; }

    // Blurs the row band described by r; called once per stripe by parallel_for_.
    inline void operator()(const Range &r) const {
        cv::GaussianBlur(src.rowRange(r.start, r.end), dst.rowRange(r.start, r.end), kSize, sigmaX, sigmaY, borderType);
    }

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};
/**
 * Gaussian-blurs _src into _dst, splitting the image into row bands that are
 * processed through parallel_for_.
 *
 * A single cv::GaussianBlur call is used instead when only one thread is
 * available (or OpenCL is in use), when BORDER_ISOLATED is set in borderType,
 * or when _src and _dst refer to the same pixel buffer.
 *
 * @param _src       source image
 * @param _dst       destination image; created with the size and type of _src
 * @param kSize      Gaussian kernel size
 * @param sigmaX     standard deviation in X
 * @param sigmaY     standard deviation in Y
 * @param borderType border mode, forwarded to cv::GaussianBlur
 */
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));
    bool runSerial = numThreads == 1 || (borderType & BORDER_ISOLATED) != 0;

    Mat src, dst;
    if(!runSerial) {
        _dst.create(_src.size(), _src.type());
        src = _src.getMat();
        dst = _dst.getMat();
        // In-place call: a band's filter may read rows of neighbouring bands,
        // which other threads would be overwriting at the same time. Only the
        // exact-alias case is detected here; partially overlapping ROIs are not.
        runSerial = src.data == dst.data;
    }

    if(runSerial) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}
It seems that Canny does not scale that well with hyperthreading, but GaussianBlur does. It's possible that other processes skewed the test a bit, but the trend is unambiguous. Did you run your tests with setNumThreads(), or just with different builds?
3 | No.3 Revision |
I made a quick test for Canny and GaussianBlur. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP grayscale image. The results are:
Canny:
And for GaussianBlur:
For parallel GaussianBlur I used this implementation:
/**
 * Loop body for parallel_for_ that runs cv::GaussianBlur on one band of rows.
 *
 * The object stores references to the source and destination matrices, so
 * both must outlive it. Each invocation blurs rows [r.start, r.end) of src
 * into the same rows of dst.
 */
class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    /**
     * @param _src        source image (held by reference)
     * @param _dst        destination image (held by reference)
     * @param _kSize      Gaussian kernel size, forwarded to cv::GaussianBlur
     * @param _sigmaX     standard deviation in X, forwarded to cv::GaussianBlur
     * @param _sigmaY     standard deviation in Y, forwarded to cv::GaussianBlur
     * @param _borderType border mode, forwarded to cv::GaussianBlur
     */
    ParallelGaussianBlurImpl_(const Mat &_src, Mat &_dst, Size _kSize, double _sigmaX, double _sigmaY, int _borderType) :
        src(_src), dst(_dst), kSize(_kSize), sigmaX(_sigmaX), sigmaY(_sigmaY), borderType(_borderType)
    {}

    // Assignment is a deliberate no-op: the reference members cannot be reseated.
    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&) { return *this; }

    // Blurs the row band described by r; called once per stripe by parallel_for_.
    inline void operator()(const Range &r) const {
        cv::GaussianBlur(src.rowRange(r.start, r.end), dst.rowRange(r.start, r.end), kSize, sigmaX, sigmaY, borderType);
    }

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};
/**
 * Gaussian-blurs _src into _dst, splitting the image into row bands that are
 * processed through parallel_for_.
 *
 * A single cv::GaussianBlur call is used instead when only one thread is
 * available (or OpenCL is in use), when BORDER_ISOLATED is set in borderType,
 * or when _src and _dst refer to the same pixel buffer.
 *
 * @param _src       source image
 * @param _dst       destination image; created with the size and type of _src
 * @param kSize      Gaussian kernel size
 * @param sigmaX     standard deviation in X
 * @param sigmaY     standard deviation in Y
 * @param borderType border mode, forwarded to cv::GaussianBlur
 */
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));
    bool runSerial = numThreads == 1 || (borderType & BORDER_ISOLATED) != 0;

    Mat src, dst;
    if(!runSerial) {
        _dst.create(_src.size(), _src.type());
        src = _src.getMat();
        dst = _dst.getMat();
        // In-place call: a band's filter may read rows of neighbouring bands,
        // which other threads would be overwriting at the same time. Only the
        // exact-alias case is detected here; partially overlapping ROIs are not.
        runSerial = src.data == dst.data;
    }

    if(runSerial) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}
It seems that Canny does not scale that well with hyperthreading, but GaussianBlur does. It's possible that other processes skewed the test a bit, but the trend is unambiguous. Did you run your tests with setNumThreads(), or just with different builds?
4 | No.4 Revision |
I made a quick test (no loops, just a few manual iterations) for Canny and GaussianBlur inside my image processing library at work. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP grayscale image. The results are:
Canny:
And for GaussianBlur:
For parallel GaussianBlur I used this implementation:
/**
 * Loop body for parallel_for_ that runs cv::GaussianBlur on one band of rows.
 *
 * The object stores references to the source and destination matrices, so
 * both must outlive it. Each invocation blurs rows [r.start, r.end) of src
 * into the same rows of dst.
 */
class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    /**
     * @param _src        source image (held by reference)
     * @param _dst        destination image (held by reference)
     * @param _kSize      Gaussian kernel size, forwarded to cv::GaussianBlur
     * @param _sigmaX     standard deviation in X, forwarded to cv::GaussianBlur
     * @param _sigmaY     standard deviation in Y, forwarded to cv::GaussianBlur
     * @param _borderType border mode, forwarded to cv::GaussianBlur
     */
    ParallelGaussianBlurImpl_(const Mat &_src, Mat &_dst, Size _kSize, double _sigmaX, double _sigmaY, int _borderType) :
        src(_src), dst(_dst), kSize(_kSize), sigmaX(_sigmaX), sigmaY(_sigmaY), borderType(_borderType)
    {}

    // Assignment is a deliberate no-op: the reference members cannot be reseated.
    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&) { return *this; }

    // Blurs the row band described by r; called once per stripe by parallel_for_.
    inline void operator()(const Range &r) const {
        cv::GaussianBlur(src.rowRange(r.start, r.end), dst.rowRange(r.start, r.end), kSize, sigmaX, sigmaY, borderType);
    }

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};
/**
 * Gaussian-blurs _src into _dst, splitting the image into row bands that are
 * processed through parallel_for_.
 *
 * A single cv::GaussianBlur call is used instead when only one thread is
 * available (or OpenCL is in use), when BORDER_ISOLATED is set in borderType,
 * or when _src and _dst refer to the same pixel buffer.
 *
 * @param _src       source image
 * @param _dst       destination image; created with the size and type of _src
 * @param kSize      Gaussian kernel size
 * @param sigmaX     standard deviation in X
 * @param sigmaY     standard deviation in Y
 * @param borderType border mode, forwarded to cv::GaussianBlur
 */
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));
    bool runSerial = numThreads == 1 || (borderType & BORDER_ISOLATED) != 0;

    Mat src, dst;
    if(!runSerial) {
        _dst.create(_src.size(), _src.type());
        src = _src.getMat();
        dst = _dst.getMat();
        // In-place call: a band's filter may read rows of neighbouring bands,
        // which other threads would be overwriting at the same time. Only the
        // exact-alias case is detected here; partially overlapping ROIs are not.
        runSerial = src.data == dst.data;
    }

    if(runSerial) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}
It seems that Canny does not scale that well with hyperthreading, but GaussianBlur does. It's possible that other processes skewed the test a bit, but the trend is unambiguous. Did you run your tests with setNumThreads(), or just with different builds?
5 | No.5 Revision |
I made a quick test (no loops, just a few manual iterations) for Canny and GaussianBlur inside my image processing library at work. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP grayscale image. The results are:
Canny:
And for GaussianBlur:
For parallel GaussianBlur I used this implementation:
/**
 * Loop body for parallel_for_ that runs cv::GaussianBlur on one band of rows.
 *
 * The object stores references to the source and destination matrices, so
 * both must outlive it. Each invocation blurs rows [r.start, r.end) of src
 * into the same rows of dst.
 */
class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    /**
     * @param _src        source image (held by reference)
     * @param _dst        destination image (held by reference)
     * @param _kSize      Gaussian kernel size, forwarded to cv::GaussianBlur
     * @param _sigmaX     standard deviation in X, forwarded to cv::GaussianBlur
     * @param _sigmaY     standard deviation in Y, forwarded to cv::GaussianBlur
     * @param _borderType border mode, forwarded to cv::GaussianBlur
     */
    ParallelGaussianBlurImpl_(const Mat &_src, Mat &_dst, Size _kSize, double _sigmaX, double _sigmaY, int _borderType) :
        src(_src), dst(_dst), kSize(_kSize), sigmaX(_sigmaX), sigmaY(_sigmaY), borderType(_borderType)
    {}

    // Assignment is a deliberate no-op: the reference members cannot be reseated.
    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&) { return *this; }

    // Blurs the row band described by r; called once per stripe by parallel_for_.
    inline void operator()(const Range &r) const {
        cv::GaussianBlur(src.rowRange(r.start, r.end), dst.rowRange(r.start, r.end), kSize, sigmaX, sigmaY, borderType);
    }

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};
/**
 * Gaussian-blurs _src into _dst, splitting the image into row bands that are
 * processed through parallel_for_.
 *
 * A single cv::GaussianBlur call is used instead when only one thread is
 * available (or OpenCL is in use), when BORDER_ISOLATED is set in borderType,
 * or when _src and _dst refer to the same pixel buffer.
 *
 * @param _src       source image
 * @param _dst       destination image; created with the size and type of _src
 * @param kSize      Gaussian kernel size
 * @param sigmaX     standard deviation in X
 * @param sigmaY     standard deviation in Y
 * @param borderType border mode, forwarded to cv::GaussianBlur
 */
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));
    bool runSerial = numThreads == 1 || (borderType & BORDER_ISOLATED) != 0;

    Mat src, dst;
    if(!runSerial) {
        _dst.create(_src.size(), _src.type());
        src = _src.getMat();
        dst = _dst.getMat();
        // In-place call: a band's filter may read rows of neighbouring bands,
        // which other threads would be overwriting at the same time. Only the
        // exact-alias case is detected here; partially overlapping ROIs are not.
        runSerial = src.data == dst.data;
    }

    if(runSerial) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}
It seems that Canny does not scale that well with hyperthreading, but GaussianBlur does. It's possible that other processes skewed the test a bit, but the trend is unambiguous. Did you run your tests with setNumThreads(), or just with different builds?
6 | No.6 Revision |
I made a quick test (no loops, just a few manual iterations) for Canny and GaussianBlur inside my image processing library at work. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP grayscale image. The results are:
Canny:
And for GaussianBlur:
For parallel GaussianBlur I used this implementation:
/**
 * parallel_for_ worker that applies cv::GaussianBlur to a horizontal band of
 * an image.
 *
 * Source and destination are kept as references, so the matrices passed to
 * the constructor must stay alive while the worker is in use.
 */
class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    // Captures the images by reference and the blur parameters by value.
    ParallelGaussianBlurImpl_(const Mat &input, Mat &output, Size kernel,
                              double sx, double sy, int border)
        : src(input),
          dst(output),
          kSize(kernel),
          sigmaX(sx),
          sigmaY(sy),
          borderType(border)
    {
    }

    // No-op assignment: nothing is copied and *this is returned unchanged.
    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&)
    {
        return *this;
    }

    // Blurs rows [rows.start, rows.end) of src into the same rows of dst.
    void operator()(const Range &rows) const
    {
        cv::GaussianBlur(src.rowRange(rows), dst.rowRange(rows),
                         kSize, sigmaX, sigmaY, borderType);
    }

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};
/**
 * Gaussian-blurs _src into _dst, splitting the image into row bands that are
 * processed through parallel_for_.
 *
 * A single cv::GaussianBlur call is used instead when only one thread is
 * available (or OpenCL is in use), when BORDER_ISOLATED is set in borderType,
 * or when _src and _dst refer to the same pixel buffer.
 *
 * @param _src       source image
 * @param _dst       destination image; created with the size and type of _src
 * @param kSize      Gaussian kernel size
 * @param sigmaX     standard deviation in X
 * @param sigmaY     standard deviation in Y
 * @param borderType border mode, forwarded to cv::GaussianBlur
 */
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));
    bool runSerial = numThreads == 1 || (borderType & BORDER_ISOLATED) != 0;

    Mat src, dst;
    if(!runSerial) {
        _dst.create(_src.size(), _src.type());
        src = _src.getMat();
        dst = _dst.getMat();
        // In-place call: a band's filter may read rows of neighbouring bands,
        // which other threads would be overwriting at the same time. Only the
        // exact-alias case is detected here; partially overlapping ROIs are not.
        runSerial = src.data == dst.data;
    }

    if(runSerial) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}
It seems that Canny does not scale that well with hyperthreading, but GaussianBlur does. It's possible that other processes skewed the test a bit, but the trend is unambiguous. Did you run your tests with setNumThreads(), or just with different builds?
EDIT: One more thing that comes to mind: you said that you are running in a VM. Check that the number of cores assigned to your VM is greater than 1. Otherwise I have no more ideas.
7 | No.7 Revision |
I made a quick test (no loops, just a few manual iterations) for Canny and GaussianBlur inside my image processing library at work. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP grayscale image. The results are:
Canny:
And for GaussianBlur:
For parallel GaussianBlur I used this implementation:
/**
 * parallel_for_ worker that applies cv::GaussianBlur to a horizontal band of
 * an image.
 *
 * Source and destination are kept as references, so the matrices passed to
 * the constructor must stay alive while the worker is in use.
 */
class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    // Captures the images by reference and the blur parameters by value.
    ParallelGaussianBlurImpl_(const Mat &input, Mat &output, Size kernel,
                              double sx, double sy, int border)
        : src(input),
          dst(output),
          kSize(kernel),
          sigmaX(sx),
          sigmaY(sy),
          borderType(border)
    {
    }

    // No-op assignment: nothing is copied and *this is returned unchanged.
    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&)
    {
        return *this;
    }

    // Blurs rows [rows.start, rows.end) of src into the same rows of dst.
    void operator()(const Range &rows) const
    {
        cv::GaussianBlur(src.rowRange(rows), dst.rowRange(rows),
                         kSize, sigmaX, sigmaY, borderType);
    }

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};
/**
 * Gaussian-blurs _src into _dst, splitting the image into row bands that are
 * processed through parallel_for_.
 *
 * A single cv::GaussianBlur call is used instead when only one thread is
 * available (or OpenCL is in use), when BORDER_ISOLATED is set in borderType,
 * or when _src and _dst refer to the same pixel buffer.
 *
 * @param _src       source image
 * @param _dst       destination image; created with the size and type of _src
 * @param kSize      Gaussian kernel size
 * @param sigmaX     standard deviation in X
 * @param sigmaY     standard deviation in Y
 * @param borderType border mode, forwarded to cv::GaussianBlur
 */
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));
    bool runSerial = numThreads == 1 || (borderType & BORDER_ISOLATED) != 0;

    Mat src, dst;
    if(!runSerial) {
        _dst.create(_src.size(), _src.type());
        src = _src.getMat();
        dst = _dst.getMat();
        // In-place call: a band's filter may read rows of neighbouring bands,
        // which other threads would be overwriting at the same time. Only the
        // exact-alias case is detected here; partially overlapping ROIs are not.
        runSerial = src.data == dst.data;
    }

    if(runSerial) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}
It seems that Canny does not scale that well with hyperthreading, but GaussianBlur does. It's possible that other processes skewed the test a bit, but the trend is unambiguous. Did you run your tests with setNumThreads(), or just with different builds?
EDIT: One more thing that comes to mind: you said that you are working in a VM. Check that the number of cores assigned to your VM is greater than 1. Otherwise I have no more ideas.