Revision history - OpenCV Q&A Forum

I made a quick test for Canny and GaussianBlur. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP image. The results are:

Canny:

1 thread 13.5ms
2 threads 9ms
4 threads 6.5ms
8 threads 6.5ms

And for GaussianBlur:

1 thread: 4ms
2 threads: 3ms
4 threads: 2ms
8 threads: 1ms

For parallel Canny I used this implementation:

// Loop body for cv::parallel_for_ that blurs one horizontal stripe of an image.
//
// Only references to the source and destination matrices are stored, so both
// must outlive the parallel_for_ call that runs this body.
class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    // _src, _dst        full-size input and output matrices (dst is expected to be
    //                   allocated by the caller; this class never allocates it)
    // _kSize            Gaussian kernel size, forwarded to cv::GaussianBlur
    // _sigmaX, _sigmaY  Gaussian standard deviations, forwarded to cv::GaussianBlur
    // _borderType       border mode, forwarded to cv::GaussianBlur
    ParallelGaussianBlurImpl_(const Mat &_src, Mat &_dst, Size _kSize, double _sigmaX, double _sigmaY, int _borderType) :
        src(_src), dst(_dst), kSize(_kSize), sigmaX(_sigmaX), sigmaY(_sigmaY), borderType(_borderType)
        {}

    // No-op assignment. The reference members cannot be rebound, so the compiler
    // cannot generate one itself (presumably declared only to silence that warning).
    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&) { return *this; }

    // Blurs rows [r.start, r.end) of src into the same rows of dst.
    inline void operator()(const Range &r) const {
        cv::GaussianBlur(src.rowRange(r.start, r.end), dst.rowRange(r.start, r.end), kSize, sigmaX, sigmaY, borderType);
    }

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};

// Gaussian blur that filters the image in parallel row stripes.
//
// Takes the same arguments as cv::GaussianBlur. A single plain cv::GaussianBlur
// call is used instead of the striped path when
//  - OpenCL is in use or only one thread is available,
//  - BORDER_ISOLATED is requested (NOTE(review): presumably because every stripe
//    is a ROI that would then stop reading its neighbouring rows - confirm), or
//  - source and destination share a buffer (e.g. an in-place call): stripes
//    running concurrently would read rows that another stripe is overwriting.
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));

    if (numThreads == 1 || (borderType & BORDER_ISOLATED) != 0) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    Mat src = _src.getMat();
    _dst.create(src.size(), src.type());
    Mat dst = _dst.getMat();

    // Same underlying allocation: striping would race, so filter in one call.
    if (!src.empty() && src.datastart == dst.datastart) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    // numThreads is passed as the stripe-count hint of parallel_for_.
    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}

It seems that Canny does not scale as well with hyperthreading as GaussianBlur does. It's possible that other processes skewed the measurements a bit, but the trend is clear. Did you test with setNumThreads(), or just with different builds?

I made a quick test for Canny and GaussianBlur. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP image. The results are:

Canny:

1 thread 13.5ms
2 threads 9ms
4 threads 6.5ms
8 threads 6.5ms

And for GaussianBlur:

1 thread: 4ms
2 threads: 3ms
4 threads: 2ms
8 threads: 1ms

For parallel ~~Canny~~ GaussianBlur I used this implementation:

// Loop body for cv::parallel_for_ that blurs one horizontal stripe of an image.
//
// Only references to the source and destination matrices are stored, so both
// must outlive the parallel_for_ call that runs this body.
class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    // _src, _dst        full-size input and output matrices (dst is expected to be
    //                   allocated by the caller; this class never allocates it)
    // _kSize            Gaussian kernel size, forwarded to cv::GaussianBlur
    // _sigmaX, _sigmaY  Gaussian standard deviations, forwarded to cv::GaussianBlur
    // _borderType       border mode, forwarded to cv::GaussianBlur
    ParallelGaussianBlurImpl_(const Mat &_src, Mat &_dst, Size _kSize, double _sigmaX, double _sigmaY, int _borderType) :
        src(_src), dst(_dst), kSize(_kSize), sigmaX(_sigmaX), sigmaY(_sigmaY), borderType(_borderType)
        {}

    // No-op assignment. The reference members cannot be rebound, so the compiler
    // cannot generate one itself (presumably declared only to silence that warning).
    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&) { return *this; }

    // Blurs rows [r.start, r.end) of src into the same rows of dst.
    inline void operator()(const Range &r) const {
        cv::GaussianBlur(src.rowRange(r.start, r.end), dst.rowRange(r.start, r.end), kSize, sigmaX, sigmaY, borderType);
    }

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};

// Gaussian blur that filters the image in parallel row stripes.
//
// Takes the same arguments as cv::GaussianBlur. A single plain cv::GaussianBlur
// call is used instead of the striped path when
//  - OpenCL is in use or only one thread is available,
//  - BORDER_ISOLATED is requested (NOTE(review): presumably because every stripe
//    is a ROI that would then stop reading its neighbouring rows - confirm), or
//  - source and destination share a buffer (e.g. an in-place call): stripes
//    running concurrently would read rows that another stripe is overwriting.
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));

    if (numThreads == 1 || (borderType & BORDER_ISOLATED) != 0) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    Mat src = _src.getMat();
    _dst.create(src.size(), src.type());
    Mat dst = _dst.getMat();

    // Same underlying allocation: striping would race, so filter in one call.
    if (!src.empty() && src.datastart == dst.datastart) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    // numThreads is passed as the stripe-count hint of parallel_for_.
    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}

It seems that Canny does not scale as well with hyperthreading as GaussianBlur does. It's possible that other processes skewed the measurements a bit, but the trend is clear. Did you test with setNumThreads(), or just with different builds?

I made a quick test for Canny and GaussianBlur. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP grayscale image. The results are:

Canny:

1 thread 13.5ms
2 threads 9ms
4 threads 6.5ms
8 threads 6.5ms

And for GaussianBlur:

1 thread: 4ms
2 threads: 3ms
4 threads: 2ms
8 threads: 1ms

For parallel GaussianBlur I used this implementation:

// Loop body for cv::parallel_for_ that blurs one horizontal stripe of an image.
//
// Only references to the source and destination matrices are stored, so both
// must outlive the parallel_for_ call that runs this body.
class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    // _src, _dst        full-size input and output matrices (dst is expected to be
    //                   allocated by the caller; this class never allocates it)
    // _kSize            Gaussian kernel size, forwarded to cv::GaussianBlur
    // _sigmaX, _sigmaY  Gaussian standard deviations, forwarded to cv::GaussianBlur
    // _borderType       border mode, forwarded to cv::GaussianBlur
    ParallelGaussianBlurImpl_(const Mat &_src, Mat &_dst, Size _kSize, double _sigmaX, double _sigmaY, int _borderType) :
        src(_src), dst(_dst), kSize(_kSize), sigmaX(_sigmaX), sigmaY(_sigmaY), borderType(_borderType)
        {}

    // No-op assignment. The reference members cannot be rebound, so the compiler
    // cannot generate one itself (presumably declared only to silence that warning).
    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&) { return *this; }

    // Blurs rows [r.start, r.end) of src into the same rows of dst.
    inline void operator()(const Range &r) const {
        cv::GaussianBlur(src.rowRange(r.start, r.end), dst.rowRange(r.start, r.end), kSize, sigmaX, sigmaY, borderType);
    }

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};

// Gaussian blur that filters the image in parallel row stripes.
//
// Takes the same arguments as cv::GaussianBlur. A single plain cv::GaussianBlur
// call is used instead of the striped path when
//  - OpenCL is in use or only one thread is available,
//  - BORDER_ISOLATED is requested (NOTE(review): presumably because every stripe
//    is a ROI that would then stop reading its neighbouring rows - confirm), or
//  - source and destination share a buffer (e.g. an in-place call): stripes
//    running concurrently would read rows that another stripe is overwriting.
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));

    if (numThreads == 1 || (borderType & BORDER_ISOLATED) != 0) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    Mat src = _src.getMat();
    _dst.create(src.size(), src.type());
    Mat dst = _dst.getMat();

    // Same underlying allocation: striping would race, so filter in one call.
    if (!src.empty() && src.datastart == dst.datastart) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    // numThreads is passed as the stripe-count hint of parallel_for_.
    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}

It seems that Canny does not scale as well with hyperthreading as GaussianBlur does. It's possible that other processes skewed the measurements a bit, but the trend is clear. Did you test with setNumThreads(), or just with different builds?

I made a quick test (no loops, just a few manual iterations) for Canny and ~~GaussianBlur.~~ GaussianBlur inside my image processing library at work. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP grayscale image. The results are:

Canny:

1 thread 13.5ms
2 threads 9ms
4 threads 6.5ms
8 threads 6.5ms

And for GaussianBlur:

1 thread: 4ms
2 threads: 3ms
4 threads: 2ms
8 threads: 1ms

For parallel GaussianBlur I used this implementation:

// Loop body for cv::parallel_for_ that blurs one horizontal stripe of an image.
//
// Only references to the source and destination matrices are stored, so both
// must outlive the parallel_for_ call that runs this body.
class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    // _src, _dst        full-size input and output matrices (dst is expected to be
    //                   allocated by the caller; this class never allocates it)
    // _kSize            Gaussian kernel size, forwarded to cv::GaussianBlur
    // _sigmaX, _sigmaY  Gaussian standard deviations, forwarded to cv::GaussianBlur
    // _borderType       border mode, forwarded to cv::GaussianBlur
    ParallelGaussianBlurImpl_(const Mat &_src, Mat &_dst, Size _kSize, double _sigmaX, double _sigmaY, int _borderType) :
        src(_src), dst(_dst), kSize(_kSize), sigmaX(_sigmaX), sigmaY(_sigmaY), borderType(_borderType)
        {}

    // No-op assignment. The reference members cannot be rebound, so the compiler
    // cannot generate one itself (presumably declared only to silence that warning).
    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&) { return *this; }

    // Blurs rows [r.start, r.end) of src into the same rows of dst.
    inline void operator()(const Range &r) const {
        cv::GaussianBlur(src.rowRange(r.start, r.end), dst.rowRange(r.start, r.end), kSize, sigmaX, sigmaY, borderType);
    }

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};

// Gaussian blur that filters the image in parallel row stripes.
//
// Takes the same arguments as cv::GaussianBlur. A single plain cv::GaussianBlur
// call is used instead of the striped path when
//  - OpenCL is in use or only one thread is available,
//  - BORDER_ISOLATED is requested (NOTE(review): presumably because every stripe
//    is a ROI that would then stop reading its neighbouring rows - confirm), or
//  - source and destination share a buffer (e.g. an in-place call): stripes
//    running concurrently would read rows that another stripe is overwriting.
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));

    if (numThreads == 1 || (borderType & BORDER_ISOLATED) != 0) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    Mat src = _src.getMat();
    _dst.create(src.size(), src.type());
    Mat dst = _dst.getMat();

    // Same underlying allocation: striping would race, so filter in one call.
    if (!src.empty() && src.datastart == dst.datastart) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    // numThreads is passed as the stripe-count hint of parallel_for_.
    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}

It seems that Canny does not scale as well with hyperthreading as GaussianBlur does. It's possible that other processes skewed the measurements a bit, but the trend is clear. Did you test with setNumThreads(), or just with different builds?

I made a quick test (no loops, just a few manual iterations) for Canny and GaussianBlur inside my image processing library at work. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP grayscale image. The results are:

Canny:

1 thread 13.5ms
2 threads 9ms
4 threads 6.5ms
8 threads 6.5ms

And for GaussianBlur:

1 thread: 4ms
2 threads: 3ms
4 threads: 2ms
8 threads: 1ms

For parallel GaussianBlur I used this implementation:

// Loop body for cv::parallel_for_ that blurs one horizontal stripe of an image.
//
// Only references to the source and destination matrices are stored, so both
// must outlive the parallel_for_ call that runs this body.
class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    // _src, _dst        full-size input and output matrices (dst is expected to be
    //                   allocated by the caller; this class never allocates it)
    // _kSize            Gaussian kernel size, forwarded to cv::GaussianBlur
    // _sigmaX, _sigmaY  Gaussian standard deviations, forwarded to cv::GaussianBlur
    // _borderType       border mode, forwarded to cv::GaussianBlur
    ParallelGaussianBlurImpl_(const Mat &_src, Mat &_dst, Size _kSize, double _sigmaX, double _sigmaY, int _borderType) :
        src(_src), dst(_dst), kSize(_kSize), sigmaX(_sigmaX), sigmaY(_sigmaY), borderType(_borderType)
        {}

    // No-op assignment. The reference members cannot be rebound, so the compiler
    // cannot generate one itself (presumably declared only to silence that warning).
    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&) { return *this; }

    // Blurs rows [r.start, r.end) of src into the same rows of dst.
    inline void operator()(const Range &r) const {
        cv::GaussianBlur(src.rowRange(r.start, r.end), dst.rowRange(r.start, r.end), kSize, sigmaX, sigmaY, borderType);
    }

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};

// Gaussian blur that filters the image in parallel row stripes.
//
// Takes the same arguments as cv::GaussianBlur. A single plain cv::GaussianBlur
// call is used instead of the striped path when
//  - OpenCL is in use or only one thread is available,
//  - BORDER_ISOLATED is requested (NOTE(review): presumably because every stripe
//    is a ROI that would then stop reading its neighbouring rows - confirm), or
//  - source and destination share a buffer (e.g. an in-place call): stripes
//    running concurrently would read rows that another stripe is overwriting.
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));

    if (numThreads == 1 || (borderType & BORDER_ISOLATED) != 0) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    Mat src = _src.getMat();
    _dst.create(src.size(), src.type());
    Mat dst = _dst.getMat();

    // Same underlying allocation: striping would race, so filter in one call.
    if (!src.empty() && src.datastart == dst.datastart) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    // numThreads is passed as the stripe-count hint of parallel_for_.
    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}

It seems that Canny does not scale as well with hyperthreading as GaussianBlur does. It's possible that other processes skewed the measurements a bit, but the trend is clear. Did you test with setNumThreads(), or just with different builds?

I made a quick test (no loops, just a few manual iterations) for Canny and GaussianBlur inside my image processing library at work. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP grayscale image. The results are:

Canny:

1 thread 13.5ms
2 threads 9ms
4 threads 6.5ms
8 threads 6.5ms

And for GaussianBlur:

1 thread: 4ms
2 threads: 3ms
4 threads: 2ms
8 threads: 1ms

For parallel GaussianBlur I used this implementation:

class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    ParallelGaussianBlurImpl_(const Mat &_src, Mat &_dst, Size _kSize, double _sigmaX, double _sigmaY, int _borderType) :
        src(_src), dst(_dst), kSize(_kSize), sigmaX(_sigmaX), sigmaY(_sigmaY), borderType(_borderType)
        {} 

    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&) { return *this; } 

    inline void operator()(const Range &r) const {
        cv::GaussianBlur(src.rowRange(r.start, r.end), dst.rowRange(r.start, r.end), kSize, sigmaX, sigmaY, borderType);
    } 

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};

// Gaussian blur that filters the image in parallel row stripes.
//
// Takes the same arguments as cv::GaussianBlur. A single plain cv::GaussianBlur
// call is used instead of the striped path when
//  - OpenCL is in use or only one thread is available,
//  - BORDER_ISOLATED is requested (NOTE(review): presumably because every stripe
//    is a ROI that would then stop reading its neighbouring rows - confirm), or
//  - source and destination share a buffer (e.g. an in-place call): stripes
//    running concurrently would read rows that another stripe is overwriting.
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));

    if (numThreads == 1 || (borderType & BORDER_ISOLATED) != 0) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    Mat src = _src.getMat();
    _dst.create(src.size(), src.type());
    Mat dst = _dst.getMat();

    // Same underlying allocation: striping would race, so filter in one call.
    if (!src.empty() && src.datastart == dst.datastart) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    // numThreads is passed as the stripe-count hint of parallel_for_.
    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}

It seems that Canny does not scale as well with hyperthreading as GaussianBlur does. It's possible that other processes skewed the measurements a bit, but the trend is clear. Did you test with setNumThreads(), or just with different builds?

EDIT: One more thing that comes to mind: you said that you are running in a VM. Check that the number of cores assigned to your VM is greater than 1. Otherwise I have no more ideas.

I made a quick test (no loops, just a few manual iterations) for Canny and GaussianBlur inside my image processing library at work. My system is an i7-7700 with 4 cores and 8 threads, running Windows with Visual Studio 2015. I used a random 4 MP grayscale image. The results are:

Canny:

1 thread 13.5ms
2 threads 9ms
4 threads 6.5ms
8 threads 6.5ms

And for GaussianBlur:

1 thread: 4ms
2 threads: 3ms
4 threads: 2ms
8 threads: 1ms

For parallel GaussianBlur I used this implementation:

class ParallelGaussianBlurImpl_ : public ParallelLoopBody
{
public:
    ParallelGaussianBlurImpl_(const Mat &_src, Mat &_dst, Size _kSize, double _sigmaX, double _sigmaY, int _borderType) :
        src(_src), dst(_dst), kSize(_kSize), sigmaX(_sigmaX), sigmaY(_sigmaY), borderType(_borderType)
        {} 

    ParallelGaussianBlurImpl_& operator=(const ParallelGaussianBlurImpl_&) { return *this; } 

    inline void operator()(const Range &r) const {
        cv::GaussianBlur(src.rowRange(r.start, r.end), dst.rowRange(r.start, r.end), kSize, sigmaX, sigmaY, borderType);
    } 

private:
    const Mat &src;
    Mat &dst;
    Size kSize;
    double sigmaX, sigmaY;
    int borderType;
};

// Gaussian blur that filters the image in parallel row stripes.
//
// Takes the same arguments as cv::GaussianBlur. A single plain cv::GaussianBlur
// call is used instead of the striped path when
//  - OpenCL is in use or only one thread is available,
//  - BORDER_ISOLATED is requested (NOTE(review): presumably because every stripe
//    is a ROI that would then stop reading its neighbouring rows - confirm), or
//  - source and destination share a buffer (e.g. an in-place call): stripes
//    running concurrently would read rows that another stripe is overwriting.
void parallelGaussianBlur(InputArray _src, OutputArray _dst, Size kSize, double sigmaX, double sigmaY, int borderType) {
    const int numThreads = ocl::useOpenCL() ? 1 : max(1, min(getNumThreads(), getNumberOfCPUs()));

    if (numThreads == 1 || (borderType & BORDER_ISOLATED) != 0) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    Mat src = _src.getMat();
    _dst.create(src.size(), src.type());
    Mat dst = _dst.getMat();

    // Same underlying allocation: striping would race, so filter in one call.
    if (!src.empty() && src.datastart == dst.datastart) {
        cv::GaussianBlur(_src, _dst, kSize, sigmaX, sigmaY, borderType);
        return;
    }

    // numThreads is passed as the stripe-count hint of parallel_for_.
    parallel_for_(Range(0, src.rows), ParallelGaussianBlurImpl_(src, dst, kSize, sigmaX, sigmaY, borderType), numThreads);
}

It seems that Canny does not scale as well with hyperthreading as GaussianBlur does. It's possible that other processes skewed the measurements a bit, but the trend is clear. Did you test with setNumThreads(), or just with different builds?

EDIT: One more thing that comes to mind: you said that you are ~~running~~ working in a VM. Check that the number of cores assigned to your VM is greater than 1. Otherwise I have no more ideas.

Revision history [back]