I am running this tutorial example:
int mandelbrot(const std::complex<float> &z0, const int max) {
std::complex<float> z = z0;
for(int t = 0; t < max; t++) {
if(z.real()*z.real() + z.imag()*z.imag() > 4.0f) return t;
z = z * z + z0;
}
return max;
}
int mandelbrotFormula(const std::complex<float> &z0, const int maxIter = 500) {
int value = mandelbrot(z0, maxIter);
if(maxIter - value == 0) {
return 0;
}
return cvRound(sqrt(value / (float)maxIter) * 255);
}
void test(){
cv::Mat mandelbrotImg(4800, 5400, CV_8U);
float x1 = -2.1f, x2 = 0.6f;
float y1 = -1.2f, y2 = 1.2f;
float scaleX = mandelbrotImg.cols / (x2 - x1);
float scaleY = mandelbrotImg.rows / (y2 - y1);
cv::parallel_for_(cv::Range(0, mandelbrotImg.rows*mandelbrotImg.cols), [&](const cv::Range& range) {
for(int r = range.start; r < range.end; r++) {
int i = r / mandelbrotImg.cols;
int j = r % mandelbrotImg.cols;
float x0 = j / scaleX + x1;
float y0 = i / scaleY + y1;
std::complex<float> z0(x0, y0);
uchar value = (uchar)mandelbrotFormula(z0);
mandelbrotImg.ptr<uchar>(i)[j] = value;
}
});
}
If I call cv::setNumThreads(0);
, the code run in single call and range is between 0 and number of pixels. This is correct behavior. However, when I run with cv::setNumThreads(4);, the lambda is called N times(where N is the number of the pixels) and range is always single item (e.g. [0,1] , [1,2] ...).
This makes the parallel version much slower than the serial version (10x slower).
Any idea why this is happening?