Ask Your Question

Revision history [back]

OpenCV TBB: matchTemplate optimization

I am trying to use OpenCV with TBB to speed up the matchTemplate function, but I cannot understand why the performance of the parallel version is almost the same as, or in most cases even worse than, the simple non-parallel version. I have tried to measure the overhead of the parallel processing, and those numbers are amazingly small, so I cannot figure out why performance is not improved. Here is all the code; you can just copy and run it.

#include "stdafx.h"
#include "opencv2\opencv.hpp"

using namespace cv;
using namespace std;

// Loop body for cv::parallel_for_ that runs matchTemplate on horizontal strips
// of an image. Loop index i selects strip i out of `diff` strips; each strip
// writes its part of the response map directly into the shared result matrix.
//
// NOTE(review): assumes outImage is already allocated as a single-channel
// float map of size (image - patch + 1), so that matchTemplate can write into
// each strip view in place instead of allocating a new buffer - confirm at
// the call site.
class Parallel_process : public cv::ParallelLoopBody
{

private:
    cv::Mat img;        // search image (Mat header copy of the caller's image)
    cv::Mat& retVal;    // response map owned by the caller, written strip by strip
    Mat patchVal;       // template to search for
    int size;           // stored from sizeVal; not read by operator()
    int diff;           // number of horizontal strips the result is split into

public:
    // inputImage : image to search in.
    // outImage   : response map to fill; must outlive this object (held by reference).
    // patch      : template image.
    // sizeVal    : kept for interface compatibility; currently unused.
    // diffVal    : number of strips; valid loop indices are 0 .. diffVal-1.
    Parallel_process(cv::Mat inputImage, cv::Mat& outImage, Mat patch,
                        int sizeVal, int diffVal)
                : img(inputImage), retVal(outImage), patchVal(patch),
                    size(sizeVal), diff(diffVal){}

    // Processes strips range.start .. range.end-1.
    virtual void operator()(const cv::Range& range) const
    {
        // Nothing sensible can be computed without at least one strip, and the
        // strip arithmetic below divides by diff.
        if (diff <= 0)
        {
            return;
        }

        for(int i = range.start; i < range.end; i++)
        {
            // Indices outside 0 .. diff-1 would address rows past the result.
            if (i < 0 || i >= diff)
            {
                continue;
            }

            // Strip i owns result rows [rowBegin, rowEnd). Computing both
            // bounds from the same formula makes the strips disjoint (no row
            // is written by two iterations) and makes them cover every row
            // even when the row count is not a multiple of diff.
            const int rowBegin = retVal.rows * i / diff;
            const int rowEnd = retVal.rows * (i + 1) / diff;
            const int stripRows = rowEnd - rowBegin;
            if (stripRows <= 0)
            {
                continue; // more strips than result rows: this one is empty
            }

            // A result row r needs input rows r .. r + patch.rows - 1, so the
            // input view is patch.rows - 1 rows taller than the output view.
            Mat in(img, Rect(0, rowBegin,
                             retVal.cols + patchVal.cols - 1,
                             stripRows + patchVal.rows - 1));
            Mat out(retVal, Rect(0, rowBegin, retVal.cols, stripRows));
            matchTemplate(in, patchVal, out, CV_TM_CCORR_NORMED);
        }
    }
};

int _tmain(int argc, _TCHAR* argv[])
{

    cv::VideoCapture cam(0);
    cv::Mat frame;
    cv::Mat img;
    bool stop = false;

    while (!stop) {
        if (!cam.read(frame)) {
            break;
        }
        img = frame.clone();

        Mat patch = frame(Rect(300, 200, 200, 200)); 
        Mat result; result.create(frame.size() - patch.size() + Size(1, 1), CV_32FC1);
        imshow("Main", frame);

        double duration;
        duration = static_cast<double>(cv::getTickCount());

        //choose type:
        // >> parallel version
        parallel_for_(Range(0, 8), Parallel_process(img, result, patch, 7, 8));
        // >> simple version
        //matchTemplate(img, patch, result, CV_TM_CCORR_NORMED);

        duration = static_cast<double>(cv::getTickCount()) - duration;
        duration = duration / cv::getTickFrequency(); // the elapsed time in ms
        cout << "Match Duration: " << duration << endl;

        imshow("Result", result);
        if (cv::waitKey(10) > 0) {stop = true;}
    }
    return(0);
}

You can choose which method to use for the matchTemplate calculation (parallel or non-parallel). By the way, when using the parallel version, I take just one part of the image (say 1/8, as defined by the "diff" variable) and do all of the calculations for that part only. That way, each process handles only one part of the image, namely 1/diff of it.

Could someone explain to me why this happens? I tried a similar thing with a Gaussian filter, and there the improvement is not huge (about 30%), but it is at least better...

Thanks a lot.