1 | initial version |
You could use threads just to run your algorithms, but don't expect faster performance because:
Below is simple example, here I'm comparing sequential vs parallel implementation.
I'm showing how to apply 2 different algorithms over the same frame, using 2 sequential calls and simple threading. The example below suffers from a poor threading implementation because thread construction introduces high overhead.
On my computer, results show that the sequential way is about 2 times faster than simple threading!
#include <thread>
// Edge detection: BGR -> grayscale -> Gaussian smoothing -> Canny edges.
void Algo1(const cv::Mat &src, cv::Mat *dst)
{
    const cv::Size blurKernel(7, 7);
    const double blurSigma = 1.5;
    cv::cvtColor(src, *dst, CV_BGR2GRAY);
    cv::GaussianBlur(*dst, *dst, blurKernel, blurSigma, blurSigma);
    cv::Canny(*dst, *dst, 0, 30, 3);
}
// Morphological gradient with a 3x3 rectangular structuring element.
void Algo2(const cv::Mat &src, cv::Mat *dst)
{
    const int radius = 1;
    const cv::Size kernelSize(2 * radius + 1, 2 * radius + 1);
    const cv::Point kernelAnchor(radius, radius);
    cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, kernelSize, kernelAnchor);
    cv::morphologyEx(src, *dst, cv::MORPH_GRADIENT, kernel);
}
int main()
{
VideoCapture cap(-1); // open the default camera
if (!cap.isOpened()) // check if we succeeded
return -1;
clock_t parallel = 0, sequential = 0;
clock_t start, stop;
int cnt=0;
for (;;)
{
Mat src,dst1,dst2;
cap >> src; // get a new frame from camera
imshow("src", src);
//Try it with sequential way
start = clock();
Algo1(src, &dst1);
Algo2(src, &dst2);
stop = clock();
sequential += (stop - start);
imshow("Sequential Algo1", dst1);
imshow("Sequential Algo2", dst2);
// try simple parallel processing way
start = clock();
std::thread th1(&Algo1, src, &dst1);
std::thread th2(&Algo2, src, &dst2);
th1.join();
th2.join();
stop = clock();
parallel += (stop - start);
imshow("Paralllel Algo1", dst1);
imshow("Paralllel Algo2", dst2);
cnt++;
if (waitKey(30) >= 0)
break;
}
double parTime = 1000.0*parallel / cnt / (double)CLOCKS_PER_SEC;
double seqTime = 1000.0*sequential / cnt / (double)CLOCKS_PER_SEC;
std::cout << std::stopl << "Average processing time:" << std::stopl
<< "Parallel: " << parTime<< "ms"
<< "\t Sequential: " << seqTime << "ms"
<< "";
std::cout << std::stopl << "Press a Enter to terminate ";
std::cin.get();
}
2 | No.2 Revision |
You could use threads just for run your algoritms but don't expect faster performance because:
Below is simple example, here I'm comparing sequential vs parallel implementation.
I'm showing how to apply 2 different algorithms over same frame, using 2 sequential calls and simple threading. The example below suffering of poor threading implementation because thread construction will introduce hig big overhead.
On my computer, results shows show than sequential way is about 2time faster than simple threading !threading, it depends on background computer load, sequential might be up to 3 time faster!
#include <thread>
// Edge detection: BGR -> grayscale -> Gaussian smoothing -> Canny edges.
void Algo1(const cv::Mat &src, cv::Mat *dst)
{
    const cv::Size blurKernel(7, 7);
    const double blurSigma = 1.5;
    cv::cvtColor(src, *dst, CV_BGR2GRAY);
    cv::GaussianBlur(*dst, *dst, blurKernel, blurSigma, blurSigma);
    cv::Canny(*dst, *dst, 0, 30, 3);
}
// Morphological gradient with a 3x3 rectangular structuring element.
void Algo2(const cv::Mat &src, cv::Mat *dst)
{
    const int radius = 1;
    const cv::Size kernelSize(2 * radius + 1, 2 * radius + 1);
    const cv::Point kernelAnchor(radius, radius);
    cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, kernelSize, kernelAnchor);
    cv::morphologyEx(src, *dst, cv::MORPH_GRADIENT, kernel);
}
int main()
{
VideoCapture cap(-1); // open the default camera
if (!cap.isOpened()) // check if we succeeded
return -1;
clock_t parallel = 0, sequential = 0;
clock_t start, stop;
int cnt=0;
for (;;)
{
Mat src,dst1,dst2;
cap >> src; // get a new frame from camera
imshow("src", src);
//Try it with sequential way
start = clock();
Algo1(src, &dst1);
Algo2(src, &dst2);
stop = clock();
sequential += (stop - start);
imshow("Sequential Algo1", dst1);
imshow("Sequential Algo2", dst2);
// try simple parallel processing way
start = clock();
std::thread th1(&Algo1, src, &dst1);
std::thread th2(&Algo2, src, &dst2);
th1.join();
th2.join();
stop = clock();
parallel += (stop - start);
imshow("Paralllel Algo1", dst1);
imshow("Paralllel Algo2", dst2);
cnt++;
if (waitKey(30) >= 0)
break;
}
double parTime = 1000.0*parallel / cnt / (double)CLOCKS_PER_SEC;
double seqTime = 1000.0*sequential / cnt / (double)CLOCKS_PER_SEC;
std::cout << std::stopl << "Average processing time:" << std::stopl
<< "Parallel: " << parTime<< "ms"
<< "\t Sequential: " << seqTime << "ms"
<< "";
std::cout << std::stopl << "Press a Enter to terminate ";
std::cin.get();
}
3 | No.3 Revision |
You could use threads just for run your algoritms but don't expect faster performance because:
Below is simple example, here I'm comparing sequential vs parallel implementation.implementation using a stream from a webcam as input.
I'm showing how to apply 2 different algorithms over same frame, using 2 sequential calls and simple threading. The example below suffering of poor threading implementation because thread construction will introduce big overhead.
On my computer, results show than sequential way is faster than simple threading, it depends on background computer load, sequential might be up to 3 time faster!
#include <thread>
// Edge detection: BGR -> grayscale -> Gaussian smoothing -> Canny edges.
void Algo1(const cv::Mat &src, cv::Mat *dst)
{
    const cv::Size blurKernel(7, 7);
    const double blurSigma = 1.5;
    cv::cvtColor(src, *dst, CV_BGR2GRAY);
    cv::GaussianBlur(*dst, *dst, blurKernel, blurSigma, blurSigma);
    cv::Canny(*dst, *dst, 0, 30, 3);
}
// Morphological gradient with a 3x3 rectangular structuring element.
void Algo2(const cv::Mat &src, cv::Mat *dst)
{
    const int radius = 1;
    const cv::Size kernelSize(2 * radius + 1, 2 * radius + 1);
    const cv::Point kernelAnchor(radius, radius);
    cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, kernelSize, kernelAnchor);
    cv::morphologyEx(src, *dst, cv::MORPH_GRADIENT, kernel);
}
int main()
{
VideoCapture cap(-1); // open the default camera
if (!cap.isOpened()) // check if we succeeded
return -1;
clock_t parallel = 0, sequential = 0;
clock_t start, stop;
int cnt=0;
for (;;)
{
Mat src,dst1,dst2;
cap >> src; // get a new frame from camera
imshow("src", src);
//Try it with sequential way
start = clock();
Algo1(src, &dst1);
Algo2(src, &dst2);
stop = clock();
sequential += (stop - start);
imshow("Sequential Algo1", dst1);
imshow("Sequential Algo2", dst2);
// try simple parallel processing way
start = clock();
std::thread th1(&Algo1, src, &dst1);
std::thread th2(&Algo2, src, &dst2);
th1.join();
th2.join();
stop = clock();
parallel += (stop - start);
imshow("Paralllel Algo1", dst1);
imshow("Paralllel Algo2", dst2);
cnt++;
if (waitKey(30) >= 0)
break;
}
double parTime = 1000.0*parallel / cnt / (double)CLOCKS_PER_SEC;
double seqTime = 1000.0*sequential / cnt / (double)CLOCKS_PER_SEC;
std::cout << std::stopl << "Average processing time:" << std::stopl
<< "Parallel: " << parTime<< "ms"
<< "\t Sequential: " << seqTime << "ms"
<< "";
std::cout << std::stopl << "Press a Enter to terminate ";
std::cin.get();
}
4 | No.4 Revision |
You could just use threads just for to run your algoritms algorithms but don't expect faster performance because:
Below is simple example, here I'm comparing sequential vs parallel implementation using a stream from a webcam as input.
I'm showing how to apply 2 different algorithms over same frame, using 2 sequential calls and simple threading. The example below suffering of poor threading implementation because thread construction will introduce big overhead.
On my computer, results show than that the sequential way is faster than simple threading, it depends on background computer load, sequential might be up to 3 time faster!faster !
#include <thread>
// Edge detection: BGR -> grayscale -> Gaussian smoothing -> Canny edges.
void Algo1(const cv::Mat &src, cv::Mat *dst)
{
    const cv::Size blurKernel(7, 7);
    const double blurSigma = 1.5;
    cv::cvtColor(src, *dst, CV_BGR2GRAY);
    cv::GaussianBlur(*dst, *dst, blurKernel, blurSigma, blurSigma);
    cv::Canny(*dst, *dst, 0, 30, 3);
}
// Morphological gradient with a 3x3 rectangular structuring element.
void Algo2(const cv::Mat &src, cv::Mat *dst)
{
    const int radius = 1;
    const cv::Size kernelSize(2 * radius + 1, 2 * radius + 1);
    const cv::Point kernelAnchor(radius, radius);
    cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, kernelSize, kernelAnchor);
    cv::morphologyEx(src, *dst, cv::MORPH_GRADIENT, kernel);
}
int main()
{
VideoCapture cap(-1); // open the default camera
if (!cap.isOpened()) // check if we succeeded
return -1;
clock_t parallel = 0, sequential = 0;
clock_t start, stop;
int cnt=0;
for (;;)
{
Mat src,dst1,dst2;
cap >> src; // get a new frame from camera
imshow("src", src);
//Try it with sequential way
start = clock();
Algo1(src, &dst1);
Algo2(src, &dst2);
stop = clock();
sequential += (stop - start);
imshow("Sequential Algo1", dst1);
imshow("Sequential Algo2", dst2);
// try simple parallel processing way
start = clock();
std::thread th1(&Algo1, src, &dst1);
std::thread th2(&Algo2, src, &dst2);
th1.join();
th2.join();
stop = clock();
parallel += (stop - start);
imshow("Paralllel Algo1", dst1);
imshow("Paralllel Algo2", dst2);
cnt++;
if (waitKey(30) >= 0)
break;
}
double parTime = 1000.0*parallel / cnt / (double)CLOCKS_PER_SEC;
double seqTime = 1000.0*sequential / cnt / (double)CLOCKS_PER_SEC;
std::cout << std::stopl << "Average processing time:" << std::stopl
<< "Parallel: " << parTime<< "ms"
<< "\t Sequential: " << seqTime << "ms"
<< "";
std::cout << std::stopl << "Press a Enter to terminate ";
std::cin.get();
}
5 | No.5 Revision |
You could just use threads to run your algorithms but don't expect faster performance because:
Below is simple example, here I'm comparing sequential vs parallel implementation using a stream from a webcam as input.
I'm showing how to apply 2 different algorithms over same frame, using 2 sequential calls and simple threading. The example below suffering of poor threading implementation because thread construction will introduce big overhead.
On my computer, results show that the sequential way is faster than simple threading, it depends on background computer load, sequential might be up to 3 2 time faster !faster.
EDIT: Added measure of treading overhead.. Look at my timing:
Parallel: 16.3ms Sequential: 12.8ms Overhead:3.5ms
Parallel: 8.1ms Sequential: 4.3ms Overhead:4.9ms
3.6ms Sequential: 2.7ms Overhead:0.6ms
the code:
#include <thread>
#include <opencv2/opencv.hpp>
using namespace cv;
// Edge detection: BGR -> grayscale -> Gaussian smoothing -> Canny edges.
void Algo1(const cv::Mat &src, cv::Mat *dst)
{
    const cv::Size blurKernel(7, 7);
    const double blurSigma = 1.5;
    cv::cvtColor(src, *dst, CV_BGR2GRAY);
    cv::GaussianBlur(*dst, *dst, blurKernel, blurSigma, blurSigma);
    cv::Canny(*dst, *dst, 0, 30, 3);
}
// Morphological gradient with a 3x3 rectangular structuring element.
void Algo2(const cv::Mat &src, cv::Mat *dst)
{
    const int radius = 1;
    const cv::Size kernelSize(2 * radius + 1, 2 * radius + 1);
    const cv::Point kernelAnchor(radius, radius);
    cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, kernelSize, kernelAnchor);
    cv::morphologyEx(src, *dst, cv::MORPH_GRADIENT, kernel);
}
// Deliberately empty function: calling it (directly or on a fresh thread)
// isolates the pure call/thread-construction overhead for the measurement.
void Test()
{
}
int main()
{
VideoCapture cap(-1); // open the default camera
if (!cap.isOpened()) // check if we succeeded
return -1;
clock_t parallel = 0, sequential = 0, testParallel = 0,testSequential = 0;
clock_t start, stop;
int cnt=0;
for (;;)
{
Mat src,dst1,dst2;
cap >> src; // get a new frame from camera
imshow("src", src);
//Try it with sequential way
start = clock();
Algo1(src, &dst1);
Algo2(src, &dst2);
stop = clock();
sequential += (stop - start);
imshow("Sequential Algo1", dst1);
imshow("Sequential Algo2", dst2);
// try simple parallel processing way
start = clock();
std::thread th1(&Algo1, src, &dst1);
std::thread th2(&Algo2, src, &dst2);
th1.join();
th2.join();
stop = clock();
parallel += (stop - start);
imshow("Paralllel Algo1", dst1);
imshow("Paralllel Algo2", dst2);
// measure threading overhead (2 calls)
int n = 2;
start = clock();
Test();
Test();
stop = clock();
testSequential += (stop - start);
start = clock();
std::thread thTest1(&Test);
std::thread thTest2(&Test);
thTest1.join();
thTest2.join();
stop = clock();
testParallel += (stop - start);
cnt++;
if (waitKey(30) >= 0)
break;
}
double parTime = 1000.0*parallel / cnt / (double)CLOCKS_PER_SEC;
double seqTime = 1000.0*sequential / cnt / (double)CLOCKS_PER_SEC;
double overHead = 1000.0*(testParallel - testSequential) / cnt / (double)CLOCKS_PER_SEC;
std::cout << std::stopl std::endl << "Average processing time:" << std::stopl
time (2 calls):" << std::endl
<< "Parallel: " << parTime<< "ms"
<< "\t Sequential: " << seqTime << "ms"
<< "\t Overhead: " << overHead << "ms"
<< "";
std::cout << std::stopl std::endl << "Press a Enter to terminate ";
std::cin.get();
}
6 | No.6 Revision |
You could just use threads to run your algorithms but don't expect faster performance because:
Below is simple example, here I'm comparing sequential vs parallel implementation using a stream from a webcam as input.
I'm showing how to apply 2 different algorithms over same frame, using 2 sequential calls and simple threading. The example below suffering of poor threading implementation because thread construction will introduce big overhead.
On my computer, results show that the sequential way is faster than simple threading, it depends on background computer load, sequential might be up to 2 time faster.
EDIT: Added measure of treading overhead.. Look at my timing:
Parallel: 16.3ms Sequential: 12.8ms Parallel:16.3ms Sequential:12.8ms Overhead:3.5ms
Parallel: 8.1ms Sequential: 4.3ms Parallel:8.1ms Sequential:4.3ms Overhead:4.9ms
3.6ms Sequential: 2.7ms Parallel:3.6ms Sequential:2.7ms Overhead:0.6ms
the code:
#include <thread>
#include <opencv2/opencv.hpp>
using namespace cv;
// Edge detection: BGR -> grayscale -> Gaussian smoothing -> Canny edges.
void Algo1(const cv::Mat &src, cv::Mat *dst)
{
    const cv::Size blurKernel(7, 7);
    const double blurSigma = 1.5;
    cv::cvtColor(src, *dst, CV_BGR2GRAY);
    cv::GaussianBlur(*dst, *dst, blurKernel, blurSigma, blurSigma);
    cv::Canny(*dst, *dst, 0, 30, 3);
}
// Morphological gradient with a 3x3 rectangular structuring element.
void Algo2(const cv::Mat &src, cv::Mat *dst)
{
    const int radius = 1;
    const cv::Size kernelSize(2 * radius + 1, 2 * radius + 1);
    const cv::Point kernelAnchor(radius, radius);
    cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, kernelSize, kernelAnchor);
    cv::morphologyEx(src, *dst, cv::MORPH_GRADIENT, kernel);
}
// Deliberately empty function: calling it (directly or on a fresh thread)
// isolates the pure call/thread-construction overhead for the measurement.
void Test()
{
}
int main()
{
VideoCapture cap(-1); // open the default camera
if (!cap.isOpened()) // check if we succeeded
return -1;
clock_t parallel = 0, sequential = 0, testParallel = 0,testSequential = 0;
clock_t start, stop;
int cnt=0;
for (;;)
{
Mat src,dst1,dst2;
cap >> src; // get a new frame from camera
imshow("src", src);
//Try it with sequential way
start = clock();
Algo1(src, &dst1);
Algo2(src, &dst2);
stop = clock();
sequential += (stop - start);
imshow("Sequential Algo1", dst1);
imshow("Sequential Algo2", dst2);
// try simple parallel processing way
start = clock();
std::thread th1(&Algo1, src, &dst1);
std::thread th2(&Algo2, src, &dst2);
th1.join();
th2.join();
stop = clock();
parallel += (stop - start);
imshow("Paralllel Algo1", dst1);
imshow("Paralllel Algo2", dst2);
// measure threading overhead (2 calls)
int n = 2;
start = clock();
Test();
Test();
stop = clock();
testSequential += (stop - start);
start = clock();
std::thread thTest1(&Test);
std::thread thTest2(&Test);
thTest1.join();
thTest2.join();
stop = clock();
testParallel += (stop - start);
cnt++;
if (waitKey(30) >= 0)
break;
}
double parTime = 1000.0*parallel / cnt / (double)CLOCKS_PER_SEC;
double seqTime = 1000.0*sequential / cnt / (double)CLOCKS_PER_SEC;
double overHead = 1000.0*(testParallel - testSequential) / cnt / (double)CLOCKS_PER_SEC;
std::cout << std::endl << "Average processing time (2 calls):" << std::endl
<< "Parallel: " << parTime<< "ms"
<< "\t Sequential: " << seqTime << "ms"
<< "\t Overhead: " << overHead << "ms"
<< "";
std::cout << std::endl << "Press a Enter to terminate ";
std::cin.get();
}
7 | No.7 Revision |
You could just use threads to run your algorithms but don't expect faster performance because:
Below is simple example, here I'm comparing sequential vs parallel implementation using a stream from a webcam as input.
I'm showing how to apply 2 different algorithms over same frame, using 2 sequential calls and simple threading. The example below suffering of poor threading implementation because thread construction will introduce big overhead.
On my computer, results show that the sequential way is faster than simple threading, it depends on background computer load, sequential might be up to 2 time faster.
EDIT: Added measure of treading overhead.. Look at my timing:timing (win7/64, intel i3):
Parallel:16.3ms Sequential:12.8ms Overhead:3.5ms
Parallel:8.1ms Sequential:4.3ms Overhead:4.9ms
Parallel:3.6ms Sequential:2.7ms Overhead:0.6ms
Parallel:11.65ms Sequential:11.48ms Overhead:0.67ms
Parallel:8.67ms Sequential:8.37ms Overhead:0.69ms
the code:
#include <thread>
#include <opencv2/opencv.hpp>
using namespace cv;
// Edge detection: BGR -> grayscale -> Gaussian smoothing -> Canny edges.
void Algo1(const cv::Mat &src, cv::Mat *dst)
{
    const cv::Size blurKernel(7, 7);
    const double blurSigma = 1.5;
    cv::cvtColor(src, *dst, CV_BGR2GRAY);
    cv::GaussianBlur(*dst, *dst, blurKernel, blurSigma, blurSigma);
    cv::Canny(*dst, *dst, 0, 30, 3);
}
// Morphological gradient with a 3x3 rectangular structuring element.
void Algo2(const cv::Mat &src, cv::Mat *dst)
{
    const int radius = 1;
    const cv::Size kernelSize(2 * radius + 1, 2 * radius + 1);
    const cv::Point kernelAnchor(radius, radius);
    cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, kernelSize, kernelAnchor);
    cv::morphologyEx(src, *dst, cv::MORPH_GRADIENT, kernel);
}
// Deliberately empty function: calling it (directly or on a fresh thread)
// isolates the pure call/thread-construction overhead for the measurement.
void Test()
{
}
int main()
{
VideoCapture cap(-1); // open the default camera
if (!cap.isOpened()) // check if we succeeded
return -1;
clock_t parallel = 0, sequential = 0, testParallel = 0,testSequential = 0;
clock_t start, stop;
int cnt=0;
for (;;)
{
Mat src,dst1,dst2;
cap >> src; // get a new frame from camera
imshow("src", src);
//Try it with sequential way
start = clock();
Algo1(src, &dst1);
Algo2(src, &dst2);
stop = clock();
sequential += (stop - start);
imshow("Sequential Algo1", dst1);
imshow("Sequential Algo2", dst2);
// try simple parallel processing way
start = clock();
std::thread th1(&Algo1, src, &dst1);
std::thread th2(&Algo2, src, &dst2);
th1.join();
th2.join();
stop = clock();
parallel += (stop - start);
imshow("Paralllel Algo1", dst1);
imshow("Paralllel Algo2", dst2);
// measure threading overhead (2 calls)
int n = 2;
start = clock();
Test();
Test();
stop = clock();
testSequential += (stop - start);
start = clock();
std::thread thTest1(&Test);
std::thread thTest2(&Test);
thTest1.join();
thTest2.join();
stop = clock();
testParallel += (stop - start);
cnt++;
if (waitKey(30) >= 0)
break;
}
double parTime = 1000.0*parallel / cnt / (double)CLOCKS_PER_SEC;
double seqTime = 1000.0*sequential / cnt / (double)CLOCKS_PER_SEC;
double overHead = 1000.0*(testParallel - testSequential) / cnt / (double)CLOCKS_PER_SEC;
std::cout << std::endl << "Average processing time (2 calls):" << std::endl
<< "Parallel: " << parTime<< "ms"
<< "\t Sequential: " << seqTime << "ms"
<< "\t Overhead: " << overHead << "ms"
<< "";
std::cout << std::endl << "Press a Enter to terminate ";
std::cin.get();
}
8 | No.8 Revision |
You could just use threads to run your algorithms but don't expect faster performance because:
Below is a simple example where I'm comparing a sequential vs a parallel implementation using a stream from a webcam as input.
I'm showing how to apply 2 different algorithms over the same frame, using 2 sequential calls and simple threading. The example below suffers from a poor threading implementation because thread construction introduces big overhead.
On my computer, results show that the sequential way is faster than simple threading; depending on background computer load, sequential might be up to 2 times faster.
EDIT: Added a measure of threading overhead. Look at my timing (win7/64, Intel i3 2x2.53GHz):
Parallel:16.3ms Sequential:12.8ms Overhead:3.5ms
Parallel:8.1ms Sequential:4.3ms Overhead:4.9ms
Parallel:3.6ms Sequential:2.7ms Overhead:0.6ms
Parallel:11.65ms Sequential:11.48ms Overhead:0.67ms
Parallel:8.67ms Sequential:8.37ms Overhead:0.69ms
EDIT2: Considering tuannhtn's answer, it looks interesting to investigate the different results a bit.
For sure advanced parallel programming in IPP improves overall performance, but on an Intel i3 I really can't see any improvement between the sequential and parallel approaches. I suppose the difference is due to different processor architectures.
The Core Duo 2x2.4 and the Intel i3 2x2.53 both have 2 cores, but the Core Duo doesn't have Hyper-Threading and Smart Cache.
When Hyper-Threading is available, some operations share the execution resources automatically in parallel (I/O, cache, bus interface...) across more logical processors. Hyper-Threading and Smart Cache make more efficient use of the available execution resources, boosting the sequential approach.
On the Core Duo, load balancing is left to the developer, so the parallel approach gets a better result.
This can explain why the parallel approach is better on the Core Duo but is close to the sequential approach on the Intel i3. Looking at performance with 640x480 video:
Parallel:8.66ms Sequential:13.47ms Overhead:0.6ms
Parallel:8.67ms Sequential:8.37ms Overhead:0.69ms
the code:
#include <thread>
#include <opencv2/opencv.hpp>
using namespace cv;
// Edge detection: BGR -> grayscale -> Gaussian smoothing -> Canny edges.
void Algo1(const cv::Mat &src, cv::Mat *dst)
{
    const cv::Size blurKernel(7, 7);
    const double blurSigma = 1.5;
    cv::cvtColor(src, *dst, CV_BGR2GRAY);
    cv::GaussianBlur(*dst, *dst, blurKernel, blurSigma, blurSigma);
    cv::Canny(*dst, *dst, 0, 30, 3);
}
// Morphological gradient with a 3x3 rectangular structuring element.
void Algo2(const cv::Mat &src, cv::Mat *dst)
{
    const int radius = 1;
    const cv::Size kernelSize(2 * radius + 1, 2 * radius + 1);
    const cv::Point kernelAnchor(radius, radius);
    cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, kernelSize, kernelAnchor);
    cv::morphologyEx(src, *dst, cv::MORPH_GRADIENT, kernel);
}
// Deliberately empty function: calling it (directly or on a fresh thread)
// isolates the pure call/thread-construction overhead for the measurement.
void Test()
{
}
int main()
{
VideoCapture cap(-1); // open the default camera
if (!cap.isOpened()) // check if we succeeded
return -1;
clock_t parallel = 0, sequential = 0, testParallel = 0,testSequential = 0;
clock_t start, stop;
int cnt=0;
for (;;)
{
Mat src,dst1,dst2;
cap >> src; // get a new frame from camera
imshow("src", src);
//Try it with sequential way
start = clock();
Algo1(src, &dst1);
Algo2(src, &dst2);
stop = clock();
sequential += (stop - start);
imshow("Sequential Algo1", dst1);
imshow("Sequential Algo2", dst2);
// try simple parallel processing way
start = clock();
std::thread th1(&Algo1, src, &dst1);
std::thread th2(&Algo2, src, &dst2);
th1.join();
th2.join();
stop = clock();
parallel += (stop - start);
imshow("Paralllel Algo1", dst1);
imshow("Paralllel Algo2", dst2);
// measure threading overhead (2 calls)
int n = 2;
start = clock();
Test();
Test();
stop = clock();
testSequential += (stop - start);
start = clock();
std::thread thTest1(&Test);
std::thread thTest2(&Test);
thTest1.join();
thTest2.join();
stop = clock();
testParallel += (stop - start);
cnt++;
if (waitKey(30) >= 0)
break;
}
double parTime = 1000.0*parallel / cnt / (double)CLOCKS_PER_SEC;
double seqTime = 1000.0*sequential / cnt / (double)CLOCKS_PER_SEC;
double overHead = 1000.0*(testParallel - testSequential) / cnt / (double)CLOCKS_PER_SEC;
std::cout << std::endl << "Average processing time (2 calls):" << std::endl
<< "Parallel: " << parTime<< "ms"
<< "\t Sequential: " << seqTime << "ms"
<< "\t Overhead: " << overHead << "ms"
<< "";
std::cout << std::endl << "Press a Enter to terminate ";
std::cin.get();
}