Ask Your Question

Revision history [back]

click to hide/show revision 1
initial version

speed improvement cascade gpu

Hi, I just want to report an improvement I obtained by chance while working with cascadeclassifier_gpu.cpp (v 2.4.10). I tried this file with a webcam video stream and one face picture in front of the camera. I was disappointed by the performance of my new gpu_2 compared with gpu_1: gpu_1 computes at 5.5 fps, gpu_2 at 7.5 fps. In order to simplify the main code, I wrote a function dealing with the GPU code only. The result was astonishing: gpu_2 computes about 10x faster on average, between 40 and 100 fps (average 70); likewise gpu_1 runs between 30 and 40 fps (average 35). To be precise, the high-speed computing appears only when there is a detection; without a detection the speed drops back to 7.5 fps for gpu_2 and 5.5 fps for gpu_1. What is going on? Does anybody have an idea? Regards

Linux 3.19.8-100.fc20.x86_64 #1 SMP; nvidia driver 340.76. GPU_1: geforce 9500 GT, 500M, 32 cores. GPU_2: geforce GT 720, 1024M, 192 cores.

* part of the original code without the use of a function:

    // Pick the input (image if it is non-empty, otherwise frame), keep a CPU
    // copy of it in frame_cpu and upload the same input to frame_gpu.
    (image.empty() ? frame : image).copyTo(frame_cpu);
    frame_gpu.upload(image.empty() ? frame : image);

    // Both convertAndResize calls run before tm.start(), so neither the upload
    // nor this pre-processing is part of the measured time.
    convertAndResize(frame_gpu, gray_gpu, resized_gpu, scaleFactor);
    convertAndResize(frame_cpu, gray_cpu, resized_cpu, scaleFactor);

    // Timed region: from here to tm.stop() below.
    TickMeter tm;
    tm.start();
    // findLargestObject and filterRects are variables of the enclosing scope (not shown here).
cascade_gpu.findLargestObject = findLargestObject;

    // The last argument is 4 when filterRects or findLargestObject is set, otherwise 0.
 detections_num = cascade_gpu.detectMultiScale(resized_gpu, facesBuf_gpu, 1.2,
                                                      (filterRects || findLargestObject) ? 4 : 0);
    // Only the first detections_num columns of facesBuf_gpu are copied to the host.
    facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded);

    // Writes the GPU image into resized_cpu, the same Mat that was passed to the
    // CPU convertAndResize call above.
    resized_gpu.download(resized_cpu);
    // Draw one rectangle per detection on the downloaded image.
    for (int i = 0; i < detections_num; ++i)
    {
            rectangle(resized_cpu, faces_downloaded.ptr<cv::Rect>()[i], Scalar(255));
    }

    tm.stop();
    // Elapsed time is in milliseconds, so 1000 / detectionTime is frames per second.
    double detectionTime = tm.getTimeMilli();
    double fps = 1000 / detectionTime;

* the function created

/**
 * Uploads one frame to the GPU, runs the cascade detector on it and hands the
 * detections back to the caller.
 *
 * @param frame        Input image; uploaded to the GPU unchanged.
 * @param cascade_gpu  Detector to run. NOTE: this call sets its
 *                     findLargestObject flag to true and leaves it set.
 * @param scaleFactor  Forwarded to convertAndResize.
 * @param faceRects    Out: on return points at the first of the returned number
 *                     of rectangles, or is null when nothing was detected.
 *                     The rectangles are stored in a buffer owned by this
 *                     function: they stay valid only until the next call to
 *                     getTargets, so copy them if they are needed longer.
 *                     Because of that shared buffer the function must not be
 *                     called from several threads at once.
 * @param resized_cpu  Out: receives the download of the GPU image produced by
 *                     convertAndResize.
 * @return The detection count reported by detectMultiScale.
 */
static int getTargets (Mat &frame, CascadeClassifier_GPU &cascade_gpu, double scaleFactor, Rect* &faceRects, Mat &resized_cpu)
{
    // Host-side storage for the rectangles returned through faceRects.
    // It is static on purpose: a plain local Mat would be destroyed when this
    // function returns, leaving faceRects pointing at freed memory.
    static Mat faces_downloaded;

    GpuMat facesBuf_gpu, frame_gpu, gray_gpu, resized_gpu;

    frame_gpu.upload(frame);
    convertAndResize(frame_gpu, gray_gpu, resized_gpu, scaleFactor);

    cascade_gpu.findLargestObject = true;

    const int detections_num = cascade_gpu.detectMultiScale(resized_gpu, facesBuf_gpu, 1.2, 4);

    resized_gpu.download(resized_cpu);

    if (detections_num > 0)
    {
        // Copy only the columns that actually hold detections.
        facesBuf_gpu.colRange(0, detections_num).download(faces_downloaded);
        faceRects = faces_downloaded.ptr<Rect>();
    }
    else
    {
        // Nothing to download; make the "no result" state explicit instead of
        // leaving the caller's pointer at whatever value it had before.
        faceRects = 0;
    }

    return detections_num;
}

* main code modified

    // CPU path: copy the chosen input and pre-process it before the timer starts.
    (image.empty() ? frame : image).copyTo(frame_cpu);
    convertAndResize(frame_cpu, gray_cpu, resized_cpu, scaleFactor);
    // Timed region: from here to tm.stop() below. It contains the whole
    // getTargets call and the drawing loop.
    TickMeter tm;
    tm.start();

    // faceRects has no initializer; it is read only inside the loop below,
    // i.e. only when detections_num > 0.
 Rect *faceRects ;
    // NOTE(review): this passes frame, while the copy above uses
    // image.empty() ? frame : image - confirm that the difference is intended.
    // resized_cpu is passed by reference to getTargets as well as having been
    // passed to the CPU convertAndResize call above.
 detections_num = getTargets(frame, cascade_gpu, scaleFactor, faceRects, resized_cpu);  

    // Draw one rectangle per detection on resized_cpu.
    for (int i = 0; i < detections_num; ++i)
    {
    rectangle(resized_cpu, faceRects[i], Scalar(255));
    }

    tm.stop();