Good Day!
I'm currently trying to optimize my C++ program to run on the GPU. The relevant parts of my PC:
- Geforce GTX 780
- I5-6600K
- Corsair Vengeance 16 GB memory, 2.6 GHz
My code is pretty big because it is connected to an AI, and I also use landmark detection, so I will post only the relevant part of the code. Basically, my problem is that every setting I try gives slower results on the GPU than on the CPU.
My code:
// Passed as the scaleFactor argument of face_cascade.detectMultiScale in facedetector().
double cascade_ScaleFactor = 1.2;
// Passed as the minNeighbors argument of face_cascade.detectMultiScale in facedetector().
// The declaration was missing its type; the value is an integer count.
int cascade_MinNumberNeighbor = 3;
/// Returns the number of seconds elapsed since `startTicks`, where `startTicks`
/// is a value previously obtained from cv::getTickCount().
static double secondsSince(double startTicks)
{
    return (static_cast<double>(cv::getTickCount()) - startTicks) / cv::getTickFrequency();
}

/**
 * Runs face detection on one frame and logs how long every stage took.
 *
 * The global GPUx selects the code path: 1 runs the cv::cuda pipeline,
 * 2 runs the plain CPU pipeline, anything else reports an error through
 * errormsg(). Each stage's duration (in seconds) is handed to
 * read_write_data_tofile() under a per-stage file name, followed by the
 * total for the whole call.
 *
 * @param frame  Input BGR frame; also passed on to get_landmarks() when at
 *               least one face was found.
 * @param b      Device-side buffers reused between calls (GPU path only):
 *               b.gpu_frame holds the uploaded image, b.gpu_faces receives
 *               the raw detection result.
 */
void facedetector(cv::Mat& frame, BufferFaceGPU& b)
{
    double processT = 0.0;
    double processT_total = 0.0;

    /****************************/
    /***********GPU**************/
    /****************************/
    if (GPUx == 1) {
        /***********VERSION 1.0 OLD*************/
        cascade_gpu->setMinObjectSize(cascadeMinSize);
        cascade_gpu->setMaxObjectSize(cascadeMaxSize);
        // Apply the same scale factor and neighbor threshold the CPU path passes
        // to detectMultiScale; otherwise the two paths may run with different
        // detector settings and their timings/results are not comparable.
        cascade_gpu->setScaleFactor(cascade_ScaleFactor);
        cascade_gpu->setMinNeighbors(cascade_MinNumberNeighbor);

        processT_total = static_cast<double>(cv::getTickCount());
        std::vector<Rect> faces;
        cv::Mat cpu_frame_gray;

        // Host -> device copy of the colour frame.
        processT = static_cast<double>(cv::getTickCount());
        b.gpu_frame.upload(frame);
        processT = secondsSince(processT);
        read_write_data_tofile("GPU_data_upload.txt", processT);

        // NOTE(review): this conversion is in place, so gpu_frame goes from
        // 3 channels to 1 here; presumably that forces a device reallocation
        // on every frame (and again on the next upload) -- confirm with a
        // profiler; a dedicated grayscale GpuMat would avoid it.
        processT = static_cast<double>(cv::getTickCount());
        cv::cuda::cvtColor(b.gpu_frame, b.gpu_frame, CV_BGR2GRAY);
        processT = secondsSince(processT);
        read_write_data_tofile("GPU_data_cvtColor.txt", processT);

        processT = static_cast<double>(cv::getTickCount());
        cv::cuda::equalizeHist(b.gpu_frame, b.gpu_frame);
        processT = secondsSince(processT);
        read_write_data_tofile("GPU_data_equalizeHist.txt", processT);

        // Detection result stays in b.gpu_faces until convert() below.
        processT = static_cast<double>(cv::getTickCount());
        cascade_gpu->detectMultiScale(b.gpu_frame, b.gpu_faces);
        processT = secondsSince(processT);
        read_write_data_tofile("GPU_data_detectMultiScale.txt", processT);

        // Turn the device-side result into a host vector of rectangles.
        processT = static_cast<double>(cv::getTickCount());
        cascade_gpu->convert(b.gpu_faces, faces);
        processT = secondsSince(processT);
        read_write_data_tofile("GPU_data_convert.txt", processT);

        // The equalized grayscale image is needed on the host for get_landmarks().
        processT = static_cast<double>(cv::getTickCount());
        b.gpu_frame.download(cpu_frame_gray);
        processT = secondsSince(processT);
        read_write_data_tofile("GPU_data_download.txt", processT);

        if (!faces.empty())
        {
            processT = static_cast<double>(cv::getTickCount());
            get_landmarks(faces, cpu_frame_gray, frame);
            processT = secondsSince(processT);
            read_write_data_tofile("GPU_data_getLandmarks.txt", processT);
        }

        processT_total = secondsSince(processT_total);
        read_write_data_tofile("GPU_data_total.txt", processT_total);
    }
    /****************************/
    /***********CPU**************/
    /****************************/
    else if (GPUx == 2) {
        cv::Mat frame_gray;
        std::vector<Rect> faces;
        processT_total = static_cast<double>(cv::getTickCount());

        processT = static_cast<double>(cv::getTickCount());
        cv::cvtColor(frame, frame_gray, CV_BGR2GRAY);
        processT = secondsSince(processT);
        read_write_data_tofile("CPU_data_cvtColor.txt", processT);

        processT = static_cast<double>(cv::getTickCount());
        cv::equalizeHist(frame_gray, frame_gray);
        processT = secondsSince(processT);
        read_write_data_tofile("CPU_data_equalizeHist.txt", processT);

        processT = static_cast<double>(cv::getTickCount());
        face_cascade.detectMultiScale(frame_gray, faces, cascade_ScaleFactor, cascade_MinNumberNeighbor, 0 | CV_HAAR_SCALE_IMAGE, cascadeMinSize, cascadeMaxSize);
        processT = secondsSince(processT);
        // File name kept exactly as before (including its spelling) so existing logs still line up.
        read_write_data_tofile("CPU_data_detecetMultiScale.txt", processT);

        if (!faces.empty())
        {
            processT = static_cast<double>(cv::getTickCount());
            get_landmarks(faces, frame_gray, frame);
            processT = secondsSince(processT);
            read_write_data_tofile("CPU_data_getLandmarks.txt", processT);
        }

        processT_total = secondsSince(processT_total);
        read_write_data_tofile("CPU_data_total.txt", processT_total);
    }
    else {
        // GPUx holds neither of the two supported modes.
        errormsg("Something went wrong!\nEXIT");
    }
}
Sorry for the long code. I have tried a number of optimizations (e.g. the min and max sizes are driven by a PID controller, so the detector only ever has to search for a reasonable range of face sizes).
I'm monitoring the FPS and the processing times as well, and on the GPU I get only about 1/4 of the FPS I get on the CPU. My monitoring results look like this:
In the last picture you can clearly see that the biggest difference is in the detectMultiScale function. I also use a structure for the GPU buffers:
// Device-side buffers handed to facedetector() so they can be reused between calls.
struct BufferFaceGPU {
    // Receives the uploaded frame; converted and equalized in place by facedetector().
    cv::cuda::GpuMat gpu_frame;
    // Receives the output of the GPU cascade's detectMultiScale().
    cv::cuda::GpuMat gpu_faces;
};
My current guess is that I may have built my solution incorrectly in Visual Studio 2013. I don't actually know whether I need to, for example, turn CUDA on under Build Customizations,
... or whether I forgot something in the project settings.
I'm not sure what I'm doing wrong. Can you help, or do you have any suggestions?
Thanks in advance.