Hi, I used the code below to compare the execution time of the CPU and GPU versions of scaleAdd. My laptop runs Ubuntu 16.04 with CUDA 8.0, a GeForce 740M card, and a 5th-gen Core i7.
However, the result I get is that the CPU version is always faster.
Could you run the code and share your results along with your machine configuration?
I compile with:

OPENCV_LIBPATH=/usr/local/lib
OPENCV_LIBS=`pkg-config opencv --cflags --libs`
g++ scaleAdd.cpp -o scaleAdd -Wall -Wextra -m64 -L$OPENCV_LIBPATH $OPENCV_LIBS -lm
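
(If anyone wants to reproduce this, a quick sanity check that the OpenCV build actually sees the GPU may help rule out a CPU-only build. This is just a minimal sketch I put together, not part of my setup; both calls are in OpenCV's public cv::cuda API.)

#include <opencv2/core/cuda.hpp>
#include <iostream>

int main()
{
    // 0 means OpenCV was built without CUDA support or no device is visible.
    int n = cv::cuda::getCudaEnabledDeviceCount();
    std::cout << "CUDA-enabled devices: " << n << std::endl;
    if (n > 0)
        cv::cuda::printShortCudaDeviceInfo(0); // name, compute capability, memory
    return 0;
}

The benchmark itself: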
#include <opencv2/core.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>

using namespace std;

// Simple stopwatch built on OpenCV's tick counter.
class Timer
{
private:
    double tick;

public:
    Timer() : tick(0.0)
    {
    }

    void tic()
    {
        tick = (double)cv::getTickCount();
    }

    // Seconds elapsed since the last tic().
    double toc()
    {
        return ((double)cv::getTickCount() - tick)/cv::getTickFrequency();
    }
};

int main()
{
    size_t LOOPS = 100;
    size_t MAX_SIZE = 6000;
    Timer clock;
    float filter = 1;
    cv::cuda::GpuMat buffer;
    cv::cuda::GpuMat result;

    cout << "Times in seconds" << endl; // toc() returns seconds
    cout << "N,BW(GBS),GPU,CPU" << endl;

    // Grow N geometrically by ~10% per step (truncated back to size_t).
    for (size_t N = 200; N < MAX_SIZE; N *= 1.1)
    {
        // Set up N x N single-channel float matrices on the host.
        cv::Mat h_buffer = cv::Mat::ones(N, N, CV_32FC1);
        cv::Mat h_result = cv::Mat::ones(N, N, CV_32FC1);

        // Host-to-device upload bandwidth in GB/s (N*N floats, 32 bits / 8 = 4 bytes each).
        clock.tic();
        buffer.upload(h_buffer);
        double bw = N*N*32.0/8.0/clock.toc()/1e9;
        result.upload(h_result);

        // GPU: result = filter * buffer + result, averaged over LOOPS calls.
        clock.tic();
        for (size_t i = 0; i < LOOPS; i++)
            cv::cuda::scaleAdd(buffer, filter, result, result);
        double gpu = clock.toc()/(double)LOOPS;

        // CPU: the same operation on the host matrices.
        clock.tic();
        for (size_t i = 0; i < LOOPS; i++)
            cv::scaleAdd(h_buffer, filter, h_result, h_result);
        double cpu = clock.toc()/(double)LOOPS;

        cout << N << "," << bw << "," << gpu << "," << cpu << endl;
    }
    return 0;
}
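
One caveat about my own timing: the very first cv::cuda::scaleAdd call lands inside the timed loop, and on a fresh process the first CUDA call can include one-time context-initialization cost. A warm-up pass before tic() would exclude that. Here is a sketch of what I mean, as a drop-in replacement for the GPU timing block above; the explicit waitForCompletion() fences on the default null stream are my assumption about how to make the measurement safe, not something I have verified closes the gap:

// Warm-up sketch: run one untimed call, then fence before starting the clock.
cv::cuda::scaleAdd(buffer, filter, result, result); // pays any one-time CUDA init
cv::cuda::Stream::Null().waitForCompletion();       // wait until the device is idle

clock.tic();
for (size_t i = 0; i < LOOPS; i++)
    cv::cuda::scaleAdd(buffer, filter, result, result);
cv::cuda::Stream::Null().waitForCompletion();       // fence again before reading the clock
double gpu = clock.toc()/(double)LOOPS;
// (Note: the warm-up accumulates into result one extra time; only timing is compared here,
// not the numerical output.)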