GPU runs much slower than CPU
You may have seen this popular GPU tutorial. I tried to implement the addition kernel from that page and to compare the processing time between the CPU and the GPU. Code:
#include <stdio.h>
#include <iostream>
#include <math.h>
#include <conio.h>
#include <stdlib.h>
#include <conio.h>
#include < Windows.h>
#include <opencv2/core.hpp>
//#include <opencv2/opencv.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace cv;
// Kernel: element-wise in-place addition, y[i] = x[i] + y[i] for 0 <= i < n.
//
// Launch layout: 1D grid of 1D blocks, any size. Each thread starts at its
// global index and advances by the total number of launched threads, so the
// result is correct whether or not the grid covers n exactly.
// Shared memory: none.
// Preconditions: x and y must be device-accessible and hold at least n floats.
// A non-positive n is a no-op.
__global__ void add(long int n, const float *x, float *y)
{
    if (n <= 0)
        return;

    // Index math is done in size_t so blockIdx.x * blockDim.x cannot wrap
    // around 32 bits on very large launches.
    const size_t count = (size_t)n;
    const size_t step = (size_t)blockDim.x * gridDim.x;

    for (size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; i < count; i += step)
        y[i] = x[i] + y[i];
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool cudaSucceeded(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Benchmarks y[i] = x[i] + y[i] over 1M floats, first on the GPU and then on
// the CPU.
//
// GPU part: repeatedly asks for a block size on stdin, runs the `add` kernel
// with enough blocks to cover N, and prints the error against the expected
// value 3.0f together with the elapsed time. Entering 1 runs one last pass
// with blockSize == 1 and then leaves the loop. End-of-input or non-numeric
// input also leaves the loop.
//
// CPU part: runs the same addition 10 times in a plain loop and prints the
// elapsed time of each run.
//
// Returns 0 on success, 1 if an allocation fails or the GPU reports an
// execution error.
int main(void)
{
    const long int N = 1 << 20; // 1M elements
    const size_t bytes = (size_t)N * sizeof(float);
    double tt;
    float *x = NULL, *y = NULL;
    int exitCode = 0;

    // ---------------------------- GPU ----------------------------
    // Allocate the managed buffers once and reuse them for every pass instead
    // of allocating (and leaking) a fresh pair on each iteration.
    if (!cudaSucceeded(cudaMallocManaged(&x, bytes), "cudaMallocManaged(x)") ||
        !cudaSucceeded(cudaMallocManaged(&y, bytes), "cudaMallocManaged(y)")) {
        cudaFree(x); // cudaFree(NULL) is a harmless no-op
        cudaFree(y);
        return 1;
    }

    int blockSize = 256;
    while (blockSize != 1) {
        // (Re)initialize x and y on the host; y is overwritten by each pass.
        for (long int i = 0; i < N; i++) {
            x[i] = 1.0f;
            y[i] = 2.0f;
        }

        std::cout << "Enter blockSize (1 to terminate): ";
        if (!(std::cin >> blockSize))
            break; // EOF or non-numeric input: stop prompting
        if (blockSize < 1) {
            // Guards the ceil-division below against division by zero.
            std::cout << "blockSize must be at least 1." << std::endl;
            continue;
        }
        const int numBlocks = (int)((N + blockSize - 1) / blockSize);

        tt = (double)cv::getTickCount();
        add<<<numBlocks, blockSize>>>(N, x, y);
        const cudaError_t launchStatus = cudaGetLastError();
        // Kernel launches are asynchronous: wait for the GPU inside the timed
        // interval, otherwise only the launch overhead is measured. The sync
        // is also required before the host reads y below.
        // NOTE(review): with managed memory this interval may also include
        // host<->device page migration, not just the arithmetic.
        const cudaError_t syncStatus = cudaDeviceSynchronize();
        tt = ((double)cv::getTickCount() - tt) / cv::getTickFrequency();

        if (!cudaSucceeded(launchStatus, "add<<<...>>> launch")) {
            // Typically an invalid launch configuration (e.g. a block size the
            // device does not support); let the user try another value.
            continue;
        }
        if (!cudaSucceeded(syncStatus, "cudaDeviceSynchronize")) {
            exitCode = 1;
            break;
        }

        // Check for errors (all values should be 3.0f)
        float maxError = 0.0f;
        float net_err = 0.0f;
        for (long int i = 0; i < N; i++) {
            const float err = fabsf(y[i] - 3.0f);
            maxError = fmaxf(maxError, err);
            net_err += err;
        }
        std::cout << "Max error: " << maxError << ", net_err= " << net_err << std::endl;
        std::cout << tt << "seconds spent ." << std::endl;
        std::cout << "------------------------------------------------------------------------------------" << std::endl;
    }

    // Free memory
    cudaSucceeded(cudaFree(x), "cudaFree(x)");
    cudaSucceeded(cudaFree(y), "cudaFree(y)");

    // ---------------------------- CPU ----------------------------
    // One pair of host buffers is reused for all runs and released afterwards.
    x = (float *)malloc(bytes);
    y = (float *)malloc(bytes);
    if (x == NULL || y == NULL) {
        fprintf(stderr, "Host allocation of %lu bytes failed\n", (unsigned long)bytes);
        free(x);
        free(y);
        return 1;
    }

    for (int run = 0; run < 10; run++) {
        // Both inputs must be initialized; reading x uninitialized would feed
        // indeterminate values into the addition.
        for (long int i = 0; i < N; i++) {
            x[i] = 1.0f;
            y[i] = 2.0f;
        }

        tt = (double)cv::getTickCount();
        for (long int i = 0; i < N; i++)
            y[i] = x[i] + y[i];
        tt = ((double)cv::getTickCount() - tt) / cv::getTickFrequency();
        std::cout << tt << "seconds spent ." << std::endl;
    }

    free(x);
    free(y);

    std::cout << "******************************************************" << std::endl;
    // NOTE(review): no key read follows this prompt, so the program returns
    // immediately after printing it.
    std::cout << "Press any key to finish..." << std::endl;
    return exitCode;
}
