
GPU runs much slower than CPU

You may have seen this popular CUDA tutorial: https://devblogs.nvidia.com/even-easier-introduction-cuda/. I implemented the addition kernel from that page and compared the processing time of the CPU with that of the GPU:

#include <stdio.h>
#include <iostream>
#include <math.h>
#include <conio.h>
#include <stdlib.h>
#include <Windows.h>
#include <opencv2/core.hpp>
//#include <opencv2/opencv.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

using namespace cv;

// Kernel function to add the elements of two arrays
__global__ void add(long int n, float *x, float *y)
{
    int index = threadIdx.x;
    int stride = blockDim.x;
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int STEP_LEN = blockDim.x * gridDim.x;
    //xx = int(n / (STEP_LEN));
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
    {
        y[i] = x[i] + y[i];
    }
    /*for (int j = 0; j < n / (STEP_LEN); j++)
        y[(j * STEP_LEN) + i] = x[(j * STEP_LEN) + i] + y[(j * STEP_LEN) + i];*/
}

int main(void)
{
long int N = 1 << 20; // 1M elements

double tt;
float *x, *y;

// Run kernel on 1M elements on the GPU
int blockSize = 256;
while (blockSize != 1)
{
    cudaMallocManaged(&x, N * sizeof(float));
    cudaMallocManaged(&y, N * sizeof(float));

    // initialize x and y arrays on the host
    for (long int i = 0; i < N; i++) {
        x[i] = 1.0f;
        y[i] = 2.0f;
        //printf("fabs(y[%ld] - 3.0f)= %g\n", i, y[i] - 3.0f);
    }
    for (long int i = 0; i < N; i++)
        y[i] = 2.0f;
    std::cout << "Enter blockSize (1 to terminate): ";
    std::cin >> blockSize;
    int numBlocks = (N + blockSize - 1) / blockSize;
    tt = (double)getTickCount();
    add<<<numBlocks, blockSize>>>(N, x, y);
    tt = ((double)getTickCount() - tt) / getTickFrequency();
    //add<<<8, 64>>>(N, x, y);

    // Wait for GPU to finish before accessing on host
    cudaDeviceSynchronize();

    // Check for errors (all values should be 3.0f)
    float maxError = 0.0f;
    float net_err = 0;
    for (long int i = 0; i < N; i++)
    {
        //std::cout << "i1= " << (long int)(i) << ") " << y[i] << std::endl;
        maxError = fmax(maxError, fabs(y[i] - 3.0f));
        net_err += fabs(y[i] - 3.0f);
    }

    std::cout << "Max error: " << maxError << ", net_err= " << net_err << std::endl;
    std::cout << tt << "seconds spent ." << std::endl;
    std::cout << "------------------------------------------------------------------------------------" << std::endl;

    // Free memory
    cudaFree(x);
    cudaFree(y);
}

for (register int j1 = 0; j1 < 10; j1++)
{
    x = (float*)malloc(N * sizeof(float));
    y = (float*)malloc(N * sizeof(float));
    for (register long int i = 0; i < N; i++) {
        x[i] = 1.0f; // initialize x as well; malloc() leaves its contents undefined
        y[i] = 2.0f;
    }

    tt = (double)getTickCount();
    for (register long int i = 0; i < N; i++)
        y[i] = x[i] + y[i];
    tt = ((double)getTickCount() - tt) / getTickFrequency();
    std::cout << tt << "seconds spent ." << std::endl;
    std::cout << "******************************************************" << std::endl;
    free(x);
    free(y);
}

std::cout << "Press any key to finish..." << std::endl;
getch();
return 0;

} "blockSize" is the number of threads in the threadblocks. Here is the output of the program, The number of threadblocks hace been specified as {512, 256, 128, 64} and the processing time is calculated. Afterwards, the same summation is executed by CPU the the processing time is similarly calculated. Enter blockSize (1 to terminate): 512 Max error: 0, net_err= 0

0.0307048seconds spent .

Enter blockSize (1 to terminate): 256 Max error: 0, net_err= 0

0.0307976seconds spent .

Enter blockSize (1 to terminate): 128 Max error: 0, net_err= 0

0.031793seconds spent .

Enter blockSize (1 to terminate): 64 Max error: 0, net_err= 0

0.0304692seconds spent .

Enter blockSize (1 to terminate): 1 Max error: 0, net_err= 0

0.0280911seconds spent .

0.0027317seconds spent .


0.0027427seconds spent .


0.0025947seconds spent .


0.0026085seconds spent .


0.0027727seconds spent .


0.0026877seconds spent .


0.0033705seconds spent .


0.0028491seconds spent .


0.0031258seconds spent .


0.0029499seconds spent .


Press any key to finish...
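Note on the launch configuration: numBlocks is computed as (N + blockSize - 1) / blockSize with N = 1 << 20 = 1,048,576, so the block sizes entered above correspond to the following grids (my own arithmetic, not printed by the program):

    blockSize = 512  ->  numBlocks = (1048576 + 511) / 512 = 2048 blocks
    blockSize = 256  ->  numBlocks = 4096 blocks
    blockSize = 128  ->  numBlocks = 8192 blocks
    blockSize =  64  ->  numBlocks = 16384 blocks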

As can be seen from the output, the GPU runs almost ten times slower than the CPU. The time for GPU memory allocation and release is not included; the measured time for the GPU covers only the kernel execution. The GPU is supposed to be much faster. Why is the GPU so much slower than the CPU here? Thank you.
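For reference, here is a minimal sketch of how the same kernel time could alternatively be measured with CUDA events instead of getTickCount(). This is only my assumption of an equivalent measurement, not what the program above does; it reuses the same x, y, N, numBlocks, and blockSize as in the code above.

    // Time the add kernel with CUDA events (sketch)
    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start);
    add<<<numBlocks, blockSize>>>(N, x, y);
    cudaEventRecord(stop);

    // Block the host until the stop event (and hence the kernel) has completed,
    // so the measurement covers the whole kernel execution, not only the launch.
    cudaEventSynchronize(stop);

    float ms = 0.0f;
    cudaEventElapsedTime(&ms, start, stop);
    std::cout << ms / 1000.0f << " seconds spent (CUDA events)." << std::endl;

    cudaEventDestroy(start);
    cudaEventDestroy(stop);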
