Ask Your Question

Revision history [back]

click to hide/show revision 1
initial version

Wrong GpuMat matrix elements filled by cuda kernel

Hi all,

my problem is, that I create a GpuMat, then call a cuda kernel with the GpuMats pointer etc, fill the elements of the matrix (called gpumatdiffsqr), but when I'm back on the CPU, the Matrix elements are wrong.

My cpp file

cv::gpu::GpuMat gpumatdiffsqr(gpumatconcat.size(), CV_32FC1, 100);

simple3cpp(gpumato.ptr<uchar>(), gpumato.step, gpumato.cols, gpumato.rows,
    gpumatconcat.ptr<uchar>(), gpumatconcat.step, gpumatconcat.cols, gpumatconcat.rows,
    gpumatdiffsqr.ptr<float>(), gpumatdiffsqr.step, gpumatdiffsqr.elemSize());

cv::Mat tmp;
gpumatdiffsqr.download(tmp);
std::cout << tmp << std::endl;

My cu file:

__global__ void simple3(unsigned char* data, size_t step, const int cols, const int rows,
    unsigned char* data2, size_t step2, const int cols2, const int rows2,
    float* diffsqr_matrix, size_t diffaqr_step, size_t diffsqr_elemSize){
  //thread.x = row thread.y = col

  //calculate difference and square of patch "data" to all blocks in "data2"
  float diff = data[(threadIdx.x*step)+(threadIdx.y*sizeof(unsigned char))] - data2[(threadIdx.x*step)+((blockIdx.x*cols*sizeof(unsigned char))+(threadIdx.y*sizeof(unsigned char)))];
  float diffsqr = diff * diff;

  diffsqr_matrix[(threadIdx.x*diffaqr_step)+((blockIdx.x*cols*sizeof(float))+(threadIdx.y*sizeof(float)))] = (float) diffsqr;
  float test =   diffsqr_matrix[(threadIdx.x*diffaqr_step)+((blockIdx.x*cols*diffsqr_elemSize)+(threadIdx.y*diffsqr_elemSize))];
  __syncthreads();

  printf("%d %d %d: %f %f %f\n", blockIdx.x, threadIdx.x, threadIdx.y, diff, diffsqr, test);
}

The input is:

gpumatconcat:

[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3; 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7; 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11; 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15]

gpumato:

[20, 20, 20, 20; 20, 20, 20, 20; 20, 20, 20, 20; 20, 20, 20, 20]

the output of the printf inside the kernel is:

3 0 0: 20.000000 400.000000 400.000000 3 1 0: 16.000000 256.000000 256.000000 3 2 0: 12.000000 144.000000 144.000000 3 3 0: 8.000000 64.000000 64.000000 3 0 1: 19.000000 361.000000 361.000000 3 1 1: 15.000000 225.000000 225.000000 3 2 1: 11.000000 121.000000 121.000000 3 3 1: 7.000000 49.000000 49.000000 ...

so that works fine. However the output of

gpumatdiffsqr.download(tmp);
std::cout << tmp << std::endl;

is something like:

[400, 1.4751525e-39, 1.4751525e-39, 1.4755323e-39, 361, 1.9176691e-38, 1.9176691e-38, 1.917668e-38, 324, 3.4969683e-39, 1.4751525e-39, 1.4766281e-39, 289, 1.9174466e-38, 6.8062748e-39, 1.4751525e-39; ...

I can't figure out my error. Pointer and pointer steps of gpumatdifsqr should be fine.

Wrong GpuMat matrix elements filled by cuda kernel

Hi all,

my problem is, that I create a GpuMat, then call a cuda kernel with the GpuMats pointer etc, fill the elements of the matrix (called gpumatdiffsqr), but when I'm back on the CPU, the Matrix elements are wrong.

My cpp file

cv::gpu::GpuMat gpumatdiffsqr(gpumatconcat.size(), CV_32FC1, 100);

simple3cpp(gpumato.ptr<uchar>(), gpumato.step, gpumato.cols, gpumato.rows,
    gpumatconcat.ptr<uchar>(), gpumatconcat.step, gpumatconcat.cols, gpumatconcat.rows,
    gpumatdiffsqr.ptr<float>(), gpumatdiffsqr.step, gpumatdiffsqr.elemSize());

cv::Mat tmp;
gpumatdiffsqr.download(tmp);
std::cout << tmp << std::endl;

My cu file:

__global__ void simple3(unsigned char* data, size_t step, const int cols, const int rows,
    unsigned char* data2, size_t step2, const int cols2, const int rows2,
    float* diffsqr_matrix, size_t diffaqr_step, size_t diffsqr_elemSize){
  //thread.x = row thread.y = col

  //calculate difference and square of patch "data" to all blocks in "data2"
  float diff = data[(threadIdx.x*step)+(threadIdx.y*sizeof(unsigned char))] - data2[(threadIdx.x*step)+((blockIdx.x*cols*sizeof(unsigned char))+(threadIdx.y*sizeof(unsigned char)))];
  float diffsqr = diff * diff;

  diffsqr_matrix[(threadIdx.x*diffaqr_step)+((blockIdx.x*cols*sizeof(float))+(threadIdx.y*sizeof(float)))] = (float) diffsqr;
  float test =   diffsqr_matrix[(threadIdx.x*diffaqr_step)+((blockIdx.x*cols*diffsqr_elemSize)+(threadIdx.y*diffsqr_elemSize))];
  __syncthreads();

  printf("%d %d %d: %f %f %f\n", blockIdx.x, threadIdx.x, threadIdx.y, diff, diffsqr, test);
}

The input is:

gpumatconcat:

[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3;
4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7;
8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11;
12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15]

15]

gpumato:

[20, 20, 20, 20;
20, 20, 20, 20;
20, 20, 20, 20;
20, 20, 20, 20]

20]

the output of the printf inside the kernel is:

3 0 0: 20.000000 400.000000 400.000000
3 1 0: 16.000000 256.000000 256.000000
3 2 0: 12.000000 144.000000 144.000000
3 3 0: 8.000000 64.000000 64.000000
3 0 1: 19.000000 361.000000 361.000000
3 1 1: 15.000000 225.000000 225.000000
3 2 1: 11.000000 121.000000 121.000000
3 3 1: 7.000000 49.000000 49.000000

...

so that works fine. However the output of

gpumatdiffsqr.download(tmp);
std::cout << tmp << std::endl;

is something like:

[400, 1.4751525e-39, 1.4751525e-39, 1.4755323e-39, 361, 1.9176691e-38, 1.9176691e-38, 1.917668e-38, 324, 3.4969683e-39, 1.4751525e-39, 1.4766281e-39, 289, 1.9174466e-38, 6.8062748e-39, 1.4751525e-39; ...

I can't figure out my error. Pointer and pointer steps of gpumatdifsqr should be fine.

Wrong GpuMat matrix elements filled by cuda kernel

Hi all,

my problem is, that I create a GpuMat, then call a cuda kernel with the GpuMats pointer etc, fill the elements of the matrix (called gpumatdiffsqr), but when I'm back on the CPU, the Matrix elements are wrong.

My cpp file

cv::gpu::GpuMat gpumatdiffsqr(gpumatconcat.size(), CV_32FC1, 100);

simple3cpp(gpumato.ptr<uchar>(), gpumato.step, gpumato.cols, gpumato.rows,
    gpumatconcat.ptr<uchar>(), gpumatconcat.step, gpumatconcat.cols, gpumatconcat.rows,
    gpumatdiffsqr.ptr<float>(), gpumatdiffsqr.step, gpumatdiffsqr.elemSize());

cv::Mat tmp;
gpumatdiffsqr.download(tmp);
std::cout << tmp << std::endl;

My cu file:

__global__ void simple3(unsigned char* data, size_t step, const int cols, const int rows,
    unsigned char* data2, size_t step2, const int cols2, const int rows2,
    float* diffsqr_matrix, size_t diffaqr_step, size_t diffsqr_elemSize){
  //thread.x = row thread.y = col

  //calculate difference and square of patch "data" to all blocks in "data2"
  float diff = data[(threadIdx.x*step)+(threadIdx.y*sizeof(unsigned char))] - data2[(threadIdx.x*step)+((blockIdx.x*cols*sizeof(unsigned char))+(threadIdx.y*sizeof(unsigned char)))];
  float diffsqr = diff * diff;

  diffsqr_matrix[(threadIdx.x*diffaqr_step)+((blockIdx.x*cols*sizeof(float))+(threadIdx.y*sizeof(float)))] = (float) diffsqr;
  float test =   diffsqr_matrix[(threadIdx.x*diffaqr_step)+((blockIdx.x*cols*diffsqr_elemSize)+(threadIdx.y*diffsqr_elemSize))];
  __syncthreads();

  printf("%d %d %d: %f %f %f\n", blockIdx.x, threadIdx.x, threadIdx.y, diff, diffsqr, test);
}

The input is:

gpumatconcat:

[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3;
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7;
  8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11, 8, 9, 10, 11;
  12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15, 12, 13, 14, 15]

gpumato:

[20, 20, 20, 20;
  20, 20, 20, 20;
  20, 20, 20, 20;
  20, 20, 20, 20]

The calculation is similar to gpumatdiffsqr = gpumato - gpumatconcat; (gpumato is applied to blocks in gpumatconcat) the output of the printf inside the kernel is:

3 0 0: 20.000000 400.000000 400.000000
3 1 0: 16.000000 256.000000 256.000000
3 2 0: 12.000000 144.000000 144.000000
3 3 0: 8.000000 64.000000 64.000000
3 0 1: 19.000000 361.000000 361.000000
3 1 1: 15.000000 225.000000 225.000000
3 2 1: 11.000000 121.000000 121.000000
3 3 1: 7.000000 49.000000 49.000000

...

so that works fine. However the output of

gpumatdiffsqr.download(tmp);
std::cout << tmp << std::endl;

is something like:

[400, 1.4751525e-39, 1.4751525e-39, 1.4755323e-39, 361, 1.9176691e-38, 1.9176691e-38, 1.917668e-38, 324, 3.4969683e-39, 1.4751525e-39, 1.4766281e-39, 289, 1.9174466e-38, 6.8062748e-39, 1.4751525e-39; ...

I can't figure out my error. Pointer and pointer steps of gpumatdifsqr should be fine.