Revision history [back]

using GPU module with own code

I am new to CUDA programming, and am trying to use OpenCV's GPU module with my own CUDA code, but am having problem getting it to work so was wondering if anyone here can point out what I am doing wrong.

I have made a very simple toy problem.

----------------------------------
// main.cpp 

#include <iostream>
#include <fstream>


#include "opencv2/highgui/highgui.hpp"
#include "opencv2/gpu/gpu.hpp"

#include "test.h"

using namespace std;
using namespace cv;
using namespace cv::gpu;

int main()
{
    setDevice(0);

    Mat image = imread("./testset/image_0001.png");
    Mat bw_image(image.size(), CV_32FC1); 
    cvtColor(image, bw_image, CV_RGB2GRAY);
    GpuMat  d_image(bw_image);

    GpuMat  d_image_result(d_image.size(), d_image.type() );


    test_func(d_image, d_image_result);

    Mat mmm;
    d_image_result.download(mmm);

  return 0;
}


-----------------------------------------------------------
// test.h

#ifndef __TEST__
#define __TEST__

#ifndef SKIP_INCLUDES
#include <vector>
#include <memory>
#include <iosfwd>
#endif

#include "opencv2/core/gpumat.hpp"

using namespace std;
using namespace cv;
using namespace cv::gpu;

void do_test(PtrStepSzb src,PtrStepSzb dst);

CV_EXPORTS void test_func(const GpuMat& src, GpuMat& dst)
{
  do_test(src, dst);
}

#endif /* __TEST__ */

-----------------------------------------------------------------------
// test.cu

using namespace std;
using namespace cv;
using namespace cv::gpu;

__global__ void do_test_kernel(PtrStepSz<float> src, PtrStepSz<float> dst)
{

    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if(x<dst.cols && y < dst.rows)
        {dst.ptr(y)[x] = src.ptr(y)[x];}
}

void    do_test(const PtrStepSz<float>& src, PtrStepSz<float>& dst)
{
    dim3 block(32,8);
    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

    do_test_kernel<<<grid,block>>>(src,dst);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
};

-------------------------------------------------------------

Now running the code explained above, results in this runtime error:

OpenCV Error: Gpu API call (unspecified launch failure) in caller, file /.../opencv-2.4.7/modules/gpu/src/cuda/matrix_reductions.cu, line 437 terminate called after throwing an instance of 'cv::Exception' what(): /.../opencv-2.4.7/modules/gpu/src/cuda/matrix_reductions.cu:437: error: (-217) unspecified launch failure in function caller

Can anyone please tell me what is the problem with my code?

using GPU module with own code

I have made a very simple toy problem.

----------------------------------
// main.cpp 

#include <iostream>
#include <fstream>


#include "opencv2/highgui/highgui.hpp"
#include "opencv2/gpu/gpu.hpp"

#include "test.h"

using namespace std;
using namespace cv;
using namespace cv::gpu;

int main()
{
    setDevice(0);

    Mat image = imread("./testset/image_0001.png");
    Mat bw_image(image.size(), CV_32FC1); 
    cvtColor(image, bw_image, CV_RGB2GRAY);
    GpuMat  d_image(bw_image);

    GpuMat  d_image_result(d_image.size(), d_image.type() );


    test_func(d_image, d_image_result);

    Mat mmm;
    d_image_result.download(mmm);

  return 0;
}


-----------------------------------------------------------
// test.h

#ifndef __TEST__
#define __TEST__

#ifndef SKIP_INCLUDES
#include <vector>
#include <memory>
#include <iosfwd>
#endif

#include "opencv2/core/gpumat.hpp"

using namespace std;
using namespace cv;
using namespace cv::gpu;

void do_test(PtrStepSzb src,PtrStepSzb dst);

CV_EXPORTS void test_func(const GpuMat& src, GpuMat& dst)
{
  do_test(src, dst);
}

#endif /* __TEST__ */

-----------------------------------------------------------------------
// test.cu

using namespace std;
using namespace cv;
using namespace cv::gpu;

__global__ void do_test_kernel(PtrStepSz<float> src, PtrStepSz<float> dst)
{

    int x = threadIdx.x + blockIdx.x * blockDim.x;
    int y = threadIdx.y + blockIdx.y * blockDim.y;

    if(x<dst.cols && y < dst.rows)
        {dst.ptr(y)[x] = src.ptr(y)[x];}
}

void    do_test(const PtrStepSz<float>& src, PtrStepSz<float>& dst)
{
    dim3 block(32,8);
    dim3 grid(divUp(dst.cols, block.x), divUp(dst.rows, block.y));

    do_test_kernel<<<grid,block>>>(src,dst);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
};

-------------------------------------------------------------

Now running the code explained above, results in this runtime error:

Can anyone please tell me what is the problem with my code?