Ask Your Question

Revision history [back]

click to hide/show revision 1
initial version

I hope the original poster has already solved the problem, but the error highlighted occurs because the descriptors must be supplied as cv::cuda::GpuMat, not cv::Mat, since you are using the GPU matcher class.

Nevertheless, here are two code examples (for OpenCV 3.0) that perform ORB detection/extraction and descriptor matching with the CUDA module; I hope they will be helpful to someone else:

  • example_with_full_gpu(): detects ORB keypoints, computes ORB descriptors and performs the knn-matching using only calls to CUDA functions
  • example_with_gpu_matching(): only the matching uses the GPU, to demonstrate that it is possible to use any of the features available in features2d.hpp or xfeatures2d.hpp and still match on the GPU

    #include <iostream>
    #include <opencv2/opencv.hpp>
    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaimgproc.hpp>
    #include <opencv2/cudafeatures2d.hpp>

    /**
     * Full-GPU example: detects ORB keypoints, computes ORB descriptors and
     * knn-matches them using only cv::cuda calls, then shows the matches in the
     * "imgRes" window and writes them to "GPU_ORB-matching.png".
     *
     * @param img1 train image (main() passes box.png); converted with BGR->gray.
     * @param img2 query image (main() passes box_in_scene.png); converted with BGR->gray.
     *
     * If either image is empty a message is printed on std::cerr and the
     * function returns without doing anything. Otherwise it ends by waiting
     * on cv::waitKey(0).
     */
    void example_with_full_gpu(const cv::Mat &img1, const cv::Mat img2) {
        //An empty image (e.g. a failed cv::imread) cannot be uploaded nor converted
        if(img1.empty() || img2.empty()) {
            std::cerr << "example_with_full_gpu: empty input image, nothing to do" << std::endl;
            return;
        }

        //Upload from host memory to gpu device memory
        cv::cuda::GpuMat img1_gpu(img1), img2_gpu(img2);
        cv::cuda::GpuMat img1_gray_gpu, img2_gray_gpu;

        //Convert BGR to grayscale as gpu detectAndCompute only allow grayscale GpuMat
        //(cv::COLOR_BGR2GRAY is the C++ enumerator replacing the legacy CV_BGR2GRAY macro)
        cv::cuda::cvtColor(img1_gpu, img1_gray_gpu, cv::COLOR_BGR2GRAY);
        cv::cuda::cvtColor(img2_gpu, img2_gray_gpu, cv::COLOR_BGR2GRAY);

        //Create a GPU ORB feature object
        //blurForDescriptor=true seems to give better results
        //http://answers.opencv.org/question/10835/orb_gpu-not-as-good-as-orbcpu/
        cv::Ptr<cv::cuda::ORB> orb = cv::cuda::ORB::create(500, 1.2f, 8, 31, 0, 2, 0, 31, 20, true);

        cv::cuda::GpuMat keypoints1_gpu, descriptors1_gpu;
        //Detect ORB keypoints and extract descriptors on train image (box.png)
        orb->detectAndComputeAsync(img1_gray_gpu, cv::cuda::GpuMat(), keypoints1_gpu, descriptors1_gpu);
        std::vector<cv::KeyPoint> keypoints1;
        //Convert from CUDA object to std::vector<cv::KeyPoint>
        orb->convert(keypoints1_gpu, keypoints1);
        std::cout << "keypoints1=" << keypoints1.size() << " ; descriptors1_gpu=" << descriptors1_gpu.rows
            << "x" << descriptors1_gpu.cols << std::endl;

        std::vector<cv::KeyPoint> keypoints2;
        cv::cuda::GpuMat descriptors2_gpu;
        //Detect ORB keypoints and extract descriptors on query image (box_in_scene.png)
        //The conversion from internal data to std::vector<cv::KeyPoint> is done implicitly in detectAndCompute()
        orb->detectAndCompute(img2_gray_gpu, cv::cuda::GpuMat(), keypoints2, descriptors2_gpu);
        std::cout << "keypoints2=" << keypoints2.size() << " ; descriptors2_gpu=" << descriptors2_gpu.rows
            << "x" << descriptors2_gpu.cols << std::endl;

        //Create a GPU brute-force matcher with Hamming distance as we use a binary descriptor (ORB)
        cv::Ptr<cv::cuda::DescriptorMatcher> matcher = cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);

        std::vector<std::vector<cv::DMatch> > knn_matches;
        //Match each query descriptor to a train descriptor (2 nearest neighbours each)
        matcher->knnMatch(descriptors2_gpu, descriptors1_gpu, knn_matches, 2);
        std::cout << "knn_matches=" << knn_matches.size() << std::endl;

        std::vector<cv::DMatch> matches;
        //Filter the matches using the ratio test: keep the best match only when its
        //distance is below 0.8 times the distance of the second best one
        for(std::vector<std::vector<cv::DMatch> >::const_iterator it = knn_matches.begin(); it != knn_matches.end(); ++it) {
            if(it->size() > 1 && (*it)[0].distance/(*it)[1].distance < 0.8) {
                matches.push_back((*it)[0]);
            }
        }

        cv::Mat imgRes;
        //Display and save the image with matches
        cv::drawMatches(img2, keypoints2, img1, keypoints1, matches, imgRes);
        cv::imshow("imgRes", imgRes);
        cv::imwrite("GPU_ORB-matching.png", imgRes);

        cv::waitKey(0);
    }

    /**
     * CPU detection + GPU matching example: ORB keypoints and descriptors are
     * computed with the CPU cv::ORB, the descriptors are uploaded to the GPU
     * and knn-matched with the CUDA brute-force matcher. The matches are shown
     * in the "imgRes" window and written to "CPU_ORB+GPU_matching.png".
     *
     * @param img1 train image (main() passes box.png).
     * @param img2 query image (main() passes box_in_scene.png).
     *
     * If either image is empty a message is printed on std::cerr and the
     * function returns without doing anything. Otherwise it ends by waiting
     * on cv::waitKey(0).
     */
    void example_with_gpu_matching(const cv::Mat &img1, const cv::Mat img2) {
        //An empty image (e.g. a failed cv::imread) gives nothing to detect or draw
        if(img1.empty() || img2.empty()) {
            std::cerr << "example_with_gpu_matching: empty input image, nothing to do" << std::endl;
            return;
        }

        //Create a CPU ORB feature object
        cv::Ptr<cv::Feature2D> orb = cv::ORB::create(500, 1.2f, 8, 31, 0, 2, 0, 31, 20);

        std::vector<cv::KeyPoint> keypoints1;
        cv::Mat descriptors1;
        //Detect ORB keypoints and extract descriptors on train image (box.png)
        orb->detectAndCompute(img1, cv::Mat(), keypoints1, descriptors1);
        std::cout << "keypoints1=" << keypoints1.size() << " ; descriptors1=" << descriptors1.rows
            << "x" << descriptors1.cols << std::endl;

        std::vector<cv::KeyPoint> keypoints2;
        cv::Mat descriptors2;
        //Detect ORB keypoints and extract descriptors on query image (box_in_scene.png)
        orb->detectAndCompute(img2, cv::Mat(), keypoints2, descriptors2);
        std::cout << "keypoints2=" << keypoints2.size() << " ; descriptors2=" << descriptors2.rows
            << "x" << descriptors2.cols << std::endl;

        std::vector<std::vector<cv::DMatch> > knn_matches;
        //With an empty descriptor set there is nothing to upload or match:
        //knn_matches stays empty and the images are drawn without matches
        if(!descriptors1.empty() && !descriptors2.empty()) {
            //Create a GPU brute-force matcher with Hamming distance as we use a binary descriptor (ORB)
            cv::Ptr<cv::cuda::DescriptorMatcher> matcher = cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);

            //Upload from host memory to gpu device memory
            cv::cuda::GpuMat descriptors1_gpu(descriptors1), descriptors2_gpu;
            //Upload from host memory to gpu device memory (another way to do it)
            descriptors2_gpu.upload(descriptors2);

            //Match each query descriptor to a train descriptor (2 nearest neighbours each)
            matcher->knnMatch(descriptors2_gpu, descriptors1_gpu, knn_matches, 2);
        }
        std::cout << "knn_matches=" << knn_matches.size() << std::endl;

        std::vector<cv::DMatch> matches;
        //Filter the matches using the ratio test: keep the best match only when its
        //distance is below 0.8 times the distance of the second best one
        for(std::vector<std::vector<cv::DMatch> >::const_iterator it = knn_matches.begin(); it != knn_matches.end(); ++it) {
            if(it->size() > 1 && (*it)[0].distance/(*it)[1].distance < 0.8) {
                matches.push_back((*it)[0]);
            }
        }

        cv::Mat imgRes;
        //Display and save the image with matches
        cv::drawMatches(img2, keypoints2, img1, keypoints1, matches, imgRes);
        cv::imshow("imgRes", imgRes);
        cv::imwrite("CPU_ORB+GPU_matching.png", imgRes);

        cv::waitKey(0);
    }

    /**
     * Prints the OpenCV version, loads the two sample images and runs both
     * examples on them.
     *
     * @return 0 on success, 1 when one of the input images could not be read.
     */
    int main() {
        //CV_VERSION is a string literal, so std::hex/std::dec had no effect on it
        std::cout << "OpenCV version=" << CV_VERSION << std::endl;

        cv::Mat img1, img2;
        img1 = cv::imread("C:/OpenCV/opencv-3.0.0-rc1/sources/samples/data/box.png");
        img2 = cv::imread("C:/OpenCV/opencv-3.0.0-rc1/sources/samples/data/box_in_scene.png");

        //cv::imread gives back an empty matrix when the file cannot be read
        if(img1.empty() || img2.empty()) {
            std::cerr << "Could not read the input images" << std::endl;
            return 1;
        }

        example_with_full_gpu(img1, img2);
        example_with_gpu_matching(img1, img2);

        return 0;
    }

The resulting images, first for the all-GPU version and then for CPU detection + GPU matching:

GPU

CPU+GPU

The results are not exactly the same; I don't know exactly why.

I hope the original poster already solved his problem, but the error highlighted is that you have to supply the descriptors using cv::cuda::GpuMat and not with cv::Mat, as you use the GPU matcher class.

Nevertheless, I post here 2 example codes (in OpenCV 3.0) to achieve ORB detection/extraction and descriptors matching using the CUDA module which, I hope, could be helpful to someone else:

  • example_with_full_gpu(): detect ORB keypoints, compute ORB descriptors and perform the knn-matching with only calls to cuda functions
  • example_with_gpu_matching(): only the matching use the GPU, to demonstrate that it is possible to use all the features available in features2d.hpp or xfeatures2d.hpp and match with the GPU

    include <iostream>

    include <opencv2 opencv.hpp="">

    include <opencv2 core="" cuda.hpp="">

    include <opencv2 cudaimgproc.hpp="">

    include <opencv2 cudafeatures2d.hpp="">

    void example_with_full_gpu(const cv::Mat &img1, const cv::Mat img2) { //Upload from host memory to gpu device memeory cv::cuda::GpuMat img1_gpu(img1), img2_gpu(img2); cv::cuda::GpuMat img1_gray_gpu, img2_gray_gpu;

    //Convert RGB to grayscale as gpu detectAndCompute only allow grayscale GpuMat
     cv::cuda::cvtColor(img1_gpu, img1_gray_gpu, CV_BGR2GRAY);
     cv::cuda::cvtColor(img2_gpu, img2_gray_gpu, CV_BGR2GRAY);

    CV_BGR2GRAY); //Create a GPU ORB feature object //blurForDescriptor=true seems to give better results //http://answers.opencv.org/question/10835/orb_gpu-not-as-good-as-orbcpu/ cv::Ptr<cv::cuda::orb> cv::Ptr<cv::cuda::ORB> orb = cv::cuda::ORB::create(500, 1.2f, 8, 31, 0, 2, 0, 31, 20, true);

    true); cv::cuda::GpuMat keypoints1_gpu, descriptors1_gpu; //Detect ORB keypoints and extract descriptors on train image (box.png) orb->detectAndComputeAsync(img1_gray_gpu, cv::cuda::GpuMat(), keypoints1_gpu, descriptors1_gpu); std::vector<cv::keypoint> std::vector<cv::KeyPoint> keypoints1; //Convert from CUDA object to std::vector<cv::keypoint> std::vector<cv::KeyPoint> orb->convert(keypoints1_gpu, keypoints1); std::cout << "keypoints1=" << keypoints1.size() << " ; descriptors1_gpu=" << descriptors1_gpu.rows << "x" << descriptors1_gpu.cols << std::endl;

    std::vector<cv::keypoint> std::endl; std::vector<cv::KeyPoint> keypoints2; cv::cuda::GpuMat descriptors2_gpu; //Detect ORB keypoints and extract descriptors on query image (box_in_scene.png) //The conversion from internal data to std::vector<cv::keypoint> std::vector<cv::KeyPoint> is done implicitly in detectAndCompute() orb->detectAndCompute(img2_gray_gpu, cv::cuda::GpuMat(), keypoints2, descriptors2_gpu); std::cout << "keypoints2=" << keypoints2.size() << " ; descriptors2_gpu=" << descriptors2_gpu.rows << "x" << descriptors2_gpu.cols << std::endl;

    std::endl; //Create a GPU brute-force matcher with Hamming distance as we use a binary descriptor (ORB) cv::Ptr<cv::cuda::descriptormatcher> cv::Ptr<cv::cuda::DescriptorMatcher> matcher = cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);

    std::vector<std::vector<cv::dmatch> cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING); std::vector<std::vector<cv::DMatch> > knn_matches; //Match each query descriptor to a train descriptor matcher->knnMatch(descriptors2_gpu, descriptors1_gpu, knn_matches, 2); std::cout << "knn_matches=" << knn_matches.size() << std::endl;

    std::vector<cv::dmatch> std::endl; std::vector<cv::DMatch> matches; //Filter the matches using the ratio test for(std::vector<std::vector<cv::dmatch> for(std::vector<std::vector<cv::DMatch> >::const_iterator it = knn_matches.begin(); it != knn_matches.end(); ++it) { if(it->size() > 1 && (it)[0].distance/(it)[1].distance (*it)[0].distance/(*it)[1].distance < 0.8) { matches.push_back((*it)[0]); } }

    } cv::Mat imgRes; //Display and save the image with matches cv::drawMatches(img2, keypoints2, img1, keypoints1, matches, imgRes); cv::imshow("imgRes", imgRes); cv::imwrite("GPU_ORB-matching.png", imgRes);

    cv::waitKey(0);
    imgRes); cv::waitKey(0);

    }

    void example_with_gpu_matching(const cv::Mat &img1, const cv::Mat img2) { //Create a CPU ORB feature object cv::Ptr<cv::feature2d> orb = cv::ORB::create(500, 1.2f, 8, 31, 0, 2, 0, 31, 20);

    std::vector<cv::keypoint>

    std::vector<cv::KeyPoint> keypoints1;
     cv::Mat descriptors1;
     //Detect ORB keypoints and extract descriptors on train image (box.png)
     orb->detectAndCompute(img1, cv::Mat(), keypoints1, descriptors1);
     std::cout << "keypoints1=" << keypoints1.size() << " ; descriptors1=" << descriptors1.rows
    << "x" << descriptors1.cols << std::endl;

    std::vector<cv::keypoint> std::endl; std::vector<cv::KeyPoint> keypoints2; cv::Mat descriptors2; //Detect ORB keypoints and extract descriptors on query image (box_in_scene.png) orb->detectAndCompute(img2, cv::Mat(), keypoints2, descriptors2); std::cout << "keypoints2=" << keypoints2.size() << " ; descriptors2=" << descriptors2.rows << "x" << descriptors2.cols << std::endl;

    std::endl; //Create a GPU brute-force matcher with Hamming distance as we use a binary descriptor (ORB) cv::Ptr<cv::cuda::descriptormatcher> cv::Ptr<cv::cuda::DescriptorMatcher> matcher = cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);

    cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING); //Upload from host memory to gpu device memeory cv::cuda::GpuMat descriptors1_gpu(descriptors1), descriptors2_gpu; //Upload from host memory to gpu device memeory (another way to do it) descriptors2_gpu.upload(descriptors2);

    std::vector<std::vector<cv::dmatch> descriptors2_gpu.upload(descriptors2); std::vector<std::vector<cv::DMatch> > knn_matches; //Match each query descriptor to a train descriptor matcher->knnMatch(descriptors2_gpu, descriptors1_gpu, knn_matches, 2); std::cout << "knn_matches=" << knn_matches.size() << std::endl;

    std::vector<cv::dmatch> std::endl; std::vector<cv::DMatch> matches; //Filter the matches using the ratio test for(std::vector<std::vector<cv::dmatch> for(std::vector<std::vector<cv::DMatch> >::const_iterator it = knn_matches.begin(); it != knn_matches.end(); ++it) { if(it->size() > 1 && (it)[0].distance/(it)[1].distance (*it)[0].distance/(*it)[1].distance < 0.8) { matches.push_back((*it)[0]); } }

    } cv::Mat imgRes; //Display and save the image with matches cv::drawMatches(img2, keypoints2, img1, keypoints1, matches, imgRes); cv::imshow("imgRes", imgRes);
    cv::imwrite("CPU_ORB+GPU_matching.png", imgRes);

    cv::waitKey(0);
    imgRes); cv::waitKey(0);

    }

    int main() { std::cout << "OpenCV version=" << std::hex << CV_VERSION << std::dec << std::endl;

    cv::Mat img1, img2;
     img1 = cv::imread("C:/OpenCV/opencv-3.0.0-rc1/sources/samples/data/box.png");
    cv::imread("samples/data/box.png");
    img2 = cv::imread("C:/OpenCV/opencv-3.0.0-rc1/sources/samples/data/box_in_scene.png");

    cv::imread("samples/data/box_in_scene.png"); example_with_full_gpu(img1, img2); example_with_gpu_matching(img1, img2);

    img2); return 0;

    }

The resulting images for first all GPU and after CPU + GPU matching:

GPU

CPU+GPU

The results are not exactly the same, I don't know why exactly.

I hope the original poster already solved his problem, but the error highlighted is that you have to supply the descriptors using cv::cuda::GpuMat and not with cv::Mat, as you use the GPU matcher class.

Nevertheless, I post here 2 example codes (in OpenCV 3.0) to achieve ORB detection/extraction and descriptors matching using the CUDA module which, I hope, could be helpful to someone else:

  • example_with_full_gpu(): detect ORB keypoints, compute ORB descriptors and perform the knn-matching with only calls to cuda functions
  • example_with_gpu_matching(): only the matching use the GPU, to demonstrate that it is possible to use all the features available in features2d.hpp or xfeatures2d.hpp and match with the GPU

    include <iostream>

    include <opencv2 opencv.hpp="">

    include <opencv2 core="" cuda.hpp="">

    include <opencv2 cudaimgproc.hpp="">

    include <opencv2 cudafeatures2d.hpp="">

    #include <iostream>
    #include <opencv2/opencv.hpp>
    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaimgproc.hpp>
    #include <opencv2/cudafeatures2d.hpp>
    
    void example_with_full_gpu(const cv::Mat &img1, const cv::Mat img2) {
     //Upload from host memory to gpu device memeory
     cv::cuda::GpuMat img1_gpu(img1), img2_gpu(img2);
     cv::cuda::GpuMat img1_gray_gpu, img2_gray_gpu;

    img2_gray_gpu;
    //Convert RGB to grayscale as gpu detectAndCompute only allow grayscale GpuMat
    cv::cuda::cvtColor(img1_gpu, img1_gray_gpu, CV_BGR2GRAY);
    cv::cuda::cvtColor(img2_gpu, img2_gray_gpu, CV_BGR2GRAY);
    //Create a GPU ORB feature object
    //blurForDescriptor=true seems to give better results
    //http://answers.opencv.org/question/10835/orb_gpu-not-as-good-as-orbcpu/
    cv::Ptr<cv::cuda::ORB> orb = cv::cuda::ORB::create(500, 1.2f, 8, 31, 0, 2, 0, 31, 20, true);
    cv::cuda::GpuMat keypoints1_gpu, descriptors1_gpu;
    //Detect ORB keypoints and extract descriptors on train image (box.png)
    orb->detectAndComputeAsync(img1_gray_gpu, cv::cuda::GpuMat(), keypoints1_gpu, descriptors1_gpu);
    std::vector<cv::KeyPoint> keypoints1;
    //Convert from CUDA object to std::vector<cv::KeyPoint>
    orb->convert(keypoints1_gpu, keypoints1);
    std::cout << "keypoints1=" << keypoints1.size() << " ; descriptors1_gpu=" << descriptors1_gpu.rows
    << "x" << descriptors1_gpu.cols << std::endl;
    std::vector<cv::KeyPoint> keypoints2;
    cv::cuda::GpuMat descriptors2_gpu;
    //Detect ORB keypoints and extract descriptors on query image (box_in_scene.png)
    //The conversion from internal data to std::vector<cv::KeyPoint> is done implicitly in detectAndCompute()
    orb->detectAndCompute(img2_gray_gpu, cv::cuda::GpuMat(), keypoints2, descriptors2_gpu);
    std::cout << "keypoints2=" << keypoints2.size() << " ; descriptors2_gpu=" << descriptors2_gpu.rows
    << "x" << descriptors2_gpu.cols << std::endl;
    //Create a GPU brute-force matcher with Hamming distance as we use a binary descriptor (ORB)
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher = cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);
    std::vector<std::vector<cv::DMatch> > knn_matches;
    //Match each query descriptor to a train descriptor
    matcher->knnMatch(descriptors2_gpu, descriptors1_gpu, knn_matches, 2);
    std::cout << "knn_matches=" << knn_matches.size() << std::endl;
    std::vector<cv::DMatch> matches;
    //Filter the matches using the ratio test
    for(std::vector<std::vector<cv::DMatch> >::const_iterator it = knn_matches.begin(); it != knn_matches.end(); ++it) {
    if(it->size() > 1 && (*it)[0].distance/(*it)[1].distance < 0.8) {
    matches.push_back((*it)[0]);
    }
    }
    cv::Mat imgRes;
    //Display and save the image with matches
    cv::drawMatches(img2, keypoints2, img1, keypoints1, matches, imgRes);
    cv::imshow("imgRes", imgRes);
    cv::imwrite("GPU_ORB-matching.png", imgRes);
    cv::waitKey(0);
    

    }

    cv::waitKey(0); } void example_with_gpu_matching(const cv::Mat &img1, const cv::Mat img2) { //Create a CPU ORB feature object cv::Ptr<cv::feature2d> cv::Ptr<cv::Feature2D> orb = cv::ORB::create(500, 1.2f, 8, 31, 0, 2, 0, 31, 20);

    20);
    std::vector<cv::KeyPoint> keypoints1;
    cv::Mat descriptors1;
    //Detect ORB keypoints and extract descriptors on train image (box.png)
    orb->detectAndCompute(img1, cv::Mat(), keypoints1, descriptors1);
    std::cout << "keypoints1=" << keypoints1.size() << " ; descriptors1=" << descriptors1.rows
    << "x" << descriptors1.cols << std::endl;
    std::vector<cv::KeyPoint> keypoints2;
    cv::Mat descriptors2;
    //Detect ORB keypoints and extract descriptors on query image (box_in_scene.png)
    orb->detectAndCompute(img2, cv::Mat(), keypoints2, descriptors2);
    std::cout << "keypoints2=" << keypoints2.size() << " ; descriptors2=" << descriptors2.rows
    << "x" << descriptors2.cols << std::endl;
    //Create a GPU brute-force matcher with Hamming distance as we use a binary descriptor (ORB)
    cv::Ptr<cv::cuda::DescriptorMatcher> matcher = cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);
    //Upload from host memory to gpu device memeory
    cv::cuda::GpuMat descriptors1_gpu(descriptors1), descriptors2_gpu;
    //Upload from host memory to gpu device memeory (another way to do it)
    descriptors2_gpu.upload(descriptors2);
    std::vector<std::vector<cv::DMatch> > knn_matches;
    //Match each query descriptor to a train descriptor
    matcher->knnMatch(descriptors2_gpu, descriptors1_gpu, knn_matches, 2);
    std::cout << "knn_matches=" << knn_matches.size() << std::endl;
    std::vector<cv::DMatch> matches;
    //Filter the matches using the ratio test
    for(std::vector<std::vector<cv::DMatch> >::const_iterator it = knn_matches.begin(); it != knn_matches.end(); ++it) {
    if(it->size() > 1 && (*it)[0].distance/(*it)[1].distance < 0.8) {
    matches.push_back((*it)[0]);
    }
    }
    cv::Mat imgRes;
    //Display and save the image with matches
    cv::drawMatches(img2, keypoints2, img1, keypoints1, matches, imgRes);
    cv::imshow("imgRes", imgRes);
    cv::imwrite("CPU_ORB+GPU_matching.png", imgRes);
    cv::waitKey(0);
    

    }

    cv::waitKey(0); } int main() { std::cout << "OpenCV version=" << std::hex << CV_VERSION << std::dec << std::endl;

    std::endl;
    cv::Mat img1, img2;
    img1 = cv::imread("samples/data/box.png");
    img2 = cv::imread("samples/data/box_in_scene.png");
    example_with_full_gpu(img1, img2);
    example_with_gpu_matching(img1, img2);
    return 0;
    }
    

    }

The resulting images for first all GPU and after CPU + GPU matching:

GPU

CPU+GPU

The results are not exactly the same, I don't know why exactly.

I hope the original poster has already solved the problem, but the error highlighted occurs because the descriptors must be supplied as cv::cuda::GpuMat, not cv::Mat, since you are using the GPU matcher class.

Nevertheless, here are two code examples (for OpenCV 3.0) that perform ORB detection/extraction and descriptor matching with the CUDA module; I hope they will be helpful to someone else:

  • example_with_full_gpu(): detects ORB keypoints, computes ORB descriptors and performs the knn-matching using only calls to CUDA functions
  • example_with_gpu_matching(): only the matching uses the GPU, to demonstrate that it is possible to use any of the features available in features2d.hpp or xfeatures2d.hpp and still match on the GPU

    #include <iostream>
    #include <opencv2/opencv.hpp>
    #include <opencv2/core/cuda.hpp>
    #include <opencv2/cudaimgproc.hpp>
    #include <opencv2/cudafeatures2d.hpp>
    
    /**
     * Full-GPU example: detects ORB keypoints, computes ORB descriptors and
     * knn-matches them using only cv::cuda calls, then shows the matches in the
     * "imgRes" window and writes them to "GPU_ORB-matching.png".
     *
     * @param img1 train image (main() passes box.png); converted with BGR->gray.
     * @param img2 query image (main() passes box_in_scene.png); converted with BGR->gray.
     *
     * If either image is empty a message is printed on std::cerr and the
     * function returns without doing anything. Otherwise it ends by waiting
     * on cv::waitKey(0).
     */
    void example_with_full_gpu(const cv::Mat &img1, const cv::Mat img2) {
        //An empty image (e.g. a failed cv::imread) cannot be uploaded nor converted
        if(img1.empty() || img2.empty()) {
            std::cerr << "example_with_full_gpu: empty input image, nothing to do" << std::endl;
            return;
        }

        //Upload from host memory to gpu device memory
        cv::cuda::GpuMat img1_gpu(img1), img2_gpu(img2);
        cv::cuda::GpuMat img1_gray_gpu, img2_gray_gpu;

        //Convert BGR to grayscale as gpu detectAndCompute only allow grayscale GpuMat
        //(cv::COLOR_BGR2GRAY is the C++ enumerator replacing the legacy CV_BGR2GRAY macro)
        cv::cuda::cvtColor(img1_gpu, img1_gray_gpu, cv::COLOR_BGR2GRAY);
        cv::cuda::cvtColor(img2_gpu, img2_gray_gpu, cv::COLOR_BGR2GRAY);

        //Create a GPU ORB feature object
        //blurForDescriptor=true seems to give better results
        //http://answers.opencv.org/question/10835/orb_gpu-not-as-good-as-orbcpu/
        cv::Ptr<cv::cuda::ORB> orb = cv::cuda::ORB::create(500, 1.2f, 8, 31, 0, 2, 0, 31, 20, true);

        cv::cuda::GpuMat keypoints1_gpu, descriptors1_gpu;
        //Detect ORB keypoints and extract descriptors on train image (box.png)
        orb->detectAndComputeAsync(img1_gray_gpu, cv::cuda::GpuMat(), keypoints1_gpu, descriptors1_gpu);
        std::vector<cv::KeyPoint> keypoints1;
        //Convert from CUDA object to std::vector<cv::KeyPoint>
        orb->convert(keypoints1_gpu, keypoints1);
        std::cout << "keypoints1=" << keypoints1.size() << " ; descriptors1_gpu=" << descriptors1_gpu.rows
            << "x" << descriptors1_gpu.cols << std::endl;

        std::vector<cv::KeyPoint> keypoints2;
        cv::cuda::GpuMat descriptors2_gpu;
        //Detect ORB keypoints and extract descriptors on query image (box_in_scene.png)
        //The conversion from internal data to std::vector<cv::KeyPoint> is done implicitly in detectAndCompute()
        orb->detectAndCompute(img2_gray_gpu, cv::cuda::GpuMat(), keypoints2, descriptors2_gpu);
        std::cout << "keypoints2=" << keypoints2.size() << " ; descriptors2_gpu=" << descriptors2_gpu.rows
            << "x" << descriptors2_gpu.cols << std::endl;

        //Create a GPU brute-force matcher with Hamming distance as we use a binary descriptor (ORB)
        cv::Ptr<cv::cuda::DescriptorMatcher> matcher = cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);

        std::vector<std::vector<cv::DMatch> > knn_matches;
        //Match each query descriptor to a train descriptor (2 nearest neighbours each)
        matcher->knnMatch(descriptors2_gpu, descriptors1_gpu, knn_matches, 2);
        std::cout << "knn_matches=" << knn_matches.size() << std::endl;

        std::vector<cv::DMatch> matches;
        //Filter the matches using the ratio test: keep the best match only when its
        //distance is below 0.8 times the distance of the second best one
        for(std::vector<std::vector<cv::DMatch> >::const_iterator it = knn_matches.begin(); it != knn_matches.end(); ++it) {
            if(it->size() > 1 && (*it)[0].distance/(*it)[1].distance < 0.8) {
                matches.push_back((*it)[0]);
            }
        }

        cv::Mat imgRes;
        //Display and save the image with matches
        cv::drawMatches(img2, keypoints2, img1, keypoints1, matches, imgRes);
        cv::imshow("imgRes", imgRes);
        cv::imwrite("GPU_ORB-matching.png", imgRes);

        cv::waitKey(0);
    }
    
    /**
     * CPU detection + GPU matching example: ORB keypoints and descriptors are
     * computed with the CPU cv::ORB, the descriptors are uploaded to the GPU
     * and knn-matched with the CUDA brute-force matcher. The matches are shown
     * in the "imgRes" window and written to "CPU_ORB+GPU_matching.png".
     *
     * @param img1 train image (main() passes box.png).
     * @param img2 query image (main() passes box_in_scene.png).
     *
     * If either image is empty a message is printed on std::cerr and the
     * function returns without doing anything. Otherwise it ends by waiting
     * on cv::waitKey(0).
     */
    void example_with_gpu_matching(const cv::Mat &img1, const cv::Mat img2) {
        //An empty image (e.g. a failed cv::imread) gives nothing to detect or draw
        if(img1.empty() || img2.empty()) {
            std::cerr << "example_with_gpu_matching: empty input image, nothing to do" << std::endl;
            return;
        }

        //Create a CPU ORB feature object
        cv::Ptr<cv::Feature2D> orb = cv::ORB::create(500, 1.2f, 8, 31, 0, 2, 0, 31, 20);

        std::vector<cv::KeyPoint> keypoints1;
        cv::Mat descriptors1;
        //Detect ORB keypoints and extract descriptors on train image (box.png)
        orb->detectAndCompute(img1, cv::Mat(), keypoints1, descriptors1);
        std::cout << "keypoints1=" << keypoints1.size() << " ; descriptors1=" << descriptors1.rows
            << "x" << descriptors1.cols << std::endl;

        std::vector<cv::KeyPoint> keypoints2;
        cv::Mat descriptors2;
        //Detect ORB keypoints and extract descriptors on query image (box_in_scene.png)
        orb->detectAndCompute(img2, cv::Mat(), keypoints2, descriptors2);
        std::cout << "keypoints2=" << keypoints2.size() << " ; descriptors2=" << descriptors2.rows
            << "x" << descriptors2.cols << std::endl;

        std::vector<std::vector<cv::DMatch> > knn_matches;
        //With an empty descriptor set there is nothing to upload or match:
        //knn_matches stays empty and the images are drawn without matches
        if(!descriptors1.empty() && !descriptors2.empty()) {
            //Create a GPU brute-force matcher with Hamming distance as we use a binary descriptor (ORB)
            cv::Ptr<cv::cuda::DescriptorMatcher> matcher = cv::cuda::DescriptorMatcher::createBFMatcher(cv::NORM_HAMMING);

            //Upload from host memory to gpu device memory
            cv::cuda::GpuMat descriptors1_gpu(descriptors1), descriptors2_gpu;
            //Upload from host memory to gpu device memory (another way to do it)
            descriptors2_gpu.upload(descriptors2);

            //Match each query descriptor to a train descriptor (2 nearest neighbours each)
            matcher->knnMatch(descriptors2_gpu, descriptors1_gpu, knn_matches, 2);
        }
        std::cout << "knn_matches=" << knn_matches.size() << std::endl;

        std::vector<cv::DMatch> matches;
        //Filter the matches using the ratio test: keep the best match only when its
        //distance is below 0.8 times the distance of the second best one
        for(std::vector<std::vector<cv::DMatch> >::const_iterator it = knn_matches.begin(); it != knn_matches.end(); ++it) {
            if(it->size() > 1 && (*it)[0].distance/(*it)[1].distance < 0.8) {
                matches.push_back((*it)[0]);
            }
        }

        cv::Mat imgRes;
        //Display and save the image with matches
        cv::drawMatches(img2, keypoints2, img1, keypoints1, matches, imgRes);
        cv::imshow("imgRes", imgRes);
        cv::imwrite("CPU_ORB+GPU_matching.png", imgRes);

        cv::waitKey(0);
    }
    
    /**
     * Prints the OpenCV version, loads the two sample images and runs both
     * examples on them.
     *
     * @return 0 on success, 1 when one of the input images could not be read.
     */
    int main() {
        //CV_VERSION is a string literal, so std::hex/std::dec had no effect on it
        std::cout << "OpenCV version=" << CV_VERSION << std::endl;

        cv::Mat img1, img2;
        img1 = cv::imread("samples/data/box.png");
        img2 = cv::imread("samples/data/box_in_scene.png");

        //cv::imread gives back an empty matrix when the file cannot be read
        if(img1.empty() || img2.empty()) {
            std::cerr << "Could not read the input images" << std::endl;
            return 1;
        }

        example_with_full_gpu(img1, img2);
        example_with_gpu_matching(img1, img2);

        return 0;
    }
    

The resulting images, first for the all-GPU version and then for CPU detection + GPU matching:

GPU

CPU+GPU

The results are not exactly the same, and I don't know exactly why (the only thing I found is that — originally a hyperlink whose target is not preserved here).