Uploading a GpuMat is too slow on the Jetson TX1
Hi. I'm currently working on a Jetson TX1 with CUDA 8.0. I compiled OpenCV 3.1 with CUDA support, and the build appears to have succeeded.
However, when I tried some of the samples, I found the CUDA computation to be quite slow, mostly because of the time spent uploading images.
For example, I tried SURF feature extraction and matching, based on the OpenCV sample code. The result is shown below:
upLoad = 39.9022
Device 0: "NVIDIA Tegra X1" 3995Mb, sm_53, Driver/Runtime ver.8.0/8.0
FOUND 158 keypoints on first image
FOUND 137 keypoints on second image
Findcuda = 0.000123487 Extraction = 0.0952315
Matching = 0.00152424 Download = 0.00137919
This means that uploading two images took about 40 seconds! Is there any way to fix this? Thank you.
I put my code here.
#include <iostream>
#include "opencv2/opencv_modules.hpp"
#ifdef HAVE_OPENCV_XFEATURES2D
#include "opencv2/core.hpp"
#include "opencv2/features2d.hpp"
#include "opencv2/highgui.hpp"
#include "opencv2/cudafeatures2d.hpp"
#include "opencv2/xfeatures2d/cuda.hpp"
using namespace std;
using namespace cv;
using namespace cv::cuda;
// Prints a short description of the program followed by its command-line usage.
static void help()
{
    const char* const description =
        "\nThis program demonstrates using SURF_CUDA features detector, descriptor extractor and BruteForceMatcher_CUDA";
    const char* const usage =
        "\nUsage:\n\tsurf_keypoint_matcher --left <image1> --right <image2>";

    // Each message is terminated with endl, so the stream is flushed after both.
    std::cout << description << std::endl
              << usage << std::endl;
}
int main(int argc, char* argv[])
{
if (argc != 5)
{
help();
return -1;
}
GpuMat img1, img2;
cv::Mat raw1,raw2;
raw1 = imread(argv[2], IMREAD_GRAYSCALE);
raw2= imread(argv[4], IMREAD_GRAYSCALE);
int64 t0 = cv::getTickCount();
for (int i = 1; i < argc; ++i)
{
if (string(argv[i]) == "--left")
{
img1.upload(raw1);
CV_Assert(!img1.empty());
}
else if (string(argv[i]) == "--right")
{
img2.upload(raw2);
CV_Assert(!img2.empty());
}
else if (string(argv[i]) == "--help")
{
help();
return -1;
}
}
int64 t1 = cv::getTickCount();
cout << " upLoad = " << (t1-t0)/cv::getTickFrequency() << endl;
cv::cuda::printShortCudaDeviceInfo(cv::cuda::getDevice());
int64 t2 = cv::getTickCount();
SURF_CUDA surf;
// detecting keypoints & computing descriptors
GpuMat keypoints1GPU, keypoints2GPU;
GpuMat descriptors1GPU, descriptors2GPU;
surf(img1, GpuMat(), keypoints1GPU, descriptors1GPU);
surf(img2, GpuMat(), keypoints2GPU, descriptors2GPU);
int64 t3 = cv::getTickCount();
cout << "FOUND " << keypoints1GPU.cols << " keypoints on first image" << endl;
cout << "FOUND " << keypoints2GPU.cols << " keypoints on second image" << endl;
cout << " Findcuda = " << (t2-t1)/cv::getTickFrequency() << " Extraction = " << (t3-t2)/cv::getTickFrequency() << endl;
// matching descriptors
Ptr<cv::cuda::DescriptorMatcher> matcher = cv::cuda::DescriptorMatcher::createBFMatcher(surf.defaultNorm());
vector<DMatch> matches;
matcher->match(descriptors1GPU, descriptors2GPU, matches);
int64 t4 = cv::getTickCount();
// downloading results
vector<KeyPoint> keypoints1, keypoints2;
vector<float> descriptors1, descriptors2;
surf.downloadKeypoints(keypoints1GPU, keypoints1);
surf.downloadKeypoints(keypoints2GPU, keypoints2);
surf.downloadDescriptors(descriptors1GPU, descriptors1);
surf.downloadDescriptors(descriptors2GPU, descriptors2);
int64 t5 = cv::getTickCount();
cout << " Matching = " << (t4-t3)/cv::getTickFrequency() << " Download = " << (t5-t4)/cv::getTickFrequency() << endl;
// drawing the results
Mat img_matches;
drawMatches(Mat(img1), keypoints1, Mat(img2), keypoints2, matches, img_matches);
namedWindow("matches", 0);
imshow("matches", img_matches);
waitKey(0);
return 0;
}
#else
// Fallback entry point used when HAVE_OPENCV_XFEATURES2D is not defined:
// reports the missing module on stderr and exits with status 0.
int main()
{
    const char* const notice = "OpenCV was built without xfeatures2d module";
    std::cerr << notice << std::endl;
    return 0;
}
#endif
OpenCV performs GPU initialization the first time the GPU is actually used. So please run a GPU command first, before you start timing. At the moment the initialization is included in your measurement, because the first GPU call is the upload function.
Hi Steven. I tried calling a GPU function first, but the upload step still takes the same amount of time. Is that what you meant?
Hmm, that is weird indeed. Could you simply run the upload 1000 times, time the whole loop, and divide by the number of iterations? I am curious whether the overhead averages out or not.