Ask Your Question

Revision history [back]

click to hide/show revision 1
initial version

Imread with preallocated data pointer (cuda unified memory) is reallocating with the exact same size, but shouldn't

Hello!
I have a problem with processing a set of images using opencv and cuda Unified Memory.
I am trying to use a Mat created once using a pre-allocated data buffer with cudaMallocManaged. When reading the image, the data seem to be reallocated during the mat::create in the imread function, but create should not be re-allocating since my previous data pointer has the exact same size. Here's an example of my issue

#include <iostream>
#include <experimental/filesystem>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <opencv2/cudawarping.hpp>

#define COLS 5344
#define ROWS 4016

namespace fs = std::experimental::filesystem;

int main(int argc, char const *argv[])
{
    int index = 0;
    //---- get arguments ----
    if (argc < 2) {
        std::cout << "number of arguments invalid" << std::endl;
        std::cout << "1 - Input Folder" << std::endl;
        std::cout << "2 - Output Folder" << std::endl;
        exit(1);
    }
    std::string input_folder = argv[1];
    std::string output_folder = argv[2];

    // Unified pointers
    void *src_ptr, *dst_ptr;
    if (cudaSuccess != cudaMallocManaged(&src_ptr, ROWS*COLS*3)) return -1;
    cv::Mat src(ROWS, COLS, CV_8UC3, src_ptr);
    cv::cuda::GpuMat d_src(ROWS, COLS, CV_8UC3, src_ptr);
    if (cudaSuccess != cudaMallocManaged(&dst_ptr, ROWS*COLS*3)) return -2;
    cv::Mat dst(rows, cols, CV_8UC3, dst_ptr);
    cv::cuda::GpuMat d_dst(rows, cols, CV_8UC3, dst_ptr);

    for (const auto& entry : fs::directory_iterator(input_folder))
    {
        std::cout << "Before imread : src total = " << src.total() << "    src elemsize = " << src.elemSize() << "    src_ptr : " << src_ptr << "    &(src 0 0)" << &src.at<cv::Vec3b>(0, 0) << std::endl;

        // Reading image
        src = cv::imread(entry.path());

        std::cout << "After imread : src total = " << src.total() << "    src elemsize = " << src.elemSize() << "    src_ptr : " << src_ptr << "    &(src 0 0)" << &src.at<cv::Vec3b>(0, 0) << std::endl;

        // Some action on cuda device
        cv::cuda::rotate(d_src, d_dst, d_dst.size(), 180.0);

        // write output file
        std::string out_name = output_folder + "preprocessed_image" + std::to_string(index) + ".jpg";
        index++;
        cv::imwrite(out_name, dst);
        break; // only the first one is needed for the example
    }
    return 0;
}

With this example, when the imread is performed (so a mat create), the data pointer is reallocated for the first loop. It seems weird since mat create should verify mat::total * mat::elemsize, and not reallocate if the result is the same as what is read in the image. And here this is the case. I'm printing the addresses of src_ptr and the data address of the src mat: before the imread, they are identical; after the imread, the second one has changed.
My goal is to avoid explicit data transfer between host and cuda device. I really don't know why the data pointer is reallocated. I'm looking forward to some advice in order to correct this issue or avoid it. Thank you in advance.

Imread with preallocated data pointer (cuda unified memory) is reallocating with the exact same size, but shouldn't

Hello!
I have a problem with processing a set of images using opencv and cuda Unified Memory.
I am trying to use a Mat created once using a pre-allocated data buffer with cudaMallocManaged. When reading the image, the data seem to be reallocated during the mat::create in the imread function, but create should not be re-allocating since my previous data pointer has the exact same size. Here's an example of my issue

#include <iostream>
#include <experimental/filesystem>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <opencv2/cudawarping.hpp>

#define COLS 5344
#define ROWS 4016

namespace fs = std::experimental::filesystem;

int main(int argc, char const *argv[])
{
    int index = 0;
    //---- get arguments ----
    if (argc < 2) {
        std::cout << "number of arguments invalid" << std::endl;
        std::cout << "1 - Input Folder" << std::endl;
        std::cout << "2 - Output Folder" << std::endl;
        exit(1);
    }
    std::string input_folder = argv[1];
    std::string output_folder = argv[2];

    // Unified pointers
    void *src_ptr, *dst_ptr;
    if (cudaSuccess != cudaMallocManaged(&src_ptr, ROWS*COLS*3)) return -1;
    cv::Mat src(ROWS, COLS, CV_8UC3, src_ptr);
    cv::cuda::GpuMat d_src(ROWS, COLS, CV_8UC3, src_ptr);
    if (cudaSuccess != cudaMallocManaged(&dst_ptr, ROWS*COLS*3)) return -2;
    cv::Mat dst(rows, cols, CV_8UC3, dst_ptr);
    cv::cuda::GpuMat d_dst(rows, cols, CV_8UC3, dst_ptr);

    for (const auto& entry : fs::directory_iterator(input_folder))
    {
        std::cout << "Before imread : src total = " << src.total() << "    src elemsize = " << src.elemSize() << "    src_ptr : " << src_ptr << "    &(src 0 0)" << &src.at<cv::Vec3b>(0, 0) << std::endl;

        // Reading image
        src = cv::imread(entry.path());

        std::cout << "After imread : src total = " << src.total() << "    src elemsize = " << src.elemSize() << "    src_ptr : " << src_ptr << "    &(src 0 0)" << &src.at<cv::Vec3b>(0, 0) << std::endl;

        // Some action on cuda device
        cv::cuda::rotate(d_src, d_dst, d_dst.size(), 180.0);

        // write output file
        std::string out_name = output_folder + "preprocessed_image" + std::to_string(index) + ".jpg";
        index++;
        cv::imwrite(out_name, dst);
        break; // only the first one is needed for the example
    }
    return 0;
}

With this example, when the imread is performed (so a mat create), the data pointer is reallocated for the first loop. It seems weird since mat create should verify mat::total * mat::elemsize, and not reallocate if the result is the same as what is read in the image. And here this is the case. I'm printing the addresses of src_ptr and the data address of the src mat: before the imread, they are identical; after the imread, the second one has changed.
My goal is to avoid explicit data transfer between host and cuda device. I really don't know why the data pointer is reallocated. I'm looking forward to some advice in order to correct this issue or avoid it. Thank you in advance.

[EDIT] The only "solution" I found is to read into a temporary Mat and then use copyTo to the src one. With this solution the address doesn't change, but a copy is performed...

Imread with preallocated data pointer (cuda unified memory) is reallocating with the exact same size, but shouldn't

Hello!
I have a problem with processing a set of images using opencv and cuda Unified Memory.
I am trying to use a Mat created once using a pre-allocated data buffer with cudaMallocManaged. When reading the image, the data seem to be reallocated during the mat::create in the imread function, but create should not be re-allocating since my previous data pointer has the exact same size. Here's an example of my issue

#include <iostream>
#include <experimental/filesystem>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <opencv2/cudawarping.hpp>

#define COLS 5344
#define ROWS 4016

namespace fs = std::experimental::filesystem;

int main(int argc, char const *argv[])
{
    int index = 0;
    //---- get arguments ----
    if (argc < 2) {
        std::cout << "number of arguments invalid" << std::endl;
        std::cout << "1 - Input Folder" << std::endl;
        std::cout << "2 - Output Folder" << std::endl;
        exit(1);
    }
    std::string input_folder = argv[1];
    std::string output_folder = argv[2];

    // Unified pointers
    void *src_ptr, *dst_ptr;
    if (cudaSuccess != cudaMallocManaged(&src_ptr, ROWS*COLS*3)) return -1;
    cv::Mat src(ROWS, COLS, CV_8UC3, src_ptr);
    cv::cuda::GpuMat d_src(ROWS, COLS, CV_8UC3, src_ptr);
    if (cudaSuccess != cudaMallocManaged(&dst_ptr, ROWS*COLS*3)) return -2;
    cv::Mat dst(rows, cols, CV_8UC3, dst_ptr);
    cv::cuda::GpuMat d_dst(rows, cols, CV_8UC3, dst_ptr);

    for (const auto& entry : fs::directory_iterator(input_folder))
    {
        std::cout << "Before imread : src total = " << src.total() << "    src elemsize = " << src.elemSize() << "    src_ptr : " << src_ptr << "    &(src 0 0)" << &src.at<cv::Vec3b>(0, 0) << std::endl;

        // Reading image
        src = cv::imread(entry.path());

        std::cout << "After imread : src total = " << src.total() << "    src elemsize = " << src.elemSize() << "    src_ptr : " << src_ptr << "    &(src 0 0)" << &src.at<cv::Vec3b>(0, 0) << std::endl;

        // Some action on cuda device
        cv::cuda::rotate(d_src, d_dst, d_dst.size(), 180.0);

        // write output file
        std::string out_name = output_folder + "preprocessed_image" + std::to_string(index) + ".jpg";
        index++;
        cv::imwrite(out_name, dst);
        break; // only the first one is needed for the example
    }
    return 0;
}

With this example, when the imread is performed (so a mat create), the data pointer is reallocated for the first loop. It seems weird since mat create should verify mat::total * mat::elemsize, and not reallocate if the result is the same as what is read in the image. And here this is the case. I'm printing the addresses of src_ptr and the data address of the src mat: before the imread, they are identical; after the imread, the second one has changed.
My goal is to avoid explicit data transfer between host and cuda device. I really don't know why the data pointer is reallocated. I'm looking forward to some advice in order to correct this issue or avoid it. Thank you in advance.

[EDIT] The only "solution" I found is to read into a temporary Mat and then use copyTo to the src one. With this solution the address doesn't change, but a copy is performed...

Imread with preallocated data pointer (cuda unified memory) is reallocating with the exact same size, but shouldn't

Hello!
I have a problem with processing a set of images using opencv and cuda Unified Memory.
I am trying to use a Mat created once using a pre-allocated data buffer with cudaMallocManaged. When reading the image, the data seem to be reallocated during the mat::create in the imread function, but create should not be re-allocating since my previous data pointer has the exact same size. Here's an example of my issue

#include <iostream>
#include <experimental/filesystem>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <opencv2/cudawarping.hpp>

#define COLS 5344
#define ROWS 4016

namespace fs = std::experimental::filesystem;

int main(int argc, char const *argv[])
{
    int index = 0;
    //---- get arguments ----
    if (argc < 2) {
        std::cout << "number of arguments invalid" << std::endl;
        std::cout << "1 - Input Folder" << std::endl;
        std::cout << "2 - Output Folder" << std::endl;
        exit(1);
    }
    std::string input_folder = argv[1];
    std::string output_folder = argv[2];

    // Unified pointers
    void *src_ptr, *dst_ptr;
    if (cudaSuccess != cudaMallocManaged(&src_ptr, ROWS*COLS*3)) return -1;
    cv::Mat src(ROWS, COLS, CV_8UC3, src_ptr);
    cv::cuda::GpuMat d_src(ROWS, COLS, CV_8UC3, src_ptr);
    if (cudaSuccess != cudaMallocManaged(&dst_ptr, ROWS*COLS*3)) return -2;
    cv::Mat dst(rows, cols, CV_8UC3, dst_ptr);
    cv::cuda::GpuMat d_dst(rows, cols, CV_8UC3, dst_ptr);

    for (const auto& entry : fs::directory_iterator(input_folder))
    {
        std::cout << "Before imread : src total = " << src.total() << "    src elemsize = " << src.elemSize() << "    src_ptr : " << src_ptr << "    &(src 0 0)" << &src.at<cv::Vec3b>(0, 0) << std::endl;

        // Reading image
        src = cv::imread(entry.path());

        std::cout << "After imread : src total = " << src.total() << "    src elemsize = " << src.elemSize() << "    src_ptr : " << src_ptr << "    &(src 0 0)" << &src.at<cv::Vec3b>(0, 0) << std::endl;

        // Some action on cuda device
        cv::cuda::rotate(d_src, d_dst, d_dst.size(), 180.0);

        // write output file
        std::string out_name = output_folder + "preprocessed_image" + std::to_string(index) + ".jpg";
        index++;
        cv::imwrite(out_name, dst);
        break; // only the first one is needed for the example
    }
    return 0;
}

With this example, when the imread is performed (so a mat create), the data pointer is reallocated for the first loop. It seems weird since mat create should verify mat::total * mat::elemsize, and not reallocate if the result is the same as what is read in the image. And here this is the case. I'm printing the addresses of src_ptr and the data address of the src mat: before the imread, they are identical; after the imread, the second one has changed.
My goal is to avoid explicit data transfer between host and cuda device. I really don't know why the data pointer is reallocated. I'm looking forward to some advice in order to correct this issue or avoid it. Thank you in advance.

[EDIT] The only "solution" I found is to read into a temporary Mat and then use copyTo to the src one. With this solution the address doesn't change, but a copy is performed... I mean, the same mat create is used in both copyTo and imread, so why is imread reallocating my data? I really don't understand. The copyTo slows down my application a lot; if you have any alternatives I would be glad to hear about them. Thanks a lot in advance.

Imread with preallocated data pointer (cuda unified memory) is reallocating with the exact same size, but shouldn't

Hello!
I have a problem with processing a set of images using opencv and cuda Unified Memory.
I am trying to use a Mat created once using a pre-allocated data buffer with cudaMallocManaged. When reading the image, the data seem to be reallocated during the mat::create in the imread function, but create should not be re-allocating since my previous data pointer has the exact same size. Here's an example of my issue

#include <iostream>
#include <experimental/filesystem>
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <opencv2/cudawarping.hpp>

#define COLS 5344
#define ROWS 4016

namespace fs = std::experimental::filesystem;

int main(int argc, char const *argv[])
{
    int index = 0;
    //---- get arguments ----
    if (argc < 2) {
        std::cout << "number of arguments invalid" << std::endl;
        std::cout << "1 - Input Folder" << std::endl;
        std::cout << "2 - Output Folder" << std::endl;
        exit(1);
    }
    std::string input_folder = argv[1];
    std::string output_folder = argv[2];

    // Unified pointers
    void *src_ptr, *dst_ptr;
    if (cudaSuccess != cudaMallocManaged(&src_ptr, ROWS*COLS*3)) return -1;
    cv::Mat src(ROWS, COLS, CV_8UC3, src_ptr);
    cv::cuda::GpuMat d_src(ROWS, COLS, CV_8UC3, src_ptr);
    if (cudaSuccess != cudaMallocManaged(&dst_ptr, ROWS*COLS*3)) return -2;
    cv::Mat dst(rows, cols, CV_8UC3, dst_ptr);
    cv::cuda::GpuMat d_dst(rows, cols, CV_8UC3, dst_ptr);

    for (const auto& entry : fs::directory_iterator(input_folder))
    {
        std::cout << "Before imread : src total = " << src.total() << "    src elemsize = " << src.elemSize() << "    src_ptr : " << src_ptr << "    &(src 0 0)" << &src.at<cv::Vec3b>(0, 0) << std::endl;

        // Reading image
        src = cv::imread(entry.path());

        std::cout << "After imread : src total = " << src.total() << "    src elemsize = " << src.elemSize() << "    src_ptr : " << src_ptr << "    &(src 0 0)" << &src.at<cv::Vec3b>(0, 0) << std::endl;

        // Some action on cuda device
        cv::cuda::rotate(d_src, d_dst, d_dst.size(), 180.0);

        // write output file
        std::string out_name = output_folder + "preprocessed_image" + std::to_string(index) + ".jpg";
        index++;
        cv::imwrite(out_name, dst);
        break; // only the first one is needed for the example
    }
    return 0;
}

With this example, when the imread is performed (so a mat create), the data pointer is reallocated for the first loop. It seems weird since mat create should verify mat::total * mat::elemsize, and not reallocate if the result is the same as what is read in the image. And here this is the case. I'm printing the addresses of src_ptr and the data address of the src mat: before the imread, they are identical; after the imread, the second one has changed.
My goal is to avoid explicit data transfer between host and cuda device. I really don't know why the data pointer is reallocated. I'm looking forward to some advice in order to correct this issue or avoid it. Thank you in advance.

[EDIT] The only "solution" I found is to read into a temporary Mat and then use copyTo to the src one. With this solution the address doesn't change, but a copy is performed... I mean, the same mat create is used in both copyTo and imread, so why is imread reallocating my data? I really don't understand. The copyTo slows down my application a lot; if you have any alternatives I would be glad to hear about them. Thanks a lot in advance.

[EDIT] So basically I'm doing an assignment here :

src = cv::imread(entry.path());

Of course src is now the new mat created by imread. The solution would be to have an overloaded imread like this :

imread(const string path, Mat &src)

Thanks @cudawarped for pointing out the problem!