OpenCV DNN yolov3-tiny cannot get the same result as Darknet yolov3-tiny

Hello everybody! First, I trained on my training dataset with the original Darknet yolov3-tiny on Ubuntu. Second, I tested the trained model on my validation dataset, and it predicts very good results. But when I run the same model through the OpenCV DNN sample (https://github.com/opencv/opencv/blob/master/samples/dnn/object_detection.cpp), I cannot reproduce the Darknet results, and this is where I am stuck. Can anyone explain the reason and help me solve the problem?
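One difference I already wondered about: as far as I understand, Darknet letterboxes its input (aspect-preserving resize padded with gray), while blobFromImage simply stretches the frame to 416x416. Below is a minimal letterbox sketch I considered trying; letterbox is my own helper, and I am assuming Darknet pads with 0.5 gray (128). With letterboxed input, the predicted boxes would also have to be mapped back to the original frame, which my code below does not do.

    // Sketch of Darknet-style letterbox preprocessing (my assumption:
    // aspect-preserving resize, centered, padded with 0.5 gray = 128).
    Mat letterbox(const Mat& src, int netW, int netH)
    {
        float r = std::min((float)netW / src.cols, (float)netH / src.rows);
        int newW = (int)(src.cols * r), newH = (int)(src.rows * r);
        Mat resized;
        resize(src, resized, Size(newW, newH));
        Mat canvas(netH, netW, src.type(), Scalar(128, 128, 128)); // gray padding
        resized.copyTo(canvas(Rect((netW - newW) / 2, (netH - newH) / 2, newW, newH)));
        return canvas;
    }
    // Usage instead of passing frame directly:
    // blobFromImage(letterbox(frame, 416, 416), blob, 1.0/255, Size(416, 416), Scalar(), true, false);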

Here is my OpenCV DNN YOLO code:

#include <fstream>
#include <sstream>
#include <iostream>

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>

using namespace cv;
using namespace std;
using namespace dnn;

float confThreshold, nmsThreshold;
std::vector<std::string> classes;

void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net);

void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame);

int main(int argc, char** argv)
{
    confThreshold = 0.8;
    nmsThreshold = 0.4;
    double scale = 1.0/255;
    //int scale = 1;
    Scalar mean = Scalar(0,0,0);
    bool swapRB = true;
    int inpWidth = 416;
    int inpHeight = 416;
    std::string modelPath = "yolov3-tiny_111000.weights";
    std::string configPath = "tiny.cfg";

    // Open the file with class names.
    {
        std::string file = "target.names";
        std::ifstream ifs(file.c_str());
        if (!ifs.is_open())
            CV_Error(Error::StsError, "File " + file + " not found");
        std::string line;
        while (std::getline(ifs, line))
            classes.push_back(line);
    }

    // Load a model.
    Net net = readNetFromDarknet(configPath, modelPath);
    net.setPreferableBackend(DNN_BACKEND_OPENCV);
    net.setPreferableTarget(DNN_TARGET_CPU);
    std::vector<String> outNames = net.getUnconnectedOutLayersNames();

    // Create a window
    static const std::string kWinName = "Deep learning object detection in OpenCV";
    namedWindow(kWinName, WINDOW_NORMAL);

    // Open a video file or an image file or a camera stream.
    VideoCapture cap;
    cap.open("1.mp4");

    // Process frames.
    Mat frame, blob;
    int index = 0;
    while (waitKey(1) < 0)
    {
        cap >> frame;
        if (frame.empty())
        {
            waitKey();
            break;
        }
        index++;
        if (index % 10 != 0)  // process only every 10th frame
            continue;

        // Create a 4D blob from a frame.
        Size inpSize(inpWidth > 0 ? inpWidth : frame.cols,
            inpHeight > 0 ? inpHeight : frame.rows);
        blobFromImage(frame, blob, scale, inpSize, mean, swapRB, false, CV_32F);
        cout << "blob" << endl;
        cout << blob<< endl;
        //blobFromImage(frame, blob, scale, inpSize, mean, swapRB, false, CV_8U);


        // Run a model.
        net.setInput(blob);
        if (net.getLayer(0)->outputNameToIndex("im_info") != -1)  // Faster-RCNN or R-FCN
        {
            resize(frame, frame, inpSize);
            Mat imInfo = (Mat_<float>(1, 3) << inpSize.height, inpSize.width, 1.6f);
            net.setInput(imInfo, "im_info");
        }
        std::vector<Mat> outs;
        net.forward(outs, outNames);

        postprocess(frame, outs, net);

        // Put efficiency information.
        std::vector<double> layersTimes;
        double freq = getTickFrequency() / 1000;
        double t = net.getPerfProfile(layersTimes) / freq;
        std::string label = format("Inference time: %.2f ms", t);
        putText(frame, label, Point(0, 15), FONT_HERSHEY_SIMPLEX, 0.5, Scalar(0, 255, 0));

        imshow(kWinName, frame);
    }
    return 0;
}

void postprocess(Mat& frame, const std::vector<Mat>& outs, Net& net)
{
    static std::vector<int> outLayers = net.getUnconnectedOutLayers();
    static std::string outLayerType = net.getLayer(outLayers[0])->type;

    std::vector<int> classIds;
    std::vector<float> confidences;
    std::vector<Rect> boxes;
    if (outLayerType == "DetectionOutput")
    {
        // Network produces output blob with a shape 1x1xNx7 where N is a number of
        // detections and every detection is a vector of values
        // [batchId, classId, confidence, left, top, right, bottom]
        CV_Assert(outs.size() > 0);
        for (size_t k = 0; k < outs.size(); k++)
        {
            float* data = (float*)outs[k].data;
            for (size_t i = 0; i < outs[k].total(); i += 7)
            {
                float confidence = data[i + 2];
                if (confidence > confThreshold)
                {
                    int left = (int)data[i + 3];
                    int top = (int)data[i + 4];
                    int right = (int)data[i + 5];
                    int bottom = (int)data[i + 6];
                    int width = right - left + 1;
                    int height = bottom - top + 1;
                    if (width * height <= 1)
                    {
                        left = (int)(data[i + 3] * frame.cols);
                        top = (int)(data[i + 4] * frame.rows);
                        right = (int)(data[i + 5] * frame.cols);
                        bottom = (int)(data[i + 6] * frame.rows);
                        width = right - left + 1;
                        height = bottom - top + 1;
                    }
                    classIds.push_back((int)(data[i + 1]) - 1);  // Skip 0th background class id.
                    boxes.push_back(Rect(left, top, width, height));
                    confidences.push_back(confidence);
                }
            }
        }
    }
    else if (outLayerType == "Region")
    {
        for (size_t i = 0; i < outs.size(); ++i)
        {
            // Network produces output blob with a shape NxC where N is a number of
            // detected objects and C is a number of classes + 5, where the first 5
            // numbers are [center_x, center_y, width, height, objectness]
            float* data = (float*)outs[i].data;
            for (int j = 0; j < outs[i].rows; ++j, data += outs[i].cols)
            {
                Mat scores = outs[i].row(j).colRange(5, outs[i].cols);
                Point classIdPoint;
                double confidence;
                minMaxLoc(scores, 0, &confidence, 0, &classIdPoint);
                if (confidence > confThreshold)
                {
                    int centerX = (int)(data[0] * frame.cols);
                    int centerY = (int)(data[1] * frame.rows);
                    int width = (int)(data[2] * frame.cols);
                    int height = (int)(data[3] * frame.rows);
                    int left = centerX - width / 2;
                    int top = centerY - height / 2;

                    classIds.push_back(classIdPoint.x);
                    confidences.push_back((float)confidence);
                    boxes.push_back(Rect(left, top, width, height));
                }
            }
        }
    }
    else
        CV_Error(Error::StsNotImplemented, "Unknown output layer type: " + outLayerType);

    std::vector<int> indices;
    NMSBoxes(boxes, confidences, confThreshold, nmsThreshold, indices);
    for (size_t i = 0; i < indices.size(); ++i)
    {
        int idx = indices[i];
        Rect box = boxes[idx];
        drawPred(classIds[idx], confidences[idx], box.x, box.y,
            box.x + box.width, box.y + box.height, frame);
    }
}

void drawPred(int classId, float conf, int left, int top, int right, int bottom, Mat& frame)
{
    rectangle(frame, Point(left, top), Point(right, bottom), Scalar(0, 255, 0));

    std::string label = format("%.2f", conf);
    if (!classes.empty())
    {
        CV_Assert(classId < (int)classes.size());
        label = classes[classId] + ": " + label;
    }

    int baseLine;
    Size labelSize = getTextSize(label, FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);

    top = max(top, labelSize.height);
    rectangle(frame, Point(left, top - labelSize.height),
        Point(left + labelSize.width, top + baseLine), Scalar::all(255), FILLED);
    putText(frame, label, Point(left, top), FONT_HERSHEY_SIMPLEX, 0.5, Scalar());
    namedWindow("frame",WINDOW_NORMAL);
    imshow("frame", frame);
    waitKey(0);
}
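To check whether the network produces any signal at all, here is a small diagnostic I can call right after net.forward, ignoring thresholds entirely. dumpBestScores is my own helper and assumes the Region output layout handled in postprocess above.

    // Diagnostic sketch: print the best class score per output blob,
    // with no thresholding, to see if the net detects anything at all.
    void dumpBestScores(const std::vector<Mat>& outs)
    {
        for (size_t i = 0; i < outs.size(); ++i)
        {
            double best = 0;
            for (int j = 0; j < outs[i].rows; ++j)
            {
                double maxVal;
                minMaxLoc(outs[i].row(j).colRange(5, outs[i].cols), 0, &maxVal);
                best = std::max(best, maxVal);
            }
            cout << "output " << i << ": best class score = " << best << endl;
        }
    }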

The Darknet result is good, as shown below: [image: Darknet detection result]

But the OpenCV DNN YOLO code cannot detect any object. Why?
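In case my confThreshold = 0.8 is simply too strict for this model (I believe Darknet's test thresholds default to lower values), here is a quick sweep I plan to try, reusing the outs from net.forward in the loop above:

    // Sketch: sweep confidence thresholds to see where detections first appear.
    // Reuses the globals and postprocess() above; purely a diagnostic.
    for (float t : {0.8f, 0.5f, 0.25f, 0.1f, 0.01f})
    {
        confThreshold = t;
        Mat vis = frame.clone();
        postprocess(vis, outs, net);
        cout << "confThreshold = " << t << endl;
    }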