Revision history [back]

LSVM translation invariant

Hi,

I am using LatentSVM in opencv 2.4 on models from opencv_extra as suggested here.

This is a voc sample Test image,

I noticed different scores and bounding boxes each 1-pixel shift, what are the causes of this variation although the object is fully present in these cases? Here is the code:

#include <opencv2/contrib/contrib.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/core/core_c.h>
#include <opencv2/core/mat.hpp>
#include <opencv2/core/operations.hpp>
#include <opencv2/core/types_c.h>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/imgproc/imgproc_c.h>
#include <opencv2/imgproc/types_c.h>
#include <opencv2/objdetect/objdetect.hpp>
#include <opencv2/video/background_segm.hpp>
#include <iostream>
#include <vector>
#include <sstream>

using namespace std;
using namespace cv;

/**
 * Runs the latent SVM detector on an image and keeps the confident hits.
 *
 * @param img              image handed to LatentSvmDetector::detect().
 * @param detector         detector to run.
 * @param overlapThreshold forwarded unchanged as the third argument of detect().
 * @param confThresh       detections whose score is below this value are dropped.
 * @return (rectangle, score) pairs for the retained detections, in the order
 *         the detector reported them.
 */
vector<pair<Rect, float> > detectWithLSVM(Mat img, LatentSvmDetector& detector, float overlapThreshold,
        float confThresh) {

    vector<LatentSvmDetector::ObjectDetection> detections;
    detector.detect(img, detections, overlapThreshold);

    vector<pair<Rect, float> > res;
    res.reserve(detections.size());

    // size_t index: vector::size() is unsigned, so an int counter mixes signedness.
    for (size_t i = 0; i < detections.size(); i++) {
        if (detections[i].score < confThresh)
            continue;

        res.push_back(make_pair(detections[i].rect, detections[i].score));
    }

    return res;
}

int main(int argc, char **argv) {

    LatentSvmDetector detector(vector<String>(1, "person.xml"));

    Mat image = imread("09.jpg",1);

    if (image.empty()) {
        cout << "Frame is empty! .. Quit" << endl;
        return 0;
    }

    int width = 300;
    string sstr;
    for (int i = 0; i < image.cols - width; i++) {
        Mat imaged = image.clone();
        Mat roi;
        Rect rectInput;
        rectInput = Rect(i, 0, width, image.rows);
        cout << rectInput << endl;
        roi = image(rectInput).clone();
        vector<pair<Rect, float> > r = detectWithLSVM(roi, detector, 0.1, 0.0);
        rectangle(imaged, rectInput, Scalar(255, 0, 0), 2);
        for (int ridx = 0; ridx < r.size(); ridx++) {
            r[ridx].first.x += rectInput.x;
            r[ridx].first.y += rectInput.y;
            rectangle(imaged, r[ridx].first, Scalar(0, 0, 255), 2);
            stringstream ss;
            ss<<r[ridx].second;
            ss >> sstr;
            putText(imaged, sstr, Point(r[ridx].first.x, r[ridx].first.y), 1, 2, Scalar(255, 255, 0), 2);
            imshow("LSVM",imaged);
            waitKey(1);
        }
    }
    return 0;
}

I want to ask why different translations/ROI image sizes produce different scores and predictions. Is there any way to know which displacements are best for achieving good detection (e.g. the object in the middle or at the border of the image, etc.)?
Are there any down-resolution factors other than the HOG block calculation, such as the stride in the convolution and score calculation step? Is it densely overlapped with a 1-pixel stride?
To speed up detection, I changed LAMBDA to 5 instead of 10. Other than misdetections from the different pyramid scale sizes, are there any miscalculations or dependent parameters in the LSVM code to be aware of? Or is changing LAMBDA safe, in that it only limits the pyramid scale sizes?

LSVM translation invariant

Hi,

I am using LatentSVM in opencv 2.4 on models from opencv_extra as suggested here.

This is a voc sample Test image,

I noticed different scores and bounding boxes each 1-pixel shift, what are the causes of this variation although the object is fully present in these cases? Here is the code:

#include <opencv2/contrib/contrib.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/core/core_c.h>
#include <opencv2/core/mat.hpp>
#include <opencv2/core/operations.hpp>
#include <opencv2/core/types_c.h>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/imgproc/imgproc_c.h>
#include <opencv2/imgproc/types_c.h>
#include <opencv2/objdetect/objdetect.hpp>
#include <opencv2/video/background_segm.hpp>
#include <iostream>
#include <vector>
#include <sstream>

using namespace std;
using namespace cv;

/**
 * Runs the latent SVM detector on an image and keeps the confident hits.
 *
 * @param img              image handed to LatentSvmDetector::detect().
 * @param detector         detector to run.
 * @param overlapThreshold forwarded unchanged as the third argument of detect().
 * @param confThresh       detections whose score is below this value are dropped.
 * @return (rectangle, score) pairs for the retained detections, in the order
 *         the detector reported them.
 */
vector<pair<Rect, float> > detectWithLSVM(Mat img, LatentSvmDetector& detector, float overlapThreshold,
        float confThresh) {

    vector<LatentSvmDetector::ObjectDetection> detections;
    detector.detect(img, detections, overlapThreshold);

    vector<pair<Rect, float> > res;
    res.reserve(detections.size());

    // size_t index: vector::size() is unsigned, so an int counter mixes signedness.
    for (size_t i = 0; i < detections.size(); i++) {
        if (detections[i].score < confThresh)
            continue;

        res.push_back(make_pair(detections[i].rect, detections[i].score));
    }

    return res;
}

int main(int argc, char **argv) {

    LatentSvmDetector detector(vector<String>(1, "person.xml"));

    Mat image = imread("09.jpg",1);

    if (image.empty()) {
        cout << "Frame is empty! .. Quit" << endl;
        return 0;
    }

    int width = 300;
    string sstr;
    for (int i = 0; i < image.cols - width; i++) {
        Mat imaged = image.clone();
        Mat roi;
        Rect rectInput;
        rectInput = Rect(i, 0, width, image.rows);
        cout << rectInput << endl;
        roi = image(rectInput).clone();
        vector<pair<Rect, float> > r = detectWithLSVM(roi, detector, 0.1, 0.0);
        rectangle(imaged, rectInput, Scalar(255, 0, 0), 2);
        for (int ridx = 0; ridx < r.size(); ridx++) {
            r[ridx].first.x += rectInput.x;
            r[ridx].first.y += rectInput.y;
            rectangle(imaged, r[ridx].first, Scalar(0, 0, 255), 2);
            stringstream ss;
            ss<<r[ridx].second;
            ss >> sstr;
            putText(imaged, sstr, Point(r[ridx].first.x, r[ridx].first.y), 1, 2, Scalar(255, 255, 0), 2);
            imshow("LSVM",imaged);
            waitKey(1);
        }
    }
    return 0;
}

I want to ask why different translations/ROI image sizes produce different scores and predictions. Is there any way to know which displacements are best for achieving good detection (e.g. the object in the middle or at the border of the image, etc.)?

Are there any down-resolution factors other than the HOG block calculation, such as the stride in the convolution and score calculation step? Is it densely overlapped with a 1-pixel stride?

To speed up detection, I changed LAMBDA to 5 instead of 10. Other than misdetections from the different pyramid scale sizes, are there any miscalculations or dependent parameters in the LSVM code to be aware of? Or is changing LAMBDA safe, in that it only limits the pyramid scale sizes?

LSVM translation invariant

Hi,

I am using LatentSVM in opencv 2.4 on models from opencv_extra as suggested here.

This is a voc sample Test image,

I noticed different scores and bounding boxes each 1-pixel shift, what are the causes of this variation although the object is fully present in these cases? Here is the ~~code:~~

#include <opencv2/contrib/contrib.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/core/core_c.h>
#include <opencv2/core/mat.hpp>
#include <opencv2/core/operations.hpp>
#include <opencv2/core/types_c.h>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/imgproc/imgproc_c.h>
#include <opencv2/imgproc/types_c.h>
#include <opencv2/objdetect/objdetect.hpp>
#include <opencv2/video/background_segm.hpp>
#include <iostream>
#include <vector>
#include <sstream>

using namespace std;
using namespace cv;

/**
 * Runs the latent SVM detector on an image and keeps the confident hits.
 *
 * @param img              image handed to LatentSvmDetector::detect().
 * @param detector         detector to run.
 * @param overlapThreshold forwarded unchanged as the third argument of detect().
 * @param confThresh       detections whose score is below this value are dropped.
 * @return (rectangle, score) pairs for the retained detections, in the order
 *         the detector reported them.
 */
vector<pair<Rect, float> > detectWithLSVM(Mat img, LatentSvmDetector& detector, float overlapThreshold,
        float confThresh) {

    vector<LatentSvmDetector::ObjectDetection> detections;
    detector.detect(img, detections, overlapThreshold);

    vector<pair<Rect, float> > res;
    res.reserve(detections.size());

    // size_t index: vector::size() is unsigned, so an int counter mixes signedness.
    for (size_t i = 0; i < detections.size(); i++) {
        if (detections[i].score < confThresh)
            continue;

        res.push_back(make_pair(detections[i].rect, detections[i].score));
    }

    return res;
}

int main(int argc, char **argv) {

    LatentSvmDetector detector(vector<String>(1, "person.xml"));

    Mat image = imread("09.jpg",1);

    if (image.empty()) {
        cout << "Frame is empty! .. Quit" << endl;
        return 0;
    }

    int width = 300;
    string sstr;
    for (int i = 0; i < image.cols - width; i++) {
        Mat imaged = image.clone();
        Mat roi;
        Rect rectInput;
        rectInput = Rect(i, 0, width, image.rows);
        cout << rectInput << endl;
        roi = image(rectInput).clone();
        vector<pair<Rect, float> > r = detectWithLSVM(roi, detector, 0.1, 0.0);
        rectangle(imaged, rectInput, Scalar(255, 0, 0), 2);
        for (int ridx = 0; ridx < r.size(); ridx++) {
            r[ridx].first.x += rectInput.x;
            r[ridx].first.y += rectInput.y;
            rectangle(imaged, r[ridx].first, Scalar(0, 0, 255), 2);
            stringstream ss;
            ss<<r[ridx].second;
            ss >> sstr;
            putText(imaged, sstr, Point(r[ridx].first.x, r[ridx].first.y), 1, 2, Scalar(255, 255, 0), 2);
            imshow("LSVM",imaged);
            waitKey(1);
        }
    }
    return 0;
}

code

I want to ask why different translations/ROI image sizes produce different scores and predictions. Is there any way to know which displacements are best for achieving good detection (e.g. the object in the middle or at the border of the image, etc.)?
Are there any down-resolution factors other than the HOG block calculation, such as the stride in the convolution and score calculation step? Is it densely overlapped with a 1-pixel stride?
To speed up detection, I changed LAMBDA to 5 instead of 10. Other than misdetections from the different pyramid scale sizes, are there any miscalculations or dependent parameters in the LSVM code to be aware of? Or is changing LAMBDA safe, in that it only limits the pyramid scale sizes?

LSVM translation invariant

Hi,

I am using LatentSVM in opencv 2.4 on models from opencv_extra as suggested here.

This is a voc sample Test image,

I noticed different scores and bounding boxes each 1-pixel shift, what are the causes of this variation although the object is fully present in these cases? Here is the code

I want to ask why different translations/ROI image sizes produce different scores and predictions. Is there any way to know which displacements are best for achieving good detection (e.g. the object in the middle or at the border of the image, etc.)?
Are there any down-resolution factors other than the HOG block calculation, such as the stride in the convolution and score calculation step? Is it densely overlapped with a 1-pixel stride?
To speed up detection, I changed LAMBDA to 5 instead of 10. Other than misdetections from the different pyramid scale sizes, are there any miscalculations or dependent parameters in the LSVM code to be aware of? Or is changing LAMBDA safe, in that it only limits the pyramid scale sizes?

Edit 1: Here are some important points in more detail:

The sliding window applied here is only to illustrate the variance of the output for different ROIs/inputs. LatentSVM in OpenCV already applies a sliding window over the input image, so I am not trying to implement a sliding window here; I am just evaluating output stability with respect to translation.
My main interest is detecting people in general, not pedestrians, as there is far more structure and deformation for a general person detector than for a pedestrian detector (for example, DPM achieves 88% on INRIA pedestrians but 50% on the VOC person class).
Finally, I am interested in a non-deep-learning method. The LSVM/DPM method is, as far as I know, the state-of-the-art classical method (is there any other high-accuracy classical method?).

LSVM translation invariant

Hi,

I am using LatentSVM in opencv 2.4 on models from opencv_extra as suggested here.

This is a voc sample Test image,

These are the results for two inputs that differ by a 1-pixel shift:

output_1 output_2

I noticed different scores and bounding boxes each 1-pixel shift, what are the causes of this variation although the object is fully present in these cases? Here is the code

I want to ask why different translations/ROI image sizes produce different scores and predictions. Is there any way to know which displacements are best for achieving good detection (e.g. the object in the middle or at the border of the image, etc.)?
Are there any down-resolution factors other than the HOG block calculation, such as the stride in the convolution and score calculation step? Is it densely overlapped with a 1-pixel stride?
To speed up detection, I changed LAMBDA to 5 instead of 10. Other than misdetections from the different pyramid scale sizes, are there any miscalculations or dependent parameters in the LSVM code to be aware of? Or is changing LAMBDA safe, in that it only limits the pyramid scale sizes?

Edit 1: Here are some important points in more detail:

The sliding window applied here is only to illustrate the variance of the output for different ROIs/inputs. LatentSVM in OpenCV already applies a sliding window over the input image, so I am not trying to implement a sliding window here; I am just evaluating output stability with respect to translation.
My main interest is detecting people in general, not pedestrians, as there is far more structure and deformation for a general person detector than for a pedestrian detector (for example, DPM achieves 88% on INRIA pedestrians but 50% on the VOC person class).
Finally, I am interested in a non-deep-learning method. The LSVM/DPM method is, as far as I know, the state-of-the-art classical method (is there any other high-accuracy classical method?).