imho, DNNs rule for this kind of task, nowadays.

we could try to use transfer learning,

that is: use an existing, pretrained model, and try to teach it some new tricks!

we can just "pipe" our images through the network, stop it at some layer (before it would do the final classification), grab the output neurons from there, and feed our own ml classifier with this data (instead of using the "raw" images) , like this:

(colour) image   --> DNN --> 1000 numbers  --> our own classifier (ANN_MLP for today)

since opencv's dnn module already supports various classification models, let's try squeezenet (which is also small, and quite fast!)

it was trained on millions of images (imagenet), among them cats & dogs. so it has "seen the world" already. ;)

there are 67 layers (!); here's what the last ones look like (i=input, o=output):

fire9/squeeze1x1                       Convolution   i[1, 512, 14, 14]  o[1, 64, 14, 14]  
fire9/relu_squeeze1x1                  ReLU          i[1, 64, 14, 14]  o[1, 64, 14, 14]  
fire9/expand1x1                        Convolution   i[1, 64, 14, 14]  o[1, 256, 14, 14]   
fire9/relu_expand1x1                   ReLU          i[1, 256, 14, 14]  o[1, 256, 14, 14]  
fire9/expand3x3                        Convolution   i[1, 64, 14, 14]  o[1, 256, 14, 14]   
fire9/relu_expand3x3                   ReLU          i[1, 256, 14, 14]  o[1, 256, 14, 14]  
fire9/concat                           Concat        i[1, 256, 14, 14]  i[1, 256, 14, 14]  o[1, 512, 14, 14]  
drop9                                  Dropout       i[1, 512, 14, 14]  o[1, 512, 14, 14]  
conv10                                 Convolution   i[1, 512, 14, 14]  o[1, 1000, 14, 14]   
relu_conv10                            ReLU          i[1, 1000, 14, 14]  o[1, 1000, 14, 14]  
pool10                                 Pooling       i[1, 1000, 14, 14]  o[1, 1000, 1, 1]  
prob                                   Softmax       i[1, 1000, 1, 1]  o[1, 1000, 1, 1]
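(fyi, you can generate a listing like the one above yourself. a rough sketch, using dnn calls that exist in 3.x -- getLayerNames() gives the names; the in/out shapes would need net.getLayersShapes() on top. it assumes modelTxt / modelBin point at the prototxt / caffemodel, see below:)

    // sketch: dump the layer names of a loaded net
    dnn::Net net = dnn::readNetFromCaffe(modelTxt, modelBin);
    std::vector<String> names = net.getLayerNames();
    for (size_t i = 0; i < names.size(); i++)
        cout << i << " " << names[i] << endl;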

so, pool10 looks like a good place to tap it!
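"tapping" a layer is just a partial forward pass in opencv: set the input, then ask forward() for that layer's output instead of the last one. a minimal sketch (assuming a loaded net and a bgr image):

    // stop the network at pool10, before the final imagenet softmax ("prob")
    net.setInput(dnn::blobFromImage(img, 1, Size(227,227), Scalar::all(127), false));
    Mat feature = net.forward("pool10").reshape(1,1); // a 1x1000 float row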

(1000 features is a good number, if we have ~1000 images in our dataset)

you'll need to download the caffemodel and the prototxt, then we can start playing with your cats vs dogs dataset:

#include "opencv2/opencv.hpp"
#include "opencv2/dnn.hpp"

using namespace cv;
using namespace std;

int main(int argc, char** argv)
{
    vector<String> fn;
    glob("c:/data/cat-dog/*.jpg", fn, true);
    // glob() will conveniently sort names lexically, so the cats come first!
    // so we have 700 cats, 699 dogs, and split it into:
    // 100 test cats
    // 600 train cats
    // 100 test dogs
    // 599 train dogs

    std::string modelTxt = "c:/data/mdl/squeezenet/deploy.prototxt";
    std::string modelBin = "c:/data/mdl/squeezenet/squeezenet_v1.1.caffemodel";
    dnn::Net net = dnn::readNetFromCaffe(modelTxt, modelBin);
    cv::Size inputImgSize = cv::Size(227, 227); // model was trained with this size

    Mat_<int> layers(4, 1);
    layers << 1000, 400, 100, 2; // the squeezenet pool10 layer has 1000 neurons

    Ptr<ml::ANN_MLP> nn = ml::ANN_MLP::create();
    nn->setLayerSizes(layers);
    nn->setTrainMethod(ml::ANN_MLP::BACKPROP, 0.0001); // plain backprop, small learning rate
    nn->setActivationFunction(ml::ANN_MLP::SIGMOID_SYM); // symmetric sigmoid activation
    nn->setTermCriteria(TermCriteria(TermCriteria::MAX_ITER+TermCriteria::EPS, 300, 0.0001)); // max 300 iterations, or stop once improvement drops below eps

    Mat train, test;
    Mat labels(1199, 2, CV_32F, 0.f); // 1199 train rows (1399 images - 200 test images)
    for (size_t i=0; i<fn.size(); i++) {
        // use the dnn as a "fixed function" preprocessor (no training here)
        Mat img = imread(fn[i]);
        net.setInput(dnn::blobFromImage(img, 1, inputImgSize, Scalar::all(127), false)); // scale 1, subtract mean 127, keep bgr channel order
        Mat blob = net.forward("pool10");
        Mat f = blob.reshape(1,1).clone(); // now our feature has 1000 numbers

        // sort into train/test slots:
        if (i<100) {
            // test cat
            test.push_back(f);
        } else
        if (i>=100 && i<700) {
            // train cat
            train.push_back(f);
            labels.at<float>(i-100,0) = 1; // one-hot encoded labels for our ann
        } else
        if (i>=700 && i<800) {
            // test dog
            test.push_back(f);
        } else {
            // train dog
            train.push_back(f);
            labels.at<float>(i-200,1) = 1;
        }
        cout << i << "\r"; // "machine learning should come with a statusbar." ;)
    }

    cout << train.size() << " " << labels.size() << " " << test.size() << endl;
    nn->train(train, 0, labels); // yes, that'll take a few minutes ..
    nn->save("cats.dogs.ann.yml.gz");

    Mat result;
    // our result array is one-hot encoded, too, which means:
    // if pin 1 is larger than pin 0, it predicted a dog, else a cat.
    nn->predict(test,result);
    // cout << result << endl;

    float correct_cat = 0;
    float correct_dog = 0;
    for (int i=0; i<100; i++) // 1st hundred test examples were cats
        correct_cat += result.at<float>(i,0) > result.at<float>(i,1); // 0, true cat
    for (int i=100; i<200; i++) // 2nd hundred were dogs
        correct_dog += result.at<float>(i,1) > result.at<float>(i,0); // 1, true dog
    float accuracy = (correct_cat + correct_dog) / 200;
    cout << correct_cat << " " << correct_dog << " : " << accuracy << endl;

    return 0;
}

// 97 96 : 0.965

not bad, eh? @sjhalayka ;)
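(btw, once it's saved, you don't need to redo the whole feature extraction / training round. a sketch of how you could reload the ann and classify a single, new image later -- the filename here is hypothetical, and it assumes the same net and blob settings as above:)

    // reload the trained mlp (ml::ANN_MLP::load() exists in recent 3.x)
    Ptr<ml::ANN_MLP> nn2 = ml::ANN_MLP::load("cats.dogs.ann.yml.gz");
    Mat img = imread("c:/data/cat-dog/unknown.jpg"); // hypothetical image
    net.setInput(dnn::blobFromImage(img, 1, inputImgSize, Scalar::all(127), false));
    Mat f = net.forward("pool10").reshape(1,1); // same 1000-dim feature as in training
    Mat res;
    nn2->predict(f, res);
    cout << (res.at<float>(0,1) > res.at<float>(0,0) ? "dog" : "cat") << endl;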