Ask Your Question

Revision history [back]

click to hide/show revision 1
initial version

Setting up MLP for image recognition

Hi! I am new in OpenCV world and neural networks but I have some coding experience in C++/Java.


I created my first ANN MLP and learned it the XOR:

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/ml/ml.hpp>

#include <iostream>
#include <iomanip>

using namespace cv;
using namespace ml;
using namespace std;

void print(Mat& mat, int prec)
{
    for (int i = 0; i<mat.size().height; i++)
    {
        cout << "[";
        for (int j = 0; j<mat.size().width; j++)
        {
            cout << fixed << setw(2) << setprecision(prec) << mat.at<float>(i, j);
            if (j != mat.size().width - 1)
                cout << ", ";
            else
                cout << "]" << endl;
        }
    }
}

int main()
{
    const int hiddenLayerSize = 4;
    float inputTrainingDataArray[4][2] = {
        { 0.0, 0.0 },
        { 0.0, 1.0 },
        { 1.0, 0.0 },
        { 1.0, 1.0 }
    };
    Mat inputTrainingData = Mat(4, 2, CV_32F, inputTrainingDataArray);

    float outputTrainingDataArray[4][1] = {
        { 0.0 },
        { 1.0 },
        { 1.0 },
        { 0.0 }
    };
    Mat outputTrainingData = Mat(4, 1, CV_32F, outputTrainingDataArray);

    Ptr<ANN_MLP> mlp = ANN_MLP::create();

    Mat layersSize = Mat(3, 1, CV_16U);
    layersSize.row(0) = Scalar(inputTrainingData.cols);
    layersSize.row(1) = Scalar(hiddenLayerSize);
    layersSize.row(2) = Scalar(outputTrainingData.cols);
    mlp->setLayerSizes(layersSize);

    mlp->setActivationFunction(ANN_MLP::ActivationFunctions::SIGMOID_SYM);

    TermCriteria termCrit = TermCriteria(
        TermCriteria::Type::COUNT + TermCriteria::Type::EPS,
        100000000,
        0.000000000000000001
    );
    mlp->setTermCriteria(termCrit);

    mlp->setTrainMethod(ANN_MLP::TrainingMethods::BACKPROP);

    Ptr<TrainData> trainingData = TrainData::create(
        inputTrainingData,
        SampleTypes::ROW_SAMPLE,
        outputTrainingData
    );

    mlp->train(trainingData
        /*, ANN_MLP::TrainFlags::UPDATE_WEIGHTS
        + ANN_MLP::TrainFlags::NO_INPUT_SCALE
        + ANN_MLP::TrainFlags::NO_OUTPUT_SCALE*/
    );

    for (int i = 0; i < inputTrainingData.rows; i++) {
        Mat sample = Mat(1, inputTrainingData.cols, CV_32F, inputTrainingDataArray[i]);
        Mat result;
        mlp->predict(sample, result);
        cout << sample << " -> ";// << result << endl;
        print(result, 0);
        cout << endl;
    }

    return 0;
}

It works very well for this simple problem, I also learn this network the 1-10 to binary conversion.


But i need to use MLP for simple image classification - road signs. I write the code for loading training images and preparing matrix for learning but I'm not able to train the network - it "learn" in one second even with 1 000 000 iterations! And it produce garbage results, the same for all inputs!

#include <opencv2/core.hpp>
#include <opencv2/imgcodecs.hpp>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/ml/ml.hpp>

#include <iostream>
#include <chrono>
#include <memory>
#include <iomanip>
#include <climits>

#include <Windows.h>

using namespace cv;
using namespace ml;
using namespace std;
using namespace chrono;

const int WIDTH_SIZE = 50;
const int HEIGHT_SIZE = (int)(WIDTH_SIZE * sqrt(3)) / 2;
const int IMAGE_DATA_SIZE = WIDTH_SIZE * HEIGHT_SIZE;

void print(Mat& mat, int prec)
{
    for (int i = 0; i<mat.size().height; i++)
    {
        cout << "[ ";
        for (int j = 0; j<mat.size().width; j++)
        {
            cout << fixed << setw(2) << setprecision(prec) << mat.at<float>(i, j);
            if (j != mat.size().width - 1)
                cout << ", ";
            else
                cout << " ]" << endl;
        }
    }
}

bool loadImage(string imagePath, Mat& outputImage)
{
    // load image in grayscale
    Mat image = imread(imagePath, IMREAD_GRAYSCALE);
    Mat temp;

    // check for invalid input
    if (image.empty()) {
        cout << "Could not open or find the image" << std::endl;
        return false;
    }

    // resize the image
    Size size(WIDTH_SIZE, HEIGHT_SIZE);
    resize(image, temp, size, 0, 0, CV_INTER_AREA);

    // convert to float 1-channel
    temp.convertTo(outputImage, CV_32FC1, 1.0/255.0);

    return true;
}

vector<string> getFilesNamesInFolder(string folder)
{
    vector<string> names;
    char search_path[200];
    sprintf(search_path, "%s/*.*", folder.c_str());
    WIN32_FIND_DATA fd;
    HANDLE hFind = ::FindFirstFile(search_path, &fd);
    if (hFind != INVALID_HANDLE_VALUE) {
        do {
            // read all (real) files in current folder
            // , delete '!' read other 2 default folder . and ..
            if (!(fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) {
                names.push_back(fd.cFileName);
            }
        } while (::FindNextFile(hFind, &fd));
        ::FindClose(hFind);
    }
    return names;
}

class Sign {
public:
    enum class Category { A = 'A', B = 'B', C = 'C', D = 'D' };

    Mat image;
    Category category;
    int number;

    Sign(Mat& image, string name) :image(image) {
        category = static_cast<Category>(name.at(0));
        number = stoi(name.substr(2, name.length()));
    };
};

vector<Sign> loadSignsFromFolder(String folderName) {
    vector<Sign> roadSigns;

    for (string fileName : getFilesNamesInFolder(folderName)) {
        Mat image;
        loadImage(folderName + fileName, image);
        roadSigns.emplace_back(image, fileName.substr(0, (fileName.length() - 4))); //cut .png
    }

    return roadSigns;
}

void showSignsInWindows(vector<Sign> roadSigns) {
    for (Sign sign : roadSigns) {
        String windowName = "Sign " + to_string(sign.number);
        namedWindow(windowName, WINDOW_AUTOSIZE);
        imshow(windowName, sign.image);
    }
    waitKey(0);
}

Mat getInputDataFromSignsVector(vector<Sign> roadSigns) {
    Mat roadSignsImageData;

    for (Sign sign : roadSigns) {
        Mat signImageDataInOneRow = sign.image.reshape(0, 1);
        roadSignsImageData.push_back(signImageDataInOneRow);
    }

    return roadSignsImageData;
}

Mat getOutputDataFromSignsVector(vector<Sign> roadSigns) {
    int signsCount = (int) roadSigns.size();
    int signsVectorSize = signsCount + 1;

    Mat roadSignsData(0, signsVectorSize, CV_32FC1);

    int i = 1;
    for (Sign sign : roadSigns) {
        vector<float> outputTraningVector(signsVectorSize);
        fill(outputTraningVector.begin(), outputTraningVector.end(), -1.0);
        outputTraningVector[i++] = 1.0;

        Mat tempMatrix(outputTraningVector, false);
        roadSignsData.push_back(tempMatrix.reshape(0, 1));
    }

    return roadSignsData;
}

int main(int argc, char* argv[])
{
    if (argc != 2) {
        cout << " Usage: display_image ImageToLoadAndDisplay" << endl;
        return -1;
    }

    const int hiddenLayerSize = 500;

    vector<Sign> roadSigns = loadSignsFromFolder("../../../Znaki/A/");
    Mat inputTrainingData = getInputDataFromSignsVector(roadSigns);
    Mat outputTrainingData = getOutputDataFromSignsVector(roadSigns);

    Ptr<ANN_MLP> mlp = ANN_MLP::create();

    Mat layersSize = Mat(3, 1, CV_16U);
    layersSize.row(0) = Scalar(inputTrainingData.cols);
    layersSize.row(1) = Scalar(hiddenLayerSize);
    layersSize.row(2) = Scalar(outputTrainingData.cols);
    mlp->setLayerSizes(layersSize);

    mlp->setActivationFunction(ANN_MLP::ActivationFunctions::SIGMOID_SYM, 1.0, 1.0);

    mlp->setTrainMethod(ANN_MLP::TrainingMethods::BACKPROP, 0.05, 0.05);
    //mlp->setTrainMethod(ANN_MLP::TrainingMethods::RPROP);

    TermCriteria termCrit = TermCriteria(
        TermCriteria::Type::MAX_ITER //| TermCriteria::Type::EPS,
        ,100 //(int) INT_MAX
        ,0.000001
    );
    mlp->setTermCriteria(termCrit);

    Ptr<TrainData> trainingData = TrainData::create(
        inputTrainingData,
        SampleTypes::ROW_SAMPLE,
        outputTrainingData
    );

    auto start = system_clock::now();
    mlp->train(trainingData
        //, //ANN_MLP::TrainFlags::UPDATE_WEIGHTS
        , ANN_MLP::TrainFlags::NO_INPUT_SCALE
        + ANN_MLP::TrainFlags::NO_OUTPUT_SCALE
    );
    auto duration = duration_cast<milliseconds> (system_clock::now() - start);
    cout << "Training time: " << duration.count() << "ms" << endl;

    for (int i = 0; i < inputTrainingData.rows; i++) {
        Mat result;
        //mlp->predict(inputTrainingData.row(i), result);
        mlp->predict(roadSigns[i].image.reshape(0, 1), result);
        //cout << result << endl;
        print(result, 2);
    }


    //showSignsInWindows(roadSigns);
    return 0;
}

What is wrong in this code, that XOR works but images not? I cheked the input and output matrix and they're correct... could somebody also explain me when to/shoud I use the ANN_MLP::TrainFlags::NO_INPUT_SCALE and ANN_MLP::TrainFlags::NO_OUTPUT_SCALE or what values of setActivationFunction parameters and setTrainMethod should I use?


Thanks!