So, the answer is probably the same as always: prefer OpenCV's internal parallelization over trying to multithread the inference yourself. Here is dkurt's idea of processing the images in batches. I also don't think you need a second thread for loading the images, but let's do it anyway:

// modified caffe_googlenet.cpp

#include <opencv2/dnn.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/highgui.hpp>
using namespace cv;
using namespace cv::dnn;

#include <fstream>
#include <iostream>
#include <cstdlib>
#include <thread>
using namespace std;

static std::vector<String> readClassNames(const char *filename = "synset_words.txt")
    std::vector<String> classNames;

    std::ifstream fp(filename);
    if (!fp.is_open())
        std::cerr << "File with classes labels not found: " << filename << std::endl;

    std::string name;
    while (!fp.eof())
        std::getline(fp, name);
        if (name.length())
            classNames.push_back( name.substr(name.find(' ')+1) );

    return classNames;

void makeBatch(const vector<String> &names, vector<Mat> &batch, size_t from, size_t to)
    for (size_t i=from; i<min(to,names.size()); i++)
        Mat img = imread(names[i]);
        if (img.empty())
        // order of operations matters !
        // we can't simply resize() our image, 
        // since we have to emulate the default crop=true option
        Size size(224,224);
        float resizeFactor = std::max(size.width  / (float)img.cols,
                                      size.height / (float)img.rows);
        resize(img, img, Size(), resizeFactor, resizeFactor);
        Rect crop(Point(0.5 * (img.cols - size.width),
                        0.5 * (img.rows - size.height)), size);
        img = img(crop);
        img.convertTo(img, CV_32F);
        img -= Scalar(104, 117, 123); // subtract mean

int main(int argc, char **argv)
    String modelTxt = "bvlc_googlenet.prototxt";
    String modelBin = "bvlc_googlenet.caffemodel";
    String imageDir = (argc > 1) ? argv[1] : "C:\\data\\img\\cache\\1";

    vector<String> images;
    glob(imageDir, images);
    cout << images.size() << " images on " << imageDir << endl;

    vector<String> classNames = readClassNames();

    Net net = dnn::readNetFromCaffe(modelTxt, modelBin);

    int batchsize = 8;
    int from = 0;
    int to = batchsize;
    cv::TickMeter t;

    // we have to run the 1st batch "manually"
    vector<Mat> batch;
    thread runner(makeBatch, std::ref(images), std::ref(batch), from, to);

    while (to < images.size())
        // wait for our images

        // we've done the preprocessing already.
        Mat inputBlob = blobFromImages(batch, 1.0, Size(), Scalar(), false);

        // start next round
        from += batchsize;
        to += batchsize;
        if (to<images.size())
            runner = thread(makeBatch, std::ref(images), std::ref(batch), from, to);

        net.setInput(inputBlob, "data");
        Mat prob = net.forward("prob");

        // each prediction is a row in the prob Mat
        for (size_t i=0; i<batch.size(); i++)
            Point classNumber; double classProb;

            minMaxLoc(prob.row(i), NULL, &classProb, NULL, &classNumber);
            int classId = classNumber.x;

            std::cout << "'" << << "'";
            std::cout << " (" << classProb * 100 << "%)" << std::endl;
    std::cout << "Time: " << (double)t.getTimeMilli() / (batchsize * t.getCounter()) << " ms (average from " << t.getCounter() << " * " << batchsize << " iterations)" << std::endl;
    return 0;