12

I implemented a modified version of the Caffe C++ example and while it works really well, it's incredibly slow because it only accepts images one by one. Ideally I'd like to pass Caffe a vector of 200 images and return the best prediction for each one. I received some great help from Fanglin Wang and implemented some of his recommendations, but am still having some trouble working out how to retrieve the best result from each image.

The Classify method is now passed a vector of cv::Mat objects (variable input_channels) which is a vector of grayscale floating point images. I've eliminated the preprocessing method in the code because I don't need to convert these images to floating point or subtract the mean image. I've also been trying to get rid of the N variable because I only want to return the top prediction and probability for each image.

#include "Classifier.h"
using namespace caffe;
using std::string;

Classifier::Classifier(const string& model_file, const string& trained_file, const string& label_file) {
#ifdef CPU_ONLY
  Caffe::set_mode(Caffe::CPU);
#else
  Caffe::set_mode(Caffe::GPU);
#endif

  /* Load the network. */
  net_.reset(new Net<float>(model_file, TEST));
  net_->CopyTrainedLayersFrom(trained_file);

  Blob<float>* input_layer = net_->input_blobs()[0];
  num_channels_ = input_layer->channels();
  input_geometry_ = cv::Size(input_layer->width(), input_layer->height());

  /* Load labels. */
  std::ifstream labels(label_file.c_str());
  CHECK(labels) << "Unable to open labels file " << label_file;
  string line;
  while (std::getline(labels, line))
    labels_.push_back(string(line));

  Blob<float>* output_layer = net_->output_blobs()[0];
  CHECK_EQ(labels_.size(), output_layer->channels())
    << "Number of labels is different from the output layer dimension.";
}

static bool PairCompare(const std::pair<float, int>& lhs, const std::pair<float, int>& rhs) {
  return lhs.first > rhs.first;
}

/* Return the indices of the top N values of vector v. */
static std::vector<int> Argmax(const std::vector<float>& v, int N) {
  std::vector<std::pair<float, int> > pairs;
  for (size_t i = 0; i < v.size(); ++i)
    pairs.push_back(std::make_pair(v[i], i));
  std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare);

  std::vector<int> result;
  for (int i = 0; i < N; ++i)
    result.push_back(pairs[i].second);
  return result;
}

/* Return the top N predictions. */
std::vector<Prediction> Classifier::Classify(const std::vector<cv::Mat> &input_channels) {
  std::vector<float> output = Predict(input_channels);

    std::vector<int> maxN = Argmax(output, 1);
    int idx = maxN[0];
    predictions.push_back(std::make_pair(labels_[idx], output[idx]));
    return predictions;
}

std::vector<float> Classifier::Predict(const std::vector<cv::Mat> &input_channels, int num_images) {
  Blob<float>* input_layer = net_->input_blobs()[0];
  input_layer->Reshape(num_images, num_channels_,
                       input_geometry_.height, input_geometry_.width);
  /* Forward dimension change to all layers. */
  net_->Reshape();

  WrapInputLayer(&input_channels);

  net_->ForwardPrefilled();

  /* Copy the output layer to a std::vector */
  Blob<float>* output_layer = net_->output_blobs()[0];
  const float* begin = output_layer->cpu_data();
  const float* end = begin + num_images * output_layer->channels();
  return std::vector<float>(begin, end);
}

/* Wrap the input layer of the network in separate cv::Mat objects (one per channel). This way we save one memcpy operation and we don't need to rely on cudaMemcpy2D. The last preprocessing operation will write the separate channels directly to the input layer. */
void Classifier::WrapInputLayer(std::vector<cv::Mat>* input_channels) {
  Blob<float>* input_layer = net_->input_blobs()[0];

  int width = input_layer->width();
  int height = input_layer->height();
  float* input_data = input_layer->mutable_cpu_data();
  for (int i = 0; i < input_layer->channels() * num_images; ++i) {
    cv::Mat channel(height, width, CV_32FC1, input_data);
    input_channels->push_back(channel);
    input_data += width * height;
  }
}

UPDATE

Thank-you so much for your help Shai, I made the changes you recommended but seem to be getting some strange compilation issues I can't work out (I managed to sort out a few of the issues).

These are the changes I made:

Header File:

#ifndef __CLASSIFIER_H__
#define __CLASSIFIER_H__

#include <caffe/caffe.hpp>
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <algorithm>
#include <iosfwd>
#include <memory>
#include <string>
#include <utility>
#include <vector>


using namespace caffe;  // NOLINT(build/namespaces)
using std::string;

/* Pair (label, confidence) representing a prediction. */
typedef std::pair<string, float> Prediction;

class Classifier {
 public:
  Classifier(const string& model_file,
             const string& trained_file,
             const string& label_file);

  std::vector< std::pair<int,float> > Classify(const std::vector<cv::Mat>& img);

 private:

  std::vector< std::vector<float> > Predict(const std::vector<cv::Mat>& img, int nImages);

  void WrapInputLayer(std::vector<cv::Mat>* input_channels, int nImages);

  void Preprocess(const std::vector<cv::Mat>& img,
                  std::vector<cv::Mat>* input_channels, int nImages);

 private:
  shared_ptr<Net<float> > net_;
  cv::Size input_geometry_;
  int num_channels_;
  std::vector<string> labels_;
};

#endif /* __CLASSIFIER_H__ */

Class File:

#define CPU_ONLY
#include "Classifier.h"

using namespace caffe;  // NOLINT(build/namespaces)
using std::string;

Classifier::Classifier(const string& model_file,
                       const string& trained_file,
                       const string& label_file) {
#ifdef CPU_ONLY
  Caffe::set_mode(Caffe::CPU);
#else
  Caffe::set_mode(Caffe::GPU);
#endif

  /* Load the network. */
  net_.reset(new Net<float>(model_file, TEST));
  net_->CopyTrainedLayersFrom(trained_file);

  CHECK_EQ(net_->num_inputs(), 1) << "Network should have exactly one input.";
  CHECK_EQ(net_->num_outputs(), 1) << "Network should have exactly one output.";

  Blob<float>* input_layer = net_->input_blobs()[0];
  num_channels_ = input_layer->channels();
  CHECK(num_channels_ == 3 || num_channels_ == 1)
    << "Input layer should have 1 or 3 channels.";
  input_geometry_ = cv::Size(input_layer->width(), input_layer->height());

  /* Load labels. */
  std::ifstream labels(label_file.c_str());
  CHECK(labels) << "Unable to open labels file " << label_file;
  string line;
  while (std::getline(labels, line))
    labels_.push_back(string(line));

  Blob<float>* output_layer = net_->output_blobs()[0];
  CHECK_EQ(labels_.size(), output_layer->channels())
    << "Number of labels is different from the output layer dimension.";
}

static bool PairCompare(const std::pair<float, int>& lhs,
                        const std::pair<float, int>& rhs) {
  return lhs.first > rhs.first;
}

/* Return the indices of the top N values of vector v. */
static std::vector<int> Argmax(const std::vector<float>& v, int N) {
  std::vector<std::pair<float, int> > pairs;
  for (size_t i = 0; i < v.size(); ++i)
    pairs.push_back(std::make_pair(v[i], i));
  std::partial_sort(pairs.begin(), pairs.begin() + N, pairs.end(), PairCompare);

  std::vector<int> result;
  for (int i = 0; i < N; ++i)
    result.push_back(pairs[i].second);
  return result;
}

std::vector< std::pair<int,float> > Classifier::Classify(const std::vector<cv::Mat>& img) {
  std::vector< std::vector<float> > output = Predict(img, img.size());

  std::vector< std::pair<int,float> > predictions;
  for ( int i = 0 ; i < output.size(); i++ ) {
    std::vector<int> maxN = Argmax(output[i], 1);
    int idx = maxN[0];
    predictions.push_back(std::make_pair(labels_[idx], output[idx]));
  }
  return predictions;
}

std::vector< std::vector<float> > Classifier::Predict(const std::vector<cv::Mat>& img, int nImages) {
  Blob<float>* input_layer = net_->input_blobs()[0];
  input_layer->Reshape(nImages, num_channels_,
                       input_geometry_.height, input_geometry_.width);
  /* Forward dimension change to all layers. */
  net_->Reshape();

  std::vector<cv::Mat> input_channels;
  WrapInputLayer(&input_channels, nImages);

  Preprocess(img, &input_channels, nImages);

  net_->ForwardPrefilled();

  /* Copy the output layer to a std::vector */

  Blob<float>* output_layer = net_->output_blobs()[0];
  std::vector <std::vector<float> > ret;
  for (int i = 0; i < nImages; i++) {
    const float* begin = output_layer->cpu_data() + i*output_layer->channels();
    const float* end = begin + output_layer->channels();
    ret.push_back( std::vector<float>(begin, end) );
  }
  return ret;
}

/* Wrap the input layer of the network in separate cv::Mat objects
 * (one per channel). This way we save one memcpy operation and we
 * don't need to rely on cudaMemcpy2D. The last preprocessing
 * operation will write the separate channels directly to the input
 * layer. */
void Classifier::WrapInputLayer(std::vector<cv::Mat>* input_channels, int nImages) {
  Blob<float>* input_layer = net_->input_blobs()[0];

  int width = input_layer->width();
  int height = input_layer->height();
  float* input_data = input_layer->mutable_cpu_data();
  for (int i = 0; i < input_layer->channels()* nImages; ++i) {
    cv::Mat channel(height, width, CV_32FC1, input_data);
    input_channels->push_back(channel);
    input_data += width * height;
  }
}

void Classifier::Preprocess(const std::vector<cv::Mat>& img,
                            std::vector<cv::Mat>* input_channels, int nImages) {
  for (int i = 0; i < nImages; i++) {
      vector<cv::Mat> channels;
      cv::split(img[i], channels);
      for (int j = 0; j < channels.size(); j++){
           channels[j].copyTo((*input_channels)[i*num_channels_[0]+j]);
      }
  }
}
Shai
  • 111,146
  • 38
  • 238
  • 371
Jack Simpson
  • 1,681
  • 3
  • 30
  • 54
  • Could you briefly describe your modifications? Thanks. – ypx Sep 20 '15 at 14:43
  • The answer (with comments) below is correct. However, in your preprocessing step you need to (i) convert the image format to the network input format; (ii) resize the given image to input_geometry_ if they differ; and (iii) subtract the image mean, which you need to load from file imagenet_mean.binaryproto. Then you can split the image into separate channel-based image planes. – Josh Nov 29 '15 at 02:57

2 Answers2

10

If I understand your problem correctly, you input n images, expecting n pairs of (label, prob), but getting only one such pair.

I believe these modifications should do the trick for you:

  1. Classifier::Predict should return a vector< vector<float> >, that is a vector of probabilities per input image. That is a vector of size n of vectors of size output_layer->channels():

    std::vector< std::vecot<float> > 
    Classifier::Predict(const std::vector<cv::Mat> &input_channels, 
                        int num_images) {
      // same code here...
    
      /* changes here: Copy the output layer to a std::vector */
      Blob<float>* output_layer = net_->output_blobs()[0];
      std::vector< std::vector<float> > ret;
      for ( int i = 0 ; i < num_images ; i++ ) {
          const float* begin = output_layer->cpu_data() + i*output_layer->channels();
          const float* end = begin + output_layer->channels();
          ret.push_back( std::vector<float>(begin, end) );
      }
      return ret;
    }
    
  2. In Classifier::Classify you need to process each vector<float> through Argmax independantly:

     std::vector< std::pair<int,float> > 
     Classifier::Classify(const std::vector<cv::Mat> &input_channels) {
    
       std::vector< std::vector<float> > output = Predict(input_channels);
    
       std::vector< std::pair<int,float> > predictions;
       for ( int i = 0 ; i < output.size(); i++ ) {
           std::vector<int> maxN = Argmax(output[i], 1);
           int idx = maxN[0];
           predictions.push_back(std::make_pair(labels_[idx], output[idx]));
       }
       return predictions;
     }
    
Shai
  • 111,146
  • 38
  • 238
  • 371
  • Hi Shai, thank-you so much for your help! I followed your advice but seem to be having some problems getting it to compile. I updated the question with the modifications I made, I'm really sorry to use up more of your time but do you think you might be able to take a look? – Jack Simpson Sep 25 '15 at 11:29
  • Hi Shai, I did a bit of a rewrite and now seem to get this error when I compile which I'm having trouble working out how to fix: `candidate function not viable: no known conversion from 'pair &>::type, typename __make_pair_return > &>::type>' to 'const pair' for 1st argument _LIBCPP_INLINE_VISIBILITY void push_back(const_reference __x);` – Jack Simpson Sep 29 '15 at 12:14
  • @JackSimpson it seems like the return value I had in mind was the integer index of the label and its prob, while you want to have a string and prob pair. – Shai Sep 29 '15 at 12:19
  • So in `predictions.push_back(std::make_pair(labels_[idx], output[idx]));` the `labels_[idx]` is actually a string containing the class number from the labels.txt file and I should modify `std::vector< std::pair > Classifier::Classify(const std::vector& img)` to `std::vector< std::pair > Classifier::Classify(const std::vector& img)` to fix it? – Jack Simpson Sep 29 '15 at 12:25
  • @JackSimpson O believe so – Shai Sep 29 '15 at 12:37
  • Thanks so much for the advice Shai, that solved the problem, unfortunately now the line `predictions.push_back(std::make_pair(labels_[idx], output[idx]));` gives me the compilation error message: `error: no matching member function for call to 'push_back'`. I've been trying to make changes but I can't work out how to fix this, I'm so sorry to take up more of your time. – Jack Simpson Sep 29 '15 at 12:56
  • @JackSimpson you might need to change the definition of the output vector `prediction` – Shai Sep 29 '15 at 15:03
  • I changed the predictions vector to this `std::vector predictions;` using the `typedef std::pair Prediction;` type although unfortunately I seem to be still getting the same error. – Jack Simpson Sep 30 '15 at 01:08
  • 2
    All good, I asked someone and they asked me if I meant to be returning a single float or an array of floats so I changed it to `output[i][idx]` which fixed things :) – Jack Simpson Sep 30 '15 at 01:39
  • The last problem I have is the line `channels[j].copyTo((*input_channels)[i*num_channels_[0]+j]);` where the compiler says `error: subscripted value is not an array, pointer, or vector` `void Classifier::Preprocess(const std::vector& img, std::vector* input_channels, int nImages) { for (int i = 0; i < nImages; i++) { vector channels; cv::split(img[i], channels); for (int j = 0; j < channels.size(); j++){ channels[j].copyTo((*input_channels)[i*num_channels_[0]+j]); } } }` – Jack Simpson Sep 30 '15 at 01:41
  • 1
    @JackSimpson please don't put code in comments. it is unclear what your problem is. if the problem is remote enough from this thread, please ask new question. – Shai Sep 30 '15 at 09:25
  • 1
    Your `num_channels_` is defined in the header file as `int` not as an array/pointer/vector, therefore writing `num_channels_[0]` is a syntax error. Try replacing it by `num_channels_` only... – Shai Oct 01 '15 at 06:58
  • Thanks Shai, that seems to have solved all the issues :) – Jack Simpson Oct 01 '15 at 14:35
  • @JackSimpson glad I could help. you got quite a mileage out of your bounty ;) – Shai Oct 01 '15 at 14:36
4

Unfortunately, I don't believe a parallelization of network Forward passes has been implemented. However, if you'd like you could simply implement your own wrapper to repeatedly run data through copies of your network, in parallel?

Have a look at How many images can you pass to Caffe at a time?

In the linked prototxt all you have to define is

input_shape {
  dim: 64 // num of images
  dim: 1
  dim: 28 // height
  dim: 28 // width
}

The existing implementation evaluates a batch of 64 images but not necessarily in parallel. However, if running on a GPU, processing a batch of 64 will be faster than 64 single-image batches.

Community
  • 1
  • 1
Aidan Gomez
  • 8,167
  • 5
  • 28
  • 51
  • Thanks for the help Aiden, so it isn't possible for me to pass the blob equivalent to a vector and receive a vector of predictions from the network back in one chunk? – Jack Simpson Sep 20 '15 at 08:25
  • @JackSimpson Specifying the number of images as the first blob dim is treated the same as a vector of single image blobs. – ypx Sep 20 '15 at 14:36
  • @JackSimpson: ypx is correct, 64 blobs in a vector and one blob with a num dimension of 64 are equivalent. – Aidan Gomez Sep 21 '15 at 05:57