
I'm trying to write an inference program for a YOLO model in C++. I've looked into darknet, but it needs a .cfg file to import the model structure (which is a bit too complicated for me...), so I want to write the program with TensorFlow instead.

(My model weights are converted from .hdf5 (used in Python) to .pb (used in C++).)

I've found some examples written in Python, and it seems they do some extra work before the inference step... Source

def yolo_eval(yolo_outputs,
              anchors,
              num_classes,
              image_shape,
              max_boxes=50,
              score_threshold=.6,
              iou_threshold=.5):

    """Evaluate YOLO model on given input and return filtered boxes."""
    num_layers = len(yolo_outputs)
    anchor_mask = [[6,7,8], [3,4,5], [0,1,2]] if num_layers==3 else [[3,4,5], [1,2,3]] # default setting
    input_shape = K.shape(yolo_outputs[0])[1:3] * 32
    boxes = []
    box_scores = []
    for l in range(num_layers):
        _boxes, _box_scores = yolo_boxes_and_scores(yolo_outputs[l],
            anchors[anchor_mask[l]], num_classes, input_shape, image_shape)
        boxes.append(_boxes)
        box_scores.append(_box_scores)
    boxes = K.concatenate(boxes, axis=0)
    box_scores = K.concatenate(box_scores, axis=0)

    mask = box_scores >= score_threshold
    max_boxes_tensor = K.constant(max_boxes, dtype='int32')
    boxes_ = []
    scores_ = []
    classes_ = []
    for c in range(num_classes):
        # TODO: use keras backend instead of tf.
        class_boxes = tf.boolean_mask(boxes, mask[:, c])
        class_box_scores = tf.boolean_mask(box_scores[:, c], mask[:, c])
        nms_index = tf.image.non_max_suppression(
            class_boxes, class_box_scores, max_boxes_tensor, iou_threshold=iou_threshold)
        class_boxes = K.gather(class_boxes, nms_index)
        class_box_scores = K.gather(class_box_scores, nms_index)
        classes = K.ones_like(class_box_scores, 'int32') * c
        boxes_.append(class_boxes)
        scores_.append(class_box_scores)
        classes_.append(classes)
    boxes_ = K.concatenate(boxes_, axis=0)
    scores_ = K.concatenate(scores_, axis=0)
    classes_ = K.concatenate(classes_, axis=0)

    return boxes_, scores_, classes_

I've printed out the return values and they look like this:

boxes-> Tensor("concat_11:0", shape=(?, 4), dtype=float32)

scores-> Tensor("concat_12:0", shape=(?,), dtype=float32)

classes-> Tensor("concat_13:0", shape=(?,), dtype=int32)

The original outputs of my YOLO model (.hdf5) are as follows (I got these by printing model.output):

tf.Tensor 'conv2d_59_1/BiasAdd:0' shape=(?, ?, ?, 21) dtype=float32

tf.Tensor 'conv2d_67_1/BiasAdd:0' shape=(?, ?, ?, 21) dtype=float32

tf.Tensor 'conv2d_75_1/BiasAdd:0' shape=(?, ?, ?, 21) dtype=float32

And the inference part of the Python code is:

out_boxes, out_scores, out_classes = sess.run(
                                    [boxes, scores, classes],
                                    feed_dict={
                                        yolo_model.input: image_data,
                                        input_image_shape: [image.size[1], image.size[0]],
                                        K.learning_phase(): 0
                                    })

Compared to the Python version of the inference code, the C++ part is... (Reference)

int main()
{
    string image = "test.jpg";
    string graph = "yolo_weight.pb";
    string labels = "coco.names";
    int32 input_width = 416;
    int32 input_height = 416;
    float input_mean = 0;
    float input_std = 255;
    string input_layer = "input_1:0";
    std::vector<std::string> output_layer = {"conv2d_59/BiasAdd:0", "conv2d_67/BiasAdd:0", "conv2d_75/BiasAdd:0" };

    std::unique_ptr<tensorflow::Session> session;
    string graph_path = tensorflow::io::JoinPath(root_dir, graph);
    Status load_graph_status = LoadGraph(graph_path, &session);

    std::vector<Tensor> resized_tensors;
    string image_path = tensorflow::io::JoinPath(root_dir, image);
    Status read_tensor_status = ReadTensorFromImageFile(image_path, input_height, input_width,
                                                        input_mean, input_std, &resized_tensors);

    Tensor inpTensor = Tensor(DT_FLOAT, TensorShape({ 1, input_height, input_width, 3 }));
    std::vector<Tensor> outputs;
    cv::Mat srcImage = cv::imread(image);
    cv::resize(srcImage, srcImage, cv::Size(input_width, input_height));
    srcImage.convertTo(srcImage, CV_32FC3);
    srcImage = srcImage / 255;  
    string ty = type2str(srcImage.type());
    float *p = inpTensor.flat<float>().data();
    cv::Mat tensorMat(input_height, input_width, CV_32FC3, p);
    srcImage.convertTo(tensorMat, CV_32FC3);
    Status run_status = session->Run({{ input_layer, inpTensor }}, { output_layer }, {}, &outputs);
    int cc = 1;
    auto output_detection_class = outputs[0].tensor<float, 4>();
    std::cout << "detection scores" << std::endl;
    std::cout << "typeid(output_detection_scoreclass).name->" << typeid(output_detection_class).name() << std::endl;
    for (int i = 0; i < 13; ++i)
    {
        for (int j = 0; j < 13; ++j)
        {
            for (int k = 0; k < 21; ++k)
            {
                // using (index_1, index_2, index_3) to access the element in a tensor
                printf("i->%d, j->%d, k->%d\t", i, j, k);
                std::cout << output_detection_class(1, i, j, k) << "\t";
                cc += 1;
                if (cc % 4 == 0)
                {
                    std::cout << "\n";
                }
            }
        }
        std::cout << std::endl;
    }
    return 0;
}

The output of the C++ inference part is:

outputs.size()-> 3

outputs[0].shape()-> [1,13,13,21]

outputs[1].shape()-> [1,26,26,21]

outputs[2].shape()-> [1,52,52,21]

But the output I get is pretty weird...

(The values in outputs[0] don't look like scores, classes, or coordinates...) Yolo output result

So I'm wondering: is it because I'm missing the part written in Python before the inference? Or am I reading the output data the wrong way?

I've checked some related questions and answers...

1. Yolo v3 model output clarification with keras

2. Convert YoloV3 output to coordinates of bounding box, label and confidence

3. How to access tensorflow::Tensor C++

But I still can't figure out how to make it work :(

I also found a repo which might be helpful. I've taken a look at its yolo.cpp, but its model's output tensor shape is different from mine, so I'm not sure whether I can reuse that code directly; its output tensor is

tf.Tensor 'import/output:0' shape=(?, 735) dtype = float32

Any help or advice is appreciated...

Coco Yen

2 Answers


In case you're still struggling with this, I don't see where you are applying the Sigmoid and Exp to the output layer values.

You might look at this article, which describes how to handle the output:

https://medium.com/analytics-vidhya/yolo-v3-theory-explained-33100f6d193
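
For reference, here is a minimal sketch of that decoding step in C++. It assumes the usual YOLOv3 layout of 3 anchors × (5 + num_classes) channels per grid cell, so the 21 channels above would correspond to 2 classes; the Detection struct, the decode_scale name and the anchor values are illustrative placeholders, not something taken from your model:

#include <cmath>
#include <vector>

// Raw conv output -> boxes for one scale. Channel layout per anchor:
// [tx, ty, tw, th, objectness, class scores...], anchors given in input pixels.
struct Detection { float x, y, w, h, score; int cls; };

std::vector<Detection> decode_scale(const float* data, int grid, int num_classes,
                                    const float anchors[][2], int num_anchors,
                                    int input_size, float score_thresh)
{
    auto sigmoid = [](float v) { return 1.f / (1.f + std::exp(-v)); };
    const int per_anchor = 5 + num_classes;
    const float stride = static_cast<float>(input_size) / grid;
    std::vector<Detection> dets;

    for (int gy = 0; gy < grid; ++gy)
        for (int gx = 0; gx < grid; ++gx)
            for (int a = 0; a < num_anchors; ++a)
            {
                const float* p = data + ((gy * grid + gx) * num_anchors + a) * per_anchor;
                float objectness = sigmoid(p[4]);
                int best_c = 0;
                float best_p = 0.f;
                for (int c = 0; c < num_classes; ++c)
                {
                    float cp = sigmoid(p[5 + c]);
                    if (cp > best_p) { best_p = cp; best_c = c; }
                }
                float score = objectness * best_p;
                if (score < score_thresh)
                    continue;

                Detection d;
                d.x = (sigmoid(p[0]) + gx) * stride;   // box center x in input pixels
                d.y = (sigmoid(p[1]) + gy) * stride;   // box center y in input pixels
                d.w = std::exp(p[2]) * anchors[a][0];  // width  = exp(tw) * anchor width
                d.h = std::exp(p[3]) * anchors[a][1];  // height = exp(th) * anchor height
                d.score = score;
                d.cls = best_c;
                dets.push_back(d);
            }
    return dets;
}

// e.g. decode_scale(outputs[0].flat<float>().data(), 13, 2, anchors, 3, 416, 0.5f);

After decoding you would still apply a score threshold and per-class non-max suppression, which is what yolo_eval does on the Python side.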

Bryan Greenway
  • Sorry for the late reply, I've figured out another way to solve this problem; I'll update the answer when I have free time. :D – Coco Yen Feb 09 '21 at 09:16

As Bryan said, there are still some operations that need to be applied to the output layers.

So in my case (following this repo), I added this method to the YOLO class (in yolo.py) so that the post-processing is included when saving the model:

def output_pb(self, out_dir, out_pb):
    out_bx = self.boxes.name.split(":")[0]
    out_sc = self.scores.name.split(":")[0]
    out_cs = self.classes.name.split(":")[0]
    print(out_bx, out_sc, out_cs)
    frozen_graph = tf.graph_util.remove_training_nodes(
        tf.graph_util.convert_variables_to_constants(
            self.sess, self.sess.graph.as_graph_def(), [out_bx, out_sc, out_cs]))
    tf.io.write_graph(frozen_graph, out_dir, out_pb, as_text=False)
    print("===== FINISH saving new pb file =====")

When saving the model, I call the function like this:

yolo = YOLO(**config)
yolo.output_pb(output_dir, output_pb_name)
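
After this, the frozen graph already contains the box decoding, score filtering and NMS from yolo_eval, so the C++ side only needs to feed the image tensor and the image-shape placeholder.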

And when doing inference in C++, the whole process goes like this:

// initialize model
YOLO* YOLO_data = (YOLO*)Init_DllODM_object(config);
// do some stuff to set data in YOLO_data
cv::Mat input_pic = cv::imread("whatever_pic.png");
predict(YOLO_data, input_pic, YOLO_data->bbox_res, YOLO_data->score_res, YOLO_data->class_res);
// draw result on pic
cv::Mat res = show_result(YOLO_data, input_pic);

Detailed code is here:

// yolo_cpp.h

struct YOLO
{
    float score_thres;
    std::vector<int> class_res;
    std::vector<float> bbox_res, score_res;

    std::string inp_tensor_name;
    std::string placeholder_name;
    std::vector<std::string> out_tensors;
    Session* session;

    Tensor t, inpTensor;
    std::vector<tensorflow::Tensor> outTensor;

    std::vector<int> MD_size;
    std::vector<int> inp_pic_size;
    std::vector<std::string> md_class_list;
    std::vector<cv::Scalar> color_list;
    int show_score;
    int score_type;
    int return_origin;
};
// yolo_cpp.cpp

void* Init_DllODM_object(json config)
{
    std::string model_path = config["model"].get<std::string>();
    YOLO* YOLO_data = new YOLO();
    auto options = tensorflow::SessionOptions();
    GraphDef graphdef;
    // loading model to graph
    Status status_load = ReadBinaryProto(Env::Default(), model_path, &graphdef);

    options.config.mutable_gpu_options()->set_per_process_gpu_memory_fraction(0.7);
    options.config.mutable_gpu_options()->set_allow_growth(true);

    int node_count = graphdef.node_size();
    for (int i = 0; i < node_count; i++)
    {
        auto n = graphdef.node(i);
        if (n.name().find("input_") != string::npos)
        {
            YOLO_data->inp_tensor_name = n.name();
        }
        else if (n.name().find("Placeholder_") != string::npos)
        {
            YOLO_data->placeholder_name = n.name();
        }
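        // assumption: after output_pb() appends the boxes / scores / classes
        // concat ops, they end up among the last nodes of the frozen graph,
        // which is why the node_count - 5 / -3 / -1 indices below pick them up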
        else if (i == node_count - 5)
        {
            YOLO_data->out_tensors.push_back(n.name());
        }
        else if (i == node_count - 3)
        {
            YOLO_data->out_tensors.push_back(n.name());
        }
        else if (i == node_count - 1)
        {
            YOLO_data->out_tensors.push_back(n.name());
        }

    }
    if (!status_load.ok()) {
        std::cout << "ERROR: Loading model failed..." << std::endl;
        std::cout << model_path << status_load.ToString() << "\n";
    }

    std::vector<int> MD_size_ = config["input_size"];
    YOLO_data->MD_size = MD_size_;
    std::vector<int> inp_pic_size_ = config["input_pic_size"];
    YOLO_data->inp_pic_size = inp_pic_size_;

    YOLO_data->inpTensor = Tensor(DT_FLOAT, TensorShape({ 1, YOLO_data->MD_size[0], YOLO_data->MD_size[1], 3 }));  // input tensor
    YOLO_data->t = Tensor(DT_FLOAT, TensorShape({ 2 }));
    //ref: https://stackoverflow.com/questions/36804714/define-a-feed-dict-in-c-for-tensorflow-models
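    // t carries the original picture size that is fed to the image_shape
    // placeholder (the Python feed_dict passes [height, width] for it)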
    auto t_matrix = YOLO_data->t.tensor<float, 1>();
    t_matrix(0) = YOLO_data->inp_pic_size[0];
    t_matrix(1) = YOLO_data->inp_pic_size[1];
    // create session
    Status status_newsess = NewSession(options, &YOLO_data->session); //for the usage of gpu setting
    Status status_create = YOLO_data->session->Create(graphdef);
    if (!status_create.ok()) {
        std::cout << "ERROR: Creating graph in session failed.." << status_create.ToString() << std::endl;
    }
    else {
        std::cout << "----------- Successfully created session and load graph -------------" << std::endl;
    }

    return YOLO_data;
}

int predict(YOLO* YOLO_, cv::Mat srcImage, std::vector<float>& bbox_res, std::vector<float>& score_res, std::vector<int>& class_res)
{
    // read image -> input image
    if (srcImage.empty())   // check if image can open correctly
    {
        std::cout << "can't open the image!!!!!!!" << std::endl;
        int res = -1;
        return res;
    }
    // ref: https://ppt.cc/f7ERNx
    std::vector<std::pair<string, tensorflow::Tensor>> inputs = {
        { YOLO_->inp_tensor_name, YOLO_->inpTensor },
        { YOLO_->placeholder_name, YOLO_->t },
    };
    srcImage = letterbox_image(srcImage, YOLO_->MD_size[0], YOLO_->MD_size[1]);
    convertCVMatToTensor(YOLO_, srcImage);
    Status status_run = YOLO_->session->Run({ inputs }, { YOLO_->out_tensors }, {}, &YOLO_->outTensor);
    if (!status_run.ok()) {
        std::cout << "ERROR: RUN failed..." << std::endl;
        std::cout << status_run.ToString() << "\n";
        int res = -1;
        return res;
    }

    TTypes<float>::Flat pp1 = YOLO_->outTensor[0].flat<float>();
    TTypes<float>::Flat pp2 = YOLO_->outTensor[1].flat<float>();
    TTypes<int>::Flat pp3 = YOLO_->outTensor[2].flat<int>();
    int pp1_idx;
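    // the frozen graph returns the flattened yolo_eval results:
    // outTensor[0] = boxes (N x 4, [y_min, x_min, y_max, x_max]),
    // outTensor[1] = scores (N), outTensor[2] = classes (N)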

    for (int i = 0; i < pp2.size(); i++)
    {
        pp1_idx = i * 4;
        bbox_res.push_back(pp1(pp1_idx));
        bbox_res.push_back(pp1(pp1_idx + 1));
        bbox_res.push_back(pp1(pp1_idx + 2));
        bbox_res.push_back(pp1(pp1_idx + 3));
        score_res.push_back(pp2(i));
        class_res.push_back(pp3(i));
    }

    return 0;
}

cv::Mat show_result(YOLO* inf_obj, cv::Mat inp_pic)
{
    int bbox_idx;
    std::string plot_str;
    bool under_thresh = false;
    std::vector<int> del_idx;
    for (int i = 0; i < inf_obj->class_res.size(); i++)
    {
        int y_min, y_max, x_min, x_max;
        bbox_idx = i * 4;
        y_min = std::max(0, (int)floor(inf_obj->bbox_res[bbox_idx] + 0.5));
        x_min = std::max(0, (int)floor(inf_obj->bbox_res[bbox_idx + 1] + 0.5));
        y_max = std::max(0, (int)floor(inf_obj->bbox_res[bbox_idx + 2] + 0.5));
        x_max = std::max(0, (int)floor(inf_obj->bbox_res[bbox_idx + 3] + 0.5));
        //std::cout << md_class_list[class_res[i]] << ", ";
        //std::cout << score_res[i] << ",";
        //std::cout << "[" << x_min << ", " << y_min << ", " << x_max << ", " << y_max << "]\n";

        if (inf_obj->show_score)
        {
            if (inf_obj->score_type)
                plot_str = inf_obj->md_class_list[inf_obj->class_res[i]] + ", " + std::to_string(rounding(inf_obj->score_res[i] * 100, 2)).substr(0, 5) + "%";
            else
                plot_str = inf_obj->md_class_list[inf_obj->class_res[i]] + ", " + std::to_string(rounding(inf_obj->score_res[i], 2)).substr(0, 4);
        }

        else
            plot_str = inf_obj->md_class_list[inf_obj->class_res[i]];

        if (inf_obj->score_res[i] >= inf_obj->score_thres)
        {
            inp_pic = plot_one_box(inp_pic, x_min, y_min, x_max, y_max, plot_str, inf_obj->color_list[inf_obj->class_res[i]]);
        }
        else
        {
            //std::cout << "score_res[i]->" << score_res[i] << "under thresh!!" << std::endl;
            under_thresh = true;
            del_idx.push_back(i);
        }
    }

    if (under_thresh)
    {
        //std::cout << "*** deleting element" << std::endl;
        for (int x = 0; x < del_idx.size(); x++)
        {
            bbox_idx = (del_idx[x] - x) * 4;
            inf_obj->bbox_res.erase(inf_obj->bbox_res.begin() + bbox_idx + 3);
            inf_obj->bbox_res.erase(inf_obj->bbox_res.begin() + bbox_idx + 2);
            inf_obj->bbox_res.erase(inf_obj->bbox_res.begin() + bbox_idx + 1);
            inf_obj->bbox_res.erase(inf_obj->bbox_res.begin() + bbox_idx);
            inf_obj->score_res.erase(inf_obj->score_res.begin() + del_idx[x] - x);
            inf_obj->class_res.erase(inf_obj->class_res.begin() + del_idx[x] - x);
        }
        del_idx.clear();
    }

    return inp_pic;
}
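
For completeness, letterbox_image and convertCVMatToTensor are called above but not listed. A minimal sketch of what they could look like (the parameter order of letterbox_image and the BGR-to-RGB plus /255 preprocessing are my assumptions, modeled on keras-yolo3's letterbox; adjust to match your own helpers):

// yolo_cpp.cpp (helper sketch)
#include <algorithm>
#include <cstring>

// Resize keeping the aspect ratio and pad with gray, like keras-yolo3's letterbox_image.
cv::Mat letterbox_image(const cv::Mat& src, int dst_h, int dst_w)
{
    float scale = std::min(dst_w / (float)src.cols, dst_h / (float)src.rows);
    int new_w = (int)(src.cols * scale);
    int new_h = (int)(src.rows * scale);

    cv::Mat resized;
    cv::resize(src, resized, cv::Size(new_w, new_h));

    cv::Mat boxed(dst_h, dst_w, src.type(), cv::Scalar(128, 128, 128));
    resized.copyTo(boxed(cv::Rect((dst_w - new_w) / 2, (dst_h - new_h) / 2, new_w, new_h)));
    return boxed;
}

// Copy the letterboxed BGR image into the preallocated inpTensor as normalized RGB floats.
void convertCVMatToTensor(YOLO* YOLO_, const cv::Mat& img)
{
    cv::Mat rgb;
    cv::cvtColor(img, rgb, cv::COLOR_BGR2RGB);
    rgb.convertTo(rgb, CV_32FC3, 1.0 / 255.0);

    float* dst = YOLO_->inpTensor.flat<float>().data();
    std::memcpy(dst, rgb.ptr<float>(0), rgb.total() * rgb.elemSize());
}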

Since my code is used in a DLL, it is arranged this way. There is still some redundant code I haven't deleted, but I think the whole process can be done with the code provided so far. Hope this helps :D

Coco Yen