I'm trying to deploy a BERT model with the ONNX Runtime C++ API, but the inference time is confusing me. When the input is a single sentence typed into the console in real time, inference takes longer and the time fluctuates; when the input is a text file with many sentences, it is faster and stable.
The session initialization code is as follows:
class BertModel
{
public:
    BertModel() {}
    BertModel(const char* path)
    {
        // initialize the tokenizer
        string vocab_path = join(path, "vocab.txt");
        pTokenizer = new FullTokenizer(vocab_path);
        // ONNX Runtime setup
        Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "small_bert_onnx"); // the Env holds the logging state used by all other objects
        session_options.SetIntraOpNumThreads(1); // threads used to parallelize execution within a node; default 0 lets onnxruntime choose
        session_options.SetInterOpNumThreads(1); // threads used to parallelize execution of the graph (across nodes); default 0 lets onnxruntime choose
        string model_path = join(path, "bert_model_quant.onnx");
        session = new Ort::Session(env, model_path.c_str(), session_options); // create the inference session
        // session = new Ort::Session(env, model_path.c_str(), Ort::SessionOptions{ nullptr }); // alternative: no session options
        size_t num_input_nodes = session->GetInputCount(); // number of model inputs, e.g. (ids, mask, labels) would give 3
        char* input_name = session->GetInputName(0, allocator);
        input_node_names = {input_name};
        output_node_names = {"logits"};
        // query the input node type
        Ort::TypeInfo type_info = session->GetInputTypeInfo(0);
        auto tensor_info = type_info.GetTensorTypeAndShapeInfo();
        ONNXTensorElementDataType type = tensor_info.GetElementType();
        // query the input shape/dims
        input_node_dims = tensor_info.GetShape();
        cout << "session initialized successfully" << endl;
    }
    string join(const char* a, const char* b);
    vector<long> textTokenizer(string text);
    int predicts(string text);
private:
    FullTokenizer* pTokenizer;
    Ort::SessionOptions session_options;
    std::vector<int64_t> input_node_dims;
    std::vector<const char*> output_node_names;
    Ort::AllocatorWithDefaultOptions allocator;
    std::vector<const char*> input_node_names;
    Ort::Session* session;
};
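For context, the model object is constructed once and then reused for every prediction. The calling code looks roughly like the sketch below (the directory "./model_dir" is a placeholder for wherever vocab.txt and bert_model_quant.onnx live; the sample sentence is one of my test inputs):

#include <iostream>
#include <string>

int main()
{
    // construct the model once; the directory must contain vocab.txt and bert_model_quant.onnx
    BertModel model("./model_dir");

    // every prediction reuses the same session
    int res = model.predicts("导航到向阳小区");
    std::cout << "predict result:" << res << std::endl;
    return 0;
}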
My predicts function is as follows:
int BertModel::predicts(string text)
{
    vector<long> input_tensor_values = textTokenizer(text);
    auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    input_node_dims[0] = 1;
    input_node_dims[1] = input_tensor_values.size();
    Ort::Value input_tensor = Ort::Value::CreateTensor<long>(memory_info, input_tensor_values.data(),
        input_tensor_values.size(), input_node_dims.data(), input_node_dims.size());
    assert(input_tensor.IsTensor());
    // the outputs from session Run are a vector<Ort::Value>
    auto output_tensors = session->Run(Ort::RunOptions{nullptr},
                                       input_node_names.data(),
                                       &input_tensor,
                                       1,
                                       output_node_names.data(),
                                       1);
    // the logits come back as float, so read them through a float pointer
    float* floatarr = output_tensors[0].GetTensorMutableData<float>();
    int res = max_element(floatarr, floatarr + 3) - floatarr; // argmax over the 3 classes
    return res;
}
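Since predicts does both tokenization and the session Run, I also thought about timing the two stages separately to see which one scales with sentence length. A minimal sketch of that idea is below; the member-function name predictsTimed is my own and is not declared in the class above, the prediction logic is otherwise unchanged:

// Same logic as predicts(), but the tokenizer and the session Run are timed separately.
// Purely a measurement aid; the returned result is identical.
int BertModel::predictsTimed(string text)
{
    using std::chrono::high_resolution_clock;
    using std::chrono::microseconds;
    using std::chrono::duration_cast;

    auto t0 = high_resolution_clock::now();
    vector<long> input_tensor_values = textTokenizer(text);   // stage 1: tokenization
    auto t1 = high_resolution_clock::now();

    auto memory_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    input_node_dims[0] = 1;
    input_node_dims[1] = input_tensor_values.size();
    Ort::Value input_tensor = Ort::Value::CreateTensor<long>(memory_info, input_tensor_values.data(),
        input_tensor_values.size(), input_node_dims.data(), input_node_dims.size());

    auto output_tensors = session->Run(Ort::RunOptions{nullptr},
                                       input_node_names.data(), &input_tensor, 1,
                                       output_node_names.data(), 1);                // stage 2: inference
    auto t2 = high_resolution_clock::now();

    cout << "tokenizer: " << duration_cast<microseconds>(t1 - t0).count() << "us, "
         << "session Run: " << duration_cast<microseconds>(t2 - t1).count() << "us" << endl;

    float* floatarr = output_tensors[0].GetTensorMutableData<float>();
    return max_element(floatarr, floatarr + 3) - floatarr;
}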
My code to infer a single sentence is shown below; the input is read from the console in real time:
string text;
while (true)
{
    cout << "enter your input" << endl;
    getline(cin, text);
    high_resolution_clock::time_point beginTime = high_resolution_clock::now();
    int res = model.predicts(text);
    high_resolution_clock::time_point endTime = high_resolution_clock::now();
    milliseconds timeInterval = std::chrono::duration_cast<milliseconds>(endTime - beginTime);
    cout << "predict result:" << res << endl;
    cout << "time spent:" << timeInterval.count() << "ms" << endl;
}
Sample console runs, entering the same sentence repeatedly:
你给我想没有包子铺的你也灯关的水都关了新的利润都被人骨的肌
predict result:1
time spent:16ms
enter your input
你给我想没有包子铺的你也灯关的水都关了新的利润都被人骨的肌
predict result:1
time spent:16ms
enter your input
你给我想没有包子铺的你也灯关的水都关了新的利润都被人骨的肌
predict result:1
time spent:14ms
enter your input
你给我想没有包子铺的你也灯关的水都关了新的利润都被人骨的肌
predict result:1
time spent:8ms
enter your input
你给我想没有包子铺的你也灯关的水都关了新的利润都被人骨的肌
predict result:1
time spent:15ms
enter your input
你给我想没有包子铺的你也灯关的水都关了新的利润都被人骨的肌
predict result:1
time spent:8ms
enter your input
你给我想没有包子铺的你也灯关的水都关了新的利润都被人骨的肌
predict result:1
time spent:13ms
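To rule out noise from the millisecond-resolution timer, I also considered timing the same call repeatedly and averaging. A minimal sketch is below; the helper name benchmark_sentence, the run count, and the use of microseconds are my own choices, not part of the original program:

#include <chrono>
#include <iostream>
#include <string>

using std::chrono::high_resolution_clock;
using std::chrono::microseconds;

// Time model.predicts(text) over several runs and report the average in milliseconds.
// Assumes `model` is an already-constructed BertModel instance.
void benchmark_sentence(BertModel& model, const std::string& text, int runs = 20)
{
    long long total_us = 0;
    for (int i = 0; i < runs; ++i)
    {
        auto begin = high_resolution_clock::now();
        int res = model.predicts(text);
        auto end = high_resolution_clock::now();
        total_us += std::chrono::duration_cast<microseconds>(end - begin).count();
        (void)res; // result ignored; only the timing matters here
    }
    std::cout << "average time over " << runs << " runs: "
              << (total_us / runs) / 1000.0 << " ms" << std::endl;
}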
My code to infer sentences from a text file is shown below:
string input_path = "../../test0711.txt";
string output_path = "../../test0711_result.txt";
ifstream input_file(input_path);
ofstream output_file(output_path);
if (!input_file.is_open()) {
    cerr << "Could not open the file - '" << input_path << "'" << endl;
    return EXIT_FAILURE;
}
if (!output_file.is_open()) {
    cerr << "Could not open the file - '" << output_path << "'" << endl;
    return EXIT_FAILURE;
}
int time_spent = 0;
int seq_nums = 0;
string line;
while (getline(input_file, line))
{
    high_resolution_clock::time_point beginTime = high_resolution_clock::now(); // start time
    int res = model.predicts(line);                                             // predict a single sentence
    high_resolution_clock::time_point endTime = high_resolution_clock::now();   // end time
    milliseconds timeInterval = std::chrono::duration_cast<milliseconds>(endTime - beginTime); // elapsed time
    cout << "approx. character count of this sentence:" << line.size() / 3 << endl; // Chinese characters are 3 bytes each in UTF-8
    cout << "predict result:" << res << endl;
    cout << "time spent:" << timeInterval.count() << "ms" << endl;
    output_file << line << '\t' << res << '\t' << timeInterval.count() << "ms" << endl;
    time_spent += timeInterval.count();
    seq_nums++;
}
input_file.close();
output_file.close();
The resulting test0711_result.txt looks like this (sentence, prediction, time per sentence):
你知道什么是版权问题吗就是他们就是这个 1 6ms
北石店 2 3ms
我要去新街口 0 4ms
导航到向阳小区 0 4ms
只想守护你 0 3ms
将车道偏离预警开关打开 0 4ms
导航到南海意库 0 4ms
导航去1号公馆 0 4ms
1米制的恭喜发财 1 4ms
你给我想没有包子铺的你也灯关的水都关了新的利润都被人骨的肌 1 8ms
你吃不吃粑粑 1 4ms
导航去深圳湾创新科技中心 0 4ms
个性也没看就行了 1 4ms
三好听你就三个1390这个都是套餐5万双送给您的 1 6ms
Obviously, when I run over the text file, the time is proportional to the length of the sentence, as expected. So why does the real-time console input take longer and fluctuate, and how can I fix it?