In the following scenarios I will create two fictitious bill formats and then write the algorithm that parses them. I will write only the algorithm, as JavaScript-like pseudocode, because I don't know Java.

In the first column we have pictures of two bills. In the second column we have the text data obtained from the OCR software. It is like a plain text file with no structure of its own, but we know certain keywords that give it meaning. Below is the algorithm that translates this unstructured text into a logically structured JSON object.
// Sample input 1: raw text obtained by OCR from a bill in format 1.
// NOTE: this and the next assignment are alternative sample inputs for the same
// variable -- presumably only one of them is meant to be active at a time.
// NOTE: the literals are shown across several lines for readability; a real
// JavaScript string would need "\n" escapes or a template literal.
var TEXT_FROM_OCR = "Invoice no 12 Amount 55$
Vendor name BusinessTest 1 Account No 1213113
Due date 2019-12-07
Description Lorem ipsum dolor est"
// Sample input 2: raw text obtained by OCR from a bill in format 2.
// Lines are counted from zero: the vendor name is line 0, the "Description"
// heading is line 3 and the description body spans lines 4 to 6.
var TEXT_FROM_OCR =" BusinessTest22
Invoice no 19 Amount 12$
Account 4564544 Due date 2019-12-15
Description
Lorem ipsum dolor est
Another description line
Last description line"
// TEMPLATES describes, for every known bill layout, where each field lives in the OCR text.
// It is a JavaScript object literal (strict JSON would not allow these comments).
//
// Metadata stored for every field:
//   line_no_start / line_no_end : zero-based, inclusive line range to extract;
//                                 null means unknown, so line-based extraction is skipped
//   start_delimiter             : the searched value starts immediately after this text;
//                                 null means delimiters are not used for this field
//   end_delimiter               : the searched value ends just before this text;
//                                 null means "extract everything until the end of the line"
//   value_found                 : filled in by the parser with the extracted value
var TEMPLATES = {
    "bill_template_1": {
        "vendor": {
            "line_no_start": null,            // Unknown, ignored by the text parser
            "line_no_end": null,              // Unknown, ignored by the text parser
            "start_delimiter": "Vendor name", // Value starts immediately after this delimiter
            "end_delimiter": "Account",       // Value ends just before this delimiter
            "value_found": null               // The value we found is saved here
        },
        "account": {
            "line_no_start": null,
            "line_no_end": null,
            "start_delimiter": "Account No",
            "end_delimiter": null,            // Extract everything until the end of the current line
            "value_found": null
        },
        "description": {
            "line_no_start": null,
            "line_no_end": null,
            "start_delimiter": "Description",
            "end_delimiter": null,
            "value_found": null
        },
        "due_date": {
            "line_no_start": null,
            "line_no_end": null,
            "start_delimiter": "Due date",
            "end_delimiter": null,
            "value_found": null
        },
        "invoice_number": {
            "line_no_start": null,
            "line_no_end": null,
            "start_delimiter": "Invoice no",
            "end_delimiter": "Amount",
            "value_found": null
        },
        "amount": {
            "line_no_start": null,
            "line_no_end": null,
            "start_delimiter": "Amount",
            "end_delimiter": null,
            "value_found": null
        }
    },
    "bill_template_2": {
        "vendor": {
            "line_no_start": 0,               // Extract data starting from line zero
            "line_no_end": 0,                 // Extract data up to and including line zero
            "start_delimiter": null,          // Ignored, because the value is a complete line
            "end_delimiter": null,            // Ignored, because the value is a complete line
            "value_found": null
        },
        "account": {
            "line_no_start": null,
            "line_no_end": null,
            "start_delimiter": "Account",
            "end_delimiter": "Due date",
            "value_found": null
        },
        "description": {
            "line_no_start": 4,               // First line after the "Description" heading (which is line 3)
            "line_no_end": 99999,             // A very big number which means "until end of file"
            "start_delimiter": null,          // Ignored, because the value is made of complete lines
            "end_delimiter": null,            // Ignored, because the value is made of complete lines
            "value_found": null
        },
        "due_date": {
            "line_no_start": null,
            "line_no_end": null,
            "start_delimiter": "Due date",
            "end_delimiter": null,
            "value_found": null
        },
        "invoice_number": {
            "line_no_start": null,
            "line_no_end": null,
            "start_delimiter": "Invoice no",
            "end_delimiter": "Amount",
            "value_found": null
        },
        "amount": {
            "line_no_start": null,
            "line_no_end": null,
            "start_delimiter": "Amount",
            "end_delimiter": null,
            "value_found": null
        }
    }
};
// ALGORITHM (JavaScript-flavoured pseudocode)
// Every template is tried against the OCR text; a template scores one point for each
// field it extracts a non-empty value for. The best-scoring template is the one that
// matches the bill.
//
// 1. Convert the TEXT_FROM_OCR variable into an array (each index is one line of the file).
//    In JavaScript we would do something like this; the regular expression accepts both
//    Windows ("\r\n") and Unix ("\n") line endings.
TEXT_FROM_OCR = TEXT_FROM_OCR.split(/\r?\n/);
var MAXIMUM_SCORE = 6; // we are looking to extract 6 values, out of 6
// 2. Score every template.
foreach TEMPLATES as TEMPLATE_TO_PARSE => PARSE_METADATA{
    SCORE = 0; // for each field we find, we increment the score
    foreach PARSE_METADATA as SEARCHED_FIELD_NAME => DELIMITERS_METADATA{
        // Search by line first
        if (DELIMITERS_METADATA['line_no_start'] !== NULL && DELIMITERS_METADATA['line_no_end'] !== NULL){
            // Clamp the last line to the real end of the text, so that a "very big number"
            // used as an EOF marker never reads lines that do not exist
            LAST_LINE_NO = MIN( DELIMITERS_METADATA['line_no_end'], LENGTH( TEXT_FROM_OCR ) - 1 );
            // Collect the lines one by one, as defined by the line range
            FOUND_LINES = [];
            for (LINE_NO = DELIMITERS_METADATA['line_no_start']; LINE_NO <= LAST_LINE_NO; LINE_NO++){
                FOUND_LINES.push( TEXT_FROM_OCR[ LINE_NO ] );
            }
            // Keep the line breaks between the collected lines and drop surrounding whitespace
            DELIMITERS_METADATA['value_found'] = FOUND_LINES.join("\n").trim();
            // Only a non-empty value counts as found
            if (DELIMITERS_METADATA['value_found'] !== ''){
                SCORE++;
            }
            // A field defined by line numbers is never searched by delimiters
            continue;
        }
        // Search by text delimiters
        if (DELIMITERS_METADATA['start_delimiter'] !== NULL){
            // Search for the start delimiter inside each line of the file
            foreach TEXT_FROM_OCR as LINE_CONTENT{
                START_DELIMITER_OFFSET = LINE_CONTENT.indexOf(DELIMITERS_METADATA['start_delimiter']);
                // If we found start_delimiter on this line, then let's parse it
                if (START_DELIMITER_OFFSET > -1){
                    // The searched value starts at the offset we found + the total length of the start delimiter
                    START_POSITION = START_DELIMITER_OFFSET + LENGTH( DELIMITERS_METADATA['start_delimiter'] );
                    // By default we extract everything from START_POSITION until the end of the current line
                    END_POSITION = LENGTH( LINE_CONTENT );
                    // However, if there is an end delimiter defined, we will use that
                    if (DELIMITERS_METADATA['end_delimiter'] !== NULL){
                        // Look for the end delimiter only AFTER the start of the value,
                        // so an earlier occurrence on the same line cannot be picked up
                        END_DELIMITER_OFFSET = LINE_CONTENT.indexOf(DELIMITERS_METADATA['end_delimiter'], START_POSITION);
                        if (END_DELIMITER_OFFSET > -1){
                            END_POSITION = END_DELIMITER_OFFSET;
                        }
                    }
                    // Extract the value. substring() takes (start, end); substr() would take
                    // (start, length) and return too many characters here.
                    DELIMITERS_METADATA['value_found'] = LINE_CONTENT.substring(START_POSITION, END_POSITION).trim();
                    // Only a non-empty value counts as found
                    if (DELIMITERS_METADATA['value_found'] !== ''){
                        SCORE++;
                    }
                    // Stop scanning lines: the first line containing the delimiter decides this field
                    break;
                }
            }
        }
    }
    print(TEMPLATE_TO_PARSE + " obtained a score of " + SCORE + " out of " + MAXIMUM_SCORE);
}
At the end you will know which template extracted the most data and, based on that, which one to use for that bill. Feel free to ask anything in the comments. Since I spent 45 minutes writing this answer, I'll surely answer your comments as well. :)