I'd like to extract table data from a PDF file, as the "Try Document AI" demo does. However, if a table row contains multi-line text, each line of that text is returned as a separate row in my JSON response. The same thing happens when using the "Upload test document" feature in the Cloud Console, as shown in the attached screenshot (Cloud Console demo).
However, the "Try Document AI" demo (https://cloud.google.com/document-ai/docs/drag-and-drop) shows the data correctly (screenshot: Try Document AI demo). Is this because the two use different API versions?
Node.js code, using Document AI v1beta3 with the Form Parser processor:
// Walks every table on every page of `document` and returns the first table's
// rows wrapped as `{ table: ... }`.
// NOTE(review): the bare `return` means this snippet presumably lives inside an
// enclosing function that is not shown here; `document` and `text` come from
// that outer scope (presumably `text` is the document's full text, verify
// against the caller).
var dataTable;
for (const page of document.pages) {
// Tables detected on this page.
for (const table of page.tables) {
dataTable = printTableInfo(table, text);
// NOTE(review): printTableInfo returns an array, and arrays are always truthy
// (even when empty), so this returns after the very first table encountered.
if (dataTable) return { table: dataTable };
}
}
/**
 * Flattens the body rows of a table into one display string per row.
 *
 * Every cell's text is looked up with `getText`, trimmed, JSON-quoted and
 * followed by " | " (the separator is also emitted after the last cell).
 * Only `table.bodyRows` is read; header rows are not included.
 *
 * @param {object} table - Table object whose `bodyRows[].cells[].layout.textAnchor`
 *   entries locate each cell's text. Missing `bodyRows`, `cells`, `layout` or
 *   `textAnchor` are tolerated and treated as empty.
 * @param {string} text - Text that the cell anchors index into; forwarded to `getText`.
 * @returns {Array<{Data: string}>} One `{ Data }` entry per body row, in row order.
 */
function printTableInfo(table, text) {
  const bodyRows = (table && table.bodyRows) || [];

  return bodyRows.map((row) => {
    const cells = (row && row.cells) || [];

    const rowText = cells
      .map((cell) => {
        const textAnchor =
          cell && cell.layout ? cell.layout.textAnchor : undefined;
        // getText can return undefined (no anchor / no text); fall back to an
        // empty string so that .trim() never throws on an empty cell.
        const cellText = getText(textAnchor, text) || "";
        return `${JSON.stringify(cellText.trim())} | `;
      })
      .join("");

    return { Data: rowText };
  });
}
/**
 * Returns the part of `text` that a text anchor points at.
 *
 * An anchor may carry several `textSegments`; the substrings of all segments
 * are concatenated in order, so text that is split across segments is not
 * truncated to its first piece.
 *
 * @param {object} textAnchor - Object with a `textSegments` list; each segment
 *   has an `endIndex` and an optional `startIndex`.
 * @param {string} text - The text the segment indices refer to.
 * @returns {string} The referenced text, or "" when `text`, the anchor, or its
 *   segments are missing/empty.
 */
const getText = (textAnchor, text) => {
  // Always hand back a string so callers can safely use string methods on it.
  if (!text || !textAnchor) return "";

  const segments = textAnchor.textSegments;
  if (!segments || !segments.length) return "";

  return Array.from(segments, (segment) =>
    // A missing/zero startIndex means the segment starts at the beginning of the text.
    text.substring(segment.startIndex || 0, segment.endIndex)
  ).join("");
};