in document-ai/process-document-form.js [18:127]
async function main(projectId, location, processorId, filePath) {
// [START documentai_process_form_document]
/**
* TODO(developer): Uncomment these variables before running the sample.
*/
// const projectId = 'YOUR_PROJECT_ID';
// const location = 'YOUR_PROJECT_LOCATION'; // Format is 'us' or 'eu'
// const processorId = 'YOUR_PROCESSOR_ID'; // Create processor in Cloud Console
// const filePath = '/path/to/local/pdf';
const {DocumentProcessorServiceClient} =
require('@google-cloud/documentai').v1beta3;
// Instantiates a client
const client = new DocumentProcessorServiceClient();
/**
 * Sends the local PDF at `filePath` to the form processor and logs the
 * recognized text, tables, and form fields page by page.
 *
 * Reads `projectId`, `location`, `processorId`, `filePath`, and `client`
 * from the enclosing scope.
 *
 * @returns {Promise<void>} Resolves once all results have been logged.
 */
async function processDocument() {
  // The full resource name of the processor, e.g.:
  // projects/project-id/locations/location/processors/processor-id
  // You must create new processors in the Cloud Console first
  const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;

  // Read the file into memory.
  const fs = require('fs').promises;
  const imageFile = await fs.readFile(filePath);

  // readFile (without an encoding) already resolves to a Buffer, so it can
  // be base64 encoded directly without copying it first.
  const encodedImage = imageFile.toString('base64');

  const request = {
    name,
    rawDocument: {
      content: encodedImage,
      mimeType: 'application/pdf',
    },
  };

  // Recognizes text entities in the PDF document
  const [result] = await client.processDocument(request);
  console.log('Document processing complete.');

  // Read the table and form fields output from the processor
  // The form processor also contains OCR data. For more information
  // on how to parse OCR data please see the OCR sample.
  // For a full list of Document object attributes,
  // please reference this page: https://googleapis.dev/nodejs/documentai/latest/index.html
  const {document} = result;
  const {text} = document;
  console.log(`Full document text: ${JSON.stringify(text)}`);
  console.log(`There are ${document.pages.length} page(s) in this document.`);

  for (const page of document.pages) {
    console.log(`\n\n**** Page ${page.pageNumber} ****`);

    console.log(`Found ${page.tables.length} table(s):`);
    for (const table of page.tables) {
      // A detected table is not guaranteed to have a header row; fall back
      // to the first body row (or 0) instead of throwing on headerRows[0].
      const widthRow = table.headerRows[0] || table.bodyRows[0];
      const numColumns = widthRow ? widthRow.cells.length : 0;
      const numRows = table.bodyRows.length;
      console.log(`Table with ${numColumns} columns and ${numRows} rows:`);
      printTableInfo(table, text);
    }

    console.log(`Found ${page.formFields.length} form field(s):`);
    for (const field of page.formFields) {
      const fieldName = getText(field.fieldName.textAnchor, text);
      const fieldValue = getText(field.fieldValue.textAnchor, text);
      console.log(
        `\t* ${JSON.stringify(fieldName)}: ${JSON.stringify(fieldValue)}`
      );
    }
  }
}
/**
 * Logs the header row and the first body row of a table, one line each,
 * with cells JSON-quoted, trimmed, and separated by " | ".
 *
 * A table without header rows or without body rows logs an empty row
 * instead of throwing.
 *
 * @param {object} table - Table with `headerRows` and `bodyRows` arrays whose
 *   rows carry `cells`, each cell exposing `layout.textAnchor`.
 * @param {string} text - Full document text that the cell anchors index into.
 */
const printTableInfo = (table, text) => {
  // Renders one row as `"cell" | "cell"`; a missing row renders as ''.
  const formatRow = row => {
    const cells = (row && row.cells) || [];
    return cells
      .map(cell => JSON.stringify(getText(cell.layout.textAnchor, text).trim()))
      .join(' | ');
  };

  const headerRows = table.headerRows || [];
  const bodyRows = table.bodyRows || [];

  // Print header row
  console.log(`Collumns: ${formatRow(headerRows[0])}`);
  // Print first body row
  console.log(`First row data: ${formatRow(bodyRows[0])}`);
};
/**
 * Extracts the text referenced by a text anchor from the full document text.
 *
 * An anchor may reference several segments of the text; every segment is
 * read and the pieces are concatenated in order.
 *
 * @param {object} textAnchor - Anchor with an optional `textSegments` array
 *   of `{startIndex, endIndex}` offsets into `text`.
 * @param {string} text - Full document text the offsets refer to.
 * @returns {string} The referenced text, or '' when the anchor is missing or
 *   has no segments.
 */
const getText = (textAnchor, text) => {
  if (!textAnchor || !textAnchor.textSegments) {
    return '';
  }
  return textAnchor.textSegments
    .map(segment =>
      // First shard in document doesn't have startIndex property
      text.substring(segment.startIndex || 0, segment.endIndex)
    )
    .join('');
};
// [END documentai_process_form_document]
await processDocument();
}