async function batchProcessDocument()

in document-ai/batch-process-document.js [50:144]


  /**
   * Submits a Document AI batch-processing request for a single PDF, waits for
   * the long-running operation to finish, then downloads every result file
   * found under the output prefix and prints the paragraphs and form
   * key/value pairs of each page.
   *
   * Reads `projectId`, `location`, `processorId`, `gcsInputUri`,
   * `gcsOutputUri`, `gcsOutputUriPrefix`, `client`, `storage` and `PQueue`
   * from the enclosing scope.
   *
   * @returns {Promise<void>} Settles once all queued download tasks are done.
   */
  async function batchProcessDocument() {
    const name = `projects/${projectId}/locations/${location}/processors/${processorId}`;

    // Configure the batch process request.
    const request = {
      name,
      inputDocuments: {
        gcsDocuments: {
          documents: [
            {
              gcsUri: gcsInputUri,
              mimeType: 'application/pdf',
            },
          ],
        },
      },
      documentOutputConfig: {
        gcsOutputConfig: {
          gcsUri: `${gcsOutputUri}/${gcsOutputUriPrefix}/`,
        },
      },
    };

    // Batch process document using a long-running operation.
    // You can wait for now, or get results later.
    // Note: first request to the service takes longer than subsequent
    // requests.
    const [operation] = await client.batchProcessDocuments(request);

    // Wait for operation to complete.
    await operation.promise();
    console.log('Document processing complete.');

    // Query Storage bucket for the results file(s).
    const query = {
      prefix: gcsOutputUriPrefix,
    };

    console.log('Fetching results ...');

    // List all of the files in the Storage bucket
    const [files] = await storage.bucket(gcsOutputUri).getFiles(query);

    // Add all asynchronous downloads to queue for execution.
    const queue = new PQueue({concurrency: 15});
    const tasks = files.map((fileInfo, index) => async () => {
      // Get the file as a buffer
      const [file] = await fileInfo.download();

      console.log(`Fetched file #${index + 1}:`);

      // The results stored in the output Storage location
      // are formatted as a document object.
      const document = JSON.parse(file.toString());

      // Any of these fields may be missing from the parsed JSON; fall back to
      // empty values so a sparse result file does not abort the task.
      const text = document.text || '';
      const pages = document.pages || [];

      // Extract the text referenced by an anchor. An anchor can reference
      // several segments of the text field, so every segment is read and the
      // pieces are concatenated in order.
      const getText = textAnchor => {
        const segments = (textAnchor && textAnchor.textSegments) || [];

        return segments
          .map(segment =>
            // First shard in document doesn't have startIndex property
            text.substring(segment.startIndex || 0, segment.endIndex)
          )
          .join('');
      };

      // Tolerates a missing layout object (e.g. a form field without a value).
      const getLayoutText = layout => getText(layout && layout.textAnchor);

      // Read the text recognition output from the processor
      console.log('The document contains the following paragraphs:');

      for (const page of pages) {
        for (const paragraph of page.paragraphs || []) {
          const paragraphText = getLayoutText(paragraph.layout);
          console.log(`Paragraph text:\n${paragraphText}`);
        }
      }

      // Form parsing provides additional output about
      // form-formatted PDFs. You must create a form
      // processor in the Cloud Console to see full field details.
      console.log('\nThe following form key/value pairs were detected:');

      for (const page of pages) {
        for (const field of page.formFields || []) {
          const fieldName = getLayoutText(field.fieldName);
          const fieldValue = getLayoutText(field.fieldValue);

          console.log('Extracted key value pair:');
          console.log(`\t(${fieldName}, ${fieldValue})`);
        }
      }
    });
    await queue.addAll(tasks);
  }