paper_summarization/paper_summarization.ipynb

{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "5b6bbbb6-f773-4a8d-8947-6dd645771f4f", "metadata": {}, "outputs": [], "source": [ "# Install a pip package in the current Jupyter kernel\n", "import sys\n", "\n", "!{sys.executable} -m pip install --upgrade pip\n", "!{sys.executable} -m pip install --upgrade -r /home/jupyter/paper_summarization/requirements.txt" ] }, { "cell_type": "code", "execution_count": null, "id": "67cc5ed5-180d-47b2-b3b3-40331ed18be7", "metadata": {}, "outputs": [], "source": [ "import os\n", "import threading\n", "from typing import Dict, List, Sequence, Set, Tuple\n", "\n", "import numpy as np\n", "import pandas as pd\n", "import wget" ] }, { "cell_type": "markdown", "id": "c764c130-d48c-48fe-a5f3-8d4930124b64", "metadata": { "tags": [] }, "source": [ "# Data Cleanup\n", "- Dataset is incomplete, some PDFs of Papers don't have companion summaries and some Summaries don't have PDFs\n", "- Use Sets to find non-matches, download any PDFs that we can get that have summaries, archive the rest" ] }, { "cell_type": "code", "execution_count": null, "id": "7a5012bf-84c6-47ae-86c2-22569e0b3d18", "metadata": {}, "outputs": [], "source": [ "def load_files_into_set(directory: str, target_extension: str) -> Set:\n", " output_files = set()\n", "\n", " for file in next(os.walk(directory), (None, None, []))[2]:\n", " # Filename without extension\n", " filename, extension = os.path.splitext(file)\n", "\n", " if target_extension in extension:\n", " output_files.add(filename)\n", "\n", " return output_files\n", "\n", "\n", "def find_matches(pdf_directory: str, txt_directory: str) -> Tuple[Set, Set, Set]:\n", " pdf_files = load_files_into_set(pdf_directory, \".pdf\")\n", " txt_files = load_files_into_set(txt_directory, \".txt\")\n", "\n", " matches = pdf_files & txt_files\n", " non_matches_pdf = pdf_files - txt_files\n", " non_matches_txt = txt_files - pdf_files\n", "\n", " return matches, non_matches_pdf, non_matches_txt\n", "\n", "\n", "def filter_non_matches(\n", " non_matches: Set, input_directory: str, output_directory: str, extension: str\n", "):\n", " for file in non_matches:\n", " os.rename(\n", " f\"{input_directory}/{file}{extension}\",\n", " f\"{output_directory}/{file}{extension}\",\n", " )" ] }, { "cell_type": "code", "execution_count": null, "id": "ededb615-8397-4e5b-a484-a88c15f475ac", "metadata": {}, "outputs": [], "source": [ "data_directory = \"/home/jupyter/paper_summarization/data\"\n", "pdf_directory = f\"{data_directory}/pdf\"\n", "txt_directory = f\"{data_directory}/summary_txt\"\n", "# json_directory = f\"{data_directory}/json\"\n", "\n", "cache_file = f\"{data_directory}/cache.txt\"\n", "sorted_json_directory = f\"{data_directory}/json\"\n", "unsharded_json_directory = f\"{data_directory}/unsharded_json\"\n", "\n", "full_txt_directory = f\"{data_directory}/full_txt\"\n", "\n", "non_match_directory = f\"{data_directory}/non_match\"\n", "non_match_pdf_directory = f\"{non_match_directory}/pdf\"\n", "non_match_txt_directory = f\"{non_match_directory}/summary_txt\"\n", "\n", "acl_site = \"https://aclanthology.org\"" ] }, { "cell_type": "code", "execution_count": null, "id": "7ee8a873-acc3-4503-8d41-425ffd63f50f", "metadata": {}, "outputs": [], "source": [ "matches, non_matches_pdf, non_matches_txt = find_matches(pdf_directory, txt_directory)\n", "\n", "# Download PDFs of files with summaries but no paper\n", "for file in non_matches_txt:\n", " url = f\"{acl_site}/{file}.pdf\"\n", " path = f\"{pdf_directory}/{file}.pdf\"\n", " try:\n", " # 
wget.download(url, path)\n", " print(f\"\\nDownloading {url}\")\n", " download_thread = threading.Thread(target=wget.download, args=(url, path))\n", " download_thread.start()\n", " except Exception as e:\n", " print(f\"Unable to download {url}\")\n", " continue\n", "\n", "# After downloading pdf matches, search for missing matches and sort out\n", "matches, non_matches_pdf, non_matches_txt = find_matches(pdf_directory, txt_directory)\n", "\n", "filter_non_matches(non_matches_pdf, pdf_directory, non_match_pdf_directory, \".pdf\")\n", "filter_non_matches(non_matches_txt, txt_directory, non_match_txt_directory, \".txt\")" ] }, { "cell_type": "markdown", "id": "c58efb1c-c71b-4e6d-bacb-517bb9441900", "metadata": {}, "source": [ "## Upload Files to GCS" ] }, { "cell_type": "code", "execution_count": null, "id": "2058a7fd-61d0-4e71-a355-f0472e8d2289", "metadata": {}, "outputs": [], "source": [ "!gsutil -m cp -r -n {txt_directory}/*.txt gs://cloud-samples-data/documentai/ScisummNet/summary_txt/\n", "!gsutil -m cp -r -n {pdf_directory}/*.pdf gs://cloud-samples-data/documentai/ScisummNet/pdf/" ] }, { "cell_type": "markdown", "id": "14447297-d0ae-48cc-b7b2-815caf2ec69f", "metadata": { "tags": [] }, "source": [ "# Send Documents to Document AI for OCR Processing" ] }, { "cell_type": "code", "execution_count": null, "id": "f8bb28c2-df1b-4979-b5ec-d2f2b0332ecc", "metadata": {}, "outputs": [], "source": [ "from time import sleep\n", "\n", "from google.api_core.client_options import ClientOptions\n", "from google.api_core.operation import Operation\n", "from google.cloud.documentai import BatchDocumentsInputConfig\n", "from google.cloud.documentai import BatchProcessMetadata\n", "from google.cloud.documentai import BatchProcessRequest\n", "from google.cloud.documentai import Document\n", "from google.cloud.documentai import DocumentOutputConfig\n", "from google.cloud.documentai import DocumentProcessorServiceClient\n", "from google.cloud.documentai import GcsDocument\n", "from google.cloud.documentai import GcsDocuments\n", "from google.cloud.documentai import GcsPrefix\n", "from google.cloud.documentai import Processor\n", "from google.cloud.documentai import ProcessorType\n", "from google.cloud.documentai import ProcessRequest\n", "from google.cloud.documentai import RawDocument\n", "from google.cloud.storage import Blob\n", "from google.cloud.storage import Client\n", "from google.protobuf.json_format import ParseError\n", "\n", "# See https://cloud.google.com/document-ai/docs/file-types\n", "PDF_MIME_TYPE = \"application/pdf\"\n", "JSON_MIME_TYPE = \"application/json\"\n", "\n", "ACCEPTED_MIME_TYPES = set(\n", " {\n", " PDF_MIME_TYPE,\n", " \"image/jpeg\",\n", " \"image/png\",\n", " \"image/tiff\",\n", " \"image/gif\",\n", " \"image/bmp\",\n", " \"image/webp\",\n", " }\n", ")\n", "\n", "# Based on https://cloud.google.com/document-ai/quotas\n", "BATCH_MAX_FILES = 50\n", "BATCH_MAX_REQUESTS = 5\n", "\n", "SKIP_HUMAN_REVIEW = True\n", "TIMEOUT = 200\n", "\n", "CONFIDENCE_THRESHOLD = 0.5\n", "\n", "GCS_INPUT_BUCKET = \"cloud-samples-data\"\n", "GCS_INPUT_PREFIX = \"documentai/ScisummNet/pdf\"\n", "\n", "GCS_OUTPUT_BUCKET = \"holtskinner-test-datasets\"\n", "GCS_OUTPUT_PREFIX = \"ScisummNet/output\"\n", "\n", "DOCAI_PROJECT_ID = \"908687846511\"\n", "DOCAI_LOCATION = \"us\"\n", "DOCAI_PROCESSOR_DISPLAY_NAME = \"Paper OCR Processor\"\n", "DOCAI_PROCESSOR_TYPE = \"OCR_PROCESSOR\"\n", "\n", "CLIENT_OPTIONS = ClientOptions(\n", " api_endpoint=f\"{DOCAI_LOCATION}-documentai.googleapis.com\"\n", ")" ] }, { 
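The constants cell above imports `ProcessRequest` and `RawDocument`, which the batch workflow never touches. As a quick sanity check before launching a batch run, a single short PDF can be sent through the synchronous `process_document` API instead. The sketch below is a minimal illustration, not part of the batch workflow: it assumes a processor already exists, and the `process_single_pdf` helper name, the `<PROCESSOR_ID>` placeholder, and the sample file name are hypothetical. Online requests also allow far fewer pages per document than batch processing, so this only suits short samples.

```python
# Sketch only: synchronous single-document smoke test, reusing the constants
# and imports from the cell above. The processor ID and sample PDF path are
# placeholders (assumptions), not values taken from this notebook.
def process_single_pdf(processor_name: str, pdf_path: str) -> Document:
    """Run one local PDF through the processor and return the parsed Document."""
    docai_client = DocumentProcessorServiceClient(client_options=CLIENT_OPTIONS)

    with open(pdf_path, "rb") as pdf_file:
        raw_document = RawDocument(content=pdf_file.read(), mime_type=PDF_MIME_TYPE)

    request = ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
        skip_human_review=SKIP_HUMAN_REVIEW,
    )
    return docai_client.process_document(request=request).document


# Hypothetical usage, once a processor has been created further down:
# document = process_single_pdf(
#     f"projects/{DOCAI_PROJECT_ID}/locations/{DOCAI_LOCATION}/processors/<PROCESSOR_ID>",
#     f"{pdf_directory}/example.pdf",
# )
# print(document.text[:500])
```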
"cell_type": "markdown", "id": "26875518", "metadata": {}, "source": [ "## Prepare Data in GCS\n", "- Document AI Batch Processing can read/write documents in GCS\n", "- Quotas Limit the amount of Simultaneous Batch Processing Calls that can be made and the amount of documents per call" ] }, { "cell_type": "code", "execution_count": null, "id": "8cb72636", "metadata": {}, "outputs": [], "source": [ "def create_gcs_uri(bucket_name: str, object_name: str) -> str:\n", " \"\"\"\n", " Create GCS URI\n", " \"\"\"\n", " return f\"gs://{bucket_name}/{object_name}\"\n", "\n", "\n", "def file_exists(blob_name: str, existing_files: Set) -> bool:\n", " basename = os.path.basename(blob_name)\n", " file_name = os.path.splitext(basename)[0]\n", " return file_name in existing_files\n", "\n", "\n", "def load_existing_files() -> Set:\n", " with open(cache_file, \"r\") as file:\n", " lines = file.readlines()\n", " existing_files = set([line.rstrip() for line in lines])\n", " return existing_files\n", "\n", "\n", "def create_batches(\n", " input_bucket: str,\n", " input_prefix: str,\n", " batch_size: int = BATCH_MAX_FILES,\n", ") -> List[List[GcsDocument]]:\n", " \"\"\"\n", " Create batches of documents to process\n", " \"\"\"\n", " if batch_size > BATCH_MAX_FILES:\n", " raise ValueError(\n", " f\"Batch size must be less than {BATCH_MAX_FILES}. \"\n", " f\"You provided {batch_size}\"\n", " )\n", "\n", " storage_client = Client()\n", " blob_list = storage_client.list_blobs(input_bucket, prefix=input_prefix)\n", "\n", " batches: List[List[GcsDocument]] = []\n", " batch: List[GcsDocument] = []\n", "\n", " existing_files = load_existing_files()\n", "\n", " for blob in blob_list:\n", " if blob.content_type not in ACCEPTED_MIME_TYPES:\n", " print(f\"Invalid Mime Type {blob.content_type} - Skipping file {blob.name}\")\n", " continue\n", "\n", " if file_exists(blob.name, existing_files):\n", " # print(f\"Skipping File {blob.name} - Already processed\")\n", " continue\n", "\n", " if len(batch) == batch_size:\n", " batches.append(batch)\n", " batch = []\n", "\n", " batch.append(\n", " GcsDocument(\n", " gcs_uri=create_gcs_uri(input_bucket, blob.name),\n", " mime_type=blob.content_type,\n", " )\n", " )\n", "\n", " batches.append(batch)\n", " return batches\n", "\n", "\n", "def add_batch_to_cache_file(batch: List[GcsDocument]):\n", " \"\"\"\n", " Add list of processed files to a persistent list in txt format to prevent re-processing\n", " \"\"\"\n", " with open(cache_file, \"a\") as f:\n", " for file in batch:\n", " basename = os.path.basename(file.gcs_uri)\n", " file_name = os.path.splitext(basename)[0]\n", " f.write(f\"{file_name}\\n\")" ] }, { "cell_type": "markdown", "id": "7aee8d88", "metadata": {}, "source": [ "## Create Processor" ] }, { "cell_type": "code", "execution_count": null, "id": "265198b2", "metadata": {}, "outputs": [], "source": [ "def create_processor(\n", " project_id: str, location: str, processor_display_name: str, processor_type: str\n", ") -> Processor:\n", " docai_client = DocumentProcessorServiceClient(client_options=CLIENT_OPTIONS)\n", "\n", " # The full resource name of the location\n", " # e.g.: projects/project_id/locations/location\n", " parent = docai_client.common_location_path(project_id, location)\n", "\n", " # Create a processor\n", " processor = docai_client.create_processor(\n", " parent=parent,\n", " processor=Processor(display_name=processor_display_name, type_=processor_type),\n", " )\n", " return processor" ] }, { "cell_type": "markdown", "id": "b4ee9242", "metadata": {}, 
"source": [ "## Batch Process Documents with Document AI" ] }, { "cell_type": "code", "execution_count": null, "id": "c255212c", "metadata": {}, "outputs": [], "source": [ "def batch_process(\n", " processor_name: str,\n", " document_batch: List[GcsDocument],\n", " gcs_output_uri: str,\n", " skip_human_review: bool = SKIP_HUMAN_REVIEW,\n", ") -> Operation:\n", " \"\"\"\n", " Calls Batch Process Method with a list of GCS URIs\n", " Internal Method for constructing Batch Process Requests\n", " Returns Batch Process Metadata Operation\n", " \"\"\"\n", " docai_client = DocumentProcessorServiceClient(client_options=CLIENT_OPTIONS)\n", "\n", " # Load GCS Input URI into a List of document files\n", " input_config = BatchDocumentsInputConfig(\n", " gcs_documents=GcsDocuments(documents=document_batch)\n", " )\n", "\n", " # Specify Output GCS Bucket\n", " output_config = DocumentOutputConfig(\n", " gcs_output_config=DocumentOutputConfig.GcsOutputConfig(gcs_uri=gcs_output_uri)\n", " )\n", "\n", " request = BatchProcessRequest(\n", " name=processor_name,\n", " input_documents=input_config,\n", " document_output_config=output_config,\n", " skip_human_review=skip_human_review,\n", " )\n", "\n", " return docai_client.batch_process_documents(request)\n", "\n", "\n", "def batch_process_directory(\n", " processor_name: str,\n", " gcs_input_bucket: str,\n", " gcs_input_prefix: str,\n", " gcs_output_bucket: str,\n", " gcs_output_prefix: str,\n", "):\n", " \"\"\"\n", " Load documents from GCS\n", " Create Batches\n", " Call BatchProcessMethod\n", " \"\"\"\n", " batches = create_batches(gcs_input_bucket, gcs_input_prefix)\n", " total_batches = len(batches)\n", "\n", " gcs_output_uri = create_gcs_uri(gcs_output_bucket, gcs_output_prefix)\n", "\n", " for i, batch in enumerate(batches):\n", " if len(batch) <= 0:\n", " continue\n", "\n", " print(f\"Processing batch {i + 1}/{total_batches}: {len(batch)} documents\")\n", "\n", " add_batch_to_cache_file(batch)\n", "\n", " operation = batch_process(processor_name, batch, gcs_output_uri)\n", "\n", " print(f\"Operation: {operation.operation.name}\")\n", "\n", " if i % BATCH_MAX_REQUESTS == BATCH_MAX_REQUESTS - 1 and i < total_batches - 1:\n", " # Wait for Operation to complete before proceeding due to quotas...\n", " print(f\"Waiting...\")\n", " sleep(TIMEOUT)\n", " # operation.result(timeout=TIMEOUT)" ] }, { "cell_type": "markdown", "id": "78cfa09e", "metadata": {}, "source": [ "## Run Processing Workflow" ] }, { "cell_type": "code", "execution_count": null, "id": "0a3a577c", "metadata": {}, "outputs": [], "source": [ "# Create Processor\n", "processor = create_processor(\n", " DOCAI_PROJECT_ID, DOCAI_LOCATION, DOCAI_PROCESSOR_DISPLAY_NAME, DOCAI_PROCESSOR_TYPE\n", ")\n", "processor_name = processor.name\n", "print(f\"Created Processor {processor_name}\")\n", "\n", "# Process Full Directory of Documents\n", "batch_process_directory(\n", " processor.name,\n", " GCS_INPUT_BUCKET,\n", " GCS_INPUT_PREFIX,\n", " GCS_OUTPUT_BUCKET,\n", " GCS_OUTPUT_PREFIX,\n", ")" ] }, { "cell_type": "markdown", "id": "377a302a", "metadata": { "tags": [] }, "source": [ "# Post-Processing" ] }, { "cell_type": "markdown", "id": "ae77aac3", "metadata": {}, "source": [ "## Download Processed Files" ] }, { "cell_type": "code", "execution_count": null, "id": "961a4999", "metadata": {}, "outputs": [], "source": [ "!gsutil -m cp -r -n {create_gcs_uri(GCS_OUTPUT_BUCKET, GCS_OUTPUT_PREFIX)}/* {sorted_json_directory}/" ] }, { "cell_type": "markdown", "id": "4ed2be9d", "metadata": { "tags": [] }, 
"source": [ "## Extract from `Document.json`\n", "- Combine \"Sharded\" Document.json files\n", "- Extract Document Text and save to full_txt/document.txt" ] }, { "cell_type": "code", "execution_count": null, "id": "2875ed64", "metadata": {}, "outputs": [], "source": [ "import threading\n", "\n", "\n", "def extract_text(root: str, files: List, output_directory: str):\n", " file_shards = sorted(files)\n", " document_name = file_shards[0].replace(\"-0.json\", \".txt\")\n", " output_file = f\"{output_directory}/{document_name}\"\n", "\n", " if os.path.exists(output_file):\n", " print(f\"Skipping {document_name}\")\n", " return\n", "\n", " print(f\"Writing {document_name}\")\n", "\n", " document_text_shards: List[str] = []\n", "\n", " for file in file_shards:\n", " if file.endswith(\".json\"):\n", " file_path = os.path.join(root, file)\n", "\n", " with open(file_path, \"r\") as doc_json:\n", " document = Document.from_json(\n", " doc_json.read(), ignore_unknown_fields=True\n", " )\n", " document_text_shards.append(document.text)\n", "\n", " full_document_text = \"\".join(document_text_shards)\n", "\n", " with open(output_file, \"w\") as full_txt_file:\n", " full_txt_file.write(full_document_text)\n", "\n", " return" ] }, { "cell_type": "code", "execution_count": null, "id": "f20c71d5-7765-4b1c-98ca-03de595c9493", "metadata": {}, "outputs": [], "source": [ "# Extract raw Text from all Document.json files and store as document.txt\n", "for root, dirs, files in os.walk(sorted_json_directory):\n", " if len(dirs) != 0:\n", " continue\n", "\n", " shard_thread = threading.Thread(\n", " target=extract_text, args=(root, files, f\"{full_txt_directory}\")\n", " )\n", " shard_thread.start()" ] }, { "cell_type": "code", "execution_count": null, "id": "791c1342-b3a5-4371-8d45-aff30b8f2fef", "metadata": {}, "outputs": [], "source": [ "!gsutil -m cp -r -n {full_txt_directory}/*.txt gs://cloud-samples-data/documentai/ScisummNet/full_txt/" ] }, { "cell_type": "markdown", "id": "7d4f4f0d", "metadata": { "tags": [] }, "source": [ "## Print OCR Data\n", "- Print OCR Data including tokens, lines, paragraphs from a Document object" ] }, { "cell_type": "code", "execution_count": null, "id": "43d02611", "metadata": {}, "outputs": [], "source": [ "def print_page_dimensions(dimension: Document.Page.Dimension) -> None:\n", " print(f\" Width: {str(dimension.width)}\")\n", " print(f\" Height: {str(dimension.height)}\")\n", "\n", "\n", "def print_detected_langauges(\n", " detected_languages: Sequence[Document.Page.DetectedLanguage],\n", ") -> None:\n", " print(\" Detected languages:\")\n", " for lang in detected_languages:\n", " code = lang.language_code\n", " print(f\" {code} ({lang.confidence:.1%} confidence)\")\n", "\n", "\n", "def print_paragraphs(paragraphs: Sequence[Document.Page.Paragraph], text: str) -> None:\n", " print(f\" {len(paragraphs)} paragraphs detected:\")\n", " first_paragraph_text = layout_to_text(paragraphs[0].layout, text)\n", " print(f\" First paragraph text: {repr(first_paragraph_text)}\")\n", " last_paragraph_text = layout_to_text(paragraphs[-1].layout, text)\n", " print(f\" Last paragraph text: {repr(last_paragraph_text)}\")\n", "\n", "\n", "def print_blocks(blocks: Sequence[Document.Page.Block], text: str) -> None:\n", " print(f\" {len(blocks)} blocks detected:\")\n", " first_block_text = layout_to_text(blocks[0].layout, text)\n", " print(f\" First text block: {repr(first_block_text)}\")\n", " last_block_text = layout_to_text(blocks[-1].layout, text)\n", " print(f\" Last text block: 
{repr(last_block_text)}\")\n", "\n", "\n", "def print_lines(lines: Sequence[Document.Page.Line], text: str) -> None:\n", " print(f\" {len(lines)} lines detected:\")\n", " first_line_text = layout_to_text(lines[0].layout, text)\n", " print(f\" First line text: {repr(first_line_text)}\")\n", " last_line_text = layout_to_text(lines[-1].layout, text)\n", " print(f\" Last line text: {repr(last_line_text)}\")\n", "\n", "\n", "def print_tokens(tokens: Sequence[Document.Page.Token], text: str) -> None:\n", " print(f\" {len(tokens)} tokens detected:\")\n", " first_token_text = layout_to_text(tokens[0].layout, text)\n", " first_token_break_type = tokens[0].detected_break.type_.name\n", " print(f\" First token text: {repr(first_token_text)}\")\n", " print(f\" First token break type: {repr(first_token_break_type)}\")\n", " last_token_text = layout_to_text(tokens[-1].layout, text)\n", " last_token_break_type = tokens[-1].detected_break.type_.name\n", " print(f\" Last token text: {repr(last_token_text)}\")\n", " print(f\" Last token break type: {repr(last_token_break_type)}\")\n", "\n", "\n", "def layout_to_text(layout: Document.Page.Layout, text: str) -> str:\n", " \"\"\"\n", " Document AI identifies text in different parts of the document by their\n", " offsets in the entirety of the document's text. This function converts\n", " offsets to a string.\n", " \"\"\"\n", " response = \"\"\n", " # If a text segment spans several lines, it will\n", " # be stored in different text segments.\n", " for segment in layout.text_anchor.text_segments:\n", " start_index = int(segment.start_index)\n", " end_index = int(segment.end_index)\n", " response += text[start_index:end_index]\n", " return response\n", "\n", "\n", "def print_document_ocr_data(document: Document):\n", " text = document.text\n", " print(f\"Full document text: {text}\\n\")\n", " print(f\"There are {len(document.pages)} page(s) in this document.\\n\")\n", "\n", " for page in document.pages:\n", " print(f\"Page {page.page_number}:\")\n", " print_page_dimensions(page.dimension)\n", " print_detected_langauges(page.detected_languages)\n", " print_paragraphs(page.paragraphs, text)\n", " print_blocks(page.blocks, text)\n", " print_lines(page.lines, text)\n", " print_tokens(page.tokens, text)" ] }, { "cell_type": "code", "execution_count": null, "id": "82a993b0", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "c6854686-68b1-4c26-884c-ea010247ea1e", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "environment": { "kernel": "python3", "name": "common-cpu.m95", "type": "gcloud", "uri": "gcr.io/deeplearning-platform-release/base-cpu:m95" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.12" }, "vscode": { "interpreter": { "hash": "7e1998ff7f8aa20ada591c520b972326324e5ea05489af9e422744c7c09f6dad" } } }, "nbformat": 4, "nbformat_minor": 5 }
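The two trailing empty code cells leave `print_document_ocr_data` unexercised. A minimal usage sketch, assuming the batch output has already been copied into `sorted_json_directory` by the `gsutil` step above: walk that directory the same way the shard-combining loop does, load the first `Document` JSON shard found, and print its OCR structure. The `sample_shard_path` name is introduced purely for illustration.

```python
# Sketch only: load one processed Document JSON shard and inspect its OCR data.
sample_shard_path = None
for root, _, files in os.walk(sorted_json_directory):
    for file in sorted(files):
        if file.endswith(".json"):
            sample_shard_path = os.path.join(root, file)
            break
    if sample_shard_path:
        break

if sample_shard_path:
    with open(sample_shard_path, "r") as doc_json:
        sample_document = Document.from_json(doc_json.read(), ignore_unknown_fields=True)
    print_document_ocr_data(sample_document)
else:
    print(f"No Document JSON shards found in {sorted_json_directory}")
```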