def register_doc()

in appdev_genai_googlecloud/src/genai-app-firestore/main.py [0:0]


def register_doc():
    """Embed an uploaded PDF and store its chunks in Firestore.

    The incoming HTTP request is parsed as an event with ``from_http`` and
    ``contentType``, ``bucket`` and ``name`` are read from its data. The
    object name is expected to look like ``<user_id>/<filename>``; the first
    21 characters of the filename are used as the item document ID
    (presumably the ID assigned at upload time -- confirm against the
    uploader).

    For a PDF object the handler:

    1. loads the file from Cloud Storage,
    2. splits it into chunks with ``RecursiveCharacterTextSplitter``,
    3. embeds every chunk with ``embed_texts``,
    4. writes one document per chunk to ``users/{user_id}/embeddings``,
    5. sets ``embedded`` to ``True`` on ``users/{user_id}/items/{file_id}``.

    Returns:
        A ``(message, status)`` tuple. The status is 204 both on success and
        when the object is skipped (not a PDF, or an object name without a
        ``<user_id>/`` prefix).
    """
    # Read metadata from Pub/Sub event
    event = from_http(request.headers, request.get_data())
    event_id = event.get("id")
    data = event.data
    content_type, bucket_name, name = data["contentType"], data["bucket"], data["name"]

    # An object without a "<user_id>/" prefix cannot be attributed to a user.
    # Skip it the same way non-PDF files are skipped instead of failing with
    # an IndexError (presumably a failed request would be redelivered by the
    # event source -- confirm against the trigger's retry settings).
    path_parts = name.split("/")
    if len(path_parts) < 2:
        app.logger.warning(f"{event_id}: skipping an object whose name is not <user_id>/<filename>: {name}")
        return ("The object name is not in <user_id>/<filename> format", 204)
    user_id, filename = path_parts[0], path_parts[1]
    file_id = filename[:21]
    _, ext = os.path.splitext(name)

    app.logger.info(f"{event_id}: start registring a doc: {file_id}")

    # Return when the file is not a PDF file
    if content_type != "application/pdf" or ext.lower() != ".pdf":
        app.logger.info(f"{event_id}: skipping registring a doc since the file is not PDF format: {file_id}")
        return ("The file is not a PDF file", 204)

    # Load PDF on Cloud Storage
    loader = GCSFileLoader(project_name=PROJECT_ID, bucket=bucket_name, blob=name, loader_func=load_pdf)
    documents = loader.load()

    # Split PDF
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n", "。"],
        chunk_size=SPLITTER_CHUNK_SIZE,
        chunk_overlap=SPLITTER_CHUNK_OVERLAP,
        length_function=len,
    )
    pages = text_splitter.split_documents(documents)
    app.logger.info(f"{event_id}: identified {len(pages)} pages on {file_id}")

    # Generate embeddings using page content
    pages_content = [page.page_content for page in pages]

    app.logger.info(f"{event_id}: start embedding {len(pages_content)} texts")
    content_embeddings = embed_texts(pages_content)
    app.logger.info(f"{event_id}: finished embedding {len(pages_content)} texts")

    # Transform data to Firestore format
    docs = [
        {
            "text": page.page_content,
            "source": filename,
            "page": page.metadata['page'],
            "embedding": Vector(content_embedding),
        }
        for page, content_embedding in zip(pages, content_embeddings)
    ]

    # Store result on Firestore
    # Batch write 500 documents at once to make sure it won't exceed API limit size (10MB)
    app.logger.info(f"{event_id}: start storing embeddings into Firestore")
    batch_size = 500
    # The target collection is the same for every chunk, so resolve it once.
    embeddings_ref = db.collection('users').document(user_id).collection("embeddings")
    for start in range(0, len(docs), batch_size):
        batch = db.batch()
        for doc in docs[start:start + batch_size]:
            # document() without an ID creates a reference with an auto-generated ID.
            batch.set(embeddings_ref.document(), doc)
        batch.commit()
    app.logger.info(f"{event_id}: finished storing embeddings into Firestore")

    # Update the embedded flag to True
    app.logger.info(f"{event_id}: start updating embedding flag to True")
    doc_ref = db.collection("users").document(user_id).collection("items").document(file_id)
    doc_ref.update({"embedded": True})
    app.logger.info(f"{event_id}: finished updating embedding flag to True")

    app.logger.info(f"{event_id}: finished registring a doc: {file_id}")

    return ("Successfully registered", 204)