in appdev_genai_googlecloud/src/genai-app-firestore/main.py [0:0]
def register_doc():
    """Embed an uploaded PDF and store its chunks in Firestore.

    Parses the incoming HTTP request into an event whose data carries the
    ``contentType``, ``bucket`` and ``name`` of a Cloud Storage object. The
    object name is expected to look like ``<user_id>/<filename>``; the first
    21 characters of the filename are used as the file ID.

    For a PDF object, the file is loaded, split into chunks, every chunk is
    embedded with ``embed_texts``, and the results are written to
    ``users/<user_id>/embeddings``. Finally the ``embedded`` field of
    ``users/<user_id>/items/<file_id>`` is set to ``True``.

    Returns:
        A ``(message, 204)`` tuple. Objects that are not PDFs, and objects
        whose name is not in ``<user_id>/<filename>`` form, are skipped
        without raising.
    """
    # Read metadata from Pub/Sub event
    event = from_http(request.headers, request.get_data())
    event_id = event.get("id")
    data = event.data
    content_type, bucket_name, name = data["contentType"], data["bucket"], data["name"]

    # Split the object name once. Both leading segments are used as Firestore
    # document IDs further down, so skip the object (as is done for non-PDF
    # files) instead of failing with an IndexError or an empty ID.
    name_parts = name.split("/")
    if len(name_parts) < 2 or not name_parts[0] or not name_parts[1]:
        app.logger.warning(
            f"{event_id}: skipping registering a doc since the object name "
            f"is not in <user_id>/<filename> form: {name}"
        )
        return ("The object name is not in <user_id>/<filename> form", 204)
    user_id, filename = name_parts[0], name_parts[1]
    file_id = filename[:21]
    _, ext = os.path.splitext(name)
    app.logger.info(f"{event_id}: start registring a doc: {file_id}")

    # Return when the file is not a PDF file
    if content_type != "application/pdf" or ext.lower() != ".pdf":
        app.logger.info(f"{event_id}: skipping registring a doc since the file is not PDF format: {file_id}")
        return ("The file is not a PDF file", 204)

    # Load PDF on Cloud Storage
    loader = GCSFileLoader(project_name=PROJECT_ID, bucket=bucket_name, blob=name, loader_func=load_pdf)
    documents = loader.load()

    # Split PDF
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n", "。"],
        chunk_size=SPLITTER_CHUNK_SIZE,
        chunk_overlap=SPLITTER_CHUNK_OVERLAP,
        length_function=len,
    )
    pages = text_splitter.split_documents(documents)
    app.logger.info(f"{event_id}: identified {len(pages)} pages on {file_id}")

    # Generate embeddings using page content
    pages_content = [page.page_content for page in pages]
    app.logger.info(f"{event_id}: start embedding {len(pages_content)} texts")
    content_embeddings = embed_texts(pages_content)
    app.logger.info(f"{event_id}: finished embedding {len(pages_content)} texts")

    # Transform data to Firestore format
    docs = [
        {
            "text": page.page_content,
            "source": filename,
            "page": page.metadata['page'],
            "embedding": Vector(content_embedding),
        }
        for page, content_embedding in zip(pages, content_embeddings)
    ]

    # Store result on Firestore
    # Batch write 500 documents at once to make sure it won't exceed API limit size (10MB)
    app.logger.info(f"{event_id}: start storing embeddings into Firestore")
    batch_size = 500
    # The target collection is the same for every chunk, so resolve it once.
    embeddings_ref = db.collection('users').document(user_id).collection("embeddings")
    for start in range(0, len(docs), batch_size):
        batch = db.batch()
        for doc in docs[start:start + batch_size]:
            # document() with no argument yields a new reference per chunk.
            batch.set(embeddings_ref.document(), doc)
        batch.commit()
    app.logger.info(f"{event_id}: finished storing embeddings into Firestore")

    # Update the embedded flag to True
    app.logger.info(f"{event_id}: start updating embedding flag to True")
    item_ref = db.collection("users").document(user_id).collection("items").document(file_id)
    item_ref.update({"embedded": True})
    app.logger.info(f"{event_id}: finished updating embedding flag to True")

    app.logger.info(f"{event_id}: finished registring a doc: {file_id}")
    return ("Successfully registered", 204)