in gemini/sample-apps/llamaindex-rag/backend/indexing/docai_parser.py [0:0]
def _parse_from_results(self, results: list["DocAIParsingResults"]):  # noqa: F821
    """Build LlamaIndex ``Document`` objects from Document AI batch results.

    For each parsing result, lists the JSON output blobs under the
    result's ``parsed_path`` GCS prefix and converts every chunk of the
    ``chunkedDocument`` structure into one ``Document``.

    Args:
        results: Parsing results, each exposing ``source_path`` (the input
            GCS URI) and ``parsed_path`` (GCS prefix holding JSON output).

    Returns:
        list[Document]: One document per chunk across all results. Results
        with an empty or malformed ``parsed_path`` and blobs that fail to
        download/parse are skipped with a printed warning (best-effort).
    """
    documents = []
    storage_client = storage.Client()
    for result in results:
        print(
            f"Processing result: source_path={result.source_path}, "
            f"parsed_path={result.parsed_path}"
        )
        if not result.parsed_path:
            print(
                "Warning: Empty parsed_path for source "
                f"{result.source_path}. Skipping."
            )
            continue
        try:
            # Strip only the leading scheme: str.replace("gs://", "") would
            # also delete a "gs://" occurring later in the path.
            bucket_name, prefix = result.parsed_path.removeprefix("gs://").split(
                "/", 1
            )
        except ValueError:
            print(
                f"Error: Invalid parsed_path format for {result.source_path}. Skipping."
            )
            continue
        bucket = storage_client.bucket(bucket_name)
        blobs = list(bucket.list_blobs(prefix=prefix))
        print(f"Found {len(blobs)} blobs in {result.parsed_path}")
        for blob in blobs:
            if not blob.name.endswith(".json"):
                continue
            print(f"Processing JSON blob: {blob.name}")
            try:
                documents.extend(
                    self._documents_from_blob(blob, result.source_path)
                )
            # Broad on purpose: one bad blob must not abort the whole batch.
            except Exception as e:
                print(f"Error processing blob {blob.name}: {str(e)}")
    print(f"Total documents created: {len(documents)}")
    return documents

def _documents_from_blob(self, blob, source_path):
    """Parse one Document AI JSON output blob into ``Document`` objects.

    Returns an empty list (with a printed warning) when the expected
    ``chunkedDocument.chunks`` structure is absent. Download and JSON
    errors propagate to the caller, which skips the blob.
    """
    doc_data = json.loads(blob.download_as_text())
    if (
        "chunkedDocument" in doc_data
        and "chunks" in doc_data["chunkedDocument"]
    ):
        return [
            Document(
                text=chunk["content"],
                metadata={
                    "chunk_id": chunk["chunkId"],
                    "source": source_path,
                },
            )
            for chunk in doc_data["chunkedDocument"]["chunks"]
        ]
    print(
        "Warning: Expected 'chunkedDocument' "
        f"structure not found in {blob.name}"
    )
    return []