in gemini/sample-apps/llamaindex-rag/backend/indexing/run_parse_embed_index.py [0:0]
def main():
    """Main parsing, embedding, and indexing logic for data living in GCS."""
    # Initialize the Vertex AI SDK
    aiplatform.init(project=PROJECT_ID, location=LOCATION)
    # Create the Vector Search index and endpoint (or fetch existing ones)
    vs_index, vs_endpoint = get_or_create_existing_index(
        VECTOR_INDEX_NAME, INDEX_ENDPOINT_NAME, APPROXIMATE_NEIGHBORS_COUNT
    )
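    # get_or_create_existing_index is a project-local helper; it presumably
    # looks the index and endpoint up by display name and creates them only
    # if missing, with APPROXIMATE_NEIGHBORS_COUNT configuring the index's
    # approximate nearest-neighbor search.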
    # Vertex AI Vector Search vector DB and Firestore docstore
    vector_store = VertexAIVectorStore(
        project_id=PROJECT_ID,
        region=LOCATION,
        index_id=vs_index.name,  # Use .name instead of .resource_name
        endpoint_id=vs_endpoint.name,  # Use .name instead of .resource_name
        gcs_bucket_name=DOCSTORE_BUCKET_NAME,
    )
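    # Vector Search stores only vectors and IDs; gcs_bucket_name is presumably
    # the staging bucket the store uses when writing embeddings into the index.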
    docstore = FirestoreDocumentStore.from_database(
        project=PROJECT_ID, database=FIRESTORE_DB_NAME, namespace=FIRESTORE_NAMESPACE
    )
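    # Firestore acts as the document store: node text and metadata live here,
    # so IDs retrieved from Vector Search can be resolved back to full
    # documents at query time.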
    # Set up the embedding model and LLM
    embed_model = VertexTextEmbedding(
        model_name=EMBEDDINGS_MODEL_NAME, project=PROJECT_ID, location=LOCATION
    )
    llm = Vertex(model="gemini-2.0-flash", temperature=0.0)
    Settings.llm = llm
    Settings.embed_model = embed_model
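    # Settings holds LlamaIndex's global defaults; components created below
    # fall back to these models when none are passed explicitly.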
    # Initialize the Document AI parser
    gcs_output_path = f"gs://{DOCSTORE_BUCKET_NAME}/{VECTOR_DATA_PREFIX}/docai_output/"
    parser = DocAIParser(
        project_id=PROJECT_ID,
        location=DOCAI_LOCATION,
        processor_name=f"projects/{PROJECT_ID}/locations/{DOCAI_LOCATION}/processors/{DOCAI_PROCESSOR_ID}",  # noqa: E501
        gcs_output_path=gcs_output_path,
    )
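    # processor_name is the full Document AI processor resource path; batch
    # parsing writes the processor's JSON output under gcs_output_path, which
    # the parser presumably reads back to build chunked documents.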
    # Download data from the specified bucket and parse it
    local_data_path = os.path.join("/tmp", BUCKET_PREFIX)
    os.makedirs(local_data_path, exist_ok=True)
    blobs = create_pdf_blob_list(INPUT_BUCKET_NAME, BUCKET_PREFIX)
    logger.info("Downloading data from gs://%s/%s", INPUT_BUCKET_NAME, BUCKET_PREFIX)
    download_bucket_with_transfer_manager(
        INPUT_BUCKET_NAME, prefix=BUCKET_PREFIX, destination_directory=local_data_path
    )
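    # The helper's name suggests it wraps google.cloud.storage's
    # transfer_manager to download the blobs concurrently into local_data_path.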
    # Parse documents using Document AI
    try:
        parsed_docs, raw_results = parser.batch_parse(
            blobs, chunk_size=CHUNK_SIZE, include_ancestor_headings=True
        )
        print(f"Number of documents parsed by Document AI: {len(parsed_docs)}")
        if parsed_docs:
            print(
                f"First parsed document text (first 100 chars): {parsed_docs[0].text[:100]}..."  # noqa: E501
            )
        else:
            print("No documents were parsed by Document AI.")
        # Print raw results for debugging
        print("Raw results:")
        for result in raw_results:
            print(f"  Source: {result.source_path}")
            print(f"  Parsed: {result.parsed_path}")
    except Exception as e:
        print(f"Error during Document AI batch parsing: {e}")
        parsed_docs = []
        raw_results = []
    # Turn each parsed document into a LlamaIndex Document
    li_docs = [Document(text=doc.text, metadata=doc.metadata) for doc in parsed_docs]
    if QA_INDEX_NAME or QA_ENDPOINT_NAME:
        create_qa_index(li_docs, docstore, embed_model, llm)
    if INDEXING_METHOD == "hierarchical":
        create_hierarchical_index(li_docs, docstore, vector_store, embed_model, llm)
    elif INDEXING_METHOD == "flat":
        create_flat_index(li_docs, docstore, vector_store, embed_model, llm)
    else:
        # Guard against a misconfigured INDEXING_METHOD rather than silently doing nothing
        logger.warning("Unknown INDEXING_METHOD %r; no index was built", INDEXING_METHOD)
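
# Entry-point guard, assumed here so the script can be run directly
# (python run_parse_embed_index.py); the full file may already define it.
if __name__ == "__main__":
    main()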