def main()

in gemini/sample-apps/llamaindex-rag/backend/indexing/run_parse_embed_index.py
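main() depends on module-level imports, a logger, and configuration constants defined earlier in the file. Below is a minimal sketch of that preamble: the google-cloud and llama-index import paths are the standard ones for these classes, but the environment-variable names and defaults, and the commented-out repo-local helper imports, are illustrative assumptions rather than the file's actual code.

import logging
import os

from google.cloud import aiplatform
from llama_index.core import Document, Settings
from llama_index.embeddings.vertex import VertexTextEmbedding
from llama_index.llms.vertex import Vertex
from llama_index.storage.docstore.firestore import FirestoreDocumentStore
from llama_index.vector_stores.vertexaivectorsearch import VertexAIVectorStore

# Repo-local helpers referenced below; module paths are assumed:
# from docai_parser import DocAIParser
# from indexing_utils import (
#     get_or_create_existing_index, create_pdf_blob_list,
#     download_bucket_with_transfer_manager, create_qa_index,
#     create_hierarchical_index, create_flat_index,
# )

logger = logging.getLogger(__name__)

# Configuration constants used by main(); shown here as env lookups for
# illustration (names and defaults are assumptions).
PROJECT_ID = os.environ["PROJECT_ID"]
LOCATION = os.environ.get("LOCATION", "us-central1")
DOCAI_LOCATION = os.environ.get("DOCAI_LOCATION", "us")
DOCAI_PROCESSOR_ID = os.environ["DOCAI_PROCESSOR_ID"]
VECTOR_INDEX_NAME = os.environ["VECTOR_INDEX_NAME"]
INDEX_ENDPOINT_NAME = os.environ["INDEX_ENDPOINT_NAME"]
APPROXIMATE_NEIGHBORS_COUNT = int(os.environ.get("APPROXIMATE_NEIGHBORS_COUNT", "100"))
DOCSTORE_BUCKET_NAME = os.environ["DOCSTORE_BUCKET_NAME"]
VECTOR_DATA_PREFIX = os.environ.get("VECTOR_DATA_PREFIX", "vector_data")
FIRESTORE_DB_NAME = os.environ["FIRESTORE_DB_NAME"]
FIRESTORE_NAMESPACE = os.environ.get("FIRESTORE_NAMESPACE", "default")
EMBEDDINGS_MODEL_NAME = os.environ["EMBEDDINGS_MODEL_NAME"]
INPUT_BUCKET_NAME = os.environ["INPUT_BUCKET_NAME"]
BUCKET_PREFIX = os.environ.get("BUCKET_PREFIX", "")
CHUNK_SIZE = int(os.environ.get("CHUNK_SIZE", "512"))
QA_INDEX_NAME = os.environ.get("QA_INDEX_NAME")
QA_ENDPOINT_NAME = os.environ.get("QA_ENDPOINT_NAME")
INDEXING_METHOD = os.environ.get("INDEXING_METHOD", "hierarchical")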


def main():
    """Main parsing, embedding and indexing logic for data living in GCS"""
    # Initialize Vertex AI and create index and endpoint
    aiplatform.init(project=PROJECT_ID, location=LOCATION)

    # Creating Vector Search Index
    vs_index, vs_endpoint = get_or_create_existing_index(
        VECTOR_INDEX_NAME, INDEX_ENDPOINT_NAME, APPROXIMATE_NEIGHBORS_COUNT
    )

    # Vertex AI Vector Search Vector DB and Firestore Docstore
    vector_store = VertexAIVectorStore(
        project_id=PROJECT_ID,
        region=LOCATION,
        index_id=vs_index.name,  # short resource ID (.name), not the full path (.resource_name)
        endpoint_id=vs_endpoint.name,  # short resource ID (.name), not the full path (.resource_name)
        gcs_bucket_name=DOCSTORE_BUCKET_NAME,
    )

    docstore = FirestoreDocumentStore.from_database(
        project=PROJECT_ID, database=FIRESTORE_DB_NAME, namespace=FIRESTORE_NAMESPACE
    )

    # Setup embedding model and LLM
    embed_model = VertexTextEmbedding(
        model_name=EMBEDDINGS_MODEL_NAME, project=PROJECT_ID, location=LOCATION
    )
    llm = Vertex(model="gemini-2.0-flash", temperature=0.0)
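    # Register the LLM and embedding model as global LlamaIndex defaults so
    # downstream indexing components use them without explicit wiring.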
    Settings.llm = llm
    Settings.embed_model = embed_model

    # Initialize Document AI parser
    gcs_output_path = f"gs://{DOCSTORE_BUCKET_NAME}/{VECTOR_DATA_PREFIX}/docai_output/"

    parser = DocAIParser(
        project_id=PROJECT_ID,
        location=DOCAI_LOCATION,
        processor_name=f"projects/{PROJECT_ID}/locations/{DOCAI_LOCATION}/processors/{DOCAI_PROCESSOR_ID}",  # noqa: E501
        gcs_output_path=gcs_output_path,
    )

    # Download data from specified bucket and parse
    local_data_path = os.path.join("/tmp", BUCKET_PREFIX)
    os.makedirs(local_data_path, exist_ok=True)
    blobs = create_pdf_blob_list(INPUT_BUCKET_NAME, BUCKET_PREFIX)
    logger.info("downloading data")
    download_bucket_with_transfer_manager(
        INPUT_BUCKET_NAME, prefix=BUCKET_PREFIX, destination_directory=local_data_path
    )

    # Parse documents using Document AI
    try:
        parsed_docs, raw_results = parser.batch_parse(
            blobs, chunk_size=CHUNK_SIZE, include_ancestor_headings=True
        )
        logger.info("Number of documents parsed by Document AI: %d", len(parsed_docs))
        if parsed_docs:
            logger.info(
                "First parsed document text (first 100 chars): %s...",
                parsed_docs[0].text[:100],
            )
        else:
            logger.warning("No documents were parsed by Document AI.")

        # Log raw results for debugging
        logger.info("Raw results:")
        for result in raw_results:
            logger.info("  Source: %s", result.source_path)
            logger.info("  Parsed: %s", result.parsed_path)
    except Exception as e:
        logger.error("Error parsing documents with Document AI: %s", e)
        parsed_docs = []
        raw_results = []

    # Turn each parsed document into a llamaindex Document
    li_docs = [Document(text=doc.text, metadata=doc.metadata) for doc in parsed_docs]

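    # Optionally build a separate question-answer index when a QA index or
    # endpoint name is configured.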
    if QA_INDEX_NAME or QA_ENDPOINT_NAME:
        create_qa_index(li_docs, docstore, embed_model, llm)

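    # Build the main index: "hierarchical" parses docs into parent/child node
    # levels (typically paired with auto-merging retrieval), while "flat"
    # chunks at a single granularity.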
    if INDEXING_METHOD == "hierarchical":
        create_hierarchical_index(li_docs, docstore, vector_store, embed_model, llm)

    elif INDEXING_METHOD == "flat":
        create_flat_index(li_docs, docstore, vector_store, embed_model, llm)
    else:
        logger.warning(
            "Unknown INDEXING_METHOD %r; no main index was built.", INDEXING_METHOD
        )
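
The file presumably ends with a standard entry-point guard so the pipeline can run as a script; a minimal sketch:

if __name__ == "__main__":
    main()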