def enrich_with_embeddings()

in Autogen_v0.4/rag_agent/search_helper.py [0:0]


def enrich_with_embeddings(output_file_name):
    """Add embedding vectors to every document in a JSON file and save a copy.

    Reads a JSON array of documents from ``output_file_name``, embeds each
    document's ``title``, ``content``, ``category`` and ``tags`` values with
    ``openai_helper.generate_embeddings`` in batches, stores the results on
    the documents as ``titleVector``, ``contentVector``, ``categoryVector``
    and ``tagsVector``, and writes the enriched array to
    ``f"{output_file_name}_with_vectors.json"``.

    Args:
        output_file_name: Path of the JSON file to read. The same string,
            unchanged (extension included), is the prefix of the output
            file name.

    Raises:
        KeyError: If a document lacks one of the four source fields.
        ValueError: If the embedding calls yield fewer results than there
            are documents in a batch.
    """
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, which is not UTF-8 on every system.
    with open(output_file_name, "r", encoding="utf-8") as f:
        aml_index_data = json.load(f)

    # Pull out every field up front so a malformed document fails before any
    # embedding request is sent.
    titles = [doc["title"] for doc in aml_index_data]
    content = [doc["content"] for doc in aml_index_data]
    categories = [doc["category"] for doc in aml_index_data]
    tags = [doc["tags"] for doc in aml_index_data]

    batch_size = 500

    for i in range(0, len(aml_index_data), batch_size):
        batch_end = i + batch_size
        print(f"Processing batch: {i}")
        title_embeddings = openai_helper.generate_embeddings(
            titles[i:batch_end],
            dimensions=azure_openai_embedding_small_dimensions,
            model=azure_openai_embedding__small_deployment,
        )
        content_embeddings = openai_helper.generate_embeddings(
            content[i:batch_end],
            dimensions=azure_openai_embedding_large_dimensions,
            model=azure_openai_embedding__large_deployment,
        )
        category_embeddings = openai_helper.generate_embeddings(
            categories[i:batch_end],
            dimensions=azure_openai_embedding_small_dimensions,
            model=azure_openai_embedding__small_deployment,
        )
        tags_embeddings = openai_helper.generate_embeddings(
            tags[i:batch_end],
            dimensions=azure_openai_embedding_small_dimensions,
            model=azure_openai_embedding__small_deployment,
        )

        # Pair results with this batch's own documents (the slice shares the
        # document objects with aml_index_data) so surplus results can never
        # spill into documents outside the batch.
        batch_docs = aml_index_data[i:batch_end]
        filled = 0
        for doc, title_emb, content_emb, category_emb, tag_emb in zip(
            batch_docs,
            title_embeddings,
            content_embeddings,
            category_embeddings,
            tags_embeddings,
        ):
            doc["titleVector"] = title_emb.embedding
            doc["contentVector"] = content_emb.embedding
            doc["categoryVector"] = category_emb.embedding
            doc["tagsVector"] = tag_emb.embedding
            filled += 1

        # zip() stops at the shortest input; without this guard a short
        # response would leave tail documents without vectors unnoticed.
        if filled != len(batch_docs):
            raise ValueError(
                f"Embedding count mismatch in batch starting at {i}: "
                f"expected {len(batch_docs)}, got {filled}"
            )

        print(f"Embeddings generated for batch: {i}")

    vector_file_name = f"{output_file_name}_with_vectors.json"

    with open(vector_file_name, "w", encoding="utf-8") as f:
        json.dump(aml_index_data, f)