in Autogen_v0.4/rag_agent/search_helper.py [0:0]
def enrich_with_embeddings(output_file_name, batch_size=500):
    """Add embedding vectors to every document of a JSON index file.

    Reads a JSON array of documents from ``output_file_name``, embeds each
    document's ``title``, ``content``, ``category`` and ``tags`` values in
    batches through ``openai_helper.generate_embeddings``, stores the results
    on the documents under ``titleVector``, ``contentVector``,
    ``categoryVector`` and ``tagsVector``, and finally writes the enriched
    list to ``f"{output_file_name}_with_vectors.json"``.

    ``content`` is embedded with the "large" deployment/dimensions settings;
    the other three fields use the "small" ones. Those settings and
    ``openai_helper`` are module-level names defined outside this function.

    Args:
        output_file_name: Path of the JSON file to read. The output path is
            this value with ``_with_vectors.json`` appended.
        batch_size: Maximum number of documents passed to each
            ``generate_embeddings`` call. Defaults to 500.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a document lacks one of the four source fields. This is
            raised before any embedding request is made.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # JSON interchange is UTF-8; do not depend on the platform's default
    # encoding, which differs on e.g. Windows.
    with open(output_file_name, "r", encoding="utf-8") as f:
        aml_index_data = json.load(f)

    # Extract all fields up front so that a malformed document fails fast,
    # before any embedding request has been issued.
    titles = [doc["title"] for doc in aml_index_data]
    content = [doc["content"] for doc in aml_index_data]
    categories = [doc["category"] for doc in aml_index_data]
    tags = [doc["tags"] for doc in aml_index_data]

    for i in range(0, len(aml_index_data), batch_size):
        print(f"Processing batch: {i}")
        batch = slice(i, i + batch_size)

        title_embeddings = openai_helper.generate_embeddings(
            titles[batch],
            dimensions=azure_openai_embedding_small_dimensions,
            model=azure_openai_embedding__small_deployment,
        )
        content_embeddings = openai_helper.generate_embeddings(
            content[batch],
            dimensions=azure_openai_embedding_large_dimensions,
            model=azure_openai_embedding__large_deployment,
        )
        category_embeddings = openai_helper.generate_embeddings(
            categories[batch],
            dimensions=azure_openai_embedding_small_dimensions,
            model=azure_openai_embedding__small_deployment,
        )
        tags_embeddings = openai_helper.generate_embeddings(
            tags[batch],
            dimensions=azure_openai_embedding_small_dimensions,
            model=azure_openai_embedding__small_deployment,
        )

        # NOTE(review): assumes each call yields one result per input, in
        # input order; zip() stops at the shortest sequence, so a short
        # response would leave trailing documents without vectors - confirm
        # against openai_helper.generate_embeddings.
        for doc, title_emb, content_emb, category_emb, tag_emb in zip(
            aml_index_data[batch],
            title_embeddings,
            content_embeddings,
            category_embeddings,
            tags_embeddings,
        ):
            doc["titleVector"] = title_emb.embedding
            doc["contentVector"] = content_emb.embedding
            doc["categoryVector"] = category_emb.embedding
            doc["tagsVector"] = tag_emb.embedding

        print(f"Embeddings generated for batch: {i}")

    vector_file_name = f"{output_file_name}_with_vectors.json"
    with open(vector_file_name, "w", encoding="utf-8") as f:
        json.dump(aml_index_data, f)