in data/sample-documents-indexing.py [0:0]
# NOTE(review): the return annotation uses the builtin function `any`;
# `typing.Any` was probably intended -- confirm and fix alongside the imports.
def gen_documents(path: str) -> List[Dict[str, any]]:
    """Turn the rows of a CSV file into records that carry an embedding.

    The CSV at ``path`` is loaded with pandas. For each row, the
    ``description`` value is sent to the ``text-embedding-ada-002``
    deployment of the Azure OpenAI endpoint named by the
    ``AZURE_OPENAI_ENDPOINT`` environment variable, authenticating with a
    bearer token obtained through ``DefaultAzureCredential``.

    Args:
        path: Location of the CSV file; it is read for the columns
            ``id``, ``name``, ``description`` and ``url``.

    Returns:
        One dict per CSV row, in file order, with the keys ``id`` (the row
        id converted to ``str``), ``content`` (the description),
        ``filepath`` (the name lower-cased with spaces replaced by
        hyphens), ``title`` (the name), ``url`` and ``contentVector`` (the
        first embedding in the service response).

    Raises:
        KeyError: If ``AZURE_OPENAI_ENDPOINT`` is not set, or a row lacks
            one of the columns listed above.
    """
    endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
    embedding_deployment = "text-embedding-ada-002"
    bearer_tokens = get_bearer_token_provider(
        DefaultAzureCredential(),
        "https://cognitiveservices.azure.com/.default",
    )
    embeddings_client = AzureOpenAI(
        api_version="2023-07-01-preview",
        azure_endpoint=endpoint,
        azure_deployment=embedding_deployment,
        azure_ad_token_provider=bearer_tokens,
    )

    def to_record(row):
        # Build the output record for one CSV row (one embedding request).
        text = row["description"]
        doc_id = str(row["id"])
        name = row["name"]
        link = row["url"]
        response = embeddings_client.embeddings.create(
            input=text,
            model=embedding_deployment,
        )
        # The slug is derived only after the embedding request has returned.
        slug = name.lower().replace(" ", "-")
        return {
            "id": doc_id,
            "content": text,
            "filepath": f"{slug}",
            "title": name,
            "url": link,
            "contentVector": response.data[0].embedding,
        }

    frame = pd.read_csv(path)
    return [to_record(row) for row in frame.to_dict("records")]