def gen_documents()

in data/sample-documents-indexing.py [0:0]


def gen_documents(path: str) -> List[Dict[str, any]]:
    openai_service_endoint = os.environ["AZURE_OPENAI_ENDPOINT"]
    openai_deployment = "text-embedding-ada-002"

    token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")
    client = AzureOpenAI(
        api_version="2023-07-01-preview",
        azure_endpoint=openai_service_endoint,
        azure_deployment=openai_deployment,
        azure_ad_token_provider=token_provider
    )

    documents = pd.read_csv(path)
    items = []
    for document in documents.to_dict("records"):
        content = document["description"]
        id = str(document["id"])
        title = document["name"]
        url = document["url"]
        emb = client.embeddings.create(input=content, model=openai_deployment)
        rec = {
            "id": id,
            "content": content,
            "filepath": f"{title.lower().replace(' ', '-')}",
            "title": title,
            "url": url,
            "contentVector": emb.data[0].embedding,
        }
        items.append(rec)

    return items