# demo-python/code/vector-quantization-and-storage/lib/embeddings.py
import json
import os

from azure.core.credentials import AzureKeyCredential
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.search.documents import SearchClient
from dotenv import load_dotenv
from openai import AzureOpenAI
# Paths are resolved relative to this file so the module works from any working
# directory: data_path holds the source documents, content_path the cached
# chunk/vector export.
file_directory = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(file_directory, "..", "..", "..", "..", "data", "benefitdocs")
content_path = os.path.join(file_directory, "..", "..", "..", "..", "data", "documentVectors.json")
# You can create the index used by this function with the integrated vectorization notebook.
def load_chunks_from_index():
    """Export chunk text and vectors from the search index to documentVectors.json."""
    load_dotenv(override=True)
    endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
    key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
    index = os.getenv("AZURE_SEARCH_INDEX")
    search_client = SearchClient(endpoint, index, AzureKeyCredential(key))

    # A match-all query ("*") returns up to the first 1000 chunks. The vector
    # field must be listed in "select" or it is omitted from the results.
    content = []
    for doc in search_client.search(search_text="*", top=1000, select="chunk_id,chunk,title,vector"):
        content.append({
            "id": doc["chunk_id"],
            "chunk": doc["chunk"],
            "title": doc["title"],
            "embedding": doc["vector"],
        })

    with open(content_path, "w") as f:
        json.dump(content, f)
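
# For reference, documentVectors.json ends up as a JSON array of records shaped
# like the sketch below (illustrative values only, not from the actual data):
#   [{"id": "<chunk_id>", "chunk": "<chunk text>",
#     "title": "<source document>", "embedding": [0.01, -0.02, ...]}, ...]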

# Optionally re-embed all the chunks. This can be used to test a different
# embedding model, such as text-embedding-3-large.
def create_embeddings():
    """Regenerate the embedding for every chunk in documentVectors.json."""
    load_dotenv(override=True)
    openai_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    openai_key = os.getenv("AZURE_OPENAI_KEY")
    embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
    embedding_dimensions = int(os.getenv("AZURE_OPENAI_EMBEDDING_DIMENSIONS", "1536"))
    api_version = os.getenv("AZURE_OPENAI_API_VERSION")

    # Use the API key when one is provided; otherwise authenticate with
    # Microsoft Entra ID through DefaultAzureCredential.
    openai_credential = DefaultAzureCredential()
    token_provider = get_bearer_token_provider(openai_credential, "https://cognitiveservices.azure.com/.default")
    client = AzureOpenAI(
        api_version=api_version,
        azure_endpoint=openai_endpoint,
        api_key=openai_key,
        azure_ad_token_provider=token_provider if not openai_key else None,
    )
    with open(content_path, "r") as f:
        content = json.load(f)

    # Embed every chunk in a single request. The dimensions parameter is only
    # honored by models that support it (the text-embedding-3-* family); very
    # large chunk sets would need batching to stay under the API's per-request
    # input limit.
    chunks = [c["chunk"] for c in content]
    embedding_response = client.embeddings.create(input=chunks, model=embedding_deployment, dimensions=embedding_dimensions)
    embeddings = [item.embedding for item in embedding_response.data]

    # Overwrite the stored vectors with the newly generated ones.
    for item, embedding in zip(content, embeddings):
        item["embedding"] = embedding

    with open(content_path, "w") as f:
        json.dump(content, f)
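
# A minimal usage sketch (an assumption, not part of the original demo):
# export chunks and vectors from the index, then optionally re-embed them.
# Assumes the environment variables referenced above are set (e.g. in .env).
if __name__ == "__main__":
    load_chunks_from_index()
    create_embeddings()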