# demo-python/code/community-integration/ragas/lib/utils.py
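"""Utilities for the Ragas evaluation demo: upload sample benefit documents to
Azure Blob Storage, build an Azure AI Search integrated vectorization pipeline
(data source, index, skillset, indexer), and construct the LangChain wrappers
(Azure OpenAI models and AzureSearch vector store) used for evaluation."""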
from importlib import reload
import langchain_community.vectorstores.azuresearch
from typing import Callable, Optional, List
import os
from azure.search.documents.indexes import SearchIndexerClient, SearchIndexClient
from azure.search.documents.indexes.models import (
SearchIndexerDataContainer,
SearchIndexerDataSourceConnection,
SearchField,
SearchFieldDataType,
VectorSearch,
HnswAlgorithmConfiguration,
HnswParameters,
VectorSearchProfile,
AzureOpenAIVectorizer,
AzureOpenAIParameters,
SearchIndex,
NativeBlobSoftDeleteDeletionDetectionPolicy,
SemanticConfiguration,
SemanticSearch,
SemanticPrioritizedFields,
SemanticField,
SplitSkill,
InputFieldMappingEntry,
OutputFieldMappingEntry,
AzureOpenAIEmbeddingSkill,
SearchIndexerIndexProjections,
SearchIndexerIndexProjectionSelector,
SearchIndexerIndexProjectionsParameters,
IndexProjectionMode,
SearchIndexerSkillset,
SearchIndexer,
FieldMapping,
IndexingSchedule
)
from datetime import timedelta
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.storage.blob import BlobServiceClient
import glob


def upload_sample_documents(
blob_connection_string: str,
blob_container_name: str,
use_user_identity: bool = True
):
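    """Upload the sample benefit PDFs to the given blob container, creating
    the container if needed and skipping blobs that already exist."""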
    # Connect to Blob Storage, authenticating with the signed-in identity when
    # requested; otherwise credentials must be embedded in the connection string.
    blob_service_client = BlobServiceClient.from_connection_string(
        conn_str=blob_connection_string,
        credential=DefaultAzureCredential() if use_user_identity else None
    )
    container_client = blob_service_client.get_container_client(blob_container_name)
if not container_client.exists():
container_client.create_container()
documents_directory = os.path.join("..", "..", "..", "..", "data", "benefitdocs")
pdf_files = glob.glob(os.path.join(documents_directory, '*.pdf'))
for file in pdf_files:
with open(file, "rb") as data:
name = os.path.basename(file)
if not container_client.get_blob_client(name).exists():
container_client.upload_blob(name=name, data=data)


def create_sample_datasource(
indexer_client: SearchIndexerClient,
blob_container_name: str,
index_name: str,
search_blob_connection_string: str):
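    """Create (or update) a blob data source connection, with native soft-delete
    deletion detection, for the indexer to pull documents from."""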
# Create a data source
container = SearchIndexerDataContainer(name=blob_container_name)
data_source_connection = SearchIndexerDataSourceConnection(
name=f"{index_name}-blob",
type="azureblob",
connection_string=search_blob_connection_string,
container=container,
data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy()
)
return indexer_client.create_or_update_data_source_connection(data_source_connection)


def create_sample_index(
index_client: SearchIndexClient,
index_name: str,
azure_openai_endpoint: str,
azure_openai_ada002_embedding_deployment: str,
azure_openai_3_large_embedding_deployment: str,
azure_openai_key: Optional[str] = None
):
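    """Create (or update) a chunked search index with two vector fields
    (ada-002 at 1536 dimensions, 3-large at 3072), HNSW profiles with Azure
    OpenAI vectorizers, and a semantic configuration over the chunk field."""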
# Create a search index
fields = [
SearchField(name="parent_id", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
SearchField(name="title", type=SearchFieldDataType.String),
SearchField(name="chunk_id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True, analyzer_name="keyword"),
SearchField(name="chunk", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False),
SearchField(name="vector_ada002", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=1536, vector_search_profile_name="hnsw_ada002"),
SearchField(name="vector_3_large", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=3072, vector_search_profile_name="hnsw_3_large")
]
# Configure the vector search configuration
vector_search = VectorSearch(
algorithms=[
HnswAlgorithmConfiguration(
name="hnsw",
parameters=HnswParameters()
)
],
profiles=[
VectorSearchProfile(
name="hnsw_ada002",
algorithm_configuration_name="hnsw",
vectorizer="ada002",
),
VectorSearchProfile(
name="hnsw_3_large",
algorithm_configuration_name="hnsw",
vectorizer="3_large",
),
],
vectorizers=[
AzureOpenAIVectorizer(
name="ada002",
kind="azureOpenAI",
azure_open_ai_parameters=AzureOpenAIParameters(
resource_uri=azure_openai_endpoint,
deployment_id=azure_openai_ada002_embedding_deployment,
api_key=azure_openai_key,
model_name="text-embedding-ada-002"
)
),
AzureOpenAIVectorizer(
name="3_large",
kind="azureOpenAI",
azure_open_ai_parameters=AzureOpenAIParameters(
resource_uri=azure_openai_endpoint,
deployment_id=azure_openai_3_large_embedding_deployment,
api_key=azure_openai_key,
model_name="text-embedding-3-large"
)
)
],
)
    semantic_config = SemanticConfiguration(
        name="my-semantic-config",
        prioritized_fields=SemanticPrioritizedFields(
            content_fields=[SemanticField(field_name="chunk")]
        )
    )
# Create the semantic search with the configuration
semantic_search = SemanticSearch(configurations=[semantic_config])
# Create the search index
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search, semantic_search=semantic_search)
return index_client.create_or_update_index(index)


def create_sample_skillset(
search_indexer_client: SearchIndexerClient,
index_name: str,
azure_openai_endpoint: str,
azure_openai_ada002_embedding_deployment: str,
azure_openai_3_large_embedding_deployment: str,
azure_openai_key: Optional[str] = None
):
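    """Create (or update) a skillset that splits documents into overlapping
    2000-character pages, embeds each page with both embedding models, and
    projects the chunks into the index as child documents."""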
# Create a skillset
skillset_name = f"{index_name}-skillset"
split_skill = SplitSkill(
description="Split skill to chunk documents",
text_split_mode="pages",
context="/document",
maximum_page_length=2000,
page_overlap_length=500,
inputs=[
InputFieldMappingEntry(name="text", source="/document/content"),
],
outputs=[
OutputFieldMappingEntry(name="textItems", target_name="pages")
],
)
embedding_ada_002_skill = AzureOpenAIEmbeddingSkill(
description="Skill to generate ada 002 embeddings via Azure OpenAI",
context="/document/pages/*",
resource_uri=azure_openai_endpoint,
deployment_id=azure_openai_ada002_embedding_deployment,
api_key=azure_openai_key,
model_name="text-embedding-ada-002",
inputs=[
InputFieldMappingEntry(name="text", source="/document/pages/*"),
],
outputs=[
OutputFieldMappingEntry(name="embedding", target_name="vector_ada002")
],
)
embedding_3_large_skill = AzureOpenAIEmbeddingSkill(
description="Skill to generate ada 002 embeddings via Azure OpenAI",
context="/document/pages/*",
resource_uri=azure_openai_endpoint,
deployment_id=azure_openai_3_large_embedding_deployment,
api_key=azure_openai_key,
model_name="text-embedding-3-large",
inputs=[
InputFieldMappingEntry(name="text", source="/document/pages/*"),
],
outputs=[
OutputFieldMappingEntry(name="embedding", target_name="vector_3_large")
],
)
index_projections = SearchIndexerIndexProjections(
selectors=[
SearchIndexerIndexProjectionSelector(
target_index_name=index_name,
parent_key_field_name="parent_id",
source_context="/document/pages/*",
mappings=[
InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
InputFieldMappingEntry(name="vector_ada002", source="/document/pages/*/vector_ada002"),
InputFieldMappingEntry(name="vector_3_large", source="/document/pages/*/vector_3_large"),
InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
],
),
],
parameters=SearchIndexerIndexProjectionsParameters(
projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
),
)
skillset = SearchIndexerSkillset(
name=skillset_name,
description="Skillset to chunk documents and generating embeddings",
skills=[split_skill, embedding_3_large_skill, embedding_ada_002_skill],
index_projections=index_projections,
)
return search_indexer_client.create_or_update_skillset(skillset)


def create_sample_indexer(
    search_indexer_client: SearchIndexerClient,
    index_name: str,
    skillset_name: str,
    datasource_name: str
):
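    """Create (or update) an indexer wired to the data source, skillset, and
    index, schedule it to run every 5 minutes, and trigger an initial run."""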
# Create an indexer
indexer_name = f"{index_name}-indexer"
indexer = SearchIndexer(
name=indexer_name,
description="Indexer to index documents and generate embeddings",
        skillset_name=skillset_name,
target_index_name=index_name,
data_source_name=datasource_name,
# Map the metadata_storage_name field to the title field in the index to display the PDF title in the search results
field_mappings=[FieldMapping(source_field_name="metadata_storage_name", target_field_name="title")],
schedule=IndexingSchedule(interval=timedelta(minutes=5))
)
indexer = search_indexer_client.create_or_update_indexer(indexer)
# Run the indexer
search_indexer_client.run_indexer(indexer_name)
return indexer


def create_langchain_azure_openai_wrappers(
azure_openai_api_version: str,
azure_openai_endpoint: str,
azure_openai_3_large_embedding_deployment: str,
azure_openai_ada002_embedding_deployment: str,
azure_openai_generator_deployment: str,
azure_openai_critic_deployment: str,
azure_openai_key: Optional[str] = None
):
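    """Build LangChain clients for the two embedding deployments plus the
    Ragas generator and critic LLMs, authenticating with the API key if given
    or with Entra ID tokens otherwise."""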
openai_credential = DefaultAzureCredential()
token_provider = get_bearer_token_provider(openai_credential, "https://cognitiveservices.azure.com/.default")
    # Use the API key if provided; otherwise fall back to RBAC (Entra ID)
    # authentication via the bearer token provider.
    azure_openai_args = {
        "openai_api_version": azure_openai_api_version,
        "azure_endpoint": azure_openai_endpoint,
        "api_key": azure_openai_key,
        "azure_ad_token_provider": token_provider if not azure_openai_key else None
    }
text_3_large_embeddings = AzureOpenAIEmbeddings(
azure_deployment=azure_openai_3_large_embedding_deployment,
**azure_openai_args
)
ada_002_embeddings = AzureOpenAIEmbeddings(
azure_deployment=azure_openai_ada002_embedding_deployment,
**azure_openai_args
)
generator_llm = AzureChatOpenAI(
azure_deployment=azure_openai_generator_deployment,
**azure_openai_args
)
critic_llm = AzureChatOpenAI(
azure_deployment=azure_openai_critic_deployment,
**azure_openai_args
)
return (text_3_large_embeddings, ada_002_embeddings, generator_llm, critic_llm)


def create_langchain_vectorstore(
azure_search_endpoint: str,
azure_search_key: str,
index_name: str,
    embedding_function: Callable[[str], List[float]],
search_type: str = "semantic_hybrid",
vector_field_name: Optional[str] = None):
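    """Create a LangChain AzureSearch vector store bound to the given vector
    field. The module is reloaded so the field-name environment variables take
    effect, since the integration reads them at import time."""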
os.environ["AZURESEARCH_FIELDS_CONTENT_VECTOR"] = vector_field_name
os.environ["AZURESEARCH_FIELDS_CONTENT"] = "chunk"
reload(langchain_community.vectorstores.azuresearch)
return langchain_community.vectorstores.azuresearch.AzureSearch(
azure_search_endpoint=azure_search_endpoint,
azure_search_key=azure_search_key,
index_name=index_name,
embedding_function=embedding_function,
search_type=search_type
)
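

# ---------------------------------------------------------------------------
# End-to-end usage sketch (illustrative only, not part of the original sample).
# The environment variable names, index/container names, deployment names, and
# API version below are assumptions chosen for this example; substitute your
# own values. It shows the intended call order: stage documents, build the
# pipeline, run the indexer, then query the index through LangChain.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from azure.core.credentials import AzureKeyCredential

    search_endpoint = os.environ["AZURE_SEARCH_ENDPOINT"]    # assumed env var
    search_key = os.environ["AZURE_SEARCH_KEY"]              # assumed env var
    openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]    # assumed env var
    credential = AzureKeyCredential(search_key)
    index_name = "ragas-sample"            # assumed index name
    container_name = "ragas-sample-docs"   # assumed container name

    # 1. Stage the sample PDFs in blob storage.
    upload_sample_documents(
        blob_connection_string=os.environ["BLOB_CONNECTION_STRING"],  # assumed env var
        blob_container_name=container_name,
    )

    # 2. Build the pipeline: data source -> index -> skillset -> indexer.
    indexer_client = SearchIndexerClient(endpoint=search_endpoint, credential=credential)
    index_client = SearchIndexClient(endpoint=search_endpoint, credential=credential)
    datasource = create_sample_datasource(
        indexer_client, container_name, index_name,
        os.environ["SEARCH_BLOB_CONNECTION_STRING"])                  # assumed env var
    create_sample_index(
        index_client, index_name, openai_endpoint,
        azure_openai_ada002_embedding_deployment="text-embedding-ada-002",
        azure_openai_3_large_embedding_deployment="text-embedding-3-large")
    skillset = create_sample_skillset(
        indexer_client, index_name, openai_endpoint,
        azure_openai_ada002_embedding_deployment="text-embedding-ada-002",
        azure_openai_3_large_embedding_deployment="text-embedding-3-large")
    create_sample_indexer(indexer_client, index_name, skillset.name, datasource.name)

    # 3. Query the populated index through LangChain.
    embeddings_3_large, _, _, _ = create_langchain_azure_openai_wrappers(
        azure_openai_api_version="2024-02-01",                        # assumed version
        azure_openai_endpoint=openai_endpoint,
        azure_openai_3_large_embedding_deployment="text-embedding-3-large",
        azure_openai_ada002_embedding_deployment="text-embedding-ada-002",
        azure_openai_generator_deployment="gpt-4o",                   # assumed deployment
        azure_openai_critic_deployment="gpt-4o")                      # assumed deployment
    store = create_langchain_vectorstore(
        search_endpoint, search_key, index_name,
        embedding_function=embeddings_3_large.embed_query,
        vector_field_name="vector_3_large")
    print(store.similarity_search("What is included in my health plan?", k=3))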