demo-python/code/data-chunking/lib/common.py (235 lines of code) (raw):
from azure.search.documents.indexes.models import (
SearchIndex,
SearchIndexer,
SearchIndexerDataSourceConnection,
SearchIndexerDataContainer,
SearchField,
SearchFieldDataType,
VectorSearch,
VectorSearchProfile,
HnswVectorSearchAlgorithmConfiguration,
AzureOpenAIEmbeddingSkill,
SplitSkill
)
# Required to use the preview SDK
from azure.search.documents.indexes._generated.models import (
SearchIndexerSkillset,
AzureOpenAIVectorizer,
AzureOpenAIParameters,
SearchIndexerIndexProjections,
SearchIndexerIndexProjectionSelector,
SearchIndexerIndexProjectionsParameters,
InputFieldMappingEntry,
OutputFieldMappingEntry
)
import tiktoken
import matplotlib.pyplot as plt
import math
import numpy as np
def create_search_index(index_name, azure_openai_endpoint, azure_openai_embedding_deployment_id, azure_openai_key=None):
    """Build the SearchIndex definition that stores chunks and their embeddings.

    The index has five fields: the ``chunk_id`` key (keyword analyzer),
    ``parent_id``, ``chunk``, ``title`` and a 1536-dimension ``vector`` field
    bound to the vector search profile named ``"profile"``. That profile pairs
    an HNSW algorithm configuration with an Azure OpenAI vectorizer.

    Args:
        index_name: Name given to the index.
        azure_openai_endpoint: Used as the vectorizer's ``resource_uri``.
        azure_openai_embedding_deployment_id: Used as the vectorizer's
            ``deployment_id``.
        azure_openai_key: Used as the vectorizer's ``api_key``; optional if
            using RBAC authentication.

    Returns:
        The constructed SearchIndex model. This function only builds the
        object; it does not call the search service.
    """
    # Names that tie the vector field, profile, algorithm and vectorizer together.
    profile_name = "profile"
    algorithm_name = "hnsw-algorithm"
    vectorizer_name = "azure-openai-vectorizer"

    def plain_text_field(field_name, filter_and_sort):
        # Searchable string field; filterable and sortable always share a value here.
        return SearchField(
            name=field_name,
            type=SearchFieldDataType.String,
            hidden=False,
            filterable=filter_and_sort,
            sortable=filter_and_sort,
            facetable=False,
            searchable=True,
        )

    key_field = SearchField(
        name="chunk_id",
        type=SearchFieldDataType.String,
        key=True,
        hidden=False,
        filterable=True,
        sortable=True,
        facetable=False,
        searchable=True,
        analyzer_name="keyword",
    )
    vector_field = SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        hidden=False,
        filterable=False,
        sortable=False,
        facetable=False,
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile=profile_name,
    )

    vectorizer = AzureOpenAIVectorizer(
        name=vectorizer_name,
        azure_open_ai_parameters=AzureOpenAIParameters(
            resource_uri=azure_openai_endpoint,
            deployment_id=azure_openai_embedding_deployment_id,
            api_key=azure_openai_key,  # Optional if using RBAC authentication
        ),
    )
    vector_search = VectorSearch(
        profiles=[
            VectorSearchProfile(
                name=profile_name,
                algorithm=algorithm_name,
                vectorizer=vectorizer_name,
            )
        ],
        algorithms=[HnswVectorSearchAlgorithmConfiguration(name=algorithm_name)],
        vectorizers=[vectorizer],
    )

    return SearchIndex(
        name=index_name,
        fields=[
            key_field,
            plain_text_field("parent_id", True),
            plain_text_field("chunk", False),
            plain_text_field("title", False),
            vector_field,
        ],
        vector_search=vector_search,
    )
def create_search_datasource(datasource_name, connection_string, container_name):
    """Build an ``azureblob`` data source connection definition.

    Args:
        datasource_name: Name given to the data source connection.
        connection_string: Connection string stored on the data source.
        container_name: Name of the container the data source points at.

    Returns:
        The constructed SearchIndexerDataSourceConnection model.
    """
    blob_container = SearchIndexerDataContainer(name=container_name)
    return SearchIndexerDataSourceConnection(
        name=datasource_name,
        type="azureblob",
        connection_string=connection_string,
        container=blob_container,
    )
def create_search_skillset(
        skillset_name,
        index_name,
        azure_openai_endpoint,
        azure_openai_embedding_deployment_id,
        azure_openai_key=None,
        text_split_mode='pages',
        maximum_page_length=2000,
        page_overlap_length=500):
    """Build a skillset that splits documents, embeds each piece and projects it.

    The skillset contains two skills: a SplitSkill that reads
    ``/document/content`` and writes its ``textItems`` output as ``pages``, and
    an AzureOpenAIEmbeddingSkill that runs on ``/document/pages/*`` and writes
    its ``embedding`` output as ``vector``. Index projections then map every
    page to the ``chunk``, ``vector`` and ``title`` fields of ``index_name``
    using the ``skipIndexingParentDocuments`` projection mode.

    Args:
        skillset_name: Name given to the skillset.
        index_name: Target index of the index projections.
        azure_openai_endpoint: Used as the embedding skill's ``resource_uri``.
        azure_openai_embedding_deployment_id: Used as the embedding skill's
            ``deployment_id``.
        azure_openai_key: Used as the embedding skill's ``api_key``; optional
            if using RBAC authentication.
        text_split_mode: Forwarded to the SplitSkill.
        maximum_page_length: Forwarded to the SplitSkill.
        page_overlap_length: Forwarded to the SplitSkill.

    Returns:
        The constructed SearchIndexerSkillset model.
    """
    # Enrichment-tree path shared by the embedding skill and the projections.
    each_page = "/document/pages/*"

    split_skill = SplitSkill(
        name="Text Splitter",
        default_language_code="en",
        text_split_mode=text_split_mode,
        maximum_page_length=maximum_page_length,
        page_overlap_length=page_overlap_length,
        context="/document",
        inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
        outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
    )

    embedding_skill = AzureOpenAIEmbeddingSkill(
        name="Embeddings",
        resource_uri=azure_openai_endpoint,
        deployment_id=azure_openai_embedding_deployment_id,
        api_key=azure_openai_key,  # Optional if using RBAC authentication
        context=each_page,
        inputs=[InputFieldMappingEntry(name="text", source=each_page)],
        outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
    )

    # One selector: each page becomes its own document in the target index.
    page_selector = SearchIndexerIndexProjectionSelector(
        target_index_name=index_name,
        parent_key_field_name="parent_id",
        source_context=each_page,
        mappings=[
            InputFieldMappingEntry(name="chunk", source=each_page),
            InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),
            InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
        ],
    )
    projections = SearchIndexerIndexProjections(
        selectors=[page_selector],
        parameters=SearchIndexerIndexProjectionsParameters(projection_mode="skipIndexingParentDocuments"),
    )

    return SearchIndexerSkillset(
        name=skillset_name,
        skills=[split_skill, embedding_skill],
        index_projections=projections,
    )
def create_search_indexer(
        indexer_name,
        skillset_name,
        datasource_name,
        index_name):
    """Build an indexer definition linking a data source, skillset and index.

    Args:
        indexer_name: Name given to the indexer.
        skillset_name: Name of the skillset the indexer references.
        datasource_name: Name of the data source the indexer references.
        index_name: Name of the index the indexer targets.

    Returns:
        The constructed SearchIndexer model.
    """
    indexer_settings = {
        "name": indexer_name,
        "data_source_name": datasource_name,
        "target_index_name": index_name,
        "skillset_name": skillset_name,
    }
    return SearchIndexer(**indexer_settings)
def get_chunks(search_client):
    """Return the text of the indexed chunks, ordered by chunk position.

    Runs a match-all query (``search_text="*"``) for up to 100000 documents,
    selecting only ``chunk_id`` and ``chunk``. The chunk position is parsed
    from the trailing ``_<number>`` segment of ``chunk_id``.

    Results are keyed by position alone, so if two results share a position
    the later one replaces the earlier one.
    NOTE(review): this looks intended for an index built from one source
    document -- confirm against the caller.

    Args:
        search_client: Object exposing ``search(search_text=..., top=...,
            select=...)`` that returns an iterable of mappings containing
            ``"chunk_id"`` and ``"chunk"``.

    Returns:
        A list of chunk strings sorted by ascending position.

    Raises:
        ValueError: If the last ``_``-separated segment of a ``chunk_id`` is
            not an integer.
    """
    results = search_client.search(search_text="*", top=100000, select="chunk_id,chunk")
    chunks_by_position = {}
    for result in results:
        # Take the LAST "_" segment instead of a fixed index: the middle part
        # of the key may itself contain "_", which shifts fixed positions.
        position = int(result["chunk_id"].rsplit("_", 1)[-1])
        chunks_by_position[position] = result["chunk"]
    return [chunks_by_position[position] for position in sorted(chunks_by_position)]
def get_encoding_name(model="gpt-3.5-turbo"):
    """Return the name of the tiktoken encoding associated with ``model``.

    Args:
        model: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        The ``name`` attribute of the encoding tiktoken resolves for the model.
    """
    encoding = tiktoken.encoding_for_model(model)
    return encoding.name
def get_token_length(text, model="gpt-3.5-turbo"):
    """Count the tokens ``text`` encodes to under the encoding for ``model``.

    Args:
        text: String to encode.
        model: Model name passed to ``tiktoken.encoding_for_model``.

    Returns:
        The number of tokens produced by encoding ``text``.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(text)
    return len(tokens)
def plot_chunk_histogram(chunks, length_fn, title, xlabel, ylabel="Chunk Count"):
    """Show a 25-bin histogram of chunk lengths.

    Each chunk is measured with ``length_fn``. The y-axis is capped at the
    tallest bin, and x-axis ticks are placed on multiples of a step that is at
    least 100.

    Args:
        chunks: Iterable of chunks to measure.
        length_fn: Callable returning a numeric length for one chunk.
        title: Plot title.
        xlabel: X-axis label.
        ylabel: Y-axis label.

    Raises:
        ValueError: If ``chunks`` is empty (there is nothing to plot).
    """
    def round_to_lowest_multiple(number, multiple):
        return (number // multiple) * multiple

    def round_to_highest_multiple(number, multiple):
        return math.ceil(number / multiple) * multiple

    ys = [length_fn(chunk) for chunk in chunks]
    if not ys:
        # Same exception type min() would raise, but with a usable message.
        raise ValueError("plot_chunk_histogram requires at least one chunk")
    min_y = min(ys)
    max_y = max(ys)

    bins = 25
    n, _, _ = plt.hist(ys, edgecolor="black", bins=bins)

    # Set y-axis limits to remove the gap at the top
    max_freq = max(n)
    plt.ylim(0, max_freq)

    # Spacing for ticks on x-axis and x-axis limits to remove gaps
    tick_step = max(int(round_to_lowest_multiple((max_y - min_y) / 5, 100)), 100)
    min_xtick = round_to_lowest_multiple(min_y, tick_step)
    max_xtick = round_to_highest_multiple(max_y, tick_step)
    # max_xtick is already a multiple of tick_step, so it is used directly as
    # the exclusive stop value.
    xticks = list(np.arange(start=min_xtick, stop=max_xtick, step=tick_step))
    # np.arange excludes the stop value, and yields nothing at all when every
    # length is the same multiple of tick_step; guard the empty case before
    # indexing the last tick.
    if max_xtick and (not xticks or xticks[-1] != max_xtick):
        xticks.append(max_xtick)
    plt.xticks(xticks)
    if min_xtick != max_xtick:
        plt.xlim(min_xtick, max_xtick)
    # Otherwise the requested range has zero width; keep the limits chosen by
    # plt.hist instead of forcing a degenerate axis.

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.show()