# demo-python/code/indexers/document-intelligence-custom-skill/scripts/setup_search_service.py
import os
from datetime import timedelta

from azure.search.documents.indexes import SearchIndexerClient
# Model names below match the beta azure-search-documents releases this sample
# targets; newer GA releases rename SearchIndexerIndexProjections to
# SearchIndexerIndexProjection.
from azure.search.documents.indexes.models import (
    AzureOpenAIEmbeddingSkill,
    IndexProjectionMode,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    SearchIndexerSkillset,
    WebApiSkill,
)
def create_or_update_skillset(search_indexer_client: SearchIndexerClient, document_skill_url: str, split_skill_url: str):
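    """Create or update the skillset that enriches each source document in three
    steps: a Document Intelligence custom skill (document -> markdown), a
    markdown-splitting custom skill (markdown -> chunks), and an Azure OpenAI
    embedding skill (chunk -> vector), projecting each chunk into the index.
    """
    # Custom Web API skill that calls the Document Intelligence function to
    # convert each source document to markdown. A source value prefixed with
    # "=" is an input expression, so "mode" receives the literal string
    # "markdown" rather than a path into the enriched document.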
    document_skill = WebApiSkill(
        description="Document Intelligence skill to extract content from documents",
        context="/document",
        uri=document_skill_url,
        timeout=timedelta(seconds=230),
        batch_size=1,
        degree_of_parallelism=1,
        inputs=[
            InputFieldMappingEntry(name="metadata_storage_path", source="/document/metadata_storage_path"),
            InputFieldMappingEntry(name="metadata_storage_sas_token", source="/document/metadata_storage_sas_token"),
            InputFieldMappingEntry(name="mode", source='="markdown"')
        ],
        outputs=[
            OutputFieldMappingEntry(name="content", target_name="file_markdown_content")
        ]
    )
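    # Azure OpenAI settings for the split and embedding skills, read from the
    # environment.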
    vectorizer_resource_uri = os.environ["AZURE_OPENAI_ENDPOINT"]
    vectorizer_deployment = os.environ["AZURE_OPENAI_EMB_DEPLOYMENT"]
    vectorizer_model = os.environ["AZURE_OPENAI_EMB_MODEL"]
    # Environment variables are strings; the embedding skill expects an
    # integer dimension count.
    vectorizer_dimensions = int(os.environ["AZURE_OPENAI_EMB_MODEL_DIMENSIONS"])
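    # Custom Web API skill that splits the markdown into overlapping chunks.
    # chunkSize/chunkOverlap are literal expressions; passing the embedding
    # model's name as encoderModelName suggests the sizes are measured in that
    # encoder's tokens (512 per chunk, 128 overlap).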
    split_skill = WebApiSkill(
        description="Markdown split skill to extract chunks from documents",
        context="/document",
        uri=split_skill_url,
        timeout=timedelta(seconds=230),
        batch_size=1,
        degree_of_parallelism=1,
        inputs=[
            InputFieldMappingEntry(name="content", source="/document/file_markdown_content"),
            InputFieldMappingEntry(name="encoderModelName", source=f'="{vectorizer_model}"'),
            InputFieldMappingEntry(name="chunkSize", source='=512'),
            InputFieldMappingEntry(name="chunkOverlap", source='=128')
        ],
        outputs=[
            OutputFieldMappingEntry(name="chunks", target_name="chunks")
        ]
    )
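    # Built-in embedding skill: vectorizes each chunk with the Azure OpenAI
    # deployment configured above. Its context is per chunk
    # ("/document/chunks/*"), not per document.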
    embedding_skill = AzureOpenAIEmbeddingSkill(
        description="Skill to generate embeddings via an Azure OpenAI endpoint",
        context="/document/chunks/*",
        resource_uri=vectorizer_resource_uri,
        deployment_id=vectorizer_deployment,
        model_name=vectorizer_model,
        dimensions=vectorizer_dimensions,
        inputs=[
            InputFieldMappingEntry(name="text", source="/document/chunks/*/content"),
        ],
        outputs=[
            OutputFieldMappingEntry(name="embedding", target_name="vector")
        ]
    )
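    # Index projections write each chunk to its own search document, keyed back
    # to its parent via "parent_id"; SKIP_INDEXING_PARENT_DOCUMENTS keeps the
    # parent documents themselves out of the index.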
    index_projections = SearchIndexerIndexProjections(
        selectors=[
            SearchIndexerIndexProjectionSelector(
                target_index_name=sample_index_name,  # defined elsewhere in this script
                parent_key_field_name="parent_id",
                source_context="/document/chunks/*",
                mappings=[
                    InputFieldMappingEntry(name="chunk", source="/document/chunks/*/content"),
                    InputFieldMappingEntry(name="vector", source="/document/chunks/*/vector"),
                    InputFieldMappingEntry(name="chunk_headers", source="/document/chunks/*/headers"),
                    InputFieldMappingEntry(name="title", source="/document/metadata_storage_name")
                ],
            )
        ],
        parameters=SearchIndexerIndexProjectionsParameters(
            projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
        ),
    )
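    # Assemble the three skills and the projections into a single skillset and
    # push it to the search service.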
    skillset = SearchIndexerSkillset(
        name=sample_skillset_name,  # defined elsewhere in this script
        description="Skillset to use Document Intelligence, chunk documents, and generate embeddings",
        skills=[document_skill, split_skill, embedding_skill],
        index_projections=index_projections,
    )
    search_indexer_client.create_or_update_skillset(skillset)
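
# A minimal wiring sketch (not part of the original script): construct the
# client and invoke the function above. The environment variable names
# AZURE_SEARCH_SERVICE_ENDPOINT, DOCUMENT_SKILL_URL, and SPLIT_SKILL_URL are
# illustrative assumptions, as is the use of DefaultAzureCredential; it also
# assumes sample_index_name and sample_skillset_name are defined earlier.
if __name__ == "__main__":
    from azure.identity import DefaultAzureCredential

    client = SearchIndexerClient(
        endpoint=os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"],
        credential=DefaultAzureCredential(),
    )
    create_or_update_skillset(
        client,
        document_skill_url=os.environ["DOCUMENT_SKILL_URL"],
        split_skill_url=os.environ["SPLIT_SKILL_URL"],
    )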