in src/ai/commands/search_command.cs [341:445]
/// <summary>
/// Assembles the indexer skillset that splits each document into overlapping pages,
/// embeds every page with Azure OpenAI, and projects the pages into the given index.
/// OCR and text-merge skills are added in front of the splitter when
/// <paramref name="aiServicesApiKey"/> is non-empty.
/// </summary>
/// <param name="skillsetName">Name given to the resulting skillset.</param>
/// <param name="aiServicesApiKey">
/// AI Services key. When null or empty, OCR is disabled: the splitter reads
/// <c>/document/content</c> and the default cognitive services account is attached.
/// Otherwise the splitter reads <c>/document/mergedText</c> and this key is attached.
/// </param>
/// <param name="embeddingsEndpoint">Embeddings resource endpoint; parsed with <see cref="Uri"/>.</param>
/// <param name="embeddingsDeployment">Deployment id assigned to the embedding skill.</param>
/// <param name="embeddingsModelName">Model name assigned to the embedding skill.</param>
/// <param name="embeddingsApiKey">API key assigned to the embedding skill.</param>
/// <param name="idFieldName">Passed as the projection selector's parent key field name.</param>
/// <param name="contentFieldName">Index field that is mapped to each page's text.</param>
/// <param name="vectorFieldName">Index field that is mapped to each page's embedding vector.</param>
/// <param name="datasourceIndex">Index whose <c>Name</c> is handed to the projection selector.</param>
/// <returns>The configured skillset, including its index projections.</returns>
private static SearchIndexerSkillset PrepGetSkillset(string skillsetName, string aiServicesApiKey, string embeddingsEndpoint, string embeddingsDeployment, string embeddingsModelName, string embeddingsApiKey, string idFieldName, string contentFieldName, string vectorFieldName, SearchIndex datasourceIndex)
{
    // Page sizing handed to the split skill.
    const int pageLength = 2000;
    const int pageOverlap = 500;

    // Enrichment-tree locations that are referenced more than once below.
    const string documentPath = "/document";
    const string imagesPath = "/document/normalized_images/*";
    const string pagesPath = "/document/pages/*";

    // Small factories so each skill's wiring reads as (name, path) pairs.
    InputFieldMappingEntry Input(string name, string source) =>
        new InputFieldMappingEntry(name) { Source = source };
    OutputFieldMappingEntry Output(string name, string targetName) =>
        new OutputFieldMappingEntry(name) { TargetName = targetName };

    var useOcr = !string.IsNullOrEmpty(aiServicesApiKey);

    // The OCR and merge skills are always built; they are only attached when OCR is enabled.
    var imageOcr = new OcrSkill(
        new List<InputFieldMappingEntry> { Input("image", imagesPath) },
        new List<OutputFieldMappingEntry> { Output("text", "text") });
    imageOcr.Context = imagesPath;
    imageOcr.ShouldDetectOrientation = true;

    var mergeOcrText = new MergeSkill(
        new List<InputFieldMappingEntry>
        {
            Input("text", "/document/content"),
            Input("itemsToInsert", "/document/normalized_images/*/text"),
            Input("offsets", "/document/normalized_images/*/contentOffset"),
        },
        new List<OutputFieldMappingEntry> { Output("mergedText", "mergedText") });
    mergeOcrText.Context = documentPath;
    mergeOcrText.InsertPreTag = " ";
    mergeOcrText.InsertPostTag = " ";

    // With OCR the splitter consumes the merged text, otherwise the raw document content.
    string splitSource;
    if (useOcr)
    {
        splitSource = "/document/mergedText";
    }
    else
    {
        splitSource = "/document/content";
    }

    var pageSplitter = new SplitSkill(
        new List<InputFieldMappingEntry> { Input("text", splitSource) },
        new List<OutputFieldMappingEntry> { Output("textItems", "pages") });
    pageSplitter.DefaultLanguageCode = SplitSkillLanguage.En;
    pageSplitter.TextSplitMode = TextSplitMode.Pages;
    pageSplitter.MaximumPageLength = pageLength;
    pageSplitter.PageOverlapLength = pageOverlap;
    pageSplitter.Context = documentPath;

    // Runs once per page and writes the embedding to "vector" under that page.
    var pageEmbedder = new AzureOpenAIEmbeddingSkill(
        new List<InputFieldMappingEntry> { Input("text", pagesPath) },
        new List<OutputFieldMappingEntry> { Output("embedding", "vector") })
    {
        Context = pagesPath,
        ResourceUri = new Uri(embeddingsEndpoint),
        ApiKey = embeddingsApiKey,
        DeploymentId = embeddingsDeployment,
        ModelName = embeddingsModelName,
    };

    // Skill order: [ocr, merge,] split, embed.
    var pipeline = new List<SearchIndexerSkill>();
    if (useOcr)
    {
        pipeline.Add(imageOcr);
        pipeline.Add(mergeOcrText);
    }
    pipeline.Add(pageSplitter);
    pipeline.Add(pageEmbedder);

    // One projected index entry per page: page text plus its vector.
    var pageSelector = new SearchIndexerIndexProjectionSelector(
        datasourceIndex.Name,
        parentKeyFieldName: idFieldName,
        sourceContext: pagesPath,
        mappings: new List<InputFieldMappingEntry>
        {
            Input(contentFieldName, pagesPath),
            Input(vectorFieldName, "/document/pages/*/vector"),
        });

    var projections = new SearchIndexerIndexProjections(
        new List<SearchIndexerIndexProjectionSelector> { pageSelector });
    projections.Parameters = new SearchIndexerIndexProjectionsParameters()
    {
        ProjectionMode = IndexProjectionMode.SkipIndexingParentDocuments,
    };

    return new SearchIndexerSkillset(skillsetName, pipeline)
    {
        IndexProjections = projections,
        CognitiveServicesAccount = useOcr
            ? new CognitiveServicesAccountKey(aiServicesApiKey)
            : new DefaultCognitiveServicesAccount(),
    };
}