in tools/issue-labeler/src/SearchIndexCreator/DocumentIndex.cs [27:143]
/// <summary>
/// Creates or updates the full Azure AI Search pipeline for documents: the index itself,
/// a blob data source connection, a skillset that chunks each document and embeds the
/// chunks, and the indexer that ties them together on a daily schedule. Each resource is
/// created idempotently (CreateOrUpdate), so this is safe to run repeatedly.
/// </summary>
/// <param name="indexClient">Client used to create/update the search index.</param>
/// <param name="indexerClient">Client used to create/update the data source connection, skillset, and indexer.</param>
/// <param name="openAIClient">Currently unused; retained for signature compatibility with existing callers.</param>
public async Task SetupAndRunIndexer(SearchIndexClient indexClient, SearchIndexerClient indexerClient, AzureOpenAIClient openAIClient)
{
    // Create an Index
    Console.WriteLine("Creating/Updating the index...");
    var index = GetSampleIndex();
    await indexClient.CreateOrUpdateIndexAsync(index).ConfigureAwait(false);
    Console.WriteLine("Index Created/Updated!");

    // Create a Data Source Connection over the blob container. The high-water-mark and
    // native soft-delete policies let the indexer pick up only new, modified, or deleted
    // blobs on each scheduled run instead of reprocessing everything.
    Console.WriteLine("Creating/Updating the data source connection...");
    var dataSource = new SearchIndexerDataSourceConnection(
        $"{_config["DocumentIndexName"]}-blob",
        SearchIndexerDataSourceType.AzureBlob,
        connectionString: _config["BlobConnectionString"],
        container: new SearchIndexerDataContainer($"{_config["DocumentIndexName"]}-blob"))
    {
        DataChangeDetectionPolicy = new HighWaterMarkChangeDetectionPolicy("metadata_storage_last_modified"),
        DataDeletionDetectionPolicy = new NativeBlobSoftDeleteDeletionDetectionPolicy()
    };
    // Fix: previously called the synchronous CreateOrUpdateDataSourceConnection, which
    // blocks the thread inside an async method; use the async counterpart like the
    // other service calls in this method.
    await indexerClient.CreateOrUpdateDataSourceConnectionAsync(dataSource).ConfigureAwait(false);
    Console.WriteLine("Data Source Created/Updated!");

    // Create a Skillset: split each document into overlapping ~1000-char pages, then
    // embed every page with the configured Azure OpenAI embedding deployment.
    Console.WriteLine("Creating/Updating the skillset...");
    var skillset = new SearchIndexerSkillset($"{_config["DocumentIndexName"]}-skillset", new List<SearchIndexerSkill>
    {
        // Add required skills here
        new SplitSkill(
            new List<InputFieldMappingEntry>
            {
                new InputFieldMappingEntry("text") { Source = "/document/Content" }
            },
            new List<OutputFieldMappingEntry>
            {
                new OutputFieldMappingEntry("textItems") { TargetName = "pages" }
            })
        {
            Context = "/document",
            TextSplitMode = TextSplitMode.Pages,
            MaximumPageLength = 1000,
            PageOverlapLength = 100,
        },
        new AzureOpenAIEmbeddingSkill(
            new List<InputFieldMappingEntry>
            {
                new InputFieldMappingEntry("text") { Source = "/document/pages/*" }
            },
            new List<OutputFieldMappingEntry>
            {
                new OutputFieldMappingEntry("embedding") { TargetName = "text_vector" }
            }
        )
        {
            Context = "/document/pages/*",
            ResourceUri = new Uri(_config["OpenAIEndpoint"]),
            ModelName = _config["EmbeddingModelName"],
            DeploymentName = _config["EmbeddingModelName"]
        }
    })
    {
        // Project each page as its own search document (one chunk per document in the
        // index), carrying chunk text, its vector, and parent metadata.
        IndexProjection = new SearchIndexerIndexProjection(new[]
        {
            // Base the parent key on the title as it will be the metadata_storage_name which is unique.
            new SearchIndexerIndexProjectionSelector(_config["DocumentIndexName"], parentKeyFieldName: "parent_id", sourceContext: "/document/pages/*", mappings: new[]
            {
                new InputFieldMappingEntry("chunk")
                {
                    Source = "/document/pages/*"
                },
                new InputFieldMappingEntry("text_vector")
                {
                    Source = "/document/pages/*/text_vector"
                },
                new InputFieldMappingEntry("Title")
                {
                    Source = "/document/metadata_storage_name"
                },
                new InputFieldMappingEntry("Url")
                {
                    Source = "/document/Url"
                },
                new InputFieldMappingEntry("metadata_storage_last_modified")
                {
                    Source = "/document/metadata_storage_last_modified"
                }
            })
        })
        {
            Parameters = new SearchIndexerIndexProjectionsParameters
            {
                // Only the chunk documents are indexed; the parent (whole) document is not.
                ProjectionMode = IndexProjectionMode.SkipIndexingParentDocuments
            }
        }
    };
    await indexerClient.CreateOrUpdateSkillsetAsync(skillset).ConfigureAwait(false);
    Console.WriteLine("Skillset Created/Updated!");

    // Create an Indexer. A newly created indexer runs immediately and then follows
    // the daily schedule below.
    Console.WriteLine("Creating the indexer and running it...");
    var indexer = new SearchIndexer($"{_config["DocumentIndexName"]}-indexer", dataSource.Name, _config["DocumentIndexName"])
    {
        Description = "Indexer to chunk documents, generate embeddings, and add to the index",
        Parameters = new IndexingParameters()
        {
            IndexingParametersConfiguration = new IndexingParametersConfiguration()
            {
                DataToExtract = BlobIndexerDataToExtract.ContentAndMetadata,
                ParsingMode = BlobIndexerParsingMode.Json
            }
        },
        SkillsetName = skillset.Name,
        Schedule = new IndexingSchedule(TimeSpan.FromDays(1)) // Schedule to run every day
    };
    await indexerClient.CreateOrUpdateIndexerAsync(indexer).ConfigureAwait(false);
    Console.WriteLine("Indexer Created/Updated!");
}