in tools/issue-labeler/src/SearchIndexCreator/IssueIndex.cs [26:171]
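/// <summary>
/// Creates or updates the search index, blob data source, chunking/embedding
/// skillset, and indexer. The indexer runs on creation and then on its daily schedule.
/// </summary>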
public async Task SetupAndRunIndexer(SearchIndexClient indexClient, SearchIndexerClient indexerClient, AzureOpenAIClient openAIClient)
{
// Create an Index
Console.WriteLine("Creating/Updating the index...");
var index = GetSampleIndex();
await indexClient.CreateOrUpdateIndexAsync(index).ConfigureAwait(false);
Console.WriteLine("Index Created/Updated!");
// Create a Data Source Connection
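// The data source reads from the blob container holding the exported issues.
// The high-water-mark policy re-indexes only blobs whose
// metadata_storage_last_modified has advanced, and the native soft-delete
// policy removes search documents whose source blobs are soft-deleted.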
Console.WriteLine("Creating/Updating the data source connection...");
var dataSource = new SearchIndexerDataSourceConnection(
$"{_config["IssueIndexName"]}-blob",
SearchIndexerDataSourceType.AzureBlob,
connectionString: _config["BlobConnectionString"], // The "connection string" here signals managed identity auth (a ResourceId rather than an account key)
container: new SearchIndexerDataContainer($"{_config["IssueIndexName"]}-blob"))
{
DataChangeDetectionPolicy = new HighWaterMarkChangeDetectionPolicy("metadata_storage_last_modified"),
DataDeletionDetectionPolicy = new NativeBlobSoftDeleteDeletionDetectionPolicy()
};
await indexerClient.CreateOrUpdateDataSourceConnectionAsync(dataSource).ConfigureAwait(false);
Console.WriteLine("Data Source Created/Updated!");
// Create a Skillset specifically for chunking
// Each issue document carries its associated comments, so bodies can be large. If that ever changes, chunking may become unnecessary.
Console.WriteLine("Creating/Updating the skillset...");
var skillset = new SearchIndexerSkillset($"{_config["IssueIndexName"]}-skillset", new List<SearchIndexerSkill>
{
// Split the issue body into overlapping text chunks ("pages")
new SplitSkill(
new List<InputFieldMappingEntry>
{
new InputFieldMappingEntry("text") { Source = "/document/Body" }
},
new List<OutputFieldMappingEntry>
{
new OutputFieldMappingEntry("textItems") { TargetName = "pages" }
})
{
Context = "/document",
TextSplitMode = TextSplitMode.Pages,
// Token limits would allow much larger pages (e.g. 10k), but start at
// 1,000 characters to experiment with smaller chunks.
MaximumPageLength = 1000,
PageOverlapLength = 100,
},
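// Generate an embedding vector for each chunk produced by the SplitSkill.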
new AzureOpenAIEmbeddingSkill(
new List<InputFieldMappingEntry>
{
new InputFieldMappingEntry("text") { Source = "/document/pages/*" }
},
new List<OutputFieldMappingEntry>
{
new OutputFieldMappingEntry("embedding") { TargetName = "text_vector" }
}
)
{
Context = "/document/pages/*",
ResourceUri = new Uri(_config["OpenAIEndpoint"]),
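// The embedding deployment is assumed to be named after the model itself,
// so one config value supplies both ModelName and DeploymentName.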
ModelName = _config["EmbeddingModelName"],
DeploymentName = _config["EmbeddingModelName"]
}
})
{
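// Index projections emit one search document per chunk; parent_id ties
// each chunk back to the issue document it was split from.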
IndexProjection = new SearchIndexerIndexProjection(new[]
{
new SearchIndexerIndexProjectionSelector(_config["IssueIndexName"], parentKeyFieldName: "parent_id", sourceContext: "/document/pages/*", mappings: new[]
{
new InputFieldMappingEntry("text_vector")
{
Source = "/document/pages/*/text_vector"
},
new InputFieldMappingEntry("chunk")
{
Source = "/document/pages/*"
},
new InputFieldMappingEntry("Id")
{
Source = "/document/Id"
},
new InputFieldMappingEntry("Title")
{
Source = "/document/Title"
},
new InputFieldMappingEntry("Service")
{
Source = "/document/Service"
},
new InputFieldMappingEntry("Category")
{
Source = "/document/Category"
},
new InputFieldMappingEntry("Author")
{
Source = "/document/Author"
},
new InputFieldMappingEntry("Repository")
{
Source = "/document/Repository"
},
new InputFieldMappingEntry("CreatedAt")
{
Source = "/document/CreatedAt"
},
new InputFieldMappingEntry("Url")
{
Source = "/document/Url"
},
new InputFieldMappingEntry("CodeOwner")
{
Source = "/document/CodeOwner"
},
// metadata_storage_last_modified is carried through so the indexer's change detection can recognize updated documents.
new InputFieldMappingEntry("metadata_storage_last_modified")
{
Source = "/document/metadata_storage_last_modified"
}
})
})
{
Parameters = new SearchIndexerIndexProjectionsParameters
{
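// Index only the projected chunk documents, not the unchunked parents.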
ProjectionMode = IndexProjectionMode.SkipIndexingParentDocuments
}
}
};
await indexerClient.CreateOrUpdateSkillsetAsync(skillset).ConfigureAwait(false);
Console.WriteLine("Skillset Created/Updated!");
// Create an Indexer
Console.WriteLine("Creating the indexer and running it...");
var indexer = new SearchIndexer($"{_config["IssueIndexName"]}-indexer", dataSource.Name, _config["IssueIndexName"])
{
Description = "Indexer to chunk documents, generate embeddings, and add to the index",
Parameters = new IndexingParameters()
{
IndexingParametersConfiguration = new IndexingParametersConfiguration()
{
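// Each blob is parsed as a single JSON document; extracting metadata as
// well keeps the change detection policy above supplied with timestamps.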
DataToExtract = BlobIndexerDataToExtract.ContentAndMetadata,
ParsingMode = BlobIndexerParsingMode.Json
}
},
SkillsetName = skillset.Name,
Schedule = new IndexingSchedule(TimeSpan.FromDays(1)) // Schedule to run every day
};
await indexerClient.CreateOrUpdateIndexerAsync(indexer).ConfigureAwait(false);
Console.WriteLine("Indexer Created/Updated!");
}
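
// Example usage (a minimal sketch, not part of the original file): wire up the
// clients with DefaultAzureCredential and run the setup end to end. The
// "SearchEndpoint" config key and the IssueIndex(config) constructor shape are
// assumptions made for illustration.
public static async Task RunSetupExample(IConfiguration config)
{
    var credential = new DefaultAzureCredential();
    var searchEndpoint = new Uri(config["SearchEndpoint"]);

    var indexClient = new SearchIndexClient(searchEndpoint, credential);
    var indexerClient = new SearchIndexerClient(searchEndpoint, credential);
    var openAIClient = new AzureOpenAIClient(new Uri(config["OpenAIEndpoint"]), credential);

    var issueIndex = new IssueIndex(config);
    await issueIndex.SetupAndRunIndexer(indexClient, indexerClient, openAIClient);
}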