tools/issue-labeler/src/SearchIndexCreator/IssueIndex.cs (293 lines of code) (raw):
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
using Azure.AI.OpenAI;
using Azure.Search.Documents.Indexes.Models;
using Azure.Search.Documents.Indexes;
using Microsoft.Extensions.Configuration;
namespace SearchIndexCreator
{
public class IssueIndex
{
private readonly IConfiguration _config;
public IssueIndex(IConfiguration config)
{
_config = config;
}
/// <summary>
/// Sets up and runs the indexer.
/// </summary>
/// <param name="indexClient">The client to manage the Azure Search index.</param>
/// <param name="indexerClient">The client to manage the Azure Search indexer.</param>
/// <param name="openAIClient">The client to interact with Azure OpenAI.</param>
public async Task SetupAndRunIndexer(SearchIndexClient indexClient, SearchIndexerClient indexerClient, AzureOpenAIClient openAIClient)
{
// Create an Index
Console.WriteLine("Creating/Updating the index...");
var index = GetSampleIndex();
await indexClient.CreateOrUpdateIndexAsync(index);
Console.WriteLine("Index Created/Updated!");
// Create a Data Source Connection
Console.WriteLine("Creating/Updating the data source connection...");
var dataSource = new SearchIndexerDataSourceConnection(
$"{_config["IssueIndexName"]}-blob",
SearchIndexerDataSourceType.AzureBlob,
connectionString: _config["BlobConnectionString"], // "Connection string" indicating to use managed identity
container: new SearchIndexerDataContainer($"{_config["IssueIndexName"]}-blob"))
{
DataChangeDetectionPolicy = new HighWaterMarkChangeDetectionPolicy("metadata_storage_last_modified"),
DataDeletionDetectionPolicy = new NativeBlobSoftDeleteDeletionDetectionPolicy()
};
indexerClient.CreateOrUpdateDataSourceConnection(dataSource);
Console.WriteLine("Data Source Created/Updated!");
// Create a Skillset specifically for chunking
// Each issue has its associated comments attached to it. Not sure if this will change in the future making chunking potentially unnecessary.
Console.WriteLine("Creating/Updating the skillset...");
var skillset = new SearchIndexerSkillset($"{_config["IssueIndexName"]}-skillset", new List<SearchIndexerSkill>
{
// Add required skills here
new SplitSkill(
new List<InputFieldMappingEntry>
{
new InputFieldMappingEntry("text") { Source = "/document/Body" }
},
new List<OutputFieldMappingEntry>
{
new OutputFieldMappingEntry("textItems") { TargetName = "pages" }
})
{
Context = "/document",
TextSplitMode = TextSplitMode.Pages,
// 10k because token limits are so high but want to experiment with lower chunking.
MaximumPageLength = 1000,
PageOverlapLength = 100,
},
new AzureOpenAIEmbeddingSkill(
new List<InputFieldMappingEntry>
{
new InputFieldMappingEntry("text") { Source = "/document/pages/*" }
},
new List<OutputFieldMappingEntry>
{
new OutputFieldMappingEntry("embedding") { TargetName = "text_vector" }
}
)
{
Context = "/document/pages/*",
ResourceUri = new Uri(_config["OpenAIEndpoint"]),
ModelName = _config["EmbeddingModelName"],
DeploymentName = _config["EmbeddingModelName"]
}
})
{
IndexProjection = new SearchIndexerIndexProjection(new[]
{
new SearchIndexerIndexProjectionSelector(_config["IssueIndexName"], parentKeyFieldName: "parent_id", sourceContext: "/document/pages/*", mappings: new[]
{
new InputFieldMappingEntry("text_vector")
{
Source = "/document/pages/*/text_vector"
},
new InputFieldMappingEntry("chunk")
{
Source = "/document/pages/*"
},
new InputFieldMappingEntry("Id")
{
Source = "/document/Id"
},
new InputFieldMappingEntry("Title")
{
Source = "/document/Title"
},
new InputFieldMappingEntry("Service")
{
Source = "/document/Service"
},
new InputFieldMappingEntry("Category")
{
Source = "/document/Category"
},
new InputFieldMappingEntry("Author")
{
Source = "/document/Author"
},
new InputFieldMappingEntry("Repository")
{
Source = "/document/Repository"
},
new InputFieldMappingEntry("CreatedAt")
{
Source = "/document/CreatedAt"
},
new InputFieldMappingEntry("Url")
{
Source = "/document/Url"
},
new InputFieldMappingEntry("CodeOwner")
{
Source = "/document/CodeOwner"
},
// Metadata is needed for updating the document (or atleast last modified not sure of the rest)
new InputFieldMappingEntry("metadata_storage_last_modified")
{
Source = "/document/metadata_storage_last_modified"
}
})
})
{
Parameters = new SearchIndexerIndexProjectionsParameters
{
ProjectionMode = IndexProjectionMode.SkipIndexingParentDocuments
}
}
};
await indexerClient.CreateOrUpdateSkillsetAsync(skillset).ConfigureAwait(false);
Console.WriteLine("Skillset Created/Updated!");
// Create an Indexer
Console.WriteLine("Creating the indexer and running it...");
var indexer = new SearchIndexer($"{_config["IssueIndexName"]}-indexer", dataSource.Name, _config["IssueIndexName"])
{
Description = "Indexer to chunk documents, generate embeddings, and add to the index",
Parameters = new IndexingParameters()
{
IndexingParametersConfiguration = new IndexingParametersConfiguration()
{
DataToExtract = BlobIndexerDataToExtract.ContentAndMetadata,
ParsingMode = BlobIndexerParsingMode.Json
}
},
SkillsetName = skillset.Name,
Schedule = new IndexingSchedule(TimeSpan.FromDays(1)) // Schedule to run every day
};
await indexerClient.CreateOrUpdateIndexerAsync(indexer).ConfigureAwait(false);
Console.WriteLine("Indexer Created/Updated!");
}
/// <summary>
/// Gets a sample search index with HNSW alorithm, built in vectorizer, semantic search turned on, compression set up, and all needed fields for issues.
/// </summary>
/// <returns>The sample search index.</returns>
private SearchIndex GetSampleIndex()
{
const string vectorSearchHnswProfile = "issue-vector-profile";
const string vectorSearchHnswConfig = "issueHnsw";
const string vectorSearchVectorizer = "issueOpenAIVectorizer";
const string semanticSearchConfig = "issue-semantic-config";
const string binaryCompression = "issue-binary-compression";
const int modelDimensions = 1536;// "Default" value
SearchIndex searchIndex = new SearchIndex(_config["IssueIndexName"])
{
VectorSearch = new()
{
Profiles =
{
new VectorSearchProfile(vectorSearchHnswProfile, vectorSearchHnswConfig)
{
VectorizerName = vectorSearchVectorizer,
CompressionName = binaryCompression
},
},
Algorithms =
{
new HnswAlgorithmConfiguration(vectorSearchHnswConfig),
},
Vectorizers =
{
new AzureOpenAIVectorizer(vectorSearchVectorizer)
{
Parameters = new AzureOpenAIVectorizerParameters()
{
ResourceUri = new Uri(_config["OpenAIEndpoint"]),
DeploymentName = _config["EmbeddingModelName"],
ModelName = _config["EmbeddingModelName"]
}
}
},
Compressions =
{
new BinaryQuantizationCompression(binaryCompression)
}
},
SemanticSearch = new()
{
Configurations =
{
new SemanticConfiguration(semanticSearchConfig, new()
{
TitleField = new SemanticField(fieldName: "Title"),
ContentFields =
{
new SemanticField(fieldName: "chunk")
},
KeywordsFields =
{
new SemanticField(fieldName: "Service"),
new SemanticField(fieldName: "Category")
},
})
},
},
Fields =
{
new SearchableField("chunk_id")
{
IsKey = true,
IsFilterable = false,
IsSortable = true,
IsFacetable = false,
AnalyzerName = LexicalAnalyzerName.Keyword
},
new SearchableField("parent_id")
{
IsFilterable = true,
IsSortable = false,
IsFacetable = false
},
new SearchableField("chunk"),
new SearchField("text_vector", SearchFieldDataType.Collection(SearchFieldDataType.Single))
{
IsSearchable = true,
VectorSearchDimensions = modelDimensions,
VectorSearchProfileName = vectorSearchHnswProfile
},
new SearchField("Id", SearchFieldDataType.String)
{
IsSearchable = false
},
new SearchableField("Title"),
new SearchableField("Service")
{
IsFilterable = true
},
new SearchableField("Category")
{
IsFilterable = true
},
new SearchField("Author", SearchFieldDataType.String)
{
IsSearchable = false
},
new SearchField("Repository", SearchFieldDataType.String)
{
IsSearchable = false
},
new SearchField("CreatedAt", SearchFieldDataType.DateTimeOffset)
{
IsSearchable = false
},
new SearchField("Url", SearchFieldDataType.String)
{
IsSearchable = false
},
// 0 = false, 1 = true
// Used numbers to use the magnitude boosting function
new SearchField("CodeOwner", SearchFieldDataType.Int32)
{
IsSearchable = false,
IsSortable = false,
IsFilterable = true
},
new SearchField("metadata_storage_last_modified", SearchFieldDataType.DateTimeOffset)
{
IsHidden = true,
IsSearchable = false
}
}
};
// Scoring Boost for "Issue" objects that are comments made by the codeowner.
searchIndex.ScoringProfiles.Add(new ScoringProfile("CodeOwnerBoost")
{
Functions =
{
new MagnitudeScoringFunction(
fieldName: "CodeOwner",
boost: 5.0, // Adjust the boost factor as needed
parameters: new MagnitudeScoringParameters(1, 1)
{
ShouldBoostBeyondRangeByConstant = false,
})
{
Interpolation = ScoringFunctionInterpolation.Constant
}
}
});
return searchIndex;
}
}
}