in experiments/create-search-index/src/Program.cs [13:133]
static async Task Main(string[] args)
{
// ///////////////////
// Configure and set up services
// ///////////////////
// Configure the run
_config = new ConfigurationBuilder()
.AddJsonFile("appsettings.json")
.AddEnvironmentVariables()
.Build();
_settings = _config.GetRequiredSection("Settings").Get<Settings>();
// Validate settings
if (_settings is null)
throw new ArgumentException("Settings required. Check your appsettings.json exists.");
if (string.IsNullOrEmpty(_settings.AzureAiVisionKey) || string.IsNullOrEmpty(_settings.AzureAiVisionEndpoint))
throw new ArgumentException("Azure AI Vision settings missing. Check your appsettings.json for 'AzureAiVisionKey' and 'AzureAiVisionEndpoint' settings.");
if (_settings.ExternalSourceType == ExternalSourceType.NGA && string.IsNullOrEmpty(_settings.ExternalSourceConnectionInfo))
throw new ArgumentException("NGA source selected but database connection string not provided. Check your appsettings.json for the 'ExternalSourceConnectionInfo' setting.");
if (string.IsNullOrEmpty(_settings.AzureAiSearchKey) || string.IsNullOrEmpty(_settings.AzureAiSearchEndpoint))
throw new ArgumentException("Azure AI Search settings missing. Check your appsettings.json for 'AzureAiSearchKey' and 'AzureAiSearchEndpoint' settings.");
if (string.IsNullOrEmpty(_settings.AzureAiSearchIndexName))
_settings.AzureAiSearchIndexName = "gallerydata-v";
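// For reference, a minimal "Settings" section in appsettings.json that
// satisfies the checks above might look like the following. Key names are
// taken from the validation in this method; every value is a placeholder,
// and the Settings type may bind additional fields not shown here.
// {
//   "Settings": {
//     "AzureAiVisionKey": "<vision-key>",
//     "AzureAiVisionEndpoint": "https://<vision-resource>.cognitiveservices.azure.com/",
//     "AzureAiSearchKey": "<search-key>",
//     "AzureAiSearchEndpoint": "https://<search-service>.search.windows.net",
//     "AzureAiSearchIndexName": "gallerydata-v",
//     "ExternalSourceType": "NGA",
//     "ExternalSourceConnectionInfo": "<database-connection-string>",
//     "DropAndRecreateIndexIfItExists": false
//   }
// }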
// Setup services
IndexManager indexManager = new(_settings);
BaseRetriever retriever = _settings.ExternalSourceType switch
{
ExternalSourceType.MET => new MetOpenAccessRetriever(_settings),
ExternalSourceType.NGA => new NGAOpenDataRetriever(_settings),
_ => throw new ArgumentException("External Source Type not recognized."),
};
ImageVectorizer vectorizer = new(_settings);
// ///////////////////
// (Optionally) Cache responses from the Met API
// ///////////////////
// The Met Open Access API requires a separate call per object ID. This
// can be time-consuming, and the responses may contain extra data of
// interest that wouldn't land in the search index but that you may want
// to refer to later. Uncommenting the next block of code captures every
// response from the API in a Cosmos DB container, which also makes
// retrieving the records much faster on subsequent indexing runs.
/*
Console.WriteLine("PREPROCESSING... SAVING MET DATA TO COSMOS DB");
MetOpenAccessPreProcessor processor = new(_settings);
await processor.LoadAllRecordsIntoCosmosNoSql();
return;
*/
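// Note the `return` at the end of the block above: when enabled, the
// caching pass runs on its own and exits, so the index build below
// happens on a later run against the cached records.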
Console.WriteLine("Application configured. Creating index...");
// ///////////////////
// Create the index
// ///////////////////
// Ensure Index is ready
await indexManager.Create(_settings.DropAndRecreateIndexIfItExists);
Console.WriteLine($"Index created. Getting records from {_settings.ExternalSourceType}...");
// ///////////////////
// Populate the index
// ///////////////////
// NOTE: This process takes time! It takes roughly three hours for the
// ~116,119 records that were in NGA's Open Data program data set when this
// was last run. All records are pulled into memory, vectors are generated
// for every object, and the documents are bulk uploaded in batches only
// after vectors exist for the entire data set. Even so, the impact on
// memory is minimal.
// Get data from external source
var records = await retriever.GetAllRecords();
Console.WriteLine("Records retrieved. Enriching with embeddings...");
ConcurrentBag<IndexDocument> indexDocuments = [];
// Create IndexDocuments for every record whose image URL returned vectors
var parallelOptions = new ParallelOptions { MaxDegreeOfParallelism = -1 };
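// MaxDegreeOfParallelism = -1 (above) removes the limit, letting the
// scheduler run as many vectorization calls concurrently as it sees fit;
// throttling is left to the service's 429 responses, which the
// ImageVectorizer handles (see the catch block below).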
await Parallel.ForEachAsync(records, parallelOptions, async (record, cancellationToken) =>
{
try
{
record.VectorizedImage = await vectorizer.VectorizeImage(record.ImageUrl);
indexDocuments.Add(record);
}
catch (Exception e)
{
// 429s (Throttling) are handled in the ImageVectorizer with a built-in retry-after
// policy; however, we occasionally see 400s (Bad Request) when object IDs no longer
// resolve to an IIIF URL. It's a small subset, so we just print the error and skip.
// If those objects were critical, they would need to be tracked so they could be
// corrected and re-processed; in practice we usually skip ~100.
//Console.WriteLine($"ERROR: {e.Message}\n{e.StackTrace}");
Console.WriteLine($"Unexpected issue getting vector data for object ({e.Message}). \n\tSkipping: [ ID = {record.ObjectID}, Title = {record.Title}, Url = {record.ImageUrl} ]");
}
});
Console.WriteLine($"Index documents created and enriched for {indexDocuments.Count} records. Bulk inserting in batches of 1000...");
// Bulk upload (in chunks of 1000 since there's an AI Search limit of 1000 per request)
var chunks = indexDocuments.Chunk(1000);
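// Enumerable.Chunk (available since .NET 6) yields arrays of at most
// 1000 documents; only the final chunk may be smaller.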
foreach (var chunk in chunks)
{
try
{
await indexManager.BulkInsert(chunk);
}
catch (Exception e)
{
Console.WriteLine($"Batch could not be saved to the index. Continuing. Error: {e.Message}");
}
}
Console.WriteLine("All data processed and index updated. Goodbye.");
}