in JfkWebApiSkills/JfkInitializer/SearchResources.cs [19:303]
/// <summary>
/// Builds the skillset definition used by the JFK Files enrichment pipeline.
/// </summary>
/// <remarks>
/// The skillset combines built-in skills (OCR, image analysis, text merge, split, language
/// detection, entity recognition, shaper) with three custom Web API skills. The Web API skill
/// URIs are composed from the "AzureFunctionSiteName" and "AzureFunctionHostKey" app settings,
/// and the attached account key is read from the "CognitiveServicesAccountKey" app setting.
/// This method only constructs the definition object.
/// </remarks>
/// <param name="name">Name given to the skillset.</param>
/// <param name="blobContainerNameForImageStore">
/// Value sent in the "BlobContainerName" HTTP header of the image-store Web API skill.
/// </param>
/// <returns>The populated <see cref="SearchIndexerSkillset"/>.</returns>
public static SearchIndexerSkillset GetSkillset(string name, string blobContainerNameForImageStore)
{
    // Base address and key shared by all three custom Web API skills.
    string functionSiteName = ConfigurationManager.AppSettings["AzureFunctionSiteName"];
    string functionHostKey = ConfigurationManager.AppSettings["AzureFunctionHostKey"];
    string functionBaseUri = $"https://{functionSiteName}.azurewebsites.net";

    // --- Image enrichment: runs once per normalized image ---

    var ocrSkill = new OcrSkill(
        inputs: new List<InputFieldMappingEntry>
        {
            new InputFieldMappingEntry("image") { Source = "/document/normalized_images/*" }
        },
        outputs: new List<OutputFieldMappingEntry>
        {
            new OutputFieldMappingEntry("text"),
            new OutputFieldMappingEntry("layoutText")
        })
    {
        Context = "/document/normalized_images/*",
        DefaultLanguageCode = OcrSkillLanguage.En
    };

    var imageAnalysisSkill = new ImageAnalysisSkill(
        inputs: new List<InputFieldMappingEntry>
        {
            new InputFieldMappingEntry("image") { Source = "/document/normalized_images/*" }
        },
        outputs: new List<OutputFieldMappingEntry>
        {
            new OutputFieldMappingEntry("tags") { TargetName = "Tags" },
            new OutputFieldMappingEntry("description") { TargetName = "Description" }
        })
    {
        Context = "/document/normalized_images/*",
        VisualFeatures = { VisualFeature.Tags, VisualFeature.Description },
        Details = { ImageDetail.Celebrities },
        DefaultLanguageCode = ImageAnalysisSkillLanguage.En
    };

    // --- Text assembly: three chained merges, each consuming the previous merge's output ---

    var mergeOcrTextSkill = new MergeSkill(
        inputs: new List<InputFieldMappingEntry>
        {
            new InputFieldMappingEntry("text") { Source = "/document/content" },
            new InputFieldMappingEntry("itemsToInsert") { Source = "/document/normalized_images/*/text" },
            new InputFieldMappingEntry("offsets") { Source = "/document/normalized_images/*/contentOffset" }
        },
        outputs: new List<OutputFieldMappingEntry>
        {
            new OutputFieldMappingEntry("mergedText") { TargetName = "nativeTextAndOcr" }
        })
    {
        Description = "Merge native text content and inline OCR content where images were present",
        Context = "/document"
    };

    var mergeCaptionsSkill = new MergeSkill(
        inputs: new List<InputFieldMappingEntry>
        {
            new InputFieldMappingEntry("text") { Source = "/document/nativeTextAndOcr" },
            new InputFieldMappingEntry("itemsToInsert") { Source = "/document/normalized_images/*/Description/captions/*/text" }
        },
        outputs: new List<OutputFieldMappingEntry>
        {
            new OutputFieldMappingEntry("mergedText") { TargetName = "fullTextAndCaptions" }
        })
    {
        Description = "Merge text content with image captions",
        Context = "/document"
    };

    var mergeTagsSkill = new MergeSkill(
        inputs: new List<InputFieldMappingEntry>
        {
            new InputFieldMappingEntry("text") { Source = "/document/fullTextAndCaptions" },
            new InputFieldMappingEntry("itemsToInsert") { Source = "/document/normalized_images/*/Tags/*/name" }
        },
        outputs: new List<OutputFieldMappingEntry>
        {
            new OutputFieldMappingEntry("mergedText") { TargetName = "finalText" }
        })
    {
        Description = "Merge text content with image tags",
        Context = "/document"
    };

    // --- Text analysis over the merged text ---

    var splitSkill = new SplitSkill(
        inputs: new List<InputFieldMappingEntry>
        {
            new InputFieldMappingEntry("text") { Source = "/document/finalText" }
        },
        outputs: new List<OutputFieldMappingEntry>
        {
            new OutputFieldMappingEntry("textItems") { TargetName = "pages" }
        })
    {
        Description = "Split text into pages for subsequent skill processing",
        Context = "/document/finalText",
        TextSplitMode = TextSplitMode.Pages,
        MaximumPageLength = 5000
    };

    var languageDetectionSkill = new LanguageDetectionSkill(
        inputs: new List<InputFieldMappingEntry>
        {
            new InputFieldMappingEntry("text") { Source = "/document/finalText" }
        },
        outputs: new List<OutputFieldMappingEntry>
        {
            new OutputFieldMappingEntry("languageCode")
        });

    // Runs per page produced by the split skill (context and text source are both the pages path).
    var entityRecognitionSkill = new EntityRecognitionSkill(
        inputs: new List<InputFieldMappingEntry>
        {
            new InputFieldMappingEntry("text") { Source = "/document/finalText/pages/*" },
            new InputFieldMappingEntry("languageCode") { Source = "/document/languageCode" }
        },
        outputs: new List<OutputFieldMappingEntry>
        {
            new OutputFieldMappingEntry("persons") { TargetName = "people" },
            new OutputFieldMappingEntry("locations"),
            new OutputFieldMappingEntry("organizations"),
            new OutputFieldMappingEntry("namedEntities") { TargetName = "entities" }
        },
        skillVersion: EntityRecognitionSkill.SkillVersion.V3)
    {
        Context = "/document/finalText/pages/*",
        Categories = { EntityCategory.Person, EntityCategory.Location, EntityCategory.Organization },
    };

    // --- HOCR support: per-image metadata plus the custom Web API skills ---

    // Reads imageStoreUri, which is the output name of the image-store Web API skill below.
    var ocrImageMetadataSkill = new ShaperSkill(
        inputs: new List<InputFieldMappingEntry>
        {
            new InputFieldMappingEntry("layoutText") { Source = "/document/normalized_images/*/layoutText" },
            new InputFieldMappingEntry("imageStoreUri") { Source = "/document/normalized_images/*/imageStoreUri" },
            new InputFieldMappingEntry("width") { Source = "/document/normalized_images/*/width" },
            new InputFieldMappingEntry("height") { Source = "/document/normalized_images/*/height" }
        },
        outputs: new List<OutputFieldMappingEntry>
        {
            new OutputFieldMappingEntry("output") { TargetName = "ocrImageMetadata" }
        })
    {
        Description = "Create a custom OCR image metadata object used to generate an HOCR document",
        Context = "/document/normalized_images/*"
    };

    var imageStoreSkill = new WebApiSkill(
        inputs: new List<InputFieldMappingEntry>
        {
            new InputFieldMappingEntry("imageData") { Source = "/document/normalized_images/*/data" }
        },
        outputs: new List<OutputFieldMappingEntry>
        {
            new OutputFieldMappingEntry("imageStoreUri")
        },
        uri: $"{functionBaseUri}/api/image-store?code={functionHostKey}")
    {
        Description = "Upload image data to the annotation store",
        Context = "/document/normalized_images/*",
        HttpHeaders =
        {
            // The target container name travels as a request header.
            ["BlobContainerName"] = blobContainerNameForImageStore
        },
        BatchSize = 1
    };

    // Reads /document/cryptonyms, which matches the output name of the cryptonym linker skill below.
    var hocrGeneratorSkill = new WebApiSkill(
        inputs: new List<InputFieldMappingEntry>
        {
            new InputFieldMappingEntry("ocrImageMetadataList") { Source = "/document/normalized_images/*/ocrImageMetadata" },
            new InputFieldMappingEntry("wordAnnotations") { Source = "/document/cryptonyms" }
        },
        outputs: new List<OutputFieldMappingEntry>
        {
            new OutputFieldMappingEntry("hocrDocument")
        },
        uri: $"{functionBaseUri}/api/hocr-generator?code={functionHostKey}")
    {
        Description = "Generate HOCR for webpage rendering",
        Context = "/document",
        BatchSize = 1,
    };

    var cryptonymLinkerSkill = new WebApiSkill(
        inputs: new List<InputFieldMappingEntry>
        {
            new InputFieldMappingEntry("words") { Source = "/document/normalized_images/*/layoutText/words/*/text" }
        },
        outputs: new List<OutputFieldMappingEntry>
        {
            new OutputFieldMappingEntry("cryptonyms")
        },
        uri: $"{functionBaseUri}/api/link-cryptonyms-list?code={functionHostKey}")
    {
        Description = "Cryptonym linker",
        Context = "/document",
        BatchSize = 1
    };

    // Skills are listed in the same order as they were originally declared.
    var skills = new List<SearchIndexerSkill>
    {
        ocrSkill,
        imageAnalysisSkill,
        mergeOcrTextSkill,
        mergeCaptionsSkill,
        mergeTagsSkill,
        splitSkill,
        languageDetectionSkill,
        entityRecognitionSkill,
        ocrImageMetadataSkill,
        imageStoreSkill,
        hocrGeneratorSkill,
        cryptonymLinkerSkill
    };

    return new SearchIndexerSkillset(name: name, skills: skills)
    {
        Name = name,
        Description = "JFK Files Skillset",
        CognitiveServicesAccount = new CognitiveServicesAccountKey(key: ConfigurationManager.AppSettings["CognitiveServicesAccountKey"])
    };
}