in geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GazetteerIndexer.java [115:171]
/**
 * Builds the gazetteer Lucene index and the country context file from the supplied
 * Geonames, USGS and regions input files.
 * <p>
 * The index is written to the {@code opennlp_geoentitylinker_gazetteer} sub-path of
 * {@code outputIndexDir}. The country context file is truncated and given a header row
 * first; it is then passed, together with the index writer, to {@code USGSProcessor},
 * {@code GeonamesProcessor} and {@code RegionProcessor} (in that order). A single commit
 * is issued after all three processors have returned.
 *
 * @param geonamesData             the Geonames data file; must exist
 * @param geoNamesCountryInfo      the Geonames country info file; must exist
 * @param geonamesAdmin1CodesASCII the Geonames admin1 codes file; must exist
 * @param usgsDataFile             the USGS data file; must exist
 * @param usgsGovUnitsFile         the USGS government units file; must exist
 * @param outputIndexDir           an existing directory under which the index is created
 * @param outputCountryContextFile the country context file to (over)write
 * @param regionsFile              the regions file; must exist
 * @throws IllegalArgumentException if {@code outputIndexDir} is not an existing directory
 * @throws FileNotFoundException    if any of the required input files does not exist
 * @throws IOException              if reading the inputs or writing the index or the
 *                                  context file fails
 */
public void index(File geonamesData, File geoNamesCountryInfo, File geonamesAdmin1CodesASCII, File usgsDataFile,
                  File usgsGovUnitsFile, File outputIndexDir, File outputCountryContextFile, File regionsFile)
    throws IOException {
  // isDirectory() is false for a path that does not exist, so this single check also
  // covers the "output directory is missing" case; a separate exists() test on
  // outputIndexDir after it would be unreachable.
  if (!outputIndexDir.isDirectory()) {
    throw new IllegalArgumentException("outputIndexDir must be a directory.");
  }
  if (!geonamesData.exists()) {
    throw new FileNotFoundException("geonames data file does not exist");
  }
  if (!geoNamesCountryInfo.exists()) {
    throw new FileNotFoundException("geoNamesCountryCodes data file does not exist");
  }
  if (!geonamesAdmin1CodesASCII.exists()) {
    throw new FileNotFoundException("geonamesAdmin1CodesASCII data file does not exist");
  }
  if (!usgsDataFile.exists()) {
    throw new FileNotFoundException("usgsDataFile data file does not exist");
  }
  if (!usgsGovUnitsFile.exists()) {
    throw new FileNotFoundException("usgsGovUnitsFile data file does not exist");
  }
  if (!regionsFile.exists()) {
    throw new FileNotFoundException("regionsFile data file does not exist");
  }

  String indexloc = outputIndexDir.getPath() + "/opennlp_geoentitylinker_gazetteer";

  // Default analyzer is a StandardAnalyzer with an empty stop-word set; the code-like
  // fields below are mapped to KeywordAnalyzer instead.
  Analyzer defaultAnalyzer = new StandardAnalyzer(new CharArraySet(new ArrayList<>(), true));
  Map<String, Analyzer> perFieldAnalyzers = new HashMap<>();
  perFieldAnalyzers.put("countrycode", new KeywordAnalyzer());
  perFieldAnalyzers.put("admincode", new KeywordAnalyzer());
  perFieldAnalyzers.put("loctype", new KeywordAnalyzer());
  perFieldAnalyzers.put("countycode", new KeywordAnalyzer());
  perFieldAnalyzers.put("gazsource", new KeywordAnalyzer());
  PerFieldAnalyzerWrapper aWrapper = new PerFieldAnalyzerWrapper(defaultAnalyzer, perFieldAnalyzers);
  IndexWriterConfig config = new IndexWriterConfig(aWrapper);

  // The Directory is managed by the same try-with-resources as the writer: closing an
  // IndexWriter does not close its Directory, so it previously leaked. Resources are
  // closed in reverse order, i.e. writer first, then directory.
  try (Directory index = new MMapDirectory(Paths.get(indexloc));
       IndexWriter w = new IndexWriter(index, config)) {
    // write the column headers for the countryContextFile (append=false truncates it)
    try (FileWriter writer = new FileWriter(outputCountryContextFile, false)) {
      String colNamesForCountryContextFile = "countrycode\tprovcode\tcountycode\tcountryname\tprovincename\tcountyname\tcountryregex\tprovregex\tcountyregex\n";
      writer.write(colNamesForCountryContextFile);
      writer.flush();
    }
    USGSProcessor.process(usgsGovUnitsFile, usgsDataFile, outputCountryContextFile, w);
    GeonamesProcessor.process(geoNamesCountryInfo, geonamesAdmin1CodesASCII, geonamesData, outputCountryContextFile, w);
    RegionProcessor.process(regionsFile, outputCountryContextFile, w);
    w.commit();
  }

  System.out.println("\nIndexing complete. Be sure to add '" + indexloc + "' and context file '" +
      outputCountryContextFile.getPath() + "' to entitylinker.properties file");
}