in geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java [177:271]
/**
 * Reads a Geonames gazetteer file line by line and adds one Lucene document per
 * well-formed line to the given index writer.
 * <p>
 * Each line is split with {@code type.getSeparator()}. Every column except the last one
 * named in the internal field list is stored under its Geonames column name; in addition
 * the standard gazetteer fields ({@code hierarchy}, {@code placename}, {@code latitude},
 * {@code longitude}, {@code loctype}, {@code admincode}, {@code countrycode},
 * {@code countycode}, {@code locid}, {@code gazsource}) are added.
 * <p>
 * The writer is committed after every 100000 added documents. Documents added after the
 * last such commit are not committed by this method.
 * NOTE(review): presumably the caller commits or closes the writer afterwards - confirm.
 * <p>
 * The file is decoded as UTF-8. Lines with fewer columns than are indexed are reported on
 * standard output and skipped instead of aborting the whole run.
 *
 * @param gazateerInputData the Geonames data file to read
 * @param type              supplies the column separator; also printed in the final message
 * @param adms              admin boundaries keyed by lower-cased
 *                          {@code "<country code>.<admin1 code>"}
 * @param countrycodes      country names keyed by lower-cased country code; used when no
 *                          admin boundary is found
 * @param w                 the index writer that receives the documents
 * @throws IOException if reading the file or writing to the index fails
 */
public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type,
    Map<String, AdminBoundary> adms, Map<String, String> countrycodes,
    IndexWriter w) throws IOException {
  final List<String> fields = List.of("geonameid", "name", "asciiname", "alternatenames",
      "latitude", "longitude", "feature_class", "feature_code", "country code", "cc2",
      "admin1_code", "admin2_code", "admin3_code", "admin4_code", "population",
      "elevation", "dem ", "timezone", "modification_date");
  // Only the first fields.size() - 1 columns are indexed, i.e. the last listed column is
  // left out. NOTE(review): unclear whether skipping it is intentional - confirm.
  final int indexedColumns = fields.size() - 1;
  final String separator = type.getSeparator();
  int counter = 0;
  long lineNumber = 0;
  System.out.println("reading gazetteer data from file...........");
  String line;
  // Decode explicitly as UTF-8 rather than relying on the platform default charset.
  try (BufferedReader reader = new BufferedReader(
      new FileReader(gazateerInputData, java.nio.charset.StandardCharsets.UTF_8))) {
    while ((line = reader.readLine()) != null) {
      lineNumber++;
      // Limit -1 keeps trailing empty columns so a row is not shortened by empty values.
      String[] values = line.split(separator, -1);
      if (values.length < indexedColumns) {
        System.out.println("skipping malformed geonames line " + lineNumber
            + ": expected at least " + indexedColumns + " columns but found " + values.length);
        continue;
      }
      Document doc = new Document();
      String admincode = values[10].toLowerCase();
      String ccode = values[8].toLowerCase();
      // If the country column holds a comma separated list, use its first entry.
      if (ccode.contains(",")) {
        String[] codes = ccode.split(",");
        if (codes.length > 0) {
          ccode = codes[0];
        }
      }
      String adminKey = ccode + "." + admincode;
      AdminBoundary adm = adms.get(adminKey);
      String placeName = values[2];
      String lat = values[4];
      String lon = values[5];
      String dsg = values[7].toLowerCase();
      String id = values[0];
      String concatIndexEntry;
      String countryname;
      if (adm != null) {
        concatIndexEntry = adm.countryName() + ", " + adm.provinceName() + ", " + placeName;
        countryname = adm.countryName();
      } else {
        // There is no admin info, but the country code may still resolve to a country name.
        countryname = countrycodes.get(ccode);
        if (countryname != null) {
          concatIndexEntry = countryname + ", " + placeName;
        } else {
          // Don't want a single token hierarchy entry.
          concatIndexEntry = "";
        }
      }
      for (int i = 0; i < indexedColumns; i++) {
        doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
      }
      // For "pcli" entries the place name is replaced by the resolved country name.
      // Without a resolved name the original place name is kept, because a null field
      // value would be rejected when the field is created.
      if (dsg.equals("pcli") && countryname != null) {
        System.out.println("placename: " + placeName + " RESET TO: " + countryname);
        placeName = countryname;
      }
      /*
       * add standard fields to the index
       */
      doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES));
      doc.add(new TextField("placename", placeName, Field.Store.YES));
      doc.add(new TextField("latitude", lat, Field.Store.YES));
      doc.add(new TextField("longitude", lon, Field.Store.YES));
      doc.add(new StringField("loctype", dsg, Field.Store.YES));
      doc.add(new StringField("admincode", adminKey.toLowerCase(), Field.Store.YES));
      doc.add(new StringField("countrycode", ccode.toLowerCase(), Field.Store.YES));
      doc.add(new StringField("countycode", "", Field.Store.YES));
      doc.add(new StringField("locid", id, Field.Store.YES));
      // Debug output for one specific entry; the stripped name is only printed, the
      // indexed "placename" field above is not affected.
      if (id.equals("3175395")) {
        System.out.println(placeName.replace("republic of", "").replace("federative", ""));
      }
      doc.add(new StringField("gazsource", "geonames", Field.Store.YES));
      w.addDocument(doc);
      counter++;
      if (counter % 100000 == 0) {
        w.commit();
        System.out.println(counter + " .........Geonames entries committed to index..............");
      }
    }
  }
  System.out.println("Completed indexing geonames gaz! index name is: " + type.toString());
}