public static void readFile()

in geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java [177:271]


  /**
   * Reads a GeoNames gazetteer file line by line and adds one Lucene document per row to the
   * supplied index writer.
   * <p>
   * Each row is split with the separator of {@code type}. All columns except the last one
   * ({@code modification_date}) are stored under their GeoNames column name; in addition the
   * derived fields {@code hierarchy}, {@code placename}, {@code latitude}, {@code longitude},
   * {@code loctype}, {@code admincode}, {@code countrycode}, {@code countycode}, {@code locid}
   * and {@code gazsource} are added. Rows with too few columns are skipped with a warning on
   * {@code System.err}.
   * <p>
   * The writer is committed after every 100000 added documents. Documents added after the last
   * full batch are not committed by this method.
   *
   * @param gazateerInputData the gazetteer file to read; decoded as UTF-8
   * @param type              supplies the column separator (used as a regular expression) and is
   *                          named in the completion message
   * @param adms              admin boundaries, looked up by lower-cased
   *                          {@code "<country code>.<admin1 code>"}
   * @param countrycodes      country names, looked up by lower-cased country code; used when no
   *                          admin boundary is found
   * @param w                 the index writer that receives the documents
   * @throws IOException if the file cannot be read or the index cannot be written
   */
  public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type,
                              Map<String, AdminBoundary> adms, Map<String, String> countrycodes,
                              IndexWriter w) throws IOException {

    final List<String> fields = List.of("geonameid", "name", "asciiname", "alternatenames",
            "latitude", "longitude", "feature_class", "feature_code", "country code", "cc2",
            "admin1_code", "admin2_code", "admin3_code", "admin4_code", "population",
            "elevation", "dem ", "timezone", "modification_date");
    // The last column (modification_date) is not written to the index.
    final int indexedColumns = fields.size() - 1;
    final String separator = type.getSeparator();

    int counter = 0;
    int lineNumber = 0;
    System.out.println("reading gazetteer data from file...........");
    String line;

    // GeoNames dumps are UTF-8; decode explicitly instead of relying on the platform charset.
    try (BufferedReader reader = new BufferedReader(new java.io.InputStreamReader(
            new java.io.FileInputStream(gazateerInputData),
            java.nio.charset.StandardCharsets.UTF_8))) {
      while ((line = reader.readLine()) != null) {
        lineNumber++;
        // A limit of -1 keeps trailing empty columns, so rows whose last columns are blank
        // still yield one array element per column.
        String[] values = line.split(separator, -1);
        if (values.length < indexedColumns) {
          System.err.println("skipping malformed geonames line " + lineNumber + ": expected at least "
                  + indexedColumns + " columns but found " + values.length);
          continue;
        }

        // NOTE(review): toLowerCase() uses the default locale; it is left as is because the keys
        // of adms and countrycodes are built outside this method and must match.
        String admincode = values[10].toLowerCase();
        String ccode = values[8].toLowerCase();
        if (ccode.contains(",")) {
          // Several country codes in one column: use the first one.
          String[] codes = ccode.split(",");
          if (codes.length > 0) {
            ccode = codes[0];
          }
        }
        AdminBoundary adm = adms.get(ccode + "." + admincode);

        String id = values[0];
        String placeName = values[2];
        String lat = values[4];
        String lon = values[5];
        String dsg = values[7].toLowerCase();

        String concatIndexEntry;
        String countryname;
        if (adm != null) {
          concatIndexEntry = adm.countryName() + ", " + adm.provinceName() + ", " + placeName;
          countryname = adm.countryName();
        } else {
          // There is no admin info, but the country code may still resolve to a country name.
          countryname = countrycodes.get(ccode);
          if (countryname != null) {
            concatIndexEntry = countryname + ", " + placeName;
          } else {
            // Don't want a single token hierarchy entry.
            concatIndexEntry = "";
          }
        }

        Document doc = new Document();
        for (int i = 0; i < indexedColumns; i++) {
          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
        }

        // Rows with feature code "pcli" are indexed under the resolved country name. When no
        // country name could be resolved the original name is kept, because a Lucene field
        // value must not be null.
        if (dsg.equals("pcli") && countryname != null) {
          System.out.println("placename: " + placeName + " RESET TO: " + countryname);
          placeName = countryname;
        }

        /*
         * add standard fields to the index
         * (latitude and longitude are added a second time here, untrimmed, in addition to the
         * raw columns stored above)
         */
        doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES));
        doc.add(new TextField("placename", placeName, Field.Store.YES));
        doc.add(new TextField("latitude", lat, Field.Store.YES));
        doc.add(new TextField("longitude", lon, Field.Store.YES));
        doc.add(new StringField("loctype", dsg, Field.Store.YES));
        doc.add(new StringField("admincode", ccode + "." + admincode, Field.Store.YES));
        doc.add(new StringField("countrycode", ccode, Field.Store.YES));
        doc.add(new StringField("countycode", "", Field.Store.YES));
        doc.add(new StringField("locid", id, Field.Store.YES));
        doc.add(new StringField("gazsource", "geonames", Field.Store.YES));

        w.addDocument(doc);

        counter++;
        if (counter % 100000 == 0) {
          w.commit();
          System.out.println(counter + " .........Geonames entries committed to index..............");
        }
      }
    }
    System.out.println("Completed indexing geonames gaz! index name is: " + type);
  }