public static void readFile()

in geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/GeonamesProcessor.java [182:294]


  /**
   * Reads a delimited Geonames gazetteer file line by line and adds one Lucene document per
   * well-formed line to the supplied index writer.
   *
   * <p>Each line is split on {@code type.getSeparator()} and the columns are addressed by
   * position: 0 = id, 2 = place name, 4 = latitude, 5 = longitude, 7 = designation code,
   * 8 = country code(s), 10 = admin code. Every raw column except the last listed one is stored
   * under its column name; in addition the derived fields {@code hierarchy}, {@code placename},
   * {@code latitude}, {@code longitude}, {@code loctype}, {@code admincode}, {@code countrycode},
   * {@code countycode}, {@code locid} and {@code gazsource} are added.
   *
   * <p>The writer is committed after every 100000 documents. No commit is issued after the last
   * partial batch and the writer is not closed here.
   *
   * @param gazateerInputData the gazetteer file to read; decoded as UTF-8
   * @param type supplies the column separator (used as a regular expression by
   *     {@link String#split(String, int)}) and the name printed in the completion message
   * @param adms admin boundaries keyed by {@code "<countrycode>.<admincode>"} in lower case
   * @param countrycodes country names keyed by lower-case country code; used when no admin
   *     boundary matches
   * @param w the index writer that receives the documents
   * @throws Exception if reading the file or writing to the index fails
   */
  public static void readFile(File gazateerInputData, GazetteerIndexer.GazType type, Map<String, AdminBoundary> adms, Map<String, String> countrycodes, IndexWriter w) throws Exception {

    String[] fieldStrings = new String[]{
      "geonameid",
      "name",
      "asciiname",
      "alternatenames",
      "latitude",
      "longitude",
      "feature_class",
      "feature_code",
      "country code",
      "cc2",
      "admin1_code",
      "admin2_code",
      "admin3_code",
      "admin4_code",
      "population",
      "elevation",
      "dem ",
      "timezone",
      "modification_date"};

    List<String> fields = Arrays.asList(fieldStrings);
    // NOTE(review): the raw-column loop has always stopped one short of the list, so
    // "modification_date" is never indexed. Kept as-is; confirm whether that is intended.
    int rawColumnCount = fields.size() - 1;

    int counter = 0;
    long lineNumber = 0;
    System.out.println("reading gazetteer data from file...........");

    // try-with-resources guarantees the file handle is released even when indexing fails.
    // The charset is explicit so the result does not depend on the platform default.
    try (BufferedReader reader = new BufferedReader(new java.io.InputStreamReader(
        new java.io.FileInputStream(gazateerInputData), java.nio.charset.StandardCharsets.UTF_8))) {
      String line;
      while ((line = reader.readLine()) != null) {
        lineNumber++;

        // limit -1 keeps trailing empty columns, which a plain split() would silently drop
        String[] values = line.split(type.getSeparator(), -1);
        if (values.length < rawColumnCount) {
          // a blank or truncated line must not abort the whole indexing run
          System.err.println("skipping malformed geonames line " + lineNumber + ": expected at least "
              + rawColumnCount + " columns but found " + values.length);
          continue;
        }

        String id = values[0];
        String placeName = values[2];
        String lat = values[4];
        String lon = values[5];
        String dsg = values[7].toLowerCase();
        String admincode = values[10].toLowerCase();
        String ccode = values[8].toLowerCase();
        if (ccode.contains(",")) {
          // several country codes may be listed; only the first one is used
          String[] codes = ccode.split(",");
          if (codes.length > 0) {
            ccode = codes[0];
          }
        }
        String adminKey = ccode + "." + admincode;
        AdminBoundary adm = adms.get(adminKey);

        String concatIndexEntry;
        String countryname;
        if (adm != null) {
          countryname = adm.getCountryName();
          concatIndexEntry = countryname + ", " + adm.getProvinceName() + ", " + placeName;
        } else {
          //there is no admin info, but we can still use the countrycode to concat the country name
          countryname = countrycodes.get(ccode);
          if (countryname != null) {
            concatIndexEntry = countryname + ", " + placeName;
          } else {
            ///don't want a single token hierarchy entry.
            concatIndexEntry = "";
          }
        }

        Document doc = new Document();
        for (int i = 0; i < rawColumnCount; i++) {
          doc.add(new TextField(fields.get(i), values[i].trim(), Field.Store.YES));
        }

        // Entries with designation "pcli" are indexed under the resolved country name.
        // Only reset when a name is actually known: a null field value is rejected by Lucene.
        if (dsg.equals("pcli") && countryname != null) {
          System.out.println("placename: " + placeName + " RESET TO: " + countryname);
          placeName = countryname;
        }

        /*
         * add standard fields to the index
         */
        doc.add(new TextField("hierarchy", concatIndexEntry, Field.Store.YES));
        doc.add(new TextField("placename", placeName, Field.Store.YES));
        doc.add(new TextField("latitude", lat, Field.Store.YES));
        doc.add(new TextField("longitude", lon, Field.Store.YES));
        doc.add(new StringField("loctype", dsg, Field.Store.YES));
        doc.add(new StringField("admincode", adminKey, Field.Store.YES));
        doc.add(new StringField("countrycode", ccode, Field.Store.YES));
        doc.add(new StringField("countycode", "", Field.Store.YES));
        doc.add(new StringField("locid", id, Field.Store.YES));
        doc.add(new StringField("gazsource", "geonames", Field.Store.YES));
        // NOTE(review): the previous code stripped "republic of"/"federative" from the place name
        // only after it had been indexed, so it never affected the index; that dead step was
        // dropped. Confirm whether normalisation before indexing was intended.

        w.addDocument(doc);

        counter++;
        if (counter % 100000 == 0) {
          w.commit();
          System.out.println(counter + " .........Geonames entries committed to index..............");
        }
      }
    }

    System.out.println("Completed indexing geonames gaz! index name is: " + type.toString());
  }