public static void readFile()

in geoentitylinker-addon/src/main/java/opennlp/addons/geoentitylinker/indexing/USGSProcessor.java [59:182]


  /**
   * Reads a delimited USGS gazetteer file and indexes its rows.
   *
   * <p>The first line is treated as the header; its column names become Lucene field names.
   * For every following row, each column except the last one is stored as a {@code TextField}
   * under its column name, and a set of normalized fields ({@code hierarchy}, {@code placename},
   * {@code latitude}, {@code longitude}, {@code loctype}, {@code admincode}, {@code countrycode},
   * {@code countycode}, {@code locid}, {@code gazsource}) is added on top.
   *
   * <p>A row is skipped (with a message on standard out) when it has fewer than 11 columns,
   * when {@code lookupMap} has no entry for {@code column[3] + "." + column[6]}, or when its
   * latitude/longitude columns are not parseable numbers. The index is committed every
   * 100000 counted lines and once more at the end.
   *
   * <p>After the file has been consumed, one document per province name seen is added, located
   * at the arithmetic mean of the coordinates of that province's rows, followed by a single
   * document for the United States with fixed coordinates.
   *
   * @param gazetteerInput the USGS gazetteer file; decoded as UTF-8
   * @param w              the index writer that receives the documents
   * @param type           supplies the column separator (used as a regular expression by
   *                       {@link String#split(String)})
   * @param lookupMap      admin boundaries keyed by {@code admincode + "." + ccode}
   * @throws IOException if reading the file or writing to the index fails
   */
  public static void readFile(File gazetteerInput, IndexWriter w, GazetteerIndexer.GazType type,
                              Map<String, AdminBoundary> lookupMap) throws IOException {

    // Highest fixed column index read below is 10 (longitude).
    final int minColumns = 11;
    final String separator = type.getSeparator();

    Map<String, StateCentroid> states = new HashMap<>();
    // Decode explicitly as UTF-8 instead of relying on the platform default charset.
    // NOTE(review): assumes the USGS export is UTF-8 (the BOM handling below suggests so).
    try (BufferedReader reader = new BufferedReader(new java.io.InputStreamReader(
        new java.io.FileInputStream(gazetteerInput), java.nio.charset.StandardCharsets.UTF_8))) {

      List<String> fields = new ArrayList<>();
      int counter = 0;
      System.out.println("reading gazetteer data from USGS file...........");
      String line;
      while ((line = reader.readLine()) != null) {

        String[] values = line.split(separator);
        if (counter == 0) {
          // Header line: remember the column names, minus any byte-order-mark residue.
          for (String columnName : values) {
            String cleaned = columnName
                .replace("\uFEFF", "")              // BOM as decoded from UTF-8
                .replace("\u00EF\u00BB\u00BF", "")  // BOM bytes decoded with a single-byte charset
                .replace("»¿", "")                  // legacy pattern kept for compatibility
                .trim();
            fields.add(cleaned);
          }

        } else {
          // String.split drops trailing empty columns, so a row can be shorter than the header.
          if (values.length < minColumns) {
            System.out.println("skipping malformed USGS row with " + values.length
                + " column(s): " + line);
            continue;
          }

          String id = values[0];
          String placeName = values[1];
          String dsg = values[2];
          String admincode = values[3];
          String ccode = values[6];
          String lat = values[9];
          String lon = values[10];

          AdminBoundary boundary = lookupMap.get(admincode + "." + ccode);
          if (boundary == null) {
            System.out.println("null...continuing to index" + " ccode: " + ccode + " , admincode: " + admincode + " , placename: " + placeName);
            continue;
          }

          // Parse the coordinates once, before touching any shared state, so that a bad row
          // is dropped cleanly instead of aborting the whole indexing run.
          final double latitude;
          final double longitude;
          try {
            latitude = Double.parseDouble(lat);
            longitude = Double.parseDouble(lon);
          } catch (NumberFormatException e) {
            System.out.println("skipping USGS row with unparseable coordinates, lat: " + lat
                + " , lon: " + lon + " , placename: " + placeName);
            continue;
          }

          Document doc = new Document();
          // Store every raw column except the last one under its header name; columns that
          // split() dropped because they were trailing and empty are stored as "".
          for (int i = 0; i < fields.size() - 1; i++) {
            String raw = i < values.length ? values[i].trim() : "";
            doc.add(new TextField(fields.get(i), raw, Field.Store.YES));
          }

          String countyname = "";
          if (!boundary.countyName().equals("NO_DATA_FOUND_VALUE")) {
            countyname = boundary.countyName();
          }
          // NOTE(review): unlike the county name, the county code is indexed as-is even when
          // it is the NO_DATA_FOUND_VALUE placeholder (the previous code re-assigned the same
          // value in that check) - confirm whether it should be blanked instead.
          String countyCode = boundary.countyCode();

          String provinceName = boundary.provinceName();
          String hierarchy = boundary.countryName() + ", " + provinceName + ", " + countyname + ", " + placeName;

          // Accumulate per-province sums so a centroid can be derived after the file is read.
          StateCentroid centroid = states.get(provinceName);
          if (centroid == null) {
            centroid = new StateCentroid();
            centroid.statecode = boundary.getProvCode();
            centroid.count = 1;
            centroid.latSum = latitude;
            centroid.longSum = longitude;
            states.put(provinceName, centroid);
          } else {
            centroid.count++;
            centroid.latSum += latitude;
            centroid.longSum += longitude;
          }

          // Locale.ROOT keeps the codes stable regardless of the JVM default locale
          // (e.g. Turkish lower-casing of 'I').
          String countryCode = boundary.countryCode();
          String provCode = boundary.getProvCode();

          doc.add(new TextField("hierarchy", hierarchy, Field.Store.YES));
          doc.add(new TextField("placename", placeName, Field.Store.YES));
          doc.add(new TextField("latitude", lat, Field.Store.YES));
          doc.add(new TextField("longitude", lon, Field.Store.YES));
          doc.add(new StringField("loctype", dsg, Field.Store.YES));
          doc.add(new StringField("admincode",
              (countryCode + "." + provCode).toLowerCase(java.util.Locale.ROOT), Field.Store.YES));
          doc.add(new StringField("countrycode",
              countryCode.toLowerCase(java.util.Locale.ROOT), Field.Store.YES));
          doc.add(new StringField("countycode",
              (countryCode + "." + provCode + "." + countyCode).toLowerCase(java.util.Locale.ROOT),
              Field.Store.YES));

          doc.add(new StringField("locid", id, Field.Store.YES));
          doc.add(new StringField("gazsource", "usgs", Field.Store.YES));
          w.addDocument(doc);
        }
        // Skipped rows 'continue' above and therefore do not count towards the commit cadence.
        counter++;
        if (counter % 100000 == 0) {
          w.commit();
          System.out.println(counter + " .........USGS entries committed to index..............");
        }

      }
    }

    // One synthetic document per province, placed at the mean of its rows' coordinates.
    for (Map.Entry<String, StateCentroid> entry : states.entrySet()) {
      String state = entry.getKey();
      StateCentroid centroid = entry.getValue();
      Document stateDoc = new Document();
      stateDoc.add(new TextField("hierarchy", "united states, " + state, Field.Store.YES));
      stateDoc.add(new TextField("placename", state, Field.Store.YES));
      stateDoc.add(new TextField("latitude", (centroid.latSum / centroid.count) + "", Field.Store.YES));
      stateDoc.add(new TextField("longitude", (centroid.longSum / centroid.count) + "", Field.Store.YES));
      stateDoc.add(new StringField("loctype", "adm1", Field.Store.YES));
      // NOTE(review): stored as the bare province code, whereas place documents use
      // lower-cased "country.province" - confirm this difference is intended.
      stateDoc.add(new StringField("admincode", centroid.statecode, Field.Store.YES));
      stateDoc.add(new StringField("countrycode", "us", Field.Store.YES));
      stateDoc.add(new StringField("countycode", "", Field.Store.YES));

      stateDoc.add(new StringField("locid", "us_state:" + state, Field.Store.YES));
      stateDoc.add(new StringField("gazsource", "usgs", Field.Store.YES));
      w.addDocument(stateDoc);
    }

    // Country-level document; the coordinates are a fixed constant, not computed from the data.
    Document countryDoc = new Document();
    countryDoc.add(new TextField("hierarchy", "united states", Field.Store.YES));
    countryDoc.add(new TextField("placename", "united states", Field.Store.YES));
    countryDoc.add(new TextField("latitude", 39.0 + "", Field.Store.YES));
    countryDoc.add(new TextField("longitude", -103.0 + "", Field.Store.YES));
    countryDoc.add(new StringField("loctype", "pcli", Field.Store.YES));
    countryDoc.add(new StringField("admincode", "", Field.Store.YES));
    countryDoc.add(new StringField("countrycode", "us", Field.Store.YES));
    countryDoc.add(new StringField("countycode", "", Field.Store.YES));

    countryDoc.add(new StringField("locid", "us_centroid" + "unitedstates", Field.Store.YES));
    countryDoc.add(new StringField("gazsource", "usgs", Field.Store.YES));

    w.addDocument(countryDoc);
    w.commit();

    System.out.println("Completed indexing USGS gaz!");
  }