private HashMap read()

in opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymMap.java [126:228]


  /**
   * Parses the raw bytes of a synonym table and builds the word-to-synonyms index.
   *
   * <p>Records are located by scanning for an opening parenthesis. The decimal digits up to the
   * next comma form the group number, and the single-quoted token that follows (closed by a quote
   * immediately followed by a comma) is the word. Words rejected by {@code isValid}, or mapped
   * to {@code null} or the empty string by {@code analyze}, are skipped. A record cut off by the
   * end of the data is discarded.
   *
   * @param data the synonym table bytes; word bytes are decoded as UTF-8
   * @return the result of {@code optimize} applied to the index built by {@code createIndex}
   */
  private HashMap<String,String[]> read(byte[] data) {
    // Initial map capacities, carried over unchanged. Presumably the expected number of entries
    // divided by a 0.7 fill ratio so the maps need not grow while loading -- TODO confirm.
    final int wordCapacity = (int) (76401 / 0.7);
    final int groupCapacity = (int) (88022 / 0.7);

    // word -> groups containing it
    HashMap<String,ArrayList<Integer>> word2Groups = new HashMap<>(wordCapacity);
    // group -> words it contains
    HashMap<Integer,ArrayList<String>> group2Words = new HashMap<>(groupCapacity);
    // word -> canonical instance of that word
    final HashMap<String,String> internedWords = new HashMap<>(wordCapacity);

    final Charset utf8 = StandardCharsets.UTF_8;
    final int end = data.length;
    int pos = 0;
    int previousNum = -1;
    Integer previousGroup = null;

    while (pos < end) {
      /* Step 1: parse one record */

      // advance to the '(' that opens the record
      while (pos < end && data[pos] != '(') {
        pos++;
      }
      if (pos >= end) {
        break; // no further record
      }
      pos++;

      // accumulate the decimal group number up to the next ','
      int num = 0;
      for (; pos < end && data[pos] != ','; pos++) {
        num = 10 * num + (data[pos] - '0');
      }
      pos++;

      // advance past the quote that opens the word
      while (pos < end && data[pos] != '\'') {
        pos++;
      }
      pos++;

      // the word ends at the first quote that is directly followed by ','
      final int wordStart = pos;
      do {
        while (pos < end && data[pos] != '\'') {
          pos++;
        }
        pos++;
      } while (pos < end && data[pos] != ',');

      if (pos >= end) {
        break; // record truncated by end of data
      }
      // pos is on the ',' after the closing quote, hence the extra -1
      final String decoded =
          utf8.decode(ByteBuffer.wrap(data, wordStart, pos - wordStart - 1)).toString();

      /* Step 2: filter the word and let analyze() customize it */
      if (!isValid(decoded)) {
        continue;
      }
      final String analyzed = analyze(decoded);
      if (analyzed == null || analyzed.isEmpty()) {
        continue;
      }

      /* Step 3: record the (group, word) pair in both tables */

      // reuse one String instance per distinct word
      String word = internedWords.get(analyzed);
      if (word == null) {
        word = new String(analyzed);
        internedWords.put(word, word);
      }

      // reuse the boxed group while consecutive records share the same number
      if (num != previousNum) {
        previousGroup = num;
        previousNum = num;
      }
      final Integer group = previousGroup;

      // word --> group
      ArrayList<Integer> groupsOfWord = word2Groups.get(word);
      if (groupsOfWord != null) {
        groupsOfWord.add(group);
      } else {
        groupsOfWord = new ArrayList<>(1);
        groupsOfWord.add(group);
        word2Groups.put(word, groupsOfWord);
      }

      // group --> word
      ArrayList<String> wordsOfGroup = group2Words.get(group);
      if (wordsOfGroup != null) {
        wordsOfGroup.add(word);
      } else {
        wordsOfGroup = new ArrayList<>(1);
        wordsOfGroup.add(word);
        group2Words.put(group, wordsOfGroup);
      }
    }

    /* Step 4: build the index from the two tables */
    HashMap<String,String[]> word2Syns = createIndex(word2Groups, group2Words);

    /* Step 5: drop the intermediate tables, then compact the index */
    // Release the references before optimize() runs.
    // TODO: would clear() be more appropriate than nulling the references?
    word2Groups = null;
    group2Words = null;

    return optimize(word2Syns, internedWords);
  }