in opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymMap.java [126:228]
/**
 * Parses raw synonym data and builds the word-to-synonyms index.
 *
 * <p>The parser scans for records of the shape {@code (<digits>,...'<word>',...}:
 * the decimal number after {@code '('} is taken as the group id, and the text
 * between the next single quote and the closing {@code "',"} is taken as the
 * word (decoded as UTF-8). Presumably this is the WordNet prolog format;
 * confirm against the caller that supplies {@code data}.
 *
 * <p>Words rejected by {@code isValid}, or mapped to {@code null} or the empty
 * string by {@code analyze}, are skipped. Parsing stops silently at the first
 * record that is cut off by the end of the data.
 *
 * @param data the complete synonym file content
 * @return the index produced by {@code createIndex} after being passed
 *         through {@code optimize}
 */
private HashMap<String,String[]> read(byte[] data) {
  // presizing: entry counts divided by 0.7 so the tables start large enough
  int wordCapacity = (int) (76401 / 0.7);
  int groupCapacity = (int) (88022 / 0.7);
  HashMap<String,ArrayList<Integer>> word2Groups = new HashMap<>(wordCapacity); // word --> group ids
  HashMap<Integer,ArrayList<String>> group2Words = new HashMap<>(groupCapacity); // group id --> words
  HashMap<String,String> internedWords = new HashMap<>(wordCapacity); // word --> canonical instance

  int lastNum = -1;
  Integer lastGroup = null;
  final int len = data.length;
  int i = 0;
  while (i < len) { // until EOF
    /* Part A: parse one record */
    // scan to beginning of group
    while (i < len && data[i] != '(') {
      i++;
    }
    if (i >= len) {
      break; // EOF
    }
    i++;

    // parse group id: decimal digits up to the next ','
    int num = 0;
    while (i < len && data[i] != ',') {
      num = 10 * num + (data[i] - '0');
      i++;
    }
    i++;

    // scan to the opening quote of the word
    while (i < len && data[i] != '\'') {
      i++;
    }
    i++;

    // scan to end of word; a quote only terminates the word when followed by ','
    int start = i;
    do {
      while (i < len && data[i] != '\'') {
        i++;
      }
      i++;
    } while (i < len && data[i] != ',');
    if (i >= len) {
      break; // EOF inside a record
    }
    // i now points at the ',' after the closing quote, hence the -1
    String word = new String(data, start, i - start - 1, StandardCharsets.UTF_8);

    /*
     * Part B: ignore words rejected by isValid and let the user customize
     * the word via analyze (e.g. do some stemming)
     */
    if (!isValid(word)) {
      continue;
    }
    word = analyze(word);
    if (word == null || word.length() == 0) {
      continue;
    }

    /* Part C: add (group, word) to both tables */
    // Use one String instance per distinct word. No defensive copy is made:
    // strings are immutable, so copying would only add an allocation.
    String canonical = internedWords.get(word);
    if (canonical == null) {
      internedWords.put(word, word);
    } else {
      word = canonical;
    }

    // Box the group id only when it changes, so consecutive records of the
    // same group share one Integer instance.
    if (lastGroup == null || num != lastNum) {
      lastGroup = num;
      lastNum = num;
    }
    Integer group = lastGroup;

    appendToBucket(word2Groups, word, group); // word --> group
    appendToBucket(group2Words, group, word); // group --> word
  }

  /* Part D: compute index data structure */
  HashMap<String,String[]> word2Syns = createIndex(word2Groups, group2Words);

  /* Part E: minimize memory consumption */
  // Drop the references to the intermediate tables before optimize() runs so
  // they no longer have to stay reachable from this frame.
  word2Groups = null;
  group2Words = null;
  return optimize(word2Syns, internedWords);
}

/** Appends {@code value} to the list stored under {@code key}, creating the list on first use. */
private static <K,V> void appendToBucket(HashMap<K,ArrayList<V>> table, K key, V value) {
  ArrayList<V> bucket = table.get(key);
  if (bucket == null) {
    bucket = new ArrayList<>(1); // most keys hold a single entry at first
    table.put(key, bucket);
  }
  bucket.add(value);
}