in opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java [113:201]
/**
 * Classifies a single query string against the search index.
 *
 * <p>The query is parsed against the {@code "text"} field with OR as the default
 * operator, the top hits are retrieved, and each hit's score is added to the running
 * total kept in {@code scoredClasses} under the hit's {@code "class"} field value.
 * Categories whose total exceeds {@code MIN_TOTAL_SCORE_FOR_CATEGORY} are kept, at most
 * {@code MAX_CATEG_RESULTS} of them. If at least two remain and the first one's total
 * divided by the second one's exceeds {@code BEST_TO_NEX_BEST_RATIO}, only the first is
 * returned.
 *
 * <p>NOTE(review): {@code scoredClasses} is declared outside this method and is not
 * cleared here, so totals carry over between calls unless the caller resets it - confirm.
 *
 * @param queryStr the text to classify; {@code null} or shorter than
 *                 {@code MIN_CHARS_IN_QUERY} yields an empty result
 * @return the selected category names, or an empty list when the query is too short,
 *         cannot be parsed, or the index search fails
 */
private List<String> classifySentence(String queryStr) {
  List<String> results = new ArrayList<>();
  // too short of a query (a null query is treated the same way instead of throwing)
  if (queryStr == null || queryStr.length() < MIN_CHARS_IN_QUERY) {
    return results;
  }

  Query query;
  // The analyzer is only needed while the query is parsed, so release it right after.
  try (Analyzer std = new StandardAnalyzer()) {
    QueryParser parser = new QueryParser("text", std);
    parser.setDefaultOperator(QueryParser.Operator.OR);
    query = parser.parse(queryStr);
  } catch (ParseException e2) {
    LOGGER.debug("Could not parse query: " + queryStr, e2);
    return results;
  }

  // Finds the top n hits for query.
  TopDocs hits;
  try {
    hits = indexSearcher.search(query, MAX_DOCS_TO_USE_FOR_CLASSIFY + 2);
  } catch (IOException e1) {
    LOGGER.error("problem searching index \n", e1);
    // Without hits there is nothing to aggregate; continuing would dereference null.
    return results;
  }
  LOGGER.debug("Found " + hits.totalHits + " hits for " + queryStr);

  int count = 0;
  for (ScoreDoc scoreDoc : hits.scoreDocs) {
    Document doc;
    try {
      doc = indexSearcher.doc(scoreDoc.doc);
    } catch (IOException e) {
      // Skip the unreadable document but keep the stack trace in the log.
      LOGGER.error("Problem searching training set for classif \n", e);
      continue;
    }
    String flag = doc.get("class");
    // Accumulate the hit score into the running total for this category.
    Float scoreForClass = scoredClasses.get(flag);
    if (scoreForClass == null) {
      scoredClasses.put(flag, scoreDoc.score);
    } else {
      scoredClasses.put(flag, scoreForClass + scoreDoc.score);
    }
    LOGGER.debug(" <<categorized as>> " + flag + " | score="
        + scoreDoc.score + " \n text =" + doc.get("text") + "\n");
    // NOTE(review): the check runs before the increment, so this only triggers after
    // MAX_DOCS_TO_USE_FOR_CLASSIFY + 2 documents - the same limit passed to search().
    if (count > MAX_DOCS_TO_USE_FOR_CLASSIFY) {
      break;
    }
    count++;
  }

  try {
    // presumably sorts categories by descending total score - verify against ValueSortMap
    scoredClasses = ValueSortMap.sortMapByValue(scoredClasses, false);
    List<String> resultsAll = new ArrayList<>(scoredClasses.keySet());
    List<String> resultsAboveThresh = new ArrayList<>();
    for (String key : resultsAll) {
      if (scoredClasses.get(key) > MIN_TOTAL_SCORE_FOR_CATEGORY) {
        resultsAboveThresh.add(key);
      } else {
        LOGGER.debug("Too low score of " + scoredClasses.get(key)
            + " for category = " + key);
      }
    }
    int len = resultsAboveThresh.size();
    if (len > MAX_CATEG_RESULTS) {
      // keep only the first MAX_CATEG_RESULTS categories
      results = resultsAboveThresh.subList(0, MAX_CATEG_RESULTS);
    } else {
      results = resultsAboveThresh;
    }
  } catch (Exception e) {
    LOGGER.error("Problem aggregating search results\n", e);
  }

  if (results.size() < 2) {
    return results;
  }
  // if two categories, one is very high and another is relatively low:
  // the second best is much worse, so report only the best one
  if (scoredClasses.get(results.get(0))
      / scoredClasses.get(results.get(1)) > BEST_TO_NEX_BEST_RATIO) {
    return results.subList(0, 1);
  }
  return results;
}