in opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java [196:267]
/**
 * Builds a list of search-engine queries from the phrases of a sentence.
 * <p>
 * The sentence is chunked with {@link ParserChunker2MatcherProcessor} and the
 * first phrase group of the result is used (presumably the noun phrases, going
 * by the original variable name -- confirm against the chunker). For every
 * phrase, the lemmas whose POS tag starts with {@code "N"} or {@code "J"} are
 * kept and turned into a query of the form {@code " +w1 +w2 ..."}.
 * <p>
 * Two passes are made:
 * <ol>
 * <li>a strict pass that accepts only phrases with 2 to 5 keywords and, for
 * phrases with fewer than 4 keywords, requires every keyword to contain an
 * upper-case letter;</li>
 * <li>if the strict pass produced nothing, a relaxed pass that accepts any
 * phrase with at least 2 keywords.</li>
 * </ol>
 * The queries are then passed through {@code removeDuplicatesFromQueries} and
 * the original sentence is appended as the last element.
 * <p>
 * If the chunker yields no phrase groups (a {@code null} or empty result, or a
 * {@code null} first group), no phrase queries are generated and the returned
 * list contains only what {@code removeDuplicatesFromQueries} returns for an
 * empty list plus the sentence itself, instead of failing with an exception.
 *
 * @param sentence the sentence to derive queries from
 * @return the phrase queries followed by {@code sentence} itself
 */
public static List<String> buildSearchEngineQueryFromSentence(String sentence) {
  ParserChunker2MatcherProcessor pos = ParserChunker2MatcherProcessor.getInstance();
  List<List<ParseTreeChunk>> groupedPhrases = pos.formGroupedPhrasesFromChunksForSentence(sentence);

  // Guard the first-group lookup: without it a null/empty chunker result would
  // throw here and the sentence-only fallback below would never be reached.
  List<ParseTreeChunk> nPhrases = new ArrayList<>();
  if (groupedPhrases != null && !groupedPhrases.isEmpty() && groupedPhrases.get(0) != null) {
    nPhrases = groupedPhrases.get(0);
  }

  List<String> queryArrayStr = collectPhraseQueries(nPhrases, true);
  if (queryArrayStr.isEmpty()) {
    // Nothing survived the strict filters: relax the constraints down to
    // "at least 2 keywords".
    queryArrayStr = collectPhraseQueries(nPhrases, false);
  }

  queryArrayStr = removeDuplicatesFromQueries(queryArrayStr);
  queryArrayStr.add(sentence);
  return queryArrayStr;
}

/**
 * Turns each phrase into a {@code " +w1 +w2 ..."} query, skipping phrases that
 * fail the keyword-count filters.
 *
 * @param phrases the phrases to convert
 * @param strict  when {@code true}, additionally rejects phrases with more than
 *                5 keywords, and phrases with fewer than 4 keywords unless every
 *                keyword contains an upper-case letter
 * @return the queries of the accepted phrases, in phrase order
 */
private static List<String> collectPhraseQueries(List<ParseTreeChunk> phrases, boolean strict) {
  List<String> queries = new ArrayList<>();
  for (ParseTreeChunk chunk : phrases) {
    String keywords = joinNounAndAdjectiveLemmas(chunk);
    // Splitting an empty string yields one (empty) element, so phrases without
    // any keyword are rejected by the "< 2" check as well.
    String[] words = keywords.split("\\s+");
    if (words.length < 2) {
      continue;
    }
    if (strict) {
      if (words.length > 5) {
        continue;
      }
      // Short phrases are only accepted when they look like a proper name
      // (person name, title or geolocation).
      if (words.length < 4 && !hasUpperCaseInEveryWord(words)) {
        continue;
      }
    }
    queries.add(" +" + keywords.replace(" ", " +"));
  }
  return queries;
}

/**
 * Joins, separated by single spaces, the lemmas of the chunk whose POS tag
 * starts with {@code "N"} or {@code "J"}; the result is trimmed.
 */
private static String joinNounAndAdjectiveLemmas(ParseTreeChunk chunk) {
  StringBuilder joined = new StringBuilder();
  int size = chunk.getLemmas().size();
  for (int i = 0; i < size; i++) {
    String posTag = chunk.getPOSs().get(i);
    if (posTag.startsWith("N") || posTag.startsWith("J")) {
      joined.append(chunk.getLemmas().get(i)).append(" ");
    }
  }
  return joined.toString().trim();
}

/**
 * Returns {@code true} when no word equals its own lower-cased form, i.e. when
 * every word contains at least one upper-case letter.
 */
private static boolean hasUpperCaseInEveryWord(String[] words) {
  for (String word : words) {
    if (word.toLowerCase().equals(word)) {
      return false;
    }
  }
  return true;
}