in opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/PhraseProcessor.java [114:210]
/**
 * Extracts multi-word candidate phrases (e.g. product names) from a sentence.
 *
 * <p>The sentence is chunked via {@code nlProc.formGroupedPhrasesFromChunksForPara} and only
 * the first returned group is examined (presumably the noun phrases; confirm against the
 * chunker). For each chunk in that group, a phrase is assembled from the lemmas whose POS tag
 * starts with {@code N}, {@code J} or {@code CD}. Once at least one such lemma has been
 * collected, a tag starting with {@code PR}, {@code IN} or {@code TO} ends the phrase; tokens
 * with any other tag are skipped. A phrase is kept only if it has between two and five
 * space-separated words.
 *
 * <p>Earlier revisions additionally harvested quoted substrings and had a relaxed fallback
 * pass; both were disabled (commented out) and are intentionally not reproduced here.
 *
 * @param sentence the text to analyse; {@code null} or a single token yields an empty list
 * @return a mutable list of candidate phrases in chunk order, possibly empty, never
 *         {@code null}
 */
public List<String> extractNounPhraseProductNameCandidate(String sentence) {
  List<String> queryArrayStr = new ArrayList<>();
  // A null or single-token input cannot produce a multi-word phrase.
  if (sentence == null || sentence.split(" ").length == 1) {
    return queryArrayStr;
  }

  List<List<ParseTreeChunk>> groupedChunks =
      nlProc.formGroupedPhrasesFromChunksForPara(sentence);
  if (groupedChunks == null || groupedChunks.isEmpty()) {
    return queryArrayStr;
  }

  // Accepted phrase length, in space-separated words.
  final int minWords = 2;
  final int maxWords = 5;

  for (ParseTreeChunk ch : groupedChunks.get(0)) {
    StringBuilder query = new StringBuilder();
    int size = ch.getLemmas().size();
    boolean phraseBeingFormed = false;
    for (int i = 0; i < size; i++) {
      String pos = ch.getPOSs().get(i);
      if (pos.startsWith("N") || pos.startsWith("J") || pos.startsWith("CD")) {
        query.append(ch.getLemmas().get(i)).append(" ");
        phraseBeingFormed = true;
      } else if (phraseBeingFormed
          && (pos.startsWith("PR") || pos.startsWith("IN") || pos.startsWith("TO"))) {
        // A pronoun/preposition/"to"-style tag closes a phrase that has already started.
        break;
      }
      // Any other tag (determiners, conjunctions, verbs, ...) is simply skipped.
    }

    String phrase = query.toString().trim();
    // An empty phrase splits to a single empty token, so it is rejected as too short.
    int len = phrase.split(" ").length;
    if (len < minWords || len > maxWords) {
      continue;
    }
    queryArrayStr.add(phrase);
  }
  return queryArrayStr;
}