in opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java [364:527]
/**
 * Augments a search hit with full sentences mined from the hit's web page and
 * verifies their relevance against the seed sentence.
 * <p>
 * Snippet gaps ("...") are marked with a template token; when any are present the
 * original page is downloaded and split into sentences so that the complete sentence
 * behind each snippet fragment can be recovered. Each candidate sentence is kept only
 * if its syntactic-match score against the seed sentence (or, failing that, against
 * any sentence in {@code sentsAll}) exceeds {@code RELEVANCE_THRESHOLD}, or if its
 * string distance from the seed sentence falls into the accepted band.
 *
 * @param item             the search hit to augment; its title, snippet and URL are read,
 *                         and its page content, original sentences and fragments are set
 *                         as side effects
 * @param originalSentence the seed sentence relevance is measured against
 * @param sentsAll         additional seed sentences to try when the primary match is weak;
 *                         may be {@code null} (treated as empty)
 * @return the same {@code item}, with accepted {@link Fragment}s attached (possibly none)
 */
public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
    String originalSentence, List<String> sentsAll) {
  if (sentsAll == null)
    sentsAll = new ArrayList<>();
  // put orig sentence in structure
  List<String> origs = new ArrayList<>();
  origs.add(originalSentence);
  item.setOriginalSentences(origs);
  String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
      .replace(" ", " ").replace(" ", " ");
  // generation results for this sentence
  List<Fragment> result = new ArrayList<>();
  // form plain text from snippet
  String snapshot = item.getAbstractText().replace("<b>", " ")
      .replace("</b>", " ").replace(" ", " ").replace(" ", " ");

  // fix a template expression which can be substituted by original if
  // relevant
  String snapshotMarked = snapshot.replace("...",
      " _should_find_orig_ . _should_find_orig_");
  String[] fragments = sm.splitSentences(snapshotMarked);
  List<String> allFragms = new ArrayList<>(Arrays.asList(fragments));

  String[] sents = null;
  String downloadedPage = null;
  try {
    // the marker was inserted only if "..." occurred, so a length change means
    // the snippet is elided and the full page must be fetched
    if (snapshotMarked.length() != snapshot.length()) {
      downloadedPage = pFetcher.fetchPage(item.getUrl());
      if (downloadedPage != null && downloadedPage.length() > 100) {
        item.setPageContent(downloadedPage);
        String pageContent = Utils.fullStripHTML(item.getPageContent());
        pageContent = GeneratedSentenceProcessor
            .normalizeForSentenceSplitting(pageContent);
        pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
        sents = sm.splitSentences(pageContent);
        sents = ContentGeneratorSupport.cleanListOfSents(sents);
      }
    }
  } catch (Exception e) {
    LOG.error("Problem downloading the page and splitting into sentences", e);
    return item;
  }

  for (String fragment : allFragms) {
    StringBuilder followSent = new StringBuilder();
    if (fragment.length() < 50)
      continue;
    String pageSentence = "";
    // try to find original sentence from webpage
    if (fragment.contains("_should_find_orig_") && sents != null && sents.length > 0) {
      try {
        // first try sorted sentences from page by length approach
        String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);
        String[] mainAndFollowSent = null;
        try {
          mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
              fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
        } catch (Exception e) {
          LOG.error(e.getLocalizedMessage(), e);
        }
        // if the above gives null then try to match all sentences from snippet fragment
        if (mainAndFollowSent == null || mainAndFollowSent[0] == null) {
          mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
              fragment.replace("_should_find_orig_", ""), sents);
        }
        // FIX: was '||', which dereferences mainAndFollowSent when it is null (NPE
        // swallowed by the catch below, silently dropping the fragment). Must be '&&'.
        if (mainAndFollowSent != null && mainAndFollowSent[0] != null) {
          pageSentence = mainAndFollowSent[0];
          for (int i = 1; i < mainAndFollowSent.length; i++)
            if (mainAndFollowSent[i] != null)
              followSent.append(mainAndFollowSent[i]);
        }
      } catch (Exception e) {
        LOG.error(e.getLocalizedMessage(), e);
      }
    } else
      // or get original snippet
      pageSentence = fragment;
    if (pageSentence != null)
      pageSentence = pageSentence.replace("_should_find_orig_", "");

    // resultant sentence SHOULD NOT be longer than four times the size of
    // snippet fragment
    if (pageSentence != null && pageSentence.length() > 50)
    { // was 2.0,
      try { // get score from syntactic match between sentence in
        // original text and mined sentence
        double measScore, syntScore, mentalScore = 0.0;

        SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
            + " " + title, originalSentence);
        List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
        if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
          LOG.debug("Rejected Sentence : No verb OR Yes imperative verb: {}", pageSentence);
          continue;
        }

        syntScore = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
        LOG.debug("{} {}\n pre-processed sent = '{}'", parseTreeChunk.listToString(match), syntScore, pageSentence);

        if (syntScore < RELEVANCE_THRESHOLD) { // try the other seed sentences
          for (String currSent : sentsAll) {
            if (currSent.startsWith(originalSentence))
              continue;
            match = sm.assessRelevance(currSent, pageSentence).getMatchResult();
            double syntScoreCurr = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
            if (syntScoreCurr > syntScore) {
              syntScore = syntScoreCurr;
            }
          }
          if (syntScore > RELEVANCE_THRESHOLD) {
            LOG.debug("Got match with other sent: {} {}", parseTreeChunk.listToString(match), syntScore);
          }
        }

        measScore = STRING_DISTANCE_MEASURER.measureStringDistance(
            originalSentence, pageSentence);

        // accept if syntactically or lexically close, but not a near-duplicate
        // (measScore >= 0.8) and long enough to be a real sentence
        if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
            && measScore < 0.8 && pageSentence.length() > 40) // >70
        {
          String pageSentenceProc = GeneratedSentenceProcessor
              .acceptableMinedSentence(pageSentence);
          if (pageSentenceProc != null) {
            pageSentenceProc = GeneratedSentenceProcessor
                .processSentence(pageSentenceProc);
            // FIX: null-check the processed String BEFORE wrapping it — the old code
            // did 'new StringBuilder(processSentence(...))' and then checked the
            // StringBuilder for null, which was dead code; a null from
            // processSentence threw NPE in the constructor and the accepted
            // sentence was lost to the catch(Throwable) below.
            String followSentProc = GeneratedSentenceProcessor
                .processSentence(followSent.toString());
            if (followSentProc != null) {
              followSent = new StringBuilder(followSentProc);
              pageSentenceProc += " " + followSent;
            }
            pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
            Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
                + mentalScore + (double) pageSentenceProc.length()
                / (double) 50);
            f.setSourceURL(item.getUrl());
            f.fragment = fragment;
            result.add(f);
            LOG.debug("Accepted sentence: {} | {} | with title = {}", pageSentenceProc, followSent, title);
            LOG.debug("For fragment = {}", fragment);
          } else
            LOG.debug("Rejected sentence due to wrong area at webpage: {}", pageSentence);
        } else
          LOG.debug("Rejected sentence due to low score: {}", pageSentence);
      } catch (Throwable t) {
        LOG.error(t.getLocalizedMessage(), t);
      }
    }
  }
  item.setFragments(result);
  return item;
}