in opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/SnippetToParagraph.java [42:135]
/**
 * Builds the list of {@link Fragment}s for a search hit from its snippet
 * (abstract text), substituting full sentences from the downloaded page
 * where the snippet was truncated.
 * <p>
 * Steps, as implemented below:
 * <ol>
 * <li>An empty "original sentences" list is attached to {@code item}.</li>
 * <li>The snippet is stripped of {@code <b>}/{@code </b>} tags and every
 * {@code "..."} is replaced with a marker so that truncated fragments can
 * be recognised after sentence splitting.</li>
 * <li>Only if the snippet contained {@code "..."}, the page at
 * {@code item.getUrl()} is fetched; when the fetched content is longer than
 * 100 characters it is stored on {@code item}, stripped of HTML, normalized
 * and split into sentences.</li>
 * <li>Each snippet fragment of at least 50 characters is either looked up in
 * the page sentences (when it carries the marker and page sentences are
 * available) or used as is, then filtered and post-processed via
 * {@code GeneratedSentenceProcessor}. Accepted sentences become
 * {@link Fragment}s.</li>
 * </ol>
 *
 * @param item the search hit to enrich; it is modified in place
 * @return the same {@code item} instance. If fetching or splitting the page
 *         throws, the error is logged and {@code item} is returned early
 *         <em>without</em> its fragments being set.
 */
public HitBase formTextFromOriginalPageGivenSnippetDirect(HitBase item) {
  // put orig sentence in structure
  List<String> origs = new ArrayList<>();
  item.setOriginalSentences(origs);

  // The title is only used for debug logging further down, so a missing
  // title must not abort the processing of the hit.
  // NOTE(review): as written, the replace(" ", " ") calls here and on the
  // snippet map a single space to itself; presumably they were meant to
  // collapse double spaces - confirm against the file in version control.
  String rawTitle = item.getTitle();
  String title = rawTitle == null ? "" : rawTitle.replace("<b>", " ").replace("</b>", " ")
      .replace(" ", " ").replace(" ", " ");

  // generation results for this sentence
  List<Fragment> result = new ArrayList<>();

  // form plain text from snippet; a missing snippet is treated as empty,
  // which yields no fragments
  String rawSnippet = item.getAbstractText();
  String snapshot = rawSnippet == null ? "" : rawSnippet.replace("<b>", " ")
      .replace("</b>", " ").replace(" ", " ").replace(" ", " ");

  // Every "..." in the snippet marks a truncation; tag both sides of it so
  // the neighbouring fragments can be recognised after sentence splitting.
  String snapshotMarked = snapshot.replace("...",
      " _should_find_orig_ . _should_find_orig_");
  List<String> fragments = TextProcessor.splitToSentences(snapshotMarked);

  List<String> sents = new ArrayList<>();
  try {
    // The lengths differ exactly when at least one "..." was replaced, i.e.
    // when there is something to look up in the original page.
    if (snapshotMarked.length() != snapshot.length()) {
      String downloadedPage = pFetcher.fetchPage(item.getUrl());
      if (downloadedPage != null && downloadedPage.length() > 100) {
        item.setPageContent(downloadedPage);
        String pageContent = Utils.fullStripHTML(item.getPageContent());
        pageContent = GeneratedSentenceProcessor
            .normalizeForSentenceSplitting(pageContent);
        // sometimes html breaks are converted into spaces, so we need to
        // put '.' in front of capitalized words to help sentence splitting
        pageContent = pageContent.trim().replaceAll(" [A-Z]", ". $0")
            .replace("..", ".").replace(". . .", " ").trim();
        sents = TextProcessor.splitToSentences(pageContent);
      }
    }
  } catch (Exception e) {
    // Log through the class logger and keep the cause; the hit is returned
    // as is (fragments are not set on this path).
    LOG.error("Problem downloading the page and splitting into sentences", e);
    return item;
  }

  boolean pageSentencesAvailable = sents != null && !sents.isEmpty();

  for (String fragment : fragments) {
    // too short to be a useful sentence
    if (fragment.length() < 50) {
      continue;
    }

    String followSent = null;
    String pageSentence = "";
    if (fragment.contains("_should_find_orig_") && pageSentencesAvailable) {
      // try to find original sentence from webpage
      try {
        String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
            fragment.replace("_should_find_orig_", ""), sents.toArray(new String[0]));
        pageSentence = mainAndFollowSent[0];
        followSent = mainAndFollowSent[1];
      } catch (Exception e) {
        // Best effort: on failure pageSentence keeps whatever was assigned
        // so far and is subjected to the acceptance check below.
        LOG.error(e.getLocalizedMessage(), e);
      }
    } else {
      // or get original snippet
      pageSentence = fragment;
    }

    if (pageSentence != null) {
      pageSentence = pageSentence.replace("_should_find_orig_", "");
    }

    String pageSentenceProc = GeneratedSentenceProcessor
        .acceptableMinedSentence(pageSentence);
    if (pageSentenceProc == null) {
      LOG.debug("Rejected sentence due to wrong area at webpage: {}", pageSentence);
      continue;
    }

    pageSentenceProc = GeneratedSentenceProcessor.processSentence(pageSentenceProc);
    if (followSent != null) {
      pageSentenceProc += " " + GeneratedSentenceProcessor.processSentence(followSent);
    }
    pageSentenceProc = Utils.convertToASCII(pageSentenceProc);

    Fragment f = new Fragment(pageSentenceProc, 1);
    f.setSourceURL(item.getUrl());
    f.fragment = fragment;
    result.add(f);
    LOG.debug("Accepted sentence: {} | with title = {}", pageSentenceProc, title);
    LOG.debug("For fragment = {}", fragment);
  }

  item.setFragments(result);
  return item;
}