public HitBase augmentWithMinedSentencesAndVerifyRelevance()

in opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java [364:527]


	/**
	 * Augments a search {@code item} with full sentences mined from its web page and
	 * keeps only those judged relevant to {@code originalSentence}.
	 * <p>
	 * The snippet's truncation markers ("...") are replaced with a placeholder; when
	 * present, the hit's page is downloaded and split into sentences so the truncated
	 * fragments can be restored to their full original form. Each candidate sentence
	 * is scored syntactically (parse-tree match against the seed sentence, with a
	 * fallback against the other sentences in {@code sentsAll}) and lexically
	 * (string distance); accepted candidates are attached to the item as
	 * {@link Fragment}s.
	 *
	 * @param item             the search hit to augment; mutated in place and returned
	 * @param originalSentence the seed sentence relevance is measured against
	 * @param sentsAll         other sentences from the seed text used as a scoring
	 *                         fallback; may be {@code null}
	 * @return the same {@code item}, with original sentences, page content (when
	 *         fetched) and accepted fragments set
	 */
	public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item,
																														 String originalSentence, List<String> sentsAll) {
		if (sentsAll == null)
			sentsAll = new ArrayList<>();
		// put orig sentence in structure
		List<String> origs = new ArrayList<>();
		origs.add(originalSentence);
		item.setOriginalSentences(origs);
		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
						.replace("  ", " ").replace("  ", " ");
		// generation results for this sentence
		List<Fragment> result = new ArrayList<>();
		// form plain text from snippet
		String snapshot = item.getAbstractText().replace("<b>", " ")
						.replace("</b>", " ").replace("  ", " ").replace("  ", " ");

		// mark snippet truncations ("...") with a placeholder which can be
		// substituted by the original sentence from the page if relevant
		String snapshotMarked = snapshot.replace("...",
						" _should_find_orig_ . _should_find_orig_");
		String[] fragments = sm.splitSentences(snapshotMarked);
		List<String> allFragms = new ArrayList<>(Arrays.asList(fragments));

		String[] sents = null;
		String downloadedPage = null;
		try {
			// only fetch the page when the snippet actually contains truncations
			if (snapshotMarked.length() != snapshot.length()) {
				downloadedPage = pFetcher.fetchPage(item.getUrl());
				if (downloadedPage != null && downloadedPage.length() > 100) {
					item.setPageContent(downloadedPage);
					String pageContent = Utils.fullStripHTML(item.getPageContent());
					pageContent = GeneratedSentenceProcessor
									.normalizeForSentenceSplitting(pageContent);
					pageContent = ContentGeneratorSupport.cleanSpacesInCleanedHTMLpage(pageContent);
					sents = sm.splitSentences(pageContent);
					sents = ContentGeneratorSupport.cleanListOfSents(sents);
				}
			}
		} catch (Exception e) {
			LOG.error("Problem downloading the page and splitting into sentences", e);
			return item;
		}

		for (String fragment : allFragms) {
			StringBuilder followSent = new StringBuilder();
			if (fragment.length() < 50)
				continue;
			String pageSentence = "";
			// try to find the full original sentence from the downloaded page
			if (fragment.contains("_should_find_orig_") && sents != null && sents.length > 0) {
				try {
					// first try sentences from the page sorted by length
					String[] sentsSortedByLength = extractSentencesFromPage(downloadedPage);
					String[] mainAndFollowSent = null;

					try {
						mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
										fragment.replace("_should_find_orig_", ""), sentsSortedByLength);
					} catch (Exception e) {
						LOG.error(e.getLocalizedMessage(), e);
					}
					// if the above gives null then try to match against all page sentences
					if (mainAndFollowSent == null || mainAndFollowSent[0] == null) {
						mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
										fragment.replace("_should_find_orig_", ""), sents);
					}

					// BUGFIX: was '||', which dereferenced a null array and threw NPE
					// whenever both lookup attempts returned null
					if (mainAndFollowSent != null && mainAndFollowSent[0] != null) {
						pageSentence = mainAndFollowSent[0];
						// collect any follow-up sentences that continue the main one
						for (int i = 1; i < mainAndFollowSent.length; i++)
							if (mainAndFollowSent[i] != null)
								followSent.append(mainAndFollowSent[i]);
					}

				} catch (Exception e) {
					LOG.error(e.getLocalizedMessage(), e);
				}
			} else
				// no truncation marker: use the snippet fragment itself
				pageSentence = fragment;
			if (pageSentence != null)
				pageSentence = pageSentence.replace("_should_find_orig_", "");

			// resultant sentence should be long enough to be meaningful
			if (pageSentence != null && pageSentence.length() > 50) {
				try { // get score from syntactic match between sentence in
					// original text and mined sentence
					double measScore, syntScore, mentalScore = 0.0;

					SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
									+ " " + title, originalSentence);
					List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
					if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
						LOG.debug("Rejected Sentence : No verb OR Yes imperative verb: {}", pageSentence);
						continue;
					}

					syntScore = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
					LOG.debug("{} {}\n pre-processed sent = '{}'", parseTreeChunk.listToString(match), syntScore, pageSentence);

					if (syntScore < RELEVANCE_THRESHOLD) { // trying other sentences from the seed text
						for (String currSent : sentsAll) {
							if (currSent.startsWith(originalSentence))
								continue;
							match = sm.assessRelevance(currSent, pageSentence).getMatchResult();
							double syntScoreCurr = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
							if (syntScoreCurr > syntScore) {
								syntScore = syntScoreCurr;
							}
						}
						if (syntScore > RELEVANCE_THRESHOLD) {
							LOG.debug("Got match with other sent: {} {}", parseTreeChunk.listToString(match), syntScore);
						}
					}

					measScore = STRING_DISTANCE_MEASURER.measureStringDistance(
									originalSentence, pageSentence);

					// accept when syntactically or lexically similar to the seed,
					// but not a near-duplicate of it
					if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5)
									&& measScore < 0.8 && pageSentence.length() > 40) {
						String pageSentenceProc = GeneratedSentenceProcessor
										.acceptableMinedSentence(pageSentence);
						if (pageSentenceProc != null) {
							pageSentenceProc = GeneratedSentenceProcessor
											.processSentence(pageSentenceProc);
							// BUGFIX: processSentence may return null (it is null-checked
							// above for the main sentence); passing null to
							// new StringBuilder(...) threw NPE and silently dropped an
							// otherwise accepted fragment
							String followSentProc = GeneratedSentenceProcessor
											.processSentence(followSent.toString());
							if (followSentProc != null) {
								followSent = new StringBuilder(followSentProc);
								pageSentenceProc += " " + followSent;
							}

							pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
							// score combines relevance with a mild length bonus
							Fragment f = new Fragment(pageSentenceProc, syntScore + measScore
											+ mentalScore + pageSentenceProc.length() / 50.0);
							f.setSourceURL(item.getUrl());
							f.fragment = fragment;
							result.add(f);
							LOG.debug("Accepted sentence: {} | {} | with title = {}", pageSentenceProc, followSent, title);
							LOG.debug("For fragment = {}", fragment);
						} else
							LOG.debug("Rejected sentence due to wrong area at webpage: {}", pageSentence);
					} else
						LOG.debug("Rejected sentence due to low score: {}", pageSentence);
				} catch (Throwable t) {
					LOG.error(t.getLocalizedMessage(), t);
				}
			}
		}
		item.setFragments(result);
		return item;
	}