public HitBase formTextFromOriginalPageGivenSnippetDirect()

in opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/SnippetToParagraph.java [42:135]


	/**
	 * Builds the list of {@link Fragment}s for a search hit from its snippet,
	 * replacing truncated snippet pieces with sentences looked up on the
	 * original web page where possible.
	 * <p>
	 * Steps, in order:
	 * <ol>
	 * <li>An empty original-sentences list is attached to {@code item}.</li>
	 * <li>{@code <b>}/{@code </b>} markup is removed from the title and the
	 * abstract text.</li>
	 * <li>Every {@code "..."} in the abstract is replaced by a marker token, and
	 * the marked text is split into sentence fragments.</li>
	 * <li>Only if at least one {@code "..."} was present, the page at
	 * {@code item.getUrl()} is fetched. If the fetched content is longer than
	 * 100 characters it is stored on {@code item}, stripped of HTML, normalized
	 * and split into sentences.</li>
	 * <li>Each fragment of at least 50 characters is either matched against the
	 * page sentences (when it carries the marker and page sentences are
	 * available) or used as is, then filtered and post-processed through
	 * {@code GeneratedSentenceProcessor}.</li>
	 * </ol>
	 * If fetching or splitting the page throws, the error is logged and
	 * {@code item} is returned immediately without its fragments being set.
	 *
	 * @param item the hit to process; its title, abstract text and URL are read
	 * @return the same {@code item} instance, with the accepted fragments set
	 *         unless page download/splitting failed
	 */
	public HitBase formTextFromOriginalPageGivenSnippetDirect(HitBase item) {

		// marker put in place of "..." so truncated fragments can be recognised later
		final String origMarker = "_should_find_orig_";

		// put orig sentence in structure (the list is attached empty and not filled here)
		List<String> origs = new ArrayList<>();
		item.setOriginalSentences(origs);

		// title without highlighting markup; only used in the debug output below
		String title = item.getTitle().replace("<b>", " ").replace("</b>", " ")
				.replace("  ", " ").replace("  ", " ");

		// generation results for this sentence
		List<Fragment> result = new ArrayList<>();

		// form plain text from snippet
		String snapshot = item.getAbstractText().replace("<b>", " ")
				.replace("</b>", " ").replace("  ", " ").replace("  ", " ");

		String snapshotMarked = snapshot.replace("...",
				" _should_find_orig_ . _should_find_orig_");
		List<String> fragments = TextProcessor.splitToSentences(snapshotMarked);

		List<String> sents = new ArrayList<>();
		try {
			// the lengths differ only if at least one "..." was replaced by the marker,
			// i.e. the snippet is truncated and the page is worth downloading
			if (snapshotMarked.length() != snapshot.length()) {
				String downloadedPage = pFetcher.fetchPage(item.getUrl());
				if (downloadedPage != null && downloadedPage.length() > 100) {
					item.setPageContent(downloadedPage);
					String pageContent = Utils.fullStripHTML(item.getPageContent());
					pageContent = GeneratedSentenceProcessor
							.normalizeForSentenceSplitting(pageContent);
					// sometimes html breaks are converted into '  ' (two spaces),
					// so we need to put '.' in front of a following capital letter
					pageContent = pageContent.trim().replaceAll("  [A-Z]", ". $0")
							.replace("..", ".").replace(". . .", " ").trim();
					sents = TextProcessor.splitToSentences(pageContent);
				}
			}
		} catch (Exception e) {
			// keep the cause and stack trace in the log instead of a bare stderr line
			LOG.error("Problem downloading  the page and splitting into sentences", e);
			return item;
		}

		for (String fragment : fragments) {
			// short fragments are skipped
			if (fragment.length() < 50) {
				continue;
			}

			String followSent = null;
			String pageSentence = "";
			if (fragment.contains(origMarker) && sents != null && !sents.isEmpty()) {
				// try to find original sentence from webpage
				try {
					String[] mainAndFollowSent = getFullOriginalSentenceFromWebpageBySnippetFragment(
							fragment.replace(origMarker, ""), sents.toArray(new String[]{}));
					pageSentence = mainAndFollowSent[0];
					followSent = mainAndFollowSent[1];
				} catch (Exception e) {
					// lookup failed: pageSentence keeps the value it had when the exception was thrown
					LOG.error(e.getLocalizedMessage(), e);
				}
			} else {
				// or get original snippet
				pageSentence = fragment;
			}

			if (pageSentence != null) {
				pageSentence = pageSentence.replace(origMarker, "");
			}

			String pageSentenceProc = GeneratedSentenceProcessor
					.acceptableMinedSentence(pageSentence);
			if (pageSentenceProc == null) {
				LOG.debug("Rejected sentence due to wrong area at webpage: {}", pageSentence);
				continue;
			}

			pageSentenceProc = GeneratedSentenceProcessor.processSentence(pageSentenceProc);
			if (followSent != null) {
				// append the sentence returned alongside the matched one
				pageSentenceProc += " "
						+ GeneratedSentenceProcessor.processSentence(followSent);
			}
			pageSentenceProc = Utils.convertToASCII(pageSentenceProc);

			Fragment f = new Fragment(pageSentenceProc, 1);
			f.setSourceURL(item.getUrl());
			// remember the snippet fragment this text was derived from
			f.fragment = fragment;
			result.add(f);
			LOG.debug("Accepted sentence: {} | with title = {}", pageSentenceProc, title);
			LOG.debug("For fragment = {}", fragment);
		}

		item.setFragments(result);
		return item;
	}