private Fragment verifyCandidateSentencesAndFormParagraph()

in opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGenerator.java [283:364]


	/**
	 * Decides whether a sentence mined from a web page is acceptable as generated
	 * content for {@code originalSentence} and, if so, wraps it (together with its
	 * follow-up sentences) in a scored {@link Fragment}.
	 *
	 * <p>The lead sentence {@code candidateSentences[0]} is accepted only when:
	 * <ul>
	 *   <li>it is longer than 50 characters and less than four times as long as
	 *       {@code fragment};</li>
	 *   <li>the match result for it reports a verb that is not imperative;</li>
	 *   <li>its syntactic score exceeds {@code RELEVANCE_THRESHOLD} or its string
	 *       distance measure exceeds 0.5, while that measure stays below 0.8;</li>
	 *   <li>{@code GeneratedSentenceProcessor.acceptableMinedSentence} does not
	 *       return {@code null} for it.</li>
	 * </ul>
	 * If the syntactic score against {@code originalSentence} is below the threshold,
	 * the lead sentence is also scored against every entry of {@code sentsAll} that
	 * does not start with {@code originalSentence}, and the best score is used.
	 *
	 * @param candidateSentences lead sentence at index 0, follow-up sentences after it
	 * @param item search hit supplying the title and the source URL
	 * @param fragment search snippet the candidate was found for
	 * @param originalSentence seed sentence the candidate is compared with
	 * @param sentsAll other sentences used as a fallback for syntactic matching
	 * @return the accepted fragment, or {@code null} if the candidate is rejected
	 *         or any error occurs during verification (the error is logged)
	 */
	private Fragment verifyCandidateSentencesAndFormParagraph(
			String[] candidateSentences, HitBase item, String fragment, String originalSentence, List<String> sentsAll) {
		// Without a lead sentence or a snippet to compare against there is nothing
		// to verify; reject instead of failing with an exception outside the try block.
		if (candidateSentences == null || candidateSentences.length == 0 || fragment == null)
			return null;

		Fragment result = null;

		String pageSentence = candidateSentences[0];
		StringBuilder followSent = new StringBuilder();
		for (int i = 1; i < candidateSentences.length; i++) {
			// skip missing entries so the literal text "null" never ends up in the paragraph
			if (candidateSentences[i] != null)
				followSent.append(candidateSentences[i]);
		}
		String title = item.getTitle();

		// resultant sentence must be longer than 50 characters and SHOULD NOT be
		// four or more times the size of the snippet fragment
		if (!(pageSentence != null && pageSentence.length() > 50
				&& (float) pageSentence.length() / (float) fragment.length() < 4.0))
			return null;

		try { // get score from syntactic match between sentence in
			// original text and mined sentence
			SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence
					+ " " + title, originalSentence);
			List<List<ParseTreeChunk>> match = matchRes.getMatchResult();
			if (!matchRes.isVerbExists() || matchRes.isImperativeVerb()) {
				LOG.debug("Rejected Sentence : No verb OR Yes imperative verb :{}", pageSentence);
				return null;
			}

			double syntScore = parseTreeChunkListScorer.getParseTreeChunkListScore(match);
			LOG.debug("{} {}\n pre-processed sent = '{}'", parseTreeChunk.listToString(match), syntScore, pageSentence);

			if (syntScore < RELEVANCE_THRESHOLD) { // trying other sents
				for (String currSent : sentsAll) {
					if (currSent.startsWith(originalSentence))
						continue;
					List<List<ParseTreeChunk>> currMatch = sm.assessRelevance(currSent, pageSentence).getMatchResult();
					double syntScoreCurr = parseTreeChunkListScorer.getParseTreeChunkListScore(currMatch);
					if (syntScoreCurr > syntScore) {
						syntScore = syntScoreCurr;
						// remember the match that produced the best score so the
						// log line below reports the right one
						match = currMatch;
					}
				}
				if (syntScore > RELEVANCE_THRESHOLD) {
					LOG.debug("Got match with other sent: {} {}", parseTreeChunk.listToString(match), syntScore);
				}
			}

			double measScore = STRING_DISTANCE_MEASURER.measureStringDistance(originalSentence, pageSentence);

			// NOTE(review): the 0.8 upper bound presumably filters out near-duplicates
			// of the original sentence - confirm against the measurer's contract.
			// No extra length check is needed here: pageSentence is already known to
			// be longer than 50 characters.
			if ((syntScore > RELEVANCE_THRESHOLD || measScore > 0.5) && measScore < 0.8) {
				String pageSentenceProc = GeneratedSentenceProcessor
						.acceptableMinedSentence(pageSentence);
				if (pageSentenceProc != null) {
					pageSentenceProc = GeneratedSentenceProcessor
							.processSentence(pageSentenceProc);
					// Check the processed String itself: wrapping a null result in a
					// StringBuilder would throw and silently discard an accepted sentence.
					String followSentProc = GeneratedSentenceProcessor.processSentence(followSent.toString());
					if (followSentProc != null) {
						pageSentenceProc += " " + followSentProc;
					}

					pageSentenceProc = Utils.convertToASCII(pageSentenceProc);
					// longer paragraphs get a bonus of one point per 50 characters
					result = new Fragment(pageSentenceProc, syntScore + measScore
							+ (double) pageSentenceProc.length() / (double) 50);
					result.setSourceURL(item.getUrl());
					result.fragment = fragment;

					LOG.debug("Accepted sentence:  {} | with title = {}", pageSentenceProc, title);
					LOG.debug("For fragment = {}", fragment);
				} else
					LOG.debug("Rejected sentence due to wrong area at webpage: {}", pageSentence);
			} else
				LOG.debug("Rejected sentence due to low score: {}", pageSentence);
		} catch (Throwable t) {
			// Verification is best-effort: any failure while matching or processing
			// is logged and the candidate is treated as rejected.
			LOG.error(t.getLocalizedMessage(), t);
		}
		return result;
	}