public ReviewObj extractSentencesWithPotentialReviewPhrases()

in opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java [79:154]


	/**
	 * Fetches the page at {@code url} and collects candidate review material from it.
	 * <p>
	 * The returned object is filled with:
	 * <ul>
	 * <li>feature phrases taken from the {@code <li>} elements of the original HTML,
	 * filtered and then passed through {@code cleanProductFeatures};</li>
	 * <li>a rating, when a number can be parsed from the text around "of 5 stars";</li>
	 * <li>at most 20 text chunks taken from the tail of the chunk list sorted with
	 * {@code TextChunkComparable}, post-processed by {@code cleanListOfSents},
	 * {@code removeDuplicates} and {@code verifyEnforceStartsUpperCase}.</li>
	 * </ul>
	 *
	 * @param url address of the page to fetch
	 * @return the populated {@link ReviewObj}, or {@code null} when the fetched page text
	 *         is {@code null} or shorter than 100 characters
	 */
	public ReviewObj extractSentencesWithPotentialReviewPhrases(String url) {
		ReviewObj reviewObj = new ReviewObj();
		int maxSentsFromPage = 20;

		String downloadedPage = pageFetcher.fetchPage(url, 20000);
		if (downloadedPage == null || downloadedPage.length() < 100) {
			return null;
		}

		// May be null; it is only ever handed to StringUtils below
		// (assumed to be the null-safe Apache Commons Lang StringUtils - TODO confirm).
		String pageOrigHTML = pageFetcher.fetchOrigHTML(url);

		// --- product features: contents of <li> elements ---
		List<String> productFeaturesList = new ArrayList<>();
		String[] productFeatures = StringUtils.substringsBetween(pageOrigHTML, "<li>", "</li>");
		if (productFeatures != null) {
			for (String feature : productFeatures) {
				// skip list items that carry markup attributes or links
				if (feature.contains("class") || feature.contains("www.") || feature.contains("href")) {
					continue;
				}
				feature = feature.replace("<span>", "").replace("</span>", "")
						.replace("<b>", "").replace("</b>", "");
				// only items longer than 80 characters are checked by the sentence filter
				if (feature.length() > 80 && MinedSentenceProcessor.acceptableMinedSentence(feature) == null) {
					LOG.debug("Rejected sentence by GeneratedSentenceProcessor.acceptableMinedSentence = {}", feature);
					continue;
				}
				productFeaturesList.add(feature);
			}
		}
		productFeaturesList = cleanProductFeatures(productFeaturesList);

		// --- rating: the text in front of "out of 5 stars" ---
		String startArea = StringUtils.substringBetween(pageOrigHTML, "reviewHistoPop", "t of 5 stars");
		String ratingText = StringUtils.substringBetween(startArea, "title=\"", "ou");
		if (ratingText == null) {
			// fallback layout: title="4.0 out of 5 stars" ><span>4.0 out of 5 stars</span>
			startArea = StringUtils.substringBetween(pageOrigHTML, "of 5 stars\"", "of 5 stars");
			ratingText = StringUtils.substringBetween(startArea, "<span>", "ou");
		}
		if (ratingText != null) {
			try {
				reviewObj.setRating(Float.parseFloat(ratingText));
			} catch (NumberFormatException e) {
				// an unparsable rating is logged and the rating is simply left unset
				LOG.error(e.getLocalizedMessage(), e);
			}
		}

		// --- text chunks: runs of five spaces (and any '&' / '#') act as separators ---
		downloadedPage = downloadedPage.replace("     ", "&");
		downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
		String[] sents = downloadedPage.split("#");
		List<TextChunk> sentsList = new ArrayList<>();
		for (String s : sents) {
			s = s.trim().replace("  ", ". ").replace("..", ".").replace(". . .", " ")
					.replace(": ", ". ").replace("- ", ". ")
					.replace(". .", ".").trim();
			sentsList.add(new TextChunk(s, s.length()));
		}

		sentsList.sort(new TextChunkComparable());

		// Take up to maxSentsFromPage chunks from the tail of the sorted list. The start
		// index is clamped at 0 so that pages yielding few chunks no longer trigger an
		// IndexOutOfBoundsException, and the array is sized to what is actually taken so
		// that no null slots are handed to the clean-up helpers.
		// NOTE(review): as in the original, when more than maxSentsFromPage chunks exist the
		// window starts at size-1-max and therefore stops one short of the last sorted
		// element - confirm whether that is intended.
		int start = Math.max(0, sentsList.size() - 1 - maxSentsFromPage);
		int count = Math.min(maxSentsFromPage, sentsList.size() - start);
		String[] longestSents = new String[count];
		for (int j = 0; j < count; j++) {
			longestSents[j] = sentsList.get(start + j).text;
		}

		sents = cleanListOfSents(longestSents);
		sents = removeDuplicates(sents);
		sents = verifyEnforceStartsUpperCase(sents);

		reviewObj.setFeaturePhrases(productFeaturesList.toArray(new String[0]));
		reviewObj.setOrigSentences(sents);

		return reviewObj;
	}