in opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java [79:154]
/**
 * Downloads the page at {@code url} and collects material for building a review:
 * product-feature phrases taken from {@code <li>} elements of the original HTML,
 * a numeric star rating (when one can be parsed), and a bounded selection of text
 * chunks taken from the tail of the length-sorted list of page chunks.
 *
 * @param url address of the page to fetch.
 * @return a populated {@link ReviewObj}, or {@code null} when the fetched page text
 *         is {@code null} or shorter than 100 characters.
 */
public ReviewObj extractSentencesWithPotentialReviewPhrases(String url) {
  final int maxSentsFromPage = 20;

  String downloadedPage = pageFetcher.fetchPage(url, 20000);
  if (downloadedPage == null || downloadedPage.length() < 100) {
    return null;
  }
  ReviewObj reviewObj = new ReviewObj();
  String pageOrigHTML = pageFetcher.fetchOrigHTML(url);

  // --- Product features: contents of <li>...</li> elements --------------------
  List<String> productFeaturesList = new ArrayList<>();
  // StringUtils.substringsBetween returns null for null input or when nothing matches.
  String[] productFeatures = StringUtils.substringsBetween(pageOrigHTML, "<li>", "</li>");
  if (productFeatures != null) {
    for (String feature : productFeatures) {
      // Skip list items that carry markup attributes or links.
      if (feature.contains("class") || feature.contains("www.") || feature.contains("href")) {
        continue;
      }
      feature = feature.replace("<span>", "").replace("</span>", "")
          .replace("<b>", "").replace("</b>", "");
      // Only items longer than 80 characters are passed through the sentence filter.
      if (feature.length() > 80 && MinedSentenceProcessor.acceptableMinedSentence(feature) == null) {
        LOG.debug("Rejected sentence by MinedSentenceProcessor.acceptableMinedSentence = {}", feature);
        continue;
      }
      productFeaturesList.add(feature);
    }
  }
  productFeaturesList = cleanProductFeatures(productFeaturesList);

  // --- Rating: text preceding "out of 5 stars" --------------------------------
  // StringUtils.substringBetween is null-safe, so a null page or a missing marker
  // simply yields null here.
  String startArea = StringUtils.substringBetween(pageOrigHTML, "reviewHistoPop", "t of 5 stars");
  String item = StringUtils.substringBetween(startArea, "title=\"", "ou");
  if (item == null) {
    // Fallback layout: title="4.0 out of 5 stars" ><span>4.0 out of 5 stars</span>
    startArea = StringUtils.substringBetween(pageOrigHTML, "of 5 stars\"", "of 5 stars");
    item = StringUtils.substringBetween(startArea, "<span>", "ou");
  }
  if (item != null) {
    try {
      // Float.parseFloat ignores leading/trailing whitespace (e.g. "4.0 ").
      reviewObj.setRating(Float.parseFloat(item));
    } catch (NumberFormatException e) {
      LOG.error(e.getLocalizedMessage(), e);
    }
  }

  // --- Text chunks ------------------------------------------------------------
  // Turn every occurrence of the separator literal into '&', collapse runs of '&'
  // into a single '#', then split the page on '#'.
  downloadedPage = downloadedPage.replace(" ", "&");
  downloadedPage = downloadedPage.replaceAll("(?:&)+", "#");
  String[] sents = downloadedPage.split("#");

  List<TextChunk> sentsList = new ArrayList<>();
  for (String s : sents) {
    s = s.trim().replace(" ", ". ").replace("..", ".").replace(". . .", " ")
        .replace(": ", ". ").replace("- ", ". ")
        .replace(". .", ".").trim();
    sentsList.add(new TextChunk(s, s.length()));
  }
  sentsList.sort(new TextChunkComparable());

  // Take up to maxSentsFromPage chunks from the tail of the sorted list. The start
  // index is clamped to 0: without the clamp, a page yielding maxSentsFromPage or
  // fewer chunks produced a negative index and an IndexOutOfBoundsException.
  // NOTE(review): when more than maxSentsFromPage chunks exist, the window starts at
  // size-1-maxSentsFromPage and therefore stops one short of the last sorted chunk.
  // Kept unchanged to preserve existing output - confirm whether this is intended.
  int from = Math.max(0, sentsList.size() - 1 - maxSentsFromPage);
  int count = Math.min(maxSentsFromPage, sentsList.size() - from);
  // Size the array to the number of chunks actually taken so that no null slots
  // are handed to the clean-up helpers below.
  String[] longestSents = new String[count];
  for (int j = 0; j < count; j++) {
    longestSents[j] = sentsList.get(from + j).text;
  }

  sents = cleanListOfSents(longestSents);
  sents = removeDuplicates(sents);
  sents = verifyEnforceStartsUpperCase(sents);

  reviewObj.setFeaturePhrases(productFeaturesList.toArray(new String[0]));
  reviewObj.setOrigSentences(sents);
  return reviewObj;
}