in opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java [637:735]
/**
 * Builds a short summary of {@code txt} from its longer, sentence-like lines.
 *
 * <p>The text is split with {@code TextProcessor.splitToSentences}, each sentence is passed
 * through {@code trimSentence(s, title)}, and only sentences that are longer than 60
 * characters, start with an upper-case letter and do not contain {@code "By"},
 * {@code "Page"} or {@code ">>"} are kept. Kept sentences that were not adjacent in the
 * source are joined with an ellipsis. Collection stops once the accumulated text is longer
 * than {@code numChars}. The result is then cleaned up: a leading prefix ending in one of
 * the marker strings (time zones, "AM"/"PM", ":" ...) found within the first 45 characters
 * is dropped, as is a leading fragment ending in an unmatched ")" within the first 15
 * characters, followed by {@code trimPunctuationFromStart}.
 *
 * <p>Whenever fewer than 5 non-blank characters remain, or any step throws, the text itself
 * (after the " | " clean-up) is used as the summary.
 *
 * @param txt the text to summarise; {@code null} yields an empty string
 * @param title forwarded unchanged to {@code trimSentence} for every sentence
 * @param numChars soft length limit: collection stops after this length is exceeded and,
 *        when {@code truncateInSentence} is set, the text is cut via
 *        {@code truncateTextOnSpace(text, numChars)}
 * @param truncateInSentence if {@code true}, truncate to {@code numChars} and pad the end to
 *        three periods; otherwise only strip a trailing ".."
 * @return the trimmed summary, never {@code null}
 */
public static String generateSummary(String txt, String title, int numChars,
    boolean truncateInSentence) {
  if (txt == null) {
    // Without this guard the NPE from txt.replace() lands in the catch block, whose
    // fallback new StringBuilder(txt) throws again and escapes the method.
    return "";
  }
  StringBuilder finalSummary;
  try {
    // NOTE(review): the single-space entry matches almost every summary, so the prefix cut
    // below nearly always fires - confirm this marker is intended.
    String[] puncChars = { ":", "--", "PM", "MST", "EST", "CST", "PST", "GMT", "AM", " " };
    txt = txt.replace(" | ", " ");
    txt = txt.replace(" |", " ");
    ArrayList<String> sentences = TextProcessor.splitToSentences(txt);
    StringBuilder sum = new StringBuilder();
    int cnt = 0;  // 1-based position of the current sentence
    int lCnt = 0; // position of the last sentence that was kept (0 = none yet)
    for (String s : sentences) {
      cnt++;
      s = trimSentence(s, title);
      if (s.length() > 60 && !s.contains("By") && !s.contains("Page")
          && !s.contains(">>") && Character.isUpperCase(s.charAt(0))) {
        if (Math.abs(cnt - lCnt) != 1 && lCnt != 0) {
          // Sentences were skipped since the last kept one: mark the gap with an ellipsis,
          // reusing the period that already ends the previous sentence.
          if (sum.toString().endsWith(".")) {
            sum.append("..");
          } else {
            sum.append("...");
          }
        } else {
          sum.append(" ");
        }
        sum.append(s.trim());
        lCnt = cnt;
      }
      if (sum.length() > numChars) {
        break;
      }
    }
    finalSummary = new StringBuilder(sum.toString().trim());
    if (truncateInSentence) {
      finalSummary = new StringBuilder(truncateTextOnSpace(finalSummary.toString(), numChars));
      // Make a truncated summary end in exactly three periods.
      int numPeriods = countTrailingPeriods(finalSummary.toString());
      if (numPeriods < 3 && finalSummary.length() > 0) {
        for (int i = 0; i < 3 - numPeriods; i++) {
          finalSummary.append(".");
        }
      }
    } else {
      // Strip a trailing ".." left over from the ellipsis handling.
      if (finalSummary.toString().endsWith("..")) {
        finalSummary = new StringBuilder(finalSummary.substring(0, finalSummary.length() - 2));
      }
    }
    // Check to see if we have anything; if not, use the full content.
    if (finalSummary.toString().trim().length() < 5) {
      finalSummary = new StringBuilder(txt);
    }
    // See if a marker string occurs within the first 45 chars. The window is measured on,
    // searched in and cut from the same trimmed string, and clamped at zero, so empty or
    // whitespace-padded text can no longer trigger a StringIndexOutOfBoundsException.
    String trimmedSummary = finalSummary.toString().trim();
    int sIdx = Math.max(0, Math.min(trimmedSummary.length() - 1, 45));
    String head = trimmedSummary.substring(0, sIdx);
    int highestIdx = -1;
    for (String p : puncChars) {
      int idx = head.lastIndexOf(p);
      if (idx > highestIdx) {
        // Remember the position just past the marker so the cut removes the marker too.
        highestIdx = idx + p.length();
      }
    }
    if (highestIdx > -1) {
      finalSummary = new StringBuilder(trimmedSummary.substring(highestIdx));
    }
    // Drop a leading fragment that ends in a ")" with no "(" before it.
    int closeParenIdx = finalSummary.indexOf(")");
    int openParenIdx = finalSummary.indexOf("(");
    if (closeParenIdx != -1 && closeParenIdx < 15
        && (openParenIdx == -1 || openParenIdx > closeParenIdx)) {
      finalSummary = new StringBuilder(finalSummary.substring(closeParenIdx + 1).trim());
    }
    finalSummary = new StringBuilder(trimPunctuationFromStart(finalSummary.toString()));
    // Check again after the clean-up; if too little is left, use the full content.
    if (finalSummary.toString().trim().length() < 5) {
      finalSummary = new StringBuilder(txt);
    }
  } catch (Exception e) {
    // Best effort: any failure while summarising falls back to the text itself.
    LOG.severe("Problem forming summary for: " + txt);
    LOG.severe("Using full text for the summary" + e);
    finalSummary = new StringBuilder(txt);
  }
  return finalSummary.toString().trim();
}