public static String generateSummary()

in opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/TextProcessor.java [637:735]


  /**
   * Builds a short summary of {@code txt} out of its longer sentences.
   *
   * <p>Processing steps, in order:
   * <ol>
   * <li>{@code " | "} and {@code " |"} separators are replaced by a single space and the text is
   * split with {@code splitToSentences}.</li>
   * <li>Each sentence is passed through {@code trimSentence(s, title)} and kept only if it is
   * longer than 60 characters, starts with an upper-case character and contains none of
   * {@code "By"}, {@code "Page"} or {@code ">>"}. Kept sentences that were adjacent in the
   * text are joined with a space; a gap between kept sentences is marked with periods.
   * Collection stops as soon as the accumulated text is longer than {@code numChars}.</li>
   * <li>If {@code truncateInSentence} is set, the text goes through
   * {@code truncateTextOnSpace(text, numChars)} and is padded to three trailing periods;
   * otherwise a trailing {@code ".."} is removed.</li>
   * <li>A lead-in ending in one of the markers ({@code ":"}, {@code "--"}, time-zone and
   * AM/PM tokens, double space) found near the start of the summary is dropped, as is a
   * prefix ending in an unmatched {@code ")"} within the first 15 characters, and finally
   * {@code trimPunctuationFromStart} is applied.</li>
   * </ol>
   *
   * <p>Whenever an intermediate result is shorter than 5 characters (after trimming), the
   * full text is used instead. If any step throws, the problem is logged and the full text is
   * returned.
   *
   * @param txt the text to summarize; {@code null} or empty yields an empty string
   * @param title passed to {@code trimSentence} for every sentence
   * @param numChars length after which no more sentences are collected, and the limit handed
   *        to {@code truncateTextOnSpace}
   * @param truncateInSentence whether to truncate via {@code truncateTextOnSpace} and end the
   *        summary with three periods
   * @return the trimmed summary, or the trimmed text when no usable summary could be formed
   */
  public static String generateSummary(String txt, String title, int numChars,
      boolean truncateInSentence) {
    // Nothing to summarize. Without this guard a null text would escape the catch block below
    // as a NullPointerException (new StringBuilder((String) null)).
    if (txt == null || txt.isEmpty()) {
      return "";
    }

    StringBuilder finalSummary;

    try {
      String[] puncChars = { ":", "--", "PM", "MST", "EST", "CST", "PST", "GMT", "AM", "  " };

      txt = txt.replace(" | ", " ");
      txt = txt.replace(" |", " ");
      ArrayList<String> sentences = TextProcessor.splitToSentences(txt);
      StringBuilder sum = new StringBuilder();
      int cnt = 0; // 1-based position of the current sentence
      int lCnt = 0; // position of the last sentence that was kept, 0 if none yet
      for (String s : sentences) {
        cnt++;
        s = trimSentence(s, title);
        if (s.length() > 60 && !s.contains("By") && !s.contains("Page")
            && !s.contains(">>") && Character.isUpperCase(s.charAt(0))) {
          if (Math.abs(cnt - lCnt) != 1 && lCnt != 0) {
            // Sentences were skipped since the last kept one: mark the gap so that the
            // text ends in three periods (one is already there if it ends with ".").
            if (sum.toString().endsWith(".")) {
              sum.append("..");
            } else {
              sum.append("...");
            }
          } else {
            sum.append(" ");
          }
          sum.append(s.trim());
          lCnt = cnt;
        }
        if (sum.length() > numChars) {
          break;
        }
      }

      finalSummary = new StringBuilder(sum.toString().trim());

      if (truncateInSentence) {
        finalSummary = new StringBuilder(truncateTextOnSpace(finalSummary.toString(), numChars));
        int numPeriods = countTrailingPeriods(finalSummary.toString());

        if (numPeriods < 3 && finalSummary.length() > 0) {
          for (int i = 0; i < 3 - numPeriods; i++) {
            finalSummary.append(".");
          }
        }
      } else {
        // trim the trailing gap marker
        if (finalSummary.toString().endsWith("..")) {
          finalSummary = new StringBuilder(finalSummary.substring(0, finalSummary.length() - 2));
        }
      }
      // check to see if we have anything, if not, fall back to the full content
      if (finalSummary.toString().trim().length() < 5) {
        finalSummary = new StringBuilder(txt);
      }

      // Look for a lead-in marker within the first 45 characters. The window is taken from the
      // same trimmed string that is cut afterwards and is clamped to its length, so empty,
      // very short or whitespace-padded text cannot trigger an index error. The window stops
      // one character short of the end, which keeps at least one character after the cut.
      String candidate = finalSummary.toString().trim();
      int windowEnd = Math.max(0, Math.min(candidate.length() - 1, 45));
      String window = candidate.substring(0, windowEnd);
      int cutAt = -1; // end offset of the right-most marker, -1 if none was found
      for (String p : puncChars) {
        int idx = window.lastIndexOf(p);
        // compare end offsets with end offsets so the marker that ends last wins
        if (idx != -1 && idx + p.length() > cutAt) {
          cutAt = idx + p.length();
        }
      }

      if (cutAt > -1) {
        finalSummary = new StringBuilder(candidate.substring(cutAt));
      }

      // Drop a prefix that ends in a ")" whose "(" was cut off or never existed.
      int closeParenIdx = finalSummary.indexOf(")");
      int openParenIdx = finalSummary.indexOf("(");
      if (closeParenIdx != -1 && closeParenIdx < 15
          && (openParenIdx == -1 || openParenIdx > closeParenIdx)) {
        finalSummary = new StringBuilder(finalSummary.substring(closeParenIdx + 1).trim());
      }

      finalSummary = new StringBuilder(trimPunctuationFromStart(finalSummary.toString()));

      // check to see if we have anything, if not, fall back to the full content
      if (finalSummary.toString().trim().length() < 5) {
        finalSummary = new StringBuilder(txt);
      }

    } catch (Exception e) {
      // Best effort: a failure in any helper must not lose the content.
      LOG.severe("Problem forming summary for: " + txt);
      LOG.severe("Using full text for the summary" + e);
      finalSummary = new StringBuilder(txt);
    }

    return finalSummary.toString().trim();
  }