protected String mergeFragments()

in jackrabbit-core/src/main/java/org/apache/jackrabbit/core/query/lucene/DefaultHighlighter.java [256:443]


    /**
     * Builds an excerpt of <code>text</code> in which the terms identified by
     * <code>offsets</code> are wrapped in highlight markers.
     * <p>
     * The offsets are grouped into <code>FragmentInfo</code> instances, the
     * groups are ordered with <code>FragmentInfoScoreSorter</code>, the first
     * <code>maxFragments</code> of them are kept and re-ordered with
     * <code>FragmentInfoPositionSorter</code>, and the kept fragments are then
     * written out in a single forward pass over <code>text</code>. All text
     * copied from <code>text</code> is passed through <code>escape</code>.
     * <p>
     * If there is nothing to highlight (no offsets, or the first offset ends
     * beyond the end of <code>text</code>) the result of
     * <code>createDefaultExcerpt</code> is returned instead.
     *
     * @param offsets       the offsets of the terms to highlight; may be
     *                      <code>null</code> or empty. Processing stops at the
     *                      first offset that ends beyond <code>text</code>
     *                      (presumably the offsets are ordered by position;
     *                      verify against callers).
     * @param text          the text the offsets refer to.
     * @param excerptStart  written once at the very beginning of the excerpt.
     * @param excerptEnd    written once at the very end of the excerpt.
     * @param fragmentStart written at the beginning of every fragment.
     * @param fragmentEnd   written at the end of a fragment.
     * @param hlStart       written immediately before every highlighted term.
     * @param hlEnd         written immediately after every highlighted term.
     * @param maxFragments  the maximum number of fragments to include.
     * @param surround      the number of characters of context read around a
     *                      fragment; <code>surround * 2</code> is passed to
     *                      <code>FragmentInfo</code> and to
     *                      <code>createDefaultExcerpt</code>.
     * @return the assembled excerpt.
     * @throws IOException if reading from the internal reader over
     *                     <code>text</code> fails.
     */
    protected String mergeFragments(TermVectorOffsetInfo[] offsets,
                                    String text,
                                    String excerptStart,
                                    String excerptEnd,
                                    String fragmentStart,
                                    String fragmentEnd,
                                    String hlStart,
                                    String hlEnd,
                                    int maxFragments,
                                    int surround) throws IOException {
        if (offsets == null || offsets.length == 0) {
            // nothing to highlight
            return createDefaultExcerpt(text, excerptStart, excerptEnd,
                    fragmentStart, fragmentEnd, surround * 2);
        }
        int lastOffset = offsets.length; // Math.min(10, offsets.length); // 10 terms is plenty?
        List<FragmentInfo> fragmentInfoList = new ArrayList<FragmentInfo>();
        // offsets that end beyond the text cannot be read from it and are ignored
        if (offsets[0].getEndOffset() <= text.length()) {
            FragmentInfo fi = new FragmentInfo(offsets[0], surround * 2);
            for (int i = 1; i < lastOffset; i++) {
                if (offsets[i].getEndOffset() > text.length()) {
                    break;
                }
                if (fi.add(offsets[i])) {
                    // offset was accepted by the current fragment
                    continue;
                }
                // current fragment rejected the offset -> start a new fragment
                fragmentInfoList.add(fi);
                fi = new FragmentInfo(offsets[i], surround * 2);
            }
            fragmentInfoList.add(fi);
        }

        if (fragmentInfoList.isEmpty()) {
            // nothing to highlight
            return createDefaultExcerpt(text, excerptStart, excerptEnd,
                    fragmentStart, fragmentEnd, surround * 2);
        }

        // sort with score
        Collections.sort(fragmentInfoList, new FragmentInfoScoreSorter());

        // extract best fragments
        List<FragmentInfo> bestFragmentsList = new ArrayList<FragmentInfo>();
        for (int i = 0; i < Math.min(fragmentInfoList.size(), maxFragments); i++) {
            bestFragmentsList.add(fragmentInfoList.get(i));
        }

        // re-sort with positions
        Collections.sort(bestFragmentsList, new FragmentInfoPositionSorter());

        // merge #maxFragments fragments
        // The reader is consumed strictly forward; 'pos' mirrors the number of
        // characters consumed so far and must be advanced with every read/skip.
        StringReader reader = new StringReader(text);
        // the buffer never leaves this method, so no synchronization is needed
        StringBuilder sb = new StringBuilder(excerptStart);
        int pos = 0;
        char[] cbuf;
        int skip;
        int nextStart;
        int skippedChars;
        int firstWhitespace;
        for (int i = 0; i < bestFragmentsList.size(); i++) {
            FragmentInfo fi = bestFragmentsList.get(i);
            fi.trim();
            nextStart = fi.getStartOffset();
            // distance from the current read position to the fragment start
            skip = nextStart - pos;
            if (skip > surround * 2) {
                // keep 'surround' characters in front of this fragment
                skip -= surround;
                if (i > 0) {
                    // end last fragment
                    cbuf = new char[surround];
                    reader.read(cbuf, 0, surround);
                    // find last whitespace
                    skippedChars = 1;
                    for (; skippedChars < surround + 1; skippedChars++) {
                        if (Character.isWhitespace(cbuf[surround - skippedChars])) {
                            break;
                        }
                    }
                    pos += surround;
                    if (skippedChars > surround) {
                        // no whitespace found -> emit nothing of the tail
                        skippedChars = surround;
                    }
                    sb.append(escape(new String(cbuf, 0, surround
                            - skippedChars)));
                    sb.append(fragmentEnd);
                }
            }
            // NOTE(review): for i > 0 and skip <= surround * 2 no fragmentEnd
            // is written before the fragmentStart below; confirm whether the
            // FragmentInfo grouping rules out fragments that close together.

            if (skip >= surround) {
                if (i > 0) {
                    // when the tail of the previous fragment was read above,
                    // those 'surround' characters are already consumed
                    skip -= surround;
                }
                // skip
                reader.skip((long) skip);
                pos += skip;
            }
            // start fragment
            // read the remaining characters in front of the first term
            cbuf = new char[nextStart - pos];
            skippedChars = Math.max(cbuf.length - 1, 0);
            firstWhitespace = skippedChars;
            reader.read(cbuf, 0, nextStart - pos);
            pos += (nextStart - pos);
            sb.append(fragmentStart);
            // find last period followed by whitespace
            // (scans backwards; firstWhitespace ends up at the left-most
            // whitespace seen, which is used when no sentence start is found)
            if (cbuf.length > 0) {
                for (; skippedChars >= 0; skippedChars--) {
                    if (Character.isWhitespace(cbuf[skippedChars])) {
                        firstWhitespace = skippedChars;
                        if (skippedChars - 1 >= 0
                                && cbuf[skippedChars - 1] == '.') {
                            // start right after the whitespace
                            skippedChars++;
                            break;
                        }
                    }
                }
            }
            boolean sentenceStart = true;
            // -1 means the backwards scan ran off the buffer without a match
            if (skippedChars == -1) {
                if (pos == cbuf.length) {
                    // this fragment is the start of the text -> skip none
                    skippedChars = 0;
                } else {
                    sentenceStart = false;
                    skippedChars = firstWhitespace + 1;
                }
            }

            if (!sentenceStart) {
                sb.append("... ");
            }
            sb.append(escape(new String(cbuf, skippedChars, cbuf.length
                    - skippedChars)));

            // iterate terms
            for (Iterator<TermVectorOffsetInfo> iter = fi.iterator(); iter.hasNext();) {
                TermVectorOffsetInfo ti = iter.next();
                nextStart = ti.getStartOffset();
                if (nextStart - pos > 0) {
                    // plain text between the previous term and this one
                    cbuf = new char[nextStart - pos];
                    int charsRead = reader.read(cbuf, 0, nextStart - pos);
                    pos += (nextStart - pos);
                    sb.append(escape(new String(cbuf, 0, charsRead)));
                }
                sb.append(hlStart);
                nextStart = ti.getEndOffset();
                // print term
                cbuf = new char[nextStart - pos];
                reader.read(cbuf, 0, nextStart - pos);
                pos += (nextStart - pos);
                sb.append(escape(new String(cbuf)));
                sb.append(hlEnd);
            }
        }
        if (pos != 0) {
            // end fragment
            // NOTE(review): never true while lastOffset == offsets.length;
            // only relevant if lastOffset is capped again (see above).
            if (offsets.length > lastOffset) {
                surround = Math.min(offsets[lastOffset].getStartOffset() - pos, surround);
            }
            cbuf = new char[surround];
            // number of characters actually available after the last term
            int tailLength = reader.read(cbuf, 0, surround);
            // probe one more character to see whether the text continues
            boolean eof = reader.read() == -1;
            if (tailLength >= 0) {
                if (!eof) {
                    // text continues: cut the tail at its last whitespace
                    skippedChars = 1;
                    for (; skippedChars < surround + 1; skippedChars++) {
                        if (Character.isWhitespace(cbuf[surround - skippedChars])) {
                            break;
                        }
                    }
                    if (skippedChars > surround) {
                        skippedChars = surround;
                    }
                } else {
                    skippedChars = 0;
                }
                // at the end of the text emit everything that was read
                sb.append(escape(new String(cbuf, 0, eof ? tailLength
                        : (surround - skippedChars))));
                if (!eof) {
                    // mark the cut unless the excerpt already ends a sentence
                    char lastChar = sb.charAt(sb.length() - 1);
                    if (lastChar != '.' && lastChar != '!' && lastChar != '?') {
                        sb.append(" ...");
                    }
                }
            }
            sb.append(fragmentEnd);
        }
        sb.append(excerptEnd);
        return sb.toString();
    }