package org.jetbrains.skija; import java.lang.ref.*; import java.util.*; import org.jetbrains.annotations.*; import org.jetbrains.skija.*; import org.jetbrains.skija.impl.*; /** *

 * <p>A class that locates boundaries in text. This class defines a protocol for
 * objects that break up a piece of natural-language text according to a set
 * of criteria. Instances or subclasses of BreakIterator can be provided, for
 * example, to break a piece of text into words, sentences, or logical characters
 * according to the conventions of some language or group of languages.
 *
 * <p>We provide four built-in types of BreakIterator:
 * <ul>
 * <li>{@link #makeSentenceInstance()} returns a BreakIterator that locates boundaries
 * between sentences. This is useful for triple-click selection, for example.
 * <li>{@link #makeWordInstance()} returns a BreakIterator that locates boundaries between
 * words. This is useful for double-click selection or "find whole words" searches.
 * This type of BreakIterator makes sure there is a boundary position at the
 * beginning and end of each legal word. (Numbers count as words, too.) Whitespace
 * and punctuation are kept separate from real words.
 * <li>{@link #makeLineInstance()} returns a BreakIterator that locates positions where it is
 * legal for a text editor to wrap lines. This is similar to word breaking, but
 * not the same: punctuation and whitespace are generally kept with words (you don't
 * want a line to start with whitespace, for example), and some special characters
 * can force a position to be considered a line-break position or prevent a position
 * from being a line-break position.
 * <li>{@link #makeCharacterInstance()} returns a BreakIterator that locates boundaries between
 * logical characters. Because of the structure of the Unicode encoding, a logical
 * character may be stored internally as more than one Unicode code point. (A with an
 * umlaut may be stored as an a followed by a separate combining umlaut character,
 * for example, but the user still thinks of it as one character.) This iterator allows
 * various processes (especially text editors) to treat as characters the units of text
 * that a user would think of as characters, rather than the units of text that the
 * computer sees as "characters".
 * </ul>

 * <p>The text boundary positions are found according to the rules
 * described in Unicode Standard Annex #29, Text Boundaries, and
 * Unicode Standard Annex #14, Line Breaking Properties. These
 * are available at http://www.unicode.org/reports/tr14/ and
 * http://www.unicode.org/reports/tr29/.

 * <p>BreakIterator's interface follows an "iterator" model (hence the name), meaning it
 * has a concept of a "current position" and methods like first(), last(), next(),
 * and previous() that update the current position. All BreakIterators uphold the
 * following invariants:
 * <ul>
 * <li>The beginning and end of the text are always treated as boundary positions.
 * <li>The current position of the iterator is always a boundary position (random-
 * access methods move the iterator to the nearest boundary position before or
 * after the specified position, not <em>to</em> the specified position).
 * <li>DONE is used as a flag to indicate when iteration has stopped. DONE is only
 * returned when the current position is the end of the text and the user calls next(),
 * or when the current position is the beginning of the text and the user calls
 * previous().
 * <li>Break positions are numbered by the positions of the characters that follow
 * them. Thus, under normal circumstances, the position before the first character
 * is 0, the position after the first character is 1, and the position after the
 * last character is 1 plus the length of the string.
 * <li>The client can change the position of an iterator, or the text it analyzes,
 * at will, but cannot change the behavior. If the user wants different behavior, they
 * must instantiate a new iterator.
 * </ul>

 *
 * <p>In this class, the text to analyze is supplied as a {@code String} through
 * {@link #setText(String)}. (The description this documentation derives from refers to
 * accessing text through a CharacterIterator, which makes it possible to analyze text
 * in any text-storage vehicle that provides a CharacterIterator interface; no such
 * overload is declared in this class.)
 *
 * <p><b>Note:</b> Some types of BreakIterator can take a long time to create, and
 * instances of BreakIterator are not currently cached by the system. For
 * optimal performance, keep instances of BreakIterator around as long as makes
 * sense. For example, when word-wrapping a document, don't create and destroy a
 * new BreakIterator for each line. Create one break iterator for the whole document
 * (or whatever stretch of text you're wrapping) and use it to do the whole job of
 * wrapping the text.
 *

* Examples:

* Creating and using text boundaries *

 * public static void main(String args[]) {
 *      if (args.length == 1) {
 *          String stringToExamine = args[0];
 *          //print each word in order
 *          BreakIterator boundary = BreakIterator.makeWordInstance();
 *          boundary.setText(stringToExamine);
 *          printEachForward(boundary, stringToExamine);
 *          //print each sentence in reverse order
 *          boundary = BreakIterator.makeSentenceInstance("en_US");
 *          boundary.setText(stringToExamine);
 *          printEachBackward(boundary, stringToExamine);
 *          printFirst(boundary, stringToExamine);
 *          printLast(boundary, stringToExamine);
 *      }
 * }
 *

* * Print each element in order *

 * public static void printEachForward(BreakIterator boundary, String source) {
 *     int start = boundary.first();
 *     for (int end = boundary.next();
 *          end != BreakIterator.DONE;
 *          start = end, end = boundary.next()) {
 *          System.out.println(source.substring(start,end));
 *     }
 * }
 *

* * Print each element in reverse order *

 * public static void printEachBackward(BreakIterator boundary, String source) {
 *     int end = boundary.last();
 *     for (int start = boundary.previous();
 *          start != BreakIterator.DONE;
 *          end = start, start = boundary.previous()) {
 *         System.out.println(source.substring(start,end));
 *     }
 * }
 *

* * Print first element *

 * public static void printFirst(BreakIterator boundary, String source) {
 *     int start = boundary.first();
 *     int end = boundary.next();
 *     System.out.println(source.substring(start,end));
 * }
 *

* * Print last element *

 * public static void printLast(BreakIterator boundary, String source) {
 *     int end = boundary.last();
 *     int start = boundary.previous();
 *     System.out.println(source.substring(start,end));
 * }
 *

* * Print the element at a specified position *

 * public static void printAt(BreakIterator boundary, int pos, String source) {
 *     int end = boundary.following(pos);
 *     int start = boundary.previous();
 *     System.out.println(source.substring(start,end));
 * }
 *

* * Find the next word *

*
 * public static int nextWordStartAfter(int pos, String text) {
 *     BreakIterator wb = BreakIterator.makeWordInstance();
 *     wb.setText(text);
 *     int wordStart = wb.following(pos);
 *     for (;;) {
 *         int wordLimit = wb.next();
 *         if (wordLimit == BreakIterator.DONE) {
 *             return BreakIterator.DONE;
 *         }
 *         int wordStatus = wb.getRuleStatus();
 *         if (wordStatus != BreakIterator.WORD_NONE) {
 *             return wordStart;
 *         }
 *         wordStart = wordLimit;
 *      }
 * }
 * 
 * <p>The iterator returned by {@link #makeWordInstance()} is unique in that
 * the break positions it returns don't represent both the start and end of the
 * thing being iterated over. That is, a sentence-break iterator returns breaks
 * that each represent the end of one sentence and the beginning of the next.
 * With the word-break iterator, the characters between two boundaries might be a
 * word, or they might be the punctuation or whitespace between two words. The
 * above code uses {@link #getRuleStatus()} to identify and ignore boundaries associated
 * with punctuation or other non-word characters.
 *

*/ public class BreakIterator extends Managed implements Cloneable { static { Library.staticLoad(); } /** * DONE is returned by previous() and next() after all valid * boundaries have been returned. */ public static final int DONE = -1; /** * Tag value for "words" that do not fit into any of other categories. * Includes spaces and most punctuation. */ public static final int WORD_NONE = 0; /** * Upper bound for tags for uncategorized words. */ public static final int WORD_NONE_LIMIT = 100; /** * Tag value for words that appear to be numbers, lower limit. */ public static final int WORD_NUMBER = 100; /** * Tag value for words that appear to be numbers, upper limit. */ public static final int WORD_NUMBER_LIMIT = 200; /** * Tag value for words that contain letters, excluding * hiragana, katakana or ideographic characters, lower limit. */ public static final int WORD_LETTER = 200; /** * Tag value for words containing letters, upper limit */ public static final int WORD_LETTER_LIMIT = 300; /** * Tag value for words containing kana characters, lower limit */ public static final int WORD_KANA = 300; /** * Tag value for words containing kana characters, upper limit */ public static final int WORD_KANA_LIMIT = 400; /** * Tag value for words containing ideographic characters, lower limit */ public static final int WORD_IDEO = 400; /** * Tag value for words containing ideographic characters, upper limit */ public static final int WORD_IDEO_LIMIT = 500; @ApiStatus.Internal public U16String _text; @ApiStatus.Internal public BreakIterator(long ptr) { super(ptr, _FinalizerHolder.PTR); } @Override public void close() { super.close(); if (_text != null) _text.close(); } /** * Create a copy of this iterator */ @Override public BreakIterator clone() { Stats.onNativeCall(); return new BreakIterator(_nClone(_ptr)); } /** * Returns a new BreakIterator instance for character breaks for the default locale. 
*/ public static BreakIterator makeCharacterInstance() { return makeCharacterInstance(null); } /** * Returns a new BreakIterator instance for character breaks for the given locale. */ public static BreakIterator makeCharacterInstance(String locale) { Stats.onNativeCall(); return new BreakIterator(_nMake(0, locale)); // UBRK_CHARACTER } /** * Returns a new BreakIterator instance for word breaks for the default locale. */ public static BreakIterator makeWordInstance() { return makeWordInstance(null); } /** * Returns a new BreakIterator instance for word breaks for the given locale. */ public static BreakIterator makeWordInstance(String locale) { Stats.onNativeCall(); return new BreakIterator(_nMake(1, locale)); // UBRK_WORD } /** * Returns a new BreakIterator instance for line breaks for the default locale. */ public static BreakIterator makeLineInstance() { return makeLineInstance(null); } /** * Returns a new BreakIterator instance for line breaks for the given locale. */ public static BreakIterator makeLineInstance(String locale) { Stats.onNativeCall(); return new BreakIterator(_nMake(2, locale)); // UBRK_LINE } /** * Returns a new BreakIterator instance for sentence breaks for the default locale. */ public static BreakIterator makeSentenceInstance() { return makeSentenceInstance(null); } /** * Returns a new BreakIterator instance for sentence breaks for the given locale. */ public static BreakIterator makeSentenceInstance(String locale) { Stats.onNativeCall(); return new BreakIterator(_nMake(3, locale)); // UBRK_SENTENCE } /** * Returns character index of the text boundary that was most recently * returned by {@link next()}, {@link next(int)}, {@link previous()}, * {@link first()}, {@link last()}, {@link following(int)} or * {@link preceding(int)}. If any of these methods returns * {@link BreakIterator#DONE} because either first or last text boundary * has been reached, it returns the first or last text boundary depending * on which one is reached. 
*/ public int current() { try { Stats.onNativeCall(); return _nCurrent(_ptr); } finally { Reference.reachabilityFence(this); } } /** * Returns the boundary following the current boundary. If the current * boundary is the last text boundary, it returns {@link BreakIterator#DONE} * and the iterator's current position is unchanged. Otherwise, the * iterator's current position is set to the boundary following the current * boundary. */ public int next() { try { Stats.onNativeCall(); return _nNext(_ptr); } finally { Reference.reachabilityFence(this); } } /** * Advances the iterator either forward or backward the specified number of steps. * Negative values move backward, and positive values move forward. This is * equivalent to repeatedly calling next() or previous(). * @param n The number of steps to move. The sign indicates the direction * (negative is backwards, and positive is forwards). * @return The character offset of the boundary position n boundaries away from * the current one. */ public int next(int n) { int result = 0; if (n > 0) { for (; n > 0 && result != DONE; --n) { result = next(); } } else if (n < 0) { for (; n < 0 && result != DONE; ++n) { result = previous(); } } else { result = current(); } return result; } /** * Returns the boundary following the current boundary. If the current * boundary is the last text boundary, it returns {@link BreakIterator#DONE} * and the iterator's current position is unchanged. Otherwise, the * iterator's current position is set to the boundary following the current * boundary. */ public int previous() { try { Stats.onNativeCall(); return _nPrevious(_ptr); } finally { Reference.reachabilityFence(this); } } /** * Returns the first boundary. The iterator's current position is set to the first text boundary. */ public int first() { try { Stats.onNativeCall(); return _nFirst(_ptr); } finally { Reference.reachabilityFence(this); } } /** * Returns the last boundary. 
The iterator's current position is set to the last text boundary. */ public int last() { try { Stats.onNativeCall(); return _nLast(_ptr); } finally { Reference.reachabilityFence(this); } } /** * Returns the last boundary preceding the specified character offset. * If the specified offset is equal to the first text boundary, it returns * {@link BreakIterator#DONE} and the iterator's current position is * unchanged. Otherwise, the iterator's current position is set to the * returned boundary. The value returned is always less than the offset or * the value {@link BreakIterator#DONE}. */ public int preceding(int offset) { try { Stats.onNativeCall(); return _nPreceding(_ptr, offset); } finally { Reference.reachabilityFence(this); } } /** * Returns the first boundary following the specified character offset. * If the specified offset is equal to the last text boundary, it returns * {@link BreakIterator#DONE} and the iterator's current position is * unchanged. Otherwise, the iterator's current position is set to the * returned boundary. The value returned is always greater than the offset or * the value {@link BreakIterator#DONE}. */ public int following(int offset) { try { Stats.onNativeCall(); return _nFollowing(_ptr, offset); } finally { Reference.reachabilityFence(this); } } /** * Returns true if the specified character offset is a text boundary. */ public boolean isBoundary(int offset) { try { Stats.onNativeCall(); return _nIsBoundary(_ptr, offset); } finally { Reference.reachabilityFence(this); } } /** * For rule-based BreakIterators, return the status tag from the * break rule that determined the boundary at the current iteration position. *

* For break iterator types that do not support a rule status, * a default value of 0 is returned. *

* @return The status from the break rule that determined the boundary * at the current iteration position. */ public int getRuleStatus() { try { Stats.onNativeCall(); return _nGetRuleStatus(_ptr); } finally { Reference.reachabilityFence(this); } } /** * For RuleBasedBreakIterators, get the status (tag) values from the break rule(s) * that determined the the boundary at the current iteration position. *

* For break iterator types that do not support rule status, * no values are returned. * * @return an array with the status values. */ public int[] getRuleStatuses() { try { Stats.onNativeCall(); return _nGetRuleStatuses(_ptr); } finally { Reference.reachabilityFence(this); } } /** * Set a new text string to be scanned. The current scan position is reset to {@link first()}. */ public void setText(String text) { try { Stats.onNativeCall(); _text = new U16String(text); _nSetText(_ptr, Native.getPtr(_text)); } finally { Reference.reachabilityFence(this); Reference.reachabilityFence(_text); } } @ApiStatus.Internal public static class _FinalizerHolder { public static final long PTR = _nGetFinalizer(); } @ApiStatus.Internal public static native long _nGetFinalizer(); @ApiStatus.Internal public static native long _nMake(int type, String locale); @ApiStatus.Internal public static native long _nClone(long ptr); @ApiStatus.Internal public static native int _nCurrent(long ptr); @ApiStatus.Internal public static native int _nNext(long ptr); @ApiStatus.Internal public static native int _nPrevious(long ptr); @ApiStatus.Internal public static native int _nFirst(long ptr); @ApiStatus.Internal public static native int _nLast(long ptr); @ApiStatus.Internal public static native int _nPreceding(long ptr, int offset); @ApiStatus.Internal public static native int _nFollowing(long ptr, int offset); @ApiStatus.Internal public static native boolean _nIsBoundary(long ptr, int offset); @ApiStatus.Internal public static native int _nGetRuleStatus(long ptr); @ApiStatus.Internal public static native int[] _nGetRuleStatuses(long ptr); @ApiStatus.Internal public static native void _nSetText(long ptr, long textPtr); }