public final boolean incrementToken()

in x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/MlStandardTokenizer.java [35:146]


    /**
     * Scans characters from {@code getNextChar()} until a valid token has been built or the
     * character source is exhausted (signalled by a negative return value).
     * <p>
     * A candidate token is made of letters and digits; of underscores, dots, dashes and at symbols
     * when they are not the first character; of forward slashes anywhere; of a colon when everything
     * before it in the candidate is a letter; and of backslashes at the start of the candidate, after
     * a single letter plus colon, or when the candidate began with a backslash.
     * <p>
     * A candidate is discarded (and counted as a skipped position) when it consists solely of
     * hexadecimal digits and the punctuation treated as numeric, when it begins with a digit, or when
     * it consists solely of slashes. If a colon is not immediately followed by a slash then the colon
     * and what follows are not part of the token; the character after the colon is put back so that
     * it can start the next candidate.
     * <p>
     * Trailing underscores, dots, dashes, at symbols and colons are stripped from the returned token.
     *
     * @return {@code true} if a token was produced and the term, offset and position increment
     *         attributes were populated; {@code false} if the input ran out without a valid token
     * @throws IOException declared by the method signature; presumably propagated from reading the input
     */
    public final boolean incrementToken() throws IOException {
        clearAttributes();
        skippedPositions = 0;

        int start = -1;
        int length = 0;

        boolean haveNonHex = false;
        int lettersBeforeColon = 0;
        boolean haveColon = false;
        int firstBackslashPos = -1;
        int firstForwardSlashPos = -1;
        int slashCount = 0;
        int curChar;
        while ((curChar = getNextChar()) >= 0) {
            ++nextOffset;
            if (Character.isLetterOrDigit(curChar)
                || (length > 0
                    && (curChar == '_'
                        || curChar == '.'
                        || curChar == '-'
                        || curChar == '@'
                        || (curChar == ':' && lettersBeforeColon == length)))
                || curChar == '/'
                || (curChar == '\\' && (length == 0 || (haveColon && lettersBeforeColon == 1) || firstBackslashPos == 0))) {
                if (length == 0) {
                    // We're at the first character of a candidate token, so record the offset
                    start = nextOffset - 1;
                }
                termAtt.append((char) curChar);
                ++length;

                // Tracking related to colons and slashes
                if (curChar == ':') {
                    haveColon = true;
                } else if (curChar == '/') {
                    ++slashCount;
                    if (firstForwardSlashPos == -1) {
                        firstForwardSlashPos = length - 1;
                    }
                } else if (curChar == '\\') {
                    ++slashCount;
                    if (firstBackslashPos == -1) {
                        firstBackslashPos = length - 1;
                    }
                } else {
                    if (haveColon) {
                        if (firstBackslashPos != lettersBeforeColon + 1 && firstForwardSlashPos != lettersBeforeColon + 1) {
                            // If our token contains a colon but not followed by a slash, drop the colon and everything after it.
                            // The current character is handed back so that it begins the next candidate token.
                            assert length - lettersBeforeColon == 2;
                            length -= 2;
                            putBackChar = curChar;
                            --nextOffset;

                            if (haveNonHex && Character.isDigit(termAtt.charAt(0)) == false && length > slashCount) {
                                // What precedes the colon is a valid token to return
                                break;
                            }

                            // What precedes the colon is not valid to return (for example the letters are all hex
                            // digits, as in "Dec:12"). Breaking out here would make this method return false, which
                            // callers interpret as end of stream even though more input remains, so instead treat
                            // it like any other rejected candidate: wipe it and carry on searching. The put-back
                            // character is expected to be the next one returned by getNextChar().
                            ++skippedPositions;
                            start = -1;
                            length = 0;
                            termAtt.setEmpty();

                            haveNonHex = false;
                            lettersBeforeColon = 0;
                            haveColon = false;
                            firstBackslashPos = -1;
                            firstForwardSlashPos = -1;
                            slashCount = 0;
                            continue;
                        }
                    } else if (Character.isLetter(curChar)) {
                        ++lettersBeforeColon;
                    }
                }

                // We don't return tokens that are hex numbers, and it's most efficient to keep a running note of this
                haveNonHex = haveNonHex ||
                // Count dots, dashes, at symbols and colons as numeric
                    (Character.digit(curChar, 16) == -1 && curChar != '.' && curChar != '-' && curChar != '@' && curChar != ':');
            } else if (length > 0) {
                // If we get here, we've found a separator character having built up a candidate token

                if (haveNonHex && Character.isDigit(termAtt.charAt(0)) == false && length > slashCount) {
                    // The candidate token is valid to return
                    break;
                }

                // The candidate token is not valid to return, i.e. it's hex, begins with a digit or all slashes,
                // so wipe it and carry on searching
                ++skippedPositions;
                start = -1;
                length = 0;
                termAtt.setEmpty();

                haveNonHex = false;
                lettersBeforeColon = 0;
                haveColon = false;
                firstBackslashPos = -1;
                firstForwardSlashPos = -1;
                slashCount = 0;
            }
        }

        // We need to recheck whether we've got a valid token after the loop because
        // the loop can also be exited on reaching the end of the stream
        if (length == 0) {
            return false;
        }

        if (haveNonHex == false || Character.isDigit(termAtt.charAt(0)) || length == slashCount) {
            ++skippedPositions;
            return false;
        }

        // Strip dots, dashes, underscores, at symbols and colons at the end of the token.
        // This cannot run past the start of the token: the first character of a candidate
        // is always a letter, digit or slash, none of which are stripped.
        char toCheck;
        while ((toCheck = termAtt.charAt(length - 1)) == '_' || toCheck == '.' || toCheck == '-' || toCheck == '@' || toCheck == ':') {
            --length;
        }

        // Characters that may exist in the term attribute beyond its defined length are ignored
        termAtt.setLength(length);
        offsetAtt.setOffset(correctOffset(start), correctOffset(start + length));
        posIncrAtt.setPositionIncrement(skippedPositions + 1);

        return true;
    }