private void tokenize()

in src/main/java/net/starlark/java/syntax/Lexer.java [594:792]


  private void tokenize() {
    if (checkIndentation) {
      checkIndentation = false;
      computeIndentation();
    }

    // Return saved indentation tokens.
    if (dents != 0) {
      if (dents < 0) {
        dents++;
        setToken(TokenKind.OUTDENT, pos - 1, pos);
      } else {
        dents--;
        setToken(TokenKind.INDENT, pos - 1, pos);
      }
      return;
    }

    // TODO(adonovan): cleanup: replace break after setToken with return,
    // and eliminate null-check of this.kind.
    kind = null;
    while (pos < buffer.length) {
      if (tokenizeTwoChars()) {
        pos += 2;
        return;
      }
      char c = buffer[pos];
      pos++;
      switch (c) {
        case '{':
          setToken(TokenKind.LBRACE, pos - 1, pos);
          openParenStackDepth++;
          break;
        case '}':
          setToken(TokenKind.RBRACE, pos - 1, pos);
          popParen();
          break;
        case '(':
          setToken(TokenKind.LPAREN, pos - 1, pos);
          openParenStackDepth++;
          break;
        case ')':
          setToken(TokenKind.RPAREN, pos - 1, pos);
          popParen();
          break;
        case '[':
          setToken(TokenKind.LBRACKET, pos - 1, pos);
          openParenStackDepth++;
          break;
        case ']':
          setToken(TokenKind.RBRACKET, pos - 1, pos);
          popParen();
          break;
        case '>':
          if (peek(0) == '>' && peek(1) == '=') {
            setToken(TokenKind.GREATER_GREATER_EQUALS, pos - 1, pos + 2);
            pos += 2;
          } else if (peek(0) == '>') {
            setToken(TokenKind.GREATER_GREATER, pos - 1, pos + 1);
            pos += 1;
          } else {
            setToken(TokenKind.GREATER, pos - 1, pos);
          }
          break;
        case '<':
          if (peek(0) == '<' && peek(1) == '=') {
            setToken(TokenKind.LESS_LESS_EQUALS, pos - 1, pos + 2);
            pos += 2;
          } else if (peek(0) == '<') {
            setToken(TokenKind.LESS_LESS, pos - 1, pos + 1);
            pos += 1;
          } else {
            setToken(TokenKind.LESS, pos - 1, pos);
          }
          break;
        case ':':
          setToken(TokenKind.COLON, pos - 1, pos);
          break;
        case ',':
          setToken(TokenKind.COMMA, pos - 1, pos);
          break;
        case '+':
          setToken(TokenKind.PLUS, pos - 1, pos);
          break;
        case '-':
          setToken(TokenKind.MINUS, pos - 1, pos);
          break;
        case '|':
          setToken(TokenKind.PIPE, pos - 1, pos);
          break;
        case '=':
          setToken(TokenKind.EQUALS, pos - 1, pos);
          break;
        case '%':
          setToken(TokenKind.PERCENT, pos - 1, pos);
          break;
        case '~':
          setToken(TokenKind.TILDE, pos - 1, pos);
          break;
        case '&':
          setToken(TokenKind.AMPERSAND, pos - 1, pos);
          break;
        case '^':
          setToken(TokenKind.CARET, pos - 1, pos);
          break;
        case '/':
          if (peek(0) == '/' && peek(1) == '=') {
            setToken(TokenKind.SLASH_SLASH_EQUALS, pos - 1, pos + 2);
            pos += 2;
          } else if (peek(0) == '/') {
            setToken(TokenKind.SLASH_SLASH, pos - 1, pos + 1);
            pos += 1;
          } else {
            // /= is handled by tokenizeTwoChars.
            setToken(TokenKind.SLASH, pos - 1, pos);
          }
          break;
        case ';':
          setToken(TokenKind.SEMI, pos - 1, pos);
          break;
        case '*':
          setToken(TokenKind.STAR, pos - 1, pos);
          break;
        case ' ':
        case '\t':
        case '\r':
          /* ignore */
          break;
        case '\\':
          // Backslash character is valid only at the end of a line (or in a string)
          if (peek(0) == '\n') {
            pos += 1; // skip the end of line character
          } else if (peek(0) == '\r' && peek(1) == '\n') {
            pos += 2; // skip the CRLF at the end of line
          } else {
            setToken(TokenKind.ILLEGAL, pos - 1, pos);
            setValue(Character.toString(c));
          }
          break;
        case '\n':
          newline();
          break;
        case '#':
          int oldPos = pos - 1;
          while (pos < buffer.length) {
            c = buffer[pos];
            if (c == '\n') {
              break;
            } else {
              pos++;
            }
          }
          addComment(oldPos, pos);
          break;
        case '\'':
        case '\"':
          stringLiteral(c, false);
          break;
        default:
          // detect raw strings, e.g. r"str"
          if (c == 'r') {
            int c0 = peek(0);
            if (c0 == '\'' || c0 == '\"') {
              pos++;
              stringLiteral((char) c0, true);
              break;
            }
          }

          // int or float literal, or dot
          if (c == '.' || isdigit(c)) {
            pos--; // unconsume
            scanNumberOrDot(c);
            break;
          }

          if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_') {
            identifierOrKeyword();
          } else {
            error("invalid character: '" + c + "'", pos - 1);
          }
          break;
      } // switch
      if (kind != null) { // stop here if we scanned a token
        return;
      }
    } // while

    if (indentStack.size() > 1) { // top of stack is always zero
      setToken(TokenKind.NEWLINE, pos - 1, pos);
      while (indentStack.size() > 1) {
        indentStack.pop();
        dents--;
      }
      return;
    }

    setToken(TokenKind.EOF, pos, pos);
  }