in src/main/java/net/starlark/java/syntax/Lexer.java [594:792]
private void tokenize() {
  // Scans forward from pos and records at most one token via setToken per call.
  // Order of work: (1) recompute indentation if requested, (2) hand out any pending
  // INDENT/OUTDENT token, (3) scan characters until some case produces a token,
  // (4) at end of input, emit NEWLINE plus queued OUTDENTs, or EOF.
  if (checkIndentation) {
    checkIndentation = false;
    computeIndentation();
  }

  // Pending indentation changes are delivered one per call:
  // a negative count yields OUTDENT, a positive count yields INDENT.
  if (dents < 0) {
    dents++;
    setToken(TokenKind.OUTDENT, pos - 1, pos);
    return;
  }
  if (dents > 0) {
    dents--;
    setToken(TokenKind.INDENT, pos - 1, pos);
    return;
  }

  // TODO(adonovan): cleanup: replace break after setToken with return,
  // and eliminate null-check of this.kind.
  kind = null;
  while (pos < buffer.length) {
    if (tokenizeTwoChars()) {
      pos += 2;
      return;
    }

    // Consume one character; 'start' is the offset of that character.
    final int start = pos;
    final char c = buffer[pos++];
    switch (c) {
      // ---- characters that produce no token here ----
      case ' ':
      case '\t':
      case '\r':
        // Insignificant whitespace.
        break;

      case '\n':
        newline();
        break;

      case '\\':
        {
          // A backslash is only legal immediately before a line ending (LF or CRLF),
          // where it joins the two lines; string literals handle their own backslashes.
          final int next = peek(0);
          if (next == '\n') {
            pos++; // swallow LF
          } else if (next == '\r' && peek(1) == '\n') {
            pos += 2; // swallow CRLF
          } else {
            setToken(TokenKind.ILLEGAL, start, pos);
            setValue(String.valueOf(c));
          }
          break;
        }

      case '#':
        // A comment extends up to, but not including, the next LF (or end of input).
        while (pos < buffer.length && buffer[pos] != '\n') {
          pos++;
        }
        addComment(start, pos);
        break;

      // ---- string literals ----
      case '\'':
      case '"':
        stringLiteral(c, false);
        break;

      // ---- brackets: openers bump the nesting depth, closers pop it ----
      case '(':
        setToken(TokenKind.LPAREN, start, pos);
        openParenStackDepth++;
        break;
      case '[':
        setToken(TokenKind.LBRACKET, start, pos);
        openParenStackDepth++;
        break;
      case '{':
        setToken(TokenKind.LBRACE, start, pos);
        openParenStackDepth++;
        break;
      case ')':
        setToken(TokenKind.RPAREN, start, pos);
        popParen();
        break;
      case ']':
        setToken(TokenKind.RBRACKET, start, pos);
        popParen();
        break;
      case '}':
        setToken(TokenKind.RBRACE, start, pos);
        popParen();
        break;

      // ---- operators that may be doubled and then followed by '=' ----
      case '<':
        scanRepeatedOperator(
            c, start, TokenKind.LESS, TokenKind.LESS_LESS, TokenKind.LESS_LESS_EQUALS);
        break;
      case '>':
        scanRepeatedOperator(
            c,
            start,
            TokenKind.GREATER,
            TokenKind.GREATER_GREATER,
            TokenKind.GREATER_GREATER_EQUALS);
        break;
      case '/':
        // /= is handled by tokenizeTwoChars.
        scanRepeatedOperator(
            c, start, TokenKind.SLASH, TokenKind.SLASH_SLASH, TokenKind.SLASH_SLASH_EQUALS);
        break;

      // ---- single-character punctuation and operators ----
      case ',':
        setToken(TokenKind.COMMA, start, pos);
        break;
      case ':':
        setToken(TokenKind.COLON, start, pos);
        break;
      case ';':
        setToken(TokenKind.SEMI, start, pos);
        break;
      case '=':
        setToken(TokenKind.EQUALS, start, pos);
        break;
      case '+':
        setToken(TokenKind.PLUS, start, pos);
        break;
      case '-':
        setToken(TokenKind.MINUS, start, pos);
        break;
      case '*':
        setToken(TokenKind.STAR, start, pos);
        break;
      case '%':
        setToken(TokenKind.PERCENT, start, pos);
        break;
      case '&':
        setToken(TokenKind.AMPERSAND, start, pos);
        break;
      case '|':
        setToken(TokenKind.PIPE, start, pos);
        break;
      case '^':
        setToken(TokenKind.CARET, start, pos);
        break;
      case '~':
        setToken(TokenKind.TILDE, start, pos);
        break;

      default:
        // Raw string literal, e.g. r"str": an 'r' directly followed by a quote.
        if (c == 'r') {
          final int quote = peek(0);
          if (quote == '\'' || quote == '"') {
            pos++; // consume the opening quote
            stringLiteral((char) quote, true);
            break;
          }
        }
        if (c == '.' || isdigit(c)) {
          // Int or float literal, or a lone dot: give the character back to the scanner.
          pos = start;
          scanNumberOrDot(c);
        } else if (c == '_' || ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) {
          identifierOrKeyword();
        } else {
          error("invalid character: '" + c + "'", start);
        }
        break;
    }

    // Cases such as whitespace or a line continuation leave 'kind' unset; keep
    // scanning until some case has actually recorded a token.
    if (kind != null) {
      return;
    }
  }

  // Input is exhausted.
  if (indentStack.size() <= 1) {
    // No indentation level is still open.
    setToken(TokenKind.EOF, pos, pos);
    return;
  }

  // Close every open indentation level: emit NEWLINE now and queue one OUTDENT per
  // popped level (delivered by later calls). The last stack entry is left in place.
  setToken(TokenKind.NEWLINE, pos - 1, pos);
  do {
    indentStack.pop();
    dents--;
  } while (indentStack.size() > 1);
}

/**
 * Records the token for an operator that may appear once, twice, or twice followed by
 * {@code '='} (for example {@code >}, {@code >>}, {@code >>=}).
 *
 * <p>The first occurrence of {@code op} must already have been consumed, so {@code pos} points
 * just past it. Any further characters belonging to the token are consumed here.
 *
 * @param op the operator character that was just consumed
 * @param start offset of that first character
 * @param single token kind for a lone {@code op}
 * @param doubled token kind for {@code op op}
 * @param doubledEquals token kind for {@code op op =}
 */
private void scanRepeatedOperator(
    char op, int start, TokenKind single, TokenKind doubled, TokenKind doubledEquals) {
  if (peek(0) != op) {
    setToken(single, start, pos);
  } else if (peek(1) == '=') {
    setToken(doubledEquals, start, pos + 2);
    pos += 2;
  } else {
    setToken(doubled, start, pos + 1);
    pos += 1;
  }
}