in rhino/src/main/java/org/mozilla/javascript/TokenStream.java [649:1398]
/**
 * Scans the next token from the input and returns its Token type code.
 *
 * <p>Leading whitespace (as classified by isJSSpace) is skipped; a newline is reported as its
 * own Token.EOL token and end of input as Token.EOF. For every token the start position is
 * recorded in tokenStartLastLineEnd, tokenStartLineno, tokenBeg and tokenEnd. The first
 * significant character is then tried, in this order, as: the '@' token, an identifier or
 * keyword (possibly written with backslash-u escapes), a numeric literal, a quoted string
 * literal, a hashbang line, and finally a punctuator or comment.
 *
 * <p>Besides the return value, results are left in fields: this.string (identifier, keyword
 * and string text, and the source text of a number), this.number (Token.NUMBER), this.bigInt
 * (Token.BIGINT) and commentType (Token.COMMENT).
 *
 * @return a Token constant; Token.ERROR is returned after a lexical error has been passed to
 *     the parser
 * @throws IOException declared by this method; presumably raised by the character-reading
 *     helpers it calls
 */
final int getToken() throws IOException {
int c;
// NOTE(review): every path through the visible loop body ends in a return, so this
// outer loop appears to execute only once per call.
for (; ; ) {
// Eat whitespace, possibly sensitive to newlines.
for (; ; ) {
c = getChar();
if (c == EOF_CHAR) {
// End of input: record a one-character token span and stop.
tokenStartLastLineEnd = lastLineEnd;
tokenStartLineno = lineno;
tokenBeg = cursor - 1;
tokenEnd = cursor;
return Token.EOF;
} else if (c == '\n') {
// A newline is a token of its own; the new line starts out "clean".
dirtyLine = false;
tokenStartLastLineEnd = lastLineEnd;
tokenStartLineno = lineno;
tokenBeg = cursor - 1;
tokenEnd = cursor;
return Token.EOL;
} else if (!isJSSpace(c)) {
// First significant character. A '-' does not dirty the line yet: the '-'
// case at the bottom needs dirtyLine to still be false to recognise an HTML
// end-comment at line start, and sets dirtyLine itself afterwards.
if (c != '-') {
dirtyLine = true;
}
break;
}
}
// Assume the token will be 1 char - fixed up below.
tokenStartLastLineEnd = lastLineEnd;
tokenStartLineno = lineno;
tokenBeg = cursor - 1;
tokenEnd = cursor;
// '@' maps directly to Token.XMLATTR.
if (c == '@') return Token.XMLATTR;
// identifier/keyword/instanceof?
// watch out for starting with a <backslash>
boolean identifierStart;
boolean isUnicodeEscapeStart = false;
if (c == '\\') {
c = getChar();
if (c == 'u') {
// <backslash>u opens an identifier whose first character is escaped; the
// escape digits are decoded inside the identifier loop below.
identifierStart = true;
isUnicodeEscapeStart = true;
stringBufferTop = 0;
} else {
// Not an escape: push the character back and restore c to the backslash so
// the checks further down see it (no switch case matches it, so it ends in
// the default "illegal character" branch).
identifierStart = false;
ungetChar(c);
c = '\\';
}
} else {
identifierStart = Character.isUnicodeIdentifierStart(c) || c == '$' || c == '_';
if (identifierStart) {
stringBufferTop = 0;
addToString(c);
}
}
if (identifierStart) {
// Remembers whether any escape occurred anywhere in the identifier; this
// changes how keywords are treated after the loop.
boolean containsEscape = isUnicodeEscapeStart;
for (; ; ) {
if (isUnicodeEscapeStart) {
// strictly speaking we should probably push-back
// all the bad characters if the <backslash>uXXXX
// sequence is malformed. But since there isn't a
// correct context(is there?) for a bad Unicode
// escape sequence in an identifier, we can report
// an error here.
int escapeVal = 0;
if (matchTemplateLiteralChar('{')) {
// Braced form: accumulate hex digits until the closing '}'.
// A negative escapeVal is treated below as "not a hex digit"
// (presumably the contract of Kit.xDigitToInt - confirm).
for (; ; ) {
c = getTemplateLiteralChar();
if (c == '}') {
break;
}
escapeVal = Kit.xDigitToInt(c, escapeVal);
if (escapeVal < 0) {
break;
}
}
if (escapeVal < 0 || escapeVal > 0x10FFFF) {
parser.reportError("msg.invalid.escape");
// NOTE(review): the digit loop has already ended, so this break leaves
// the identifier loop itself; if reportError returns, scanning resumes
// at the ungetChar(c) after the loop with whatever was buffered so far.
break;
}
} else {
// Fixed form: exactly four hex digits.
for (int i = 0; i != 4; ++i) {
c = getChar();
escapeVal = Kit.xDigitToInt(c, escapeVal);
// Next check takes care about c < 0 and bad escape
if (escapeVal < 0) {
parser.reportError("msg.invalid.escape");
break;
}
}
}
if (escapeVal < 0) {
// NOTE(review): on the four-digit path the same message was already passed
// to reportError just above, so it looks like it is reported twice when
// reportError returns normally - confirm whether that is intended.
parser.addError("msg.invalid.escape");
return Token.ERROR;
}
addToString(escapeVal);
isUnicodeEscapeStart = false;
} else {
c = getChar();
if (c == '\\') {
// A backslash inside an identifier must start a <backslash>u escape.
c = getChar();
if (c == 'u') {
isUnicodeEscapeStart = true;
containsEscape = true;
} else {
parser.addError("msg.illegal.character", c);
return Token.ERROR;
}
} else {
// The identifier ends at EOF, at a byte order mark, or at any character
// that is neither a Unicode identifier part nor '$'.
if (c == EOF_CHAR
|| c == BYTE_ORDER_MARK
|| !(Character.isUnicodeIdentifierPart(c) || c == '$')) {
break;
}
addToString(c);
}
}
}
// Push back the character that terminated the identifier.
ungetChar(c);
String str = getStringFromBuffer();
// Keyword lookup is done for escape-free identifiers, and from ES6 on also for
// identifiers that contained escapes.
if (!containsEscape
|| parser.compilerEnv.getLanguageVersion() >= Context.VERSION_ES6) {
// OPT we shouldn't have to make a string (object!) to
// check if it's a keyword.
// Return the corresponding token if it's a keyword
int result =
stringToKeyword(
str,
parser.compilerEnv.getLanguageVersion(),
parser.inUseStrictDirective());
// Any result other than Token.EOF is handled here as a keyword match.
if (result != Token.EOF) {
if ((result == Token.LET || result == Token.YIELD)
&& parser.compilerEnv.getLanguageVersion() < Context.VERSION_1_7) {
// LET and YIELD are tokens only in 1.7 and later
// NOTE(review): this store is overwritten by the this.string assignment a
// few lines below, so it appears to be redundant.
string = result == Token.LET ? "let" : "yield";
result = Token.NAME;
}
// Save the string in case we need to use in
// object literal definitions.
this.string = internString(str);
// Token.RESERVED is returned too, except before ES6 when
// isReservedKeywordAsIdentifier() is true; in that case control falls
// through and the word is returned as a plain NAME at the end.
if (result != Token.RESERVED) {
return result;
} else if (parser.compilerEnv.getLanguageVersion() >= Context.VERSION_ES6) {
return result;
} else if (!parser.compilerEnv.isReservedKeywordAsIdentifier()) {
return result;
}
}
} else if (isKeyword(
str,
parser.compilerEnv.getLanguageVersion(),
parser.inUseStrictDirective())) {
// If a string contains unicodes, and converted to a keyword,
// we convert the last character back to unicode
str = convertLastCharToHex(str);
}
// From ES6 on, an identifier written with escapes must still be a valid
// identifier name after decoding.
if (containsEscape
&& parser.compilerEnv.getLanguageVersion() >= Context.VERSION_ES6
&& !isValidIdentifierName(str)) {
parser.reportError("msg.invalid.escape");
return Token.ERROR;
}
this.string = internString(str);
return Token.NAME;
}
// is it a number?
// A literal starts with a digit, or with '.' immediately followed by a digit.
if (isDigit(c) || (c == '.' && isDigit(peekChar()))) {
stringBufferTop = 0;
int base = 10;
isHex = isOldOctal = isOctal = isBinary = false;
boolean es6 = parser.compilerEnv.getLanguageVersion() >= Context.VERSION_ES6;
if (c == '0') {
// A leading zero may introduce a radix prefix. The 0o/0b prefixes are only
// recognised from ES6 on; a following digit selects legacy octal.
c = getChar();
if (c == 'x' || c == 'X') {
base = 16;
isHex = true;
c = getChar();
} else if (es6 && (c == 'o' || c == 'O')) {
base = 8;
isOctal = true;
c = getChar();
} else if (es6 && (c == 'b' || c == 'B')) {
base = 2;
isBinary = true;
c = getChar();
} else if (isDigit(c)) {
// Legacy octal: the leading zero is not stored in the buffer.
base = 8;
isOldOctal = true;
} else {
// A plain zero; c already holds the character that follows it.
addToString('0');
}
}
// Buffer length before any digits are read, used below to detect a radix
// prefix that is not followed by at least one digit.
int emptyDetector = stringBufferTop;
if (base == 10 || base == 16 || (base == 8 && !isOldOctal) || base == 2) {
// readDigits presumably appends the digits of the given base to the buffer
// and returns the first character it did not consume - confirm in readDigits.
c = readDigits(base, c);
if (c == REPORT_NUMBER_FORMAT_ERROR) {
parser.addError("msg.caught.nfe");
return Token.ERROR;
}
} else {
while (isDigit(c)) {
// finally the oldOctal case
if (c >= '8') {
/*
* We permit 08 and 09 as decimal numbers, which
* makes our behavior a superset of the ECMA
* numeric grammar. We might not always be so
* permissive, so we warn about it.
*/
parser.addWarning("msg.bad.octal.literal", c == '8' ? "8" : "9");
// From here on the literal is read as decimal.
base = 10;
c = readDigits(base, c);
if (c == REPORT_NUMBER_FORMAT_ERROR) {
parser.addError("msg.caught.nfe");
return Token.ERROR;
}
break;
}
addToString(c);
c = getChar();
}
}
// A 0x / 0o / 0b prefix with no digits after it is an error.
if (stringBufferTop == emptyDetector && (isBinary || isOctal || isHex)) {
parser.addError("msg.caught.nfe");
return Token.ERROR;
}
boolean isInteger = true;
boolean isBigInt = false;
// The BigInt suffix is tested before the fraction/exponent, so only literals
// without a fraction or exponent can take it.
if (es6 && c == 'n') {
isBigInt = true;
c = getChar();
} else if (base == 10 && (c == '.' || c == 'e' || c == 'E')) {
isInteger = false;
if (c == '.') {
// Fraction part.
isInteger = false;
addToString(c);
c = getChar();
c = readDigits(base, c);
if (c == REPORT_NUMBER_FORMAT_ERROR) {
parser.addError("msg.caught.nfe");
return Token.ERROR;
}
}
if (c == 'e' || c == 'E') {
// Exponent part: optional sign, then at least one digit is required.
isInteger = false;
addToString(c);
c = getChar();
if (c == '+' || c == '-') {
addToString(c);
c = getChar();
}
if (!isDigit(c)) {
parser.addError("msg.missing.exponent");
return Token.ERROR;
}
c = readDigits(base, c);
if (c == REPORT_NUMBER_FORMAT_ERROR) {
parser.addError("msg.caught.nfe");
return Token.ERROR;
}
}
}
// Push back the first character that is not part of the literal.
ungetChar(c);
String numString = getStringFromBuffer();
// this.string keeps the buffered text as scanned, including any separators;
// numString is the separator-free copy used for the conversion below.
this.string = numString;
// try to remove the separator in a fast way
int pos = numString.indexOf(NUMERIC_SEPARATOR);
if (pos != -1) {
// Compact in place: starting at the first separator, copy every
// non-separator character down; pos ends up as the new length.
final char[] chars = numString.toCharArray();
for (int i = pos + 1; i < chars.length; i++) {
if (chars[i] != NUMERIC_SEPARATOR) {
chars[pos++] = chars[i];
}
}
numString = new String(chars, 0, pos);
}
if (isBigInt) {
// NOTE(review): unlike the Double.parseDouble call below, a
// NumberFormatException from this constructor is not caught here; presumably
// the digit scanning above guarantees valid input - confirm.
this.bigInt = new BigInteger(numString, base);
return Token.BIGINT;
}
double dval;
if (base == 10 && !isInteger) {
try {
// Use Java conversion to number from string...
dval = Double.parseDouble(numString);
} catch (NumberFormatException ex) {
parser.addError("msg.caught.nfe");
return Token.ERROR;
}
} else {
// Integer literals of any base are converted by the runtime helper.
dval = ScriptRuntime.stringPrefixToNumber(numString, 0, base);
}
this.number = dval;
return Token.NUMBER;
}
// is it a string?
if (c == '"' || c == '\'') {
// We attempt to accumulate a string the fast way, by
// building it directly out of the reader. But if there
// are any escaped characters in the string, we revert to
// building it out of a StringBuffer.
// NOTE(review): the visible code always accumulates through addToString; the
// "fast way" described above is not apparent in this method.
quoteChar = c;
stringBufferTop = 0;
c = getCharIgnoreLineEnd(false);
strLoop:
while (c != quoteChar) {
boolean unterminated = false;
if (c == EOF_CHAR) {
unterminated = true;
} else if (c == '\n') {
// lineEndChar presumably holds the actual line terminator that was read as
// '\n' - confirm in getChar. LF and CR end the literal; LS and PS are kept
// as string content.
switch (lineEndChar) {
case '\n':
case '\r':
unterminated = true;
break;
case 0x2028: // <LS>
case 0x2029: // <PS>
// Line/Paragraph separators need to be included as is
c = lineEndChar;
break;
default:
break;
}
}
if (unterminated) {
// Push the terminator back, close the token span here and report the error.
ungetCharIgnoreLineEnd(c);
tokenEnd = cursor;
parser.addError("msg.unterminated.string.lit");
return Token.ERROR;
}
if (c == '\\') {
// We've hit an escaped character
int escapeVal;
c = getChar();
switch (c) {
case 'b':
c = '\b';
break;
case 'f':
c = '\f';
break;
case 'n':
c = '\n';
break;
case 'r':
c = '\r';
break;
case 't':
c = '\t';
break;
// \v a late addition to the ECMA spec,
// it is not in Java, so use 0xb
case 'v':
c = 0xb;
break;
case 'u':
// Get 4 hex digits; if the u escape is not
// followed by 4 hex digits, use 'u' + the
// literal character sequence that follows.
// The 'u' and the digits are appended tentatively. On success the buffer
// is rewound to escapeStart so the decoded value replaces them; on
// failure they stay in the buffer as literal text.
int escapeStart = stringBufferTop;
addToString('u');
escapeVal = 0;
if (matchChar('{')) {
// Braced form: hex digits up to the closing '}'.
for (; ; ) {
c = getChar();
if (c == '}') {
addToString(c);
break;
}
escapeVal = Kit.xDigitToInt(c, escapeVal);
if (escapeVal < 0) {
break;
}
addToString(c);
}
if (escapeVal < 0 || escapeVal > 0x10FFFF) {
parser.reportError("msg.invalid.escape");
// Re-enter the loop test with the current c.
continue strLoop;
}
} else {
for (int i = 0; i != 4; ++i) {
c = getChar();
escapeVal = Kit.xDigitToInt(c, escapeVal);
if (escapeVal < 0) {
// A malformed escape is an error only from ES6 on; either way the
// offending character is then processed as ordinary input.
if (parser.compilerEnv.getLanguageVersion()
>= Context.VERSION_ES6) {
parser.reportError("msg.invalid.escape");
}
continue strLoop;
}
addToString(c);
}
}
// prepare for replace of stored 'u' sequence
// by escape value
stringBufferTop = escapeStart;
c = escapeVal;
break;
case 'x':
// Get 2 hex digits, defaulting to 'x'+literal
// sequence, as above.
c = getChar();
escapeVal = Kit.xDigitToInt(c, 0);
if (escapeVal < 0) {
// No hex digit at all: keep the 'x' and reprocess c.
addToString('x');
continue strLoop;
}
// Remember the first digit in case the second one is not a hex digit.
int c1 = c;
c = getChar();
escapeVal = Kit.xDigitToInt(c, escapeVal);
if (escapeVal < 0) {
// Only one hex digit: keep 'x' plus that digit and reprocess c.
addToString('x');
addToString(c1);
continue strLoop;
}
// got 2 hex digits
c = escapeVal;
break;
case '\n':
// Remove line terminator after escape to follow
// SpiderMonkey and C/C++
c = getChar();
continue strLoop;
default:
// Octal escape of up to three digits. Any other escaped character
// leaves c unchanged, so it is appended below without the backslash.
if ('0' <= c && c < '8') {
int val = c - '0';
c = getChar();
if ('0' <= c && c < '8') {
val = 8 * val + c - '0';
c = getChar();
if ('0' <= c && c < '8' && val <= 037) {
// c is 3rd char of octal sequence only
// if the resulting val <= 0377
val = 8 * val + c - '0';
c = getChar();
}
}
// Push back the first character after the octal digits.
ungetChar(c);
c = val;
}
}
}
// Append the (possibly decoded) character and read the next one.
addToString(c);
c = getChar(false);
}
String str = getStringFromBuffer();
this.string = internString(str);
// NOTE(review): cursor is reset from sourceCursor before closing the token span;
// presumably this resynchronises it after the getChar(false) reads - confirm.
cursor = sourceCursor;
tokenEnd = cursor;
return Token.STRING;
}
if (c == '#'
&& cursor == 1
&& peekChar() == '!'
&& !this.parser.calledByCompileFunction) {
// #! hashbang: only on the first line of a Script, no leading whitespace
skipLine();
return Token.COMMENT;
}
// Everything else: punctuators, operators and comments.
switch (c) {
case ';':
return Token.SEMI;
case '[':
return Token.LB;
case ']':
return Token.RB;
case '{':
return Token.LC;
case '}':
return Token.RC;
case '(':
return Token.LP;
case ')':
return Token.RP;
case ',':
return Token.COMMA;
case '?':
// From ES6 on '?' may also start "?.", "??" or "??=".
if (parser.compilerEnv.getLanguageVersion() >= Context.VERSION_ES6) {
if (peekChar() == '.') {
// ?.digit is to be treated as ? .num
getChar();
if (!isDigit(peekChar())) {
return Token.QUESTION_DOT;
}
// A digit follows: put the '.' back and return a plain HOOK below.
ungetChar('.');
} else if (matchChar('?')) {
if (matchChar('=')) {
return Token.ASSIGN_NULLISH;
}
return Token.NULLISH_COALESCING;
}
}
return Token.HOOK;
case ':':
if (matchChar(':')) {
return Token.COLONCOLON;
}
return Token.COLON;
case '.':
// "..", "..." (language version 1.8 and later only), ".(" or a single '.'.
if (matchChar('.')) {
if (parser.compilerEnv.getLanguageVersion() >= Context.VERSION_1_8
&& matchChar('.')) {
return Token.DOTDOTDOT;
}
return Token.DOTDOT;
} else if (matchChar('(')) {
return Token.DOTQUERY;
} else {
return Token.DOT;
}
case '|':
if (matchChar('|')) {
if (matchChar('=')) return Token.ASSIGN_LOGICAL_OR;
else return Token.OR;
} else if (matchChar('=')) {
return Token.ASSIGN_BITOR;
} else {
return Token.BITOR;
}
case '^':
if (matchChar('=')) {
return Token.ASSIGN_BITXOR;
}
return Token.BITXOR;
case '&':
if (matchChar('&')) {
if (matchChar('=')) return Token.ASSIGN_LOGICAL_AND;
else return Token.AND;
} else if (matchChar('=')) {
return Token.ASSIGN_BITAND;
} else {
return Token.BITAND;
}
case '=':
if (matchChar('=')) {
if (matchChar('=')) {
return Token.SHEQ;
}
return Token.EQ;
} else if (matchChar('>')) {
return Token.ARROW;
} else {
return Token.ASSIGN;
}
case '!':
if (matchChar('=')) {
if (matchChar('=')) {
return Token.SHNE;
}
return Token.NE;
}
return Token.NOT;
case '<':
/* NB:treat HTML begin-comment as comment-till-eol */
if (matchChar('!')) {
if (matchChar('-')) {
if (matchChar('-')) {
// The four characters of the begin-comment marker have been consumed, so
// the token starts four positions back.
tokenStartLastLineEnd = lastLineEnd;
tokenStartLineno = lineno;
tokenBeg = cursor - 4;
skipLine();
commentType = Token.CommentType.HTML;
return Token.COMMENT;
}
// Not a begin-comment: push the consumed characters back, last one first.
ungetCharIgnoreLineEnd('-');
}
ungetCharIgnoreLineEnd('!');
}
if (matchChar('<')) {
if (matchChar('=')) {
return Token.ASSIGN_LSH;
}
return Token.LSH;
}
if (matchChar('=')) {
return Token.LE;
}
return Token.LT;
case '>':
if (matchChar('>')) {
if (matchChar('>')) {
if (matchChar('=')) {
return Token.ASSIGN_URSH;
}
return Token.URSH;
}
if (matchChar('=')) {
return Token.ASSIGN_RSH;
}
return Token.RSH;
}
if (matchChar('=')) {
return Token.GE;
}
return Token.GT;
case '*':
// The "**" and "**=" operators are only recognised from ES6 on.
if (parser.compilerEnv.getLanguageVersion() >= Context.VERSION_ES6) {
if (matchChar('*')) {
if (matchChar('=')) {
return Token.ASSIGN_EXP;
}
return Token.EXP;
}
}
if (matchChar('=')) {
return Token.ASSIGN_MUL;
}
return Token.MUL;
case '/':
markCommentStart();
// is it a // comment?
if (matchChar('/')) {
tokenStartLastLineEnd = lastLineEnd;
tokenStartLineno = lineno;
tokenBeg = cursor - 2;
skipLine();
commentType = Token.CommentType.LINE;
return Token.COMMENT;
}
// is it a /* or /** comment?
if (matchChar('*')) {
// lookForSlash is true while the previous character was '*', i.e. a '/'
// right now would close the comment.
boolean lookForSlash = false;
tokenStartLastLineEnd = lastLineEnd;
tokenStartLineno = lineno;
tokenBeg = cursor - 2;
if (matchChar('*')) {
// The second '*' may itself be followed directly by '/', closing the
// comment immediately.
lookForSlash = true;
commentType = Token.CommentType.JSDOC;
} else {
commentType = Token.CommentType.BLOCK_COMMENT;
}
for (; ; ) {
c = getChar();
if (c == EOF_CHAR) {
// Unterminated comment: report it but still return a COMMENT token.
tokenEnd = cursor - 1;
parser.addError("msg.unterminated.comment");
return Token.COMMENT;
} else if (c == '*') {
lookForSlash = true;
} else if (c == '/') {
if (lookForSlash) {
cursor = sourceCursor;
tokenEnd = cursor;
return Token.COMMENT;
}
} else {
lookForSlash = false;
tokenEnd = cursor;
}
}
}
if (matchChar('=')) {
return Token.ASSIGN_DIV;
}
return Token.DIV;
case '%':
if (matchChar('=')) {
return Token.ASSIGN_MOD;
}
return Token.MOD;
case '~':
return Token.BITNOT;
case '+':
if (matchChar('=')) {
return Token.ASSIGN_ADD;
} else if (matchChar('+')) {
return Token.INC;
} else {
return Token.ADD;
}
case '-':
if (matchChar('=')) {
c = Token.ASSIGN_SUB;
} else if (matchChar('-')) {
// dirtyLine is still false here only if nothing but whitespace (and this
// '-') has been seen on the current line.
if (!dirtyLine) {
// treat HTML end-comment after possible whitespace
// after line start as comment-until-eol
if (matchChar('>')) {
markCommentStart("--");
skipLine();
commentType = Token.CommentType.HTML;
return Token.COMMENT;
}
}
c = Token.DEC;
} else {
c = Token.SUB;
}
// The whitespace loop deliberately left the line clean for '-'; mark it now.
dirtyLine = true;
return c;
case '`':
// Only the opening backtick is consumed here; the template body is
// presumably scanned by a separate routine - confirm with the caller.
return Token.TEMPLATE_LITERAL;
default:
parser.addError("msg.illegal.character", c);
return Token.ERROR;
}
}
}