lib/Parser/JSLexer.cpp (1,814 lines of code) (raw):
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
#include "hermes/Parser/JSLexer.h"
#include "dtoa/dtoa.h"
#include "hermes/Support/Conversions.h"
#include "llvh/ADT/ScopeExit.h"
#include "llvh/ADT/StringSwitch.h"
using llvh::Twine;
namespace hermes {
namespace parser {
namespace {
// Table of the string associated with each token kind. The entries are
// generated by expanding TOK(name, str) for every entry of TokenKinds.def, so
// the table order follows that file. tokenKindStr() indexes it with the numeric
// value of a TokenKind.
const char *g_tokenStr[] = {
#define TOK(name, str) str,
#include "hermes/Parser/TokenKinds.def"
};
// First byte (0xe2) shared by the UTF-8 encodings of the line separator U+2028
// (e2 80 a8) and the paragraph separator U+2029 (e2 80 a9). Used as a switch
// label; the remaining two bytes are checked separately.
const int UTF8_LINE_TERMINATOR_CHAR0 = 0xe2;
/// Check the second and third byte of a three byte UTF-8 sequence for a
/// Unicode line terminator. The caller has already matched the first byte.
///   Line separator      U+2028 is encoded as e2 80 a8
///   Paragraph separator U+2029 is encoded as e2 80 a9
/// \return true if bytes [1] and [2] of \p curCharPtr_ are "80 a8" or "80 a9".
inline bool matchUnicodeLineTerminatorOffset1(const char *curCharPtr_) {
  const auto *bytes = reinterpret_cast<const unsigned char *>(curCharPtr_);
  // Only look at the third byte once the second one has matched.
  if (bytes[1] != 0x80)
    return false;
  return bytes[2] == 0xa8 || bytes[2] == 0xa9;
}
} // namespace
/// \return the entry of the token string table that corresponds to \p kind.
/// \p kind must not exceed TokenKind::_last_token.
const char *tokenKindStr(TokenKind kind) {
  assert(kind <= TokenKind::_last_token);
  const auto index = static_cast<unsigned>(kind);
  return g_tokenStr[index];
}
#if HERMES_PARSE_JSX
/// Build the lookup table from HTML entity name to its numeric value by
/// expanding HTML_ENTITY(NAME, VALUE) for every entry of HTMLEntities.def.
static llvh::DenseMap<StringRef, uint32_t> initializeHTMLEntities() {
  llvh::DenseMap<StringRef, uint32_t> table;
#define HTML_ENTITY(NAME, VALUE) \
  table.insert({llvh::StringLiteral(#NAME), VALUE});
#include "hermes/Parser/HTMLEntities.def"
  return table;
}
/// \return a reference to the HTML entity table, which is built once on the
/// first call and reused afterwards.
static const llvh::DenseMap<StringRef, uint32_t> &getHTMLEntities() {
  static const llvh::DenseMap<StringRef, uint32_t> kEntityTable =
      initializeHTMLEntities();
  return kEntityTable;
}
#endif
// Construct a lexer over the source buffer that \p sm returns for \p bufId.
// If \p strTab is null, a new StringTable is allocated from \p allocator and
// used; otherwise the supplied table is used.
JSLexer::JSLexer(
uint32_t bufId,
SourceErrorManager &sm,
Allocator &allocator,
StringTable *strTab,
bool strictMode,
bool convertSurrogates)
: sm_(sm),
allocator_(allocator),
// Only allocate our own table when the caller did not supply one.
ownStrTab_(strTab ? nullptr : new StringTable(allocator_)),
// Refer to the caller's table if present, else to the one allocated above.
strTab_(strTab ? *strTab : *ownStrTab_),
#if HERMES_PARSE_JSX
htmlEntities_(getHTMLEntities()),
#endif
strictMode_(strictMode),
convertSurrogates_(convertSurrogates) {
// Set up the buffer pointers, then register the reserved words.
initializeWithBufferId(bufId);
initializeReservedIdentifiers();
}
// Construct a lexer over \p input. The buffer is first handed to \p sm via
// addNewSourceBuffer(), and the buffer id it returns is used for lexing.
// If \p strTab is null, a new StringTable is allocated from \p allocator and
// used; otherwise the supplied table is used.
JSLexer::JSLexer(
std::unique_ptr<llvh::MemoryBuffer> input,
SourceErrorManager &sm,
Allocator &allocator,
StringTable *strTab,
bool strictMode,
bool convertSurrogates)
: sm_(sm),
allocator_(allocator),
// Only allocate our own table when the caller did not supply one.
ownStrTab_(strTab ? nullptr : new StringTable(allocator_)),
// Refer to the caller's table if present, else to the one allocated above.
strTab_(strTab ? *strTab : *ownStrTab_),
#if HERMES_PARSE_JSX
htmlEntities_(getHTMLEntities()),
#endif
strictMode_(strictMode),
convertSurrogates_(convertSurrogates) {
// Pass the buffer to the source manager and lex it through the returned id.
auto bufId = sm_.addNewSourceBuffer(std::move(input));
initializeWithBufferId(bufId);
initializeReservedIdentifiers();
}
/// Point the lexer at the source buffer that the source manager returns for
/// \p bufId and place the cursor on the first byte of that buffer.
void JSLexer::initializeWithBufferId(uint32_t bufId) {
  bufId_ = bufId;

  auto *srcBuffer = sm_.getSourceBuffer(bufId);
  bufferStart_ = srcBuffer->getBufferStart();
  bufferEnd_ = srcBuffer->getBufferEnd();
  // The scanning code detects the end of input by finding a 0 byte located at
  // bufferEnd_, so the buffer has to be terminated that way.
  assert(*bufferEnd_ == 0 && "buffer must be zero terminated");

  curCharPtr_ = bufferStart_;
}
// For every RESWORD(name) entry of TokenKinds.def, obtain the identifier for
// the spelling "name" via getIdentifier() and store it in the slot returned by
// resWordIdent(TokenKind::rw_name).
void JSLexer::initializeReservedIdentifiers() {
// Add all reserved words to the identifier table
#define RESWORD(name) resWordIdent(TokenKind::rw_##name) = getIdentifier(#name);
#include "hermes/Parser/TokenKinds.def"
}
/// Scan the next token starting at the cursor and store it in token_.
///
/// White space, line terminators and comments are skipped; every line
/// terminator that is skipped sets newLineBeforeCurrentToken_ (the flag is
/// cleared on entry). Unrecognized characters are reported and skipped.
///
/// \param grammarContext selects context dependent tokenization:
///   - Type (only when HERMES_PARSE_FLOW): recognizes "{|", "|}" and
///     "%checks", always scans '<' and '>' as single character punctuators,
///     allows '@' to start an identifier. "??" is never combined in this
///     context (regardless of HERMES_PARSE_FLOW).
///   - AllowRegExp: a '/' that does not start a comment begins a regular
///     expression literal instead of a "/" or "/=" punctuator.
///   - AllowJSXIdentifier (only when HERMES_PARSE_JSX): '>' is always scanned
///     as a single character punctuator.
/// \return a pointer to the lexer's current token (token_).
const Token *JSLexer::advance(GrammarContext grammarContext) {
  newLineBeforeCurrentToken_ = false;

  for (;;) {
    assert(curCharPtr_ <= bufferEnd_ && "lexing past end of input");

// The PUNC_* helpers expand to a complete "case" of the switch below. They are
// not #undef'd here and remain defined for the code that follows.

// A punctuator consisting of exactly one character.
#define PUNC_L1_1(ch, tok)        \
  case ch:                        \
    token_.setStart(curCharPtr_); \
    token_.setPunctuator(tok);    \
    ++curCharPtr_;                \
    break

// A one character punctuator with two possible two character extensions.
#define PUNC_L2_3(ch1, tok1, ch2a, tok2a, ch2b, tok2b) \
  case ch1:                                            \
    token_.setStart(curCharPtr_);                      \
    if (curCharPtr_[1] == ch2a) {                      \
      token_.setPunctuator(tok2a);                     \
      curCharPtr_ += 2;                                \
    } else if (curCharPtr_[1] == ch2b) {               \
      token_.setPunctuator(tok2b);                     \
      curCharPtr_ += 2;                                \
    } else {                                           \
      token_.setPunctuator(tok1);                      \
      curCharPtr_ += 1;                                \
    }                                                  \
    break

// A one character punctuator with one possible two character extension.
#define PUNC_L2_2(ch1, tok1, ch2, tok2) \
  case ch1:                             \
    token_.setStart(curCharPtr_);       \
    if (curCharPtr_[1] == (ch2)) {      \
      token_.setPunctuator(tok2);       \
      curCharPtr_ += 2;                 \
    } else {                            \
      token_.setPunctuator(tok1);       \
      curCharPtr_ += 1;                 \
    }                                   \
    break

// A punctuator that can be one, two or three characters long.
#define PUNC_L3_3(ch1, tok1, ch2, tok2, ch3, tok3) \
  case ch1:                                        \
    token_.setStart(curCharPtr_);                  \
    if (curCharPtr_[1] != (ch2)) {                 \
      token_.setPunctuator(tok1);                  \
      curCharPtr_ += 1;                            \
    } else if (curCharPtr_[2] == (ch3)) {          \
      token_.setPunctuator(tok3);                  \
      curCharPtr_ += 3;                            \
    } else {                                       \
      token_.setPunctuator(tok2);                  \
      curCharPtr_ += 2;                            \
    }                                              \
    break

    switch ((unsigned char)*curCharPtr_) {
      case 0:
        token_.setStart(curCharPtr_);
        if (curCharPtr_ == bufferEnd_) {
          // The terminating 0 of the buffer: genuine end of input.
          token_.setEof();
        } else {
          // A 0 byte embedded in the source.
          if (!error(
                  token_.getStartLoc(),
                  "unrecognized Unicode character \\u0000")) {
            // error() returned false: stop scanning and produce EOF.
            token_.setEof();
          } else {
            ++curCharPtr_;
            continue;
          }
        }
        break;

        // clang-format off
      PUNC_L1_1('}', TokenKind::r_brace);
      PUNC_L1_1('(', TokenKind::l_paren);
      PUNC_L1_1(')', TokenKind::r_paren);
      PUNC_L1_1('[', TokenKind::l_square);
      PUNC_L1_1(']', TokenKind::r_square);
      PUNC_L1_1(';', TokenKind::semi);
      PUNC_L1_1(',', TokenKind::comma);
      PUNC_L1_1('~', TokenKind::tilde);
      PUNC_L1_1(':', TokenKind::colon);

      // { {|
      case '{':
        token_.setStart(curCharPtr_);
        if (HERMES_PARSE_FLOW &&
            LLVM_UNLIKELY(grammarContext == GrammarContext::Type) &&
            curCharPtr_[1] == '|') {
          token_.setPunctuator(TokenKind::l_bracepipe);
          curCharPtr_ += 2;
        } else {
          token_.setPunctuator(TokenKind::l_brace);
          curCharPtr_ += 1;
        }
        break;

      // = => == ===
      case '=':
        token_.setStart(curCharPtr_);
        if (curCharPtr_[1] == '>') {
          token_.setPunctuator(TokenKind::equalgreater);
          curCharPtr_ += 2;
        } else if (curCharPtr_[1] != '=') {
          token_.setPunctuator(TokenKind::equal);
          curCharPtr_ += 1;
        } else if (curCharPtr_[2] == '=') {
          token_.setPunctuator(TokenKind::equalequalequal);
          curCharPtr_ += 3;
        } else {
          token_.setPunctuator(TokenKind::equalequal);
          curCharPtr_ += 2;
        }
        break;

      // ! != !==
      PUNC_L3_3('!', TokenKind::exclaim, '=', TokenKind::exclaimequal, '=', TokenKind::exclaimequalequal);

      // + ++ +=
      // - -- -=
      PUNC_L2_3('+', TokenKind::plus, '+', TokenKind::plusplus, '=', TokenKind::plusequal);
      PUNC_L2_3('-', TokenKind::minus, '-', TokenKind::minusminus, '=', TokenKind::minusequal);

      // & && &= &&=
      case '&':
        token_.setStart(curCharPtr_);
        if (curCharPtr_[1] == '&') {
          if (curCharPtr_[2] == '=') {
            token_.setPunctuator(TokenKind::ampampequal);
            curCharPtr_ += 3;
          } else {
            token_.setPunctuator(TokenKind::ampamp);
            curCharPtr_ += 2;
          }
        } else if (curCharPtr_[1] == '=') {
          token_.setPunctuator(TokenKind::ampequal);
          curCharPtr_ += 2;
        } else {
          token_.setPunctuator(TokenKind::amp);
          curCharPtr_ += 1;
        }
        break;

      // | || |= ||= |}
      case '|':
        token_.setStart(curCharPtr_);
        if (HERMES_PARSE_FLOW &&
            LLVM_UNLIKELY(grammarContext == GrammarContext::Type) &&
            curCharPtr_[1] == '}') {
          token_.setPunctuator(TokenKind::piper_brace);
          curCharPtr_ += 2;
        } else {
          if (curCharPtr_[1] == '|') {
            if (curCharPtr_[2] == '=') {
              token_.setPunctuator(TokenKind::pipepipeequal);
              curCharPtr_ += 3;
            } else {
              token_.setPunctuator(TokenKind::pipepipe);
              curCharPtr_ += 2;
            }
          } else if (curCharPtr_[1] == '=') {
            token_.setPunctuator(TokenKind::pipeequal);
            curCharPtr_ += 2;
          } else {
            token_.setPunctuator(TokenKind::pipe);
            curCharPtr_ += 1;
          }
        }
        break;

      // ? ?? ?. ??=
      case '?':
        token_.setStart(curCharPtr_);
        // The digit test is an explicit range check rather than isdigit():
        // the byte may be a (negative) UTF-8 byte, for which isdigit() has
        // undefined behavior.
        if (curCharPtr_[1] == '.' &&
            !(curCharPtr_[2] >= '0' && curCharPtr_[2] <= '9')) {
          // OptionalChainingPunctuator ::
          // ?. [lookahead does not contain DecimalDigit]
          // This is done to prevent `x?.3:y` from being recognized
          // as `x ?. 3 : y` instead of `x ? .3 : y`.
          token_.setPunctuator(TokenKind::questiondot);
          curCharPtr_ += 2;
        } else if (
            curCharPtr_[1] == '?' &&
            LLVM_LIKELY(grammarContext != GrammarContext::Type)) {
          if (curCharPtr_[2] == '=') {
            token_.setPunctuator(TokenKind::questionquestionequal);
            curCharPtr_ += 3;
          } else {
            token_.setPunctuator(TokenKind::questionquestion);
            curCharPtr_ += 2;
          }
        } else {
          token_.setPunctuator(TokenKind::question);
          curCharPtr_ += 1;
        }
        break;

      // * *= ** **=
      case '*':
        token_.setStart(curCharPtr_);
        if (curCharPtr_[1] == '=') {
          token_.setPunctuator(TokenKind::starequal);
          curCharPtr_ += 2;
        } else if (curCharPtr_[1] != '*') {
          token_.setPunctuator(TokenKind::star);
          curCharPtr_ += 1;
        } else if (curCharPtr_[2] == '=') {
          token_.setPunctuator(TokenKind::starstarequal);
          curCharPtr_ += 3;
        } else {
          token_.setPunctuator(TokenKind::starstar);
          curCharPtr_ += 2;
        }
        break;

      // ^ ^=
      PUNC_L2_2('^', TokenKind::caret, '=', TokenKind::caretequal);

      // % %= %checks
      case '%':
        token_.setStart(curCharPtr_);
        if (HERMES_PARSE_FLOW &&
            LLVM_UNLIKELY(grammarContext == GrammarContext::Type) &&
            curCharPtr_ + 7 <= bufferEnd_ &&
            llvh::StringRef(curCharPtr_, 7) == "%checks") {
          token_.setIdentifier(getStringLiteral("%checks"));
          curCharPtr_ += 7;
        } else if (curCharPtr_[1] == ('=')) {
          token_.setPunctuator(TokenKind::percentequal);
          curCharPtr_ += 2;
        } else {
          token_.setPunctuator(TokenKind::percent);
          curCharPtr_ += 1;
        }
        break;
        // clang-format on

      case '\r':
      case '\n':
        ++curCharPtr_;
        newLineBeforeCurrentToken_ = true;
        continue;

      // Line separator \u2028 UTF8 encoded is      : e2 80 a8
      // Paragraph separator \u2029 UTF8 encoded is : e2 80 a9
      case UTF8_LINE_TERMINATOR_CHAR0:
        if (matchUnicodeLineTerminatorOffset1(curCharPtr_)) {
          curCharPtr_ += 3;
          newLineBeforeCurrentToken_ = true;
          continue;
        } else {
          goto default_label;
        }

      case '\v':
      case '\f':
        ++curCharPtr_;
        continue;

      case '\t':
      case ' ':
        // Spaces frequently come in groups, so use a tight inner loop to skip.
        do
          ++curCharPtr_;
        while (*curCharPtr_ == '\t' || *curCharPtr_ == ' ');
        continue;

      // No-break space \u00A0 is UTF8 encoded as: c2 a0
      case 0xc2:
        if ((unsigned char)curCharPtr_[1] == 0xa0) {
          curCharPtr_ += 2;
          continue;
        } else {
          goto default_label;
        }

      // Byte-order mark \uFEFF is encoded as: ef bb bf
      case 0xef:
        if ((unsigned char)curCharPtr_[1] == 0xbb &&
            (unsigned char)curCharPtr_[2] == 0xbf) {
          curCharPtr_ += 3;
          continue;
        } else {
          goto default_label;
        }

      // / /= comments and regular expressions
      case '/':
        if (curCharPtr_[1] == '/') { // Line comment?
          scanLineComment(curCharPtr_);
          continue;
        } else if (curCharPtr_[1] == '*') { // Block comment?
          curCharPtr_ = skipBlockComment(curCharPtr_);
          continue;
        } else {
          token_.setStart(curCharPtr_);
          if (grammarContext == AllowRegExp) {
            scanRegExp();
          } else if (curCharPtr_[1] == '=') {
            token_.setPunctuator(TokenKind::slashequal);
            curCharPtr_ += 2;
          } else {
            token_.setPunctuator(TokenKind::slash);
            curCharPtr_ += 1;
          }
        }
        break;

      case '#':
        if (LLVM_UNLIKELY(
                curCharPtr_ == bufferStart_ && curCharPtr_[1] == '!')) {
          // #! (hashbang) at the very start of the buffer.
          scanLineComment(curCharPtr_);
          continue;
        }
        // A false result means no token was produced: keep scanning.
        if (!scanPrivateIdentifier()) {
          continue;
        }
        break;

      // < <= << <<=
      case '<':
        token_.setStart(curCharPtr_);
        if (HERMES_PARSE_FLOW &&
            LLVM_UNLIKELY(grammarContext == JSLexer::GrammarContext::Type)) {
          token_.setPunctuator(TokenKind::less);
          curCharPtr_ += 1;
        } else if (curCharPtr_[1] == '=') {
          token_.setPunctuator(TokenKind::lessequal);
          curCharPtr_ += 2;
        } else if (curCharPtr_[1] == '<') {
          if (curCharPtr_[2] == '=') {
            token_.setPunctuator(TokenKind::lesslessequal);
            curCharPtr_ += 3;
          } else {
            token_.setPunctuator(TokenKind::lessless);
            curCharPtr_ += 2;
          }
        } else {
          token_.setPunctuator(TokenKind::less);
          curCharPtr_ += 1;
        }
        break;

      // > >= >> >>> >>= >>>=
      case '>':
        token_.setStart(curCharPtr_);
        if ((HERMES_PARSE_FLOW &&
             LLVM_UNLIKELY(grammarContext == JSLexer::GrammarContext::Type)) ||
            (HERMES_PARSE_JSX &&
             LLVM_UNLIKELY(
                 grammarContext ==
                 JSLexer::GrammarContext::AllowJSXIdentifier))) {
          token_.setPunctuator(TokenKind::greater);
          curCharPtr_ += 1;
        } else if (curCharPtr_[1] == '=') { // >=
          token_.setPunctuator(TokenKind::greaterequal);
          curCharPtr_ += 2;
        } else if (curCharPtr_[1] == '>') { // >>
          if (curCharPtr_[2] == '=') { // >>=
            token_.setPunctuator(TokenKind::greatergreaterequal);
            curCharPtr_ += 3;
          } else if (curCharPtr_[2] == '>') { // >>>
            if (curCharPtr_[3] == '=') { // >>>=
              token_.setPunctuator(TokenKind::greatergreatergreaterequal);
              curCharPtr_ += 4;
            } else {
              token_.setPunctuator(TokenKind::greatergreatergreater);
              curCharPtr_ += 3;
            }
          } else {
            token_.setPunctuator(TokenKind::greatergreater);
            curCharPtr_ += 2;
          }
        } else {
          token_.setPunctuator(TokenKind::greater);
          curCharPtr_ += 1;
        }
        break;

      // . ... and numbers starting with a decimal point
      case '.':
        token_.setStart(curCharPtr_);
        if (curCharPtr_[1] >= '0' && curCharPtr_[1] <= '9') {
          scanNumber(grammarContext);
        } else if (curCharPtr_[1] == '.' && curCharPtr_[2] == '.') {
          token_.setPunctuator(TokenKind::dotdotdot);
          curCharPtr_ += 3;
        } else {
          token_.setPunctuator(TokenKind::period);
          ++curCharPtr_;
        }
        break;

      // clang-format off
      case '0': case '1': case '2': case '3': case '4':
      case '5': case '6': case '7': case '8': case '9':
        // clang-format on
        token_.setStart(curCharPtr_);
        scanNumber(grammarContext);
        break;

      // clang-format off
      case '_': case '$':
      case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
      case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
      case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
      case 'v': case 'w': case 'x': case 'y': case 'z':
      case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
      case 'H': case 'I': case 'J': case 'K': case 'L': case 'M': case 'N':
      case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
      case 'V': case 'W': case 'X': case 'Y': case 'Z':
        // clang-format on
        token_.setStart(curCharPtr_);
        scanIdentifierFastPathInContext(curCharPtr_, grammarContext);
        break;

      case '@':
        token_.setStart(curCharPtr_);
        if (HERMES_PARSE_FLOW &&
            LLVM_UNLIKELY(grammarContext == GrammarContext::Type)) {
          scanIdentifierFastPathInContext(curCharPtr_, grammarContext);
        } else {
          curCharPtr_ += 1;
          errorRange(token_.getStartLoc(), "unrecognized character '@'");
          continue;
        }
        break;

      // An identifier that starts with a Unicode escape.
      case '\\': {
        token_.setStart(curCharPtr_);
        tmpStorage_.clear();
        uint32_t cp = consumeUnicodeEscape();
        if (!isUnicodeIdentifierStart(cp)) {
          errorRange(
              token_.getStartLoc(),
              "Unicode escape \\u" + Twine::utohexstr(cp) +
                  " is not a valid identifier start");
          continue;
        } else {
          appendUnicodeToStorage(cp);
        }
        scanIdentifierPartsInContext(grammarContext);
        break;
      }

      case '\'':
      case '"':
        token_.setStart(curCharPtr_);
        scanStringInContext(grammarContext);
        break;

      case '`':
        token_.setStart(curCharPtr_);
        scanTemplateLiteral();
        break;

      // Everything else: decode one character and classify it as a Unicode
      // letter (identifier), Unicode space (skipped) or an error.
      default_label:
      default: {
        token_.setStart(curCharPtr_);
        uint32_t ch = decodeUTF8();

        if (isUnicodeOnlyLetter(ch)) {
          tmpStorage_.clear();
          appendUnicodeToStorage(ch);
          scanIdentifierPartsInContext(grammarContext);
        } else if (isUnicodeOnlySpace(ch)) {
          continue;
        } else {
          // Print the character itself only when it is printable ASCII.
          if (ch > 31 && ch < 127)
            errorRange(
                token_.getStartLoc(),
                "unrecognized character '" + Twine((char)ch) + "'");
          else
            errorRange(
                token_.getStartLoc(),
                "unrecognized Unicode character \\u" + Twine::utohexstr(ch));
          continue;
        }
        break;
      }
    }

    // Always terminate the loop unless "continue" was used.
    break;
  } // for(;;)

  finishToken(curCharPtr_);
  return &token_;
}
#if HERMES_PARSE_JSX
/// Scan the next token in JSX child position. This is one of:
///   - the punctuator '{' or '<',
///   - EOF, when the cursor is on the terminating 0 of the buffer,
///   - a JSXText token covering everything up to the next '{', '<' or end of
///     input. Its cooked value has recognized HTML entities replaced by the
///     code point they denote; its raw value keeps the entity text.
/// \return a pointer to the lexer's current token (token_).
const Token *JSLexer::advanceInJSXChild() {
  token_.setStart(curCharPtr_);
  assert(curCharPtr_ <= bufferEnd_ && "lexing past end of input");

  const char first = *curCharPtr_;
  if (first == '{') {
    token_.setStart(curCharPtr_);
    token_.setPunctuator(TokenKind::l_brace);
    ++curCharPtr_;
  } else if (first == '<') {
    token_.setStart(curCharPtr_);
    token_.setPunctuator(TokenKind::less);
    ++curCharPtr_;
  } else if (first == 0 && curCharPtr_ == bufferEnd_) {
    token_.setEof();
  } else {
    // JSX text. A 0 byte that is not the end of the buffer is treated as text.
    token_.setStart(curCharPtr_);

    // Build up cooked value using XHTML entities
    tmpStorage_.clear();
    rawStorage_.clear();

    for (;;) {
      const char c = *curCharPtr_;

      if (LLVM_UNLIKELY(isUTF8Start(c))) {
        // Decoding advances the cursor past the whole sequence.
        uint32_t decoded = _decodeUTF8SlowPath(curCharPtr_);
        appendUnicodeToStorage(decoded);
        appendUnicodeToStorage(decoded, rawStorage_);
        continue;
      }

      if (c == '&') {
        const char *entityStart = curCharPtr_;
        auto entityValue = consumeHTMLEntityOptional();
        if (entityValue.hasValue()) {
          // Cooked text gets the code point, raw text keeps the source bytes.
          appendUnicodeToStorage(*entityValue);
          rawStorage_.append(
              {entityStart, (size_t)(curCharPtr_ - entityStart)});
          continue;
        }
        // Not an entity: the '&' is copied verbatim below.
      } else if (
          (c == 0 && curCharPtr_ == bufferEnd_) || c == '{' || c == '<') {
        token_.setJSXText(
            getStringLiteral(tmpStorage_.str()),
            getStringLiteral(rawStorage_.str()));
        break;
      }

      tmpStorage_.push_back(c);
      rawStorage_.push_back(c);
      ++curCharPtr_;
    }
  }

  finishToken(curCharPtr_);
  return &token_;
}
/// Try to consume an HTML entity at the cursor, which must be on a '&'.
/// Three forms are recognized:
///   &#xHEX;    non-empty sequence of hex digits
///   &#NUMBER;  non-empty sequence of decimal digits
///   &NAME;     a name present in htmlEntities_
/// Numeric values larger than UNICODE_MAX_VALUE are rejected.
/// \return the code point denoted by the entity, with the cursor moved past
///   the terminating ';'. If no entity is recognized, the cursor is restored
///   to the '&' and llvh::None is returned. No error is reported.
llvh::Optional<uint32_t> JSLexer::consumeHTMLEntityOptional() {
  assert(*curCharPtr_ == '&');
  const char *start = curCharPtr_;

  // ASCII decimal digit test. This deliberately avoids isdigit(): the input
  // may contain UTF-8 bytes, which are negative when char is signed, and
  // passing a negative value other than EOF to isdigit() is undefined behavior.
  const auto isDecimalDigit = [](char c) { return c >= '0' && c <= '9'; };

  if (curCharPtr_[1] == '#') {
    if (curCharPtr_[2] == 'x') {
      // HTML entity with form &#xHEX;
      curCharPtr_ += 3;
      const char *numberStart = curCharPtr_;
      uint32_t codePoint = 0;
      char ch = *curCharPtr_;

      // Calculate code point from non-empty sequence of hex digits followed by
      // a semicolon.
      for (;;) {
        if (ch == ';' && curCharPtr_ != numberStart) {
          curCharPtr_++;
          return codePoint;
        } else if (isDecimalDigit(ch)) {
          ch -= '0';
        } else {
          // Not a decimal digit, so it is safe to fold to lower case.
          ch |= 32;
          if (ch >= 'a' && ch <= 'f') {
            ch -= 'a' - 10;
          } else {
            break;
          }
        }

        // Check that this number is representable as a code point
        codePoint = (codePoint << 4) + ch;
        if (codePoint > UNICODE_MAX_VALUE) {
          break;
        }

        ++curCharPtr_;
        ch = *curCharPtr_;
      }
    } else {
      // HTML entity with form &#NUMBER;
      curCharPtr_ += 2;
      const char *numberStart = curCharPtr_;
      uint32_t codePoint = 0;
      char ch = *curCharPtr_;

      // Calculate code point from non-empty sequence of decimal digits followed
      // by a semicolon.
      for (;;) {
        if (ch == ';' && curCharPtr_ != numberStart) {
          curCharPtr_++;
          return codePoint;
        } else if (isDecimalDigit(ch)) {
          // Check that this number is representable as a code point
          codePoint = codePoint * 10 + (ch - '0');
          if (codePoint > UNICODE_MAX_VALUE) {
            break;
          }
        } else {
          break;
        }

        ++curCharPtr_;
        ch = *curCharPtr_;
      }
    }
  } else {
    // HTML entity with form &NAME;
    ++curCharPtr_;

    // Gather HTML entity name and lookup name in table. HTML entity names are
    // composed of a sequence of up to 8 alphanumeric characters followed by a
    // semicolon. To minimize backtracking due to an `&` without a following
    // semicolon we only need to look at most 9 characters ahead (8 for the
    // name, 1 for the semicolon).
    for (int i = 0; i < 9; i++) {
      char ch = *curCharPtr_;
      if (ch == ';') {
        // The i characters before the ';' form the candidate name.
        auto it = htmlEntities_.find(StringRef(curCharPtr_ - i, i));
        if (it == htmlEntities_.end()) {
          break;
        }

        curCharPtr_++;
        return it->second;
      } else if (
          ((ch | 32) >= 'a' && (ch | 32) <= 'z') || isDecimalDigit(ch)) {
        ++curCharPtr_;
      } else {
        break;
      }
    }
  }

  // Nothing was recognized: rewind so the caller sees the '&' as plain text.
  curCharPtr_ = start;
  return llvh::None;
}
#endif
/// Decide whether the current token is a directive, i.e. an escape free string
/// literal that is followed (ignoring white space and block comments) by a
/// semicolon, a right brace, a line terminator, a line comment or the end of
/// input. The scan uses a local pointer; curCharPtr_ and the current token are
/// not modified. Messages produced while skipping a block comment are
/// suppressed and comments stored during the scan are removed again.
/// \return true if the current token is a directive.
bool JSLexer::isCurrentTokenADirective() {
  // The current token must be a string literal without escapes.
  if (token_.getKind() != TokenKind::string_literal ||
      token_.getStringLiteralContainsEscapes()) {
    return false;
  }

  const char *ptr = curCharPtr_;

  // A directive is a string literal (the current token, directly behind
  // curCharPtr_), followed by a semicolon, new line, or eof that we will now
  // try to find. There can also be comments. So, we loop, consuming whitespace
  // until we encounter:
  // - EOF. Don't consume it and succeed.
  // - Semicolon. Don't consume it and succeed.
  // - Right brace. Don't consume it and succeed.
  // - A new line. Don't consume it and succeed.
  // - A line comment. It implies a new line. Don't consume it and succeed.
  // - A block comment. Consume it and continue.
  // - Anything else. We consume nothing and fail.
  for (;;) {
    assert(ptr <= bufferEnd_ && "lexing past end of input");

    switch (*((const unsigned char *)ptr)) {
      case 0:
        // EOF?
        if (ptr == bufferEnd_)
          return true;
        // We encountered a stray 0 character.
        return false;

      case ';':
      case '}':
        return true;

      case '\r':
      case '\n':
        return true;

      // Line separator \u2028 UTF8 encoded is      : e2 80 a8
      // Paragraph separator \u2029 UTF8 encoded is : e2 80 a9
      case UTF8_LINE_TERMINATOR_CHAR0:
        if (matchUnicodeLineTerminatorOffset1(ptr))
          return true;
        // Other characters share the 0xe2 lead byte, among them several
        // Unicode spaces (e.g. U+2003 is e2 80 83). Let the generic handler
        // classify the character, exactly like advance() does.
        goto default_label;

      case '\v':
      case '\f':
        // Skip whitespace.
        ++ptr;
        continue;

      case '\t':
      case ' ':
        // Spaces frequently come in groups, so use a tight inner loop to skip.
        do
          ++ptr;
        while (*ptr == '\t' || *ptr == ' ');
        continue;

      // No-break space \u00A0 is UTF8 encoded as: c2 a0
      case 0xc2:
        if ((unsigned char)ptr[1] == 0xa0) {
          ptr += 2;
          continue;
        } else {
          goto default_label;
        }

      // Byte-order mark \uFEFF is encoded as: ef bb bf
      case 0xef:
        if ((unsigned char)ptr[1] == 0xbb && (unsigned char)ptr[2] == 0xbf) {
          ptr += 3;
          continue;
        } else {
          goto default_label;
        }

      case '/':
        if (ptr[1] == '/') { // Line comment?
          // It implies a new line, so we are good.
          return true;
        } else if (ptr[1] == '*') { // Block comment?
          // This is only a peek: drop whatever skipBlockComment() stores and
          // keep its diagnostics from being reported.
          auto savedCommentStorageSize = commentStorage_.size();
          auto commentScope = llvh::make_scope_exit([&] {
            if (storeComments_)
              commentStorage_.erase(
                  commentStorage_.begin() + savedCommentStorageSize,
                  commentStorage_.end());
          });
          SourceErrorManager::SaveAndSuppressMessages suppress(&sm_);
          ptr = skipBlockComment(ptr);
          continue;
        } else {
          return false;
        }

      // Handle all other characters: if it is a unicode space, skip it.
      // Otherwise we have failed.
      default_label:
      default: {
        if (hermes::isUTF8Start(*ptr)) {
          auto peeked = _peekUTF8(ptr);
          if (isUnicodeOnlySpace(peeked.first)) {
            ptr = peeked.second;
            continue;
          }
        }
        return false;
      }
    }
  }

  // Not reachable: every path through the loop above either continues or
  // returns. Kept so that the function visibly ends in a return statement.
  return true;
}
/// Re-scan the current '}' token as the continuation of a template literal.
/// The cursor is moved back onto the '}', the stored token for it (if tokens
/// are being stored) is discarded, and scanning restarts from there with
/// scanTemplateLiteral().
/// \return a pointer to the lexer's current token (token_).
const Token *JSLexer::rescanRBraceInTemplateLiteral() {
  assert(token_.getKind() == TokenKind::r_brace && "need } to rescan");

  // Step back onto the '}' that was just consumed.
  curCharPtr_ -= 1;

  // Undo the storage for the '}'.
  if (LLVM_UNLIKELY(storeTokens_))
    tokenStorage_.pop_back();

  assert(*curCharPtr_ == '}' && "non-} was scanned as r_brace");

  token_.setStart(curCharPtr_);
  scanTemplateLiteral();
  finishToken(curCharPtr_);
  return &token_;
}
/// Peek at the token following the current one, which must be an identifier
/// or a reserved word. Messages are suppressed during the peek and comments
/// stored while peeking are removed again on exit.
/// \param expectedToken if the next token is on the same line and has this
///   kind, the lexer stays on that token instead of rewinding.
/// \return the kind of the next token, or llvh::None if a line terminator
///   precedes it. Unless \p expectedToken matched, the current token and the
///   cursor are restored to their state on entry.
OptValue<TokenKind> JSLexer::lookahead1(OptValue<TokenKind> expectedToken) {
  assert(
      (token_.getKind() == TokenKind::identifier || token_.isResWord()) &&
      "unsupported current token");

  // Snapshot of everything needed to restore the current token.
  UniqueString *const prevIdent = token_.getResWordOrIdentifier();
  const TokenKind prevKind = token_.getKind();
  const SMLoc prevStart = token_.getStartLoc();
  const SMLoc prevEnd = token_.getEndLoc();
  const char *const resumeAt = curCharPtr_;

  SourceErrorManager::SaveAndSuppressMessages suppress(&sm_);

  // Remove any comments that were stored during the lookahead
  const auto commentCountBefore = commentStorage_.size();
  auto dropLookaheadComments = llvh::make_scope_exit([&] {
    if (storeComments_)
      commentStorage_.erase(
          commentStorage_.begin() + commentCountBefore,
          commentStorage_.end());
  });

  advance();
  OptValue<TokenKind> nextKind = token_.getKind();
  const bool onNewLine = isNewLineBeforeCurrentToken();

  if (!onNewLine && expectedToken == nextKind) {
    // Do not move the cursor back.
    return nextKind;
  }
  if (onNewLine) {
    // Disregard anything after LineTerminator.
    nextKind = llvh::None;
  }

  // Put the saved token back and rewind the cursor.
  token_.setStart(prevStart.getPointer());
  token_.setEnd(prevEnd.getPointer());
  if (prevKind != TokenKind::identifier) {
    token_.setResWord(prevKind, prevIdent);
  } else {
    token_.setIdentifier(prevIdent);
  }
  seek(SMLoc::getFromPointer(resumeAt));

  // Undo the storage for the token we just advanced to.
  if (LLVM_UNLIKELY(storeTokens_))
    tokenStorage_.pop_back();

  return nextKind;
}
/// Consume a Unicode escape of the form \uXXXX or \u{X...}. The cursor must be
/// on the backslash.
/// \return the decoded value, or UNICODE_REPLACEMENT_CHARACTER if the escape is
///   malformed. An error is reported here when the backslash is not followed
///   by 'u'; the hex and braced forms are decoded by consumeHex() and
///   consumeBracedCodePoint().
uint32_t JSLexer::consumeUnicodeEscape() {
  assert(*curCharPtr_ == '\\');
  ++curCharPtr_;

  if (*curCharPtr_ != 'u') {
    // The reported range covers the backslash and the character after it.
    error(
        {SMLoc::getFromPointer(curCharPtr_ - 1),
         SMLoc::getFromPointer(curCharPtr_ + 1)},
        "invalid Unicode escape");
    return UNICODE_REPLACEMENT_CHARACTER;
  }
  ++curCharPtr_;

  if (*curCharPtr_ != '{') {
    // Fixed width form: exactly four hex digits.
    auto hexValue = consumeHex(4);
    if (!hexValue)
      return UNICODE_REPLACEMENT_CHARACTER;

    // We don't need to check for valid UTF-16. JavaScript allows invalid
    // surrogate pairs, so we just encode every UTF-16 code into a UTF-8
    // sequence, even though theoretically it is not a valid UTF-8. (UTF-8
    // would be "valid" if we collected the surrogate pair, decoded it into
    // UTF-32 and encoded that into UTF-16).
    return hexValue.getValue();
  }

  // Braced form.
  auto bracedValue = consumeBracedCodePoint();
  if (!bracedValue.hasValue()) {
    // consumeBracedCodePoint has reported an error.
    return UNICODE_REPLACEMENT_CHARACTER;
  }
  return *bracedValue;
}
/// Like consumeUnicodeEscape(), but never reports an error. The cursor must be
/// on the backslash.
/// \return the decoded value on success. If the escape is malformed, the
///   cursor is moved back to the backslash and llvh::None is returned.
llvh::Optional<uint32_t> JSLexer::consumeUnicodeEscapeOptional() {
  const char *const rewindTo = curCharPtr_;
  assert(*curCharPtr_ == '\\');
  ++curCharPtr_;

  llvh::Optional<uint32_t> decoded = llvh::None;
  if (*curCharPtr_ == 'u') {
    ++curCharPtr_;
    // Avoid reporting an error because we are consuming the escape optionally.
    //
    // We don't need to check for valid UTF-16. JavaScript allows invalid
    // surrogate pairs, so we just encode every UTF-16 code into a UTF-8
    // sequence, even though theoretically it is not a valid UTF-8. (UTF-8
    // would be "valid" if we collected the surrogate pair, decoded it into
    // UTF-32 and encoded that into UTF-16).
    decoded = (*curCharPtr_ == '{') ? consumeBracedCodePoint(false)
                                    : consumeHex(4, false);
  }

  // Any failure leaves the input untouched.
  if (!decoded)
    curCharPtr_ = rewindTo;
  return decoded;
}
/// Try to consume a single character that can start an identifier: '_', '$',
/// an ASCII letter, a Unicode escape or a UTF-8 encoded character accepted by
/// isUnicodeIdentifierStart(). On success tmpStorage_ is cleared and receives
/// the character.
/// \return true if something was consumed. Note that a Unicode escape is always
///   consumed: if it does not denote a valid identifier start, an error is
///   reported, nothing is appended to tmpStorage_ and true is still returned.
bool JSLexer::consumeIdentifierStart() {
  // Fast path for the ASCII cases.
  if (*curCharPtr_ == '_' || *curCharPtr_ == '$' ||
      ((*curCharPtr_ | 32) >= 'a' && (*curCharPtr_ | 32) <= 'z')) {
    tmpStorage_.clear();
    tmpStorage_.push_back(*curCharPtr_++);
    return true;
  }

  if (*curCharPtr_ == '\\') {
    SMLoc startLoc = SMLoc::getFromPointer(curCharPtr_);
    tmpStorage_.clear();
    uint32_t cp = consumeUnicodeEscape();
    if (!isUnicodeIdentifierStart(cp)) {
      // The leading space keeps the hex value separate from the text, matching
      // the same message emitted by advance().
      errorRange(
          startLoc,
          "Unicode escape \\u" + Twine::utohexstr(cp) +
              " is not a valid identifier start");
    } else {
      appendUnicodeToStorage(cp);
    }
    return true;
  }

  if (LLVM_LIKELY(!isUTF8Start(*curCharPtr_)))
    return false;

  // Peek first: the cursor only moves if the character is accepted.
  auto decoded = _peekUTF8();
  if (isUnicodeIdentifierStart(decoded.first)) {
    tmpStorage_.clear();
    appendUnicodeToStorage(decoded.first);
    curCharPtr_ = decoded.second;
    return true;
  }
  return false;
}
/// Try to consume one identifier part that is not written as an escape and
/// append it to tmpStorage_. Besides '_', '$', ASCII letters and digits, '-' is
/// accepted in JSX mode and '@' in Flow mode. A UTF-8 encoded character is
/// consumed only if isUnicodeIdentifierPart() accepts it.
/// \return true if a character was consumed; otherwise the cursor is unchanged.
template <JSLexer::IdentifierMode Mode>
bool JSLexer::consumeOneIdentifierPartNoEscape() {
  const char ch = *curCharPtr_;

  const bool asciiLetter = (ch | 32) >= 'a' && (ch | 32) <= 'z';
  const bool asciiDigit = ch >= '0' && ch <= '9';
  const bool modeSpecific = (Mode == IdentifierMode::JSX && ch == '-') ||
      (Mode == IdentifierMode::Flow && ch == '@');

  if (ch == '_' || ch == '$' || asciiLetter || asciiDigit || modeSpecific) {
    tmpStorage_.push_back(*curCharPtr_++);
    return true;
  }

  if (LLVM_LIKELY(!isUTF8Start(ch)))
    return false;

  // If we have encountered a Unicode character, we try to decode it. If it
  // can be a part of the identifier, we consume it, otherwise we leave it
  // alone.
  auto peeked = _peekUTF8();
  if (!isUnicodeIdentifierPart(peeked.first))
    return false;

  appendUnicodeToStorage(peeked.first);
  curCharPtr_ = peeked.second;
  return true;
}
/// Consume identifier parts, escaped or not, for as long as possible and
/// append them to tmpStorage_. A Unicode escape that does not denote a valid
/// identifier part is consumed and reported as an error, but not appended.
template <JSLexer::IdentifierMode Mode>
void JSLexer::consumeIdentifierParts() {
  for (;;) {
    // Try consuming an non-escaped identifier part. Failing that, check for an
    // escape.
    if (consumeOneIdentifierPartNoEscape<Mode>())
      continue;

    if (*curCharPtr_ != '\\')
      break;

    // Decode the escape.
    SMLoc startLoc = SMLoc::getFromPointer(curCharPtr_);
    uint32_t cp = consumeUnicodeEscape();
    if (!isUnicodeIdentifierPart(cp)) {
      // The leading space keeps the hex value separate from the text.
      errorRange(
          startLoc,
          "Unicode escape \\u" + Twine::utohexstr(cp) +
              " is not a valid identifier codepoint");
    } else {
      appendUnicodeToStorage(cp);
    }
  }
}
unsigned char JSLexer::consumeOctal(unsigned maxLen) {
assert(*curCharPtr_ >= '0' && *curCharPtr_ <= '7');
if (strictMode_) {
if (!error(
SMLoc::getFromPointer(curCharPtr_ - 1),
"octals not allowed in strict mode")) {
return 0;
}
}
auto res = (unsigned char)(*curCharPtr_++ - '0');
while (--maxLen && *curCharPtr_ >= '0' && *curCharPtr_ <= '7')
res = (res << 3) + *curCharPtr_++ - '0';
return res;
}
/// Consume exactly \p requiredLen hexadecimal digits (either case).
/// \param errorOnFail whether to report an error on a non-hex character.
/// \return the value of the digits, or llvh::None if a non-hex character is
///   found first. In that case the cursor stays on the offending character;
///   digits consumed before it are not given back.
llvh::Optional<uint32_t> JSLexer::consumeHex(
    unsigned requiredLen,
    bool errorOnFail) {
  uint32_t value = 0;

  for (unsigned remaining = requiredLen; remaining != 0; --remaining) {
    const unsigned raw = *curCharPtr_;
    unsigned digit;

    if (raw >= '0' && raw <= '9') {
      digit = raw - '0';
    } else {
      // Now that we know it is not a digit, it is safe to lowercase.
      const unsigned lowered = raw | 32;
      if (lowered < 'a' || lowered > 'f') {
        if (errorOnFail) {
          error(SMLoc::getFromPointer(curCharPtr_), "invalid hex number");
        }
        return llvh::None;
      }
      digit = lowered - ('a' - 10);
    }

    value = (value << 4) + digit;
    ++curCharPtr_;
  }

  return value;
}
/// Consume a braced code point "{X...}" as used in \u{...}. The cursor must be
/// on the '{'. Scanning continues to the closing '}' even after a problem has
/// been found, so that lexing can resume behind the escape.
/// \param errorOnFail whether to report errors; at most the first problem is
///   reported.
/// \return the code point, or llvh::None if the escape is empty, contains a
///   non-hex character, exceeds UNICODE_MAX_VALUE or is not terminated before
///   the end of the buffer.
llvh::Optional<uint32_t> JSLexer::consumeBracedCodePoint(bool errorOnFail) {
  assert(*curCharPtr_ == '{' && "braced codepoint must begin with {");
  ++curCharPtr_;
  const char *start = curCharPtr_;

  // Set to true if we failed to get a code point that is in bounds or saw
  // an invalid character.
  bool failed = false;

  // Loop until we hit the } or eof, max out the value, or see an invalid char.
  uint32_t cp = 0;
  for (; *curCharPtr_ != '}'; ++curCharPtr_) {
    const int c = *curCharPtr_;

    // Value of the hex digit, or -1 if c is not a hex digit.
    int digit;
    if (c >= '0' && c <= '9') {
      digit = c - '0';
    } else if (c >= 'a' && c <= 'f') {
      digit = c - 'a' + 10;
    } else if (c >= 'A' && c <= 'F') {
      digit = c - 'A' + 10;
    } else {
      digit = -1;
    }

    if (digit < 0) {
      // The only way this can be the end of the buffer is if this is a \0.
      // Check if this is the end of the buffer, else continue so that we
      // may report more errors after this braced code point.
      if (curCharPtr_ == bufferEnd_) {
        if (!failed && errorOnFail) {
          error(
              SMLoc::getFromPointer(start),
              "non-terminated unicode codepoint escape");
        }
        return llvh::None;
      }

      // Invalid character, set the failed flag and continue.
      if (!failed && errorOnFail &&
          !error(
              SMLoc::getFromPointer(curCharPtr_),
              "invalid character in unicode codepoint escape")) {
        return llvh::None;
      }
      failed = true;
      continue;
    }

    cp = (cp << 4) + digit;
    if (cp > UNICODE_MAX_VALUE) {
      // Number grew too big, set the failed flag and continue.
      if (!failed && errorOnFail &&
          !error(
              SMLoc::getFromPointer(start),
              "unicode codepoint escape is too large")) {
        return llvh::None;
      }
      failed = true;
    }
  }

  assert(curCharPtr_ < bufferEnd_ && "bufferEnd_ should cause early return");

  // An empty escape sequence is invalid.
  if (curCharPtr_ == start) {
    if (!failed && errorOnFail &&
        !error(
            SMLoc::getFromPointer(start), "empty unicode codepoint escape")) {
      return llvh::None;
    }
    failed = true;
  }

  // Consume the final } and return.
  ++curCharPtr_;
  if (failed)
    return llvh::None;
  return llvh::Optional<uint32_t>{cp};
}
/// Skip a line comment that begins at \p start with "//" or "#!". The comment
/// ends at the next line terminator or at the end of the buffer. A line
/// terminator is consumed and sets newLineBeforeCurrentToken_. curCharPtr_ is
/// set to the position following the comment (and its terminator, if any).
/// \return the text of the comment including the two leading characters but
///   excluding the line terminator.
llvh::StringRef JSLexer::lineCommentHelper(const char *start) {
  assert(
      (start[0] == '/' && start[1] == '/') ||
      (start[0] == '#' && start[1] == '!'));

  const char *pos = start + 2;
  // Becomes non-null as soon as the end of the comment text is known.
  const char *commentEnd = nullptr;

  while (!commentEnd) {
    const unsigned char byte = (unsigned char)*pos;

    if (byte == 0) {
      // Only the 0 at the very end of the buffer terminates the comment.
      if (pos == bufferEnd_)
        commentEnd = pos;
      else
        ++pos;
    } else if (byte == '\r' || byte == '\n') {
      commentEnd = pos;
      ++pos;
      newLineBeforeCurrentToken_ = true;
    } else if (byte == UTF8_LINE_TERMINATOR_CHAR0) {
      // Line separator \u2028 UTF8 encoded is     : e2 80 a8
      // Paragraph separator \u2029 UTF8 encoded is: e2 80 a9
      if (matchUnicodeLineTerminatorOffset1(pos)) {
        commentEnd = pos;
        pos += 3;
        newLineBeforeCurrentToken_ = true;
      } else {
        _decodeUTF8SlowPath(pos);
      }
    } else if (LLVM_UNLIKELY(isUTF8Start(*pos))) {
      _decodeUTF8SlowPath(pos);
    } else {
      ++pos;
    }
  }

  curCharPtr_ = pos;
  return llvh::StringRef(start, commentEnd - start);
}
void JSLexer::scanLineComment(const char *start) {
llvh::StringRef comment = lineCommentHelper(start);
if (storeComments_) {
commentStorage_.emplace_back(
start[0] == '/' ? StoredComment::Kind::Line
: StoredComment::Kind::Hashbang,
SMRange{
SMLoc::getFromPointer(comment.begin()),
SMLoc::getFromPointer(comment.end())});
}
// Check for magic comments, which excludes #!.
// Syntax is //# name=value
if (!comment.consume_front(llvh::StringLiteral("//# ")))
return;
if (comment.consume_front(llvh::StringLiteral("sourceURL=")))
sm_.setSourceUrl(bufId_, comment);
else if (comment.consume_front(llvh::StringLiteral("sourceMappingURL=")))
sm_.setSourceMappingUrl(bufId_, comment);
}
const char *JSLexer::skipBlockComment(const char *start) {
assert(start[0] == '/' && start[1] == '*');
SMLoc blockCommentStart = SMLoc::getFromPointer(start);
const char *cur = start + 2;
for (;;) {
switch ((unsigned char)*cur) {
case 0:
if (cur == bufferEnd_) {
error(SMLoc::getFromPointer(cur), "non-terminated block comment");
sm_.note(blockCommentStart, "comment started here");
goto endLoop;
} else {
++cur;
}
break;
case '\r':
case '\n':
++cur;
newLineBeforeCurrentToken_ = true;
break;
// Line separator \u2028 UTF8 encoded is : e2 80 a8
// Paragraph separator \u2029 UTF8 encoded is: e2 80 a9
case UTF8_LINE_TERMINATOR_CHAR0:
if (matchUnicodeLineTerminatorOffset1(cur)) {
cur += 3;
newLineBeforeCurrentToken_ = true;
} else {
_decodeUTF8SlowPath(cur);
}
break;
case '*':
++cur;
if (*cur == '/') {
++cur;
goto endLoop;
}
break;
default:
if (LLVM_UNLIKELY(isUTF8Start(*cur)))
_decodeUTF8SlowPath(cur);
else
++cur;
break;
}
}
endLoop:
if (storeComments_) {
commentStorage_.emplace_back(
StoredComment::Kind::Block,
SMRange{blockCommentStart, SMLoc::getFromPointer(cur)});
}
return cur;
}
/// Scan a numeric literal beginning at curCharPtr_ and store it in token_.
///
/// Handles decimal integers and reals (fraction and/or exponent), the
/// 0x/0o/0b prefixed radixes, legacy octal literals (a leading '0'), numeric
/// separators ('_') and the BigInt suffix 'n'. On success token_ becomes a
/// numeric literal, or a BigInt literal holding the raw source text. When the
/// literal is malformed an error is reported and the numeric value is NaN.
///
/// \param grammarContext when equal to GrammarContext::Type, literals with a
///   leading zero are rejected exactly as they are in strict mode.
void JSLexer::scanNumber(GrammarContext grammarContext) {
  // A somewhat ugly state machine for scanning a number

  unsigned radix = 10;
  bool real = false;
  bool ok = true;
  // rawStart always points at the first character of the literal; start is
  // advanced past a 0x/0o/0b prefix so that it points at the first digit.
  const char *rawStart = curCharPtr_;
  const char *start = curCharPtr_;

  // True when we encounter the numeric literal separator: '_'.
  bool seenSeparator = false;

  // True when we encounter a legacy octal number (starts with '0').
  bool legacyOctal = false;

  // Detect the radix
  if (*curCharPtr_ == '0') {
    // "| 32" folds ASCII letters to lower case.
    if ((curCharPtr_[1] | 32) == 'x') {
      radix = 16;
      curCharPtr_ += 2;
      start += 2;
    } else if ((curCharPtr_[1] | 32) == 'o') {
      radix = 8;
      curCharPtr_ += 2;
      start += 2;
    } else if ((curCharPtr_[1] | 32) == 'b') {
      radix = 2;
      curCharPtr_ += 2;
      start += 2;
    } else if (curCharPtr_[1] == '.') {
      curCharPtr_ += 2;
      goto fraction;
    } else if ((curCharPtr_[1] | 32) == 'e') {
      curCharPtr_ += 2;
      goto exponent;
    } else {
      radix = 8;
      legacyOctal = true;
      ++curCharPtr_;
    }
  }

  // Consume the integer part. Every decimal digit is accepted here regardless
  // of radix; digits that are invalid for the radix are rejected when the
  // value is actually converted below.
  while (isdigit(*curCharPtr_) ||
         (radix == 16 && (*curCharPtr_ | 32) >= 'a' &&
          (*curCharPtr_ | 32) <= 'f') ||
         (*curCharPtr_ == '_')) {
    seenSeparator |= *curCharPtr_ == '_';
    ++curCharPtr_;
  }

  if (radix == 10 || legacyOctal) {
    // It is not necessarily an integer.
    // We could have interpreted as legacyOctal initially but will have to
    // change to decimal later.
    if (*curCharPtr_ == '.') {
      ++curCharPtr_;
      goto fraction;
    }

    if ((*curCharPtr_ | 32) == 'e') {
      ++curCharPtr_;
      goto exponent;
    }
  }

  goto end;

fraction:
  // We arrive here after we have consumed the decimal dot ".".
  //
  real = true;
  while (isdigit(*curCharPtr_) || *curCharPtr_ == '_') {
    seenSeparator |= *curCharPtr_ == '_';
    ++curCharPtr_;
  }

  if ((*curCharPtr_ | 32) == 'e') {
    ++curCharPtr_;
    goto exponent;
  } else {
    goto end;
  }

exponent:
  // We arrive here after we have consumed the exponent character 'e' or 'E'.
  //
  real = true;
  if (*curCharPtr_ == '+' || *curCharPtr_ == '-')
    ++curCharPtr_;
  // The exponent must start with a digit; separators may only follow one.
  if (isdigit(*curCharPtr_)) {
    do {
      seenSeparator |= *curCharPtr_ == '_';
      ++curCharPtr_;
    } while (isdigit(*curCharPtr_) || *curCharPtr_ == '_');
  } else {
    ok = false;
  }

end:
  // We arrive here after we have consumed all we can from the number. Now,
  // as per the spec, we consume a sequence of identifier characters if they
  // follow directly, which means the number is invalid if it's not BigInt.
  if (consumeIdentifierStart()) {
    consumeIdentifierParts<IdentifierMode::JS>();

    llvh::StringRef raw{rawStart, (size_t)(curCharPtr_ - rawStart)};
    // Only an integer followed by exactly "n" is a BigInt. A legacy octal is
    // not allowed, with the single exception of the literal "0n".
    if (ok && !real && (!legacyOctal || raw == "0n") && tmpStorage_ == "n") {
      // This is a BigInt.
      rawStorage_.clear();
      rawStorage_.append(raw);
      token_.setBigIntLiteral(getStringLiteral(rawStorage_));
      return;
    }

    ok = false;
  }

  double val;

  /// ES6.0 B.1.1
  /// If we encounter a "legacy" octal number (starting with a '0') but if
  /// the integer contains '8' or '9' we interpret it as decimal.
  const auto updateLegacyOctalRadix =
      [this, &radix, start, &legacyOctal]() -> void {
    assert(
        legacyOctal &&
        "updateLegacyOctalRadix can only be called in legacyOctal mode");
    (void)legacyOctal;
    for (auto *scanPtr = start; scanPtr != curCharPtr_; ++scanPtr) {
      // Only the integer part is inspected. The exponent marker may be in
      // either case: an uppercase 'E' compares >= '8' and must not be
      // mistaken for an 8 or 9 digit.
      if (*scanPtr == '.' || (*scanPtr | 32) == 'e') {
        break;
      }
      if (LLVM_UNLIKELY(*scanPtr >= '8') && LLVM_LIKELY(*scanPtr != '_')) {
        sm_.warning(
            SMRange(token_.getStartLoc(), SMLoc::getFromPointer(curCharPtr_)),
            "Numeric literal starts with 0 but contains an 8 or 9 digit. "
            "Interpreting as decimal (not octal).");
        radix = 10;
        break;
      }
    }
  };

  if (!ok) {
    errorRange(token_.getStartLoc(), "invalid numeric literal");
    val = std::numeric_limits<double>::quiet_NaN();
  } else if (
      !real && radix == 10 && curCharPtr_ - start <= 9 &&
      LLVM_LIKELY(!seenSeparator)) {
    // If this is a decimal integer of at most 9 digits (log10(2**31-1), it
    // can fit in a 32-bit integer. Use a faster conversion.
    int32_t ival = *start - '0';
    while (++start != curCharPtr_)
      ival = ival * 10 + (*start - '0');
    val = ival;
  } else if (real || radix == 10) {
    if (legacyOctal) {
      if (strictMode_ || grammarContext == GrammarContext::Type) {
        // errorRange() returning false means lexing must stop; bail out with
        // a NaN token.
        if (!errorRange(
                token_.getStartLoc(),
                "Decimals with leading zeros are not allowed in strict mode")) {
          val = std::numeric_limits<double>::quiet_NaN();
          goto done;
        }
      } else {
        // Check to see if we can actually scan this as radix 10.
        // Non-integer numbers must be in base 10, otherwise we error.
        updateLegacyOctalRadix();
        if (LLVM_UNLIKELY(radix != 10)) {
          if (!errorRange(
                  token_.getStartLoc(),
                  "Octal numeric literals must be integers")) {
            val = std::numeric_limits<double>::quiet_NaN();
            goto done;
          }
        }
      }
    }

    // We need a zero-terminated buffer for hermes_g_strtod().
    llvh::SmallString<32> buf;
    buf.reserve(curCharPtr_ - start + 1);
    if (LLVM_UNLIKELY(seenSeparator)) {
      // Copy the literal without its separators, validating each one.
      for (const char *it = start; it != curCharPtr_; ++it) {
        if (LLVM_LIKELY(*it != '_')) {
          buf.push_back(*it);
        } else {
          // Check to ensure that '_' is surrounded by digits.
          // This is safe because the source buffer is zero-terminated and
          // we know that the numeric literal didn't start with '_'.
          // Note that we could have a 0b_11 literal, but we'd still fail
          // properly because of the radix==16 check.
          char prev = *(it - 1);
          char next = *(it + 1);
          if (!isdigit(prev) &&
              !(radix == 16 && 'a' <= (prev | 32) && (prev | 32) <= 'f')) {
            errorRange(
                token_.getStartLoc(),
                "numeric separator must come after a digit");
          } else if (
              !isdigit(next) &&
              !(radix == 16 && 'a' <= (next | 32) && (next | 32) <= 'f')) {
            errorRange(
                token_.getStartLoc(),
                "numeric separator must come before a digit");
          }
        }
      }
    } else {
      buf.append(start, curCharPtr_);
    }
    buf.push_back(0);
    char *endPtr;
    val = ::hermes_g_strtod(buf.data(), &endPtr);
    // The conversion must have consumed everything up to the terminating 0.
    if (endPtr != &buf.back()) {
      errorRange(token_.getStartLoc(), "invalid numeric literal");
      val = std::numeric_limits<double>::quiet_NaN();
    }
  } else {
    // A single "0" is reached with legacyOctal set, hence the length check.
    if (legacyOctal &&
        (strictMode_ || grammarContext == GrammarContext::Type) &&
        curCharPtr_ - start > 1) {
      if (!errorRange(
              token_.getStartLoc(),
              "Octal literals must use '0o' in strict mode")) {
        val = std::numeric_limits<double>::quiet_NaN();
        goto done;
      }
    }

    // Handle the zero-radix case. This could only happen with radix 16
    // because otherwise start wouldn't have been changed.
    if (curCharPtr_ == start) {
      errorRange(
          token_.getStartLoc(),
          llvh::Twine("No digits after ") + StringRef(start - 2, 2));
      val = std::numeric_limits<double>::quiet_NaN();
    } else {
      // Parse the rest of the number:
      if (legacyOctal) {
        updateLegacyOctalRadix();

        // LegacyOctalLikeDecimalIntegerLiteral cannot contain separators.
        if (LLVM_UNLIKELY(seenSeparator)) {
          errorRange(
              token_.getStartLoc(),
              "Numeric separator cannot be used in literal after leading 0");
        }
      }
      auto parsedInt = parseIntWithRadix</* AllowNumericSeparator */ true>(
          llvh::ArrayRef<char>{start, (size_t)(curCharPtr_ - start)}, radix);
      if (!parsedInt) {
        errorRange(token_.getStartLoc(), "invalid integer literal");
        val = std::numeric_limits<double>::quiet_NaN();
      } else {
        val = parsedInt.getValue();
      }
    }
  }

done:
  token_.setNumericLiteral(val);
}
/// Look up the \p len characters starting at \p str among the reserved words.
/// \return the matching TokenKind::rw_* value, or TokenKind::identifier when
///   no reserved word matches.
static TokenKind matchReservedWord(const char *str, unsigned len) {
return llvh::StringSwitch<TokenKind>(StringRef(str, len))
// Each RESWORD(name) entry in TokenKinds.def expands to one .Case() mapping
// the spelling "name" to TokenKind::rw_name.
#define RESWORD(name) .Case(#name, TokenKind::rw_##name)
#include "hermes/Parser/TokenKinds.def"
.Default(TokenKind::identifier);
}
/// Classify the \p length characters at \p start as a reserved word or a
/// plain identifier, taking strictMode_ into account.
/// \return the reserved word's token kind, or TokenKind::identifier.
TokenKind JSLexer::scanReservedWord(const char *start, unsigned length) {
  const TokenKind matched = matchReservedWord(start, length);

  // In strict mode every reserved word is recognised as such, and an
  // identifier is an identifier in either mode.
  if (strictMode_ || matched == TokenKind::identifier)
    return matched;

  // "Future reserved words" should not be recognised in non-strict mode, so
  // they are demoted to ordinary identifiers.
  switch (matched) {
    case TokenKind::rw_implements:
    case TokenKind::rw_interface:
    case TokenKind::rw_package:
    case TokenKind::rw_private:
    case TokenKind::rw_protected:
    case TokenKind::rw_public:
    case TokenKind::rw_static:
    case TokenKind::rw_yield:
      return TokenKind::identifier;
    default:
      return matched;
  }
}
/// Scan an identifier or reserved word beginning at \p start, optimised for
/// the all-ASCII case. The character at \p start itself is taken as given and
/// is not examined here. If an escape or a Unicode identifier part is
/// encountered, the work is handed over to scanIdentifierParts().
template <JSLexer::IdentifierMode Mode>
void JSLexer::scanIdentifierFastPath(const char *start) {
  // Whether \p c is an ASCII character that can continue an identifier in
  // the current Mode.
  const auto isFastPart = [](char c) -> bool {
    if (c == '_' || c == '$')
      return true;
    // "| 32" folds ASCII letters to lower case.
    if ((c | 32) >= 'a' && (c | 32) <= 'z')
      return true;
    if (c >= '0' && c <= '9')
      return true;
    if (Mode == IdentifierMode::JSX && c == '-')
      return true;
    return Mode == IdentifierMode::Flow && c == '@';
  };

  // Quickly consume the ASCII identifier part.
  const char *cursor = start;
  char ch;
  do {
    ch = (unsigned char)*++cursor;
  } while (isFastPart(ch));

  // Check whether a slow part of the identifier follows.
  if (LLVM_UNLIKELY(ch == '\\')) {
    // An escape. Seed the storage with what has been scanned so far and pass
    // the baton to the slow path.
    initStorageWith(start, cursor);
    curCharPtr_ = cursor;
    scanIdentifierParts<Mode>();
    return;
  }

  if (LLVM_UNLIKELY(isUTF8Start(ch))) {
    // A Unicode character: peek at it without consuming. If it can be part
    // of the identifier we consume it, otherwise we leave it alone.
    auto decoded = _peekUTF8(cursor);
    if (isUnicodeIdentifierPart(decoded.first)) {
      initStorageWith(start, cursor);
      appendUnicodeToStorage(decoded.first);
      curCharPtr_ = decoded.second;
      scanIdentifierParts<Mode>();
      return;
    }
  }

  // Pure ASCII identifier: it can be used directly from the source buffer.
  curCharPtr_ = cursor;
  const size_t length = cursor - start;
  const auto rw = scanReservedWord(start, (unsigned)length);
  if (rw == TokenKind::identifier) {
    token_.setIdentifier(getIdentifier(StringRef(start, length)));
    return;
  }
  token_.setResWord(rw, resWordIdent(rw));
}
/// Slow path of identifier scanning: consume the remaining identifier parts
/// and make token_ an identifier named by the contents of tmpStorage_.
/// Note that the result is always an identifier token; no reserved word
/// lookup is performed here.
template <JSLexer::IdentifierMode Mode>
void JSLexer::scanIdentifierParts() {
  consumeIdentifierParts<Mode>();
  auto identifier = getIdentifier(tmpStorage_.str());
  token_.setIdentifier(identifier);
}
bool JSLexer::scanPrivateIdentifier() {
assert(*curCharPtr_ == '#');
// Skip the '#'.
const char *start = curCharPtr_;
++curCharPtr_;
// Scan the actual identifier.
if (LLVM_LIKELY(isASCIIIdentifierStart(*curCharPtr_))) {
scanIdentifierFastPath<IdentifierMode::JS>(curCharPtr_);
} else if (consumeIdentifierStart()) {
// curCharPtr_ has been updated by consumeIdentifierStart.
scanIdentifierParts<IdentifierMode::JS>();
} else {
error(SMLoc::getFromPointer(start), "empty private identifier");
return false;
}
// Reset the start to the '#' because the scanIdentifier functions were
// not aware of the true start of the token.
token_.setStart(start);
// Parsed a resword or identifier.
// Convert the TokenKind to private_identifier after the fact.
// This avoids adding another Mode to IdentifierMode.
token_.setPrivateIdentifier(token_.getResWordOrIdentifier());
return true;
}
/// Scan a string literal delimited by the quote character under curCharPtr_
/// and make token_ a string literal holding the decoded contents.
///
/// \tparam JSX when true the literal is scanned with JSX rules as implemented
///   below: backslashes are ordinary characters, raw '\n' and '\r' are kept
///   as part of the string and, if HERMES_PARSE_JSX is enabled, '&' may
///   introduce an HTML entity. When false, escape sequences are decoded and
///   a raw '\n' or '\r' ends the string with an error.
template <bool JSX>
void JSLexer::scanString() {
assert(*curCharPtr_ == '\'' || *curCharPtr_ == '"');
char quoteCh = *curCharPtr_++;
// Track whether we encounter any escapes or new line continuations. We need
// that information in order to detect directives.
bool escapes = false;
// The decoded string contents are accumulated in tmpStorage_.
tmpStorage_.clear();
for (;;) {
if (*curCharPtr_ == quoteCh) {
// Closing quote: consume it and finish.
++curCharPtr_;
break;
} else if (!JSX && *curCharPtr_ == '\\') {
escapes = true;
// Step over the backslash and dispatch on the escaped character.
++curCharPtr_;
switch ((unsigned char)*curCharPtr_) {
case '\'':
case '"':
case '\\':
tmpStorage_.push_back((unsigned char)*curCharPtr_++);
break;
// Single character escapes, each mapped to its character code.
case 'b':
++curCharPtr_;
tmpStorage_.push_back(8);
break;
case 'f':
++curCharPtr_;
tmpStorage_.push_back(12);
break;
case 'n':
++curCharPtr_;
tmpStorage_.push_back(10);
break;
case 'r':
++curCharPtr_;
tmpStorage_.push_back(13);
break;
case 't':
++curCharPtr_;
tmpStorage_.push_back(9);
break;
case 'v':
++curCharPtr_;
tmpStorage_.push_back(11);
break;
// A zero byte is the end of input only when it is located at bufferEnd_;
// otherwise it is an embedded character and is copied through.
case '\0': // EOF?
if (curCharPtr_ == bufferEnd_) { // eof?
error(SMLoc::getFromPointer(curCharPtr_), "non-terminated string");
sm_.note(token_.getStartLoc(), "string started here");
goto breakLoop;
} else {
tmpStorage_.push_back((unsigned char)*curCharPtr_++);
}
break;
case '0':
// '\0' is not an octal so handle it separately.
if (!(curCharPtr_[1] >= '0' && curCharPtr_[1] <= '7')) {
++curCharPtr_;
appendUnicodeToStorage(0);
break;
}
LLVM_FALLTHROUGH;
// Legacy octal escapes. NOTE(review): the argument to consumeOctal() is
// presumably the maximum number of digits to consume - confirm.
case '1':
case '2':
case '3':
appendUnicodeToStorage(consumeOctal(3));
break;
case '4':
case '5':
case '6':
case '7':
appendUnicodeToStorage(consumeOctal(2));
break;
case 'x': {
++curCharPtr_;
auto v = consumeHex(2);
// If consumeHex() produced no value, a zero code point is stored instead.
appendUnicodeToStorage(v ? *v : 0);
break;
}
case 'u':
// Step back onto the backslash before calling consumeUnicodeEscape().
--curCharPtr_;
appendUnicodeToStorage(consumeUnicodeEscape());
break;
// Escaped line terminator. We just need to skip it.
case '\n':
++curCharPtr_;
break;
case '\r':
++curCharPtr_;
if (*curCharPtr_ == '\n') // skip CR LF
++curCharPtr_;
break;
case UTF8_LINE_TERMINATOR_CHAR0:
// An escaped \u2028 or \u2029 is a line continuation too and is skipped;
// any other character starting with this byte is decoded and kept.
if (matchUnicodeLineTerminatorOffset1(curCharPtr_)) {
curCharPtr_ += 3;
break;
}
appendUnicodeToStorage(_decodeUTF8SlowPath(curCharPtr_));
break;
// Any other escaped character stands for itself.
default:
if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_)))
appendUnicodeToStorage(_decodeUTF8SlowPath(curCharPtr_));
else
tmpStorage_.push_back((unsigned char)*curCharPtr_++);
break;
}
} else if (LLVM_UNLIKELY(*curCharPtr_ == '\n' || *curCharPtr_ == '\r')) {
if (JSX) {
tmpStorage_.push_back(*curCharPtr_++);
} else {
// The line terminator is not consumed; the token ends right before it.
error(SMLoc::getFromPointer(curCharPtr_), "non-terminated string");
sm_.note(token_.getStartLoc(), "string started here");
break;
}
#if HERMES_PARSE_JSX
} else if (LLVM_UNLIKELY(JSX && *curCharPtr_ == '&')) {
// Try to decode an HTML entity; if there is none the '&' is kept literally.
auto codePoint = consumeHTMLEntityOptional();
if (codePoint.hasValue()) {
appendUnicodeToStorage(*codePoint);
} else {
tmpStorage_.push_back(*curCharPtr_++);
}
#endif
} else if (LLVM_UNLIKELY(*curCharPtr_ == 0 && curCharPtr_ == bufferEnd_)) {
error(SMLoc::getFromPointer(curCharPtr_), "non-terminated string");
sm_.note(token_.getStartLoc(), "string started here");
break;
} else {
if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_))) {
// Decode and re-encode the character and append it to the string
// storage
appendUnicodeToStorage(_decodeUTF8SlowPath(curCharPtr_));
} else {
tmpStorage_.push_back(*curCharPtr_++);
}
}
}
breakLoop:
// Whatever was accumulated becomes the token value, even after an error.
token_.setStringLiteral(getStringLiteral(tmpStorage_.str()), escapes);
}
/// Scan one piece of a template literal, starting either at the opening '`'
/// or at the '}' which closes a substitution, and ending at the closing '`'
/// or at the next "${". token_ is set to one of no_substitution_template,
/// template_head, template_middle or template_tail, carrying both the cooked
/// string (null if an invalid escape was seen) and the raw string.
void JSLexer::scanTemplateLiteral() {
assert(*curCharPtr_ == '`' || *curCharPtr_ == '}');
// Whether the token will result in TemplateHead upon encountering ${.
// If we end the literal with `, then the result is NoSubstitutionTemplate,
// so this will be ignored.
bool isHead = *curCharPtr_ == '`';
// If the token ended with a ` then it's a tail (or NoSubstitutionTemplate),
// and if it ended with a ${ then it's not a tail.
bool isTail = false;
// Advance past the initial ` (or the } when continuing after a
// substitution).
++curCharPtr_;
// Track whether we encounter any NotEscapeSequence instances,
// which will be used to error out on non-tagged sequences.
bool foundNotEscapeSequence = false;
// Store the Template Value (TV) in the tmpStorage_.
tmpStorage_.clear();
// Store the Template Raw Value (TRV) in the rawStorage_.
rawStorage_.clear();
/// Return the Template Raw Value (TRV) of character \p c.
/// The only time the TRV is different from c is when c is a <CR>.
/// In that case, this function will return 0x0a (LINE FEED).
const auto trv = [](char c) -> char {
if (c == '\r') {
// This case takes \r and \r\n into account.
// The code below which consumes line separators will skip the following
// \n if there is a \r\n.
// For the purposes of finding the TRV it doesn't matter.
return 0x0a;
}
return c;
};
for (;;) {
if (*curCharPtr_ == '`') {
isTail = true;
++curCharPtr_;
break;
} else if (*curCharPtr_ == '$' && curCharPtr_[1] == '{') {
// End of the TemplateCharacters.
isTail = false;
curCharPtr_ += 2;
break;
} else if (*curCharPtr_ == '\\') {
// The backslash and the first byte of the escaped character always go
// into the raw storage; the cases below add whatever else belongs there.
rawStorage_.push_back(*curCharPtr_);
++curCharPtr_;
rawStorage_.push_back(trv(*curCharPtr_));
switch ((unsigned char)*curCharPtr_) {
case '\'':
case '"':
case '\\':
tmpStorage_.push_back((unsigned char)*curCharPtr_++);
break;
// Single character escapes, each mapped to its character code.
case 'b':
++curCharPtr_;
tmpStorage_.push_back(8);
break;
case 'f':
++curCharPtr_;
tmpStorage_.push_back(12);
break;
case 'n':
++curCharPtr_;
tmpStorage_.push_back(10);
break;
case 'r':
++curCharPtr_;
tmpStorage_.push_back(13);
break;
case 't':
++curCharPtr_;
tmpStorage_.push_back(9);
break;
case 'v':
++curCharPtr_;
tmpStorage_.push_back(11);
break;
// A zero byte is the end of input only when it is located at bufferEnd_.
case '\0': // EOF?
if (curCharPtr_ == bufferEnd_) { // eof?
error(
SMLoc::getFromPointer(curCharPtr_),
"non-terminated template literal");
sm_.note(token_.getStartLoc(), "template literal started here");
goto breakLoop;
} else {
tmpStorage_.push_back((unsigned char)*curCharPtr_++);
}
break;
case '0':
// '\0' is only a valid escape sequence if not followed by a
// DecimalDigit.
if (!(curCharPtr_[1] >= '0' && curCharPtr_[1] <= '9')) {
++curCharPtr_;
appendUnicodeToStorage(0);
break;
}
// fall-through
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
// NotEscapeSequence :: DecimalDigit but not 0
// NotEscapeSequence :: 0 DecimalDigit
// Octal numbers are not supported in template strings,
// so leave the number in the raw storage (done above) and move on.
++curCharPtr_;
foundNotEscapeSequence = true;
break;
case 'x': {
++curCharPtr_;
// Remember where the hex digits begin so that whatever consumeHex()
// consumes can be copied to the raw storage afterwards.
const char *start = curCharPtr_;
auto v = consumeHex(2, false);
if (!v) {
foundNotEscapeSequence = true;
}
appendUnicodeToStorage(v ? *v : 0);
rawStorage_.append({start, (size_t)(curCharPtr_ - start)});
break;
}
case 'u': {
// Pointer to the first character after the 'u', which is where we
// can continue scanning from if we fail to decode an escape.
const char *start = curCharPtr_ + 1;
// Reset the pointer to the '\' to scan the unicode escape.
--curCharPtr_;
assert(*curCharPtr_ == '\\' && "must have started with \\");
auto codepoint = consumeUnicodeEscapeOptional();
if (!codepoint) {
foundNotEscapeSequence = true;
curCharPtr_ = start;
break;
}
appendUnicodeToStorage(*codepoint);
// The "\u" is already in the raw storage; add the rest of the escape.
rawStorage_.append({start, (size_t)(curCharPtr_ - start)});
break;
}
// Escaped line terminator. We just need to skip it, because it was
// added to the raw storage at the start of the switch statement.
case '\n':
++curCharPtr_;
break;
case '\r':
++curCharPtr_;
if (*curCharPtr_ == '\n') // skip CR LF
++curCharPtr_;
break;
case UTF8_LINE_TERMINATOR_CHAR0: {
// The check must happen before decoding, because decoding advances
// curCharPtr_.
bool isLineTerminator =
matchUnicodeLineTerminatorOffset1(curCharPtr_);
uint32_t codepoint = _decodeUTF8SlowPath(curCharPtr_);
// Needs to be added to the rawStorage_ regardless,
// but we first need to pop off the byte that was added prior to the
// switch statement.
rawStorage_.pop_back();
appendUnicodeToStorage(codepoint, rawStorage_);
if (!isLineTerminator) {
// Only add the codepoint to the tmpStorage if it wasn't a line
// terminator.
appendUnicodeToStorage(codepoint);
}
break;
}
default:
if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_))) {
uint32_t codepoint = _decodeUTF8SlowPath(curCharPtr_);
appendUnicodeToStorage(codepoint);
// Remove the last byte from rawStorage_ and then append the
// unicode codepoint to it. The already inserted byte will change
// if this codepoint is in Supplementary Planes.
rawStorage_.pop_back();
appendUnicodeToStorage(codepoint, rawStorage_);
} else {
// The TV of EscapeSequence is the SV of EscapeSequence.
tmpStorage_.push_back((unsigned char)*curCharPtr_++);
}
break;
}
} else if (LLVM_UNLIKELY(*curCharPtr_ == 0 && curCharPtr_ == bufferEnd_)) {
error(
SMLoc::getFromPointer(curCharPtr_),
"non-terminated template literal");
sm_.note(token_.getStartLoc(), "template literal started here");
break;
} else if (*curCharPtr_ == '\r') {
// The TV of LineTerminatorSequence is the TRV of
// LineTerminatorSequence. The only time this differs from the same
// characters as the bytes in the file is when the sequence begins with
// a <CR>.
tmpStorage_.push_back(trv(*curCharPtr_));
rawStorage_.push_back(trv(*curCharPtr_));
curCharPtr_++;
if (*curCharPtr_ == '\n') {
// Skip the <CR> <LF>
curCharPtr_++;
}
} else {
// An ordinary character: it goes into both the cooked and raw storage.
if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_))) {
// Decode and re-encode the character and append it to the string
// storage
uint32_t codepoint = _decodeUTF8SlowPath(curCharPtr_);
appendUnicodeToStorage(codepoint);
appendUnicodeToStorage(codepoint, rawStorage_);
} else {
rawStorage_.push_back(*curCharPtr_);
tmpStorage_.push_back(*curCharPtr_++);
}
}
}
breakLoop:
// If the template literal is tagged and contains invalid escapes, then
// cooked should be null because there is no way to cook it, per the ESTree
// 2018 spec. The parser will error when encountering an untagged literal
// with invalid escapes, so we place nullptr here.
UniqueString *cookedStr =
foundNotEscapeSequence ? nullptr : getStringLiteral(tmpStorage_.str());
UniqueString *rawStr = getStringLiteral(rawStorage_.str());
// The token kind is determined by how the piece started and how it ended.
if (isHead) {
if (isTail) {
// ` characters `
token_.setTemplateLiteral(
TokenKind::no_substitution_template, cookedStr, rawStr);
} else {
// ` characters ${
token_.setTemplateLiteral(TokenKind::template_head, cookedStr, rawStr);
}
} else {
if (isTail) {
// } characters `
token_.setTemplateLiteral(TokenKind::template_tail, cookedStr, rawStr);
} else {
// } characters ${
token_.setTemplateLiteral(TokenKind::template_middle, cookedStr, rawStr);
}
}
}
/// Scan a regular expression literal starting at the '/' under curCharPtr_
/// and make token_ a regexp literal holding the body and the flags, both of
/// which are stored without interpreting escape sequences.
/// TODO: this has to be implemented properly.
void JSLexer::scanRegExp() {
SMLoc startLoc = SMLoc::getFromPointer(curCharPtr_);
assert(*curCharPtr_ == '/');
++curCharPtr_;
// The body is accumulated in tmpStorage_.
tmpStorage_.clear();
// True while inside a [...] class, where '/' does not end the literal.
bool inClass = false;
for (;;) {
// Every case that leaves the switch with "break" falls out to the code
// after it, which appends the character under curCharPtr_ to the body.
switch ((unsigned char)*curCharPtr_) {
case '/':
if (!inClass) {
++curCharPtr_;
goto exitLoop;
}
break;
case '[':
inClass = true; // It may be true already, but so what.
break;
case ']':
inClass = false; // It may be false already, but so what.
break;
case '\\': // an escape
// Keep the backslash itself. The escaped character is appended after the
// switch, unless it is a line terminator or the end of input.
tmpStorage_.push_back((unsigned char)*curCharPtr_);
++curCharPtr_;
switch ((unsigned char)*curCharPtr_) {
case '\0':
if (curCharPtr_ == bufferEnd_)
goto unterminated;
break;
case UTF8_LINE_TERMINATOR_CHAR0:
if (matchUnicodeLineTerminatorOffset1(curCharPtr_))
goto unterminated;
break;
case '\n':
case '\r':
goto unterminated;
}
break;
// A zero byte is the end of input only when it is located at bufferEnd_.
case '\0':
if (curCharPtr_ == bufferEnd_)
goto unterminated;
break;
case UTF8_LINE_TERMINATOR_CHAR0:
if (matchUnicodeLineTerminatorOffset1(curCharPtr_))
goto unterminated;
break;
case '\n':
case '\r':
// Shared error exit, also reached via goto from the cases above.
unterminated:
error(
SMLoc::getFromPointer(curCharPtr_),
"non-terminated regular expression literal");
sm_.note(startLoc, "regular expression started here");
goto exitLoop;
}
// Append the current character to the body, decoding it if it is multi-byte.
if (LLVM_UNLIKELY(isUTF8Start((unsigned char)*curCharPtr_)))
appendUnicodeToStorage(_decodeUTF8SlowPath(curCharPtr_));
else
tmpStorage_.push_back((unsigned char)*curCharPtr_++);
}
exitLoop:
UniqueString *body = getStringLiteral(tmpStorage_.str());
// Scan the flags. We must not interpret escape sequences.
// ES5.1 7.8.5: "The Strings of characters comprising the
// RegularExpressionBody and the RegularExpressionFlags are passed
// uninterpreted to the regular expression constructor"
tmpStorage_.clear();
// True when the previous character was a backslash which was not itself
// preceded by such a backslash.
bool escapingBackslash = false;
for (;;) {
if (consumeOneIdentifierPartNoEscape<IdentifierMode::JS>()) {
escapingBackslash = false;
continue;
} else if (*curCharPtr_ == '\\') {
tmpStorage_.push_back(*curCharPtr_++);
// ES6 11.8.5.1: It is a Syntax Error if IdentifierPart contains a
// Unicode escape sequence.
escapingBackslash = !escapingBackslash;
if (escapingBackslash && *curCharPtr_ == 'u') {
error(
SMLoc::getFromPointer(curCharPtr_),
"Unicode escape sequences are not allowed in regular expression flags");
}
} else {
break;
}
}
UniqueString *flags = getStringLiteral(tmpStorage_.str());
// The literal is constructed in memory obtained from allocator_.
token_.setRegExpLiteral(new (allocator_.Allocate<RegExpLiteral>(1))
RegExpLiteral(body, flags));
}
/// Round-trip \p str through UTF-16: it is converted with
/// convertUTF8WithSurrogatesToUTF16() and then back with
/// convertUTF16ToUTF8WithReplacements().
/// \return the resulting string, uniqued in strTab_.
UniqueString *JSLexer::convertSurrogatesInString(StringRef str) {
  // The vector only provides scratch capacity: it is reserved, never resized,
  // and written through data(). NOTE(review): this assumes the conversion
  // emits at most one UTF-16 code unit per input byte - confirm.
  llvh::SmallVector<char16_t, 8> utf16;
  utf16.reserve(str.size());
  char16_t *const utf16Begin = utf16.data();
  char16_t *const utf16End =
      convertUTF8WithSurrogatesToUTF16(utf16Begin, str.begin(), str.end());

  std::string utf8;
  convertUTF16ToUTF8WithReplacements(
      utf8, llvh::makeArrayRef(utf16Begin, utf16End));
  return strTab_.getString(utf8);
}
/// Report the lexer error \p msg at location \p loc.
/// \return true if lexing may continue, false if the error limit has been
///   reached, in which case forceEOF() has been called.
bool JSLexer::error(llvh::SMLoc loc, const llvh::Twine &msg) {
  sm_.error(loc, msg, Subsystem::Lexer);
  if (sm_.isErrorLimitReached()) {
    forceEOF();
    return false;
  }
  return true;
}
/// Report the lexer error \p msg for the source range \p range.
/// \return true if lexing may continue, false if the error limit has been
///   reached, in which case forceEOF() has been called.
bool JSLexer::error(llvh::SMRange range, const llvh::Twine &msg) {
  sm_.error(range, msg, Subsystem::Lexer);
  if (sm_.isErrorLimitReached()) {
    forceEOF();
    return false;
  }
  return true;
}
/// Report the lexer error \p msg at location \p loc, together with the
/// source range \p range.
/// \return true if lexing may continue, false if the error limit has been
///   reached, in which case forceEOF() has been called.
bool JSLexer::error(
    llvh::SMLoc loc,
    llvh::SMRange range,
    const llvh::Twine &msg) {
  sm_.error(loc, range, msg, Subsystem::Lexer);
  if (sm_.isErrorLimitReached()) {
    forceEOF();
    return false;
  }
  return true;
}
} // namespace parser
} // namespace hermes