in tools/clang/lib/Lex/Lexer.cpp [2921:3665]
/// LexTokenInternal - Lex one token starting at BufferPtr into \p Result.
///
/// The first character is read and dispatched through one large switch.
/// Whitespace, comments (unless they are being kept) and version-control
/// conflict markers do not form a token here; control jumps back to the
/// LexNextToken label instead of recursing.  Identifiers, numeric constants,
/// character/string literals and non-ASCII code points are finished by the
/// dedicated Lex* helpers, whose result is returned directly.  Punctuators set
/// Kind and break out of the switch, after which the token is formed with
/// FormTokenWithChars.
///
/// \param Result  Token to fill in.  Its NeedsCleaning flag and identifier
///                info are reset on every pass through LexNextToken.
/// \param TokAtPhysicalStartOfLine  Forwarded to the whitespace and comment
///                skippers.  When it is set, and neither LexingRawMode nor
///                Is_PragmaLexer is, a lone '#' (or its digraph '%:') is
///                handed to the preprocessor as a directive.
///
/// \returns true when Result holds a token formed here (including the case
///          where a module loader fatal failure aborted parsing); the value
///          returned by the helper when a Lex*/LexEndOfFile/Skip* helper
///          finishes the token; false after a preprocessing directive was
///          handled, in which case a token must be lexed with the new state.
bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
LexNextToken:
  // New token, can't need cleaning yet.
  Result.clearFlag(Token::NeedsCleaning);
  Result.setIdentifierInfo(nullptr);

  // CurPtr - Cache BufferPtr in an automatic variable.
  const char *CurPtr = BufferPtr;

  // Small amounts of horizontal whitespace is very common between tokens.
  if ((*CurPtr == ' ') || (*CurPtr == '\t')) {
    ++CurPtr;
    while ((*CurPtr == ' ') || (*CurPtr == '\t'))
      ++CurPtr;

    // If we are keeping whitespace and other tokens, just return what we just
    // skipped.  The next lexer invocation will return the token after the
    // whitespace.
    if (isKeepWhitespaceMode()) {
      FormTokenWithChars(Result, CurPtr, tok::unknown);
      // FIXME: The next token will not have LeadingSpace set.
      return true;
    }

    // Commit the skipped whitespace and remember that the upcoming token was
    // preceded by it.
    BufferPtr = CurPtr;
    Result.setFlag(Token::LeadingSpace);
  }

  unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.

  // Read a character, advancing over it.
  char Char = getAndAdvanceChar(CurPtr, Result);
  // Kind is assigned by every case that breaks out of the switch; cases that
  // return or goto never reach the FormTokenWithChars call that reads it.
  tok::TokenKind Kind;

  switch (Char) {
  case 0:  // Null.
    // Found end of file?
    if (CurPtr-1 == BufferEnd)
      return LexEndOfFile(Result, CurPtr-1);

    // Check if we are performing code completion.
    if (isCodeCompletionPoint(CurPtr-1)) {
      // Return the code-completion token.
      Result.startToken();
      FormTokenWithChars(Result, CurPtr, tok::code_completion);
      return true;
    }

    // A stray NUL inside the buffer: diagnose it (outside raw mode) and then
    // skip it as if it were whitespace.
    if (!isLexingRawMode())
      Diag(CurPtr-1, diag::null_in_file);
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true;   // KeepWhitespaceMode

    // We know the lexer hasn't changed, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case 26:  // DOS & CP/M EOF: "^Z".
    // If we're in Microsoft extensions mode, treat this as end of file.
    if (LangOpts.MicrosoftExt)
      return LexEndOfFile(Result, CurPtr-1);

    // If Microsoft extensions are disabled, this is just random garbage.
    Kind = tok::unknown;
    break;

  case '\n':
  case '\r':
    // If we are inside a preprocessor directive and we see the end of line,
    // we know we are done with the directive, so return an EOD token.
    if (ParsingPreprocessorDirective) {
      // Done parsing the "line".
      ParsingPreprocessorDirective = false;

      // Restore comment saving mode, in case it was disabled for directive.
      if (PP)
        resetExtendedTokenMode();

      // Since we consumed a newline, we are back at the start of a line.
      IsAtStartOfLine = true;
      IsAtPhysicalStartOfLine = true;

      Kind = tok::eod;
      break;
    }

    // No leading whitespace seen so far.
    Result.clearFlag(Token::LeadingSpace);

    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true;   // KeepWhitespaceMode

    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  case ' ':
  case '\t':
  case '\f':
  case '\v':
  // Re-entered from SkipIgnoredUnits below when more horizontal whitespace
  // follows a skipped comment.
  SkipHorizontalWhitespace:
    Result.setFlag(Token::LeadingSpace);
    if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
      return true;   // KeepWhitespaceMode

  // Fast path that keeps skipping comments and horizontal whitespace without
  // re-entering the switch.  Also targeted by the '/' case after a line
  // comment has been skipped.
  SkipIgnoredUnits:
    CurPtr = BufferPtr;

    // If the next token is obviously a // or /* */ comment, skip it efficiently
    // too (without going through the big switch stmt).
    if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
        LangOpts.LineComment &&
        (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
      if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true;   // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
      if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
        return true;   // There is a token to return.
      goto SkipIgnoredUnits;
    } else if (isHorizontalWhitespace(*CurPtr)) {
      goto SkipHorizontalWhitespace;
    }
    // We only saw whitespace, so just try again with this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;

  // C99 6.4.4.1: Integer Constants.
  // C99 6.4.4.2: Floating Constants.
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexNumericConstant(Result, CurPtr);

  case 'u':   // Identifier (uber) or C11/C++11 UTF-8 or UTF-16 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-16 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf16_string_literal);

      // UTF-16 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf16_char_constant);

      // UTF-16 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf16_string_literal);

      // "u8" prefix: peek one more character to tell the UTF-8 literal forms
      // apart from an ordinary identifier that merely starts with "u8".
      if (Char == '8') {
        char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);

        // UTF-8 string literal
        if (Char2 == '"')
          return LexStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf8_string_literal);
        // UTF-8 character constant (only recognized under CPlusPlus1z).
        if (Char2 == '\'' && LangOpts.CPlusPlus1z)
          return LexCharConstant(
              Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                  SizeTmp2, Result),
              tok::utf8_char_constant);

        if (Char2 == 'R' && LangOpts.CPlusPlus11) {
          unsigned SizeTmp3;
          char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
          // UTF-8 raw string literal
          if (Char3 == '"') {
            return LexRawStringLiteral(Result,
                   ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               SizeTmp3, Result),
                   tok::utf8_string_literal);
          }
        }
      }
    }

    // treat u like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'U':   // Identifier (Uber) or C11/C++11 UTF-32 string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11 || LangOpts.C11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      // UTF-32 string literal
      if (Char == '"')
        return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                                tok::utf32_string_literal);

      // UTF-32 character constant
      if (Char == '\'')
        return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                               tok::utf32_char_constant);

      // UTF-32 raw string literal
      if (Char == 'R' && LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
        return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::utf32_string_literal);
    }

    // treat U like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'R': // Identifier or C++0x raw string literal
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();

    if (LangOpts.CPlusPlus11) {
      Char = getCharAndSize(CurPtr, SizeTmp);

      if (Char == '"')
        return LexRawStringLiteral(Result,
                                   ConsumeChar(CurPtr, SizeTmp, Result),
                                   tok::string_literal);
    }

    // treat R like the start of an identifier.
    return LexIdentifier(Result, CurPtr);

  case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    Char = getCharAndSize(CurPtr, SizeTmp);

    // Wide string literal.
    if (Char == '"')
      return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                              tok::wide_string_literal);

    // Wide raw string literal.
    if (LangOpts.CPlusPlus11 && Char == 'R' &&
        getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
      return LexRawStringLiteral(Result,
                               ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                                           SizeTmp2, Result),
                               tok::wide_string_literal);

    // Wide character constant.
    if (Char == '\'')
      return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                             tok::wide_char_constant);
    // FALL THROUGH, treating L like the start of an identifier.

  // C99 6.4.2: Identifiers.
  // 'L', 'R', 'U' and 'u' are commented out of the lists below because they
  // have their own cases above (literal prefixes).
  case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
  case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
  case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
  case 'V': case 'W': case 'X': case 'Y': case 'Z':
  case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
  case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
  case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
  case 'v': case 'w': case 'x': case 'y': case 'z':
  case '_':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexIdentifier(Result, CurPtr);

  case '$':   // $ in identifiers.
    if (LangOpts.DollarIdents) {
      if (!isLexingRawMode())
        Diag(CurPtr-1, diag::ext_dollar_in_identifier);
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();
      return LexIdentifier(Result, CurPtr);
    }

    Kind = tok::unknown;
    break;

  // C99 6.4.4: Character Constants.
  case '\'':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexCharConstant(Result, CurPtr, tok::char_constant);

  // C99 6.4.5: String Literals.
  case '"':
    // Notify MIOpt that we read a non-whitespace/non-comment token.
    MIOpt.ReadToken();
    return LexStringLiteral(Result, CurPtr, tok::string_literal);

  // C99 6.4.6: Punctuators.
  case '?':
    Kind = tok::question;
    break;
  case '[':
    Kind = tok::l_square;
    break;
  case ']':
    Kind = tok::r_square;
    break;
  case '(':
    Kind = tok::l_paren;
    break;
  case ')':
    Kind = tok::r_paren;
    break;
  case '{':
    Kind = tok::l_brace;
    break;
  case '}':
    Kind = tok::r_brace;
    break;
  case '.':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char >= '0' && Char <= '9') {
      // Notify MIOpt that we read a non-whitespace/non-comment token.
      MIOpt.ReadToken();

      return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
    } else if (LangOpts.CPlusPlus && Char == '*') {
      Kind = tok::periodstar;
      // Advance through ConsumeChar, as every other multi-character punctuator
      // in this switch does, instead of bumping CurPtr by hand: that way Result
      // is passed along and can be updated when the '*' does not immediately
      // follow the '.' in the buffer (SizeTmp != 1).
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '.' &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
      Kind = tok::ellipsis;
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
    } else {
      Kind = tok::period;
    }
    break;
  case '&':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '&') {
      Kind = tok::ampamp;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '=') {
      Kind = tok::ampequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::amp;
    }
    break;
  case '*':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::starequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::star;
    }
    break;
  case '+':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '+') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusplus;
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::plusequal;
    } else {
      Kind = tok::plus;
    }
    break;
  case '-':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '-') {      // --
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusminus;
    } else if (Char == '>' && LangOpts.CPlusPlus &&
               getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
      CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                           SizeTmp2, Result);
      Kind = tok::arrowstar;
    } else if (Char == '>') {   // ->
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::arrow;
    } else if (Char == '=') {   // -=
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::minusequal;
    } else {
      Kind = tok::minus;
    }
    break;
  case '~':
    Kind = tok::tilde;
    break;
  case '!':
    if (getCharAndSize(CurPtr, SizeTmp) == '=') {
      Kind = tok::exclaimequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::exclaim;
    }
    break;
  case '/':
    // 6.4.9: Comments
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '/') {         // Line comment.
      // Even if Line comments are disabled (e.g. in C89 mode), we generally
      // want to lex this as a comment.  There is one problem with this though,
      // that in one particular corner case, this can change the behavior of the
      // resultant program.  For example, In  "foo //**/ bar", C89 would lex
      // this as "foo / bar" and languages with Line comments would lex it as
      // "foo".  Check to see if the character after the second slash is a '*'.
      // If so, we will lex that as a "/" instead of the start of a comment.
      // However, we never do this if we are just preprocessing.
      bool TreatAsComment = LangOpts.LineComment &&
                            (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
      if (!TreatAsComment)
        // PP is null-checked here, so this point can be reached without a
        // preprocessor attached.
        if (!(PP && PP->isPreprocessedOutput()))
          TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';

      if (TreatAsComment) {
        if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                            TokAtPhysicalStartOfLine))
          return true; // There is a token to return.

        // It is common for the tokens immediately after a // comment to be
        // whitespace (indentation for the next line).  Instead of going through
        // the big switch, handle it efficiently now.
        goto SkipIgnoredUnits;
      }
    }

    if (Char == '*') {  // /**/ comment.
      if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
                           TokAtPhysicalStartOfLine))
        return true; // There is a token to return.

      // We only saw whitespace, so just try again with this lexer.
      // (We manually eliminate the tail call to avoid recursion.)
      goto LexNextToken;
    }

    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::slashequal;
    } else {
      Kind = tok::slash;
    }
    break;
  case '%':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::percentequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_brace;                             // '%>' -> '}'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.Digraphs && Char == ':') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Char = getCharAndSize(CurPtr, SizeTmp);
      if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
        Kind = tok::hashhash;                          // '%:%:' -> '##'
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        if (!isLexingRawMode())
          Diag(BufferPtr, diag::ext_charize_microsoft);
        Kind = tok::hashat;
      } else {                                         // '%:' -> '#'
        // We parsed a # character.  If this occurs at the start of the line,
        // it's actually the start of a preprocessing directive.  Callback to
        // the preprocessor to handle it.
        // TODO: -fpreprocessed mode??
        if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
          goto HandleDirective;

        Kind = tok::hash;
      }
    } else {
      Kind = tok::percent;
    }
    break;
  case '<':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (ParsingFilename) {
      // While ParsingFilename is set, '<' starts an angled string literal
      // rather than an operator.
      return LexAngledStringLiteral(Result, CurPtr);
    } else if (Char == '<') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        Kind = tok::lesslessequal;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '<<<<<<<' version control conflict marker,
        // recognize it as such and recover nicely.
        goto LexNextToken;
      } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '<<<<' and we're in a Perforce-style conflict marker,
        // ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '<') {
        Kind = tok::lesslessless;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::lessless;
      }
    } else if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::lessequal;
    } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
      if (LangOpts.CPlusPlus11 &&
          getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
        // C++0x [lex.pptoken]p3:
        //  Otherwise, if the next three characters are <:: and the subsequent
        //  character is neither : nor >, the < is treated as a preprocessor
        //  token by itself and not as the first character of the alternative
        //  token <:.
        unsigned SizeTmp3;
        char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
        if (After != ':' && After != '>') {
          Kind = tok::less;
          if (!isLexingRawMode())
            Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
          // Leave the switch with only the '<' consumed.
          break;
        }
      }

      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_square;
    } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::l_brace;
    } else {
      Kind = tok::less;
    }
    break;
  case '>':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::greaterequal;
    } else if (Char == '>') {
      char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
      if (After == '=') {
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
        Kind = tok::greatergreaterequal;
      } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
        // If this is actually a '>>>>' conflict marker, recognize it as such
        // and recover nicely.
        goto LexNextToken;
      } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
        // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
        goto LexNextToken;
      } else if (LangOpts.CUDA && After == '>') {
        Kind = tok::greatergreatergreater;
        CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
                             SizeTmp2, Result);
      } else {
        CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
        Kind = tok::greatergreater;
      }
    } else {
      Kind = tok::greater;
    }
    break;
  case '^':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
      Kind = tok::caretequal;
    } else {
      Kind = tok::caret;
    }
    break;
  case '|':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      Kind = tok::pipeequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '|') {
      // If this is '|||||||' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;
      Kind = tok::pipepipe;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::pipe;
    }
    break;
  case ':':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (LangOpts.Digraphs && Char == '>') {
      Kind = tok::r_square; // ':>' -> ']'
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (LangOpts.CPlusPlus && Char == ':') {
      Kind = tok::coloncolon;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::colon;
    }
    break;
  case ';':
    Kind = tok::semi;
    break;
  case '=':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '=') {
      // If this is '====' and we're in a conflict marker, ignore it.
      if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
        goto LexNextToken;

      Kind = tok::equalequal;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      Kind = tok::equal;
    }
    break;
  case ',':
    Kind = tok::comma;
    break;
  case '#':
    Char = getCharAndSize(CurPtr, SizeTmp);
    if (Char == '#') {
      Kind = tok::hashhash;
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
      Kind = tok::hashat;
      if (!isLexingRawMode())
        Diag(BufferPtr, diag::ext_charize_microsoft);
      CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
    } else {
      // We parsed a # character.  If this occurs at the start of the line,
      // it's actually the start of a preprocessing directive.  Callback to
      // the preprocessor to handle it.
      // TODO: -fpreprocessed mode??
      if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
        goto HandleDirective;

      Kind = tok::hash;
    }
    break;

  case '@':
    // Objective C support.
    // The raw byte just consumed (CurPtr[-1]) is re-checked against '@'
    // before the token is accepted as tok::at.
    if (CurPtr[-1] == '@' && LangOpts.ObjC1)
      Kind = tok::at;
    else
      Kind = tok::unknown;
    break;

  // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
  case '\\':
    // A zero result from tryReadUCN is treated as "no UCN here"; the backslash
    // then becomes an unknown token.
    if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }

      return LexUnicode(Result, CodePoint, CurPtr);
    }

    Kind = tok::unknown;
    break;

  default: {
    // Any ASCII character without its own case above is an unknown token.
    if (isASCII(Char)) {
      Kind = tok::unknown;
      break;
    }

    UTF32 CodePoint;

    // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
    // an escaped newline.
    --CurPtr;
    ConversionResult Status =
        llvm::convertUTF8Sequence((const UTF8 **)&CurPtr,
                                  (const UTF8 *)BufferEnd,
                                  &CodePoint,
                                  strictConversion);
    if (Status == conversionOK) {
      if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
        if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
          return true; // KeepWhitespaceMode

        // We only saw whitespace, so just try again with this lexer.
        // (We manually eliminate the tail call to avoid recursion.)
        goto LexNextToken;
      }
      return LexUnicode(Result, CodePoint, CurPtr);
    }

    // Invalid UTF-8.  In these modes the offending byte is kept as an unknown
    // token; isLexingRawMode() is tested first, before PP is dereferenced.
    if (isLexingRawMode() || ParsingPreprocessorDirective ||
        PP->isPreprocessedOutput()) {
      ++CurPtr;
      Kind = tok::unknown;
      break;
    }

    // Non-ASCII characters tend to creep into source code unintentionally.
    // Instead of letting the parser complain about the unknown token,
    // just diagnose the invalid UTF-8, then drop the character.
    Diag(CurPtr, diag::err_invalid_utf8);

    BufferPtr = CurPtr+1;
    // We're pretending the character didn't exist, so just try again with
    // this lexer.
    // (We manually eliminate the tail call to avoid recursion.)
    goto LexNextToken;
  }
  }

  // Every case that breaks out of the switch has assigned Kind and advanced
  // CurPtr past the token's last character.

  // Notify MIOpt that we read a non-whitespace/non-comment token.
  MIOpt.ReadToken();

  // Update the location of token as well as BufferPtr.
  FormTokenWithChars(Result, CurPtr, Kind);
  return true;

HandleDirective:
  // We parsed a # character and it's the start of a preprocessing directive.

  FormTokenWithChars(Result, CurPtr, tok::hash);
  PP->HandleDirective(Result);

  if (PP->hadModuleLoaderFatalFailure()) {
    // With a fatal failure in the module loader, we abort parsing.
    assert(Result.is(tok::eof) && "Preprocessor did not set tok:eof");
    return true;
  }

  // We parsed the directive; lex a token with the new state.
  return false;
}