in src/tokenizer.rs [938:1694]
fn next_token(
&self,
chars: &mut State,
prev_token: Option<&Token>,
) -> Result<Option<Token>, TokenizerError> {
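// Dispatch on the next character: each arm consumes exactly the
// characters that belong to its token and returns at most one token.
// `Ok(None)` signals end of input.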
match chars.peek() {
Some(&ch) => match ch {
' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
'\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
'\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
'\r' => {
// Emit a single Whitespace::Newline token for \r and \r\n
chars.next();
if let Some('\n') = chars.peek() {
chars.next();
}
Ok(Some(Token::Whitespace(Whitespace::Newline)))
}
// BigQuery and MySQL use b or B for byte string literals; Postgres uses them for bit strings
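// e.g. B'abc' or b"abc"; BigQuery additionally allows the triple-quoted
// form B'''abc'''.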
b @ 'B' | b @ 'b' if dialect_of!(self is BigQueryDialect | PostgreSqlDialect | MySqlDialect | GenericDialect) =>
{
chars.next(); // consume
match chars.peek() {
Some('\'') => {
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
false,
Token::SingleQuotedByteStringLiteral,
Token::TripleSingleQuotedByteStringLiteral,
);
}
let s = self.tokenize_single_quoted_string(chars, '\'', false)?;
Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
}
Some('\"') => {
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
false,
Token::DoubleQuotedByteStringLiteral,
Token::TripleDoubleQuotedByteStringLiteral,
);
}
let s = self.tokenize_single_quoted_string(chars, '\"', false)?;
Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
}
_ => {
// regular identifier starting with a "b" or "B"
let s = self.tokenize_word(b, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// BigQuery uses r or R for raw string literal
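// e.g. r'\d+' or R"(\w)"; backslashes in raw strings are kept literally
// rather than treated as escapes.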
b @ 'R' | b @ 'r' if dialect_of!(self is BigQueryDialect | GenericDialect) => {
chars.next(); // consume
match chars.peek() {
Some('\'') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
false,
Token::SingleQuotedRawStringLiteral,
Token::TripleSingleQuotedRawStringLiteral,
),
Some('\"') => self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
false,
Token::DoubleQuotedRawStringLiteral,
Token::TripleDoubleQuotedRawStringLiteral,
),
_ => {
// regular identifier starting with an "r" or "R"
let s = self.tokenize_word(b, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// The SQL standard uses an uppercase N for national string literals; Redshift also accepts a lowercase n
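// e.g. N'data' or, on Redshift, n'data'.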
n @ 'N' | n @ 'n' => {
chars.next(); // consume, to check the next char
match chars.peek() {
Some('\'') => {
// N'...' - a <national character string literal>
let backslash_escape =
self.dialect.supports_string_literal_backslash_escape();
let s =
self.tokenize_single_quoted_string(chars, '\'', backslash_escape)?;
Ok(Some(Token::NationalStringLiteral(s)))
}
_ => {
// regular identifier starting with an "N"
let s = self.tokenize_word(n, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// PostgreSQL accepts "escape" string constants, which are an extension to the SQL standard.
x @ 'e' | x @ 'E' if self.dialect.supports_string_escape_constant() => {
let starting_loc = chars.location();
chars.next(); // consume, to check the next char
match chars.peek() {
Some('\'') => {
let s =
self.tokenize_escaped_single_quoted_string(starting_loc, chars)?;
Ok(Some(Token::EscapedStringLiteral(s)))
}
_ => {
// regular identifier starting with an "E" or "e"
let s = self.tokenize_word(x, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// Unicode string literals like U&'first \000A second' are supported in some dialects, including PostgreSQL
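// In U&'...' strings, \XXXX (or \+XXXXXX) escapes name Unicode code
// points, e.g. U&'\0041' is 'A'.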
x @ 'u' | x @ 'U' if self.dialect.supports_unicode_string_literal() => {
chars.next(); // consume, to check the next char
if chars.peek() == Some(&'&') {
// we cannot advance the main iterator here: if the 'u' turns out to be an identifier, the '&' must be left to be tokenized later
let mut chars_clone = chars.peekable.clone();
chars_clone.next(); // consume the '&' in the clone
if chars_clone.peek() == Some(&'\'') {
chars.next(); // consume the '&' in the original iterator
let s = unescape_unicode_single_quoted_string(chars)?;
return Ok(Some(Token::UnicodeStringLiteral(s)));
}
}
// regular identifier starting with a "U" or "u"
let s = self.tokenize_word(x, chars);
Ok(Some(Token::make_word(&s, None)))
}
// The spec only allows an uppercase 'X' to introduce a hex
// string, but PostgreSQL, at least, allows a lowercase 'x' too.
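// e.g. X'DEADBEEF'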
x @ 'x' | x @ 'X' => {
chars.next(); // consume, to check the next char
match chars.peek() {
Some('\'') => {
// X'...' - a <binary string literal>
let s = self.tokenize_single_quoted_string(chars, '\'', true)?;
Ok(Some(Token::HexStringLiteral(s)))
}
_ => {
// regular identifier starting with an "X"
let s = self.tokenize_word(x, chars);
Ok(Some(Token::make_word(&s, None)))
}
}
}
// single quoted string
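// e.g. 'it''s' (doubled quote) always escapes a quote; 'it\'s' only
// works when the dialect enables backslash escapes.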
'\'' => {
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
Token::SingleQuotedString,
Token::TripleSingleQuotedString,
);
}
let s = self.tokenize_single_quoted_string(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
)?;
Ok(Some(Token::SingleQuotedString(s)))
}
// double quoted string
'\"' if !self.dialect.is_delimited_identifier_start(ch)
&& !self.dialect.is_identifier_start(ch) =>
{
if self.dialect.supports_triple_quoted_string() {
return self
.tokenize_single_or_triple_quoted_string::<fn(String) -> Token>(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
Token::DoubleQuotedString,
Token::TripleDoubleQuotedString,
);
}
let s = self.tokenize_single_quoted_string(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
)?;
Ok(Some(Token::DoubleQuotedString(s)))
}
// Potentially nested delimited (quoted) identifier. This arm must come
// before the plain delimited-identifier arm below: both can match the
// same opening character, and the plain arm would otherwise shadow
// this one.
quote_start
if self
.dialect
.is_nested_delimited_identifier_start(quote_start)
&& self
.dialect
.peek_nested_delimited_identifier_quotes(chars.peekable.clone())
.is_some() =>
{
let Some((quote_start, nested_quote_start)) = self
.dialect
.peek_nested_delimited_identifier_quotes(chars.peekable.clone())
else {
return self.tokenizer_error(
chars.location(),
format!("Expected nested delimiter '{quote_start}' before EOF."),
);
};
let Some(nested_quote_start) = nested_quote_start else {
let word = self.tokenize_quoted_identifier(quote_start, chars)?;
return Ok(Some(Token::make_word(&word, Some(quote_start))));
};
let mut word = vec![];
let quote_end = Word::matching_end_quote(quote_start);
let nested_quote_end = Word::matching_end_quote(nested_quote_start);
let error_loc = chars.location();
chars.next(); // skip the first delimiter
peeking_take_while(chars, |ch| ch.is_whitespace());
if chars.peek() != Some(&nested_quote_start) {
return self.tokenizer_error(
error_loc,
format!("Expected nested delimiter '{nested_quote_start}' before EOF."),
);
}
word.push(nested_quote_start.into());
word.push(self.tokenize_quoted_identifier(nested_quote_start, chars)?);
word.push(nested_quote_end.into());
peeking_take_while(chars, |ch| ch.is_whitespace());
if chars.peek() != Some(&quote_end) {
return self.tokenizer_error(
error_loc,
format!("Expected close delimiter '{quote_end}' before EOF."),
);
}
chars.next(); // skip close delimiter
Ok(Some(Token::make_word(&word.concat(), Some(quote_start))))
}
// delimited (quoted) identifier
quote_start if self.dialect.is_delimited_identifier_start(ch) => {
let word = self.tokenize_quoted_identifier(quote_start, chars)?;
Ok(Some(Token::make_word(&word, Some(quote_start))))
}
// numbers and period
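// Handles forms such as 123, 1_000 (with separator support), 0xCAFE,
// 12.5, .5, and 1e-3; a lone '.' is emitted as Token::Period.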
'0'..='9' | '.' => {
// Some dialects support underscore as number separator
// There can only be one at a time and it must be followed by another digit
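// Note: the separator check uses is_ascii_hexdigit so the same closure
// can also validate separators inside the 0x hex-literal scan below.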
let is_number_separator = |ch: char, next_char: Option<char>| {
self.dialect.supports_numeric_literal_underscores()
&& ch == '_'
&& next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit())
};
let mut s = peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
});
// match a hex literal that starts with 0x, e.g. 0xCAFE
if s == "0" && chars.peek() == Some(&'x') {
chars.next();
let s2 = peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_hexdigit() || is_number_separator(ch, next_ch)
});
return Ok(Some(Token::HexStringLiteral(s2)));
}
// match one period
if let Some('.') = chars.peek() {
s.push('.');
chars.next();
}
// If the dialect supports identifiers that start with a numeric prefix
// and we have now consumed a dot, check if the previous token was a Word.
// If so, what follows is definitely not part of a decimal number and
// we should yield the dot as a dedicated token so compound identifiers
// starting with digits can be parsed correctly.
if s == "." && self.dialect.supports_numeric_prefix() {
if let Some(Token::Word(_)) = prev_token {
return Ok(Some(Token::Period));
}
}
// Consume fractional digits.
s += &peeking_next_take_while(chars, |ch, next_ch| {
ch.is_ascii_digit() || is_number_separator(ch, next_ch)
});
// No fraction -> Token::Period
if s == "." {
return Ok(Some(Token::Period));
}
// Parse exponent as number
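// The lookahead runs on a clone of the iterator, so input like `1e`
// or `1e+` (no exponent digits) leaves the 'e' to be tokenized later.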
let mut exponent_part = String::new();
if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') {
let mut char_clone = chars.peekable.clone();
exponent_part.push(char_clone.next().unwrap());
// Optional sign
match char_clone.peek() {
Some(&c) if matches!(c, '+' | '-') => {
exponent_part.push(c);
char_clone.next();
}
_ => (),
}
match char_clone.peek() {
// Definitely an exponent, get original iterator up to speed and use it
Some(&c) if c.is_ascii_digit() => {
for _ in 0..exponent_part.len() {
chars.next();
}
exponent_part +=
&peeking_take_while(chars, |ch| ch.is_ascii_digit());
s += exponent_part.as_str();
}
// Not an exponent, discard the work done
_ => (),
}
}
// If the dialect supports identifiers that start with a numeric prefix,
// we need to check if the value is in fact an identifier and must thus
// be tokenized as a word.
if self.dialect.supports_numeric_prefix() {
if exponent_part.is_empty() {
// If it is not a number with an exponent, it may be
// an identifier starting with digits.
let word =
peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch));
if !word.is_empty() {
s += word.as_str();
return Ok(Some(Token::make_word(s.as_str(), None)));
}
} else if prev_token == Some(&Token::Period) {
// If the previous token was a period, thus not belonging to a number,
// the value we have is part of an identifier.
return Ok(Some(Token::make_word(s.as_str(), None)));
}
}
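// An optional trailing 'L' (e.g. 100L) marks the literal as a long number.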
let long = if chars.peek() == Some(&'L') {
chars.next();
true
} else {
false
};
Ok(Some(Token::Number(s, long)))
}
// punctuation
'(' => self.consume_and_return(chars, Token::LParen),
')' => self.consume_and_return(chars, Token::RParen),
',' => self.consume_and_return(chars, Token::Comma),
// operators
'-' => {
chars.next(); // consume the '-'
match chars.peek() {
Some('-') => {
let mut is_comment = true;
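// Some dialects (MySQL, for example) treat `--` as a comment opener
// only when followed by whitespace; nth(1) peeks at the character
// after the second '-'.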
if self.dialect.requires_single_line_comment_whitespace() {
is_comment = Some(' ') == chars.peekable.clone().nth(1);
}
if is_comment {
chars.next(); // consume second '-'
let comment = self.tokenize_single_line_comment(chars);
return Ok(Some(Token::Whitespace(
Whitespace::SingleLineComment {
prefix: "--".to_owned(),
comment,
},
)));
}
self.start_binop(chars, "-", Token::Minus)
}
Some('>') => {
chars.next();
match chars.peek() {
Some('>') => self.consume_for_binop(chars, "->>", Token::LongArrow),
_ => self.start_binop(chars, "->", Token::Arrow),
}
}
// a regular '-' operator
_ => self.start_binop(chars, "-", Token::Minus),
}
}
'/' => {
chars.next(); // consume the '/'
match chars.peek() {
Some('*') => {
chars.next(); // consume the '*', starting a multi-line comment
self.tokenize_multiline_comment(chars)
}
Some('/') if dialect_of!(self is SnowflakeDialect) => {
chars.next(); // consume the second '/', starting a snowflake single-line comment
let comment = self.tokenize_single_line_comment(chars);
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
prefix: "//".to_owned(),
comment,
})))
}
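// DuckDB integer division, e.g. 5 // 2 evaluates to 2.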
Some('/') if dialect_of!(self is DuckDbDialect | GenericDialect) => {
self.consume_and_return(chars, Token::DuckIntDiv)
}
// a regular '/' operator
_ => Ok(Some(Token::Div)),
}
}
'+' => self.consume_and_return(chars, Token::Plus),
'*' => self.consume_and_return(chars, Token::Mul),
'%' => {
chars.next(); // advance past '%'
match chars.peek() {
Some(s) if s.is_whitespace() => Ok(Some(Token::Mod)),
Some(sch) if self.dialect.is_identifier_start('%') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars)
}
_ => self.start_binop(chars, "%", Token::Mod),
}
}
'|' => {
chars.next(); // consume the '|'
match chars.peek() {
Some('/') => self.consume_for_binop(chars, "|/", Token::PGSquareRoot),
Some('|') => {
chars.next(); // consume the second '|'
match chars.peek() {
Some('/') => {
self.consume_for_binop(chars, "||/", Token::PGCubeRoot)
}
_ => self.start_binop(chars, "||", Token::StringConcat),
}
}
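// PostgreSQL geometric operators such as |&> and |>>.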
Some('&') if self.dialect.supports_geometric_types() => {
chars.next(); // consume
match chars.peek() {
Some('>') => self.consume_for_binop(
chars,
"|&>",
Token::VerticalBarAmpersandRightAngleBracket,
),
_ => self.start_binop_opt(chars, "|&", None),
}
}
Some('>') if self.dialect.supports_geometric_types() => {
chars.next(); // consume
match chars.peek() {
Some('>') => self.consume_for_binop(
chars,
"|>>",
Token::VerticalBarShiftRight,
),
_ => self.start_binop_opt(chars, "|>", None),
}
}
Some('>') if self.dialect.supports_pipe_operator() => {
self.consume_for_binop(chars, "|>", Token::VerticalBarRightAngleBracket)
}
// Bitwise OR '|' operator
_ => self.start_binop(chars, "|", Token::Pipe),
}
}
'=' => {
chars.next(); // consume
match chars.peek() {
Some('>') => self.consume_and_return(chars, Token::RArrow),
Some('=') => self.consume_and_return(chars, Token::DoubleEq),
_ => Ok(Some(Token::Eq)),
}
}
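// '!' begins !=, !! and the Postgres regex-negation family !~, !~*,
// !~~, !~~*.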
'!' => {
chars.next(); // consume
match chars.peek() {
Some('=') => self.consume_and_return(chars, Token::Neq),
Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
Some('~') => {
chars.next();
match chars.peek() {
Some('*') => self
.consume_and_return(chars, Token::ExclamationMarkTildeAsterisk),
Some('~') => {
chars.next();
match chars.peek() {
Some('*') => self.consume_and_return(
chars,
Token::ExclamationMarkDoubleTildeAsterisk,
),
_ => Ok(Some(Token::ExclamationMarkDoubleTilde)),
}
}
_ => Ok(Some(Token::ExclamationMarkTilde)),
}
}
_ => Ok(Some(Token::ExclamationMark)),
}
}
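// '<' begins <=, <=> (MySQL's null-safe equality), <>, <<, and several
// geometric operators.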
'<' => {
chars.next(); // consume
match chars.peek() {
Some('=') => {
chars.next();
match chars.peek() {
Some('>') => self.consume_for_binop(chars, "<=>", Token::Spaceship),
_ => self.start_binop(chars, "<=", Token::LtEq),
}
}
Some('|') if self.dialect.supports_geometric_types() => {
self.consume_for_binop(chars, "<<|", Token::ShiftLeftVerticalBar)
}
Some('>') => self.consume_for_binop(chars, "<>", Token::Neq),
Some('<') if self.dialect.supports_geometric_types() => {
chars.next(); // consume
match chars.peek() {
Some('|') => self.consume_for_binop(
chars,
"<<|",
Token::ShiftLeftVerticalBar,
),
_ => self.start_binop(chars, "<<", Token::ShiftLeft),
}
}
Some('<') => self.consume_for_binop(chars, "<<", Token::ShiftLeft),
Some('-') if self.dialect.supports_geometric_types() => {
chars.next(); // consume
match chars.peek() {
Some('>') => {
self.consume_for_binop(chars, "<->", Token::TwoWayArrow)
}
_ => self.start_binop_opt(chars, "<-", None),
}
}
Some('^') if self.dialect.supports_geometric_types() => {
self.consume_for_binop(chars, "<^", Token::LeftAngleBracketCaret)
}
Some('@') => self.consume_for_binop(chars, "<@", Token::ArrowAt),
_ => self.start_binop(chars, "<", Token::Lt),
}
}
'>' => {
chars.next(); // consume
match chars.peek() {
Some('=') => self.consume_for_binop(chars, ">=", Token::GtEq),
Some('>') => self.consume_for_binop(chars, ">>", Token::ShiftRight),
Some('^') if self.dialect.supports_geometric_types() => {
self.consume_for_binop(chars, ">^", Token::RightAngleBracketCaret)
}
_ => self.start_binop(chars, ">", Token::Gt),
}
}
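// ':' begins the '::' cast operator and ':=' assignment.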
':' => {
chars.next();
match chars.peek() {
Some(':') => self.consume_and_return(chars, Token::DoubleColon),
Some('=') => self.consume_and_return(chars, Token::Assignment),
_ => Ok(Some(Token::Colon)),
}
}
';' => self.consume_and_return(chars, Token::SemiColon),
'\\' => self.consume_and_return(chars, Token::Backslash),
'[' => self.consume_and_return(chars, Token::LBracket),
']' => self.consume_and_return(chars, Token::RBracket),
'&' => {
chars.next(); // consume the '&'
match chars.peek() {
Some('>') if self.dialect.supports_geometric_types() => {
// consume_and_return consumes the '>' itself; an extra
// chars.next() here would swallow the following character
self.consume_and_return(chars, Token::AmpersandRightAngleBracket)
}
Some('<') if self.dialect.supports_geometric_types() => {
chars.next(); // consume
match chars.peek() {
Some('|') => self.consume_and_return(
chars,
Token::AmpersandLeftAngleBracketVerticalBar,
),
_ => {
self.start_binop(chars, "&<", Token::AmpersandLeftAngleBracket)
}
}
}
Some('&') => {
chars.next(); // consume the second '&'
self.start_binop(chars, "&&", Token::Overlap)
}
// Bitwise AND '&' operator
_ => self.start_binop(chars, "&", Token::Ampersand),
}
}
'^' => {
chars.next(); // consume the '^'
match chars.peek() {
Some('@') => self.consume_and_return(chars, Token::CaretAt),
_ => Ok(Some(Token::Caret)),
}
}
'{' => self.consume_and_return(chars, Token::LBrace),
'}' => self.consume_and_return(chars, Token::RBrace),
'#' if dialect_of!(self is SnowflakeDialect | BigQueryDialect | MySqlDialect | HiveDialect) =>
{
chars.next(); // consume the '#', starting a '#'-prefixed single-line comment (MySQL/Hive style)
let comment = self.tokenize_single_line_comment(chars);
Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
prefix: "#".to_owned(),
comment,
})))
}
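// '~' begins the Postgres pattern-match family: ~, ~*, ~~ (LIKE) and
// ~~* (ILIKE).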
'~' => {
chars.next(); // consume
match chars.peek() {
Some('*') => self.consume_for_binop(chars, "~*", Token::TildeAsterisk),
Some('=') if self.dialect.supports_geometric_types() => {
self.consume_for_binop(chars, "~=", Token::TildeEqual)
}
Some('~') => {
chars.next();
match chars.peek() {
Some('*') => {
self.consume_for_binop(chars, "~~*", Token::DoubleTildeAsterisk)
}
_ => self.start_binop(chars, "~~", Token::DoubleTilde),
}
}
_ => self.start_binop(chars, "~", Token::Tilde),
}
}
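// '#' begins the Postgres JSON path operators #>, #>> and #-, or a
// '#'-prefixed identifier where the dialect allows one.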
'#' => {
chars.next();
match chars.peek() {
Some('-') => self.consume_for_binop(chars, "#-", Token::HashMinus),
Some('>') => {
chars.next();
match chars.peek() {
Some('>') => {
self.consume_for_binop(chars, "#>>", Token::HashLongArrow)
}
_ => self.start_binop(chars, "#>", Token::HashArrow),
}
}
Some(' ') => Ok(Some(Token::Sharp)),
Some('#') if self.dialect.supports_geometric_types() => {
self.consume_for_binop(chars, "##", Token::DoubleSharp)
}
Some(sch) if self.dialect.is_identifier_start('#') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars)
}
_ => self.start_binop(chars, "#", Token::Sharp),
}
}
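// '@' begins operators such as @> (contains) and @? as well as
// @-prefixed variables in dialects that allow them.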
'@' => {
chars.next();
match chars.peek() {
Some('@') if self.dialect.supports_geometric_types() => {
self.consume_and_return(chars, Token::AtAt)
}
Some('-') if self.dialect.supports_geometric_types() => {
chars.next();
match chars.peek() {
Some('@') => self.consume_and_return(chars, Token::AtDashAt),
_ => self.start_binop_opt(chars, "@-", None),
}
}
Some('>') => self.consume_and_return(chars, Token::AtArrow),
Some('?') => self.consume_and_return(chars, Token::AtQuestion),
Some('@') => {
chars.next();
match chars.peek() {
Some(' ') => Ok(Some(Token::AtAt)),
Some(tch) if self.dialect.is_identifier_start('@') => {
self.tokenize_identifier_or_keyword([ch, '@', *tch], chars)
}
_ => Ok(Some(Token::AtAt)),
}
}
Some(' ') => Ok(Some(Token::AtSign)),
// We break on quotes here, because no dialect allows identifiers starting
// with @ and containing quotation marks (e.g. `@'foo'`) unless they are
// quoted, which is tokenized as a quoted string, not here (e.g.
// `"@'foo'"`). Further, at least two dialects parse `@` followed by a
// quoted string as two separate tokens, which this allows. For example,
// Postgres parses `@'1'` as the absolute value of '1' which is implicitly
// cast to a numeric type. And when parsing MySQL-style grantees (e.g.
// `GRANT ALL ON *.* to 'root'@'localhost'`), we also want separate tokens
// for the user, the `@`, and the host.
Some('\'') => Ok(Some(Token::AtSign)),
Some('\"') => Ok(Some(Token::AtSign)),
Some('`') => Ok(Some(Token::AtSign)),
Some(sch) if self.dialect.is_identifier_start('@') => {
self.tokenize_identifier_or_keyword([ch, *sch], chars)
}
_ => Ok(Some(Token::AtSign)),
}
}
// Postgres uses ? for jsonb operators, not prepared statements
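// e.g. '{"a":1}'::jsonb ? 'a' tests key existence; ?| and ?& test any
// and all of a key list.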
'?' if self.dialect.supports_geometric_types() => {
chars.next(); // consume
match chars.peek() {
Some('|') => {
chars.next();
match chars.peek() {
Some('|') => self.consume_and_return(
chars,
Token::QuestionMarkDoubleVerticalBar,
),
_ => Ok(Some(Token::QuestionPipe)),
}
}
Some('&') => self.consume_and_return(chars, Token::QuestionAnd),
Some('-') => {
chars.next(); // consume
match chars.peek() {
Some('|') => self
.consume_and_return(chars, Token::QuestionMarkDashVerticalBar),
_ => Ok(Some(Token::QuestionMarkDash)),
}
}
Some('#') => self.consume_and_return(chars, Token::QuestionMarkSharp),
// the '?' itself was already consumed above; consuming another
// character here would swallow the token that follows
_ => Ok(Some(Token::Question)),
}
}
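// Prepared-statement placeholder: a bare '?' or a numbered form such as ?1.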
'?' => {
chars.next();
let s = peeking_take_while(chars, |ch| ch.is_numeric());
Ok(Some(Token::Placeholder(String::from("?") + &s)))
}
// identifier or keyword
ch if self.dialect.is_identifier_start(ch) => {
self.tokenize_identifier_or_keyword([ch], chars)
}
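// '$' introduces dollar-quoted strings ($tag$ ... $tag$) as well as
// numbered placeholders such as $1.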
'$' => Ok(Some(self.tokenize_dollar_preceded_value(chars)?)),
// whitespace check (including unicode chars) should be last as it covers some of the chars above
ch if ch.is_whitespace() => {
self.consume_and_return(chars, Token::Whitespace(Whitespace::Space))
}
other => self.consume_and_return(chars, Token::Char(other)),
},
None => Ok(None),
}
}