in lib/src/parser.c [316:497]
// Lex the next lookahead token for stack version `version` while the parser
// is in `parse_state`, and return it as a newly created Subtree.
//
// Lexing starts at the stack version's current position. At each attempt the
// external scanner is tried first (only when `valid_external_tokens` is
// non-NULL), then the language's internal lex function. If neither yields a
// token, the lex mode is switched to the one for ERROR_STATE and lexing
// restarts from the original start position. If that fails as well, input is
// skipped until some token can be lexed or a lookahead of 0 is reached; in
// that case the skipped range is returned as an error subtree and the token
// that ended the skipping is not part of the result.
//
// Parameters:
//   self        - the parser; its lexer, stack, tree pool and external
//                 scanner payload are all used here.
//   version     - the stack version whose position and last external token
//                 seed the lexer and the external scanner.
//   parse_state - selects the lex mode and is recorded on the new subtree.
//
// Returns either an error subtree (when input had to be skipped) or a leaf
// subtree for the recognized token.
static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId parse_state) {
Length start_position = ts_stack_position(self->stack, version);
Subtree external_token = ts_stack_last_external_token(self->stack, version);
TSLexMode lex_mode = self->language->lex_modes[parse_state];
const bool *valid_external_tokens = ts_language_enabled_external_tokens(
self->language,
lex_mode.external_lex_state
);
bool found_external_token = false;
bool error_mode = parse_state == ERROR_STATE;
bool skipped_error = false;
int32_t first_error_character = 0;
Length error_start_position = length_zero();
Length error_end_position = length_zero();
// Filled in by every ts_lexer_finish call below; used after the loop to
// compute the `lookahead_bytes` stored on the resulting subtree.
uint32_t lookahead_end_byte = 0;
ts_lexer_reset(&self->lexer, start_position);
for (;;) {
// Remember where this attempt begins so the lexer can be rewound if the
// external scanner does not produce an acceptable token.
Length current_position = self->lexer.current_position;
if (valid_external_tokens) {
LOG(
"lex_external state:%d, row:%u, column:%u",
lex_mode.external_lex_state,
current_position.extent.row + 1,
current_position.extent.column
);
ts_lexer_start(&self->lexer);
// Restore the scanner from the last external token on this stack version
// before every scan attempt.
ts_parser__restore_external_scanner(self, external_token);
bool found_token = self->language->external_scanner.scan(
self->external_scanner_payload,
&self->lexer.data,
valid_external_tokens
);
ts_lexer_finish(&self->lexer, &lookahead_end_byte);
// Zero-length external tokens are generally allowed, but they're not
// allowed right after a syntax error. This is for two reasons:
// 1. After a syntax error, the lexer is looking for any possible token,
// as opposed to the specific set of tokens that are valid in some
// parse state. In this situation, it's very easy for an external
// scanner to produce unwanted zero-length tokens.
// 2. The parser sometimes inserts *missing* tokens to recover from
// errors. These tokens are also zero-length. If we allow more
// zero-length tokens to be created after missing tokens, it
// can lead to infinite loops. Forbidding zero-length tokens
// right at the point of error recovery is a conservative strategy
// for preventing this kind of infinite loop.
if (found_token && (
self->lexer.token_end_position.bytes > current_position.bytes ||
(!error_mode && ts_stack_has_advanced_since_error(self->stack, version))
)) {
found_external_token = true;
break;
}
// The external scanner found nothing usable: rewind and fall through to
// the internal lexer from the same position.
ts_lexer_reset(&self->lexer, current_position);
}
LOG(
"lex_internal state:%d, row:%u, column:%u",
lex_mode.lex_state,
current_position.extent.row + 1,
current_position.extent.column
);
ts_lexer_start(&self->lexer);
bool found_token = self->language->lex_fn(&self->lexer.data, lex_mode.lex_state);
ts_lexer_finish(&self->lexer, &lookahead_end_byte);
if (found_token) break;
// First failure outside error mode: switch to the ERROR_STATE lex mode
// (recomputing the enabled external tokens for it) and retry from the
// original start position. This happens at most once per call.
if (!error_mode) {
error_mode = true;
lex_mode = self->language->lex_modes[ERROR_STATE];
valid_external_tokens = ts_language_enabled_external_tokens(
self->language,
lex_mode.external_lex_state
);
ts_lexer_reset(&self->lexer, start_position);
continue;
}
// Already in error mode and still no token: begin (once) recording the
// skipped range and the first character in it.
if (!skipped_error) {
LOG("skip_unrecognized_character");
skipped_error = true;
error_start_position = self->lexer.token_start_position;
error_end_position = self->lexer.token_start_position;
first_error_character = self->lexer.data.lookahead;
}
// Only advance explicitly when the failed attempt left the lexer exactly at
// the current end of the skipped range; otherwise the lexer has already
// moved and the range is simply extended to its current position below.
if (self->lexer.current_position.bytes == error_end_position.bytes) {
// A lookahead of 0 stops the skipping and yields an error result
// (presumably 0 marks end of input -- confirm against the lexer).
if (self->lexer.data.lookahead == 0) {
self->lexer.data.result_symbol = ts_builtin_sym_error;
break;
}
self->lexer.data.advance(&self->lexer.data, false);
}
error_end_position = self->lexer.current_position;
}
Subtree result;
if (skipped_error) {
// Error subtree: padding runs from the lex start to the first skipped
// position, size covers the skipped range itself.
Length padding = length_sub(error_start_position, start_position);
Length size = length_sub(error_end_position, error_start_position);
uint32_t lookahead_bytes = lookahead_end_byte - error_end_position.bytes;
result = ts_subtree_new_error(
&self->tree_pool,
first_error_character,
padding,
size,
lookahead_bytes,
parse_state,
self->language
);
LOG(
"lexed_lookahead sym:%s, size:%u, character:'%c'",
SYM_NAME(ts_subtree_symbol(result)),
ts_subtree_total_size(result).bytes,
first_error_character
);
} else {
// Make sure the token start never lies after the token end before padding
// and size are derived from the two positions.
if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) {
self->lexer.token_start_position = self->lexer.token_end_position;
}
bool is_keyword = false;
TSSymbol symbol = self->lexer.data.result_symbol;
Length padding = length_sub(self->lexer.token_start_position, start_position);
Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position);
uint32_t lookahead_bytes = lookahead_end_byte - self->lexer.token_end_position.bytes;
if (found_external_token) {
// The external scanner's result symbol is translated through its
// symbol_map before being stored on the leaf.
symbol = self->language->external_scanner.symbol_map[symbol];
} else if (symbol == self->language->keyword_capture_token && symbol != 0) {
// The token is the language's keyword capture token: re-lex the same span
// with the keyword lex function. The keyword symbol replaces the original
// one only if the keyword ends at the same byte as the original token and
// `parse_state` has actions for that keyword symbol.
uint32_t end_byte = self->lexer.token_end_position.bytes;
ts_lexer_reset(&self->lexer, self->lexer.token_start_position);
ts_lexer_start(&self->lexer);
if (
self->language->keyword_lex_fn(&self->lexer.data, 0) &&
self->lexer.token_end_position.bytes == end_byte &&
ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol)
) {
is_keyword = true;
symbol = self->lexer.data.result_symbol;
}
}
result = ts_subtree_new_leaf(
&self->tree_pool,
symbol,
padding,
size,
lookahead_bytes,
parse_state,
found_external_token,
is_keyword,
self->language
);
if (found_external_token) {
// Serialize the external scanner's state (using the lexer's debug_buffer as
// scratch space) and store it on the new leaf.
// NOTE(review): the cast assumes a leaf created with
// found_external_token == true is heap-allocated -- confirm in
// ts_subtree_new_leaf.
unsigned length = self->language->external_scanner.serialize(
self->external_scanner_payload,
self->lexer.debug_buffer
);
ts_external_scanner_state_init(
&((SubtreeHeapData *)result.ptr)->external_scanner_state,
self->lexer.debug_buffer,
length
);
}
LOG(
"lexed_lookahead sym:%s, size:%u",
SYM_NAME(ts_subtree_symbol(result)),
ts_subtree_total_size(result).bytes
);
}
return result;
}