static Subtree ts_parser__lex()

in lib/src/parser.c [316:497]


// Lex the next lookahead token for one version of the parse stack.
//
// Parameters:
//   self        - the parser; its lexer, language, tree pool and external
//                 scanner payload are all used here.
//   version     - the stack version whose current position and last external
//                 token are used as the starting point for lexing.
//   parse_state - the parse state used to select the lex mode from
//                 `language->lex_modes`. Passing ERROR_STATE starts lexing
//                 directly in error mode.
//
// Returns a new subtree:
//   - an error subtree (via `ts_subtree_new_error`) if one or more
//     unrecognized characters had to be skipped before a token was found, or
//   - a leaf subtree (via `ts_subtree_new_leaf`) for the token produced by
//     the external scanner, the internal lex function, or the keyword lex
//     function.
//
// On each iteration the external scanner (if any external tokens are enabled
// for the current lex mode) is tried first, then the internal lex function.
// If neither produces a token, the function first falls back to the
// ERROR_STATE lex mode, and after that skips characters one at a time.
static Subtree ts_parser__lex(TSParser *self, StackVersion version, TSStateId parse_state) {
  Length start_position = ts_stack_position(self->stack, version);
  Subtree external_token = ts_stack_last_external_token(self->stack, version);
  TSLexMode lex_mode = self->language->lex_modes[parse_state];
  // A null result here disables the external-scanner attempt in the loop below.
  const bool *valid_external_tokens = ts_language_enabled_external_tokens(
    self->language,
    lex_mode.external_lex_state
  );

  bool found_external_token = false;
  bool error_mode = parse_state == ERROR_STATE;
  bool skipped_error = false;
  int32_t first_error_character = 0;
  Length error_start_position = length_zero();
  Length error_end_position = length_zero();
  // Written by every `ts_lexer_finish` call; later used to compute the
  // `lookahead_bytes` value stored on the resulting subtree.
  uint32_t lookahead_end_byte = 0;
  ts_lexer_reset(&self->lexer, start_position);

  for (;;) {
    // Snapshot the position so the lexer can be rewound if the external
    // scanner's result is rejected.
    Length current_position = self->lexer.current_position;

    if (valid_external_tokens) {
      LOG(
        "lex_external state:%d, row:%u, column:%u",
        lex_mode.external_lex_state,
        current_position.extent.row + 1,
        current_position.extent.column
      );
      ts_lexer_start(&self->lexer);
      // Restore the scanner from the last external token on this stack
      // version before asking it to scan.
      ts_parser__restore_external_scanner(self, external_token);
      bool found_token = self->language->external_scanner.scan(
        self->external_scanner_payload,
        &self->lexer.data,
        valid_external_tokens
      );
      ts_lexer_finish(&self->lexer, &lookahead_end_byte);

      // Zero-length external tokens are generally allowed, but they're not
      // allowed right after a syntax error. This is for two reasons:
      // 1. After a syntax error, the lexer is looking for any possible token,
      //    as opposed to the specific set of tokens that are valid in some
      //    parse state. In this situation, it's very easy for an external
      //    scanner to produce unwanted zero-length tokens.
      // 2. The parser sometimes inserts *missing* tokens to recover from
      //    errors. These tokens are also zero-length. If we allow more
      //    zero-length tokens to be created after missing tokens, it
      //    can lead to infinite loops. Forbidding zero-length tokens
      //    right at the point of error recovery is a conservative strategy
      //    for preventing this kind of infinite loop.
      if (found_token && (
        self->lexer.token_end_position.bytes > current_position.bytes ||
        (!error_mode && ts_stack_has_advanced_since_error(self->stack, version))
      )) {
        found_external_token = true;
        break;
      }

      // External token rejected or not found: rewind before trying the
      // internal lex function from the same position.
      ts_lexer_reset(&self->lexer, current_position);
    }

    LOG(
      "lex_internal state:%d, row:%u, column:%u",
      lex_mode.lex_state,
      current_position.extent.row + 1,
      current_position.extent.column
    );
    ts_lexer_start(&self->lexer);
    bool found_token = self->language->lex_fn(&self->lexer.data, lex_mode.lex_state);
    ts_lexer_finish(&self->lexer, &lookahead_end_byte);
    if (found_token) break;

    // Nothing matched in the lex mode for `parse_state`: switch to the
    // ERROR_STATE lex mode (and its set of enabled external tokens) and retry
    // from the original start position.
    if (!error_mode) {
      error_mode = true;
      lex_mode = self->language->lex_modes[ERROR_STATE];
      valid_external_tokens = ts_language_enabled_external_tokens(
        self->language,
        lex_mode.external_lex_state
      );
      ts_lexer_reset(&self->lexer, start_position);
      continue;
    }

    // Still nothing in error mode. On the first such failure, record where
    // the skipped region begins and which character was the lookahead.
    if (!skipped_error) {
      LOG("skip_unrecognized_character");
      skipped_error = true;
      error_start_position = self->lexer.token_start_position;
      error_end_position = self->lexer.token_start_position;
      first_error_character = self->lexer.data.lookahead;
    }

    // If the lexer is still at the recorded end of the skipped region,
    // consume one character. A lookahead of 0 stops lexing with the builtin
    // error symbol instead (presumably 0 marks end of input -- TODO confirm).
    if (self->lexer.current_position.bytes == error_end_position.bytes) {
      if (self->lexer.data.lookahead == 0) {
        self->lexer.data.result_symbol = ts_builtin_sym_error;
        break;
      }
      self->lexer.data.advance(&self->lexer.data, false);
    }

    // Extend the skipped region up to wherever the lexer now is.
    error_end_position = self->lexer.current_position;
  }

  Subtree result;
  if (skipped_error) {
    // Characters were skipped: the result is an error subtree covering the
    // skipped region, with the text before that region as padding.
    Length padding = length_sub(error_start_position, start_position);
    Length size = length_sub(error_end_position, error_start_position);
    uint32_t lookahead_bytes = lookahead_end_byte - error_end_position.bytes;
    result = ts_subtree_new_error(
      &self->tree_pool,
      first_error_character,
      padding,
      size,
      lookahead_bytes,
      parse_state,
      self->language
    );

    LOG(
      "lexed_lookahead sym:%s, size:%u, character:'%c'",
      SYM_NAME(ts_subtree_symbol(result)),
      ts_subtree_total_size(result).bytes,
      first_error_character
    );
  } else {
    // If the reported token end precedes the token start, collapse the start
    // onto the end so the size computed below is not negative.
    if (self->lexer.token_end_position.bytes < self->lexer.token_start_position.bytes) {
      self->lexer.token_start_position = self->lexer.token_end_position;
    }

    bool is_keyword = false;
    TSSymbol symbol = self->lexer.data.result_symbol;
    Length padding = length_sub(self->lexer.token_start_position, start_position);
    Length size = length_sub(self->lexer.token_end_position, self->lexer.token_start_position);
    uint32_t lookahead_bytes = lookahead_end_byte - self->lexer.token_end_position.bytes;

    if (found_external_token) {
      // The external scanner's result symbol is translated through the
      // language's external symbol map.
      symbol = self->language->external_scanner.symbol_map[symbol];
    } else if (symbol == self->language->keyword_capture_token && symbol != 0) {
      // The token is the language's keyword capture token: re-lex the same
      // text with the keyword lex function. The keyword symbol is used only
      // if it ends at exactly the same byte and `parse_state` has actions
      // for it.
      uint32_t end_byte = self->lexer.token_end_position.bytes;
      ts_lexer_reset(&self->lexer, self->lexer.token_start_position);
      ts_lexer_start(&self->lexer);
      if (
        self->language->keyword_lex_fn(&self->lexer.data, 0) &&
        self->lexer.token_end_position.bytes == end_byte &&
        ts_language_has_actions(self->language, parse_state, self->lexer.data.result_symbol)
      ) {
        is_keyword = true;
        symbol = self->lexer.data.result_symbol;
      }
    }

    result = ts_subtree_new_leaf(
      &self->tree_pool,
      symbol,
      padding,
      size,
      lookahead_bytes,
      parse_state,
      found_external_token,
      is_keyword,
      self->language
    );

    if (found_external_token) {
      // Serialize the external scanner's state into the lexer's debug buffer
      // (used here as scratch storage) and store it on the new leaf. This
      // must happen before the LOG call below.
      // NOTE(review): LOG presumably writes to the same buffer -- confirm.
      unsigned length = self->language->external_scanner.serialize(
        self->external_scanner_payload,
        self->lexer.debug_buffer
      );
      ts_external_scanner_state_init(
        &((SubtreeHeapData *)result.ptr)->external_scanner_state,
        self->lexer.debug_buffer,
        length
      );
    }

    LOG(
      "lexed_lookahead sym:%s, size:%u",
      SYM_NAME(ts_subtree_symbol(result)),
      ts_subtree_total_size(result).bytes
    );
  }

  return result;
}