void Tokenizer::get_tokens()

in mysqlshdk/libs/db/mysqlx/tokenizer.cc [300:503]


void Tokenizer::get_tokens() {
  bool arrow_last = false;
  bool inside_arrow = false;
  for (size_t i = 0; i < _input.size(); ++i) {
    char c = _input[i];
    if (std::isspace(c)) {
      // do nothing
      continue;
    } else if (std::isdigit(c)) {
      // numerical literal
      const int start = i;
      // floating grammar is
      // float -> int '.' (int | (int expo[sign] int))
      // int -> digit +
      // expo -> 'E' | 'e'
      // sign -> '-' | '+'
      while (i < _input.size() && std::isdigit(c = _input[i])) ++i;
      if (i < _input.size() && _input[i] == '.') {
        ++i;
        while (i < _input.size() && std::isdigit(_input[i])) ++i;
        if (i < _input.size() && std::toupper(_input[i]) == 'E') {
          ++i;
          if (i < _input.size() && (((c = _input[i]) == '-') || (c == '+')))
            ++i;
          size_t j = i;
          while (i < _input.size() && std::isdigit(_input[i])) i++;
          if (i == j)
            throw Parser_error(
                "Missing exponential value for floating point at position " +
                std::to_string(start));
        }
        _tokens.push_back(Token(Token::Type::LNUM,
                                std::string(_input, start, i - start), start));
      } else {
        _tokens.push_back(Token(Token::Type::LINTEGER,
                                std::string(_input, start, i - start), start));
      }
      if (i < _input.size()) --i;
    } else if (!std::isalpha(c) && c != '_') {
      // # non-identifier, e.g. operator or quoted literal
      if (c == '?') {
        _tokens.push_back(
            Token(Token::Type::PLACEHOLDER, std::string(1, c), i));
      } else if (c == '+') {
        _tokens.push_back(Token(Token::Type::PLUS, std::string(1, c), i));
      } else if (c == '-') {
        if (!arrow_last && next_char_is(i, '>')) {
          if (next_char_is(i + 1, '>')) {
            _tokens.push_back(Token(Token::Type::TWOHEADARROW, "->>", i));
            i += 2;
          } else {
            _tokens.push_back(Token(Token::Type::ARROW, "->", i++));
          }
          arrow_last = true;
          continue;
        } else {
          _tokens.push_back(Token(Token::Type::MINUS, std::string(1, c), i));
        }
      } else if (c == '*') {
        if (next_char_is(i, '*')) {
          _tokens.push_back(
              Token(Token::Type::DOUBLESTAR, std::string("**"), i++));
        } else {
          _tokens.push_back(Token(Token::Type::MUL, std::string(1, c), i));
        }
      } else if (c == '/') {
        _tokens.push_back(Token(Token::Type::DIV, std::string(1, c), i));
      } else if (c == '$') {
        _tokens.push_back(Token(Token::Type::DOLLAR, std::string(1, c), i));
      } else if (c == '%') {
        _tokens.push_back(Token(Token::Type::MOD, std::string(1, c), i));
      } else if (c == '=') {
        _tokens.push_back(Token(Token::Type::EQ, std::string(1, c), i));
      } else if (c == '&') {
        _tokens.push_back(Token(Token::Type::BITAND, std::string(1, c), i));
      } else if (c == '|') {
        _tokens.push_back(Token(Token::Type::BITOR, std::string(1, c), i));
      } else if (c == '(') {
        _tokens.push_back(Token(Token::Type::LPAREN, std::string(1, c), i));
      } else if (c == ')') {
        _tokens.push_back(Token(Token::Type::RPAREN, std::string(1, c), i));
      } else if (c == '[') {
        _tokens.push_back(Token(Token::Type::LSQBRACKET, std::string(1, c), i));
      } else if (c == ']') {
        _tokens.push_back(Token(Token::Type::RSQBRACKET, std::string(1, c), i));
      } else if (c == '{') {
        _tokens.push_back(Token(Token::Type::LCURLY, std::string(1, c), i));
      } else if (c == '}') {
        _tokens.push_back(Token(Token::Type::RCURLY, std::string(1, c), i));
      } else if (c == '~') {
        _tokens.push_back(Token(Token::Type::NEG, std::string(1, c), i));
      } else if (c == ',') {
        _tokens.push_back(Token(Token::Type::COMMA, std::string(1, c), i));
      } else if (c == ':') {
        _tokens.push_back(Token(Token::Type::COLON, std::string(1, c), i));
      } else if (c == '!') {
        if (next_char_is(i, '=')) {
          _tokens.push_back(Token(Token::Type::NE, std::string("!="), i++));
        } else {
          _tokens.push_back(Token(Token::Type::BANG, std::string(1, c), i));
        }
      } else if (c == '<') {
        if (next_char_is(i, '<')) {
          _tokens.push_back(Token(Token::Type::LSHIFT, std::string("<<"), i++));
        } else if (next_char_is(i, '=')) {
          _tokens.push_back(Token(Token::Type::LE, std::string("<="), i++));
        } else if (next_char_is(i, '>')) {
          _tokens.push_back(Token(Token::Type::NE, std::string("!="), i++));
        } else {
          _tokens.push_back(Token(Token::Type::LT, std::string("<"), i));
        }
      } else if (c == '>') {
        if (next_char_is(i, '>')) {
          _tokens.push_back(Token(Token::Type::RSHIFT, std::string(">>"), i++));
        } else if (next_char_is(i, '=')) {
          _tokens.push_back(Token(Token::Type::GE, std::string(">="), i++));
        } else {
          _tokens.push_back(Token(Token::Type::GT, std::string(1, c), i));
        }
      } else if (c == '.') {
        if ((i + 1) < _input.size() && std::isdigit(_input[i + 1])) {
          const size_t start = i;
          ++i;
          // floating grammar is
          // float -> '.' (int | (int expo[sign] int))
          // nint->digit +
          // expo -> 'E' | 'e'
          // sign -> '-' | '+'
          while (i < _input.size() && std::isdigit(_input[i])) ++i;
          if (i < _input.size() && std::toupper(_input[i]) == 'E') {
            ++i;
            if (i < _input.size() && (((c = _input[i]) == '+') || (c == '-')))
              ++i;
            size_t j = i;
            while (i < _input.size() && std::isdigit(_input[i])) ++i;
            if (i == j)
              throw Parser_error(
                  "Missing exponential value for floating point at position " +
                  std::to_string(start));
          }
          _tokens.push_back(Token(
              Token::Type::LNUM, std::string(_input, start, i - start), start));
          if (i < _input.size()) --i;
        } else {
          _tokens.push_back(Token(Token::Type::DOT, std::string(1, c), i));
        }
      } else if (c == '\'' && arrow_last) {
        _tokens.push_back(Token(Token::Type::QUOTE, "'", i));
        if (!inside_arrow)
          inside_arrow = true;
        else {
          arrow_last = false;
          inside_arrow = false;
        }
      } else if (c == '"' || c == '\'' || c == '`') {
        char quote_char = c;
        std::string val;
        size_t start = ++i;

        while (i < _input.size()) {
          c = _input[i];
          if ((c == quote_char) && ((i + 1) < _input.size()) &&
              (_input[i + 1] != quote_char)) {
            // break if we have a quote char that's not double
            break;
          } else if ((c == quote_char) || (c == '\\' && quote_char != '`')) {
            // && quote_char != '`'
            // this quote char has to be doubled
            if ((i + 1) >= _input.size()) break;
            val.append(1, _input[++i]);
          } else {
            val.append(1, c);
          }
          ++i;
        }
        if ((i >= _input.size()) && (_input[i] != quote_char)) {
          throw Parser_error(
              "Unterminated quoted string starting at position " +
              std::to_string(start));
        }
        if (quote_char == '`') {
          _tokens.push_back(Token(Token::Type::IDENT, val, start));
        } else {
          _tokens.push_back(Token(Token::Type::LSTRING, val, start));
        }
      } else {
        throw Parser_error("Unknown character at position " +
                           std::to_string(i));
      }
    } else {
      size_t start = i;
      while (i < _input.size() && (std::isalnum(_input[i]) || _input[i] == '_'))
        ++i;
      std::string val(_input, start, i - start);
      Maps::reserved_words_t::const_iterator it = map.reserved_words.find(val);
      if (it != map.reserved_words.end()) {
        _tokens.push_back(Token(it->second, val, start));
      } else {
        _tokens.push_back(Token(Token::Type::IDENT, val, start));
      }
      --i;
    }
  }
}