cdk/parser/tokenizer.cc

/*
 * Copyright (c) 2015, 2024, Oracle and/or its affiliates.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2.0, as
 * published by the Free Software Foundation.
 *
 * This program is designed to work with certain software (including
 * but not limited to OpenSSL) that is licensed under separate terms, as
 * designated in a particular file or component or in included license
 * documentation. The authors of MySQL hereby grant you an additional
 * permission to link the program and your derivative works with the
 * separately licensed software that they have either included with
 * the program or referenced in the documentation.
 *
 * Without limiting anything contained in the foregoing, this file,
 * which is part of Connector/C++, is also subject to the
 * Universal FOSS Exception, version 1.0, a copy of which can be found at
 * https://oss.oracle.com/licenses/universal-foss-exception.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 * See the GNU General Public License, version 2.0, for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <mysql/cdk/common.h>

PUSH_SYS_WARNINGS_CDK
#include <stdexcept>
#include <memory>
#include <cstdlib>
#include <cctype>
#include <cstring>
#include <cstdlib>
POP_SYS_WARNINGS_CDK

#include "tokenizer.h"


using namespace parser;
using std::string;


/*
  Recognize the next token starting at the current position.

  Returns true if a token was recognized and false if the end of input was
  reached (in which case m_at_end is set) or no token could be recognized.
  The parse_xxx() helpers called from here can report malformed literals
  via throw_error().

  Order of checks: string, hex and number literals first (selected by the
  first character), then multi-character symbols, then single-character
  symbols and finally words.
*/

bool Tokenizer::iterator::get_next_token()
{
  skip_ws();
  m_pos = char_iterator::cur_pos();

  if (m_at_end || char_iterator::at_end())
  {
    m_at_end = true;
    return false;
  }

  // Literals and symbols are looked for only if the first byte is below 127.

  if ((unsigned)*m_pos < 127)
  {
    switch (*m_pos)
    {
    case '"': case '\'':
      if (parse_string())
        return true;
      break;

    case 'x': case 'X':
    case '0':
      if (parse_hex())
        return true;
      /*
        Not a hex literal: '0' can still start a plain number. For 'x'/'X'
        parse_number() finds neither a digit nor '.' and returns false.
      */
      // fall through

    case '.':
    case '1': case '2': case '3': case '4': case '5':
    case '6': case '7': case '8': case '9':
      if (parse_number())
        return true;
      break;

    default:
      break;
    }

    assert(!char_iterator::at_end());

    // check symbol tokens, starting with 2+ char ones

    /*
      Table built once from SYMBOL_LIST2: maps the first character of each
      symbol to the list of (symbol text, token type) pairs starting with
      that character, in the order in which SYMBOL_LIST2 lists them.
    */

    static struct symb_table_t
    {
      std::map<char, std::vector<std::pair<const char*, Token::Type>>> m_map;

      symb_table_t()
      {
#define symbol_check(T,X) \
        { \
          auto &entry = m_map[(X)[0]]; \
          entry.push_back({X,Token::T}); \
        }

        SYMBOL_LIST2(symbol_check)
      }
    }
    symb_table;

    auto it = symb_table.m_map.find((char)*m_pos);

    if (it != symb_table.m_map.end())
    {
      // Bind by reference - there is no need to copy each table entry.
      for (const auto &symb : it->second)
      {
        if (consume_chars(symb.first))
        {
          set_token(symb.second);
          return true;
        }
      }
    }

    // Single-character symbols from SYMBOL_LIST1.

    switch (*m_pos)
    {
#define symbol_check1(T,X) \
    case (X)[0]: consume_char(*m_pos); set_token(Token::T); return true;

    SYMBOL_LIST1(symbol_check1)

    default:
      break;
    }
  }

  /*
    Note: it is important to parse word last as some words can qualify as
    other tokens.
  */

  if (parse_word())
    return true;

  return false;
}


/*
  Consume a (possibly empty) run of decimal digits at the current position.

  Returns true if at least one digit was consumed.
*/

bool Tokenizer::iterator::parse_digits() noexcept
{
  bool has_digits = false;

  while (!char_iterator::at_end() && cur_char_in("0123456789"))
  {
    has_digits = true;
    next_unit();
  }

  return has_digits;
}


/*
  Parse number literal starting at the current position.

  Returns false if no number literal starts at the current position.
  Otherwise sets the current token to Token::INTEGER or Token::NUMBER,
  leaves the current position right after the literal and returns true.
  Reports an error via throw_error() if there are no digits after
  the decimal point or in the exponent.

  The grammar used for numeric literals:

    number -> int | float
    int    -> digit+
    float  -> digit* '.' digit+ expo? | digit+ expo
    expo   -> ('E'|'e') ('+'|'-')? digit+

  which is replaced by equivalent:

    number -> digit* ('.' digit+)? expo?

  with extra check that there is at least one digit if fractional part
  is missing.

  Original grammar for floating numbers:

    FLOAT ::= DIGIT* '.' DIGIT+ ('E' ('+'|'-')? DIGIT+)?
            | DIGIT+ 'E' ('+'|'-')? DIGIT+
*/

bool Tokenizer::iterator::parse_number()
{
  if (at_end())
    return false;

  bool is_float = false;
  bool exponent = false;

  /*
    Note: '.' starts NUMBER token only if followed by a digit. Otherwise
    it is a single DOT token. This includes the case where '.' is the last
    character of the input: there is no following digit, so it must not be
    treated as the start of a number (which would end with a "No digits
    after decimal point" error below).
  */

  if (cur_char_is(L'.') &&
      (char_iterator::at_end(1) || !next_char_in("0123456789")))
    return false;

  // Parse leading digits, if any

  if (!parse_digits() && !cur_char_is('.'))
  {
    return false;
  }

  // Handle decimal point, if any

  if (!char_iterator::at_end() && consume_char('.'))
  {
    is_float = true;
    if (!parse_digits())
      throw_error("No digits after decimal point");
  }

  // See if we have exponent (but it is not parsed yet)

  if (!char_iterator::at_end() && consume_char("Ee"))
  {
    is_float = true;
    exponent = true;
  }

  /*
    If nothing indicates a floating number, we have already parsed
    the digits of an integer number and we can report it now.
  */

  if (!is_float)
  {
    set_token(Token::INTEGER);
    return true;
  }

  // Parse exponent if present.

  if (exponent)
  {
    // Optional sign - the result is deliberately ignored.
    consume_char("+-");

    if (!parse_digits())
      throw_error("No digits in the exponent");
  }

  // Report floating number.

  set_token(Token::NUMBER);
  return true;
}


/*
  Check if we have a Hexadecimal literal:

    X'12ab'
    x'12ab'
    0x12ab

  Returns false (without consuming anything) if the current position does
  not start with one of the prefixes above. Otherwise sets a Token::HEX
  token, passing to set_token() the position of the first hex digit, and
  returns true. Reports an error via throw_error() if there are no hex
  digits after the prefix or if the closing quote of the X'...' form is
  missing.
*/

bool Tokenizer::iterator::parse_hex()
{
  if (char_iterator::at_end())
    return false;

  if (!cur_char_in("Xx0"))
    return false;

  switch (cur_char())
  {
  case 'X': case 'x':
  {
    if (char_iterator::at_end(1) || !next_char_is('\''))
      return false;

    // Skip the X' prefix.
    next_unit();
    next_unit();

    pos_type start = char_iterator::cur_pos();

    if (!parse_hex_digits())
      throw_error("Unexpected character inside hex literal");

    // The token is set before the closing quote is consumed.
    set_token(Token::HEX, start);

    if (char_iterator::at_end() || !consume_char('\''))
      throw_error("Unexpected character inside hex literal");

    return true;
  }

  case '0':
  {
    if (char_iterator::at_end(1) || !next_char_in("Xx"))
      return false;

    // Skip the 0x prefix.
    next_unit();
    next_unit();

    pos_type start = char_iterator::cur_pos();

    if (!parse_hex_digits())
      throw_error("No hex digits found after 0x");

    set_token(Token::HEX, start);
    return true;
  }

  default:
    return false;
  }
}


/*
  Consume a (possibly empty) run of hexadecimal digits at the current
  position.

  Returns true if at least one hex digit was consumed.
*/

bool Tokenizer::iterator::parse_hex_digits() noexcept
{
  bool found = false;

  while (!char_iterator::at_end() && consume_char("0123456789ABCDEFabcdef"))
    found = true;

  return found;
}


/*
  See if next token is:

    WORD  - plain word
    QWORD - word quoted in back-ticks

  Returns true if one of these tokens was recognized and set.
*/

bool Tokenizer::iterator::parse_word()
{
  if (char_iterator::at_end())
    return false;

  if (cur_char_is('`'))
  {
    parse_quotted_string('`');
    set_tok_type(Token::QWORD);
    return true;
  }

  bool has_word = false;

  while (!char_iterator::at_end() && cur_char_is_word())
  {
    next_unit();
    has_word = true;
  }

  if (!has_word)
    return false;

  set_token(Token::WORD);
  return true;
}


/*
  See if next token is:

    QSTRING  - a string in single quotes
    QQSTRING - a string in double quotes

  Returns true if one of these tokens was recognized and set.
*/

bool Tokenizer::iterator::parse_string()
{
  char_t quote = cur_char();

  if (!(U'\"' == quote || U'\'' == quote))
    return false;

  if (!parse_quotted_string((char)quote))
    return false;

  set_tok_type('\"' == quote ? Token::QQSTRING : Token::QSTRING);
  return true;
}


/*
  Parse a string delimited by the given quote character.

  Returns false if the current character is not the quote character.
  Otherwise consumes characters up to and including the closing quote,
  sets the token position to the characters between the quotes and
  returns true. Inside the string, a character following a backslash is
  consumed without being examined and a doubled quote character does not
  terminate the string.

  Reports an error via throw_error() if an invalid character is found or
  if the input ends before the closing quote.
*/

bool Tokenizer::iterator::parse_quotted_string(char qchar)
{
  if (!consume_char(qchar))
    return false;

  pos_type start_pos = char_iterator::cur_pos();

  // Store first few characters for use in error message.

  static const size_t start_len = 8;
  cdk::string error("Unterminated quoted string starting with ");
  error.push_back((char_t)qchar);

  while (!char_iterator::at_end())
  {
    // if we do not have escaped char, look at the end of the string

    if (!consume_char('\\'))
    {
      // if quote char is repeated, then it does not terminate string

      if (
        consume_char(qchar)
        && (char_iterator::at_end() || !cur_char_is(qchar))
      )
      {
        // end of the string, set token extent
        set_tok_pos(start_pos, char_iterator::cur_pos() - 1);
        return true;
      }
    }

    /*
      Consume the next character: an ordinary one, the one following
      a backslash, or the second of two repeated quote characters.
    */

    char_t c = consume_char();

    if (c == invalid_char)
      throw_error("Invalid utf8 string");

    if (char_iterator::cur_pos() < start_pos + start_len)
      error.push_back(c);
  }

  throw_error(error + "...");
  return false;  // quiet compile warnings
}


/*
  Low-level character iterator.
*/

std::locale char_iterator::m_cloc("C");


/*
  Return the bytes between a position before the current one and the
  current position.

  The start of the returned range is found by stepping a character iterator
  forward from m_ctx_beg for as long as it is at least `len` bytes before
  the current position. If `complete` is not null, it is set to true when
  the returned range starts at get_beg().
*/

bytes char_iterator::get_seen(size_t len, bool *complete)
{
  char_iterator_base it(m_ctx_beg, cur_pos());

  while (!it.at_end() && (it.cur_pos() + len <= cur_pos()))
    it++;

  if (complete)
    *complete = (it.cur_pos() == get_beg());

  return { (byte*)it.cur_pos(), (byte*)cur_pos() };
}


/*
  Return the bytes between the current position and a position ahead of it.

  A character iterator is stepped forward from the current position for as
  long as it is less than `len` bytes ahead. The end of the returned range
  is the start of the last character stepped over. If `complete` is not
  null, it is set to true when that end position equals get_end().

  NOTE(review): `pos` trails the iterator by one character, so the last
  character stepped over is not part of the returned range - confirm that
  this is intended.
*/

bytes char_iterator::get_ahead(size_t len, bool *complete)
{
  char_iterator_base it(cur_pos(), get_end());
  const char *pos = it.cur_pos();

  while (!it.at_end() && (it.cur_pos() < cur_pos() + len))
  {
    pos = it.cur_pos();
    it++;
  }

  if (complete)
    *complete = (pos == get_end());

  return { (byte*)cur_pos(), (byte*)pos };
}

cdk/parser/tokenizer.cc (252 lines of code) (raw):