cdk/parser/tokenizer.h

/* * Copyright (c) 2015, 2024, Oracle and/or its affiliates. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License, version 2.0, as * published by the Free Software Foundation. * * This program is designed to work with certain software (including * but not limited to OpenSSL) that is licensed under separate terms, as * designated in a particular file or component or in included license * documentation. The authors of MySQL hereby grant you an additional * permission to link the program and your derivative works with the * separately licensed software that they have either included with * the program or referenced in the documentation. * * Without limiting anything contained in the foregoing, this file, * which is part of Connector/C++, is also subject to the * Universal FOSS Exception, version 1.0, a copy of which can be found at * https://oss.oracle.com/licenses/universal-foss-exception. * * This program is distributed in the hope that it will be useful, but * WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License, version 2.0, for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software Foundation, Inc., * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #ifndef _TOKENIZER_H_ #define _TOKENIZER_H_ #include <mysql/cdk.h> #include "char_iterator.h" PUSH_SYS_WARNINGS_CDK #include <string> #include <vector> #include <map> #include <set> #include <memory> #include <stdexcept> #include <sstream> #include <algorithm> POP_SYS_WARNINGS_CDK #ifdef _MSC_VER DISABLE_WARNING_CDK(4061) // not all enums listed inside switch() statement #endif #undef WORD /* Definitions of tokens recognized by tokenizer. Each macro TOKEN_LIST(), SYMBOL_LIST1() and SYMBOL_LSIT2() defines list of tokens with the following entry for each token: X(NNN,SSS) where Token::NNN is this token's enum constant and SSS, if not NULL, defines characters of the token. For tokens which are not simple 1 or 2 character sequences, but are recognized by tokenizer logic, SSS is NULL. */ #define TOKEN_LIST(X) \ X(WORD, NULL) \ X(QWORD, NULL) /* word in backtick quotes */ \ X(QSTRING, NULL) /* string in single quotes */ \ X(QQSTRING, NULL) /* string in double quotes */ \ X(NUMBER, NULL) /* floating number */ \ X(INTEGER, NULL) /* integer number */ \ X(HEX, NULL) /* hexadecimal number*/\ SYMBOL_LIST1(X) \ SYMBOL_LIST2(X) \ // 2 char symbols #define SYMBOL_LIST2(X) \ X(NE, "!=") \ X(DF, "<>") \ X(GE, ">=") \ X(LE, "<=") \ X(LSHIFT, "<<") \ X(RSHIFT, ">>") \ X(DOUBLESTAR, "**") \ X(ARROW2, "->>") \ X(ARROW, "->") \ X(AMPERSTAND2, "&&") \ X(BAR2, "||") \ X(EQ2, "==") // 1 char symbols #define SYMBOL_LIST1(X) \ X(LPAREN,"(") \ X(RPAREN,")") \ X(LCURLY, "{") \ X(RCURLY, "}") \ X(LSQBRACKET,"[") \ X(RSQBRACKET,"]") \ X(DOT, ".") \ X(COMMA, ",") \ X(EQ, "=") \ X(GT, ">") \ X(LT, "<") \ X(AMPERSTAND, "&") \ X(BAR, "|") \ X(HAT, "^") \ X(PLUS, "+") \ X(MINUS, "-") \ X(STAR, "*") \ X(SLASH, "/") \ X(PERCENT, "%") \ X(BANG, "!") \ X(TILDE, "~") \ X(QUESTION, "?") \ X(COLON, ":") \ X(DOLLAR, "$") \ /*X(AT,"@")*/ namespace parser { using cdk::byte; using cdk::bytes; using cdk::char_t; using cdk::invalid_char; class Token; class iterator; /* Base class for all parser and tokenizer errors. This class can be used in catch handlers to catch all errors related to parsing. */ struct Error : public cdk::Error { Error() = delete; using cdk::Error::Error; }; /* Tokenizer and parser error base which shows parsing context in error description. Instances of Error keep parsing context information which consists of the current parser position within the string, and fragments of the string before and after parsing position. This information is stored directly in the error object and uses statically allocated memory to avoid dynamic memory allocation at the time when error is thrown. Parser errors use error code cdkerrc::parser_error in the generic cdk category. Unless overridden, parser errors produce error descriptions of the form: "CTX: MSG" where MSG is the message passed to the error constructor and CTX describes position of the parser in the parsed string. It can be something like "After seeing '...', looking at '...'" (see print_ctx() for exact forms of the context string). Note: This class template is parametrized by the string type, which can be either a wide or a standard string, depending on which strings the parser is working on (we have both cases). Remaining template parameters specify sizes of buffers used to store input string fragments. */ constexpr size_t seen_buf_len = 64; constexpr size_t ahead_buf_len = 12; class Error_base : public cdk::Error_class< Error_base, parser::Error > { using Base = cdk::Error_class< Error_base, parser::Error >; Error_base() = delete; protected: using string = std::string; public: /* Parser error with description 'descr' and parsing context specified by remaining arguments (see set_ctx() for possibilities). */ template<typename... Ty> Error_base( const string &descr, Ty&&... args ) : Base(nullptr, cdk::cdkerrc::parse_error) , m_msg(descr) { set_ctx(std::forward<Ty>(args)...); } virtual ~Error_base() throw () {} protected: // Storage for context data. char m_seen[seen_buf_len]; // Characters seen before current position. char m_ahead[ahead_buf_len]; // Few characters ahead of the current void set_ctx(char_iterator &pos); void set_ctx(const std::string&, size_t pos); string m_msg; // Print parser context description to the given ostream. virtual void print_ctx(std::ostream&) const; virtual void do_describe1(std::ostream &out) const { print_ctx(out); if (!m_msg.empty()) out << ": " << m_msg; } using Base::code; void do_describe(std::ostream &out) const { do_describe1(out); out << " (" << code() << ")"; } }; // ------------------------------------------------------------------------- /* Class representing a single token. It stores token type and its position within the parsed string (begin and end position). Note: For tokens such as quoted string, the characters of the token do not include the quotes. */ class Token { public: #define token_enum(T,X) T, enum Type { EMPTY = 0, TOKEN_LIST(token_enum) }; typedef std::set<Type> Set; cdk::string get_text() const; bytes get_bytes() const; std::string get_utf8() const; Type get_type() const { return m_type; } #define token_name(X,T) case X: return #X; static const char* get_name(int type) { switch (type) { TOKEN_LIST(token_name) default: return "<UNKNOWN>"; } } const char* get_name() const { return get_name(m_type); } protected: Type m_type = EMPTY; const char *m_begin = nullptr; const char *m_end = nullptr; }; inline cdk::string Token::get_text() const { cdk::string ret; if (m_begin) { assert(m_begin <= m_end); // Note: only strings and quoted words can contain non-ASCII characters. switch (m_type) { case QSTRING: case QQSTRING: case QWORD: ret.set_utf8({ (byte*)m_begin, (byte*)m_end }); break; default: ret.set_ascii(m_begin, (size_t)(m_end - m_begin)); break; } } return ret; } inline bytes Token::get_bytes() const { return { (byte*)m_begin, (byte*)m_end }; } inline std::string Token::get_utf8() const { return { (const char*)m_begin, (const char*)m_end }; } // ------------------------------------------------------------------------- /* Class implementing parsing characters into tokens. After creating a Tokenizer instance from a given string, one can use Tokenizer::iterator returned by method begin() to iterate through the sequence of tokens. */ class Tokenizer { public: class Error; class iterator; Tokenizer(bytes input); /* Return true if there are no tokens in the input string. */ bool empty() const; iterator begin() const; const iterator& end() const; public: char_iterator _begin; friend Error; }; /* Iterator for accessing a sequence of tokens of a tokenizer. cur_pos() | char_iterator::m_pos | | v v ---[--]---- ^^^^ m_token */ class Tokenizer::iterator : public char_iterator { using pos_type = const char*; pos_type m_pos; bool m_at_end = true; iterator(const char_iterator &input) : char_iterator(input) , m_at_end(false) { get_next_token(); } pos_type cur_pos() const noexcept { return m_pos; } bool at_end() const noexcept { return m_at_end; } public: iterator() = default; const Token& operator*() const noexcept { assert(!(at_end())); //if (at_end()) // THROW("token iterator: accessing null iterator"); return m_token; } const Token* operator->() const noexcept { assert(!(at_end())); //if (at_end()) // THROW("token iterator: accessing null iterator"); return &m_token; } iterator& operator++() //noexcept { get_next_token(); return *this; } bool operator==(const iterator &other) const noexcept { if (at_end()) return other.at_end(); return m_pos == other.m_pos; } bool operator!=(const iterator &other) const noexcept { return !(*this == other); } private: struct : public Token { friend iterator; } m_token; bool get_next_token(); // Methods that parse characters into various kinds of tokens. bool parse_number(); bool parse_digits() noexcept; // string *digits = NULL); bool parse_hex(); bool parse_hex_digits() noexcept; bool parse_string(); bool parse_word(); bool parse_quotted_string(char); /* Add to the sequence new token of a given type. The token ends at the current position within the input string and starts at the position marked with set_token_start(). The characters of the token are all the characters of the input string between token's start and end position. */ void set_token(Token::Type type, pos_type beg = nullptr, pos_type end = nullptr) noexcept { set_tok_type(type); set_tok_pos( beg == nullptr ? m_pos : beg, end == nullptr ? char_iterator::cur_pos() : end ); } void set_tok_pos(pos_type, pos_type) noexcept; void set_tok_type(Token::Type) noexcept; // Error reporting void throw_error(const std::string&) const; friend Tokenizer; friend Tokenizer::Error; }; inline Tokenizer::Tokenizer(cdk::bytes input) : _begin(input) {} inline bool Tokenizer::empty() const { return _begin.at_end(); } inline Tokenizer::iterator Tokenizer::begin() const { return _begin; } inline const Tokenizer::iterator& Tokenizer::end() const { static iterator end_iter; return end_iter; } inline void Tokenizer::iterator::set_tok_pos(pos_type beg, pos_type end) noexcept { m_token.m_begin = (const char*)beg; m_token.m_end = (const char*)end; } inline void Tokenizer::iterator::set_tok_type(Token::Type type) noexcept { m_token.m_type = type; } /* Tokenizer error class. It is a specialization of the generic Error_base which defines convenience constructors. */ class Tokenizer::Error : public parser::Error_base { public: Error(char_iterator &it, const string &msg = string()) : Error_base(msg, it) {} }; inline void Tokenizer::iterator::throw_error(const std::string &msg) const { throw Error(*(char_iterator*)this, msg); } // ------------------------------------------------------------------------- // Error class implementation // ------------------------------------------------------------------------- /* Construct error instance copying fragments of the parsed string to the internal buffers to be used in the error description. Note: MSVC generates warning for std::string::copy() method used below because it is considered unsafe. */ inline void Error_base::set_ctx( const std::string &input, size_t pos ) { char_iterator it(input, input.data() + pos); set_ctx(it); } inline void Error_base::set_ctx( char_iterator &it ) { memset(m_seen, 0, sizeof(m_seen)); memset(m_ahead, 0, sizeof(m_ahead)); /* Copy characters seen so far to m_seen[] buffer. */ bool complete; bytes seen = it.get_seen(seen_buf_len -2, &complete); char *dst = m_seen; /* If seen characters cover only a fragment of the parsed text, set first character in m_seen[] to 0 to indicate that trailing '...' should be added (see print_ctx()). The characters are then copied starting from m_seen[1]. */ if (!complete) { m_seen[0] = '\0'; dst++; } std::copy_n(seen.begin(), seen.size(), dst); dst[seen.size()] = '\0'; /* Copy some characters ahead of the current position to m_ahead[] buffer. If this is just a fragment of the remaining text, indicate that a trailing '...' should be added. This is done by setting the last element in m_ahead to 1 (see print_ctx()). */ bytes ahead = it.get_ahead(ahead_buf_len - 2, &complete); std::copy_n(ahead.begin(), ahead.size(), m_ahead); m_ahead[ahead.size()] = '\0'; if (!complete) m_ahead[ahead_buf_len - 1] = 1; } /* Print parser context description used in parser error descriptions. It has one of these forms: "After seeing '...AAA', looking at 'BBB...'" "After seeing '...AAA', with no more characters in the string" "While looking at 'BBB...'" "While looking at empty string" */ inline void parser::Error_base::print_ctx(std::ostream &out) const { bool seen_part = false; // Note: cdk::string() used for utf8 conversion. if (m_seen[0] || m_seen[1]) { seen_part = true; out << "After seeing '"; if (!m_seen[0]) out << "..." << (m_seen + 1); else out << m_seen; out << "'"; } if (m_ahead[0]) { if (seen_part) out << ", looking at '"; else out << "While looking at '"; out << m_ahead; if (1 == m_ahead[ahead_buf_len - 1]) out << "..."; out << "'"; } else { if (seen_part) out << ", with no more characters in the string"; else out << "While looking at empty string"; } } // ------------------------------------------------------------------------- // String to number conversions. // ------------------------------------------------------------------------- // // TODO: Consider if it should not be implemented as a numeric codec. // Numeric conversion error classes. class Numeric_conversion_error : public cdk::Error_class<Numeric_conversion_error> { typedef cdk::Error_class<Numeric_conversion_error> Base; protected: std::string m_inp; void do_describe(std::ostream &out) const { out << msg() << " (" << code() << ")"; } public: Numeric_conversion_error(const std::string &inp) : Base(NULL, cdk::cdkerrc::parse_error) , m_inp(inp) {} virtual std::string msg() const { std::string msg("Failed to convert string '"); msg.append(m_inp); msg.append("' to a number"); return msg; } }; class Numeric_conversion_partial : public cdk::Error_class< Numeric_conversion_partial, Numeric_conversion_error > { typedef cdk::Error_class< Numeric_conversion_partial, Numeric_conversion_error > Base; public: Numeric_conversion_partial(const std::string &inp) : Base(NULL, inp) {} std::string msg() const override { std::string msg("Not all characters consumed when converting string '"); msg.append(m_inp); msg.append("' to a number"); return msg; } }; /* Generic string to number conversion function template. Returns numeric value after converting given string in a given base, which should be either 10, 16 or 8. Throws error if the whole string could not be converted to a number. Unlike strtod() and friends, this function does not depend on the current locale setting but always uses the "C" locale (so that, e.g., decimal point character is always '.'). */ template< typename Num_t > inline Num_t strtonum(const std::string &str, int radix = 10) { // TODO: Allow white-space at the beginning or end of the string? typedef std::istreambuf_iterator<char> iter_t; static std::locale c_locale("C"); static const std::num_get<char> &cvt = std::use_facet<std::num_get<char>>(c_locale); std::istringstream inp(str); Num_t val; inp.imbue(c_locale); switch (radix) { case 10: inp.setf(std::ios_base::dec, std::ios_base::basefield); break; case 16: inp.setf(std::ios_base::hex, std::ios_base::basefield); break; case 8: inp.setf(std::ios_base::oct, std::ios_base::basefield); break; default: inp.setf(std::ios_base::fmtflags(0), std::ios_base::basefield); break; } /* Note: We could use istream::operator>>() to do conversion, but then there are problems with detecting conversion errors on some platforms (OSX). For that reason we instead use a number conversion facet directly. This gives direct access to the error information. */ iter_t beg(inp), end; std::ios::iostate err = std::ios_base::goodbit; iter_t last = cvt.get(beg, end, inp, err, val); if (std::ios_base::goodbit != err && std::ios_base::eofbit != err) throw Numeric_conversion_error(str); if (last != end) throw Numeric_conversion_partial(str); return val; } inline double strtod(const std::string &str) { return strtonum<double>(str); } inline uint64_t strtoui(const std::string &str, int radix = 10) { return strtonum<uint64_t>(str, radix); } inline int64_t strtoi(const std::string &str, int radix = 10) { return strtonum<int64_t>(str, radix); } } // parser #endif

cdk/parser/tokenizer.h (472 lines of code) (raw):