mysqlshdk/libs/utils/document_parser.cc (655 lines of code) (raw):

/*
 * Copyright (c) 2018, 2024, Oracle and/or its affiliates.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License, version 2.0,
 * as published by the Free Software Foundation.
 *
 * This program is designed to work with certain software (including
 * but not limited to OpenSSL) that is licensed under separate terms,
 * as designated in a particular file or component or in included license
 * documentation. The authors of MySQL hereby grant you an additional
 * permission to link the program and your derivative works with the
 * separately licensed software that they have either included with
 * the program or referenced in the documentation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
 * the GNU General Public License, version 2.0, for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "mysqlshdk/libs/utils/document_parser.h"

#include <errno.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <time.h>
#include <limits>
#ifdef _WIN32
#include <io.h>
#else
#include <unistd.h>
#endif

#include <cassert>
#include <cctype>
#include <cstdlib>
#include <deque>
#include <iterator>
#include <mutex>
#include <string>
#include <string_view>
#include <vector>

#include "mysqlshdk/include/scripting/type_info/custom.h"
#include "mysqlshdk/include/scripting/type_info/generic.h"
#include "mysqlshdk/include/scripting/types.h"
#include "mysqlshdk/libs/utils/strformat.h"
#include "mysqlshdk/libs/utils/utils_string.h"
#include "mysqlshdk/shellcore/shell_console.h"

namespace shcore {

namespace {

/**
 * Renders every byte of data as two lowercase hexadecimal digits, separating
 * consecutive bytes with a single space (i.e. "ef bb bf").
 *
 * @param data: the bytes to be rendered.
 * @returns the hexadecimal dump, an empty string if data is empty.
 */
std::string hexify(const std::string &data) {
  if (data.size() == 0) {
    return std::string{};
  }

  // Three characters per byte: two digits plus the separator
  std::string s(3 * data.size(), 'x');
  std::string::iterator k = s.begin();

  for (const unsigned char i : data) {
    *k++ = "0123456789abcdef"[i >> 4];
    *k++ = "0123456789abcdef"[i & 0x0F];
    *k++ = ' ';
  }

  // Drops the separator written after the last byte
  s.resize(3 * data.size() - 1);

  return s;
}

}  // namespace

/**
 * Returns the option pack definition that maps the user visible option names
 * to the members of Document_reader_options; on_unpacked_options() is
 * registered as the callback to be executed once the options are unpacked.
 */
const shcore::Option_pack_def<Document_reader_options>
    &Document_reader_options::options() {
  static const auto opts =
      shcore::Option_pack_def<Document_reader_options>()
          .optional("convertBsonTypes",
                    &Document_reader_options::convert_bson_types)
          .optional("convertBsonOid", &Document_reader_options::convert_bson_id)
          .optional("extractOidTime",
                    &Document_reader_options::extract_oid_time)
          .optional("ignoreDate", &Document_reader_options::ignore_date)
          .optional("ignoreTimestamp",
                    &Document_reader_options::ignore_timestamp)
          .optional("ignoreBinary", &Document_reader_options::ignore_binary)
          .optional("ignoreRegex", &Document_reader_options::ignore_regexp)
          .optional("ignoreRegexOptions",
                    &Document_reader_options::ignore_regexp_options)
          .optional("decimalAsDouble",
                    &Document_reader_options::decimal_as_double)
          .on_done(&Document_reader_options::on_unpacked_options);

  return opts;
}

/**
 * Validates the combination of options that was provided.
 *
 * Throws an argument error if:
 * - extractOidTime is given while convertBsonOid is disabled.
 * - Any option that depends on convertBsonTypes is given while it is disabled.
 * - Both ignoreRegex and ignoreRegexOptions are enabled.
 *
 * Throws a runtime error if extractOidTime is given as an empty string.
 */
void Document_reader_options::on_unpacked_options() {
  // The default value for convert_bson_id is the value of convert_bson_types
  if (convert_bson_id.is_null()) convert_bson_id = convert_bson_types;

  if (!extract_oid_time.is_null() && !convert_bson_id.get_safe(false)) {
    throw shcore::Exception::argument_error(
        "The 'extractOidTime' option can not be used if 'convertBsonOid' is "
        "disabled.");
  }

  // Collects the options that only make sense if convertBsonTypes is enabled
  std::vector<std::string> used_options;
  if (!ignore_date.is_null()) used_options.push_back("ignoreDate");
  if (!ignore_timestamp.is_null()) used_options.push_back("ignoreTimestamp");
  if (!ignore_binary.is_null()) used_options.push_back("ignoreBinary");
  if (!ignore_regexp.is_null()) used_options.push_back("ignoreRegex");
  if (!ignore_regexp_options.is_null())
    used_options.push_back("ignoreRegexOptions");
  if (!decimal_as_double.is_null()) used_options.push_back("decimalAsDouble");

  if (!used_options.empty() && !convert_bson_types.get_safe(false)) {
    throw shcore::Exception::argument_error(shcore::str_format(
        "The following option%s can not be used if 'convertBsonTypes' is "
        "disabled: %s",
        used_options.size() > 1 ? "s" : "",
        shcore::str_join(used_options, ", ",
                         [](const std::string &data) {
                           return "'" + data + "'";
                         })
            .c_str()));
  }

  if (ignore_regexp.get_safe(false) && ignore_regexp_options.get_safe(false)) {
    throw shcore::Exception::argument_error(
        "The 'ignoreRegex' and 'ignoreRegexOptions' options can't both be "
        "enabled");
  }

  if (!extract_oid_time.is_null() && (*extract_oid_time).empty()) {
    throw shcore::Exception::runtime_error(
        "Option 'extractOidTime' can not be empty.");
  }
}

/**
 * Indicates whether documents of the given BSON type must be left untouched.
 *
 * @param type: the BSON type identified on a document.
 * @returns true if the type must NOT be converted, false if it must be.
 */
bool Document_reader_options::ignore_type(Bson_type type) const {
  if (convert_bson_types.get_safe(false)) {
    switch (type) {
      case Bson_type::OBJECT_ID:
        return !convert_bson_id.get_safe(false);

      case Bson_type::DATE:
        return ignore_date.get_safe(false);

      case Bson_type::TIMESTAMP:
        return ignore_timestamp.get_safe(false);

      case Bson_type::REGEX:
        return ignore_regexp.get_safe(false);

      case Bson_type::BINARY:
        return ignore_binary.get_safe(false);

      case Bson_type::LONG:
      case Bson_type::INTEGER:
      case Bson_type::DECIMAL:
        return false;

      case Bson_type::NONE:
        break;
    }
  } else {
    // With the type conversion disabled only ObjectIDs may be converted
    return type == Bson_type::OBJECT_ID ? !convert_bson_id.get_safe(false)
                                        : true;
  }

  return true;
}

/**
 * Skips the leading whitespaces and parses the next document available on
 * the source.
 *
 * @returns the text of the parsed document.
 */
std::string Json_reader::next() {
  m_source->skip_whitespaces();

  Json_document_parser parser(m_source, m_options);
  return parser.parse();
}

/**
 * Consumes the byte order mark, if any, found at the beginning of the source.
 *
 * Up to 4 bytes are read, stopping at the first '{', whitespace or end of
 * input. A UTF-8 BOM is accepted and discarded; UTF-16/UTF-32 BOMs and any
 * other leading bytes cause a std::runtime_error.
 */
void Json_reader::parse_bom() {
  std::string header;
  header.reserve(4);

  for (int i = 0; i < 4; i++) {
    const auto c = m_source->peek();
    // The cast keeps isspace() well defined for bytes >= 0x80, which is what
    // BOM bytes are
    if (c == '{' || ::isspace(static_cast<unsigned char>(c)) ||
        m_source->eof()) {
      break;
    } else {
      header += m_source->get();
    }
  }

  if (header.size() == 0) {
    return;
  }

  if (std::string{'\xef', '\xbb', '\xbf'}.compare(header) == 0) {
    // utf-8 BOM, nothing else to do
  } else if (std::string{'\x00', '\x00', '\xfe', '\xff'}.compare(header) == 0) {
    // utf-32 be
    throw std::runtime_error("UTF-32BE encoded document is not supported.");
  } else if (std::string{'\xff', '\xfe', '\x00', '\x00'}.compare(header) == 0) {
    // utf-32 le
    throw std::runtime_error("UTF-32LE encoded document is not supported.");
  } else if (std::string{'\xfe', '\xff'}.compare(header) == 0) {
    // utf-16 be
    throw std::runtime_error("UTF-16BE encoded document is not supported.");
  } else if (std::string{'\xff', '\xfe'}.compare(header) == 0) {
    // utf-16 le
    throw std::runtime_error("UTF-16LE encoded document is not supported.");
  } else {
    throw std::runtime_error("JSON document contains invalid bytes (" +
                             hexify(header) + ") at the begining of the file.");
  }
}

/**
 * Creates a parser reading from input.
 *
 * @param input: the source of the data to be parsed.
 * @param options: the options that define how BSON types are handled.
 * @param depth: the nesting level of the document to be parsed.
 * @param as_array: true if an array is to be parsed instead of an object.
 * @param context: the name of the attribute holding the document to be parsed.
 */
Document_parser::Document_parser(Buffered_input *input,
                                 const Document_reader_options &options,
                                 size_t depth, bool as_array,
                                 const std::string context)
    : m_source(input),
      m_options(options),
      m_depth(depth),
      m_as_array(as_array),
      m_context(context) {}

/**
 * Throws the invalid_json error used when the input ends before the document
 * is complete, reporting the current offset of the source.
 */
void Json_document_parser::throw_premature_end() {
  throw invalid_json("Premature end of input stream", m_source->offset());
}

/**
 * Throws an invalid_json error indicating the data that was expected.
 *
 * @param missing: description of the data that was expected.
 * @param context: optional text appended to the message, ignored if empty.
 * @param offset: the offset to be reported on the error.
 */
void Json_document_parser::throw_invalid_json(const std::string &missing,
                                              const std::string &context,
                                              std::size_t offset) {
  std::string msg = "Unexpected data, expected to find ";
  msg += missing;

  if (!context.empty()) {
    msg += " " + context;
  }

  throw invalid_json(msg, offset);
}

/**
 * Discards everything this parser has appended to the output document.
 */
void Json_document_parser::clear_document() {
  m_document->resize(m_document_start_offset);
}

/**
 * Identifies the BSON type associated to the last attribute name that was
 * read (the text between m_last_attribute_start and m_last_attribute_end).
 *
 * @returns the Bson_type for the attribute, Bson_type::NONE if the attribute
 * does not start with '$' or is not one of the recognized names.
 */
Bson_type Json_document_parser::get_bson_type() {
  using namespace std::literals;

  const char *first = &(*m_document)[m_last_attribute_start];
  const char *last = &(*m_document)[m_last_attribute_end];

  if (first == last || *first != '$') {
    return Bson_type::NONE;
  }

  // Skips the '$'
  ++first;

  size_t attr_size = std::distance(first, last);
  std::string_view attr_view(first, attr_size);

  // The length is used to discard most of the candidates before comparing
  switch (attr_size) {
    case 3:  // oid
      if ("oid"sv == attr_view) {
        return Bson_type::OBJECT_ID;
      }
      break;

    case 4:  // date
      if ("date"sv == attr_view) {
        return Bson_type::DATE;
      }
      break;

    case 5:  // regex
      if ("regex"sv == attr_view) {
        return Bson_type::REGEX;
      }
      break;

    case 6:  // binary
      if ("binary"sv == attr_view) {
        return Bson_type::BINARY;
      }
      break;

    case 9:  // numberInt
             // timestamp
      if ("numberInt"sv == attr_view) {
        return Bson_type::INTEGER;
      }
      if ("timestamp"sv == attr_view) {
        return Bson_type::TIMESTAMP;
      }
      break;

    case 10:  // numberLong
      if ("numberLong"sv == attr_view) {
        return Bson_type::LONG;
      }
      break;

    case 13:  // numberDecimal
      if ("numberDecimal"sv == attr_view) {
        return Bson_type::DECIMAL;
      }
      break;
  }

  return Bson_type::NONE;
}

/**
 * Parses a document from the source.
 *
 * @returns the text of the parsed document.
 */
std::string Json_document_parser::parse() {
  std::string document;

  parse(&document);

  return document;
}

/**
 * This function carries on the actual parsing of a document by appending new
 * data into the received document.
 *
 * Nested documents/arrays are parsed by creating a new instance of the class
 * and calling this function so the same output buffer is used everywhere.
 */
void Json_document_parser::parse(std::string *document) {
  m_document = document;
  m_document_start_offset = m_document->size();

  if (m_source->eof()) return;

  if (m_source->peek() != (m_as_array ? '[' : '{')) {
    std::string type = m_as_array ? "array" : "object";
    throw invalid_json("Input does not start with a JSON " + type,
                       m_source->offset());
  }

  // Appends the initial character
  (*m_document) += m_source->get();

  get_whitespaces(m_document);

  bool complete = false;

  // Tests for an empty object/array
  if (m_source->peek() == (m_as_array ? ']' : '}')) {
    (*m_document) += m_source->get();
    complete = true;
  }

  int field_count = 0;

  while (!complete && !m_source->eof()) {
    if (!m_as_array) {
      // Next data is an attribute which is quoted
      m_last_attribute_start = m_document->size() + 1;
      get_string(m_document);
      m_last_attribute_end = m_document->size() - 1;

      if (m_options.convert_bson_types.get_safe(false) ||
          m_options.convert_bson_id.get_safe(false)) {
        auto type = get_bson_type();

        // If the first field is a mongo special field
        // The original document is translated based on
        // options and the right data is returned
        if (!field_count && !m_options.ignore_type(type)) {
          parse_bson_document(type);
          return;
        }
      }

      get_whitespaces(m_document);

      if (m_source->eof()) throw_premature_end();

      if (m_source->peek() != ':')
        throw invalid_json(
            "Unexpected character, expected field/value separator ':'",
            m_source->offset());

      (*m_document) += m_source->get();

      get_whitespaces(m_document);

      field_count++;
    }

    get_value(m_document);

    get_whitespaces(m_document);

    // Only comma or closing is expected
    if (m_source->eof()) throw_premature_end();

    if (m_source->peek() == (m_as_array ? ']' : '}')) {
      complete = true;
    } else if (m_source->peek() != ',') {
      std::string type = m_as_array ? "value" : "field";
      throw invalid_json(
          "Unexpected character, expected " + type + " separator ','",
          m_source->offset());
    }

    // Consumes the , or the closing character
    (*m_document) += m_source->get();

    get_whitespaces(m_document);
  }

  if (!complete) throw_premature_end();
}

/**
 * BSON Data Types are self contained in embedded documents.
 *
 * This function calls the correct parsing routine for each
 * supported BSON Data Type.
 */
void Json_document_parser::parse_bson_document(Bson_type type) {
  switch (type) {
    case Bson_type::OBJECT_ID:
      parse_bson_oid();
      break;

    case Bson_type::DATE:
      parse_bson_date();
      break;

    case Bson_type::TIMESTAMP:
      parse_bson_timestamp();
      break;

    case Bson_type::LONG:
    case Bson_type::INTEGER:
      parse_bson_integer(type);
      break;

    case Bson_type::DECIMAL:
      parse_bson_decimal();
      break;

    case Bson_type::REGEX:
      parse_bson_regex();
      break;

    case Bson_type::BINARY:
      parse_bson_binary();
      break;

    case Bson_type::NONE:
      break;
  }
}

/**
 * Helper function, returns the correct missing label based on a
 * Bson_token definition
 */
std::string get_missing(const Json_document_parser::Bson_token &token) {
  switch (token.type) {
    case 'X':
      if (token.length == 0)
        return "a string with hexadecimal digits";
      else if (token.length == 1)
        return "a string with an hexadecimal digit";
      else
        return "a string with " + std::to_string(token.length) +
               " hexadecimal digits";

    case 'S':
      if (token.length == 0)
        return "a string";
      else if (token.length == 1)
        return "a string with one character";
      else
        return "a string with " + std::to_string(token.length) + " characters";

    case 'I':
      return "an integer string";

    case 'i':
      return "an integer";

    case 'N':
      return "a numeric string";

    case 'n':
      return "a number";

    case 'v':
      return "a value";

    case '{':
    case '}':
    case '[':
    case ']':
    case ',':
    case ':':
      std::string missing;
      missing = "'";
      missing.append(1, token.type);
      missing += "'";
      return missing;
  }

  // Unknown token type
  assert(0);

  return "";
}

/**
 * This is a helper function used to read through the structure of a document
 * representing a BSON Data Type.
 *
 * @param tokens: An array of Bson_tokens with information about how the
 * different tokens should be read as well as the types of validations to be
 * done on them.
 * @param context: Text appended to the error messages to describe what was
 * being processed.
 *
 * A Bson_token contains the following elements:
 * - type: Indicates the type of token to be read.
 * - value: Used to trigger a verification so the read data matches this value.
 * - target: A recipient where the read data will be stored, if not given
 *           the read data is simply discarded.
 * - ntarget: Similar to target but for numeric values.
 * - length: Used to trigger a length validation, the read data should match
 *           this length.
 *
 * Following the available token types:
 *
 * a) Data coming as strings (Uppercase Types):
 * - S: to read a string
 * - N: to read a JSON number in a string
 * - I: to read an integer number in a string
 * - X: to read hexadecimal digits in a string
 *
 * b) Raw data (Lowercase Types):
 * - n: to read a JSON number
 * - i: to read an integer
 * - v: to read any value
 *
 * c) JSON separators
 * - These are: '{', '}', '[', ']', ',' and ':'
 *
 * Available Validations:
 *
 * The validations are not available for all the tokens, they were enabled
 * as needed, current validations include:
 *
 * - S and X values must match the token length if it's != 0
 * - S values must match the token value if it's != ""
 * - I and i values must be pure digits
 * - X values must be pure hexadecimal digits
 * - N and n values must be fully convertible to double
 */
void Json_document_parser::get_bson_data(const std::vector<Bson_token> &tokens,
                                         const std::string &context) {
  for (const auto &token : tokens) {
    std::string missing_data = get_missing(token);

    // Where the read value will be stored
    std::string *target = token.target;

    // If caller did not provide a recipient for the read value, a temporal is
    // provided.
    std::string temporal;
    if (!token.target) target = &temporal;

    // Backup the position of the recipient where the token will be stored
    size_t target_start = target->size();
    size_t value_start = target_start;
    size_t value_size = 0;

    // Backup the offset where the read token will start
    m_source->skip_whitespaces();
    auto offset = m_source->offset() + 1;

    switch (token.type) {
      case 'S':
      case 'X':
      case 'I':
      case 'N':
        if (m_source->peek() != '"')
          throw_invalid_json(missing_data, context, offset);

        get_string(target, context);

        // The value excludes the surrounding quotes
        value_start++;
        value_size = target->size() - target_start - 2;
        break;

      case 'v':
      case 'i':
      case 'n':
        get_value(target);
        value_size = target->size() - target_start;
        break;

      case '{':
      case '}':
      case '[':
      case ']':
      case ',':
      case ':':
        if (m_source->peek() != token.type) {
          throw_invalid_json(missing_data, context, offset);
        } else {
          get_char(target);
        }
        // Separators require no further validation
        continue;

      default:
        assert(0);
    }

    // S and X strings may be validated to have a specific length
    if (token.length && (token.type == 'S' || token.type == 'X') &&
        value_size != token.length) {
      throw_invalid_json(missing_data, context, offset);
    }

    // S strings may be validated to be a specific value if the value is defined
    if (token.type == 'S' && !token.value.empty() &&
        token.value != target->substr(target_start)) {
      throw_invalid_json(token.value, context, offset);
    }

    // X strings must have pure hexadecimal digits
    if (token.type == 'X') {
      for (size_t index = 0; index < value_size; index++) {
        // The cast is required: passing a negative char (any byte >= 0x80) to
        // the <cctype> functions is undefined behavior
        if (!isxdigit(
                static_cast<unsigned char>((*target)[value_start + index])))
          throw_invalid_json(missing_data, context, offset);
      }
    }

    // Numeric values are converted into double to verify validity
    if (token.type == 'n' || token.type == 'N' || token.type == 'i' ||
        token.type == 'I') {
      if (value_size == 0) throw_invalid_json(missing_data, context, offset);

      // Integers must have pure decimal digits
      if (token.type == 'I' || token.type == 'i') {
        size_t digit_index = 0;

        // A leading sign is allowed
        if ((*target)[value_start] == '+' || (*target)[value_start] == '-') {
          digit_index++;
        }

        for (size_t index = digit_index; index < value_size; index++) {
          if (!isdigit(
                  static_cast<unsigned char>((*target)[value_start + index])))
            throw_invalid_json(missing_data, context, offset);
        }
      }

      std::string str_number = target->substr(value_start, value_size);
      str_number = shcore::str_strip(str_number);

      // If a placeholder is provided, the number is placed there
      double *number = token.ntarget;
      double temp_dbl;
      if (!number) number = &temp_dbl;

      char *end = nullptr;
      (*number) = std::strtod(str_number.c_str(), &end);

      // End will point to the character after the last converted character
      // So it should be the end of the string to a complete conversion
      if (*end != '\0') throw_invalid_json(missing_data, context, offset);
    }
  }

  m_source->skip_whitespaces();
}

/**
 * Process a BSON ObjectID Document
 */
void Json_document_parser::parse_bson_oid() {
  // Throws away {"$oid"
  clear_document();

  // And gets the associated value
  get_bson_data(
      {
          {':'},
          {'X', "", m_document, nullptr, 24},
          {'}'},
      },
      "processing extended JSON for $oid");

  // Extraction of time from $oid is done for ObjectID value at the _id field
  // Of the first level document, also only if requested
  if (m_depth == 1 && !m_options.extract_oid_time.is_null() &&
      m_context == "_id") {
    // The first 8 hexadecimal digits (right after the opening quote) hold the
    // timestamp; they were already validated to be hexadecimal digits.
    const std::string tstamp_hex =
        m_document->substr(m_document_start_offset + 1, 8);

    // An unsigned conversion is used on purpose: 8 hexadecimal digits go up
    // to 0xffffffff, which does not fit on a 32 bit long (std::stol would
    // throw std::out_of_range on those platforms).
    time_t tstamp =
        static_cast<time_t>(std::stoull(tstamp_hex, nullptr, 16));

    std::string tstamp_str = mysqlshdk::utils::fmttime(
        "%F %T", mysqlshdk::utils::Time_type::GMT, &tstamp);

    m_document->append(",\"" + *m_options.extract_oid_time + "\":\"" +
                       tstamp_str + "\"");
  }
}

/**
 * Process a BSON Date Document
 */
void Json_document_parser::parse_bson_date() {
  // Throws away {"$date"
  clear_document();

  // And gets the associated value
  get_bson_data(
      {
          {':'},
          {'S', "", m_document},
          {'}'},
      },
      "processing extended JSON for $date");
}

/**
 * Process a BSON Decimal Document
 */
void Json_document_parser::parse_bson_decimal() {
  // Throws away {"$numberDecimal"
  clear_document();

  // When the value is to be stored as a double, the original string is read
  // into a temporal so only the converted number reaches the document
  std::string decimal_str;
  std::string *custom_target =
      m_options.decimal_as_double.get_safe(false) ? &decimal_str : m_document;

  double number = 0.0;

  // Gets the associated value
  get_bson_data({{':'}, {'N', "", custom_target, &number}, {'}'}},
                "processing extended JSON for $numberDecimal");

  if (m_options.decimal_as_double.get_safe(false)) {
    m_document->append(std::to_string(number));
  }
}

/**
 * Processes BSON NumberInt and NumberLong Documents
 */
void Json_document_parser::parse_bson_integer(Bson_type type) {
  // Throws away {"$numberInt" or {"$numberLong"
  clear_document();

  std::string value;
  std::string context = "processing extended JSON for $number";

  if (type == Bson_type::LONG)
    context += "Long";
  else
    context += "Int";

  get_bson_data({{':'}, {'I', "", &value, nullptr}, {'}'}}, context);

  // value holds the quoted integer, the number starts after the opening quote
  // handle a signed integer starting with a plus, trim the plus character
  std::size_t start = 1;
  if ('+' == value[start]) {
    ++start;
  }

  // Excludes the closing quote
  std::string number = value.substr(start, value.size() - 1 - start);

  m_document->append(number);
}

/**
 * Verifies whether a JSON number represents a valid timestamp: it must
 * survive, unmodified, the conversion to a broken-down local time and back.
 *
 * @param json_number: the number to be verified.
 * @returns true if the number is a valid timestamp, false otherwise (this
 * includes NaN, values out of the time_t range and values that can not be
 * converted to a local time).
 */
bool Json_document_parser::valid_timestamp(double json_number) {
  // Converting a double that does not fit on the target integer type is
  // undefined behavior, so the range is verified first. The comparisons are
  // written so NaN is rejected as well.
  // NOTE: the extra parenthesis prevent the expansion of the min/max macros
  // that may be defined on Windows.
  const double min_tstamp =
      static_cast<double>((std::numeric_limits<time_t>::min)());
  const double max_tstamp_exclusive =
      static_cast<double>((std::numeric_limits<time_t>::max)()) + 1.0;

  if (!(json_number >= min_tstamp && json_number < max_tstamp_exclusive)) {
    return false;
  }

  struct tm local_time_struct {};
  time_t tstamp = static_cast<time_t>(json_number);

  // Converts the time_t version of the original number to
  // a tm struct, a failure here means it is not a valid date
#ifdef _WIN32
  if (localtime_s(&local_time_struct, &tstamp) != 0) return false;
#else
  if (localtime_r(&tstamp, &local_time_struct) == nullptr) return false;
#endif

  // Now gets the time_t value from that struct
  time_t local_tstamp = mktime(&local_time_struct);

  // If final value matches the original value, then it's a valid date
  return json_number == local_tstamp;
}

/**
 * Processes BSON Timestamp Document
 */
void Json_document_parser::parse_bson_timestamp() {
  // Throws away {"$timestamp"
  clear_document();

  std::string tstamp_str;
  double tstamp_n;

  get_bson_data({{':'},
                 {'{'},
                 {'S', "\"t\""},
                 {':'},
                 {'n', "", &tstamp_str, &tstamp_n},
                 {','},
                 {'S', "\"i\""},
                 {':'},
                 {'n'},
                 {'}'},
                 {'}'}},
                "processing extended JSON for $timestamp");

  if (!valid_timestamp(tstamp_n)) {
    throw invalid_json(
        "Invalid timestamp value found processing extended JSON for "
        "$timestamp.",
        m_source->offset());
  } else {
    time_t tstamp = static_cast<time_t>(tstamp_n);

    m_document->append("\"" +
                       mysqlshdk::utils::fmttime(
                           "%F %T", mysqlshdk::utils::Time_type::GMT, &tstamp) +
                       "\"");
  }
}

/**
 * Processes BSON Regex Document
 */
void Json_document_parser::parse_bson_regex() {
  // Throws away {"$regex"
  clear_document();

  std::string regex;
  std::string options;

  auto offset = m_source->offset();

  get_bson_data({{':'},
                 {'S', "", &regex},
                 {','},
                 {'S', "\"$options\""},
                 {':'},
                 {'S', "", &options},
                 {'}'}},
                "processing extended JSON for $regex");

  // Unquotes the options
  options = options.substr(1, options.size() - 2);

  if (m_options.ignore_regexp_options.get_safe(true)) {
    // Only the (still quoted) regular expression is stored
    m_document->append(regex);

    if (!options.empty()) {
      std::string warning = "The regular expression for " + m_context +
                            " contains options being ignored: " + options + ".";
      mysqlsh::current_console()->print_warning(warning);
      log_warning("%s", warning.c_str());
    }
  } else {
    for (const auto &c : options) {
      if (c != 'i' && c != 'm' && c != 'x' && c != 's')
        throw invalid_json(
            "Unexpected data, invalid options found processing extended JSON "
            "for $regex",
            offset);
    }

    // The regular expression is stored as "/<regex>/<options>"
    m_document->append("\"/");
    m_document->append(regex.substr(1, regex.size() - 2));
    m_document->append("/");
    m_document->append(options);
    m_document->append("\"");
  }
}

/**
 * Processes BSON BinData Document
 */
void Json_document_parser::parse_bson_binary() {
  // Throws away {"$binary"
  clear_document();

  get_bson_data({{':'},
                 {'S', "", m_document},
                 {','},
                 {'S', "\"$type\""},
                 {':'},
                 {'X', "", nullptr, nullptr, 2},
                 {'}'}},
                "processing extended JSON for $binary");
}

/**
 * Parses a double quoted string from m_source
 *
 * @param target: the target string where the read characters (including the
 * surrounding quotes) will be appended
 * @param context: text appended to the error reported when the next character
 * on the source is not a double quote
 *
 * An invalid_json error is also raised if the input ends before the closing
 * quote is found.
 */
void Json_document_parser::get_string(std::string *target,
                                      const std::string &context) {
  if (m_source->peek() != '"')
    throw_invalid_json("a string", context, m_source->offset());

  // Opening quote
  get_char(target);

  bool done = false;
  while (!m_source->eof() && !done) {
    switch (m_source->peek()) {
      case '\\':
        // Copies the backslash and the escaped character as they are, so an
        // escaped quote does not terminate the string
        get_char(target);
        get_char(target);
        break;

      case '"':
        get_char(target);
        done = true;
        break;

      default:
        get_char(target);
    }
  }

  if (!done) throw_premature_end();
}

/**
 * Copies into target the whitespaces found at the current position of the
 * source, if any.
 */
void Json_document_parser::get_whitespaces(std::string *target) {
  // The cast keeps isspace() well defined for bytes >= 0x80
  while (!m_source->eof() &&
         ::isspace(static_cast<unsigned char>(m_source->peek())))
    get_char(target);
}

/**
 * Reads the next value from the source and appends it to target.
 *
 * Objects and arrays are handled by a nested parser, strings through
 * get_string() and anything else is copied as is up to the next ',' or the
 * character closing the current object/array.
 *
 * @param target: the string where the read value will be appended.
 */
void Json_document_parser::get_value(std::string *target) {
  switch (m_source->peek()) {
    case '\0':
      // end of input before end of document
      throw_premature_end();
      break;

    case '{': {
      // The name of the attribute holding the nested document is passed as
      // its context
      std::string context;
      if (!m_as_array) {
        size_t size = m_last_attribute_end - m_last_attribute_start;
        context = m_document->substr(m_last_attribute_start, size);
      }

      Json_document_parser parser(m_source, m_options, m_depth + 1, false,
                                  context);
      // The nested document goes into the requested target, which may not be
      // m_document when called from get_bson_data()
      parser.parse(target);
      break;
    }

    case '[': {
      Json_document_parser parser(m_source, m_options, m_depth + 1, true);
      parser.parse(target);
      break;
    }

    case '"':
      get_string(target);
      break;

    case '}':
      throw invalid_json("Unexpected '}'", m_source->offset());
      break;

    case ']':
      throw invalid_json("Unexpected ']'", m_source->offset());
      break;

    default: {
      while (m_source->peek() != ',' &&
             m_source->peek() != (m_as_array ? ']' : '}')) {
        // Running out of input before the terminator is found must end the
        // loop
        if (m_source->eof()) throw_premature_end();

        get_char(target);
      }
    }
  }
}

}  // namespace shcore