cpp-ch/local-engine/Storages/Serializations/ExcelNumberReader.h (416 lines of code) (raw):

/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #pragma once #include <IO/readFloatText.h> namespace DB { namespace ErrorCodes { extern const int CANNOT_PARSE_NUMBER; } } namespace local_engine { static String zh_cn_symbol = "¥"; // std::use_facet<std::moneypunct<char>>(std::locale("zh_cn.utf8")).curr_symbol(); static String en_us_symbol = "$"; // std::use_facet<std::moneypunct<char>>(std::locale("en_US.utf8")).curr_symbol(); inline bool checkMoneySymbol(DB::ReadBuffer & buf, String & symbol) { auto pr = static_cast<DB::PeekableReadBuffer>(buf); DB::PeekableReadBufferCheckpoint checkpoint{pr, false}; std::string::iterator it = symbol.begin(); while (it != symbol.end()) { if (pr.eof() || *pr.position() != *it) { pr.rollbackToCheckpoint(); return false; } pr.position()++; it++; } return true; } inline bool checkMoneySymbol(DB::ReadBuffer & buf) { return checkMoneySymbol(buf, zh_cn_symbol) || checkMoneySymbol(buf, en_us_symbol); } inline bool checkNumberComma(DB::ReadBuffer & buf, bool has_quote, const DB::FormatSettings & settings) { if (!has_quote && settings.csv.delimiter == ',') return false; /// if has ',', next ',' must has 3 digits, eg: 1,000 / 11,000,000 /// error is 11,00 / 1,0 if ((buf.position() + 4 <= buf.buffer().end() && isNumericASCII(buf.position()[1]) && isNumericASCII(buf.position()[2]) && isNumericASCII(buf.position()[3]) && !isNumericASCII(buf.position()[4])) || (buf.position() + 3 == buf.buffer().end() && isNumericASCII(buf.position()[1]) && isNumericASCII(buf.position()[2]) && isNumericASCII(buf.position()[3]))) { return true; } else return false; } template <size_t N, bool before_point = false, typename T> static inline bool readUIntTextUpToNSignificantDigits(T & x, DB::ReadBuffer & buf, bool has_quote, const DB::FormatSettings & settings) { bool has_values = false; /// In optimistic case we can skip bound checking for first loop. if (buf.position() + N <= buf.buffer().end()) { for (size_t i = 0; i < N; ++i) { if (isNumericASCII(*buf.position())) { x *= 10; x += *buf.position() & 0x0F; ++buf.position(); has_values = true; } else if constexpr (before_point) // 10,000,000 { if (!has_values) return false; if (*buf.position() == ',' && checkNumberComma(buf, has_quote, settings)) { ++buf.position(); continue; } else return true; } else return true; } } else { for (size_t i = 0; i < N; ++i) { if (!buf.eof() && isNumericASCII(*buf.position())) { x *= 10; x += *buf.position() & 0x0F; ++buf.position(); has_values = true; } else if constexpr (before_point) // 10,000,000 { if (!has_values) return false; if (*buf.position() == ',' && checkNumberComma(buf, has_quote, settings)) { ++buf.position(); continue; } else return true; } else return true; } } while (!buf.eof() && (buf.position() + 8 <= buf.buffer().end()) && DB::is_made_of_eight_digits_fast(buf.position())) { buf.position() += 8; } while (!buf.eof() && isNumericASCII(*buf.position())) ++buf.position(); return true; } template <typename T> inline bool readExcelFloatTextFastImpl(T & x, DB::ReadBuffer & in, bool has_quote, const DB::FormatSettings & settings) { static_assert(std::is_same_v<T, double> || std::is_same_v<T, float>, "Argument for readFloatTextImpl must be float or double"); static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' && '"' < '.', "Layout of char is not like ASCII"); const UInt8 MAX_HEAD_SKIP = 2; const UInt8 MAX_TAIL_SKIP = 2; UInt8 head_skip = 0; UInt8 tail_skip = 0; bool negative = false; x = 0; UInt64 before_point = 0; UInt64 after_point = 0; int after_point_exponent = 0; int exponent = 0; if (in.eof()) return false; while (!in.eof()) { if ((*in.position() < '0' || *in.position() > '9') && *in.position() != '-' && *in.position() != '+' && *in.position() != '.' && !checkMoneySymbol(in)) { if (!((static_cast<UInt8>(*in.position()) & 0b11000000u) == 0b10000000u)) // learn from UTF8Helpers.h { head_skip++; if (head_skip > MAX_HEAD_SKIP) return false; } ++in.position(); } else break ; } if (*in.position() == '-') { negative = true; ++in.position(); } else if (*in.position() == '+') ++in.position(); auto count_after_sign = in.count(); constexpr int significant_digits = std::numeric_limits<UInt64>::digits10; if (!local_engine::readUIntTextUpToNSignificantDigits<significant_digits, true>(before_point, in, has_quote, settings)) return false; size_t read_digits = in.count() - count_after_sign; if (unlikely(read_digits > significant_digits)) { int before_point_additional_exponent = static_cast<int>(read_digits) - significant_digits; x = static_cast<T>(shift10(before_point, before_point_additional_exponent)); } else { x = before_point; } if (checkChar('.', in)) { auto after_point_count = in.count(); while (!in.eof() && *in.position() == '0') ++in.position(); auto after_leading_zeros_count = in.count(); int after_point_num_leading_zeros = static_cast<int>(after_leading_zeros_count - after_point_count); local_engine::readUIntTextUpToNSignificantDigits<significant_digits>(after_point, in, has_quote, settings); read_digits = in.count() - after_leading_zeros_count; after_point_exponent = (read_digits > significant_digits ? -significant_digits : static_cast<int>(-read_digits)) - after_point_num_leading_zeros; } if (checkChar('e', in) || checkChar('E', in)) { if (in.eof()) return false; bool exponent_negative = false; if (*in.position() == '-') { exponent_negative = true; ++in.position(); } else if (*in.position() == '+') { ++in.position(); } local_engine::readUIntTextUpToNSignificantDigits<4>(exponent, in, has_quote, settings); if (exponent_negative) exponent = -exponent; } if (!(*in.position() >= '0' && *in.position() <= '9')) // process suffix { while (!in.eof()) { if(*in.position() == settings.csv.delimiter ||*in.position() == '\'' ||*in.position() == '\"' || *in.position() == '\n' || *in.position() == '\r') { break; } if (!((static_cast<UInt8>(*in.position()) & 0b11000000u) == 0b10000000u)) // learn from UTF8Helpers.h { tail_skip++; if (tail_skip>MAX_TAIL_SKIP) return false; } ++in.position(); } } if (after_point) x += static_cast<T>(shift10(after_point, after_point_exponent)); if (exponent) x = static_cast<T>(shift10(x, exponent)); if (negative) x = -x; auto num_characters_without_sign = in.count() - count_after_sign; /// Denormals. At most one character is read before denormal and it is '-'. if (num_characters_without_sign == 0) { if (in.eof()) return false; if (*in.position() == '+') { ++in.position(); if (in.eof()) return false; else if (negative) return false; } if (*in.position() == 'i' || *in.position() == 'I') { if (assertOrParseInfinity<false>(in)) { x = std::numeric_limits<T>::infinity(); if (negative) x = -x; return true; } return false; } else if (*in.position() == 'n' || *in.position() == 'N') { if (assertOrParseNaN<false>(in)) { x = std::numeric_limits<T>::quiet_NaN(); if (negative) x = -x; return true; } return false; } } return true; } template <typename T> bool readExcelIntTextImpl(T & x, DB::ReadBuffer & buf, bool has_quote, const DB::FormatSettings & settings) { bool number_force = settings.try_infer_integers==1; const UInt8 MAX_HEAD_SKIP = 2; const UInt8 MAX_TAIL_SKIP = 2; UInt8 head_skip=0; UInt8 tail_skip=0; using UnsignedT = make_unsigned_t<T>; bool negative = false; UnsignedT res{}; if (buf.eof()) return false; /// '+' or '-' bool has_sign = false; bool has_number = false; UInt32 length = 0; while (!buf.eof()) { if (*buf.position() == '+') { /// 123+ or +123+, just stop after 123 or +123. if (has_number) break; /// No digits read yet, but we already read sign, like ++, -+. if (has_sign) return false; has_sign = true; ++buf.position(); } else if (*buf.position() == '-') { if (has_number) break; if (has_sign) return false; if constexpr (is_signed_v<T>) negative = true; else return false; has_sign = true; ++buf.position(); } else if (*buf.position() == ',') { /// invalidate like 1,00010 if (checkNumberComma(buf, has_quote, settings)) { ++buf.position(); continue; } else break; } else if (*buf.position() == '.') { ++buf.position(); if (has_number) { while (!buf.eof()) { if (!(*buf.position() >= '0' && *buf.position() <= '9')) { if (number_force) break; else return false; } else { ++buf.position(); } } } else return false; } else if (*buf.position() >= '0' && *buf.position() <= '9') { has_number = true; ++length; if (length >= std::numeric_limits<T>::max_digits10) { if (negative) { T signed_res = -res; if (common::mulOverflow<T>(signed_res, 10, signed_res) || common::subOverflow<T>(signed_res, (*buf.position() - '0'), signed_res)) return false; res = -static_cast<UnsignedT>(signed_res); } else { T signed_res = res; if (common::mulOverflow<T>(signed_res, 10, signed_res) || common::addOverflow<T>(signed_res, (*buf.position() - '0'), signed_res)) return false; res = signed_res; } } else { res *= 10; res += *buf.position() - '0'; } ++buf.position(); } else if (!has_number && !has_sign && checkMoneySymbol(buf)) { continue; } else if (has_number && !(*buf.position() >= '0' && *buf.position() <= '9') && number_force) // process suffix { while (!buf.eof()) { if(*buf.position() == settings.csv.delimiter ||*buf.position() == '\'' ||*buf.position() == '\"' || *buf.position() == '\n' || *buf.position() == '\r') { break; } if (!((static_cast<UInt8>(*buf.position()) & 0b11000000u) == 0b10000000u)) // learn from UTF8Helpers.h { tail_skip++; if (tail_skip>MAX_TAIL_SKIP) return false; } ++buf.position(); } break; } else if (!has_number && !(*buf.position() >= '0' && *buf.position() <= '9') && number_force) // process prefix { if(*buf.position() == settings.csv.delimiter || *buf.position() == '\n' || *buf.position() == '\r') { break; } if (!((static_cast<UInt8>(*buf.position()) & 0b11000000u) == 0b10000000u)) // learn from UTF8Helpers.h { head_skip++; if (head_skip>MAX_HEAD_SKIP) return false; } ++buf.position(); } else break; } if (!has_number) return false; x = res; if constexpr (is_signed_v<T>) { if (negative) { x = -res; } } return true; } }