sql_utils/public/interval_value.cc (979 lines of code) (raw):

/* * Copyright 2023 Google LLC * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "sql_utils/public/interval_value.h" #include <cctype> #include <cmath> #include <limits> #include <ostream> #include <string> #include <type_traits> #include "sql_utils/public/functions/arithmetics.h" #include "sql_utils/public/functions/datetime.pb.h" #include "absl/base/casts.h" #include "absl/hash/hash.h" #include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/match.h" #include "absl/strings/numbers.h" #include "absl/strings/str_format.h" #include "sql_utils/base/endian.h" #include "re2/re2.h" #include "sql_utils/base/status_macros.h" namespace bigquery_ml_utils { namespace { inline ::bigquery_ml_utils_base::StatusBuilder MakeIntervalParsingError(absl::string_view input) { return MakeEvalError() << "Invalid INTERVAL value '" << input << "'"; } std::string ToString(int64_t value) { std::string s; uint64_t v = value; if (value < 0) { v = 0 - v; // MSVC 2013 errors on unary negation of unsigned. s += "-"; } if (v < int64_t{1000}) { absl::StrAppendFormat(&s, "%d", v); } else if (v >= int64_t{1000000000000000}) { // Number bigger than 1E15; use that notation. absl::StrAppendFormat(&s, "%0.3G", static_cast<double>(v)); } else { static const char units[] = "kMBT"; const char* unit = units; while (v >= int64_t{1000000}) { v /= int64_t{1000}; ++unit; SQL_CHECK(unit < units + ABSL_ARRAYSIZE(units)); } absl::StrAppendFormat(&s, "%.2f%c", v / 1000.0, *unit); } return s; } std::string Int128ToString(absl::int128 value) { if (value >= int64_t{1000000000000000} || value <= int64_t{-1000000000000000}) { // Number bigger than 1E15; use that notation. return absl::StrFormat("%0.3G", static_cast<double>(value)); } return ToString(static_cast<int64_t>(value)); } template <typename T> void UnalignedLoadAndAugmentPtr(T* value, const char** data) { // Note that the whole if-else block can be simplified to the following: // *value = bigquery_ml_utils_base::LittleEndian::Load<T>(*data); // Unfortunately, the opensource version does not have the generic // Load<T> function. That's why Load32 and Load64 functions are used. if constexpr (std::is_same_v<T, int32_t> || std::is_same_v<T, uint32_t>) { *value = static_cast<T>(bigquery_ml_utils_base::LittleEndian::Load32(*data)); } else { static_assert(std::is_same_v<T, int64_t> || std::is_same_v<T, uint64_t>, "UnalignedLoadAndAugmentPtr supports 32 bit and 64 bit " "signed or unsigned integer types only."); *value = static_cast<T>(bigquery_ml_utils_base::LittleEndian::Load64(*data)); } *data += sizeof(T); } } // namespace absl::StatusOr<IntervalValue> IntervalValue::FromYMDHMS( int64_t years, int64_t months, int64_t days, int64_t hours, int64_t minutes, int64_t seconds) { absl::Status status; int64_t year_months; if (!functions::Multiply(IntervalValue::kMonthsInYear, years, &year_months, &status)) { return status; } if (!functions::Add(months, year_months, &months, &status)) { return status; } // Int128 math cannot overflow __int128 nanos = kNanosInHour * hours + kNanosInMinute * minutes + kNanosInSecond * seconds; return FromMonthsDaysNanos(months, days, nanos); } size_t IntervalValue::HashCode() const { return absl::Hash<IntervalValue>()(*this); } absl::StatusOr<IntervalValue> IntervalValue::operator*(int64_t value) const { absl::Status status; int64_t months; if (!bigquery_ml_utils::functions::Multiply(get_months(), value, &months, &status)) { return absl::OutOfRangeError("Interval overflow during multiplication"); } int64_t days; if (!bigquery_ml_utils::functions::Multiply(get_days(), value, &days, &status)) { return absl::OutOfRangeError("Interval overflow during multiplication"); } FixedInt<64, 3> nanos = FixedInt<64, 3>(get_nanos()); nanos *= value; if (nanos > FixedInt<64, 3>(FixedInt<64, 2>::max()) || nanos < FixedInt<64, 3>(FixedInt<64, 2>::min())) { return absl::OutOfRangeError("Interval overflow during multiplication"); } return IntervalValue::FromMonthsDaysNanos(months, days, static_cast<__int128>(nanos)); } absl::StatusOr<IntervalValue> IntervalValue::operator/(int64_t value) const { if (value == 0) { return absl::OutOfRangeError("Interval division by zero"); } int64_t months = get_months() / value; int64_t months_remainder = get_months() % value; int64_t adjusted_days = get_days() + (months_remainder * IntervalValue::kDaysInMonth); int64_t days = adjusted_days / value; __int128 days_reminder = adjusted_days % value; FixedInt<64, 3> adjusted_nanos = FixedInt<64, 3>(get_nanos()); adjusted_nanos += FixedInt<64, 3>(days_reminder * IntervalValue::kNanosInDay); FixedInt<64, 3> nanos = adjusted_nanos; nanos /= FixedInt<64, 3>(value); if (nanos > FixedInt<64, 3>(FixedInt<64, 2>::max()) || nanos < FixedInt<64, 3>(FixedInt<64, 2>::min())) { return absl::OutOfRangeError("Interval overflow during division"); } return IntervalValue::FromMonthsDaysNanos(months, days, static_cast<__int128>(nanos)); } void IntervalValue::SumAggregator::Add(IntervalValue value) { months_ += value.get_months(); days_ += value.get_days(); nanos_ += FixedInt<64, 3>(value.get_nanos()); } absl::StatusOr<IntervalValue> IntervalValue::SumAggregator::GetSum() const { // It is unlikely that months/days will overflow int64_t, and that nanos will // overflow int128 - but check it nevertheless. if (months_ > std::numeric_limits<int64_t>::max() || months_ < std::numeric_limits<int64_t>::min() || days_ > std::numeric_limits<int64_t>::max() || days_ < std::numeric_limits<int64_t>::min() || nanos_ > FixedInt<64, 3>(FixedInt<64, 2>::max()) || nanos_ < FixedInt<64, 3>(FixedInt<64, 2>::min())) { return absl::OutOfRangeError("Interval overflow during Sum operation"); } return IntervalValue::FromMonthsDaysNanos(static_cast<int64_t>(months_), static_cast<int64_t>(days_), static_cast<__int128>(nanos_)); } absl::StatusOr<IntervalValue> IntervalValue::SumAggregator::GetAverage( int64_t count) const { SQL_DCHECK_GT(count, 0); // AVG(interval) = SUM(interval) / count, but SUM(interval) may not be a // valid interval (because of overflow), so we do manual division of parts // instead of building interval object and using it's division operator. __int128 months = months_ / count; __int128 months_remainder = months_ % count; __int128 adjusted_days = days_ + (months_remainder * IntervalValue::kDaysInMonth); __int128 days = adjusted_days / count; __int128 days_reminder = adjusted_days % count; FixedInt<64, 3> adjusted_nanos = FixedInt<64, 3>(nanos_); adjusted_nanos += FixedInt<64, 3>(days_reminder * IntervalValue::kNanosInDay); FixedInt<64, 3> nanos = adjusted_nanos; nanos /= FixedInt<64, 3>(count); // It is unlikely that months/days will overflow int64_t, and that nanos will // overflow int128 - but check it nevertheless. if (months > std::numeric_limits<int64_t>::max() || months < std::numeric_limits<int64_t>::min() || days > std::numeric_limits<int64_t>::max() || days < std::numeric_limits<int64_t>::min() || nanos > FixedInt<64, 3>(FixedInt<64, 2>::max()) || nanos < FixedInt<64, 3>(FixedInt<64, 2>::min())) { return absl::OutOfRangeError("Interval overflow during Avg operation"); } return IntervalValue::FromMonthsDaysNanos(static_cast<int64_t>(months), static_cast<int64_t>(days), static_cast<__int128>(nanos)); } std::string IntervalValue::SumAggregator::SerializeAsProtoBytes() const { std::string result; SerializeAndAppendToProtoBytes(&result); return result; } void IntervalValue::SumAggregator::SerializeAndAppendToProtoBytes( std::string* bytes) const { absl::uint128 months = bigquery_ml_utils_base::LittleEndian::FromHost128(months_); bytes->append(reinterpret_cast<const char*>(&months), sizeof(months)); absl::uint128 days = bigquery_ml_utils_base::LittleEndian::FromHost128(days_); bytes->append(reinterpret_cast<const char*>(&days), sizeof(days)); nanos_.SerializeToBytes(bytes); } absl::StatusOr<IntervalValue::SumAggregator> IntervalValue::SumAggregator::DeserializeFromProtoBytes( absl::string_view bytes) { IntervalValue::SumAggregator aggregator; if (bytes.empty()) { return aggregator; } if (bytes.size() < sizeof(absl::uint128) * 2) { return absl::OutOfRangeError( "Invalid serialized INTERVAL::SumAggregator size too small"); } const char* ptr = reinterpret_cast<const char*>(bytes.data()); aggregator.months_ = static_cast<__int128>( bigquery_ml_utils_base::LittleEndian::ToHost128(*absl::bit_cast<absl::uint128*>(ptr))); ptr += sizeof(absl::uint128); aggregator.days_ = static_cast<__int128>( bigquery_ml_utils_base::LittleEndian::ToHost128(*absl::bit_cast<absl::uint128*>(ptr))); ptr += sizeof(absl::uint128); if (!aggregator.nanos_.DeserializeFromBytes( bytes.substr(sizeof(absl::uint128) * 2))) { return absl::OutOfRangeError( "Invalid serialized INTERVAL::SumAggregator failed to deserialize " "nanos"); } return aggregator; } std::string IntervalValue::SumAggregator::DebugString() const { return absl::StrCat( "IntervalValue::SumAggregator (months=", Int128ToString(months_), ", days=", Int128ToString(days_), ", nanos=", nanos_.ToString(), ")"); } void IntervalValue::SumAggregator::MergeWith(const SumAggregator& other) { months_ += other.months_; days_ += other.days_; nanos_ += other.nanos_; } void IntervalValue::SerializeAndAppendToBytes(std::string* bytes) const { int64_t micros = bigquery_ml_utils_base::LittleEndian::FromHost64(micros_); bytes->append(reinterpret_cast<const char*>(&micros), sizeof(micros)); int32_t days = bigquery_ml_utils_base::LittleEndian::FromHost32(days_); bytes->append(reinterpret_cast<const char*>(&days), sizeof(days)); uint32_t months_nanos = bigquery_ml_utils_base::LittleEndian::FromHost32(months_nanos_); bytes->append(reinterpret_cast<const char*>(&months_nanos), sizeof(months_nanos)); } absl::StatusOr<IntervalValue> IntervalValue::DeserializeFromBytes( absl::string_view bytes) { // Empty translates to interval value 0 if (bytes.empty()) { return IntervalValue(); } if (bytes.size() != sizeof(IntervalValue)) { return absl::OutOfRangeError(absl::StrCat( "Invalid serialized INTERVAL size, expected ", sizeof(IntervalValue), " bytes, but got ", bytes.size(), " bytes.")); } const char* data = bytes.data(); IntervalValue interval; UnalignedLoadAndAugmentPtr(&interval.micros_, &data); UnalignedLoadAndAugmentPtr(&interval.days_, &data); UnalignedLoadAndAugmentPtr(&interval.months_nanos_, &data); SQL_RETURN_IF_ERROR(ValidateMonths(interval.get_months())); SQL_RETURN_IF_ERROR(ValidateDays(interval.get_days())); SQL_RETURN_IF_ERROR(ValidateNanos(interval.get_nanos())); return interval; } std::string IntervalValue::ToString() const { // Interval conversion to string always uses fully expanded form: // [<sign>]x-x [<sign>]x [<sign>]x:x:x[.ddd[ddd[ddd]]] // Year-Month part int64_t total_months = std::abs(get_months()); int64_t years = total_months / 12; int64_t months = total_months % 12; // Hour:Minute:Second and optional second fractions part. __int128 total_nanos = get_nanos(); bool negative_nanos = false; if (total_nanos < 0) { // Cannot overflow because valid range of nanos is smaller than most // negative value. total_nanos = -total_nanos; negative_nanos = true; } int64_t hours = total_nanos / kNanosInHour; total_nanos -= hours * kNanosInHour; int64_t minutes = total_nanos / kNanosInMinute; total_nanos -= minutes * kNanosInMinute; int64_t seconds = total_nanos / kNanosInSecond; total_nanos -= seconds * kNanosInSecond; bool has_millis = total_nanos != 0; int64_t millis = total_nanos / kNanosInMilli; total_nanos -= millis * kNanosInMilli; bool has_micros = total_nanos != 0; int64_t micros = total_nanos / kNanosInMicro; int64_t nanos = total_nanos % kNanosInMicro; std::string result = absl::StrFormat( "%s%d-%d %d %s%d:%d:%d", get_months() < 0 ? "-" : "", years, months, get_days(), negative_nanos ? "-" : "", hours, minutes, seconds); // Fractions of second always come in group of 3 if (has_millis) { absl::StrAppendFormat(&result, ".%03d", millis); if (has_micros) { absl::StrAppendFormat(&result, "%03d", micros); if (nanos != 0) { absl::StrAppendFormat(&result, "%03d", nanos); } } } return result; } std::string IntervalValue::ToISO8601() const { int64_t years = get_months() / 12; int64_t months = get_months() % 12; int64_t days = get_days(); __int128 total_nanos = get_nanos(); int64_t hours = total_nanos / kNanosInHour; int64_t minutes = (total_nanos % kNanosInHour) / kNanosInMinute; int64_t seconds = (total_nanos % kNanosInMinute) / kNanosInSecond; int64_t subseconds = total_nanos % kNanosInSecond; std::string result("P"); if (years != 0) absl::StrAppend(&result, years, "Y"); if (months != 0) absl::StrAppend(&result, months, "M"); if (days != 0) absl::StrAppend(&result, days, "D"); if (total_nanos != 0) absl::StrAppend(&result, "T"); if (hours != 0) absl::StrAppend(&result, hours, "H"); if (minutes != 0) absl::StrAppend(&result, minutes, "M"); if (seconds != 0 || subseconds != 0) { if (subseconds == 0) { absl::StrAppend(&result, seconds, "S"); } else { if (seconds != 0) { absl::StrAppend(&result, seconds, "."); } else if (total_nanos < 0) { absl::StrAppend(&result, "-0."); } else { absl::StrAppend(&result, "0."); } // Print fractions of a second without trailing zeros if (subseconds < 0) subseconds = -subseconds; for (int64_t factor : {100000000, 10000000, 1000000, 100000, 10000, 1000, 100, 10, 1}) { int64_t digit = subseconds / factor; absl::StrAppend(&result, digit); subseconds %= factor; if (subseconds == 0) { break; } } absl::StrAppend(&result, "S"); } } if (result.size() == 1) absl::StrAppend(&result, "0Y"); return result; } absl::StatusOr<int64_t> NanosFromFractionDigits(absl::string_view input, absl::string_view digits) { int64_t nano_fractions; if (!absl::SimpleAtoi(digits, &nano_fractions)) { return MakeIntervalParsingError(input); } if (digits.size() > 9) { return MakeIntervalParsingError(input); } // Add enough zeros at the end to get nanoseconds. The maximum value is // limited by 10^10, hence cannot overflow for (int i = 0; i < 9 - digits.size(); i++) { nano_fractions *= 10; } return nano_fractions; } // Pattern for interval seconds: [+|-][s][.ddddddddd]. We only use it when // there is a decimal dot in the input, therefore fractions are not optional. const LazyRE2 kRESecond = {R"(([-+])?(\d*)\.(\d+))"}; absl::StatusOr<IntervalValue> IntervalValue::ParseFromString( absl::string_view input, functions::DateTimestampPart part) { absl::Status status; // SimpleAtoi ignores leading and trailing spaces, but we reject them. if (input.empty() || std::isspace(input.front()) || std::isspace(input.back())) { return MakeIntervalParsingError(input); } // Seconds are special, because they allow fractions if (part == functions::SECOND && input.find('.') != input.npos) { absl::string_view sign; absl::string_view seconds_text; absl::string_view digits; // [+|-][s][.ddddddddd] - capture sign, seconds and digits of fractions. if (!RE2::FullMatch(input, *kRESecond, &sign, &seconds_text, &digits)) { return MakeIntervalParsingError(input); } int64_t seconds = 0; if (!seconds_text.empty()) { // This SimpleAtoi can fail if there were too many digits for seconds. if (!absl::SimpleAtoi(seconds_text, &seconds)) { return MakeIntervalParsingError(input); } } SQL_RET_CHECK(!digits.empty()); SQL_ASSIGN_OR_RETURN(__int128 nano_fractions, NanosFromFractionDigits(input, digits)); // Result always fits into int128 __int128 nanos = IntervalValue::kNanosInSecond * seconds + nano_fractions; bool negative = !sign.empty() && sign[0] == '-'; if (negative) { nanos = -nanos; } return IntervalValue::FromNanos(nanos); } int64_t value; if (!absl::SimpleAtoi(input, &value)) { return MakeIntervalParsingError(input); } switch (part) { case functions::YEAR: if (!functions::Multiply(IntervalValue::kMonthsInYear, value, &value, &status)) { return status; } return IntervalValue::FromMonths(value); case functions::QUARTER: if (!functions::Multiply(IntervalValue::kMonthsInQuarter, value, &value, &status)) { return status; } return IntervalValue::FromMonths(value); case functions::MONTH: return IntervalValue::FromMonths(value); case functions::WEEK: if (!functions::Multiply(IntervalValue::kDaysInWeek, value, &value, &status)) { return status; } return IntervalValue::FromDays(value); case functions::DAY: return IntervalValue::FromDays(value); case functions::HOUR: if (!functions::Multiply(IntervalValue::kMicrosInHour, value, &value, &status)) { return status; } return IntervalValue::FromMicros(value); case functions::MINUTE: if (!functions::Multiply(IntervalValue::kMicrosInMinute, value, &value, &status)) { return status; } return IntervalValue::FromMicros(value); case functions::SECOND: if (!functions::Multiply(IntervalValue::kMicrosInSecond, value, &value, &status)) { return status; } return IntervalValue::FromMicros(value); default: return MakeEvalError() << "Unsupported interval datetime field " << functions::DateTimestampPart_Name(part); } } // Regular expressions for parsing two datetime part intervals. // YEAR TO MONTH '[+|-]x-x const LazyRE2 kREYearToMonth = {R"(([-+])?(\d+)-(\d+))"}; // YEAR TO DAY '[+|-]x-x [+|-]x' const LazyRE2 kREYearToDay = {R"(([-+])?(\d+)-(\d+) ([-+]?\d+))"}; // YEAR TO HOUR '[+|-]x-x [+|-]x [+|-]x' const LazyRE2 kREYearToHour = {R"(([-+])?(\d+)-(\d+) ([-+]?\d+) ([-+])?(\d+))"}; // YEAR TO MINUTE '[+|-]x-x [+|-]x [+|-]x:x' const LazyRE2 kREYearToMinute = { R"(([-+])?(\d+)-(\d+) ([-+]?\d+) ([-+])?(\d+):(\d+))"}; // YEAR TO SECOND '[+|-]x-x [+|-]x [+|-]x:x:x[.ddddddddd]' const LazyRE2 kREYearToSecond = { R"(([-+])?(\d+)-(\d+) ([-+]?\d+) ([-+])?(\d+):(\d+):(\d+))"}; const LazyRE2 kREYearToSecondFractions = { R"(([-+])?(\d+)-(\d+) ([-+]?\d+) ([-+])?(\d+):(\d+):(\d+)\.(\d+))"}; // MONTH TO DAY '[+|-]x [+|-]x' const LazyRE2 kREMonthToDay = {R"(([-+])?(\d+) ([-+]?\d+))"}; // MONTH TO HOUR '[+|-]x [+|-]x [+|-]x' const LazyRE2 kREMonthToHour = {R"(([-+])?(\d+) ([-+]?\d+) ([-+])?(\d+))"}; // MONTH TO MINUTE '[+|-]x [+|-]x [+|-]x:x' const LazyRE2 kREMonthToMinute = { R"(([-+])?(\d+) ([-+]?\d+) ([-+])?(\d+):(\d+))"}; // MONTH TO SECOND '[+|-]x [+|-]]x [+|-]x:x:x[.ddddddddd]' const LazyRE2 kREMonthToSecond = { R"(([-+])?(\d+) ([-+]?\d+) ([-+])?(\d+):(\d+):(\d+))"}; const LazyRE2 kREMonthToSecondFractions = { R"(([-+])?(\d+) ([-+]?\d+) ([-+])?(\d+):(\d+):(\d+)\.(\d+))"}; // DAY TO HOUR '[+|-]x [+|-]x' const LazyRE2 kREDayToHour = {R"(([-+]?\d+) ([-+])?(\d+))"}; // DAY TO MINUTE '[+|-]x [+|-]x:x' const LazyRE2 kREDayToMinute = {R"(([-+]?\d+) ([-+])?(\d+):(\d+))"}; // DAY TO SECOND '[+|-]x [+|-]x:x:x[.ddddddddd]' const LazyRE2 kREDayToSecond = {R"(([-+]?\d+) ([-+])?(\d+):(\d+):(\d+))"}; const LazyRE2 kREDayToSecondFractions = { R"(([-+]?\d+) ([-+])?(\d+):(\d+):(\d+)\.(\d+))"}; // HOUR TO MINUTE '[+|-]x:x' const LazyRE2 kREHourToMinute = {R"(([-+])?(\d+):(\d+))"}; // HOUR TO SECOND '[+|-]x:x:x[.ddddddddd]' const LazyRE2 kREHourToSecond = {R"(([-+])?(\d+):(\d+):(\d+))"}; const LazyRE2 kREHourToSecondFractions = {R"(([-+])?(\d+):(\d+):(\d+)\.(\d+))"}; // MINUTE TO SECOND '[+|-]x:x[.ddddddddd]' const LazyRE2 kREMinuteToSecond = {R"(([-+])?(\d+):(\d+))"}; const LazyRE2 kREMinuteToSecondFractions = {R"(([-+])?(\d+):(\d+)\.(\d+))"}; absl::StatusOr<IntervalValue> IntervalValue::ParseFromString( absl::string_view input, functions::DateTimestampPart from, functions::DateTimestampPart to) { // Sign (empty, '-' or '+') for months and nano fields. There is no special // treatment for sign of days, because days are standalone number and are // matched and parsed by RE2 as part of ([-+]?\d+) group. std::string sign_months; std::string sign_nanos; // All the datetime fields int64_t years = 0; int64_t months = 0; int64_t days = 0; int64_t hours = 0; int64_t minutes = 0; int64_t seconds = 0; // Fractions of seconds absl::string_view fraction_digits; // Indication whether parsing succeeded. bool parsed = false; // Seconds are special, because they can have optional fractions if (to == functions::SECOND && input.find('.') != input.npos) { switch (from) { case functions::YEAR: parsed = RE2::FullMatch(input, *kREYearToSecondFractions, &sign_months, &years, &months, &days, &sign_nanos, &hours, &minutes, &seconds, &fraction_digits); break; case functions::MONTH: parsed = RE2::FullMatch(input, *kREMonthToSecondFractions, &sign_months, &months, &days, &sign_nanos, &hours, &minutes, &seconds, &fraction_digits); break; case functions::DAY: parsed = RE2::FullMatch(input, *kREDayToSecondFractions, &days, &sign_nanos, &hours, &minutes, &seconds, &fraction_digits); break; case functions::HOUR: parsed = RE2::FullMatch(input, *kREHourToSecondFractions, &sign_nanos, &hours, &minutes, &seconds, &fraction_digits); break; case functions::MINUTE: parsed = RE2::FullMatch(input, *kREMinuteToSecondFractions, &sign_nanos, &minutes, &seconds, &fraction_digits); break; default: return MakeEvalError() << "Invalid interval datetime fields: " << functions::DateTimestampPart_Name(from) << " TO " << functions::DateTimestampPart_Name(to); } } else { #define DATETIME_PARTS(from, to) (from << 16 | to) switch (DATETIME_PARTS(from, to)) { case DATETIME_PARTS(functions::YEAR, functions::MONTH): parsed = RE2::FullMatch(input, *kREYearToMonth, &sign_months, &years, &months); break; case DATETIME_PARTS(functions::YEAR, functions::DAY): parsed = RE2::FullMatch(input, *kREYearToDay, &sign_months, &years, &months, &days); break; case DATETIME_PARTS(functions::YEAR, functions::HOUR): parsed = RE2::FullMatch(input, *kREYearToHour, &sign_months, &years, &months, &days, &sign_nanos, &hours); break; case DATETIME_PARTS(functions::YEAR, functions::MINUTE): parsed = RE2::FullMatch(input, *kREYearToMinute, &sign_months, &years, &months, &days, &sign_nanos, &hours, &minutes); break; case DATETIME_PARTS(functions::YEAR, functions::SECOND): parsed = RE2::FullMatch(input, *kREYearToSecond, &sign_months, &years, &months, &days, &sign_nanos, &hours, &minutes, &seconds); break; case DATETIME_PARTS(functions::MONTH, functions::DAY): parsed = RE2::FullMatch(input, *kREMonthToDay, &sign_months, &months, &days); break; case DATETIME_PARTS(functions::MONTH, functions::HOUR): parsed = RE2::FullMatch(input, *kREMonthToHour, &sign_months, &months, &days, &sign_nanos, &hours); break; case DATETIME_PARTS(functions::MONTH, functions::MINUTE): parsed = RE2::FullMatch(input, *kREMonthToMinute, &sign_months, &months, &days, &sign_nanos, &hours, &minutes); break; case DATETIME_PARTS(functions::MONTH, functions::SECOND): parsed = RE2::FullMatch(input, *kREMonthToSecond, &sign_months, &months, &days, &sign_nanos, &hours, &minutes, &seconds); break; case DATETIME_PARTS(functions::DAY, functions::HOUR): parsed = RE2::FullMatch(input, *kREDayToHour, &days, &sign_nanos, &hours); break; case DATETIME_PARTS(functions::DAY, functions::MINUTE): parsed = RE2::FullMatch(input, *kREDayToMinute, &days, &sign_nanos, &hours, &minutes); break; case DATETIME_PARTS(functions::DAY, functions::SECOND): parsed = RE2::FullMatch(input, *kREDayToSecond, &days, &sign_nanos, &hours, &minutes, &seconds); break; case DATETIME_PARTS(functions::HOUR, functions::MINUTE): parsed = RE2::FullMatch(input, *kREHourToMinute, &sign_nanos, &hours, &minutes); break; case DATETIME_PARTS(functions::HOUR, functions::SECOND): parsed = RE2::FullMatch(input, *kREHourToSecond, &sign_nanos, &hours, &minutes, &seconds); break; case DATETIME_PARTS(functions::MINUTE, functions::SECOND): parsed = RE2::FullMatch(input, *kREMinuteToSecond, &sign_nanos, &minutes, &seconds); break; default: return MakeEvalError() << "Invalid interval datetime fields: " << functions::DateTimestampPart_Name(from) << " TO " << functions::DateTimestampPart_Name(to); } #undef DATETIME_PARTS } if (!parsed) { return MakeIntervalParsingError(input); } absl::Status status; int64_t years_as_months; if (!functions::Multiply(IntervalValue::kMonthsInYear, years, &years_as_months, &status)) { return status; } if (!functions::Add(years_as_months, months, &months, &status)) { return status; } bool negative_months = !sign_months.empty() && sign_months[0] == '-'; if (negative_months) { months = -months; } // Result always fits into int128. __int128 nanos = IntervalValue::kNanosInHour * hours + IntervalValue::kNanosInMinute * minutes + IntervalValue::kNanosInSecond * seconds; if (!fraction_digits.empty()) { SQL_ASSIGN_OR_RETURN(int64_t nano_fractions, NanosFromFractionDigits(input, fraction_digits)); nanos += nano_fractions; } bool negative_nanos = !sign_nanos.empty() && sign_nanos[0] == '-'; if (negative_nanos) { nanos = -nanos; } return IntervalValue::FromMonthsDaysNanos(months, days, nanos); } absl::StatusOr<IntervalValue> IntervalValue::ParseFromString( absl::string_view input) { // We can unambiguously determine possible datetime fields by counting number // of spaces, colons and dashes after digit in the input // (dash before digit could be a minus sign) // // ------------------+-------------+--------+--------+--------------------+ // Datetime fields | Format | Spaces | Colons | Dashes after digit | // ------------------+-------------+--------+--------+--------------------+ // YEAR TO SECOND | Y-M D H:M:S | 2 | 2 | 1 | // YEAR TO MINUTE | Y-M D H:M | 2 | 1 | 1 | // YEAR TO HOUR | Y-M D H | 2 | 0 | 1 | // YEAR TO DAY | Y-M D | 1 | 0 | 1 | // YEAR TO MONTH | Y-M | 0 | 0 | 1 | // MONTH TO HOUR | M D H | 2 | 0 | 0 | // MONTH TO MINUTE | M D H:M | 2 | 1 | 0 | // MONTH TO SECOND | M D H:M:S | 2 | 2 | 0 | // DAY TO MINUTE | D H:M | 1 | 1 | 0 | // DAY TO SECOND | D H:M:S | 1 | 2 | 0 | // HOUR TO SECOND | H:M:S | 0 | 2 | 0 | // ------------------+-------------+--------+--------+--------------------+ char p = '\0'; int spaces = 0; int colons = 0; int dashes = 0; for (char c : input) { if (c == ' ') { spaces++; } else if (c == ':') { colons++; } else if (c == '-' && std::isdigit(p)) { dashes++; } p = c; } #define SCD(s, c, d) ((s)*100 + (c)*10 + d) using functions::DAY; using functions::HOUR; using functions::MINUTE; using functions::MONTH; using functions::SECOND; using functions::YEAR; switch (SCD(spaces, colons, dashes)) { case SCD(2, 2, 1): return IntervalValue::ParseFromString(input, YEAR, SECOND); case SCD(2, 1, 1): return IntervalValue::ParseFromString(input, YEAR, MINUTE); case SCD(2, 0, 1): return IntervalValue::ParseFromString(input, YEAR, HOUR); case SCD(1, 0, 1): return IntervalValue::ParseFromString(input, YEAR, DAY); case SCD(0, 0, 1): return IntervalValue::ParseFromString(input, YEAR, MONTH); case SCD(2, 0, 0): return IntervalValue::ParseFromString(input, MONTH, HOUR); case SCD(2, 1, 0): return IntervalValue::ParseFromString(input, MONTH, MINUTE); case SCD(2, 2, 0): return IntervalValue::ParseFromString(input, MONTH, SECOND); case SCD(1, 1, 0): return IntervalValue::ParseFromString(input, DAY, MINUTE); case SCD(1, 2, 0): return IntervalValue::ParseFromString(input, DAY, SECOND); case SCD(0, 2, 0): return IntervalValue::ParseFromString(input, HOUR, SECOND); } #undef SCD return MakeIntervalParsingError(input); } namespace { const LazyRE2 kRENumber = {R"((\d+)(\.|\,)?(\d+)?)"}; // Parser for ISO 8601 Duration format with following modifications: // - negative datetime parts are allowed // - multiple dateparts of same type are allowed // - order of dateparts can be arbitrary // - 'W' can be used for weeks in the date portion // - Only seconds can have fractional numbers class ISO8601Parser { const char kEof = '\0'; public: absl::StatusOr<IntervalValue> Parse(absl::string_view input) { input_ = input; char c = GetChar(); if (c != 'P') { return MakeIntervalParsingError(input) << ": Interval must start with 'P'"; } if (input_.empty()) { return MakeIntervalParsingError(input) << ": At least one datetime part must be defined in the interval"; } absl::Status status; // When true - parsing time part (after T), when false - parsing date part. bool in_time_part = false; int64_t years = 0; int64_t months = 0; int64_t weeks = 0; int64_t days = 0; int64_t hours = 0; int64_t minutes = 0; int64_t seconds = 0; int64_t nano_fractions = 0; for (;;) { int64_t sign = false; c = PeekChar(); if (c == kEof) { break; } if (!std::isdigit(c)) { GetChar(); if (c == '-') { // Proceed to parse the number and make it negative later sign = true; } else if (c == 'T') { // Switching from date to time part if (in_time_part) { return MakeIntervalParsingError(input) << ": Unexpected duplicate time separator 'T'"; } in_time_part = true; continue; } else { return MakeIntervalParsingError(input) << ": Unexpected " << PrintChar(c); } } // We now expect to see positive number (possibly with fractional digits) // followed by datetime part letter. SQL_RETURN_IF_ERROR(ParseNumber()); int64_t number; if (!absl::SimpleAtoi(digits_, &number)) { return MakeIntervalParsingError(input) << ": Cannot convert '" << digits_ << "' to integer"; } // number couldn't have been negative, so no worries about underflow // of int64_t::min if (sign) number = -number; c = GetChar(); if (!in_time_part) { switch (c) { case 'Y': if (!functions::Add(years, number, &years, &status)) { return status; } break; case 'M': if (!functions::Add(months, number, &months, &status)) { return status; } break; case 'W': if (!functions::Add(weeks, number, &weeks, &status)) { return status; } break; case 'D': if (!functions::Add(days, number, &days, &status)) { return status; } break; default: return MakeIntervalParsingError(input) << ": Unexpected " << PrintChar(c) << " in the date portion of interval"; } } else { switch (c) { case 'H': if (!functions::Add(hours, number, &hours, &status)) { return status; } break; case 'M': if (!functions::Add(minutes, number, &minutes, &status)) { return status; } break; case 'S': if (!functions::Add(seconds, number, &seconds, &status)) { return status; } if (!decimal_point_.empty()) { SQL_ASSIGN_OR_RETURN( number, NanosFromFractionDigits(input_, decimal_digits_)); if (sign) number = -number; nano_fractions += number; } break; default: return MakeIntervalParsingError(input) << ": Unexpected " << PrintChar(c) << " in the time portion of interval"; } } if (!decimal_point_.empty() && c != 'S') { return MakeIntervalParsingError(input) << ": Fractional values are only allowed for " "seconds part 'S', but were used for " << PrintChar(c); } } int64_t year_months; if (!functions::Multiply(IntervalValue::kMonthsInYear, years, &year_months, &status)) { return status; } if (!functions::Add(months, year_months, &months, &status)) { return status; } int64_t week_days; if (!functions::Multiply(IntervalValue::kDaysInWeek, weeks, &week_days, &status)) { return status; } if (!functions::Add(days, week_days, &days, &status)) { return status; } // Int128 math cannot overflow __int128 nanos = IntervalValue::kNanosInHour * hours + IntervalValue::kNanosInMinute * minutes + IntervalValue::kNanosInSecond * seconds + nano_fractions; return IntervalValue::FromMonthsDaysNanos(months, days, nanos); } private: absl::Status ParseNumber() { digits_ = {}; decimal_point_ = {}; decimal_digits_ = {}; if (!RE2::Consume(&input_, *kRENumber, &digits_, &decimal_point_, &decimal_digits_)) { return MakeEvalError() << "Expected number"; } return absl::OkStatus(); } char PeekChar() const { if (input_.empty()) { return kEof; } return input_[0]; } char GetChar() { if (input_.empty()) { return kEof; } char c = input_[0]; input_.remove_prefix(1); return c; } std::string PrintChar(char c) const { if (c == kEof) return "end of input"; return absl::StrCat("'", std::string(1, c), "'"); } // Points to the current position being parsed in input absl::string_view input_; // Parsed digits before decimal dot absl::string_view digits_; // Decimal dot itself (needed to detect trailing dot) absl::string_view decimal_point_; // Digits after the decimal dot absl::string_view decimal_digits_; }; } // namespace absl::StatusOr<IntervalValue> IntervalValue::ParseFromISO8601( absl::string_view input) { ISO8601Parser parser; return parser.Parse(input); } absl::StatusOr<IntervalValue> IntervalValue::Parse(absl::string_view input) { if (absl::StartsWith(input, "P")) { return ParseFromISO8601(input); } return ParseFromString(input); } absl::StatusOr<IntervalValue> IntervalValue::FromInteger( int64_t value, functions::DateTimestampPart part) { switch (part) { case functions::YEAR: return IntervalValue::FromYMDHMS(value, 0, 0, 0, 0, 0); case functions::MONTH: return IntervalValue::FromYMDHMS(0, value, 0, 0, 0, 0); case functions::DAY: return IntervalValue::FromYMDHMS(0, 0, value, 0, 0, 0); case functions::HOUR: return IntervalValue::FromYMDHMS(0, 0, 0, value, 0, 0); case functions::MINUTE: return IntervalValue::FromYMDHMS(0, 0, 0, 0, value, 0); case functions::SECOND: return IntervalValue::FromYMDHMS(0, 0, 0, 0, 0, value); case functions::QUARTER: { absl::Status status; if (!functions::Multiply(IntervalValue::kMonthsInQuarter, value, &value, &status)) { return status; } return IntervalValue::FromYMDHMS(0, value, 0, 0, 0, 0); } case functions::WEEK: { absl::Status status; if (!functions::Multiply(IntervalValue::kDaysInWeek, value, &value, &status)) { return status; } return IntervalValue::FromYMDHMS(0, 0, value, 0, 0, 0); } default: return MakeEvalError() << "Invalid interval datetime field " << functions::DateTimestampPart_Name(part); } } absl::StatusOr<int64_t> IntervalValue::Extract( functions::DateTimestampPart part) const { switch (part) { case functions::YEAR: return get_months() / kMonthsInYear; case functions::MONTH: return get_months() % kMonthsInYear; case functions::DAY: return get_days(); case functions::HOUR: return get_nanos() / kNanosInHour; case functions::MINUTE: return (get_nanos() % kNanosInHour) / kNanosInMinute; case functions::SECOND: return (get_nanos() % kNanosInMinute) / kNanosInSecond; case functions::MILLISECOND: return (get_nanos() % kNanosInSecond) / kNanosInMilli; case functions::MICROSECOND: return (get_nanos() % kNanosInSecond) / kNanosInMicro; case functions::NANOSECOND: return (get_nanos() % kNanosInSecond); default: break; // fall through } __int128 total_nanos = get_nanos(); bool negative_nanos = false; if (total_nanos < 0) { // Cannot overflow because valid range of nanos is smaller than most // negative value. total_nanos = -total_nanos; negative_nanos = true; } int64_t value; switch (part) { case functions::HOUR: value = total_nanos / kNanosInHour; break; case functions::MINUTE: value = (total_nanos % kNanosInHour) / kNanosInMinute; break; case functions::SECOND: value = (total_nanos % kNanosInMinute) / kNanosInSecond; break; case functions::MILLISECOND: value = (total_nanos % kNanosInSecond) / kNanosInMilli; break; case functions::MICROSECOND: value = (total_nanos % kNanosInMilli) / kNanosInMicro; break; case functions::NANOSECOND: value = total_nanos % kNanosInMicro; break; default: return absl::OutOfRangeError( absl::StrFormat("Unsupported date part %s in EXTRACT FROM INTERVAL", functions::DateTimestampPart_Name(part))); } if (negative_nanos) { value = -value; } return value; } std::ostream& operator<<(std::ostream& out, IntervalValue value) { return out << value.ToString(); } absl::StatusOr<IntervalValue> JustifyHours(const IntervalValue& v) { __int128 nanos = v.get_nanos(); int64_t days = v.get_days() + nanos / IntervalValue::kNanosInDay; nanos = nanos % IntervalValue::kNanosInDay; if (days > 0 && nanos < 0) { nanos += IntervalValue::kNanosInDay; days--; } else if (days < 0 && nanos > 0) { nanos -= IntervalValue::kNanosInDay; days++; } return IntervalValue::FromMonthsDaysNanos(v.get_months(), days, nanos); } absl::StatusOr<IntervalValue> JustifyDays(const IntervalValue& v) { int64_t months = v.get_months() + v.get_days() / IntervalValue::kDaysInMonth; int64_t days = v.get_days() % IntervalValue::kDaysInMonth; if (months > 0 && days < 0) { days += IntervalValue::kDaysInMonth; months--; } else if (months < 0 && days > 0) { days -= IntervalValue::kDaysInMonth; months++; } return IntervalValue::FromMonthsDaysNanos(months, days, v.get_nanos()); } absl::StatusOr<IntervalValue> JustifyInterval(const IntervalValue& v) { __int128 nanos = v.get_nanos(); int64_t days = v.get_days() + nanos / IntervalValue::kNanosInDay; nanos = nanos % IntervalValue::kNanosInDay; int64_t months = v.get_months() + days / IntervalValue::kDaysInMonth; days = days % IntervalValue::kDaysInMonth; // This logic might be non-intuitive, but it repeats the logic in Postgres // for making sure all datetime parts have same sign. if (months > 0 && (days < 0 || (days == 0 && nanos < 0))) { days += IntervalValue::kDaysInMonth; months--; } else if (months < 0 && (days > 0 || (days == 0 && nanos > 0))) { days -= IntervalValue::kDaysInMonth; months++; } if (days > 0 && nanos < 0) { nanos += IntervalValue::kNanosInDay; days--; } else if (days < 0 && nanos > 0) { nanos -= IntervalValue::kNanosInDay; days++; } return IntervalValue::FromMonthsDaysNanos(months, days, nanos); } } // namespace bigquery_ml_utils