sql_utils/public/interval_value.h (277 lines of code) (raw):
/*
* Copyright 2023 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef THIRD_PARTY_PY_BIGQUERY_ML_UTILS_SQL_UTILS_PUBLIC_INTERVAL_VALUE_H_
#define THIRD_PARTY_PY_BIGQUERY_ML_UTILS_SQL_UTILS_PUBLIC_INTERVAL_VALUE_H_
#include <cstdint>
#include <ostream>
#include <string>
#include "sql_utils/common/errors.h"
#include "sql_utils/common/multiprecision_int.h"
#include "sql_utils/public/functions/datetime.pb.h"
#include "absl/status/status.h"
#include "absl/status/statusor.h"
#include "absl/strings/str_join.h"
#include "sql_utils/base/status_macros.h"
namespace bigquery_ml_utils {
class IntervalValue final {
// The INTERVAL value is composed of 3 fields:
// 1. Number of months
// 2. Number of days
// 3. Number of nanoseconds
//
// Each field should be able to cover 10,000 years with sign. The required
// number of bits for each field is:
// Months - 18 bits
// Days - 23 bits
// Microseconds - 59 bits
// Nanoseconds - 69 bits
// Nanoseconds fraction of microseconds - 10 bits.
// Two of the most used fields - micros and days get int64_t and int32_t
// parts of interval.
//
// 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6
// | micros | days | months and |
// nano fractions
//
// Months and nano fractions of microsecond take the rest of 32 bits.
//
// 01234567890123456789012345678901
// | months | nano |
// | | fractions |
//
// Months are always stored as positive numbers, and the highest bit is used
// to store the sign information: 0 for positive, 1 for negative.
// Nano fractions are always positive, when needed micros value is adjusted.
// This allows single canonical representation of nanos, i.e. for nanos=-1,
// it will be stored as micros=-1 and nano_fractions=999.
static const uint32_t kMonthSignMask = 0x80000000;
static const uint32_t kMonthsMask = 0x7FFFE000;
static const uint32_t kMonthsShift = 13;
static const uint32_t kNanosMask = 0x000003FF;
static const uint32_t kNanosShift = 0;
public:
static const int64_t kMonthsInYear = 12;
static const int64_t kMonthsInQuarter = 3;
static const int64_t kHoursInDay = 24;
static const int64_t kMinutesInHour = 60;
static const int64_t kSecondsInMinute = 60;
static const int64_t kMicrosInMilli = 1000;
static const int64_t kMillisInSecond = 1000;
static const int64_t kMicrosInSecond = kMillisInSecond * kMicrosInMilli;
static const int64_t kMicrosInMinute = kSecondsInMinute * kMicrosInSecond;
static const int64_t kMicrosInHour = kMinutesInHour * kMicrosInMinute;
static const int64_t kMicrosInDay = kHoursInDay * kMicrosInHour;
static const int64_t kDaysInMonth = 30;
static const int64_t kDaysInWeek = 7;
static const int64_t kMicrosInMonth = kDaysInMonth * kMicrosInDay;
static const int64_t kNanosInMicro = 1000;
static const __int128 kNanosInMicro128 = static_cast<__int128>(kNanosInMicro);
static const __int128 kNanosInMilli = kMicrosInMilli * kNanosInMicro128;
static const __int128 kNanosInSecond = kMicrosInSecond * kNanosInMicro128;
static const __int128 kNanosInMinute = kMicrosInMinute * kNanosInMicro128;
static const __int128 kNanosInHour = kMicrosInHour * kNanosInMicro128;
static const __int128 kNanosInDay = kNanosInMicro128 * kMicrosInDay;
static const __int128 kNanosInMonth = kNanosInMicro128 * kMicrosInMonth;
static const int64_t kMaxYears = 10000;
static const int64_t kMaxMonths = 12 * kMaxYears;
static const int64_t kMaxDays = 366 * kMaxYears;
static const int64_t kMaxHours = kMaxDays * kHoursInDay;
static const int64_t kMaxMinutes = kMaxHours * kMinutesInHour;
static const int64_t kMaxSeconds = kMaxMinutes * kSecondsInMinute;
static const int64_t kMaxMicros = kMicrosInDay * kMaxDays;
static const __int128 kMaxNanos = kNanosInMicro128 * kMaxMicros;
static const int64_t kMinMonths = -kMaxMonths;
static const int64_t kMinDays = -kMaxDays;
static const int64_t kMinMicros = -kMaxMicros;
static const __int128 kMinNanos = -kMaxNanos;
// Builds interval value from [Y]ears, [M]onths, [D]ays, [H]ours, [M]inutes
// and [S]econds.
static absl::StatusOr<IntervalValue> FromYMDHMS(int64_t years, int64_t months,
int64_t days, int64_t hours,
int64_t minutes,
int64_t seconds);
static absl::StatusOr<IntervalValue> FromMonthsDaysMicros(int64_t months,
int64_t days,
int64_t micros) {
SQL_RETURN_IF_ERROR(ValidateMonths(months));
SQL_RETURN_IF_ERROR(ValidateDays(days));
SQL_RETURN_IF_ERROR(ValidateMicros(micros));
return IntervalValue(months, days, micros);
}
static absl::StatusOr<IntervalValue> FromMonthsDaysNanos(int64_t months,
int64_t days,
__int128 nanos) {
SQL_RETURN_IF_ERROR(ValidateMonths(months));
SQL_RETURN_IF_ERROR(ValidateDays(days));
SQL_RETURN_IF_ERROR(ValidateNanos(nanos));
return IntervalValue(months, days, nanos);
}
static absl::StatusOr<IntervalValue> FromMonths(int64_t months) {
SQL_RETURN_IF_ERROR(ValidateMonths(months));
return IntervalValue(months, 0);
}
static absl::StatusOr<IntervalValue> FromDays(int64_t days) {
SQL_RETURN_IF_ERROR(ValidateDays(days));
return IntervalValue(0, days);
}
static absl::StatusOr<IntervalValue> FromMicros(int64_t micros) {
SQL_RETURN_IF_ERROR(ValidateMicros(micros));
return IntervalValue(0, 0, micros);
}
static absl::StatusOr<IntervalValue> FromNanos(__int128 nanos) {
SQL_RETURN_IF_ERROR(ValidateNanos(nanos));
return IntervalValue(0, 0, nanos);
}
static IntervalValue MaxValue() {
return IntervalValue(kMaxMonths, kMaxDays, kMaxNanos);
}
static IntervalValue MinValue() {
return IntervalValue(kMinMonths, kMinDays, kMinNanos);
}
// Default constructor, constructs a zero value.
constexpr IntervalValue() {}
// Convert interval value to micros. Note, that the resulting number of
// micros can be bigger (up to 3 times) than the maximum number of micros
// allowed in interval.
int64_t GetAsMicros() const {
return get_months() * kMicrosInMonth + get_days() * kMicrosInDay +
get_micros();
}
// Convert interval value to nanos. Note, that the resulting number of
// nanos can be bigger (up to 3 times) than the maximum number of nanos
// allowed in interval.
__int128 GetAsNanos() const {
return get_months() * kNanosInMonth + get_days() * kNanosInDay +
get_nanos();
}
// Get the months part of interval
int64_t get_months() const {
int64_t months = ((months_nanos_ & kMonthsMask) >> kMonthsShift);
return (months_nanos_ & kMonthSignMask) ? -months : months;
}
// Get the days part of interval
int64_t get_days() const { return days_; }
// Get the micros part of interval
int64_t get_micros() const { return micros_; }
// Get the nanos part of interval
__int128 get_nanos() const {
return kNanosInMicro128 * micros_ + get_nano_fractions();
}
// Get only the nano fractions part [0 to 999]
int64_t get_nano_fractions() const {
return (months_nanos_ & kNanosMask) >> kNanosShift;
}
// Comparison operators.
bool operator==(const IntervalValue& v) const {
return get_nano_fractions() == v.get_nano_fractions() &&
GetAsMicros() == v.GetAsMicros();
}
bool operator!=(const IntervalValue& v) const {
return GetAsMicros() != v.GetAsMicros() ||
get_nano_fractions() != v.get_nano_fractions();
}
bool operator<(const IntervalValue& v) const {
int64_t micros = GetAsMicros();
int64_t v_micros = v.GetAsMicros();
return micros < v_micros || (micros == v_micros &&
get_nano_fractions() < v.get_nano_fractions());
}
bool operator>(const IntervalValue& v) const {
int64_t micros = GetAsMicros();
int64_t v_micros = v.GetAsMicros();
return micros > v_micros || (micros == v_micros &&
get_nano_fractions() > v.get_nano_fractions());
}
bool operator<=(const IntervalValue& v) const {
int64_t micros = GetAsMicros();
int64_t v_micros = v.GetAsMicros();
return micros < v_micros ||
(micros == v_micros &&
get_nano_fractions() <= v.get_nano_fractions());
}
bool operator>=(const IntervalValue& v) const {
int64_t micros = GetAsMicros();
int64_t v_micros = v.GetAsMicros();
return micros > v_micros ||
(micros == v_micros &&
get_nano_fractions() >= v.get_nano_fractions());
}
// Unary minus operator
IntervalValue operator-() const {
int64_t months = get_months();
int64_t days = get_days();
__int128 nanos = get_nanos();
return IntervalValue(-months, -days, -nanos);
}
// Binary plus operator
absl::StatusOr<IntervalValue> operator+(const IntervalValue& v) const {
return IntervalValue::FromMonthsDaysNanos(get_months() + v.get_months(),
get_days() + v.get_days(),
get_nanos() + v.get_nanos());
}
// Binary minus operator
absl::StatusOr<IntervalValue> operator-(const IntervalValue& v) const {
return IntervalValue::FromMonthsDaysNanos(get_months() - v.get_months(),
get_days() - v.get_days(),
get_nanos() - v.get_nanos());
}
// Multiply by integer operator
absl::StatusOr<IntervalValue> operator*(int64_t v) const;
// Divide by integer operator
absl::StatusOr<IntervalValue> operator/(int64_t v) const;
// Aggregates multiple INTERVAL values and produces sum and average of all
// values. This class handles a temporary overflow while adding values.
// OUT_OF_RANGE error is generated only if the result is outside of the valid
// INTERVAL range.
class SumAggregator final {
public:
// Adds an INTERVAL value to the sum.
void Add(IntervalValue value);
// Subtracts an INTERVAL value from the sum.
void Subtract(IntervalValue value) {
Add(-value);
}
// Returns sum of all input values. Returns OUT_OF_RANGE error on overflow.
absl::StatusOr<IntervalValue> GetSum() const;
// Returns sum of all input values divided by the specified divisor.
// Returns OUT_OF_RANGE error on overflow of the division result.
// Note, that with the proper invocation of AVG function, overflow is not
// possible.
// Caller must ensure that count is positive non-zero.
absl::StatusOr<IntervalValue> GetAverage(int64_t count) const;
// Merges the state with other SumAggregator instance's state.
void MergeWith(const SumAggregator& other);
// Serialization and deserialization methods for NUMERIC values that are
// intended to be used to store them in protos. The encoding is variable in
// length with max size of 32 bytes. SerializeAndAppendToProtoBytes is
// typically more efficient due to fewer memory allocations.
std::string SerializeAsProtoBytes() const;
void SerializeAndAppendToProtoBytes(std::string* bytes) const;
static absl::StatusOr<SumAggregator> DeserializeFromProtoBytes(
absl::string_view bytes);
std::string DebugString() const;
private:
__int128 months_ = 0;
__int128 days_ = 0;
FixedInt<64, 3> nanos_;
};
// Returns hash code for the value.
size_t HashCode() const;
template <typename H>
friend H AbslHashValue(H h, const IntervalValue& v);
absl::StatusOr<int64_t> Extract(functions::DateTimestampPart part) const;
// Serialization and deserialization methods for interval values.
void SerializeAndAppendToBytes(std::string* bytes) const;
std::string SerializeAsBytes() const {
std::string bytes;
SerializeAndAppendToBytes(&bytes);
return bytes;
}
static absl::StatusOr<IntervalValue> DeserializeFromBytes(
absl::string_view bytes);
// Builds fully expanded string representation of interval.
std::string ToString() const;
// Builds ISO 8601 Duration compliant string representation of interval.
std::string ToISO8601() const;
// Parses interval from string, automatically detects datetime fields.
static absl::StatusOr<IntervalValue> ParseFromString(absl::string_view input);
// Parses interval from string for single datetime field.
static absl::StatusOr<IntervalValue> ParseFromString(
absl::string_view input, functions::DateTimestampPart part);
// Parses interval from string for two datetime fields.
static absl::StatusOr<IntervalValue> ParseFromString(
absl::string_view input, functions::DateTimestampPart from,
functions::DateTimestampPart to);
// Parses interval from ISO 8601 Duration.
static absl::StatusOr<IntervalValue> ParseFromISO8601(
absl::string_view input);
// Parses either canonical interval string representation (ParseFromString) or
// ISO 8601 Duration representation - detects automatically the format.
static absl::StatusOr<IntervalValue> Parse(absl::string_view input);
// Interval constructor from integer for given datetime part field.
static absl::StatusOr<IntervalValue> FromInteger(
int64_t value, functions::DateTimestampPart part);
private:
IntervalValue(int64_t months, int64_t days, int64_t micros = 0) {
micros_ = micros;
days_ = static_cast<int32_t>(days);
if (months >= 0) {
months_nanos_ = static_cast<uint32_t>(months) << kMonthsShift;
} else {
months_nanos_ =
(static_cast<uint32_t>(-months) << kMonthsShift) | kMonthSignMask;
}
}
IntervalValue(int64_t months, int64_t days, __int128 nanos) {
micros_ = nanos / kNanosInMicro;
days_ = static_cast<int32_t>(days);
if (months >= 0) {
months_nanos_ = static_cast<uint32_t>(months) << kMonthsShift;
} else {
months_nanos_ =
(static_cast<uint32_t>(-months) << kMonthsShift) | kMonthSignMask;
}
int64_t nano_fractions = nanos % kNanosInMicro;
if (nano_fractions < 0) {
// Make sure nano_fractions are always positive by adjusting micros.
nano_fractions = kNanosInMicro + nano_fractions;
micros_--;
}
months_nanos_ |= static_cast<uint32_t>(nano_fractions) << kNanosShift;
}
template <typename T>
static absl::Status ValidateField(T value, T min, T max,
absl::string_view field_name) {
if (ABSL_PREDICT_TRUE(value <= max && value >= min)) {
return absl::OkStatus();
}
return MakeEvalError() << "Interval field " << field_name << " '"
<< absl::int128(value) << "' is out of range "
<< absl::int128(min) << " to " << absl::int128(max);
}
static absl::Status ValidateMonths(int64_t months) {
return ValidateField(months, kMinMonths, kMaxMonths, "months");
}
static absl::Status ValidateDays(int64_t days) {
return ValidateField(days, kMinDays, kMaxDays, "days");
}
static absl::Status ValidateMicros(int64_t micros) {
return ValidateField(micros, kMinMicros, kMaxMicros, "microseconds");
}
static absl::Status ValidateNanos(__int128 nanos) {
return ValidateField(nanos, kMinNanos, kMaxNanos, "nanoseconds");
}
int64_t micros_ = 0;
int32_t days_ = 0;
uint32_t months_nanos_ = 0;
};
static_assert(sizeof(IntervalValue) == 16, "IntervalValue must be 16 bytes");
template <typename H>
inline H AbslHashValue(H h, const IntervalValue& v) {
return H::combine(std::move(h), v.GetAsMicros(), v.get_nano_fractions());
}
// Allow INTERVAL values to be logged.
std::ostream& operator<<(std::ostream& out, IntervalValue value);
// Normalizes 24 hour time periods into full days. Adjusts nanos and days to
// have the same sign.
absl::StatusOr<IntervalValue> JustifyHours(const IntervalValue& v);
// Normalizes 30 day time periods into full months. Adjusts days and months to
// have the same sign.
absl::StatusOr<IntervalValue> JustifyDays(const IntervalValue& v);
// Normalizes 24 hour time periods into full days, and after thatn 30 day time
// periods into full months. Adjusts all date parts to have the same sign.
absl::StatusOr<IntervalValue> JustifyInterval(const IntervalValue& v);
} // namespace bigquery_ml_utils
#endif // THIRD_PARTY_PY_BIGQUERY_ML_UTILS_SQL_UTILS_PUBLIC_INTERVAL_VALUE_H_