sql_utils/public/functions/cast_date_time.cc (1,840 lines of code) (raw):

// // Copyright 2023 Google LLC // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // #include "sql_utils/public/functions/cast_date_time.h" #include <string.h> #include <time.h> #include <cctype> #include <cstddef> #include <cstdint> #include <limits> #include <string> #include <utility> #include <vector> #include "absl/base/optimization.h" #include "absl/container/flat_hash_map.h" #include "absl/flags/flag.h" #include "absl/status/status.h" #include "absl/status/statusor.h" #include "absl/strings/ascii.h" #include "absl/strings/match.h" #include "absl/strings/numbers.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "absl/strings/substitute.h" #include "absl/time/civil_time.h" #include "absl/time/clock.h" #include "absl/time/time.h" #include "unicode/uchar.h" #include "unicode/utf8.h" #include "sql_utils/base/general_trie.h" #include "sql_utils/base/logging.h" #include "sql_utils/base/map_util.h" #include "sql_utils/base/mathutil.h" #include "sql_utils/base/ret_check.h" #include "sql_utils/base/status_macros.h" #include "sql_utils/common/errors.h" #include "sql_utils/common/utf_util.h" #include "sql_utils/public/functions/date_time_util.h" #include "sql_utils/public/functions/datetime.pb.h" #include "sql_utils/public/functions/input_format_string_max_width.h" #include "sql_utils/public/functions/parse_date_time_utils.h" #include "sql_utils/public/strings.h" #include "sql_utils/public/type.h" #include "sql_utils/public/type.pb.h" namespace bigquery_ml_utils { namespace functions { namespace { using cast_date_time_internal::DateTimeFormatElement; using cast_date_time_internal::FormatElementCategory; using cast_date_time_internal::FormatElementType; using cast_date_time_internal::GetDateTimeFormatElements; using parse_date_time_utils::ConvertTimeToTimestamp; using parse_date_time_utils::ParseInt; using parse_date_time_utils::ParseSubSeconds; using CategoryToElementsMap = absl::flat_hash_map<FormatElementCategory, std::vector<const DateTimeFormatElement*>>; using TypeToElementMap = absl::flat_hash_map<FormatElementType, const DateTimeFormatElement*>; static const int64_t powers_of_ten[] = { 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000}; constexpr int64_t kNaiveNumSecondsPerMinute = 60; constexpr int64_t kNaiveNumSecondsPerHour = 60 * kNaiveNumSecondsPerMinute; constexpr int64_t kNaiveNumSecondsPerDay = 24 * kNaiveNumSecondsPerHour; constexpr int64_t kNaiveNumMicrosPerDay = kNaiveNumSecondsPerDay * 1000 * 1000; // Matches <target_str> with string <input_str> in a char-by-char manner. The // matching is case-insensitive if <ignore_case> is true and case-sensitive // otherwise. Returns the number of consumed characters upon successful // matching, and returns absl::string_view::npos otherwise. size_t ParseStringByExactMatch(absl::string_view input_str, absl::string_view target_str, bool ignore_case = false) { if (target_str.empty()) { return 0; } if (ignore_case ? absl::StartsWithIgnoreCase(input_str, target_str) : absl::StartsWith(input_str, target_str)) { return target_str.size(); } else { return absl::string_view::npos; } } struct ParseWithCandidatesResult { size_t parsed_length = absl::string_view::npos; int matched_candidate_index; }; // Matches candidate strings inside <candidates> with <input_str> in a // char-by-char manner. The matching is case-insensitve if <ignore_case> is true // and case-sensitive otherwise. Sets the <parsed_length> inside returned // ParseWithCandidatesResult object to be the number of consumed characters // and <matched_candidate_ind> to be the index of the first matched string upon // successful matching, and sets <parsed_length> to be absl::string_view::npos // otherwise. ParseWithCandidatesResult ParseStringWithCandidates( absl::string_view input_str, absl::Span<const absl::string_view> candidates, bool ignore_case) { for (size_t i = 0; i < candidates.size(); ++i) { size_t parsed_length = ParseStringByExactMatch(input_str, candidates[i], ignore_case); if (parsed_length != absl::string_view::npos) { return {.parsed_length = parsed_length, .matched_candidate_index = static_cast<int>(i)}; } } return {.parsed_length = absl::string_view::npos}; } // Matches <input_str> with abbreviated or full month names in a // case-insensitive way. Returns the number of consumed characters and // produces <month> value upon successful matching, and returns // absl::string_view::npos otherwise. size_t ParseMonthNames(absl::string_view timestamp_string, bool abbreviated, int* month) { static constexpr absl::string_view abbreviated_month_names[] = { "JAN", "FEB", "MAR", "APR", "MAY", "JUN", "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"}; static constexpr absl::string_view full_month_names[] = { "JANUARY", "FEBRUARY", "MARCH", "APRIL", "MAY", "JUNE", "JULY", "AUGUST", "SEPTEMBER", "OCTOBER", "NOVEMBER", "DECEMBER"}; absl::Span<const absl::string_view> month_names = abbreviated ? absl::Span<const absl::string_view>(abbreviated_month_names) : absl::Span<const absl::string_view>(full_month_names); ParseWithCandidatesResult parsing_result = ParseStringWithCandidates(timestamp_string, month_names, /*ignore_case=*/true); if (parsing_result.parsed_length != absl::string_view::npos) { *month = parsing_result.matched_candidate_index + 1; } return parsing_result.parsed_length; } // This functions is similar to parse_date_time_utils::ParseInt function but // accepts <input_str> of string_view type and we will verify that the number of // parsed characters is within the range of [<min_width>, <max_width>]. Returns // the number of consumed characters upon successfully parsing an integer, and // returns absl::string_view::npos otherwise. size_t ParseInt(absl::string_view input_str, int min_width, int max_width, int64_t min, int64_t max, int* value_ptr) { const char* res_dp = ParseInt(input_str.data(), input_str.data() + input_str.size(), max_width, min, max, value_ptr); if (res_dp == nullptr) { return absl::string_view::npos; } size_t parsed_width = res_dp - input_str.data(); if (parsed_width < min_width || parsed_width > max_width) { return absl::string_view::npos; } return parsed_width; } // This functions is similar to parse_date_time_utils::ParseSubSeconds function // but accepts <input_str> of string_view type and we will verify that the // number of parsed characters is within the range of // [<min_width>, <max_width>]. Returns the number of consumed characters upon // successful parsing, and returns absl::string_view::npos otherwise. size_t ParseSubSeconds(absl::string_view input_str, int min_width, int max_width, TimestampScale scale, absl::Duration* subseconds) { const char* res_dp = ParseSubSeconds(input_str.data(), input_str.data() + input_str.size(), max_width, scale, subseconds); if (res_dp == nullptr) { return absl::string_view::npos; } size_t parsed_width = res_dp - input_str.data(); if (parsed_width < min_width || parsed_width > max_width) { return absl::string_view::npos; } return parsed_width; } // Consumes the leading Unicode whitespaces in the string <input_str>. Returns // the number of consumed characters. size_t TrimLeadingUnicodeWhiteSpaces(absl::string_view input_str) { UChar32 character; size_t data_length = input_str.size(); // Offset for consecutive Unicode whitespaces since <dp>. size_t all_uwhitespace_offset = 0; size_t offset = 0; while (offset < data_length) { U8_NEXT(input_str.data(), offset, data_length, character); if (u_isUWhiteSpace(character)) { all_uwhitespace_offset = offset; } else { break; } } return all_uwhitespace_offset; } std::string FormatElementTypeString(const FormatElementType& type) { switch (type) { case FormatElementType::kFormatElementTypeUnspecified: return "FORMAT_ELEMENT_TYPE_UNSPECIFIED"; case FormatElementType::kSimpleLiteral: return "SIMPLE_LITERAL"; case FormatElementType::kDoubleQuotedLiteral: return "DOUBLE_QUOTED_LITERAL"; case FormatElementType::kWhitespace: return "WHITESPACE"; case FormatElementType::kYYYY: return "YYYY"; case FormatElementType::kYYY: return "YYY"; case FormatElementType::kYY: return "YY"; case FormatElementType::kY: return "Y"; case FormatElementType::kRRRR: return "RRRR"; case FormatElementType::kRR: return "RR"; case FormatElementType::kYCommaYYY: return "Y,YYY"; case FormatElementType::kIYYY: return "IYYY"; case FormatElementType::kIYY: return "IYY"; case FormatElementType::kIY: return "IY"; case FormatElementType::kI: return "I"; case FormatElementType::kSYYYY: return "SYYYY"; case FormatElementType::kYEAR: return "YEAR"; case FormatElementType::kSYEAR: return "SYEAR"; case FormatElementType::kMM: return "MM"; case FormatElementType::kMON: return "MON"; case FormatElementType::kMONTH: return "MONTH"; case FormatElementType::kRM: return "RM"; case FormatElementType::kDDD: return "DDD"; case FormatElementType::kDD: return "DD"; case FormatElementType::kD: return "D"; case FormatElementType::kDAY: return "DAY"; case FormatElementType::kDY: return "DY"; case FormatElementType::kJ: return "J"; case FormatElementType::kHH: return "HH"; case FormatElementType::kHH12: return "HH12"; case FormatElementType::kHH24: return "HH24"; case FormatElementType::kMI: return "MI"; case FormatElementType::kSS: return "SS"; case FormatElementType::kSSSSS: return "SSSSS"; case FormatElementType::kFFN: return "FFN"; case FormatElementType::kAM: return "AM"; case FormatElementType::kPM: return "PM"; case FormatElementType::kAMWithDots: return "A.M."; case FormatElementType::kPMWithDots: return "P.M."; case FormatElementType::kTZH: return "TZH"; case FormatElementType::kTZM: return "TZM"; case FormatElementType::kCC: return "CC"; case FormatElementType::kSCC: return "SCC"; case FormatElementType::kQ: return "Q"; case FormatElementType::kIW: return "IW"; case FormatElementType::kWW: return "WW"; case FormatElementType::kW: return "W"; case FormatElementType::kAD: return "AD"; case FormatElementType::kBC: return "BC"; case FormatElementType::kADWithDots: return "A.D."; case FormatElementType::kBCWithDots: return "B.C."; case FormatElementType::kSP: return "SP"; case FormatElementType::kTH: return "TH"; case FormatElementType::kSPTH: return "SPTH"; case FormatElementType::kTHSP: return "THSP"; case FormatElementType::kFM: return "FM"; } } FormatElementCategory GetFormatElementCategoryFromType( const FormatElementType& type) { switch (type) { case FormatElementType::kFormatElementTypeUnspecified: return FormatElementCategory::kFormatElementCategoryUnspecified; case FormatElementType::kSimpleLiteral: case FormatElementType::kDoubleQuotedLiteral: case FormatElementType::kWhitespace: return FormatElementCategory::kLiteral; case FormatElementType::kYYYY: case FormatElementType::kYYY: case FormatElementType::kYY: case FormatElementType::kY: case FormatElementType::kRRRR: case FormatElementType::kRR: case FormatElementType::kYCommaYYY: case FormatElementType::kIYYY: case FormatElementType::kIYY: case FormatElementType::kIY: case FormatElementType::kI: case FormatElementType::kSYYYY: case FormatElementType::kYEAR: case FormatElementType::kSYEAR: return FormatElementCategory::kYear; case FormatElementType::kMM: case FormatElementType::kMON: case FormatElementType::kMONTH: case FormatElementType::kRM: return FormatElementCategory::kMonth; case FormatElementType::kDDD: case FormatElementType::kDD: case FormatElementType::kD: case FormatElementType::kDAY: case FormatElementType::kDY: case FormatElementType::kJ: return FormatElementCategory::kDay; case FormatElementType::kHH: case FormatElementType::kHH12: case FormatElementType::kHH24: return FormatElementCategory::kHour; case FormatElementType::kMI: return FormatElementCategory::kMinute; case FormatElementType::kSS: case FormatElementType::kSSSSS: case FormatElementType::kFFN: return FormatElementCategory::kSecond; case FormatElementType::kAM: case FormatElementType::kPM: case FormatElementType::kAMWithDots: case FormatElementType::kPMWithDots: return FormatElementCategory::kMeridianIndicator; case FormatElementType::kTZH: case FormatElementType::kTZM: return FormatElementCategory::kTimeZone; case FormatElementType::kCC: case FormatElementType::kSCC: return FormatElementCategory::kCentury; case FormatElementType::kQ: return FormatElementCategory::kQuarter; case FormatElementType::kIW: case FormatElementType::kWW: case FormatElementType::kW: return FormatElementCategory::kWeek; case FormatElementType::kAD: case FormatElementType::kBC: case FormatElementType::kADWithDots: case FormatElementType::kBCWithDots: return FormatElementCategory::kEraIndicator; case FormatElementType::kSP: case FormatElementType::kTH: case FormatElementType::kSPTH: case FormatElementType::kTHSP: case FormatElementType::kFM: return FormatElementCategory::kMisc; } } std::string FormatElementCategoryString(const FormatElementCategory& category) { switch (category) { case FormatElementCategory::kFormatElementCategoryUnspecified: return "FORMAT_ELEMENT_CATEGORY_UNSPECIFIED"; case FormatElementCategory::kLiteral: return "LITERAL"; case FormatElementCategory::kYear: return "YEAR"; case FormatElementCategory::kMonth: return "MONTH"; case FormatElementCategory::kDay: return "DAY"; case FormatElementCategory::kHour: return "HOUR"; case FormatElementCategory::kMinute: return "MINUTE"; case FormatElementCategory::kSecond: return "SECOND"; case FormatElementCategory::kMeridianIndicator: return "MERIDIAN_INDICATOR"; case FormatElementCategory::kTimeZone: return "TIME_ZONE"; case FormatElementCategory::kCentury: return "CENTURY"; case FormatElementCategory::kQuarter: return "QUARTER"; case FormatElementCategory::kWeek: return "WEEK"; case FormatElementCategory::kEraIndicator: return "ERA_INDICATOR"; case FormatElementCategory::kMisc: return "MISC"; } } // Checks whether the format element is supported for parsing. bool IsSupportedForParsing(const DateTimeFormatElement& format_element) { switch (format_element.type) { case FormatElementType::kSimpleLiteral: case FormatElementType::kDoubleQuotedLiteral: case FormatElementType::kWhitespace: case FormatElementType::kYYYY: case FormatElementType::kYYY: case FormatElementType::kYY: case FormatElementType::kY: case FormatElementType::kRRRR: case FormatElementType::kRR: case FormatElementType::kYCommaYYY: case FormatElementType::kMM: case FormatElementType::kMON: case FormatElementType::kMONTH: case FormatElementType::kDD: case FormatElementType::kHH: case FormatElementType::kHH12: case FormatElementType::kHH24: case FormatElementType::kMI: case FormatElementType::kSS: case FormatElementType::kSSSSS: case FormatElementType::kFFN: case FormatElementType::kAMWithDots: case FormatElementType::kPMWithDots: case FormatElementType::kTZH: case FormatElementType::kTZM: return true; default: return false; } } // Helper function that determines which format elements must have a digit // immediately following the portion of the input text that they match. // Format elements that satisfy this property match exactly the number of digits // indicated in the format element; those that don't allow mapping against a // smaller number of digits. // For example, "YYY", standalone, matches 1-3 digits, but in the context of // "YYYMM", "YYY" must match exactly 3 digits. absl::StatusOr<std::vector<bool>> ComputeElementPrecedesDigits( std::vector<DateTimeFormatElement> format_elements) { std::vector<bool> element_precedes_digits; element_precedes_digits.resize(format_elements.size(), false); for (int i = static_cast<int>(element_precedes_digits.size()) - 2; i >= 0; --i) { const DateTimeFormatElement& next_format_element = format_elements[i + 1]; switch (next_format_element.type) { case FormatElementType::kDoubleQuotedLiteral: if (next_format_element.literal_value.empty()) { // The next element is of "kDoubleQuotedLiteral" type with empty // <literal_value> and consumes no characters from the input, so the // <element_precedes_digit> of the current element is the same as // that of the next element. element_precedes_digits[i] = element_precedes_digits[i + 1]; } else if (absl::ascii_isdigit( next_format_element.literal_value.at(0))) { element_precedes_digits[i] = true; } break; case FormatElementType::kSimpleLiteral: case FormatElementType::kWhitespace: case FormatElementType::kMON: case FormatElementType::kMONTH: case FormatElementType::kAMWithDots: case FormatElementType::kPMWithDots: case FormatElementType::kTZH: element_precedes_digits[i] = false; break; case FormatElementType::kYYYY: case FormatElementType::kYYY: case FormatElementType::kYY: case FormatElementType::kY: case FormatElementType::kRRRR: case FormatElementType::kRR: case FormatElementType::kYCommaYYY: case FormatElementType::kMM: case FormatElementType::kDD: case FormatElementType::kHH: case FormatElementType::kHH12: case FormatElementType::kHH24: case FormatElementType::kMI: case FormatElementType::kSS: case FormatElementType::kSSSSS: case FormatElementType::kFFN: case FormatElementType::kTZM: element_precedes_digits[i] = true; break; default: SQL_RET_CHECK_FAIL() << "Unexpected FormatElementType: " << FormatElementTypeString(next_format_element.type); } } return element_precedes_digits; } // This struct specifies the range of number of digits that an element can // parse. struct DigitCountRange { int min = 0; int max = 0; }; // Returns a vector of DigitCountRange objects <digit_count_ranges> where // <digit_count_ranges[i]> indicates the range of number of digits to parse for // <format_elements[i]>. For elements that do not parse digits (e.g. "-") or // parse more than one digit blocks (e.g. "Y,YYY"), we set both <min> and // <max> to be 0, and do not use them in parsing process. Returns an error if // any format element inside <format_elements> is not supported for parsing. absl::StatusOr<std::vector<DigitCountRange>> ComputeDigitCountRanges( const std::vector<DateTimeFormatElement>& format_elements) { std::vector<DigitCountRange> digit_count_ranges; digit_count_ranges.resize(format_elements.size()); SQL_ASSIGN_OR_RETURN(const std::vector<bool> element_precedes_digits, ComputeElementPrecedesDigits(format_elements)); SQL_RET_CHECK(digit_count_ranges.size() == element_precedes_digits.size()); for (size_t i = 0; i < digit_count_ranges.size(); ++i) { bool element_precedes_digit = element_precedes_digits[i]; int element_length = format_elements[i].len_in_format_str; switch (format_elements[i].type) { case FormatElementType::kSimpleLiteral: case FormatElementType::kWhitespace: case FormatElementType::kDoubleQuotedLiteral: case FormatElementType::kYCommaYYY: case FormatElementType::kMON: case FormatElementType::kMONTH: case FormatElementType::kAMWithDots: case FormatElementType::kPMWithDots: digit_count_ranges[i] = {.min = 0, .max = 0}; break; // Format elements “RRRR” and “YYYY” parse exactly 4 digits if // <element_precedes_digit> is true and can parse up to 5 digits // otherwise. case FormatElementType::kYYYY: case FormatElementType::kRRRR: if (element_precedes_digit) { digit_count_ranges[i] = {.min = 4, .max = 4}; } else { // "RRRR"/"YYYY" can parse up to 5 digits when // <element_precedes_digit> is false. digit_count_ranges[i] = {.min = 1, .max = 5}; } break; // The elements below parse exactly <element_length> digits if // <element_precedes_digit> is true; otherwise, the number of digits // they can parse varies between 1 and <element_length>. case FormatElementType::kYYY: case FormatElementType::kYY: case FormatElementType::kY: case FormatElementType::kRR: case FormatElementType::kMM: case FormatElementType::kDD: case FormatElementType::kHH: case FormatElementType::kMI: case FormatElementType::kSS: case FormatElementType::kSSSSS: if (element_precedes_digit) { digit_count_ranges[i] = {.min = element_length, .max = element_length}; } else { digit_count_ranges[i] = {.min = 1, .max = element_length}; } break; case FormatElementType::kHH12: case FormatElementType::kHH24: case FormatElementType::kTZH: case FormatElementType::kTZM: if (element_precedes_digit) { digit_count_ranges[i] = {.min = 2, .max = 2}; } else { digit_count_ranges[i] = {.min = 1, .max = 2}; } break; case FormatElementType::kFFN: if (element_precedes_digit) { digit_count_ranges[i] = { .min = format_elements[i].subsecond_digit_count, .max = format_elements[i].subsecond_digit_count}; } else { digit_count_ranges[i] = { .min = 1, .max = format_elements[i].subsecond_digit_count}; } break; default: SQL_RET_CHECK_FAIL() << "Unexpected FormatElementType: " << FormatElementTypeString(format_elements[i].type); } } return digit_count_ranges; } // Parses <timestamp_string> with a format element of "kRR" type and // produces <year> value as output. Returns the number of consumed // characters upon successful parsing, and returns absl::string_view::npos // otherwise. size_t ParseWithFormatElementOfTypeRR(absl::string_view timestamp_string, int current_year, DigitCountRange digit_count_range, int* year) { int current_year_last_two_digits = current_year % 100; int current_year_before_last_two_digits = current_year / 100; int year_before_last_two_digits = current_year_before_last_two_digits; int year_last_two_digits; size_t parsed_length = ParseInt(timestamp_string, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, /*min=*/0, /*max=*/99, &year_last_two_digits); if (parsed_length != absl::string_view::npos) { if (year_last_two_digits < 50 && current_year_last_two_digits >= 50) { year_before_last_two_digits += 1; } else if (year_last_two_digits >= 50 && current_year_last_two_digits < 50) { year_before_last_two_digits -= 1; } *year = year_before_last_two_digits * 100 + year_last_two_digits; } return parsed_length; } // Parses <timestamp_string> with a format element of "kYCommaYYY" type and // produces <year> value as output. Returns the number of consumed // characters upon successful parsing, and returns absl::string_view::npos // otherwise. size_t ParseWithFormatElementOfTypeYCommaYYY(absl::string_view timestamp_string, int* year) { // The number of charcters in <timestamp_string> that has been parsed. size_t parsed_length = 0; size_t parsed_length_temp = absl::string_view::npos; int year_first_part; int year_last_three_digits; absl::string_view timestamp_str_to_parse = timestamp_string; // Parses "Y" part of "Y,YYY". parsed_length_temp = ParseInt(timestamp_str_to_parse, /*min_width=*/1, /*max_width=*/2, /*min=*/0, /*max=*/10, &year_first_part); if (parsed_length_temp == absl::string_view::npos) { return absl::string_view::npos; } parsed_length = parsed_length_temp; timestamp_str_to_parse = timestamp_str_to_parse.substr(parsed_length_temp); // Parses "," part of "Y,YYY". parsed_length_temp = ParseStringByExactMatch(timestamp_str_to_parse, ","); if (parsed_length_temp == absl::string_view::npos) { return absl::string_view::npos; } parsed_length += parsed_length_temp; timestamp_str_to_parse = timestamp_str_to_parse.substr(parsed_length_temp); // Parses "YYY" part of "Y,YYY". parsed_length_temp = ParseInt(timestamp_str_to_parse, /*min_width=*/3, /*max_width=*/3, /*min=*/0, /*max=*/999, &year_last_three_digits); if (parsed_length_temp == absl::string_view::npos) { return absl::string_view::npos; } *year = year_first_part * 1000 + year_last_three_digits; parsed_length += parsed_length_temp; return parsed_length; } // Parses <timestamp_string> with a format element of "kTZH" type and produces // <positive_timezone_offset> and <timezone_offset_hour>. Returns the number of // consumed characters upon successful parsing, and returns // absl::string_view::npos otherwise. size_t ParseWithFormatElementOfTypeTZH(absl::string_view timestamp_string, DigitCountRange digit_count_range, bool* positive_timezone_offset, int* timezone_offset_hour) { size_t parsed_length = 0; absl::string_view timestamp_str_to_parse = timestamp_string; ParseWithCandidatesResult parse_result = ParseStringWithCandidates( timestamp_str_to_parse, {"-", "+", " "}, /*ignore_case=*/false); if (parse_result.parsed_length == absl::string_view::npos) { return absl::string_view::npos; } *positive_timezone_offset = (parse_result.matched_candidate_index != 0); parsed_length += parse_result.parsed_length; timestamp_str_to_parse = timestamp_str_to_parse.substr(parse_result.parsed_length); size_t parsed_length_temp = ParseInt(timestamp_str_to_parse, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, /*min=*/0, /*max=*/14, timezone_offset_hour); if (parsed_length_temp == absl::string_view::npos) { return absl::string_view::npos; } parsed_length += parsed_length_temp; return parsed_length; } // This function conducts the parsing for <timestamp_string> with // <format_elements>. absl::Status ParseTimeWithFormatElements( const std::vector<DateTimeFormatElement>& format_elements, absl::string_view timestamp_string, const absl::TimeZone default_timezone, const absl::Time current_timestamp, TimestampScale scale, absl::Time* timestamp) { // The number of format elements from <format_elements> that have been // successfully processed so far. size_t processed_format_element_count = 0; // The number of characters of <timestamp_string> that have been successfully // parsed so far. size_t timestamp_str_parsed_length = 0; absl::TimeZone::CivilInfo now_info = default_timezone.At(current_timestamp); absl::CivilSecond cs_now = now_info.cs; int year = static_cast<int>(cs_now.year()); int month = cs_now.month(); int mday = 1; int hour = 0; int min = 0; int sec = 0; int hour_in_12_hour_clock = 0; bool afternoon = false; absl::Duration subseconds = absl::ZeroDuration(); // Indicates whether TZH or TZM appears in the format string. bool timezone_specified_in_format = false; bool positive_timezone_offset = true; int timezone_offset_hour = 0; int timezone_offset_min = 0; bool error_in_parsing = false; SQL_ASSIGN_OR_RETURN(const std::vector<DigitCountRange> digit_count_ranges, ComputeDigitCountRanges(format_elements)); // Skips leading whitespaces. timestamp_str_parsed_length += TrimLeadingUnicodeWhiteSpaces(timestamp_string); while (!error_in_parsing && timestamp_str_parsed_length < timestamp_string.size() && processed_format_element_count < format_elements.size()) { size_t parsed_length = absl::string_view::npos; absl::string_view timestamp_str_to_parse = timestamp_string.substr(timestamp_str_parsed_length); const DateTimeFormatElement& format_element = format_elements[processed_format_element_count]; DigitCountRange digit_count_range = digit_count_ranges[processed_format_element_count]; switch (format_element.type) { case FormatElementType::kSimpleLiteral: case FormatElementType::kDoubleQuotedLiteral: parsed_length = ParseStringByExactMatch(timestamp_str_to_parse, format_element.literal_value); break; case FormatElementType::kWhitespace: // Format element of "kWhitespace" type matches 1 or more Unicode // whitespaces. parsed_length = TrimLeadingUnicodeWhiteSpaces(timestamp_str_to_parse); if (parsed_length == 0) { // Matches 0 Unicode whitespace, so we set <error_in_parsing> to true // to indicate an error. error_in_parsing = true; } break; // Parses for entire year value. For example, for input string "1234", the // output <year> is 1234. case FormatElementType::kYYYY: case FormatElementType::kRRRR: parsed_length = ParseInt(timestamp_str_to_parse, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, /*min=*/0, /*max=*/10000, &year); break; // Parses for the last 3/2/1 digits of the year value depending on the // length of the element. For example, assuming <current_year> is 1970: // - for input "123", the output <year> with "YYY" is 1123, // - for input "12", the output <year> with "YY" is 1912, // - for input "1", the output <year> with "Y" is 1971. case FormatElementType::kYYY: case FormatElementType::kYY: case FormatElementType::kY: { int element_length = format_element.len_in_format_str; SQL_RET_CHECK(element_length >= 0 && element_length < ABSL_ARRAYSIZE(powers_of_ten)); int element_length_power_of_ten = static_cast<int>(powers_of_ten[element_length]); int parsed_year_part; parsed_length = ParseInt( timestamp_str_to_parse, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, /*min=*/0, /*max=*/element_length_power_of_ten - 1, &parsed_year_part); if (parsed_length != absl::string_view::npos) { year = year - year % element_length_power_of_ten + parsed_year_part; } break; } // Parses for the last 2 digit of the year value. The first 2 digits // of the output can be different from that of current year (more // details at (broken link)). // For example, if the current year is 2002: // - for input "12", the output <year> is 2012, // - for input "51", the output <year> is 1951. // If the current year is 2299, // - for input "12", the output <year> is 2312, // - for input "51", thr output <year> is 2251. case FormatElementType::kRR: { parsed_length = ParseWithFormatElementOfTypeRR( timestamp_str_to_parse, /*current_year=*/year, digit_count_range, &year); break; } // Parses for entire year value with a string in pattern "X,XXX" or // "XX,XXX". For example, // - for input "1,234", the output <year> is 1234, // - for input "10,000", the output <year> is 10000. case FormatElementType::kYCommaYYY: parsed_length = ParseWithFormatElementOfTypeYCommaYYY( timestamp_str_to_parse, &year); break; // Parses for month value 1-12. For example, for input "11", the output // <month> is 11. case FormatElementType::kMM: parsed_length = ParseInt(timestamp_str_to_parse, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, /*min=*/1, /*max=*/12, &month); break; // Parses abbreviated month names with "MON" element and full month // names with "MONTH" element. The parsing is case-insensitive. // For example, // - for input "Jan"/"jAN", the output <month> with "MON" is 1, // - for input "JUNE"/"juNe", the output <month> with "MONTH" is 6. case FormatElementType::kMON: case FormatElementType::kMONTH: parsed_length = ParseMonthNames( timestamp_str_to_parse, /*abbreviated=*/format_element.type == FormatElementType::kMON, &month); break; // Parses for day of month value. For example, for input "20", the // output <mday> is 20. case FormatElementType::kDD: parsed_length = ParseInt(timestamp_str_to_parse, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, /*min=*/1, /*max=*/31, &mday); break; // kHH/kHH12 and kAMWithDots/kPMWithDots are used to parse hour value // of a 12-hour clock. The matching for meridian indicator part is // case-insensitive. For example, // - if input for kHH/kHH12 is "11" and input for // kAMWithDots/kPMWithDots is "A.M."/"A.m.", the output <hour> is 11. // - if input for kHH/kHH12 is "12" and input for // kAMWithDots/kPMWithDots is "a.M."/"a.m.", the output <hour> is 0. // string "11", the hour value in the result 12-hour clock is 11. case FormatElementType::kHH: case FormatElementType::kHH12: parsed_length = ParseInt(timestamp_str_to_parse, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, /*min=*/1, /*max=*/12, &hour_in_12_hour_clock); break; case FormatElementType::kAMWithDots: case FormatElementType::kPMWithDots: { ParseWithCandidatesResult parse_result = ParseStringWithCandidates( timestamp_str_to_parse, {"A.M.", "P.M."}, /*ignore_case=*/true); parsed_length = parse_result.parsed_length; if (parsed_length != absl::string_view::npos) { afternoon = (parse_result.matched_candidate_index == 1); } break; } // Parses for hour value in a 24-hour clock. For example, for input "12", // the output <hour> is 12. case FormatElementType::kHH24: parsed_length = ParseInt(timestamp_str_to_parse, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, /*min=*/0, /*max=*/23, &hour); break; // Parses for minute value 0-59. For example, for input "20", the output // <min> is 20. case FormatElementType::kMI: parsed_length = ParseInt(timestamp_str_to_parse, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, /*min=*/0, /*max=*/59, &min); break; // Parses for second value 0-59. For example, for input "30", the output // <sec> is 30. case FormatElementType::kSS: parsed_length = ParseInt(timestamp_str_to_parse, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, /*min=*/0, /*max=*/59, &sec); break; // Parses for number of seconds past midnight 0 ~ 2400*60*60-1. For // example, for input "3662", the output <hour>, <min> and <sec> are 1, 1, // 2 respectively (since 3660 seconds past midnight corresponds to time // "01:01:02"). case FormatElementType::kSSSSS: { int sec_of_day; parsed_length = ParseInt(timestamp_str_to_parse, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, /*min=*/0, /*max=*/kNaiveNumSecondsPerDay - 1, &sec_of_day); if (parsed_length != absl::string_view::npos) { hour = sec_of_day / kNaiveNumSecondsPerHour; min = (sec_of_day % kNaiveNumSecondsPerHour) / kNaiveNumSecondsPerMinute; sec = sec_of_day % kNaiveNumSecondsPerMinute; } break; } // Parses for subsecond value. Additional digits beyond the input <scale> // are truncated (6 for micros, 9 for nanos). For example, // - for input "123", the output subsecond with "FF3" is 123. // - for input "1234567", the output subsecond with "FF7" is 123456 // under micros scale, or 1234567 under nano scale. case FormatElementType::kFFN: { SQL_RET_CHECK(format_element.subsecond_digit_count > 0 && format_element.subsecond_digit_count <= 9); parsed_length = ParseSubSeconds(timestamp_str_to_parse, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, scale, &subseconds); break; } // Parses for the sign and hour value of the time zone offset. For // example, // - for input "+10"/" 10", the sign and hour value of output time zone // are "+10". // - for input "-09", the sign and hour value of output time zone are // "-09". case FormatElementType::kTZH: { timezone_specified_in_format = true; parsed_length = ParseWithFormatElementOfTypeTZH( timestamp_str_to_parse, digit_count_range, &positive_timezone_offset, &timezone_offset_hour); break; } // Parses for the minute value of the time zone offset. For example, for // input "13", the minute value of output time zone is 13. case FormatElementType::kTZM: timezone_specified_in_format = true; parsed_length = ParseInt(timestamp_str_to_parse, /*min_width=*/digit_count_range.min, /*max_width=*/digit_count_range.max, /*min=*/0, /*max=*/59, &timezone_offset_min); break; default: break; } if (parsed_length == absl::string_view::npos) { // If <parsed_length> is absl::string_view::npos, we set // <error_in_parsing> to be true to indicate an error. error_in_parsing = true; } if (!error_in_parsing) { // We successfully processed a format element, so update the number of // elements and characters processed. processed_format_element_count++; timestamp_str_parsed_length += parsed_length; } } if (error_in_parsing) { return MakeEvalError() << "Failed to parse input timestamp string at " << timestamp_str_parsed_length << " with format element " << format_elements[processed_format_element_count].ToString(); } // Skips any remaining whitespace. timestamp_str_parsed_length += TrimLeadingUnicodeWhiteSpaces( timestamp_string.substr(timestamp_str_parsed_length)); // Skips trailing empty format elements {kDoubleQuotedLiteral, ""} which match // "" in input string. while ( processed_format_element_count < format_elements.size() && format_elements[processed_format_element_count].type == FormatElementType::kDoubleQuotedLiteral && format_elements[processed_format_element_count].literal_value.empty()) { processed_format_element_count++; } if (timestamp_str_parsed_length < timestamp_string.size()) { return MakeEvalError() << "Illegal non-space trailing data '" << timestamp_string.substr( timestamp_str_parsed_length) << "' in timestamp string"; } if (processed_format_element_count < format_elements.size()) { return MakeEvalError() << "Entire timestamp string has been parsed before dealing with" << " format element " << format_elements[processed_format_element_count].ToString(); } // Calculates the <hour> in 24-hour clock if hour value of a 12-hour clock is // parsed. if (hour_in_12_hour_clock != 0) { hour = hour_in_12_hour_clock % 12 + (afternoon ? 12 : 0); } const absl::CivilSecond cs(year, month, mday, hour, min, sec); // absl::CivilSecond will 'normalize' its arguments, so we simply compare // the input against the result to check whether a YMD is valid. if (cs.year() != year || cs.month() != month || cs.day() != mday) { return MakeEvalError() << "Invalid result from year, month, day values after parsing"; } absl::TimeZone timezone; if (timezone_specified_in_format) { SQL_RETURN_IF_ERROR(MakeTimeZone( absl::StrFormat("%c%02d%02d", positive_timezone_offset ? '+' : '-', timezone_offset_hour, timezone_offset_min), &timezone)); } else { timezone = default_timezone; } *timestamp = timezone.At(cs).pre + subseconds; if (!IsValidTime(*timestamp)) { return MakeEvalError() << "The parsing result is out of valid time range"; } return absl::OkStatus(); } // Returns an error if more than one format element in the target category exist // in the format string, i.e. the value of <category> in // <category_to_elements_map> contains more than one item. For example, you // cannot have elements "YY" and "RRRR" at the same time since they are both // in "kYear" category. absl::Status CheckForDuplicateElementsInCategory( FormatElementCategory category, const CategoryToElementsMap& category_to_elements_map) { if (category_to_elements_map.contains(category) && category_to_elements_map.at(category).size() > 1) { return MakeEvalError() << "More than one format element in category " << FormatElementCategoryString(category) << " exist: " << category_to_elements_map.at(category)[0]->ToString() << " and " << category_to_elements_map.at(category)[1]->ToString(); } return absl::OkStatus(); } // Returns an error if the element in the target category exists in the format // string, i.e. <category> exists in <category_to_elements_map> as a key. For // example, you cannot have any format element in "kHour" category if the output // type is DATE. absl::Status CheckCategoryNotExist( FormatElementCategory category, const CategoryToElementsMap& category_to_elements_map, absl::string_view output_type_name) { if (category_to_elements_map.contains(category)) { std::string error_reason = absl::Substitute( "Format element in category $0 ($1) is not allowed for output type $2", FormatElementCategoryString(category), category_to_elements_map.at(category)[0]->ToString(), output_type_name); return MakeEvalError() << error_reason; } return absl::OkStatus(); } // Returns an error if <type> is present in <type_to_element_map> and <category> // is present in <category_to_elements_map>. For example, if you have a format // element of "kHH24" type, you cannot have any format element in // "kMeridianIndicator" category. absl::Status CheckForMutuallyExclusiveElements( FormatElementType type, FormatElementCategory category, const TypeToElementMap& type_to_element_map, const CategoryToElementsMap& category_to_elements_map) { if (type_to_element_map.contains(type) && category_to_elements_map.contains(category)) { std::string error_reason = absl::Substitute( "Format element in category $0 ($1) and format element $2 cannot exist " "simultaneously", FormatElementCategoryString(category), category_to_elements_map.at(category)[0]->ToString(), type_to_element_map.at(type)->ToString()); return MakeEvalError() << error_reason; } return absl::OkStatus(); } // Returns an error if both <type1> and <type2> are present in // <type_to_element_map>. For example, if you have a format element of "kSSSSS" // type which indicates seconds in a day, then you cannot have another element // of "kSS" type to indicate seconds in an hour. absl::Status CheckForMutuallyExclusiveElements( FormatElementType type1, FormatElementType type2, const TypeToElementMap& type_to_element_map) { if (type_to_element_map.contains(type1) && type_to_element_map.contains(type2)) { return MakeEvalError() << "Format elements " << type_to_element_map.at(type1)->ToString() << " and " << type_to_element_map.at(type2)->ToString() << " cannot exist simultaneously"; } return absl::OkStatus(); } // Confirms that a format element in <category> is present if a format element // of any type from <types> exists and vice versa. For example, you must have a // format element in "kMeridianIndicator" category if a format element of "kHH" // or "kHH12" type is used. Also, if you have a format element in // "kMeridianIndicator" category, you must have a format element of "kHH" or // "kHH12" type. absl::Status CheckForCoexistance( std::vector<FormatElementType> types, FormatElementCategory category, const TypeToElementMap& type_to_element_map, const CategoryToElementsMap& category_to_elements_map) { FormatElementType present_type; bool type_exists = false; for (const FormatElementType& type : types) { if (type_to_element_map.contains(type)) { type_exists = true; present_type = type; break; } } if (type_exists && !category_to_elements_map.contains(category)) { return MakeEvalError() << "Format element in category " << FormatElementCategoryString(category) << " is required when format element " << type_to_element_map.at(present_type)->ToString() << " exists"; } else if (category_to_elements_map.contains(category) && !type_exists) { std::vector<std::string> format_element_type_strs; format_element_type_strs.reserve(types.size()); for (const FormatElementType& type : types) { format_element_type_strs.push_back(FormatElementTypeString(type)); } std::string joined_format_element_type_strs = absl::StrJoin(format_element_type_strs, "/"); std::string error_reason = absl::Substitute( "Format element of type $0 is required when format element in " "category $1 ($2) exists", joined_format_element_type_strs, FormatElementCategoryString(category), category_to_elements_map.at(category)[0]->ToString()); return MakeEvalError() << error_reason; } return absl::OkStatus(); } // Conducts basic and common verifications with the format string. absl::Status ConductBasicFormatStringChecks(absl::string_view format_string) { if (!IsWellFormedUTF8(format_string)) { return MakeEvalError() << "Format string is not a valid UTF-8 string"; } if (format_string.size() > absl::GetFlag(FLAGS_sql_cast_format_string_max_width)) { return MakeEvalError() << "Format string too long; limit " << absl::GetFlag( FLAGS_sql_cast_format_string_max_width); } return absl::OkStatus(); } // Validates the elements in <format_elements> with specific rules, and also // makes sure they are not of any category in <invalid_categories>. absl::Status ValidateDateTimeFormatElements( const std::vector<DateTimeFormatElement>& format_elements, const std::vector<FormatElementCategory>& invalid_categories, absl::string_view output_type_name) { CategoryToElementsMap category_to_elements_map; TypeToElementMap type_to_element_map; for (const DateTimeFormatElement& format_element : format_elements) { if (!IsSupportedForParsing(format_element)) { return MakeEvalError() << "Format element " << format_element.ToString() << " is not supported for parsing"; } // We store at most 2 elements inside this map, since this is enough to // print in error message when duplicate checks fail for a category. if (category_to_elements_map[format_element.category].size() < 2) { category_to_elements_map[format_element.category].push_back( &format_element); } if (type_to_element_map.contains(format_element.type)) { // We do not allow that more than one non-literal format element of the // same type exist at the same time. For example, the format string // "MiYYmI" is invalid since two format elements of "kMI" type // (appearing as "Mi" and "MI") exist in it. if (format_element.category != FormatElementCategory::kLiteral) { return MakeEvalError() << absl::Substitute( "Format element $0 appears more than once in the " "format string", format_element.ToString()); } } else { type_to_element_map[format_element.type] = &format_element; } } // Checks invalid format element categories for the output type. for (const FormatElementCategory& invalid_category : invalid_categories) { SQL_RETURN_IF_ERROR(CheckCategoryNotExist( invalid_category, category_to_elements_map, output_type_name)); } // Checks categories which do not allow duplications. const std::vector<FormatElementCategory> categories_to_check_duplicate = { FormatElementCategory::kMeridianIndicator, FormatElementCategory::kYear, FormatElementCategory::kMonth, FormatElementCategory::kDay, FormatElementCategory::kHour, FormatElementCategory::kMinute}; for (FormatElementCategory category : categories_to_check_duplicate) { SQL_RETURN_IF_ERROR(CheckForDuplicateElementsInCategory( category, category_to_elements_map)); } // Checks mutually exclusive format elements/types. // The Check between "kHH24" type and "kHH"/"kHH12" types is included in // duplicate check for "kHour" category. SQL_RETURN_IF_ERROR(CheckForMutuallyExclusiveElements( FormatElementType::kHH24, FormatElementCategory::kMeridianIndicator, type_to_element_map, category_to_elements_map)); // A Format element in "kMeridianIndicator" category must exist when a format // element of "kHH" or "kHH12" is present. Also, if we have a format element // in "kMeridianIndicator" category, a format element of "kHH" or "kHH12" type // must exist. SQL_RETURN_IF_ERROR( CheckForCoexistance({FormatElementType::kHH, FormatElementType::kHH12}, FormatElementCategory::kMeridianIndicator, type_to_element_map, category_to_elements_map)); // Format elements of "kSSSSS" type contain Hour, Minute and Second info, // therefore elements in "kHour" (along with "kMeridianIndicator") and // "kMinute" categories and elements of "kSS" type are disallowed. SQL_RETURN_IF_ERROR(CheckForMutuallyExclusiveElements( FormatElementType::kSSSSS, FormatElementCategory::kHour, type_to_element_map, category_to_elements_map)); SQL_RETURN_IF_ERROR(CheckForMutuallyExclusiveElements( FormatElementType::kSSSSS, FormatElementCategory::kMinute, type_to_element_map, category_to_elements_map)); SQL_RETURN_IF_ERROR(CheckForMutuallyExclusiveElements( FormatElementType::kSSSSS, FormatElementType::kSS, type_to_element_map)); return absl::OkStatus(); } // The result <timestamp> is always at microseconds precision. absl::Status ParseTimeWithFormatElements( const std::vector<DateTimeFormatElement>& format_elements, absl::string_view timestamp_string, const absl::TimeZone default_timezone, const absl::Time current_timestamp, int64_t* timestamp_micros) { absl::Time base_time; SQL_RETURN_IF_ERROR(ParseTimeWithFormatElements( format_elements, timestamp_string, default_timezone, current_timestamp, kMicroseconds, &base_time)); if (!ConvertTimeToTimestamp(base_time, timestamp_micros)) { return MakeEvalError() << "Invalid result from parsing function"; } return absl::OkStatus(); } absl::Status ValidateDateTimeFormatElementsForTimestampType( const std::vector<DateTimeFormatElement>& format_elements) { return ValidateDateTimeFormatElements(format_elements, {}, "TIMESTAMP"); } absl::Status ValidateDateTimeFormatElementsForDateType( const std::vector<DateTimeFormatElement>& format_elements) { return ValidateDateTimeFormatElements( format_elements, {FormatElementCategory::kHour, FormatElementCategory::kMinute, FormatElementCategory::kSecond, FormatElementCategory::kMeridianIndicator, FormatElementCategory::kTimeZone}, "DATE"); } absl::Status ValidateDateTimeFormatElementsForTimeType( const std::vector<DateTimeFormatElement>& format_elements) { return ValidateDateTimeFormatElements( format_elements, {FormatElementCategory::kYear, FormatElementCategory::kMonth, FormatElementCategory::kDay, FormatElementCategory::kTimeZone, FormatElementCategory::kCentury, FormatElementCategory::kQuarter, FormatElementCategory::kWeek, FormatElementCategory::kEraIndicator}, "TIME"); } absl::Status ValidateDateTimeFormatElementsForDatetimeType( const std::vector<DateTimeFormatElement>& format_elements) { return ValidateDateTimeFormatElements( format_elements, {FormatElementCategory::kTimeZone}, "DATETIME"); } // Checks to see if the format elements are valid for the date or time type. absl::Status ValidateDateDateTimeFormatElementsForFormatting( absl::Span<const DateTimeFormatElement> format_elements) { for (const DateTimeFormatElement& element : format_elements) { switch (element.category) { case FormatElementCategory::kLiteral: case FormatElementCategory::kYear: case FormatElementCategory::kMonth: case FormatElementCategory::kDay: continue; default: return MakeEvalError() << "DATE does not support " << element.ToString(); } } return absl::OkStatus(); } absl::Status ValidateTimeDateTimeFormatElementsForFormatting( absl::Span<const DateTimeFormatElement> format_elements) { for (const DateTimeFormatElement& element : format_elements) { switch (element.category) { case FormatElementCategory::kLiteral: case FormatElementCategory::kHour: case FormatElementCategory::kMinute: case FormatElementCategory::kSecond: case FormatElementCategory::kMeridianIndicator: continue; default: return MakeEvalError() << "TIME does not support " << element.ToString(); } } return absl::OkStatus(); } absl::Status ValidateDatetimeDateTimeFormatElementsForFormatting( absl::Span<const DateTimeFormatElement> format_elements) { for (const DateTimeFormatElement& element : format_elements) { switch (element.category) { case FormatElementCategory::kLiteral: case FormatElementCategory::kYear: case FormatElementCategory::kMonth: case FormatElementCategory::kDay: case FormatElementCategory::kHour: case FormatElementCategory::kMinute: case FormatElementCategory::kSecond: case FormatElementCategory::kMeridianIndicator: continue; default: return MakeEvalError() << "DATETIME does not support " << element.ToString(); } } return absl::OkStatus(); } } // namespace namespace cast_date_time_internal { std::string DateTimeFormatElement::ToString() const { switch (type) { case FormatElementType::kSimpleLiteral: return absl::StrCat("\'", literal_value, "\'"); case FormatElementType::kDoubleQuotedLiteral: return absl::StrCat( "\'", absl::Substitute("\"$0\"", absl::CEscape(literal_value)), "\'"); case FormatElementType::kWhitespace: { std::string space_chars = ""; for (int i = 0; i < len_in_format_str; ++i) { space_chars.push_back(' '); } return absl::StrCat("\'", space_chars, "\'"); } case FormatElementType::kFFN: return absl::StrCat("\'", "FF", subsecond_digit_count, "\'"); default: return absl::StrCat("\'", FormatElementTypeString(type), "\'"); } } static const FormatElementType kFormatElementTypeNullValue = FormatElementType::kFormatElementTypeUnspecified; using FormatElementTypeTrie = bigquery_ml_utils_base::GeneralTrie<FormatElementType, kFormatElementTypeNullValue>; const FormatElementTypeTrie* InitializeFormatElementTypeTrie() { FormatElementTypeTrie* trie = new FormatElementTypeTrie(); /*Simple Literals*/ trie->Insert("-", FormatElementType::kSimpleLiteral); trie->Insert(".", FormatElementType::kSimpleLiteral); trie->Insert("/", FormatElementType::kSimpleLiteral); trie->Insert(",", FormatElementType::kSimpleLiteral); trie->Insert("'", FormatElementType::kSimpleLiteral); trie->Insert(";", FormatElementType::kSimpleLiteral); trie->Insert(":", FormatElementType::kSimpleLiteral); /*Double Quoted Literal*/ // For the format element '\"xxxxx\"' (arbitrary text enclosed by ""), we // would match '\"' in the trie and then manually search the end of the // format element. trie->Insert("\"", FormatElementType::kDoubleQuotedLiteral); /*Whitespace*/ // For the format element consisting of a sequence of consecutive ASCII space // characters (' '), we would match ' ' in the trie and then manually search // the end of the sequence. trie->Insert(" ", FormatElementType::kWhitespace); /*Year*/ trie->Insert("YYYY", FormatElementType::kYYYY); trie->Insert("YYY", FormatElementType::kYYY); trie->Insert("YY", FormatElementType::kYY); trie->Insert("Y", FormatElementType::kY); trie->Insert("RRRR", FormatElementType::kRRRR); trie->Insert("RR", FormatElementType::kRR); trie->Insert("Y,YYY", FormatElementType::kYCommaYYY); trie->Insert("IYYY", FormatElementType::kIYYY); trie->Insert("IYY", FormatElementType::kIYY); trie->Insert("IY", FormatElementType::kIY); trie->Insert("I", FormatElementType::kI); trie->Insert("SYYYY", FormatElementType::kSYYYY); trie->Insert("YEAR", FormatElementType::kYEAR); trie->Insert("SYEAR", FormatElementType::kSYEAR); /*Month*/ trie->Insert("MM", FormatElementType::kMM); trie->Insert("MON", FormatElementType::kMON); trie->Insert("MONTH", FormatElementType::kMONTH); trie->Insert("RM", FormatElementType::kRM); /*Day*/ trie->Insert("DDD", FormatElementType::kDDD); trie->Insert("DD", FormatElementType::kDD); trie->Insert("D", FormatElementType::kD); trie->Insert("DAY", FormatElementType::kDAY); trie->Insert("DY", FormatElementType::kDY); trie->Insert("J", FormatElementType::kJ); /*Hour*/ trie->Insert("HH", FormatElementType::kHH); trie->Insert("HH12", FormatElementType::kHH12); trie->Insert("HH24", FormatElementType::kHH24); /*Minute*/ trie->Insert("MI", FormatElementType::kMI); /*Second*/ trie->Insert("SS", FormatElementType::kSS); trie->Insert("SSSSS", FormatElementType::kSSSSS); trie->Insert("FF1", FormatElementType::kFFN); trie->Insert("FF2", FormatElementType::kFFN); trie->Insert("FF3", FormatElementType::kFFN); trie->Insert("FF4", FormatElementType::kFFN); trie->Insert("FF5", FormatElementType::kFFN); trie->Insert("FF6", FormatElementType::kFFN); trie->Insert("FF7", FormatElementType::kFFN); trie->Insert("FF8", FormatElementType::kFFN); trie->Insert("FF9", FormatElementType::kFFN); /*Meridian indicator*/ trie->Insert("AM", FormatElementType::kAM); trie->Insert("PM", FormatElementType::kPM); trie->Insert("A.M.", FormatElementType::kAMWithDots); trie->Insert("P.M.", FormatElementType::kPMWithDots); /*Time zone*/ trie->Insert("TZH", FormatElementType::kTZH); trie->Insert("TZM", FormatElementType::kTZM); /*Century*/ trie->Insert("CC", FormatElementType::kCC); trie->Insert("SCC", FormatElementType::kSCC); /*Quarter*/ trie->Insert("Q", FormatElementType::kQ); /*Week*/ trie->Insert("IW", FormatElementType::kIW); trie->Insert("WW", FormatElementType::kWW); trie->Insert("W", FormatElementType::kW); /*Era Indicator*/ trie->Insert("AD", FormatElementType::kAD); trie->Insert("BC", FormatElementType::kBC); trie->Insert("A.D.", FormatElementType::kADWithDots); trie->Insert("B.C.", FormatElementType::kBCWithDots); /*Misc*/ trie->Insert("SP", FormatElementType::kSP); trie->Insert("TH", FormatElementType::kTH); trie->Insert("SPTH", FormatElementType::kSPTH); trie->Insert("THSP", FormatElementType::kTHSP); trie->Insert("FM", FormatElementType::kFM); return trie; } const FormatElementTypeTrie& GetFormatElementTypeTrie() { static const FormatElementTypeTrie* format_element_type_trie = InitializeFormatElementTypeTrie(); return *format_element_type_trie; } // Decides the <format_casing_type> field for a non-literal format element // based on its original string and category. absl::StatusOr<FormatCasingType> GetFormatCasingTypeOfNonLiteralElements( absl::string_view format_element_str, FormatElementCategory category) { SQL_RET_CHECK(category != FormatElementCategory::kLiteral); SQL_RET_CHECK(!format_element_str.empty() && absl::ascii_isalpha(format_element_str[0])); // If the first letter of the element is lowercase, then all the letters in // the output are lowercase. if (absl::ascii_islower(format_element_str[0])) { return FormatCasingType::kAllLettersLowercase; } // If the elements are in "kMeridianIndicator" or "kEraIndicator" category, // or the length of format element string is 1, the first letter indicates the // overall casing. Besides "A.M."/"P.M."/"A.D."/"B.C." (that belong to // "kMeridianIndicator" or "kEraIndicator" categories), the only element // whose second character of the element string is not an alphabet is // "Y,YYY", since this element does not output letters, the choice of // FormatCasingType should make no difference to the formatting result. if (category == FormatElementCategory::kMeridianIndicator || category == FormatElementCategory::kEraIndicator || format_element_str.size() == 1 || absl::AsciiStrToUpper(format_element_str) == "Y,YYY") { return FormatCasingType::kAllLettersUppercase; } SQL_RET_CHECK(absl::ascii_isalpha(format_element_str[1])); // If the first letter is upper case and the second letter is lowercase, then // the first letter of each word in the output is capitalized and the other // letters are lowercase. if (absl::ascii_isupper(format_element_str[0]) && absl::ascii_islower(format_element_str[1])) { return FormatCasingType::kOnlyFirstLetterUppercase; } // If the first two letters of the element are both upper case, the output is // capitalized. return FormatCasingType::kAllLettersUppercase; } // We need the upper <format_str> to do the search in prefix tree since matching // are case-sensitive and we need the original <format_str> to extract the // original_str for the format element object. absl::StatusOr<DateTimeFormatElement> GetNextDateTimeFormatElement( absl::string_view format_str, absl::string_view upper_format_str) { DateTimeFormatElement format_element; int matched_len; const FormatElementTypeTrie& format_element_type_trie = GetFormatElementTypeTrie(); const FormatElementType& type = format_element_type_trie.GetDataForMaximalPrefix( upper_format_str, &matched_len, /*is_terminator = */ nullptr); if (type == kFormatElementTypeNullValue) { return MakeEvalError() << "Cannot find matched format element"; } format_element.type = type; format_element.category = GetFormatElementCategoryFromType(type); if (format_element.category != FormatElementCategory::kLiteral) { SQL_ASSIGN_OR_RETURN( format_element.format_casing_type, GetFormatCasingTypeOfNonLiteralElements( format_str.substr(0, matched_len), format_element.category)); format_element.len_in_format_str = matched_len; if (format_element.type == FormatElementType::kFFN && !absl::SimpleAtoi(format_str.substr(2, matched_len - 2), &format_element.subsecond_digit_count)) { return MakeEvalError() << "Failed to parse format element of FFN type"; } return format_element; } // For literal format elements, we preserve casing of output letters since // they are originally from user input format string. format_element.format_casing_type = FormatCasingType::kPreserveCase; if (format_element.type == FormatElementType::kSimpleLiteral) { format_element.len_in_format_str = matched_len; format_element.literal_value = format_str.substr(0, matched_len); return format_element; } if (format_element.type == FormatElementType::kWhitespace) { // If the matched type is "kWhitespace", we search for the end of sequence // of consecutive ' ' (ASCII 32) characters. while (matched_len < format_str.length() && format_str[matched_len] == ' ') { matched_len++; } format_element.len_in_format_str = matched_len; return format_element; } SQL_RET_CHECK(format_element.type == FormatElementType::kDoubleQuotedLiteral); // If the matched type is "kDoubleQuotedLiteral", we search for the end // manually and do the unescaping in this process. format_element.literal_value = ""; size_t ind_to_check = 1; bool is_escaped = false; bool stop_search = false; while (ind_to_check < format_str.length() && !stop_search) { // Includes the char at position <ind_to_check>. matched_len++; char char_to_check = format_str[ind_to_check]; ind_to_check++; if (is_escaped) { if (char_to_check == '\\' || char_to_check == '\"') { is_escaped = false; } else { return MakeEvalError() << "Unsupported escape sequence \\" << char_to_check << " in text"; } } else if (char_to_check == '\\') { is_escaped = true; continue; } else if (char_to_check == '\"') { stop_search = true; break; } format_element.literal_value.push_back(char_to_check); } if (!stop_search) { return MakeEvalError() << "Cannot find matching \" for quoted literal"; } format_element.len_in_format_str = matched_len; return format_element; } // We need the upper format_str to do the search in prefix tree since matching // are case-sensitive and we need the original format_str to extract the // original_str for the format element object. absl::StatusOr<std::vector<DateTimeFormatElement>> GetDateTimeFormatElements( absl::string_view format_str) { std::vector<DateTimeFormatElement> format_elements; size_t processed_len = 0; std::string upper_format_str_temp = absl::AsciiStrToUpper(format_str); absl::string_view upper_format_str = upper_format_str_temp; while (processed_len < format_str.size()) { auto res = GetNextDateTimeFormatElement(format_str.substr(processed_len), upper_format_str.substr(processed_len)); if (res.ok()) { DateTimeFormatElement& format_element = res.value(); format_elements.push_back(format_element); processed_len += format_element.len_in_format_str; } else { return MakeEvalError() << res.status().message() << " at " << processed_len; } } return format_elements; } // Takes a format model vector and rewrites it to be a format element string // that can be correctly formatted by FormatTime. Any elements that are not // supported by FormatTime will be formatted manually in this function. Any // non-literal elements that output strings will be outputted with the first // letter capitalized and all subsequent letters will be lowercase. absl::StatusOr<std::string> FromDateTimeFormatElementToFormatString( const DateTimeFormatElement& format_element, const absl::TimeZone::CivilInfo info) { switch (format_element.type) { case FormatElementType::kSimpleLiteral: case FormatElementType::kDoubleQuotedLiteral: return format_element.literal_value; case FormatElementType::kWhitespace: { std::string res = ""; for (int i = 0; i < format_element.len_in_format_str; ++i) { res.push_back(' '); } return res; } case FormatElementType::kYYYY: case FormatElementType::kYYY: case FormatElementType::kYY: case FormatElementType::kY: case FormatElementType::kRRRR: case FormatElementType::kRR: { int element_length = format_element.len_in_format_str; // YYYY will output the whole year regardless of how many digits are in // the year. // FormatTime does not support the year with the last 3 digits. int trunc_year = static_cast<int>(info.cs.year()) % powers_of_ten[element_length]; return absl::StrFormat( "%0*d", format_element.len_in_format_str, (element_length == 4 ? info.cs.year() : trunc_year)); break; } case FormatElementType::kMM: return "%m"; case FormatElementType::kMON: return "%b"; case FormatElementType::kMONTH: return "%B"; case FormatElementType::kD: return std::to_string(internal_functions::DayOfWeekIntegerSunToSat1To7( absl::GetWeekday(info.cs))); case FormatElementType::kDD: return "%d"; case FormatElementType::kDDD: return "%j"; case FormatElementType::kDAY: return "%A"; case FormatElementType::kDY: return "%a"; case FormatElementType::kHH: case FormatElementType::kHH12: return "%I"; case FormatElementType::kHH24: return "%H"; case FormatElementType::kMI: return "%M"; case FormatElementType::kSS: return "%S"; case FormatElementType::kSSSSS: { // FormatTime does not support having 5 digit second of the day. int second_of_day = info.cs.hour() * kNaiveNumSecondsPerHour + info.cs.minute() * kNaiveNumSecondsPerMinute + info.cs.second(); return absl::StrFormat("%05d", second_of_day); } case FormatElementType::kFFN: { return absl::StrCat("%E", format_element.subsecond_digit_count, "f"); } case FormatElementType::kAM: case FormatElementType::kPM: { if (info.cs.hour() >= 12) { return "PM"; } else { return "AM"; } } case FormatElementType::kAMWithDots: case FormatElementType::kPMWithDots: { if (info.cs.hour() >= 12) { return "P.M."; } else { return "A.M."; } } case FormatElementType::kTZH: case FormatElementType::kTZM: { bool positive_offset; int32_t hour_offset; int32_t minute_offset; internal_functions::GetSignHourAndMinuteTimeZoneOffset( info, &positive_offset, &hour_offset, &minute_offset); if (format_element.type == FormatElementType::kTZH) { return absl::StrFormat("%c%02d", positive_offset ? '+' : '-', hour_offset); } else { return absl::StrFormat("%02d", minute_offset); } } default: return MakeEvalError() << "Unsupported format element " << format_element.ToString(); } } absl::StatusOr<std::string> ResolveFormatString( const DateTimeFormatElement& format_element, absl::Time base_time, absl::TimeZone timezone) { const absl::TimeZone::CivilInfo info = timezone.At(base_time); SQL_ASSIGN_OR_RETURN( const std::string format_string, FromDateTimeFormatElementToFormatString(format_element, info)); // We do not need to go through steps of calling FormatTime function and // resolving casing for literal format elements. if (format_element.category == FormatElementCategory::kLiteral) { return format_string; } // The following resolves casing for format elements. std::string resolved_string = absl::FormatTime(format_string, base_time, timezone); switch (format_element.format_casing_type) { case FormatCasingType::kFormatCasingTypeUnspecified: return MakeEvalError() << "Format casing type is unspecified"; case FormatCasingType::kPreserveCase: // For any format element that outputs a string, its formatting result from // FormatTime function are already outputted with the first letter // capitalized and all subsequent letters being lowercase, so we do not need // any extra processing here. case FormatCasingType::kOnlyFirstLetterUppercase: return resolved_string; case FormatCasingType::kAllLettersUppercase: return absl::AsciiStrToUpper(resolved_string); case FormatCasingType::kAllLettersLowercase: return absl::AsciiStrToLower(resolved_string); } } absl::StatusOr<std::string> FromCastFormatTimestampToStringInternal( absl::Span<const DateTimeFormatElement> format_elements, absl::Time base_time, absl::TimeZone timezone) { if (!IsValidTime(base_time)) { return MakeEvalError() << "Invalid timestamp value: " << absl::ToUnixMicros(base_time); } absl::TimeZone normalized_timezone = internal_functions::GetNormalizedTimeZone(base_time, timezone); std::string updated_format_string; for (const DateTimeFormatElement& format_element : format_elements) { SQL_ASSIGN_OR_RETURN( std::string str_format, ResolveFormatString(format_element, base_time, normalized_timezone)); absl::StrAppend(&updated_format_string, str_format); } return updated_format_string; } } // namespace cast_date_time_internal absl::StatusOr<StringToTimestampCaster> StringToTimestampCaster::Create( absl::string_view format_string) { SQL_RETURN_IF_ERROR(ConductBasicFormatStringChecks(format_string)); SQL_ASSIGN_OR_RETURN(auto format_elements, GetDateTimeFormatElements(format_string)); SQL_RETURN_IF_ERROR( ValidateDateTimeFormatElementsForTimestampType(format_elements)); return StringToTimestampCaster(std::move(format_elements)); } absl::Status StringToTimestampCaster::Cast(absl::string_view timestamp_string, absl::TimeZone default_timezone, absl::Time current_timestamp, int64_t* timestamp_micros) const { if (!IsWellFormedUTF8(timestamp_string)) { return MakeEvalError() << "Input string is not valid UTF-8"; } return ParseTimeWithFormatElements(format_elements_, timestamp_string, default_timezone, current_timestamp, timestamp_micros); } absl::Status CastStringToTimestamp(absl::string_view format_string, absl::string_view timestamp_string, const absl::TimeZone default_timezone, const absl::Time current_timestamp, int64_t* timestamp_micros) { SQL_ASSIGN_OR_RETURN(auto caster, StringToTimestampCaster::Create(format_string)); return caster.Cast(timestamp_string, default_timezone, current_timestamp, timestamp_micros); } absl::Status CastStringToTimestamp(absl::string_view format_string, absl::string_view timestamp_string, absl::string_view default_timezone_string, const absl::Time current_timestamp, int64_t* timestamp) { // Other two input string arguments (<format_string> and <timestamp_string>) // are checked in the overload call to CastStringToTimestamp. if (!IsWellFormedUTF8(default_timezone_string)) { return MakeEvalError() << "Input string is not valid UTF-8"; } absl::TimeZone timezone; SQL_RETURN_IF_ERROR(MakeTimeZone(default_timezone_string, &timezone)); return CastStringToTimestamp(format_string, timestamp_string, timezone, current_timestamp, timestamp); } absl::Status CastStringToTimestamp(absl::string_view format_string, absl::string_view timestamp_string, const absl::TimeZone default_timezone, const absl::Time current_timestamp, absl::Time* timestamp) { if (!IsWellFormedUTF8(timestamp_string)) { return MakeEvalError() << "Input string is not valid UTF-8"; } SQL_RETURN_IF_ERROR(ConductBasicFormatStringChecks(format_string)); SQL_ASSIGN_OR_RETURN(const std::vector<DateTimeFormatElement> format_elements, GetDateTimeFormatElements(format_string)); SQL_RETURN_IF_ERROR( ValidateDateTimeFormatElementsForTimestampType(format_elements)); return ParseTimeWithFormatElements(format_elements, timestamp_string, default_timezone, current_timestamp, kNanoseconds, timestamp); } absl::Status CastStringToTimestamp(absl::string_view format_string, absl::string_view timestamp_string, absl::string_view default_timezone_string, const absl::Time current_timestamp, absl::Time* timestamp) { // Other two input string arguments (<format_string> and <timestamp_string>) // are checked in the overload call to CastStringToTimestamp. if (!IsWellFormedUTF8(default_timezone_string)) { return MakeEvalError() << "Input string is not valid UTF-8"; } absl::TimeZone timezone; SQL_RETURN_IF_ERROR(MakeTimeZone(default_timezone_string, &timezone)); return CastStringToTimestamp(format_string, timestamp_string, timezone, current_timestamp, timestamp); } absl::StatusOr<StringToDateCaster> StringToDateCaster::Create( absl::string_view format_string) { SQL_RETURN_IF_ERROR(ConductBasicFormatStringChecks(format_string)); SQL_ASSIGN_OR_RETURN(auto format_elements, GetDateTimeFormatElements(format_string)); SQL_RETURN_IF_ERROR( ValidateDateTimeFormatElementsForDateType(format_elements)); return StringToDateCaster(std::move(format_elements)); } absl::Status StringToDateCaster::Cast(absl::string_view date_string, int32_t current_date, int32_t* date) const { if (!IsWellFormedUTF8(date_string)) { return MakeEvalError() << "Input string is not valid UTF-8"; } absl::Time current_date_utc_ts; int64_t timestamp; SQL_RETURN_IF_ERROR(ConvertDateToTimestamp(current_date, absl::UTCTimeZone(), &current_date_utc_ts)); // We use <current_date_utc_ts> (constructed with <current_date> and "UTC") // as <current_timestamp> and "UTC" as <default_timezone>, so the // <current_year> and <current_date> used in ParseTimeWithFormatElements // function would be the same as year and month in <current_date>. SQL_RETURN_IF_ERROR(ParseTimeWithFormatElements( format_elements_, date_string, absl::UTCTimeZone(), current_date_utc_ts, &timestamp)); SQL_RETURN_IF_ERROR(ExtractFromTimestamp(DATE, timestamp, kMicroseconds, absl::UTCTimeZone(), date)); return absl::OkStatus(); } absl::Status CastStringToDate(absl::string_view format_string, absl::string_view date_string, int32_t current_date, int32_t* date) { SQL_ASSIGN_OR_RETURN(auto caster, StringToDateCaster::Create(format_string)); return caster.Cast(date_string, current_date, date); } absl::Status CastStringToTime(absl::string_view format_string, absl::string_view time_string, TimestampScale scale, TimeValue* time) { SQL_ASSIGN_OR_RETURN(auto caster, StringToTimeCaster::Create(format_string)); return caster.Cast(time_string, scale, time); } absl::StatusOr<StringToTimeCaster> StringToTimeCaster::Create( absl::string_view format_string) { SQL_RETURN_IF_ERROR(ConductBasicFormatStringChecks(format_string)); SQL_ASSIGN_OR_RETURN(auto format_elements, GetDateTimeFormatElements(format_string)); SQL_RETURN_IF_ERROR( ValidateDateTimeFormatElementsForTimeType(format_elements)); return StringToTimeCaster(std::move(format_elements)); } absl::Status StringToTimeCaster::Cast(absl::string_view time_string, TimestampScale scale, TimeValue* time) const { if (!IsWellFormedUTF8(time_string)) { return MakeEvalError() << "Input string is not valid UTF-8"; } SQL_RET_CHECK(scale == kMicroseconds || scale == kNanoseconds) << "Only kNanoseconds or kMicroseconds scale is supported"; absl::Time timestamp; // We use "1970-01-01 utc" as the <current_timestamp> argument for // ParseTimeWithFormatElements function, and it actually has no effect for the // final output since we derive default values for time parts from // "00:00:00:000000000". SQL_RETURN_IF_ERROR(ParseTimeWithFormatElements( format_elements_, time_string, absl::UTCTimeZone(), /*current_timestamp=*/absl::UnixEpoch(), scale, &timestamp)); SQL_RETURN_IF_ERROR( ConvertTimestampToTime(timestamp, absl::UTCTimeZone(), scale, time)); return absl::OkStatus(); } absl::StatusOr<StringToDatetimeCaster> StringToDatetimeCaster::Create( absl::string_view format_string) { SQL_RETURN_IF_ERROR(ConductBasicFormatStringChecks(format_string)); SQL_ASSIGN_OR_RETURN(auto format_elements, GetDateTimeFormatElements(format_string)); SQL_RETURN_IF_ERROR( ValidateDateTimeFormatElementsForDatetimeType(format_elements)); return StringToDatetimeCaster(std::move(format_elements)); } absl::Status StringToDatetimeCaster::Cast(absl::string_view datetime_string, TimestampScale scale, int32_t current_date, DatetimeValue* datetime) const { if (!IsWellFormedUTF8(datetime_string)) { return MakeEvalError() << "Input string is not valid UTF-8"; } SQL_RET_CHECK(scale == kMicroseconds || scale == kNanoseconds) << "Only kNanoseconds or kMicroseconds scale is supported"; absl::Time current_date_utc_ts; absl::Time timestamp; SQL_RETURN_IF_ERROR(ConvertDateToTimestamp(current_date, absl::UTCTimeZone(), &current_date_utc_ts)); // We use <current_date_utc_ts> (constructed with <current_date> and "UTC") // as <current_timestamp> and "UTC" as <default_timezone>, so the // <current_year> and <current_date> used in ParseTimeWithFormatElements // function would be the same as year and month in <current_date>. SQL_RETURN_IF_ERROR(ParseTimeWithFormatElements( format_elements_, datetime_string, absl::UTCTimeZone(), current_date_utc_ts, scale, &timestamp)); SQL_RETURN_IF_ERROR( ConvertTimestampToDatetime(timestamp, absl::UTCTimeZone(), datetime)); return absl::OkStatus(); } absl::Status CastStringToDatetime(absl::string_view format_string, absl::string_view datetime_string, TimestampScale scale, int32_t current_date, DatetimeValue* datetime) { SQL_ASSIGN_OR_RETURN(auto caster, StringToDatetimeCaster::Create(format_string)); return caster.Cast(datetime_string, scale, current_date, datetime); } absl::Status ValidateFormatStringForParsing( absl::string_view format_string, bigquery_ml_utils::TypeKind out_type) { SQL_RETURN_IF_ERROR(ConductBasicFormatStringChecks(format_string)); SQL_ASSIGN_OR_RETURN(const std::vector<DateTimeFormatElement> format_elements, GetDateTimeFormatElements(format_string)); if (out_type == TYPE_TIMESTAMP) { return ValidateDateTimeFormatElementsForTimestampType(format_elements); } else if (out_type == TYPE_DATE) { return ValidateDateTimeFormatElementsForDateType(format_elements); } else if (out_type == TYPE_TIME) { return ValidateDateTimeFormatElementsForTimeType(format_elements); } else if (out_type == TYPE_DATETIME) { return ValidateDateTimeFormatElementsForDatetimeType(format_elements); } else { return MakeSqlError() << "Unsupported output type for validation"; } } absl::Status ValidateFormatStringForFormatting( absl::string_view format_string, bigquery_ml_utils::TypeKind out_type) { SQL_RETURN_IF_ERROR(ConductBasicFormatStringChecks(format_string)); SQL_ASSIGN_OR_RETURN( std::vector<cast_date_time_internal::DateTimeFormatElement> format_elements, cast_date_time_internal::GetDateTimeFormatElements(format_string)); switch (out_type) { case TYPE_DATE: return ValidateDateDateTimeFormatElementsForFormatting(format_elements); case TYPE_DATETIME: return ValidateDatetimeDateTimeFormatElementsForFormatting( format_elements); case TYPE_TIME: return ValidateTimeDateTimeFormatElementsForFormatting(format_elements); case TYPE_TIMESTAMP: return absl::OkStatus(); default: return MakeSqlError() << "Unsupported output type for validation"; } } absl::StatusOr<DateToStringCaster> DateToStringCaster::Create( absl::string_view format_string) { SQL_RETURN_IF_ERROR(ConductBasicFormatStringChecks(format_string)); SQL_ASSIGN_OR_RETURN( auto format_elements, cast_date_time_internal::GetDateTimeFormatElements(format_string)); SQL_RETURN_IF_ERROR( ValidateDateDateTimeFormatElementsForFormatting(format_elements)); return DateToStringCaster(std::move(format_elements)); } absl::Status DateToStringCaster::Cast(int32_t date, std::string* out) const { if (!IsValidDate(date)) { return MakeEvalError() << "Invalid date value: " << date; } // Treats it as a timestamp at midnight on that date and invokes the // format_timestamp function. int64_t date_timestamp = static_cast<int64_t>(date) * kNaiveNumMicrosPerDay; SQL_ASSIGN_OR_RETURN( *out, cast_date_time_internal::FromCastFormatTimestampToStringInternal( format_elements_, MakeTime(date_timestamp, kMicroseconds), absl::UTCTimeZone())); return absl::OkStatus(); } absl::Status CastFormatDateToString(absl::string_view format_string, int32_t date, std::string* out) { SQL_ASSIGN_OR_RETURN(auto caster, DateToStringCaster::Create(format_string)); SQL_RETURN_IF_ERROR(caster.Cast(date, out)); return absl::OkStatus(); } absl::StatusOr<DatetimeToStringCaster> DatetimeToStringCaster::Create( absl::string_view format_string) { SQL_RETURN_IF_ERROR(ConductBasicFormatStringChecks(format_string)); SQL_ASSIGN_OR_RETURN( auto format_elements, cast_date_time_internal::GetDateTimeFormatElements(format_string)); SQL_RETURN_IF_ERROR( ValidateDatetimeDateTimeFormatElementsForFormatting(format_elements)); return DatetimeToStringCaster(std::move(format_elements)); } absl::Status DatetimeToStringCaster::Cast(const DatetimeValue& datetime, std::string* out) const { if (!datetime.IsValid()) { return MakeEvalError() << "Invalid datetime value: " << datetime.DebugString(); } absl::Time datetime_in_utc = absl::UTCTimeZone().At(datetime.ConvertToCivilSecond()).pre; datetime_in_utc += absl::Nanoseconds(datetime.Nanoseconds()); SQL_ASSIGN_OR_RETURN( *out, cast_date_time_internal::FromCastFormatTimestampToStringInternal( format_elements_, datetime_in_utc, absl::UTCTimeZone())); return absl::OkStatus(); } absl::Status CastFormatDatetimeToString(absl::string_view format_string, const DatetimeValue& datetime, std::string* out) { SQL_ASSIGN_OR_RETURN(auto caster, DatetimeToStringCaster::Create(format_string)); return caster.Cast(datetime, out); } absl::StatusOr<TimeToStringCaster> TimeToStringCaster::Create( absl::string_view format_string) { SQL_RETURN_IF_ERROR(ConductBasicFormatStringChecks(format_string)); SQL_ASSIGN_OR_RETURN( auto format_elements, cast_date_time_internal::GetDateTimeFormatElements(format_string)); SQL_RETURN_IF_ERROR( ValidateTimeDateTimeFormatElementsForFormatting(format_elements)); return TimeToStringCaster(std::move(format_elements)); } absl::Status TimeToStringCaster::Cast(const TimeValue& time, std::string* out) const { if (!time.IsValid()) { return MakeEvalError() << "Invalid time value: " << time.DebugString(); } absl::Time time_in_epoch_day = absl::UTCTimeZone() .At(absl::CivilSecond(1970, 1, 1, time.Hour(), time.Minute(), time.Second())) .pre; time_in_epoch_day += absl::Nanoseconds(time.Nanoseconds()); SQL_ASSIGN_OR_RETURN( *out, cast_date_time_internal::FromCastFormatTimestampToStringInternal( format_elements_, time_in_epoch_day, absl::UTCTimeZone())); return absl::OkStatus(); } absl::Status CastFormatTimeToString(absl::string_view format_string, const TimeValue& time, std::string* out) { SQL_ASSIGN_OR_RETURN(auto caster, TimeToStringCaster::Create(format_string)); return caster.Cast(time, out); } absl::StatusOr<TimestampToStringCaster> TimestampToStringCaster::Create( absl::string_view format_string) { SQL_RETURN_IF_ERROR(ConductBasicFormatStringChecks(format_string)); SQL_ASSIGN_OR_RETURN( auto format_elements, cast_date_time_internal::GetDateTimeFormatElements(format_string)); return TimestampToStringCaster(std::move(format_elements)); } absl::Status TimestampToStringCaster::Cast(int64_t timestamp_micros, absl::TimeZone timezone, std::string* out) const { SQL_ASSIGN_OR_RETURN( *out, cast_date_time_internal::FromCastFormatTimestampToStringInternal( format_elements_, MakeTime(timestamp_micros, kMicroseconds), timezone)); return absl::OkStatus(); } absl::Status TimestampToStringCaster::Cast(absl::Time timestamp, absl::TimeZone timezone, std::string* out) const { SQL_ASSIGN_OR_RETURN( *out, cast_date_time_internal::FromCastFormatTimestampToStringInternal( format_elements_, timestamp, timezone)); return absl::OkStatus(); } absl::Status CastFormatTimestampToString(absl::string_view format_string, int64_t timestamp_micros, absl::TimeZone timezone, std::string* out) { // <format_string> is checked in the overload call to // CastFormatTimestampToString. return CastFormatTimestampToString( format_string, MakeTime(timestamp_micros, kMicroseconds), timezone, out); } absl::Status CastFormatTimestampToString(absl::string_view format_string, int64_t timestamp_micros, absl::string_view timezone_string, std::string* out) { SQL_RETURN_IF_ERROR(ConductBasicFormatStringChecks(format_string)); if (!IsWellFormedUTF8(timezone_string)) { return MakeEvalError() << "Timezone string is not a valid UTF-8 string."; } absl::TimeZone timezone; SQL_RETURN_IF_ERROR(MakeTimeZone(timezone_string, &timezone)); return CastFormatTimestampToString(format_string, timestamp_micros, timezone, out); } absl::Status CastFormatTimestampToString(absl::string_view format_string, absl::Time timestamp, absl::string_view timezone_string, std::string* out) { // <format_string> is checked in the overload call to // CastFormatTimestampToString. absl::TimeZone timezone; SQL_RETURN_IF_ERROR(MakeTimeZone(timezone_string, &timezone)); return CastFormatTimestampToString(format_string, timestamp, timezone, out); } absl::Status CastFormatTimestampToString(absl::string_view format_string, absl::Time timestamp, absl::TimeZone timezone, std::string* out) { SQL_ASSIGN_OR_RETURN(auto caster, TimestampToStringCaster::Create(format_string)); return caster.Cast(timestamp, timezone, out); } } // namespace functions } // namespace bigquery_ml_utils