in sql_utils/public/functions/parse_date_time.cc [964:1646]
// Parses <timestamp_string> according to <format> and writes the resulting
// point in time to <timestamp>.
//
// The format string is walked one element at a time. Most elements (years,
// centuries, months, days, clock fields, time zones, subseconds, ISO parts,
// whitespace) are handled natively in the switch below; every other element
// is handed to ParseTM() together with a NUL-terminated copy of the remaining
// input.
//
// Arguments:
//   format            Format string. One trailing NUL byte, if present, is
//                     ignored.
//   timestamp_string  Input to parse. One trailing NUL byte, if present, is
//                     ignored; leading and trailing whitespace is skipped.
//   default_timezone  Time zone used to interpret the parsed fields unless
//                     the input supplies a zone name (%Z) or a UTC offset
//                     (%z / %Ez).
//   scale             Forwarded to the subsecond parser; its integer value is
//                     also the largest digit N accepted in '%E<N>S'.
//   parse_version2    When true, UpdateYearMonthDayIfNeeded() is invoked with
//                     the collected DateParseContext before the final
//                     conversion.
//   timestamp         Output; only meaningful when OK is returned.
//
// Fields that the format does not mention default to 1970-01-01 00:00:00.
//
// Returns an error built with MakeEvalError() when the format is malformed,
// the input does not match the format, a time zone offset is out of range, a
// field required normalization (e.g. "Sep 31"), or the result fails
// IsValidTime().
static absl::Status ParseTime(absl::string_view format,
                              absl::string_view timestamp_string,
                              const absl::TimeZone default_timezone,
                              TimestampScale scale, bool parse_version2,
                              absl::Time* timestamp) {
  // The unparsed input.  Note that data and end_of_data can be nullptr
  // for an empty string_view.
  const char* data = timestamp_string.data();
  const char* end_of_data = data + timestamp_string.length();
  // State for the lazily created NUL-terminated copy of the input that is
  // handed to ParseTM(); see the comment above the copy below.
  bool read_copy = false;
  const char* original_data_copy_position = nullptr;
  std::string data_copy_str;

  // If the last byte of the 'timestamp_string' is a nul-byte then we ignore it.
  if (data != end_of_data) {
    const char* last_char = end_of_data - 1;
    if (*last_char == '\0') {
      end_of_data = last_char;
    }
  }

  // Skips leading whitespace.
  data = ConsumeWhitespace(data, end_of_data);

  // Sets default values for unspecified fields.
  struct tm tm = { 0 };
  tm.tm_year = 1970 - 1900;  // tm_year is an offset from 1900
  tm.tm_mon = 1 - 1;         // tm_mon is 0-based, so this is January
  tm.tm_mday = 1;
  tm.tm_hour = 0;
  tm.tm_min = 0;
  tm.tm_sec = 0;
  tm.tm_wday = 4;  // Thursday
  tm.tm_yday = 0;
  tm.tm_isdst = 0;

  DateParseContext date_parse_context;

  absl::Duration subseconds;

  int timezone_offset_minutes = 0;
  bool saw_timezone_offset = false;

  absl::TimeZone timezone = default_timezone;

  const char* fmt = format.data();
  const char* end_of_fmt = fmt + format.length();

  // If the last byte of the 'format' string is a nul-byte then we ignore it.
  if (fmt != end_of_fmt) {
    const char* last_char = end_of_fmt - 1;
    if (*last_char == '\0') {
      end_of_fmt = last_char;
    }
  }

  bool twelve_hour = false;
  bool afternoon = false;

  bool saw_percent_s = false;
  int64_t percent_s_time = 0;

  int century = 0;
  // Should the value in <century> be applied to <tm.tm_year>.
  bool use_century = false;
  // Has <century> been set by an explicit '%C'. <century> can be set by '%y'
  // but such an implicit value should be overwritten by a subsequent '%y'.
  bool explicit_century = false;

  // Steps through the format string one format element at a time.  Generally
  // uses strptime() to process the format elements, but has native
  // handling for timezones, subseconds, and many others.
  //
  // Inside the switch, 'continue' means the element was fully handled
  // natively, while 'break' falls through to the ParseTM() call at the bottom
  // of the loop body.  A failed native parse leaves <data> as nullptr, which
  // terminates the loop and is reported after it.
  int current_element_position = 0;
  while (data != nullptr && data < end_of_data && fmt < end_of_fmt) {
    // If the next format character is a space, skip over all the next spaces
    // in both the format and the input timestamp string.
    if (absl::ascii_isspace(*fmt)) {
      data = ConsumeWhitespace(data, end_of_data);
      while (++fmt < end_of_fmt && absl::ascii_isspace(*fmt)) continue;
      continue;
    }

    // If the next character in the format string is not a format element,
    // then that character must match exactly with the input data or an
    // error is returned.
    if (fmt != nullptr && fmt < end_of_fmt && *fmt != '%') {
      if (data != nullptr && data < end_of_data && *data == *fmt) {
        ++data;
        ++fmt;
      } else {
        return MakeEvalError() << "Mismatch between format character '" << *fmt
                               << "' and string character '" << *data << "'";
      }
      continue;
    }

    // <percent> marks the start of the current element so that its text can
    // be extracted for ParseTM() if the switch does not handle it.
    const char* percent = fmt;
    if (++fmt == end_of_fmt) {
      // The format string cannot end with a single '%'.
      return MakeEvalError() << "Format string cannot end with a single '%'";
    }

    current_element_position++;
    switch (*fmt++) {
      case 'Y':
        // For SQL we accept years 0-10000 because after offsetting
        // the result timestamp with a time zone it may fall within the valid
        // range.  The actual result timestamp value will be range-checked
        // later.
        // Note that the year value is offset in the tm by 1900.
        // If the next element in the format is another formatting escape, don't
        // allow 'ParseInt' to consume a fifth digit.
        if (fmt < end_of_fmt && *fmt == '%') {
          data = ParseInt(data, end_of_data, 4, 0, 9999, &tm.tm_year);
        } else {
          data = ParseInt(data, end_of_data, 5, 0, 10000, &tm.tm_year);
        }
        if (data != nullptr) tm.tm_year -= 1900;
        // Full year form should overwrite century.
        use_century = false;
        explicit_century = false;
        date_parse_context.last_year_element_position =
            current_element_position;
        date_parse_context.non_iso_date_part_present = true;
        continue;
      case 'C': {
        // If the next element in the format is another formatting escape, don't
        // allow 'ParseInt' to consume a third digit.
        if (fmt < end_of_fmt && *fmt == '%') {
          data = ParseInt(data, end_of_data, 2, 0, 99, &century);
        } else {
          data = ParseInt(data, end_of_data, 3, 0, 100, &century);
        }
        // Note that the year value is offset in the tm by 1900.
        // Only reset the year when no two-digit year is pending; otherwise
        // the value already parsed by '%y' must be kept.
        if (data != nullptr && !use_century) tm.tm_year = 0;
        use_century = true;
        explicit_century = true;
        date_parse_context.last_year_element_position =
            current_element_position;
        date_parse_context.non_iso_date_part_present = true;
        continue;
      }
      case 'm': {
        data = ParseInt(data, end_of_data, 2, 1, 12, &tm.tm_mon);
        // tm_mon is 0-based.  Only adjust after a successful parse, matching
        // how the year elements treat tm_year.
        if (data != nullptr) tm.tm_mon -= 1;
        date_parse_context.last_month_element_position =
            current_element_position;
        date_parse_context.non_iso_date_part_present = true;
        continue;
      }
      case 'd': {
        data = ParseInt(data, end_of_data, 2, 1, 31, &tm.tm_mday);
        date_parse_context.last_mday_element_position =
            current_element_position;
        date_parse_context.non_iso_date_part_present = true;
        continue;
      }
      case 'H':
        data = ParseInt(data, end_of_data, 2, 0, 23, &tm.tm_hour);
        twelve_hour = false;
        continue;
      case 'M':
        data = ParseInt(data, end_of_data, 2, 0, 59, &tm.tm_min);
        continue;
      case 'S':
        // 60 is accepted here and normalized as a leap second after the loop.
        data = ParseInt(data, end_of_data, 2, 0, 60, &tm.tm_sec);
        continue;
      case 'Q': {
        int quarter_number;
        data = ParseInt(data, end_of_data, 1, 1, 4, &quarter_number);
        if (data != nullptr) {
          // A quarter resolves to the first day of its first month.
          tm.tm_mon = (quarter_number - 1) * 3;
          tm.tm_mday = 1;
        }
        date_parse_context.non_iso_date_part_present = true;
        date_parse_context.last_month_element_position =
            current_element_position;
        date_parse_context.last_mday_element_position =
            current_element_position;
        continue;
      }
      case 'p': {
        data = HandleMeridianFormatters(data, end_of_data, afternoon);
        continue;
      }
      case 'r':  // equivalent to %I:%M:%S %p
        data = HandleTwelveHourFormatters(data, end_of_data, tm, twelve_hour);
        data = ExpectChar(data, end_of_data, ':');
        data = ParseInt(data, end_of_data, 2, 0, 59, &tm.tm_min);
        data = ExpectChar(data, end_of_data, ':');
        data = ParseInt(data, end_of_data, 2, 0, 60, &tm.tm_sec);
        data = ConsumeWhitespace(data, end_of_data);
        data = HandleMeridianFormatters(data, end_of_data, afternoon);
        continue;
      case 'c':  // equivalent to '%a %b %e %T %Y'
        // example: 'Tue Jul 20 12:34:56 2021'
        date_parse_context.non_iso_date_part_present = true;
        date_parse_context.last_year_element_position =
            current_element_position;
        date_parse_context.last_month_element_position =
            current_element_position;
        date_parse_context.last_mday_element_position =
            current_element_position;
        twelve_hour = false;  // probably uses %H
        break;
      case 'R':  // uses %H
      case 'T':  // uses %H
      case 'X':  // probably uses %H
        twelve_hour = false;
        break;
      case 'y':
        data = ParseInt(data, end_of_data, 2, 0, 99, &tm.tm_year);
        // Use century to keep track of combinations of %y and %C.
        if (data != nullptr && !explicit_century) {
          century = tm.tm_year < 69 ? 20 : 19;
        }
        use_century = true;
        date_parse_context.non_iso_date_part_present = true;
        date_parse_context.last_year_element_position =
            current_element_position;
        continue;
      case 'z':
        data = ParseOffset(data, end_of_data, '\0', &timezone_offset_minutes);
        if (!IsValidTimeZone(timezone_offset_minutes)) {
          return MakeEvalError()
                 << "Timezone offset out of valid range -14:00 to +14:00: "
                 << TimeZoneOffsetToString(timezone_offset_minutes);
        }
        saw_timezone_offset = true;
        continue;
      case 'Z': {
        std::string timezone_string;
        data = ParseZone(data, &timezone_string, end_of_data);
        // The input time zone string overrides the default time zone.
        SQL_RETURN_IF_ERROR(MakeTimeZone(timezone_string, &timezone));
        // Unset the timezone offset settings, we will use an offset derived
        // from the specified time zone name instead.
        timezone_offset_minutes = 0;
        saw_timezone_offset = false;
        continue;
      }
      case 's': {
        // NOTE(review): the bounds divide by kNumMillisPerSecond; the units
        // of types::kTimestampMin/Max are not visible here - confirm this is
        // the intended divisor.  The final result is range-checked with
        // IsValidTime() regardless.
        const int64_t seconds_min = types::kTimestampMin / kNumMillisPerSecond;
        const int64_t seconds_max = types::kTimestampMax / kNumMillisPerSecond;
        const int max_seconds_digits = 12;
        data = ParseInt(data, end_of_data, max_seconds_digits, seconds_min,
                        seconds_max, &percent_s_time);
        if (data != nullptr) saw_percent_s = true;
        // We don't really need to track element positions for year/month/day
        // since %s overrides everything else, but we do it for consistency
        // since it does impact the year/month/day parts.
        date_parse_context.non_iso_date_part_present = true;
        date_parse_context.last_year_element_position =
            current_element_position;
        date_parse_context.last_month_element_position =
            current_element_position;
        date_parse_context.last_mday_element_position =
            current_element_position;
        continue;
      }
      case 'E': {
        if (fmt < end_of_fmt && *fmt == 'z') {
          // A literal 'Z' in the input is accepted as a zero offset.
          if (data != nullptr && *data == 'Z') {
            timezone_offset_minutes = 0;
            saw_timezone_offset = true;
            data += 1;
            fmt += 1;
            continue;
          }
          data = ParseOffset(data, end_of_data, ':', &timezone_offset_minutes);
          if (!IsValidTimeZone(timezone_offset_minutes)) {
            return MakeEvalError()
                   << "Timezone offset out of valid range -14:00 to +14:00: "
                   << TimeZoneOffsetToString(timezone_offset_minutes);
          }
          saw_timezone_offset = true;
          fmt += 1;
          continue;
        }
        if (fmt < end_of_fmt && *fmt == 'Y') {
          // If the next element in the format is another formatting escape,
          // don't allow 'ParseInt' to consume a fifth digit.
          if (fmt + 1 < end_of_fmt && fmt[1] == '%') {
            data = ParseInt(data, end_of_data, 4, 0, 9999, &tm.tm_year);
          } else {
            data = ParseInt(data, end_of_data, 5, 0, 10000, &tm.tm_year);
          }
          // Year with century. '%EY' is treated like '%Y' in en_US locale.
          if (data != nullptr) tm.tm_year -= 1900;
          fmt += 1;
          // Full year form should overwrite century.
          use_century = false;
          explicit_century = false;
          date_parse_context.non_iso_date_part_present = true;
          date_parse_context.last_year_element_position =
              current_element_position;
          continue;
        }
        if (fmt < end_of_fmt && *fmt == 'y') {
          // Two digit year. '%Ey' is treated like '%y' in en_US locale.
          data = ParseInt(data, end_of_data, 2, 0, 99, &tm.tm_year);
          // Use century to keep track of combinations of %y and %C.
          if (data != nullptr && !explicit_century) {
            century = tm.tm_year < 69 ? 20 : 19;
          }
          fmt += 1;
          use_century = true;
          date_parse_context.non_iso_date_part_present = true;
          date_parse_context.last_year_element_position =
              current_element_position;
          continue;
        }
        if (fmt < end_of_fmt && *fmt == 'C') {
          // '%EC' treated like '%C'.
          // If the next element in the format is another formatting escape,
          // don't allow 'ParseInt' to consume a third digit.
          if (fmt + 1 < end_of_fmt && fmt[1] == '%') {
            data = ParseInt(data, end_of_data, 2, 0, 99, &century);
          } else {
            data = ParseInt(data, end_of_data, 3, 0, 100, &century);
          }
          // Note that the year value is offset in the tm by 1900.
          if (data != nullptr && !use_century) tm.tm_year = 0;
          fmt += 1;
          use_century = true;
          explicit_century = true;
          date_parse_context.non_iso_date_part_present = true;
          date_parse_context.last_year_element_position =
              current_element_position;
          continue;
        }
        if (fmt + 1 < end_of_fmt && *fmt == '*' && *(fmt + 1) == 'S') {
          data = ParseInt(data, end_of_data, 2, 0, 60, &tm.tm_sec);
          data = ParseSubSecondsIfStartingWithPoint(
              data, end_of_data, 0 /* max_digits */, scale, &subseconds);
          fmt += 2;
          continue;
        }
        if (fmt + 1 < end_of_fmt && *fmt == '4' && *(fmt + 1) == 'Y') {
          const char* bp = data;
          // Valid year range is 0 - 9999.
          data = ParseInt(data, end_of_data, 4, 0, 9999, &tm.tm_year);
          if (data != nullptr) {
            if (data - bp == 4) {
              tm.tm_year -= 1900;
            } else {
              data = nullptr;  // Less than four digits, return an error.
            }
          }
          fmt += 2;
          // Full year form should overwrite century, including an explicit
          // '%C' seen earlier (same as '%Y' and '%EY').
          use_century = false;
          explicit_century = false;
          date_parse_context.non_iso_date_part_present = true;
          date_parse_context.last_year_element_position =
              current_element_position;
          continue;
        }
        // The cast keeps std::isdigit() well-defined for bytes >= 0x80.
        if (fmt < end_of_fmt &&
            std::isdigit(static_cast<unsigned char>(*fmt))) {
          int n = 0;
          // Only %E0S to %E9S is supported (0-9 subseconds digits).
          const char* np = ParseInt(fmt, end_of_fmt, 1, 0,
                                    static_cast<int32_t>(scale), &n);
          // Check the bound before looking at the character after the digit
          // so that a format ending in e.g. "%E5" is never read past its end.
          if (np != nullptr && np < end_of_fmt && *np == 'S') {
            data = ParseInt(data, end_of_data, 2, 0, 60, &tm.tm_sec);
            if (n > 0) {
              data = ParseSubSecondsIfStartingWithPoint(data, end_of_data, n,
                                                        scale, &subseconds);
            }
            fmt = np + 1;
            continue;
          }
        }
        // Uses %H in en_US locale.
        if (fmt < end_of_fmt && *fmt == 'c') twelve_hour = false;
        // Uses %H in en_US locale.
        if (fmt < end_of_fmt && *fmt == 'X') twelve_hour = false;
        // Include the modified character in the element given to ParseTM().
        if (fmt < end_of_fmt) {
          fmt += 1;
        }
        break;
      }
      case 'I':
      case 'l': {
        data = HandleTwelveHourFormatters(data, end_of_data, tm, twelve_hour);
        continue;
      }
      case 'O':
        if (fmt < end_of_fmt && *fmt == 'H') twelve_hour = false;
        if (fmt < end_of_fmt && *fmt == 'I') {
          data = HandleTwelveHourFormatters(data, end_of_data, tm, twelve_hour);
          fmt++;
          continue;
        }
        if (fmt < end_of_fmt && *fmt == 'u') {
          // Day of week 1-7. '%Ou' is treated like '%u' in en_US locale.
          // '%u' is defined as weekday number 1-7, starting Monday
          date_parse_context.elements.push_back(
              {'u', data, end_of_data, current_element_position});
          data = ParseInt(data, end_of_data, 1, 1, 7, &tm.tm_wday);
          fmt += 1;
          continue;
        }
        if (fmt < end_of_fmt && *fmt == 'w') {
          // Day of week 0-6. '%Ow' is treated like '%w' in en_US locale.
          // '%w' is defined as weekday number 0-6, starting Sunday
          date_parse_context.elements.push_back(
              {'w', data, end_of_data, current_element_position});
          data = ParseInt(data, end_of_data, 1, 0, 6, &tm.tm_wday);
          fmt += 1;
          continue;
        }
        if (fmt < end_of_fmt && *fmt == 'U') {
          int week_number;
          // Week number 00-53. '%OU' is treated like '%U' in en_US locale.
          date_parse_context.non_iso_week_present = true;
          date_parse_context.non_iso_date_part_present = true;
          date_parse_context.elements.push_back(
              {'U', data, end_of_data, current_element_position});
          data = ParseInt(data, end_of_data, 2, 0, 53, &week_number);
          fmt += 1;
          continue;
        }
        if (fmt < end_of_fmt && *fmt == 'V') {
          int week_number;
          // Week number 1-53. '%OV' is treated like '%V' in en_US locale.
          date_parse_context.iso_week_present = true;
          date_parse_context.elements.push_back(
              {'V', data, end_of_data, current_element_position});
          data = ParseInt(data, end_of_data, 2, 1, 53, &week_number);
          fmt += 1;
          continue;
        }
        if (fmt < end_of_fmt && *fmt == 'W') {
          int week_number;
          // Week number 0-53. '%OW' is treated like '%W' in en_US locale, so
          // it is recorded as a non-ISO week exactly as '%W' is.
          date_parse_context.non_iso_week_present = true;
          date_parse_context.non_iso_date_part_present = true;
          date_parse_context.elements.push_back(
              {'W', data, end_of_data, current_element_position});
          data = ParseInt(data, end_of_data, 2, 0, 53, &week_number);
          fmt += 1;
          continue;
        }
        // Include the modified character in the element given to ParseTM().
        if (fmt < end_of_fmt) ++fmt;
        break;
      case 'D':  // %m/%d/%y
      case 'F':  // %Y-%m-%d
      case 'x':  // locale-specific YMD format, %m/%d/%y in en_US locale
        date_parse_context.non_iso_date_part_present = true;
        date_parse_context.last_year_element_position =
            current_element_position;
        date_parse_context.last_month_element_position =
            current_element_position;
        date_parse_context.last_mday_element_position =
            current_element_position;
        break;
      case 'B':  // Full month name
      case 'b':  // Abbreviated month name
      case 'h':  // Abbreviated month name
        date_parse_context.non_iso_date_part_present = true;
        date_parse_context.last_month_element_position =
            current_element_position;
        break;
      case 'e':  // day of month (single digits preceded by a space)
        date_parse_context.non_iso_date_part_present = true;
        date_parse_context.last_mday_element_position =
            current_element_position;
        break;
      case 'U':  // week number of the year (starting Sunday) 00-53
      case 'W':  // week number of the year (starting Monday) 00-53
        date_parse_context.non_iso_week_present = true;
        date_parse_context.non_iso_date_part_present = true;
        date_parse_context.elements.push_back(
            {*(fmt - 1), data, end_of_data, current_element_position});
        break;
      case 'V': {  // ISO 8601 week number 01-53
        date_parse_context.iso_week_present = true;
        date_parse_context.elements.push_back(
            {'V', data, end_of_data, current_element_position});
        // ParseTM doesn't support this part, so parse the ISO week value
        // to advance 'data' and continue.
        int week_number;
        data = ParseInt(data, end_of_data, 2, 1, 53, &week_number);
        continue;
      }
      case 'A':  // Full weekday name
      case 'a':  // Abbreviated weekday name
      case 'u':  // weekday number 1-7, starting Monday
      case 'w':  // weekday number 0-6, starting Sunday
        date_parse_context.elements.push_back(
            {*(fmt - 1), data, end_of_data, current_element_position});
        break;
      case 'J': {  // ISO day of year
        date_parse_context.iso_dayofyear_present = true;
        date_parse_context.elements.push_back(
            {*(fmt - 1), data, end_of_data, current_element_position});
        // ParseTM doesn't support this part, so parse the ISO day value
        // to advance 'data' and continue.
        int iso_dayofyear;
        data = ParseInt(data, end_of_data, 3, 1, 371, &iso_dayofyear);
        continue;
      }
      case 'j':  // Day of year (non-ISO)
        date_parse_context.non_iso_date_part_present = true;
        date_parse_context.elements.push_back(
            {*(fmt - 1), data, end_of_data, current_element_position});
        break;
      case 't':
      case 'n': {
        data = ConsumeWhitespace(data, end_of_data);
        continue;
      }
      case 'g': {  // ISO 8601 year without century, e.g., 19
        date_parse_context.iso_year_present = true;
        date_parse_context.elements.push_back(
            {*(fmt - 1), data, end_of_data, current_element_position});
        // Move 'data' past this element's data, but don't update the output.
        int ignored;
        data = ParseInt(data, end_of_data, 2, 0, 99, &ignored);
        continue;
      }
      case 'G': {  // ISO 8601 year with century, e.g., 2019
        // To be (mostly) backwards compatible with the previous strptime
        // implementation, we consume and ignore a large number of digits
        // here.  Technically, strptime will consume an arbitrarily large
        // number of digits, but we will only consume enough to more than
        // cover an int64_t (even though we only support a range of 10k years).
        date_parse_context.iso_year_present = true;
        date_parse_context.elements.push_back(
            {*(fmt - 1), data, end_of_data, current_element_position});
        // Move 'data' past this element's data, but don't update the output.
        int ignored;
        data = ParseInt(data, end_of_data, 20, 0, 99999, &ignored);
        continue;
      }
      default:
        // No special handling for this format element, let ParseTM/strptime()
        // do it.
        break;
    }

    std::string format_element(percent, fmt - percent);

    // When no special handling for this format element in the switch statement
    // above, call ParseTM() that invokes strptime() to parse the current
    // format element and updates tm.
    //
    // strptime() requires that the input strings are null terminated. Thus, we
    // make a string copy of the 'timestamp_string' from the position that we
    // cannot handle in the switch statement above to the end of
    // 'timestamp_string', because 'timestamp_string' is a string_view and may
    // not be null-terminated. We only make the copy once and 'read_copy' is
    // changed to true if the copy is made. If another format element is without
    // special handling in the switch statement above, we won't make a copy
    // again. The copy we made for the previous no-special-handling format
    // element will be used. We just recompute the offset of the string copy and
    // pass it to strptime().
    if (!read_copy) {
      read_copy = true;
      data_copy_str = std::string(data, end_of_data - data);
      original_data_copy_position = data;
    }

    const char* data_copy_pointer =
        data_copy_str.c_str() + (data - original_data_copy_position);
    const char* next_position =
        ParseTM(data_copy_pointer, format_element.c_str(), &tm);
    if (next_position != nullptr) {
      // Translate the progress made in the copy back onto the original input.
      data += next_position - data_copy_pointer;
    } else {
      data = nullptr;
    }
  }

  // Adjust a 12-hour tm_hour value if it should be in the afternoon.
  if (twelve_hour && afternoon) {
    tm.tm_hour += 12;
  }

  // Skip any remaining whitespace.
  if (data != nullptr) {
    while (data < end_of_data && absl::ascii_isspace(*data)) ++data;
  }
  if (fmt != nullptr) {
    // Note that in addition to skipping trailing whitespace in the format
    // string, we must also handle a corner case where we have consumed the
    // entire input data string, but the format string still contains %n or %t
    // format elements (which consume 0 or more whitespaces).  So we must
    // also ignore any remaining %n or %t format elements.
    while (fmt < end_of_fmt && (absl::ascii_isspace(*fmt) || *fmt == '%')) {
      if (absl::ascii_isspace(*fmt)) {
        ++fmt;
        continue;
      }
      if (++fmt == end_of_fmt) {
        // The format string cannot end with a single '%'.
        return MakeEvalError() << "Format string cannot end with a single '%'";
      }
      if (*fmt == 'n' || *fmt == 't') {
        // We got '%n' or '%t', so increment and continue.
        ++fmt;
        continue;
      } else {
        // We got a different format element, so stop skipping white space.
        // This will cause us to return the 'Failed to parse input string'
        // error below.
        break;
      }
    }
  }

  // Covers a failed element parse (data == nullptr), unconsumed input, and
  // unconsumed format elements.
  if (data != end_of_data || fmt != end_of_fmt) {
    return MakeEvalError() << "Failed to parse input string "
                           << ToStringLiteral(timestamp_string);
  }

  // We must consume the entire input string and there must not be trailing
  // garbage or it is an error.
  // NOTE(review): the check above already returns whenever
  // data != end_of_data, so this branch is unreachable as written.  It is
  // kept unchanged so that the emitted error messages do not change.
  if (data != end_of_data) {
    return MakeEvalError() << "Illegal non-space trailing data '" << *data
                           << "' in string "
                           << ToStringLiteral(timestamp_string);
  }

  // If we saw %s then we ignore everything else and return the
  // corresponding timestamp.
  if (saw_percent_s) {
    *timestamp = absl::FromUnixSeconds(percent_s_time);
    if (!IsValidTime(*timestamp)) {
      return MakeEvalError() << "Invalid result from parsing function";
    }
    return absl::OkStatus();
  }

  // If we saw %z or %Ez then we want to interpret the parsed fields in
  // UTC and then shift by that offset.  Otherwise we want to interpret
  // the fields using the default or specified time zone name.
  if (saw_timezone_offset) {
    // We will apply the timezone_offset from UTC.
    timezone = absl::UTCTimeZone();
  } else {
    SQL_RET_CHECK_EQ(0, timezone_offset_minutes);
  }

  // Normalizes a leap second of 60 to the following ":00.000000".
  // The second is carried in <subseconds> so that ConvertDateTime() below
  // sees an in-range value and does not flag the result as normalized.
  if (tm.tm_sec == 60) {
    tm.tm_sec -= 1;
    subseconds = absl::Seconds(1);
  }

  // Overflow cannot occur since the only valid range is years 0-10000.
  int64_t year = tm.tm_year + 1900;
  if (use_century) {
    // tm_year holds only the two-digit year (or 0) in this case, so replace
    // the implied 1900 base with the tracked century.
    year += century * 100 - 1900;
  }

  int month = tm.tm_mon + 1;
  int mday = tm.tm_mday;
  if (parse_version2) {
    SQL_RETURN_IF_ERROR(
        UpdateYearMonthDayIfNeeded(&year, &month, &mday, &date_parse_context));
  }

  const absl::TimeConversion tc = absl::ConvertDateTime(
      year, month, mday, tm.tm_hour, tm.tm_min, tm.tm_sec, timezone);

  // ParseTime() fails if any normalization was done.  That is,
  // parsing "Sep 31" will not produce the equivalent of "Oct 1".
  if (tc.normalized) {
    return MakeEvalError() << "Out-of-range datetime field in parsing function";
  }

  *timestamp = tc.pre - absl::Minutes(timezone_offset_minutes) + subseconds;
  if (!IsValidTime(*timestamp)) {
    return MakeEvalError() << "Invalid result from parsing function";
  }
  return absl::OkStatus();
}  // NOLINT(readability/fn_size)