bool DateV2Value::from_date_str_base()

in be/src/vec/runtime/vdatetime_value.cpp [1983:2222]


// Parses a textual date / datetime into this value.
//
// Two layouts are recognised:
//   * delimited, e.g. "YYYY-MM-DD HH:MM:SS.ffffff": every field runs until the next
//     non-digit character;
//   * "interval" (undelimited), e.g. "YYYYMMDD", "YYMMDDHHMMSS", "YYYYMMDDTHHMMSS": the
//     leading digit run ends at the end of input, at a '.', or at a time-zone suffix, and
//     fields are cut at fixed widths (2 or 4 digits for the year, 2 for the others).
// A trailing time-zone designator is accepted after the day or a later field.
//
// @param date_str        start of the text to parse (need not be NUL terminated)
// @param len             number of bytes available at date_str
// @param scale           number of fractional-second digits to keep; when this is a
//                        datetime type and scale >= 0 the fraction is rounded to it
// @param local_time_zone zone the result is expressed in when the text carries a
//                        time-zone suffix; if nullptr such text is rejected
// @param convert_zero    when true, the all-zero date 0000-00-00 is accepted and stored
//                        with month = 1 and day = 1
// @return true if the text was parsed and stored, false otherwise.
bool DateV2Value<T>::from_date_str_base(const char* date_str, int len, int scale,
                                        const cctz::time_zone* local_time_zone, bool convert_zero) {
    // isdigit() has undefined behaviour for negative arguments, which is what a plain
    // (signed) char holding a byte >= 0x80 converts to. Always go through unsigned char.
    const auto is_digit_char = [](char c) { return isdigit(static_cast<unsigned char>(c)) != 0; };
    // No field may legitimately exceed this while being accumulated (the widest one is the
    // 7-digit fraction kept for rounding). Bailing out above it also guarantees that
    // `value * 10 + digit` below can never overflow an int.
    constexpr int max_partial_value = 9999999;

    const char* ptr = date_str;
    const char* end = date_str + len;
    // ONLY 2, 6 can follow by a space
    // (bit i set <=> field index i may be followed by a space)
    const static int allow_space_mask = 4 | 64;
    uint32_t date_val[MAX_DATE_PARTS] = {0};
    int32_t date_len[MAX_DATE_PARTS] = {0};

    // Skip space character
    while (ptr < end && check_space(*ptr)) {
        ptr++;
    }
    if (ptr == end || !is_digit_char(*ptr)) {
        return false;
    }
    // Fix year length: measure the leading run of digits (an embedded 'T' is counted too)
    const char* pos = ptr;
    while (pos < end && (is_digit_char(*pos) || *pos == 'T')) {
        pos++;
    }
    int year_len = 4;
    int digits = pos - ptr;
    bool is_interval_format = false;
    bool has_bar = false;

    // Compatible with MySQL.
    // For YYYYMMDD/YYYYMMDDHHMMSS is 4 digits years
    if (pos == end || *pos == '.' ||
        time_zone_begins(pos, end)) { // no delimiter until ./Asia/Z/GMT...
        if (digits == 4 || digits == 8 || digits >= 14) {
            year_len = 4;
        } else {
            year_len = 2;
        }
        is_interval_format = true;
    }

    int field_idx = 0;
    int field_len = year_len;
    // Seconds to add to the parsed value at the end: carry produced by fractional rounding
    // plus the shift from the given time zone to the local one.
    long sec_offset = 0;
    bool need_use_timezone = false;

    // One iteration per field: 0 year, 1 month, 2 day, 3 hour, 4 minute, 5 second,
    // 6 fractional second.
    while (ptr < end && is_digit_char(*ptr) && field_idx < MAX_DATE_PARTS) {
        const char* start = ptr;
        int temp_val = 0;
        // Delimited fields run to the next non-digit; interval fields and the fraction
        // consume at most field_len digits.
        bool scan_to_delim = (!is_interval_format) && (field_idx != 6);
        while (ptr < end && is_digit_char(*ptr) &&
               (scan_to_delim || field_len--)) { // field_len <= 7
            if (temp_val > max_partial_value) {
                // Already larger than anything accepted below; stop before the
                // multiplication can overflow.
                return false;
            }
            temp_val = temp_val * 10 + (*ptr - '0');
            ptr++;
        }

        if (ptr == start) {
            return false;
        }

        if (field_idx == 6) {
            if constexpr (is_datetime) {
                // round of microseconds
                // 1. normalize to 7 digits for rounding
                // 2. rounding
                // 3. normalize to 6 digits for storage
                if (scale >= 0) {
                    // do normalization
                    const auto ms_digit_count = ptr - start;
                    const auto normalizer = int_exp10(std::abs(7 - ms_digit_count));
                    temp_val *= normalizer;

                    // check round
                    const auto rounder = int_exp10(std::abs(7 - scale));
                    const auto reminder = temp_val % rounder;
                    temp_val -= reminder;

                    if (reminder >= 5 * normalizer) {
                        temp_val += rounder;
                    }

                    // truncate to 6 digits
                    if (temp_val == int_exp10(7)) {
                        // the fraction rounded up to a whole second: carry it
                        temp_val = 0;
                        sec_offset += 1;
                    } else {
                        temp_val /= 10;
                    }
                }

                // move ptr to start of timezone or end
                while (ptr < end && is_digit_char(*ptr)) {
                    ptr++;
                }
            } else {
                // Microsecond
                const auto ms_part = ptr - start;
                temp_val *= int_exp10(std::max(0L, 6 - ms_part));
            }
        }

        // Impossible
        if (temp_val > 999999L) {
            return false;
        }

        date_val[field_idx] = temp_val;

        if (field_idx == 6) {
            // select cast("2020-01-01 12:00:00.12345" as Datetime(4))
            // ptr - start will be 5, but scale is 4
            date_len[field_idx] = std::min(static_cast<int>(ptr - start), scale);
        } else {
            date_len[field_idx] = ptr - start;
        }

        // every field after the year is two digits wide unless overridden below
        field_len = 2;

        if (ptr == end) {
            field_idx++;
            break;
        }

        // timezone
        if (UNLIKELY((field_idx > 2 ||
                      !has_bar) /*dont treat xxxx-xx-xx:xx:xx as xxxx-xx(-xx:xx:xx)*/
                     && time_zone_begins(ptr, end))) {
            if (local_time_zone == nullptr) {
                return false;
            }
            need_use_timezone = true;
            field_idx++;
            break;
        }

        if (field_idx == 2 && *ptr == 'T') {
            // YYYYMMDDTHHMMDD, skip 'T' and continue
            ptr++;
            field_idx++;
            continue;
        }

        // Second part
        if (field_idx == 5) {
            if (*ptr == '.') {
                ptr++;
                // for datetime, we need to discard the fraction part
                // that beyond the scale + 1, and scale + 1 digit will
                // be used to round the fraction part
                if constexpr (is_datetime) {
                    field_len = std::min(7, scale + 1);
                } else {
                    field_len = 6;
                }
            } else if (is_digit_char(*ptr)) {
                // more digits directly after the seconds: stop parsing fields here
                field_idx++;
                break;
            }
            field_idx++;
            continue;
        }
        // escape separator
        while (ptr < end && (check_date_punct(*ptr) || check_space(*ptr))) {
            if (check_space(*ptr)) {
                if (((1 << field_idx) & allow_space_mask) == 0) {
                    return false;
                }
            }
            if (*ptr == '-') {
                has_bar = true;
            }
            ptr++;
        }
        field_idx++;
    }

    int num_field = field_idx;
    if (!is_interval_format) {
        // delimited input: the year width is whatever was actually written
        year_len = date_len[0];
    }
    for (; field_idx < MAX_DATE_PARTS; ++field_idx) {
        date_val[field_idx] = 0;
    }

    // Expand a two-digit year: values below YY_PART_YEAR map to 20xx, the rest to 19xx.
    if (year_len == 2) {
        if (date_val[0] < YY_PART_YEAR) {
            date_val[0] += 2000;
        } else {
            date_val[0] += 1900;
        }
    }

    // year, month and day are mandatory
    if (num_field < 3) {
        return false;
    }
    if (is_invalid(date_val[0], date_val[1], date_val[2], 0, 0, 0, 0)) {
        if (date_val[0] == 0 && date_val[1] == 0 && date_val[2] == 0 && convert_zero) {
            date_val[1] = 1;
            date_val[2] = 1;
        } else {
            return false;
        }
    }

    if (need_use_timezone) {
        // ptr was left at the first character of the time-zone designator
        cctz::time_zone given_tz {};
        if (!TimezoneUtils::find_cctz_time_zone(std::string {ptr, end}, given_tz)) {
            return false; // invalid format
        }
        // NOTE(review): both zones are evaluated at the default-constructed civil_second,
        // not at the parsed date, so date-dependent offsets are not considered here.
        auto given = cctz::convert(cctz::civil_second {}, given_tz);
        auto local = cctz::convert(cctz::civil_second {}, *local_time_zone);
        // these two values is absolute time. so they are negative. need to use (-local) - (-given)
        // Accumulate instead of assigning: sec_offset may already hold the one-second
        // carry produced by rounding the fraction, which must not be dropped.
        sec_offset += std::chrono::duration_cast<std::chrono::seconds>(given - local).count();
    }

    // In check_range_and_set_time, for Date type the time part will be truncated. So if the timezone offset should make
    // rounding to date part, it would be lost. To avoid this, we use a Datetime type to do these calc. It will save the
    // time part and apply the offset. Then convert to Date type back.
    // see https://github.com/apache/doris/pull/33553 for more details.
    if constexpr (!is_datetime) {
        if (sec_offset) {
            DateV2Value<DateTimeV2ValueType> tmp;
            if (!tmp.check_range_and_set_time(date_val[0], date_val[1], date_val[2], date_val[3],
                                              date_val[4], date_val[5], date_val[6])) {
                return false;
            }
            if (!tmp.date_add_interval<TimeUnit::SECOND>(
                        TimeInterval {TimeUnit::SECOND, sec_offset, false})) {
                return false;
            }
            this->assign_from(tmp);
            return true;
        }
    }

    if (!check_range_and_set_time(date_val[0], date_val[1], date_val[2], date_val[3], date_val[4],
                                  date_val[5], date_val[6])) {
        return false;
    }

    return sec_offset ? date_add_interval<TimeUnit::SECOND>(
                                TimeInterval {TimeUnit::SECOND, sec_offset, false})
                      : true;
}