bool DateV2Value::from_date_format_str()

in be/src/vec/runtime/vdatetime_value.cpp [2232:2694]


// Parses `value` according to a MySQL-style `format` string (e.g. "%Y-%m-%d %H:%i:%s") and
// stores the resulting date / datetime in this object.
//
// @param format       format string; it is addressed by pointer + length.
// @param format_len   number of bytes in `format`.
// @param value        text to parse; it is addressed by pointer + length.
// @param value_len    number of bytes in `value`; a non-positive length fails immediately.
// @param sub_val_end  optional out-parameter. When non-null it receives the position inside
//                     `value` at which parsing stopped (used by the nested %r / %T parsing).
// @return true when the value matched the format and the parsed fields were accepted by
//         check_range_and_set_time / get_date_from_daynr; false otherwise.
//
// Format specifiers that remain after the value has been fully consumed are tolerated; they
// only contribute to the "time part was requested" bookkeeping.
bool DateV2Value<T>::from_date_format_str(const char* format, int format_len, const char* value,
                                          int64_t value_len, const char** sub_val_end) {
    if (value_len <= 0) [[unlikely]] {
        return false;
    }
    const char* ptr = format;
    const char* end = format + format_len;
    const char* val = value;
    const char* val_end = value + value_len;

    bool already_set_time_part = false; // skip time part in the end's setting.

    // Bitmask recording which kinds of fields the format has supplied so far.
    uint32_t part_used = 0;
    constexpr int YEAR_PART = 1U << 0;
    constexpr int MONTH_PART = 1U << 1;
    constexpr int DAY_PART = 1U << 2;
    constexpr int NORMAL_DATE_PART = YEAR_PART | MONTH_PART | DAY_PART;
    constexpr int WEEKDAY_PART = 1U << 3;
    constexpr int YEARDAY_PART = 1U << 4;
    constexpr int WEEK_NUM_PART = 1U << 5;
    constexpr int SPECIAL_DATE_PART = WEEKDAY_PART | YEARDAY_PART | WEEK_NUM_PART;
    [[maybe_unused]] constexpr int DATE_PART = NORMAL_DATE_PART | SPECIAL_DATE_PART;
    constexpr int HOUR_PART = 1U << 6;
    constexpr int MINUTE_PART = 1U << 7;
    constexpr int SECOND_PART = 1U << 8;
    constexpr int FRAC_PART = 1U << 9;
    constexpr int TIME_PART = HOUR_PART | MINUTE_PART | SECOND_PART | FRAC_PART;

    int half_day = 0; // 0 for am/none, 12 for pm.
    int weekday = -1;
    int yearday = -1;
    int week_num = -1;

    bool strict_week_number = false;
    bool sunday_first = false;
    bool strict_week_number_year_type = false;
    int strict_week_number_year = -1;
    bool hour_system_12 = false;

    auto [year, month, day, hour, minute, second, microsecond] = std::tuple {0, 0, 0, 0, 0, 0, 0};
    while (ptr < end && val < val_end) {
        // Skip space character
        while (val < val_end && check_space(*val)) {
            val++;
        }
        // The rest of the value may have been whitespace only. Stop here so that no branch
        // below dereferences `val` at `val_end`; the remaining format characters are handled
        // by the trailing-format loop after this one, exactly as if the value had ended
        // without the trailing whitespace.
        if (val >= val_end) {
            break;
        }
        // Check switch
        if (*ptr == '%' && ptr + 1 < end) {
            const char* tmp = nullptr;
            int64_t int_value = 0;
            ptr++;
            switch (*ptr++) {
                // Year
            case 'y':
                // Year, numeric (two digits); >= 70 maps to 19xx, otherwise 20xx.
                tmp = val + min(2, val_end - val);
                if (!str_to_int64(val, &tmp, &int_value)) {
                    return false;
                }
                int_value += int_value >= 70 ? 1900 : 2000;
                year = int_value;
                val = tmp;
                part_used |= YEAR_PART;
                break;
            case 'Y':
                // Year, numeric, four digits
                tmp = val + min(4, val_end - val);
                if (!str_to_int64(val, &tmp, &int_value)) {
                    return false;
                }
                // At most two characters consumed: treat it like a two-digit year.
                if (tmp - val <= 2) {
                    int_value += int_value >= 70 ? 1900 : 2000;
                }
                year = int_value;
                val = tmp;
                part_used |= YEAR_PART;
                break;
                // Month
            case 'm':
            case 'c':
                // Month, numeric (up to two digits)
                tmp = val + min(2, val_end - val);
                if (!str_to_int64(val, &tmp, &int_value)) {
                    return false;
                }
                month = int_value;
                val = tmp;
                part_used |= MONTH_PART;
                break;
            case 'M':
                // Month name, looked up in s_month_name
                int_value = check_word(const_cast<const char**>(s_month_name), val, val_end, &val);
                if (int_value < 0) {
                    return false;
                }
                month = int_value;
                part_used |= MONTH_PART;
                break;
            case 'b':
                // Abbreviated month name, looked up in s_ab_month_name
                int_value = check_word(s_ab_month_name, val, val_end, &val);
                if (int_value < 0) {
                    return false;
                }
                month = int_value;
                part_used |= MONTH_PART;
                break;
                // Day
            case 'd':
            case 'e':
                // Day of month, numeric (up to two digits)
                tmp = val + min(2, val_end - val);
                if (!str_to_int64(val, &tmp, &int_value)) {
                    return false;
                }
                day = int_value;
                val = tmp;
                part_used |= DAY_PART;
                break;
            case 'D':
                // Day of month followed by a suffix: up to two characters after the number
                // are skipped without being validated.
                tmp = val + min(2, val_end - val);
                if (!str_to_int64(val, &tmp, &int_value)) {
                    return false;
                }
                day = int_value;
                val = tmp + min(2, val_end - tmp);
                part_used |= DAY_PART;
                break;
                // Hour
            case 'h':
            case 'I':
            case 'l':
                // 12-hour clock; the value is range-checked and combined with AM/PM after
                // the main loop.
                hour_system_12 = true;
                part_used |= HOUR_PART;
                [[fallthrough]];
            case 'k':
            case 'H':
                tmp = val + min(2, val_end - val);
                if (!str_to_int64(val, &tmp, &int_value)) {
                    return false;
                }
                hour = int_value;
                val = tmp;
                part_used |= HOUR_PART;
                break;
                // Minute
            case 'i':
                tmp = val + min(2, val_end - val);
                if (!str_to_int64(val, &tmp, &int_value)) {
                    return false;
                }
                minute = int_value;
                val = tmp;
                part_used |= MINUTE_PART;
                break;
                // Second
            case 's':
            case 'S':
                tmp = val + min(2, val_end - val);
                if (!str_to_int64(val, &tmp, &int_value)) {
                    return false;
                }
                second = int_value;
                val = tmp;
                part_used |= SECOND_PART;
                break;
            // Micro second
            case 'f':
                tmp = val;
                // when there's still something to the end, fix the scale of ms.
                while (tmp < val_end && isdigit(*tmp)) {
                    tmp++;
                }

                // Only the first six digits are converted; any further digits are consumed
                // (val is advanced to tmp below) but otherwise ignored.
                if (tmp - val > 6) {
                    const char* tmp2 = val + 6;
                    if (!str_to_int64(val, &tmp2, &int_value)) {
                        return false;
                    }
                } else {
                    if (!str_to_int64(val, &tmp, &int_value)) {
                        return false;
                    }
                }
                if constexpr (is_datetime) {
                    // Scale a shorter fraction up to microseconds, e.g. "12" -> 120000.
                    microsecond = int_value * int_exp10(6 - min(6, tmp - val));
                    part_used |= FRAC_PART;
                }
                val = tmp;
                break;
                // AM/PM
            case 'p':
                // Requires two characters ending in 'M'/'m' and a 12-hour specifier seen
                // earlier in the format.
                if ((val_end - val) < 2 || toupper(*(val + 1)) != 'M' || !hour_system_12) {
                    return false;
                }
                if (toupper(*val) == 'P') {
                    // PM
                    half_day = 12;
                }
                val += 2;
                break;
                // Weekday
            case 'W':
                int_value = check_word(const_cast<const char**>(s_day_name), val, val_end, &val);
                if (int_value < 0) {
                    return false;
                }
                // Stored one-based so that "weekday > 0" means "a weekday was supplied".
                int_value++;
                weekday = int_value;
                part_used |= WEEKDAY_PART;
                break;
            case 'a':
                int_value = check_word(s_ab_day_name, val, val_end, &val);
                if (int_value < 0) {
                    return false;
                }
                int_value++;
                weekday = int_value;
                part_used |= WEEKDAY_PART;
                break;
            case 'w':
                // Single digit 0..6; 0 is stored as 7.
                tmp = val + min(1, val_end - val);
                if (!str_to_int64(val, &tmp, &int_value)) {
                    return false;
                }
                if (int_value >= 7) {
                    return false;
                }
                if (int_value == 0) {
                    int_value = 7;
                }
                weekday = int_value;
                val = tmp;
                part_used |= WEEKDAY_PART;
                break;
            case 'j':
                // Day of year (up to three digits)
                tmp = val + min(3, val_end - val);
                if (!str_to_int64(val, &tmp, &int_value)) {
                    return false;
                }
                yearday = int_value;
                val = tmp;
                part_used |= YEARDAY_PART;
                break;
            case 'u':
            case 'v':
            case 'U':
            case 'V':
                // ptr was already advanced past the specifier, hence *(ptr - 1).
                sunday_first = (*(ptr - 1) == 'U' || *(ptr - 1) == 'V');
                // Used to check if there is %x or %X
                strict_week_number = (*(ptr - 1) == 'V' || *(ptr - 1) == 'v');
                tmp = val + min(2, val_end - val);
                if (!str_to_int64(val, &tmp, &int_value)) {
                    return false;
                }
                week_num = int_value;
                if (week_num > 53 || (strict_week_number && week_num == 0)) {
                    return false;
                }
                val = tmp;
                part_used |= WEEK_NUM_PART;
                break;
                // strict week number, must be used with %V or %v
            case 'x':
            case 'X':
                strict_week_number_year_type = (*(ptr - 1) == 'X');
                tmp = val + min(4, val_end - val);
                if (!str_to_int64(val, &tmp, &int_value)) {
                    return false;
                }
                strict_week_number_year = int_value;
                val = tmp;
                part_used |= WEEK_NUM_PART;
                break;
            case 'r': {
                // Composite 12-hour time, parsed by a nested call with "%I:%i:%S %p".
                if constexpr (is_datetime) {
                    DateV2Value<DateTimeV2ValueType> tmp_val;
                    if (!tmp_val.from_date_format_str("%I:%i:%S %p", 11, val, val_end - val,
                                                      &tmp)) {
                        return false;
                    }
                    this->date_v2_value_.hour_ = tmp_val.hour();
                    this->date_v2_value_.minute_ = tmp_val.minute();
                    this->date_v2_value_.second_ = tmp_val.second();
                    val = tmp;
                    part_used |= TIME_PART;
                    already_set_time_part = true;
                    break;
                } else {
                    return false;
                }
            }
            case 'T': {
                // Composite 24-hour time, parsed by a nested call with "%H:%i:%S".
                if constexpr (is_datetime) {
                    DateV2Value<DateTimeV2ValueType> tmp_val;
                    if (!tmp_val.from_date_format_str("%H:%i:%S", 8, val, val_end - val, &tmp)) {
                        return false;
                    }
                    this->date_v2_value_.hour_ = tmp_val.hour();
                    this->date_v2_value_.minute_ = tmp_val.minute();
                    this->date_v2_value_.second_ = tmp_val.second();
                    part_used |= TIME_PART;
                    already_set_time_part = true;
                    val = tmp;
                    break;
                } else {
                    return false;
                }
            }
            case '.':
                // Skip any run of punctuation characters.
                while (val < val_end && ispunct(*val)) {
                    val++;
                }
                break;
            case '@':
                // Skip any run of alphabetic characters.
                while (val < val_end && isalpha(*val)) {
                    val++;
                }
                break;
            case '#':
                // Skip any run of digits.
                while (val < val_end && isdigit(*val)) {
                    val++;
                }
                break;
            case '%': // %%, escape the %
                if ('%' != *val) {
                    return false;
                }
                val++;
                break;
            default:
                return false;
            }
        } else if (!isspace(*ptr)) {
            // Literal format character: it must match the value exactly.
            if (*ptr != *val) {
                return false;
            }
            ptr++;
            val++;
        } else {
            // Whitespace in the format matches any amount of whitespace in the value, which
            // was already skipped at the top of the loop.
            ptr++;
        }
    }

    // for compatible with mysql, like something have %H:%i:%s format but no relative content...
    while (ptr < end) {
        if (*ptr == '%' && ptr + 1 < end) {
            ptr++;
            switch (*ptr++) {
            case 'H':
            case 'h':
            case 'I':
            case 'i':
            case 'k':
            case 'l':
            case 'r':
            case 's':
            case 'f':
            case 'S':
            case 'p':
            case 'T':
                part_used |= TIME_PART;
                break;
            default:
                break;
            }
        } else {
            ptr++;
        }
    }

    // Nothing recognisable was parsed or requested.
    if (!part_used) {
        return false;
    }

    if (hour_system_12) {
        if (hour > 12 || hour < 1) {
            return false;
        }
        // 12 AM -> 0, 12 PM -> 12, otherwise hour (+12 for PM).
        hour = (hour % 12) + half_day;
    }
    if (sub_val_end) {
        *sub_val_end = val;
    }

    // Compute timestamp type: a pure date type cannot accept any time component.
    if (part_used & FRAC_PART) {
        if constexpr (!is_datetime) {
            return false;
        }
    } else if (part_used & TIME_PART) {
        if constexpr (!is_datetime) {
            return false;
        }
    }

    // Year day
    if (yearday > 0) {
        uint64_t days = doris::calc_daynr(year, 1, 1) + yearday - 1;
        if (!get_date_from_daynr(days)) {
            return false;
        }
    }
    // weekday
    if (week_num >= 0 && weekday > 0) {
        // Check: %V/%v need a matching %X/%x year, and %X/%x may not appear with %U/%u.
        if ((strict_week_number &&
             (strict_week_number_year < 0 || strict_week_number_year_type != sunday_first)) ||
            (!strict_week_number && strict_week_number_year >= 0)) {
            return false;
        }
        uint64_t days =
                doris::calc_daynr(strict_week_number ? strict_week_number_year : year, 1, 1);

        uint8_t weekday_b = doris::calc_weekday(days, sunday_first);

        if (sunday_first) {
            days += ((weekday_b == 0) ? 0 : 7) - weekday_b + (week_num - 1) * 7 + weekday % 7;
        } else {
            days += ((weekday_b <= 3) ? 0 : 7) - weekday_b + (week_num - 1) * 7 + weekday - 1;
        }
        if (!get_date_from_daynr(days)) {
            return false;
        }
    }
    // 1. already_set_date_part means year, month, day were set (via get_date_from_daynr),
    //    so we only set the time part
    // 2. already_set_time_part means hour, minute, second were set (via %r / %T),
    //    so we only need to set the date part
    // 3. if both are true, all parts of the date_time are set, no need for
    //    check_range_and_set_time
    bool already_set_date_part = yearday > 0 || (week_num >= 0 && weekday > 0);
    if (already_set_date_part && already_set_time_part) {
        return true;
    }
    if (already_set_date_part) {
        if constexpr (is_datetime) {
            return check_range_and_set_time(date_v2_value_.year_, date_v2_value_.month_,
                                            date_v2_value_.day_, hour, minute, second, microsecond);
        } else {
            return check_range_and_set_time(date_v2_value_.year_, date_v2_value_.month_,
                                            date_v2_value_.day_, 0, 0, 0, 0);
        }
    }
    // complete default month/day
    if (!(part_used & ~NORMAL_DATE_PART)) { // Ymd part only
        if (!(part_used & DAY_PART)) {
            day = 1;
            if (!(part_used & MONTH_PART)) {
                month = 1;
            }
        }
    }

    if (already_set_time_part) {
        if constexpr (is_datetime) {
            return check_range_and_set_time(year, month, day, date_v2_value_.hour_,
                                            date_v2_value_.minute_, date_v2_value_.second_,
                                            microsecond);
        } else {
            return check_range_and_set_time(year, month, day, 0, 0, 0, 0);
        }
    }
    if constexpr (is_datetime) {
        // The last argument is true when only time fields were supplied.
        return check_range_and_set_time(year, month, day, hour, minute, second, microsecond,
                                        !(part_used & ~TIME_PART));
    } else {
        return check_range_and_set_time(year, month, day, 0, 0, 0, 0);
    }
}