def reduce_column_length()

in src/ColumnEncodingUtility/analyze-schema-compression.py [0:0]


def reduce_column_length(col_type, column_name, table_name):
    set_col_type = col_type

    # analyze the current size length for varchar columns and return early if they are below the threshold
    if "varchar" in col_type:
        curr_col_length = int(re.search(r'\d+', col_type).group())
        if curr_col_length < STRING_REDUCTION_MAX_LENGTH_THRESHOLD:
            return col_type
        else:
            col_len_statement = 'select /* computing max column length */ max(octet_length("%s")) from %s."%s"' % (
                column_name, schema_name, table_name)
    else:
        col_len_statement = 'select /* computing max column length */ max(abs("%s")) from %s."%s"' % (
            column_name, schema_name, table_name)

    if debug:
        comment(col_len_statement)

    comment("Analyzing max length of column '%s' for table '%s.%s' " % (
        column_name, schema_name, table_name))

    # run the analyze in a loop, because it could be locked by another process modifying rows
    # and get a timeout
    col_len_result = None
    col_len_retry = 10
    col_len_attempt_count = 0
    col_len_last_exception = None

    while col_len_attempt_count < col_len_retry and col_len_result is None:
        try:
            col_len_result = execute_query(col_len_statement)
            col_max_len = col_len_result[0][0]
            if col_max_len is None:
                col_max_len = 0
        except KeyboardInterrupt:
            # To handle Ctrl-C from user
            cleanup(get_pg_conn())
            return TERMINATED_BY_USER
        except Exception as e:
            print(e)
            col_len_attempt_count += 1
            col_len_last_exception = e

            # Exponential Backoff
            time.sleep(2 ** col_len_attempt_count * RETRY_TIMEOUT)

    if col_len_result is None:
        if col_len_last_exception is not None:
            print("Unable to determine length of %s for table %s due to Exception %s" % (
                column_name, table_name, col_len_last_exception.message))
            raise col_len_last_exception
        else:
            print(
                "Unable to determine length of %s for table %s due to Null response to query. No changes will be made" % (
                    column_name, table_name))

    if "varchar" in col_type:
        new_column_len = int(col_max_len * (1 + COL_LENGTH_EXPANSION_BUFFER))

        # if the new length would be greater than varchar(max) then return the current value - no changes
        if new_column_len > 65535:
            return col_type

        # if the new length would be smaller than the specified new varchar minimum then set to varchar minimum
        if new_column_len < new_varchar_min:
            new_column_len = new_varchar_min

        # if the new length would be 0 then return the current value - no changes
        if new_column_len == 0:
            return col_type

        if debug:
            comment(
                "Max width of character column '%s' for table '%s.%s' is %d. Current width is %d. Setting new size to %s" % (
                    column_name, schema_name, table_name, col_max_len,
                    curr_col_length, new_column_len))

        if new_column_len < curr_col_length:
            set_col_type = re.sub(str(curr_col_length), str(new_column_len), col_type)
    else:
        # Test to see if largest value is smaller than largest value of smallint (2 bytes)
        if col_max_len * (1 + COL_LENGTH_EXPANSION_BUFFER) <= int(math.pow(2, 15) - 1) and col_type != "smallint":

            set_col_type = re.sub(col_type, "smallint", col_type)
        # Test to see if largest value is smaller than largest value of smallint (4 bytes)
        elif col_max_len * (1 + COL_LENGTH_EXPANSION_BUFFER) <= int(math.pow(2, 31) - 1) and col_type != "integer":
            set_col_type = re.sub(col_type, "integer", col_type)

    return set_col_type