in src/ColumnEncodingUtility/analyze-schema-compression.py [0:0]
def reduce_column_length(col_type, column_name, table_name):
set_col_type = col_type
# analyze the current size length for varchar columns and return early if they are below the threshold
if "varchar" in col_type:
curr_col_length = int(re.search(r'\d+', col_type).group())
if curr_col_length < STRING_REDUCTION_MAX_LENGTH_THRESHOLD:
return col_type
else:
col_len_statement = 'select /* computing max column length */ max(octet_length("%s")) from %s."%s"' % (
column_name, schema_name, table_name)
else:
col_len_statement = 'select /* computing max column length */ max(abs("%s")) from %s."%s"' % (
column_name, schema_name, table_name)
if debug:
comment(col_len_statement)
comment("Analyzing max length of column '%s' for table '%s.%s' " % (
column_name, schema_name, table_name))
# run the analyze in a loop, because it could be locked by another process modifying rows
# and get a timeout
col_len_result = None
col_len_retry = 10
col_len_attempt_count = 0
col_len_last_exception = None
while col_len_attempt_count < col_len_retry and col_len_result is None:
try:
col_len_result = execute_query(col_len_statement)
col_max_len = col_len_result[0][0]
if col_max_len is None:
col_max_len = 0
except KeyboardInterrupt:
# To handle Ctrl-C from user
cleanup(get_pg_conn())
return TERMINATED_BY_USER
except Exception as e:
print(e)
col_len_attempt_count += 1
col_len_last_exception = e
# Exponential Backoff
time.sleep(2 ** col_len_attempt_count * RETRY_TIMEOUT)
if col_len_result is None:
if col_len_last_exception is not None:
print("Unable to determine length of %s for table %s due to Exception %s" % (
column_name, table_name, col_len_last_exception.message))
raise col_len_last_exception
else:
print(
"Unable to determine length of %s for table %s due to Null response to query. No changes will be made" % (
column_name, table_name))
if "varchar" in col_type:
new_column_len = int(col_max_len * (1 + COL_LENGTH_EXPANSION_BUFFER))
# if the new length would be greater than varchar(max) then return the current value - no changes
if new_column_len > 65535:
return col_type
# if the new length would be smaller than the specified new varchar minimum then set to varchar minimum
if new_column_len < new_varchar_min:
new_column_len = new_varchar_min
# if the new length would be 0 then return the current value - no changes
if new_column_len == 0:
return col_type
if debug:
comment(
"Max width of character column '%s' for table '%s.%s' is %d. Current width is %d. Setting new size to %s" % (
column_name, schema_name, table_name, col_max_len,
curr_col_length, new_column_len))
if new_column_len < curr_col_length:
set_col_type = re.sub(str(curr_col_length), str(new_column_len), col_type)
else:
# Test to see if largest value is smaller than largest value of smallint (2 bytes)
if col_max_len * (1 + COL_LENGTH_EXPANSION_BUFFER) <= int(math.pow(2, 15) - 1) and col_type != "smallint":
set_col_type = re.sub(col_type, "smallint", col_type)
# Test to see if largest value is smaller than largest value of smallint (4 bytes)
elif col_max_len * (1 + COL_LENGTH_EXPANSION_BUFFER) <= int(math.pow(2, 31) - 1) and col_type != "integer":
set_col_type = re.sub(col_type, "integer", col_type)
return set_col_type