in utilities/Hive_metastore_migration/src/hive_metastore_migration.py [0:0]
def extract_from_sds_skewed_info(self, hms, ms_sds):
    """
    Build the skewed-info DataFrames from ms_sds and attach them to hms.

    On success the following attributes are set on hms:
    ms_skewed_col_names, ms_skewed_col_value_loc_map,
    ms_skewed_string_list_values and ms_skewed_string_list.

    :param hms: object that receives the resulting DataFrames as attributes
    :param ms_sds: DataFrame exposing 'SD_ID' and a nested 'skewedInfo' column
    :return: None
    """
    # Flatten the nested skewedInfo fields next to their SD_ID.
    sd_skew = ms_sds.select('SD_ID', 'skewedInfo.*')

    # One output row per entry of skewedColumnNames.
    col_names_df = sd_skew.select(
        'SD_ID',
        explode('skewedColumnNames').alias('SKEWED_COL_NAME'),
    )

    # Exploding skewedColumnValueLocationMaps yields two columns per entry;
    # 'STRING_LIST_STR' is an extra field that only serves to derive the
    # string-list values below and is dropped from the final loc-map output.
    loc_entries = explode('skewedColumnValueLocationMaps').alias('STRING_LIST_STR', 'LOCATION')
    # NOTE(review): generate_id_df is defined elsewhere; presumably it adds
    # the 'STRING_LIST_ID_KID' id column selected below - confirm.
    loc_map_with_str = self.generate_id_df(
        sd_skew.select('SD_ID', loc_entries),
        'STRING_LIST_ID_KID',
    )

    # UDF converting the STRING_LIST_STR value into an array of strings.
    to_string_list = UserDefinedFunction(
        DataCatalogTransformer.udf_string_list_str_to_list,
        ArrayType(StringType(), True),
    )
    id_and_list = loc_map_with_str.select(
        col('STRING_LIST_ID_KID').alias('STRING_LIST_ID'),
        to_string_list('STRING_LIST_STR').alias('STRING_LIST_LIST'),
    )

    # generate_idx_for_df returns its value column under the name 'col';
    # rename it to the target column name.
    indexed_values = DataCatalogTransformer.generate_idx_for_df(
        id_and_list, 'STRING_LIST_ID', 'STRING_LIST_LIST', StringType())
    string_list_values_df = indexed_values.withColumnRenamed('col', 'STRING_LIST_VALUE')

    # drop_columns is not a stock pyspark DataFrame method; presumably it is
    # attached to DataFrame elsewhere in this project.
    loc_map_df = loc_map_with_str.drop_columns(['STRING_LIST_STR'])

    # NOTE(review): if generate_idx_for_df emits one row per list element,
    # STRING_LIST_ID can repeat in this projection - confirm whether the
    # consumer expects distinct ids.
    string_list_df = string_list_values_df.select('STRING_LIST_ID')

    # Publish everything only after all DataFrames were built successfully.
    hms.ms_skewed_col_names = col_names_df
    hms.ms_skewed_col_value_loc_map = loc_map_df
    hms.ms_skewed_string_list_values = string_list_values_df
    hms.ms_skewed_string_list = string_list_df