def extract_from_sds_skewed_info()

in utilities/Hive_metastore_migration/src/hive_metastore_migration.py [0:0]


    def extract_from_sds_skewed_info(self, hms, ms_sds):
        """Build the skewed-column frames from ``ms_sds`` and attach them to ``hms``.

        Reads ``SD_ID`` and the fields of the ``skewedInfo`` struct from
        ``ms_sds`` and derives four DataFrames, which are stored on ``hms`` as:

        * ``ms_skewed_col_names`` -- ``SD_ID`` plus one ``SKEWED_COL_NAME``
          per element of ``skewedColumnNames``.
        * ``ms_skewed_col_value_loc_map`` -- the exploded
          ``skewedColumnValueLocationMaps`` entries (``SD_ID``, ``LOCATION``
          and the id column produced by ``generate_id_df``), without the
          intermediate ``STRING_LIST_STR`` column.
        * ``ms_skewed_string_list_values`` -- the output of
          ``generate_idx_for_df`` for each string list, with its ``col``
          column renamed to ``STRING_LIST_VALUE``.
        * ``ms_skewed_string_list`` -- the distinct ``STRING_LIST_ID`` values
          found in ``ms_skewed_string_list_values``.

        :param hms: object that receives the four result frames as attributes.
        :param ms_sds: DataFrame with an ``SD_ID`` column and a ``skewedInfo``
            struct column.
        :return: None; the results are assigned onto ``hms``.
        """
        skewed_info = ms_sds.select('SD_ID', 'skewedInfo.*')

        ms_skewed_col_names = skewed_info.select(
            'SD_ID', explode('skewedColumnNames').alias('SKEWED_COL_NAME'))

        # Still carries the extra field 'STRING_LIST_STR'; it is needed below
        # to build the value lists and is dropped before the frame is published.
        skewed_col_value_loc_map = skewed_info.select(
            'SD_ID',
            explode('skewedColumnValueLocationMaps').alias('STRING_LIST_STR', 'LOCATION'))

        # NOTE(review): generate_id_df is presumed to add the id column named
        # 'STRING_LIST_ID_KID' (it is selected right below) -- confirm against
        # the helper.
        skewed_col_value_loc_map = self.generate_id_df(skewed_col_value_loc_map, 'STRING_LIST_ID_KID')

        # Turns the string form of a value list into an array of strings.
        udf_string_list_list = UserDefinedFunction(
            DataCatalogTransformer.udf_string_list_str_to_list,
            ArrayType(StringType(), True))

        skewed_string_list_values = skewed_col_value_loc_map.select(
            col('STRING_LIST_ID_KID').alias('STRING_LIST_ID'),
            udf_string_list_list('STRING_LIST_STR').alias('STRING_LIST_LIST'))

        ms_skewed_string_list_values = DataCatalogTransformer.generate_idx_for_df(
            skewed_string_list_values,
            'STRING_LIST_ID',
            'STRING_LIST_LIST',
            StringType()
        ).withColumnRenamed('col', 'STRING_LIST_VALUE')

        ms_skewed_col_value_loc_map = skewed_col_value_loc_map.drop_columns(['STRING_LIST_STR'])

        # The values frame presumably holds one row per list element (TODO
        # confirm against generate_idx_for_df), so a list with several values
        # would repeat its STRING_LIST_ID. The string-list frame must name each
        # list exactly once, hence the de-duplication.
        ms_skewed_string_list = ms_skewed_string_list_values.select('STRING_LIST_ID').distinct()

        hms.ms_skewed_col_names = ms_skewed_col_names
        hms.ms_skewed_col_value_loc_map = ms_skewed_col_value_loc_map
        hms.ms_skewed_string_list_values = ms_skewed_string_list_values
        hms.ms_skewed_string_list = ms_skewed_string_list