def extract_sds()

in utilities/Hive_metastore_migration/src/hive_metastore_migration.py [0:0]


    def extract_sds(self, ms_tbls, ms_partitions):
        """Pull storage descriptors out of tables and partitions into one frame.

        The ``storageDescriptor`` struct of every table and partition row is
        flattened, the two sets are unioned, and each resulting row is given
        an ``SD_ID`` via ``self.generate_id_df``. That ``SD_ID`` is then joined
        back onto the originating table / partition rows.

        :param ms_tbls: table frame; must expose ``TBL_NAME``, ``DB_NAME`` and
            a ``storageDescriptor`` struct column.
        :param ms_partitions: partition frame; must expose ``PART_ID`` and a
            ``storageDescriptor`` struct column.
        :return: tuple ``(ms_sds, ms_tbls, ms_partitions)`` where ``ms_sds``
            holds the flattened descriptor fields plus ``SD_ID``, and the other
            two are the inputs with an ``SD_ID`` column added. The joins are
            inner joins, so rows without a matching descriptor are not
            returned. The temporary ``ID`` / ``type`` helper columns are
            removed from all three frames.
        """
        # Temporary join key for tables. A separator is required: plain
        # concat(TBL_NAME, DB_NAME) maps e.g. ('ab', 'c') and ('a', 'bc') to the
        # same key, which would cross-match tables in the join below. '\x01' is
        # a control character that is not expected in table/database names.
        id_sep = lit('\x01')
        ms_tbls = ms_tbls.withColumn(
            'ID', concat(ms_tbls.TBL_NAME, id_sep, ms_tbls.DB_NAME))
        # Partitions already carry a unique id; cast it to string so the ID
        # column has the same type as the table key when the frames are unioned.
        ms_partitions = ms_partitions.withColumn(
            'ID', ms_partitions.PART_ID.cast(StringType()))

        # Flatten the descriptor struct and tag each row with its origin so
        # that a table key and a partition key can never be confused.
        ms_tbls_sds = ms_tbls\
            .select('ID', 'storageDescriptor.*')\
            .withColumn('type', lit("table"))
        ms_partitions_sds = ms_partitions\
            .select('ID', 'storageDescriptor.*')\
            .withColumn('type', lit("partition"))

        # NOTE(review): union matches columns by position, so this presumably
        # relies on both storageDescriptor structs having the same field
        # order - confirm against the input schemas.
        ms_sds_no_id = ms_partitions_sds\
            .union(ms_tbls_sds)

        ms_sds = self.generate_id_df(ms_sds_no_id, 'SD_ID')

        # Only the key columns and the generated id are needed for joining back.
        ms_sds_for_join = ms_sds.select('type', 'ID', 'SD_ID')

        cond = [ms_sds_for_join.type == 'partition',
                ms_sds_for_join.ID == ms_partitions.ID]
        ms_partitions = ms_partitions\
            .join(ms_sds_for_join, cond, 'inner')\
            .drop_columns(['ID', 'type'])

        cond = [ms_sds_for_join.type == 'table',
                ms_sds_for_join.ID == ms_tbls.ID]
        ms_tbls = ms_tbls\
            .join(ms_sds_for_join, cond, 'inner')\
            .drop_columns(['ID', 'type'])

        ms_sds = ms_sds.drop_columns(['ID', 'type'])

        return (ms_sds, ms_tbls, ms_partitions)