in utilities/Hive_metastore_migration/src/hive_metastore_migration.py [0:0]
def extract_sds(self, ms_tbls, ms_partitions):
    """Split storage descriptors out of the table and partition DataFrames.

    The ``storageDescriptor`` struct of every table and partition row is
    flattened into one combined DataFrame, each row is given an ``SD_ID``
    by ``self.generate_id_df``, and that ``SD_ID`` is joined back onto the
    table and partition rows it came from.

    Args:
        ms_tbls: DataFrame with ``TBL_NAME``, ``DB_NAME`` and a
            ``storageDescriptor`` struct column.
        ms_partitions: DataFrame with ``PART_ID`` and a
            ``storageDescriptor`` struct column.

    Returns:
        A tuple ``(ms_sds, ms_tbls, ms_partitions)``: the storage descriptor
        DataFrame, and the two inputs with an ``SD_ID`` column attached.
        The temporary ``ID`` and ``type`` columns are dropped from all three.
    """
    # Temporary join key for tables. A separator is placed between the two
    # names so that distinct (table, database) pairs cannot collapse into
    # the same key (e.g. 'ab' + 'c' vs 'a' + 'bc'), which would duplicate
    # rows in the join below.
    # NOTE(review): '\x01' is assumed never to occur in catalog table or
    # database names - confirm against the source catalog's naming rules.
    ms_tbls = ms_tbls.withColumn(
        'ID', concat(ms_tbls.TBL_NAME, lit('\x01'), ms_tbls.DB_NAME))
    # Partitions are keyed by PART_ID, cast to string so both key columns
    # have the same type for the union.
    ms_partitions = ms_partitions.withColumn(
        'ID', ms_partitions.PART_ID.cast(StringType()))

    # Flatten the storage descriptor struct and tag each row with its origin
    # so table keys and partition keys never match each other.
    ms_tbls_sds = (
        ms_tbls
        .select('ID', 'storageDescriptor.*')
        .withColumn('type', lit("table"))
    )
    ms_partitions_sds = (
        ms_partitions
        .select('ID', 'storageDescriptor.*')
        .withColumn('type', lit("partition"))
    )

    ms_sds_no_id = ms_partitions_sds.union(ms_tbls_sds)
    # generate_id_df is defined elsewhere; presumably it adds a unique
    # 'SD_ID' column to every row - verify against its definition.
    ms_sds = self.generate_id_df(ms_sds_no_id, 'SD_ID')

    # Only the key columns are needed to carry SD_ID back to the sources.
    ms_sds_for_join = ms_sds.select('type', 'ID', 'SD_ID')

    cond = [ms_sds_for_join.type == 'partition',
            ms_sds_for_join.ID == ms_partitions.ID]
    ms_partitions = (
        ms_partitions
        .join(ms_sds_for_join, cond, 'inner')
        .drop_columns(['ID', 'type'])
    )

    cond = [ms_sds_for_join.type == 'table',
            ms_sds_for_join.ID == ms_tbls.ID]
    # The explicit drop('ID') ahead of drop_columns is kept from the
    # original; it looks redundant, but drop_columns is not visible here.
    ms_tbls = (
        ms_tbls
        .join(ms_sds_for_join, cond, 'inner')
        .drop('ID')
        .drop_columns(['ID', 'type'])
    )

    ms_sds = ms_sds.drop_columns(['ID', 'type'])

    return (ms_sds, ms_tbls, ms_partitions)