in sdlf-utils/pipeline-examples/glue-jobs-deployer/pipeline_scripts/examplepipeline-glue-job.py [0:0]
def deleteOldData(targetTableBucketLocation):
    """Delete the target table's existing data so the next insert overwrites it.

    Data is dropped directly in S3 (through ``deleteBucketPath``) rather than
    with a Spark SQL ``DROP``/``TRUNCATE``, to support "overwrite insert into"
    operations on partitioned tables. What is deleted depends on the
    module-level ``props``:

    * KillAndFill - no date partition and no partition values: the whole
      ``props.targetTablePathLocation`` is deleted.
    * Case 1 - only a date partition: every requested date partition that
      the table currently reports is deleted.
    * Case 2 - only partition values: the single partition described by
      ``props.partitionValues`` is deleted if the table reports it.
    * Case 3 - date partition and partition values: for every requested
      date, ``<date partition>/<partition values>`` is deleted if the table
      reports it. Assumes the date partition is the first partition level.

    In cases 2 and 3 only one value per partition level can be requested,
    and the levels are joined in the iteration order of
    ``props.partitionValues``.

    Args:
        targetTableBucketLocation: Bucket location of the target table. It is
            logged and passed unchanged to ``deleteBucketPath``.
    """
    log.info('Table Bucket:::' + targetTableBucketLocation)

    hasDtPartition = props.hasDtPartition
    hasPartitionValues = len(props.partitionValues) > 0

    # KillAndFill table: nothing is partition-scoped, so wipe the whole table path.
    if not hasDtPartition and not hasPartitionValues:
        deleteBucketPath(props.targetTablePathLocation, targetTableBucketLocation)
        return

    # Incremental table: only partitions the table actually reports are deleted.
    select = " show partitions " + props.targetTableFQDN
    partitions = spark.sql(select).rdd.map(lambda x: x[0]).collect()
    existingPartitions = set(partitions)

    if hasPartitionValues:
        # "field1=value1/field2=value2"; identical for every date, so built once.
        partitionValuesPath = "/".join(
            fieldName + "=" + props.partitionValues[fieldName]
            for fieldName in props.partitionValues)

    if hasDtPartition and not hasPartitionValues:
        # Case 1 - Only has dt partition.
        # NOTE(review): the date range is computed even for CDC tables, as the
        # original did; confirm getDateRangePartitions has no side effects
        # before making this call conditional.
        partitionsToDelete = getDateRangePartitions(props)
        if props.isCdcTable:
            partitionsToDelete = [props.targetPartitionDtField + "=" + datePartition
                                  for datePartition in props.cdcDatePartitionsToProcess]
        # The table has more partition levels than the date one, but only the
        # date level was requested: compare against first-level prefixes.
        # NOTE(review): tablePartitionFields is a module-level name defined
        # outside this function.
        if len(tablePartitionFields) > 1:
            existingPartitions = {partition.split("/")[0] for partition in partitions}
    elif not hasDtPartition:
        # Case 2 - No date partition, but other partitions were requested.
        partitionsToDelete = [partitionValuesPath]
    else:
        # Case 3 - Date partition (first level) plus other requested partitions.
        partitionsToDelete = [dtPartition + "/" + partitionValuesPath
                              for dtPartition in getDateRangePartitions(props)]

    for partitionToDelete in partitionsToDelete:
        if partitionToDelete in existingPartitions:
            pathToDelete = props.targetTablePathLocation + "/" + partitionToDelete
            deleteBucketPath(pathToDelete, targetTableBucketLocation)