in app/source/dragen/src/scheduler/aws_utils.py [0:0]
def s3_download_dir(bucket, src_dir, tgt_dir, region='us-east-1', nosign=False):
    """Download every object under an S3 prefix to a local directory.

    Args:
        bucket: Name of the S3 bucket.
        src_dir: Key prefix ("directory") whose objects are downloaded.
        tgt_dir: Local target directory root; each object's key is appended
            to it to form the local path.
        region: AWS region of the bucket (default 'us-east-1').
        nosign: If True, use unsigned (anonymous) requests, e.g. for
            public buckets.

    Returns:
        Total number of bytes downloaded (0 if the prefix matched nothing).
    """
    if nosign:
        client = boto3.client('s3', region, config=Config(signature_version=UNSIGNED))
    else:
        client = boto3.client('s3', region)
    # Use a paginator: a single list_objects call silently truncates at
    # 1000 keys, which would drop files for large prefixes.
    paginator = client.get_paginator('list_objects')
    object_list = []
    for page in paginator.paginate(Bucket=bucket, Prefix=src_dir):
        # 'Contents' is absent (not empty) when the prefix matches nothing,
        # so use .get() to avoid a KeyError.
        # Filter out any results that are "dirs" by checking for ending '/'
        object_list.extend(
            x for x in page.get('Contents', []) if not x['Key'].endswith('/')
        )
    if not object_list:
        return 0
    tgt_root = tgt_dir.rstrip('/')
    # To avoid a race condition for parallel downloads, make sure the parent
    # directory of every target path exists before the workers start.
    for x in object_list:
        utils.check_create_dir((tgt_root + '/' + x['Key']).rsplit('/', 1)[0])
    # Convert the list of objects to dicts we can pass to the download function
    download_dict_list = [{
        'bucket': bucket,
        'obj_key': x['Key'],
        'tgt_path': tgt_root + '/' + x['Key'],
        'region': region
    } for x in object_list]
    # Divvy up the downloads across a thread pool; always close and join the
    # pool, even if a download raises, so worker threads are not leaked.
    pool = Pool(DOWNLOAD_THREAD_COUNT)
    try:
        results = pool.map(s3_download_file, download_dict_list)
    finally:
        pool.close()
        pool.join()
    # Return the total number of bytes downloaded
    return sum(results)