in distributed_training/src_dir/util.py [0:0]
def sync_s3_checkpoints_to_local(
        local_path="/opt/ml/checkpoints",
        s3_path=os.path.dirname(os.path.dirname(os.getenv('SM_MODULE_DIR', ''))) +
        '/checkpoints'):
    """Sync checkpoints from an S3 location into a local directory.

    Sample helper: makes sure ``local_path`` exists, validates ``s3_path``,
    checks that the bucket is reachable and then delegates the transfer to
    ``aws_s3_sync(s3_path, local_path)``.

    Args:
        local_path: Local directory that receives the checkpoints. It is
            created (including parents) when it does not exist.
        s3_path: Source location; must start with ``s3://`` and name a
            bucket. The default is computed once, at function definition
            time, as the grandparent of the ``SM_MODULE_DIR`` environment
            variable with ``/checkpoints`` appended.

    Raises:
        RuntimeError: If ``local_path`` cannot be created, or if S3 reports
            that the bucket does not exist (error code 404).
        ValueError: If ``s3_path`` does not start with ``s3://`` or has an
            empty bucket name.
    """
    # Create the local path if it does not exist.
    if not os.path.exists(local_path):
        print(f"Provided local path {local_path} does not exist. Creating...")
        try:
            # exist_ok=True: another process may create the directory
            # between the existence check above and this call.
            os.makedirs(local_path, exist_ok=True)
        except Exception as e:
            raise RuntimeError(f"failed to create {local_path}") from e

    # Validate the S3 URI before touching the AWS SDK. The scheme has to be
    # a prefix; a substring test would accept e.g. "xs3://bucket" and derive
    # the wrong bucket name from it.
    scheme = 's3://'
    s3_bucket = ''
    if s3_path.startswith(scheme):
        s3_bucket = s3_path[len(scheme):].split('/', 1)[0]
    if not s3_bucket:
        raise ValueError(
            f"Provided s3 path {s3_path} is not valid. Please check")
    print(f"S3 Bucket: {s3_bucket}")

    # Imported here so that argument validation does not depend on the SDK.
    import boto3
    import botocore

    # Check that the S3 bucket exists.
    s3 = boto3.resource('s3')
    try:
        s3.meta.client.head_bucket(Bucket=s3_bucket)
    except botocore.exceptions.ClientError as e:
        error_code = e.response['Error']['Code']
        if error_code == '404':
            raise RuntimeError('S3 bucket does not exist. Please check') from e
        # Any other client error (for instance a permission problem) is not
        # fatal here: the sync is still attempted, but the error is reported
        # instead of being dropped silently.
        print(f"head_bucket on {s3_bucket} returned error code {error_code}; "
              "attempting sync anyway")

    aws_s3_sync(s3_path, local_path)