in autopilot/mlops/timeseries/aws-automl-ts-cdk/glue/preprocess.py [0:0]
def download_and_extract(bucket, prefix, csv_dir):
    """Download the training dataset from S3 and stage it under ``csv_dir``.

    Lists the objects under ``prefix`` in ``bucket`` and picks the first one
    whose base name is ``data.zip`` or ``tts.csv`` (compared
    case-insensitively). The object is downloaded to ``/tmp`` and then either
    extracted into ``csv_dir`` (archive) or moved to
    ``<csv_dir>/training_data.csv`` (single CSV).

    Relies on the ``s3`` object defined elsewhere in this file (presumably a
    boto3 S3 client -- confirm against the module header).

    Args:
        bucket: Name of the S3 bucket to search.
        prefix: Key prefix used to narrow the listing.
        csv_dir: Local directory that receives the extracted or moved data.

    Returns:
        ``"zip"`` when an archive was extracted, ``"csv"`` when a single CSV
        file was staged.

    Raises:
        SystemExit: With an explanatory message (and therefore a non-zero exit
            status) when no matching object exists, the downloaded file is
            missing, or the file type is not supported.
    """
    # Function-scope import so this block does not depend on the file header.
    import shutil

    # Lower-cased once up front; matching below is case-insensitive.
    wanted_names = {"data.zip", "tts.csv"}

    # 'Contents' is absent from the response when nothing matches the prefix,
    # so fall back to an empty list instead of raising KeyError.
    # NOTE(review): only a single listing call is made; if the prefix can hold
    # more keys than one response returns, later keys are never inspected.
    listing = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)
    fileuri = next(
        (
            obj['Key']
            for obj in listing.get('Contents', [])
            if os.path.basename(obj['Key']).lower() in wanted_names
        ),
        None,
    )

    # A bare exit() would terminate with status 0 and hide the failure from
    # the job runner; SystemExit with a message exits non-zero.
    if fileuri is None:
        raise SystemExit(
            f"No data.zip or tts.csv found under s3://{bucket}/{prefix}"
        )

    file_name = os.path.join('/tmp', os.path.basename(fileuri))
    print(f"File Name is: {file_name}")
    os.makedirs(os.path.dirname(file_name), exist_ok=True)
    s3.download_file(bucket, fileuri, file_name)

    # Compare the extension case-insensitively, consistent with the name
    # filter above (e.g. "DATA.ZIP" must be treated as an archive).
    extension = os.path.splitext(file_name)[1].lower()

    if extension == '.zip':
        try:
            with zipfile.ZipFile(file_name, 'r') as zip_ref:
                zip_ref.extractall(csv_dir)
        except FileNotFoundError:
            raise SystemExit(f"{file_name} not found.") from None
        return "zip"

    if extension == '.csv':
        destination = os.path.join(csv_dir, 'training_data.csv')
        os.makedirs(csv_dir, exist_ok=True)
        # shutil.move also works when /tmp and csv_dir live on different
        # filesystems, where os.rename would raise OSError.
        shutil.move(file_name, destination)
        return "csv"

    raise SystemExit(f"Unsupported file type for {file_name}")