def download_and_extract()

in autopilot/mlops/timeseries/aws-automl-ts-cdk/glue/preprocess.py [0:0]


def download_and_extract(bucket, prefix, csv_dir):
    """Download the training dataset from S3 and stage it under ``csv_dir``.

    Searches the keys under ``prefix`` for the first object whose base name
    is ``data.zip`` or ``tts.csv`` (compared case-insensitively), downloads
    it to ``/tmp`` and then either extracts the archive into ``csv_dir`` or
    moves the CSV to ``csv_dir/training_data.csv``.

    Relies on the module-level ``s3`` client (presumably a boto3 S3 client;
    it is defined outside this function).

    Args:
        bucket: Name of the S3 bucket to search.
        prefix: Key prefix under which the dataset is expected.
        csv_dir: Local directory that receives the extracted/moved data.

    Returns:
        ``"zip"`` if an archive was extracted, ``"csv"`` if a CSV file was
        moved into place.

    Raises:
        SystemExit: With a descriptive message (and therefore a non-zero
            exit status) if no candidate file exists, the downloaded archive
            is missing, or the file type is unsupported.
    """
    import shutil  # local import: only needed for the cross-device-safe move

    # Lower-cased once so the membership test inside the loop is O(1).
    candidate_names = {"data.zip", "tts.csv"}

    # Walk every page of the listing: a single call returns at most 1000
    # keys, and 'Contents' is omitted entirely when nothing matches.
    fileuri = None
    list_kwargs = {'Bucket': bucket, 'Prefix': prefix}
    while fileuri is None:
        response = s3.list_objects_v2(**list_kwargs)
        for obj in response.get('Contents', []):
            if os.path.basename(obj['Key']).lower() in candidate_names:
                fileuri = obj['Key']
                break
        if not response.get('IsTruncated'):
            break
        list_kwargs['ContinuationToken'] = response['NextContinuationToken']

    # Fail loudly: a bare exit() would terminate with status 0 and no message.
    if not fileuri:
        raise SystemExit(
            f"No data.zip or tts.csv found in s3://{bucket}/{prefix}"
        )

    file_name = os.path.join('/tmp', os.path.basename(fileuri))
    print(f"File Name is: {file_name}")
    os.makedirs(os.path.dirname(file_name), exist_ok=True)

    s3.download_file(bucket, fileuri, file_name)

    # Extension checks are case-insensitive to agree with the name matching
    # above (e.g. "DATA.ZIP" is selected and must also be recognised here).
    lowered_key = fileuri.lower()
    if lowered_key.endswith('.zip'):
        try:
            with zipfile.ZipFile(file_name, 'r') as zip_ref:
                zip_ref.extractall(csv_dir)
        except FileNotFoundError:
            raise SystemExit(f"{file_name} not found.") from None
        return "zip"

    if lowered_key.endswith('.csv'):
        destination = os.path.join(csv_dir, 'training_data.csv')
        os.makedirs(csv_dir, exist_ok=True)
        # shutil.move falls back to copy+delete when csv_dir lives on a
        # different filesystem than /tmp, where os.rename would raise.
        shutil.move(file_name, destination)
        return "csv"

    raise SystemExit(f"Unsupported file type for {file_name}")