def preprocess()

in autopilot/mlops/timeseries/aws-automl-ts-cdk/glue/preprocess.py [0:0]


def preprocess(
    csv_dir,
    rts_keys=('product_code', 'location_code', 'timestamp'),
    metadata_keys=('product_code',),
    how='right',
):
    """Merge the time-series CSV files in *csv_dir* into ``training_data.csv``.

    ``TTS.csv`` is required. ``RTS.csv`` and ``metadata.csv`` are optional;
    each one that exists is merged onto the running result, RTS first and
    metadata second. The merged frame is written to
    ``<csv_dir>/training_data.csv`` without the index column.

    Args:
        csv_dir: Directory that contains the input CSV files and receives
            the merged output file.
        rts_keys: Column names used to join ``RTS.csv`` onto the TTS data.
            Change these to match your dataset.
        metadata_keys: Column names used to join ``metadata.csv`` onto the
            running result. Change these to match your dataset.
        how: Join type passed to ``pandas.merge`` for both merges.

    Raises:
        SystemExit: With exit status 1 when ``TTS.csv`` is missing, i.e. the
            directory does not hold a recognized combination of files.
    """
    print("Data merge for ZIP started.")

    tts_path = os.path.join(csv_dir, 'TTS.csv')
    rts_path = os.path.join(csv_dir, 'RTS.csv')
    metadata_path = os.path.join(csv_dir, 'metadata.csv')
    output_path = os.path.join(csv_dir, 'training_data.csv')

    # Every supported combination includes TTS.csv, so validate that first,
    # before spending time parsing files that cannot be used anyway.
    if not os.path.exists(tts_path):
        print("Unrecognized file combination in directory.")
        # Exit with a non-zero status so the calling job is marked as failed
        # instead of silently "succeeding" without producing any output.
        raise SystemExit(1)

    final_data = pd.read_csv(tts_path)

    # Optional related time series, joined on the item/location/time keys.
    if os.path.exists(rts_path):
        rts_df = pd.read_csv(rts_path)
        final_data = pd.merge(final_data, rts_df, how=how, on=list(rts_keys))

    # Optional item metadata, joined on the item key.
    if os.path.exists(metadata_path):
        metadata_df = pd.read_csv(metadata_path)
        final_data = pd.merge(
            final_data, metadata_df, how=how, on=list(metadata_keys)
        )

    final_data.to_csv(output_path, index=False)
    # Report the destination path, not the (potentially huge) DataFrame.
    print(f"Final data merged into: {output_path}")