in autopilot/mlops/timeseries/aws-automl-ts-cdk/glue/preprocess.py [0:0]
def preprocess(csv_dir, *,
               rts_keys=('product_code', 'location_code', 'timestamp'),
               metadata_keys=('product_code',),
               how='right'):
    """Merge the dataset CSVs in ``csv_dir`` into ``training_data.csv``.

    ``TTS.csv`` is required. ``RTS.csv`` and ``metadata.csv`` are optional;
    when present they are merged onto the TTS data, in that order. The merged
    frame is written to ``<csv_dir>/training_data.csv`` without the index
    column.

    Args:
        csv_dir: Directory that holds the input CSV files and receives the
            output file.
        rts_keys: Column names used to join ``RTS.csv`` onto ``TTS.csv``.
            Adjust to match your dataset.
        metadata_keys: Column names used to join ``metadata.csv`` onto the
            data merged so far. Adjust to match your dataset.
        how: Merge type handed to ``pandas.merge`` for both joins.

    Raises:
        SystemExit: With status 1 when ``TTS.csv`` is missing, i.e. the
            directory does not contain a recognized combination of files.
    """
    # Local import so the block does not depend on the file's import header.
    import sys

    print("Data merge for ZIP started.")

    tts_path = os.path.join(csv_dir, 'TTS.csv')
    rts_path = os.path.join(csv_dir, 'RTS.csv')
    metadata_path = os.path.join(csv_dir, 'metadata.csv')

    # Every recognized scenario starts from TTS.csv; without it there is
    # nothing to merge onto. Fail before parsing any of the optional files.
    if not os.path.exists(tts_path):
        print("Unrecognized file combination in directory.")
        # Non-zero status so the calling job sees the failure; the bare
        # ``exit()`` helper reports success and is not always available.
        sys.exit(1)

    final_data = pd.read_csv(tts_path)

    # Optional related data, joined on the time-series keys.
    if os.path.exists(rts_path):
        rts_df = pd.read_csv(rts_path)
        final_data = pd.merge(final_data, rts_df, how=how, on=list(rts_keys))

    # Optional metadata, joined after RTS so the result matches the
    # "all files present" ordering of the merges.
    if os.path.exists(metadata_path):
        metadata_df = pd.read_csv(metadata_path)
        final_data = pd.merge(final_data, metadata_df, how=how, on=list(metadata_keys))

    output_path = os.path.join(csv_dir, 'training_data.csv')
    final_data.to_csv(output_path, index=False)
    print(f"Final data merged into: {output_path}")