# Source: "1. Amazon SageMaker Processing/preprocess.py"
def save_files(base_dir: str, data_df: pd.DataFrame, data_fg: pd.DataFrame, fg_name: str,
               val_size=0.2, test_size=0.05, current_host=None, sagemaker_session=None):
    """Split data into train/val/test CSVs under base_dir and optionally ingest to a feature group.

    Args:
        base_dir: Root output directory; files are written to its train/,
            validation/, and test/ subdirectories (created if missing).
        data_df: Full dataset to split and write out.
        data_fg: Dataset to batch-ingest into the feature group (may differ from data_df).
        fg_name: Feature-group name; when falsy, ingestion is skipped.
        val_size: Fraction of data_df held out of the train split (default 0.2).
        test_size: Fraction of the *validation* split carved off as test (default 0.05).
            NOTE(review): this makes the test set val_size * test_size (1% by default)
            of the full data, not test_size of the full data — confirm this is intended.
        current_host: Host identifier embedded in output file names so multiple
            processing hosts writing to the same location do not collide.
        sagemaker_session: Session forwarded to ingest_data().
    """
    from pathlib import Path  # local import keeps this fix self-contained

    logger.info(f"Splitting {len(data_df)} rows of data into train, val, test.")
    # Fixed random_state keeps the splits reproducible across runs.
    train_df, val_df = train_test_split(data_df, test_size=val_size, random_state=42)
    val_df, test_df = train_test_split(val_df, test_size=test_size, random_state=42)

    logger.info(f"Writing out datasets to {base_dir}")
    # Ensure output subdirectories exist so to_csv does not fail on a fresh container.
    for subdir in ("train", "validation", "test"):
        Path(base_dir, subdir).mkdir(parents=True, exist_ok=True)

    # Random suffix prevents one host's repeated runs from overwriting each other.
    tmp_id = uuid.uuid4().hex[:8]
    train_df.to_csv(f"{base_dir}/train/train_{current_host}_{tmp_id}.csv", header=False, index=False)
    val_df.to_csv(f"{base_dir}/validation/validation_{current_host}_{tmp_id}.csv", header=False, index=False)
    # Save test data without header — presumably for a header-less training
    # format downstream; verify against the consumer.
    test_df.to_csv(f"{base_dir}/test/test_{current_host}_{tmp_id}.csv", header=False, index=False)

    if fg_name:
        # batch ingestion to the feature group of all the data
        ingest_data(data_fg, fg_name, sagemaker_session)
    return