# run_main() — from src/preprocess.py


def run_main(base_dir="/opt/ml/processing", train_frac=0.7, validation_frac=0.15, seed=None):
    """Entry point for the preprocessing job.

    Downloads raw input data (per the ``--data-manifest`` CLI argument),
    preprocesses it, shuffles and splits it into train/validation/test
    partitions, writes each partition as a header-less CSV, and saves the
    fitted preprocessing model.

    Args:
        base_dir: Root directory for all outputs. Defaults to the SageMaker
            processing container path, matching the original behavior.
        train_frac: Fraction of rows assigned to the training set.
        validation_frac: Fraction of rows assigned to the validation set.
            The remainder (1 - train_frac - validation_frac) goes to test.
            Defaults reproduce the original 70/15/15 split.
        seed: Optional RNG seed for a reproducible shuffle. ``None`` (the
            default) preserves the original non-deterministic behavior.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    logger.addHandler(logging.StreamHandler())
    logger.debug("Starting preprocessing.")

    parser = argparse.ArgumentParser()
    parser.add_argument("--data-manifest", type=str, required=True)
    args = parser.parse_args()

    logger.debug("Downloading raw input data")
    data_builder = DataBuilder(base_dir, args.data_manifest)
    df = data_builder.build()

    logger.debug("Preprocessing raw input data")
    data_processor = DataProcessor(df)
    data_output = data_processor.process()

    len_data_output = len(data_output)
    logger.info("Splitting %d rows of data into train, validation, test datasets.", len_data_output)
    if seed is not None:
        np.random.seed(seed)
    np.random.shuffle(data_output)
    # Split indices: [0, t) train, [t, v) validation, [v, end) test.
    train_end = int(train_frac * len_data_output)
    validation_end = int((train_frac + validation_frac) * len_data_output)
    train, validation, test = np.split(data_output, [train_end, validation_end])

    # Ensure all output directories exist; outside the SageMaker container
    # they are not pre-created and to_csv/save_model would otherwise fail.
    for subdir in ("train", "validation", "test", "model"):
        os.makedirs(os.path.join(base_dir, subdir), exist_ok=True)

    logger.info("Writing out datasets to %s.", base_dir)
    pd.DataFrame(train).to_csv(
        os.path.join(base_dir, "train", "train.csv"), header=False, index=False
    )
    pd.DataFrame(validation).to_csv(
        os.path.join(base_dir, "validation", "validation.csv"), header=False, index=False
    )
    pd.DataFrame(test).to_csv(
        os.path.join(base_dir, "test", "test.csv"), header=False, index=False
    )

    logger.info("Saving the preprocessing model to %s", base_dir)
    data_processor.save_model(os.path.join(base_dir, "model"))