in src/preprocess.py [0:0]
def run_main():
    """Entry point for the preprocessing job.

    Builds a raw dataset from the manifest given by ``--data-manifest``,
    preprocesses it, shuffles and splits it 70/15/15 into train /
    validation / test CSV files under ``/opt/ml/processing``, and saves
    the fitted preprocessing model alongside them.

    Side effects: configures the root logger, reads CLI args from
    ``sys.argv``, and writes files under ``/opt/ml/processing``.
    """
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    # Guard against stacking duplicate handlers when run_main() is
    # invoked more than once in a process (every log line would repeat).
    if not logger.handlers:
        logger.addHandler(logging.StreamHandler())
    logger.debug("Starting preprocessing.")

    parser = argparse.ArgumentParser()
    parser.add_argument("--data-manifest", type=str, required=True)
    args = parser.parse_args()

    logger.debug("Downloading raw input data")
    base_dir = "/opt/ml/processing"
    data_builder = DataBuilder(base_dir, args.data_manifest)
    df = data_builder.build()

    logger.debug("Preprocessing raw input data")
    data_processor = DataProcessor(df)
    data_output = data_processor.process()

    len_data_output = len(data_output)
    logger.info(
        "Splitting %d rows of data into train, validation, test datasets.",
        len_data_output,
    )
    # NOTE(review): shuffle is unseeded, so the split is not reproducible
    # across runs — seed np.random upstream if determinism is required.
    np.random.shuffle(data_output)
    # Split points at 70% and 85% give a 70/15/15 train/validation/test split.
    train, validation, test = np.split(
        data_output, [int(0.7 * len_data_output), int(0.85 * len_data_output)]
    )

    logger.info("Writing out datasets to %s.", base_dir)
    # to_csv does not create parent directories, so make them explicitly;
    # exist_ok keeps reruns idempotent.
    for split_name, split_data in (
        ("train", train),
        ("validation", validation),
        ("test", test),
    ):
        split_dir = os.path.join(base_dir, split_name)
        os.makedirs(split_dir, exist_ok=True)
        pd.DataFrame(split_data).to_csv(
            os.path.join(split_dir, f"{split_name}.csv"),
            header=False,
            index=False,
        )

    logger.info("Saving the preprocessing model to %s", base_dir)
    data_processor.save_model(os.path.join(base_dir, "model"))