def fit()

in src/mlmax/preprocessing.py [0:0]


def fit(df, args):
    """Fit the preprocessing column transformer on ``df`` and persist it.

    The transformer applies three treatments to fixed groups of columns:

    * ``age`` and ``num persons worked for employer`` are discretized into
      10 bins and emitted as dense one-hot columns.
    * ``capital gains``, ``capital losses`` and ``dividends from stocks``
      are passed through ``StandardScaler``.
    * ``education``, ``major industry code`` and ``class of worker`` are
      one-hot encoded into a dense array.

    After fitting, the transformer is dumped with joblib to
    ``./model.joblib`` (relative to the current working directory) and that
    file is packed into ``<args.data_dir>/model/proc_model.tar.gz``. The
    ``model`` directory is created if it does not exist yet.

    Args:
        df: Tabular data containing the columns listed above (presumably a
            pandas DataFrame -- confirm against the caller).
        args: Object exposing a ``data_dir`` attribute, the base directory
            under which the model archive is written.

    Returns:
        The fitted column transformer.
    """
    # NOTE(review): the tuples are ordered (columns, transformer). That looks
    # like the convention of older scikit-learn releases; newer ones expect
    # (transformer, columns). Likewise ``sparse=False`` may be spelled
    # differently in recent versions. Confirm against the pinned
    # scikit-learn version before upgrading.
    preprocess = make_column_transformer(
        (
            ["age", "num persons worked for employer"],
            KBinsDiscretizer(encode="onehot-dense", n_bins=10),
        ),
        (
            ["capital gains", "capital losses", "dividends from stocks"],
            StandardScaler(),
        ),
        (
            ["education", "major industry code", "class of worker"],
            OneHotEncoder(sparse=False),
        ),
    )
    print("Creating preprocessing and feature engineering transformations")
    preprocess.fit(df)

    # The serialized transformer is written to the working directory first
    # and then wrapped in a gzip-compressed tarball.
    joblib.dump(preprocess, "./model.joblib")

    archive_path = os.path.join(args.data_dir, "model/proc_model.tar.gz")
    # tarfile.open() does not create missing parent directories, so make sure
    # "<data_dir>/model" exists instead of failing after the fit is done.
    os.makedirs(os.path.dirname(archive_path), exist_ok=True)

    print(f"Saving model to {archive_path}")
    with tarfile.open(archive_path, mode="w:gz") as archive:
        archive.add("./model.joblib")
    return preprocess