in src/mlmax/preprocessing.py [0:0]
def fit(df, args):
    """Fit the preprocessing transformer on ``df`` and archive it to disk.

    Builds a column transformer that bins two numeric columns, standardizes
    three monetary columns and one-hot encodes three categorical columns.
    The fitted transformer is dumped to ``./model.joblib`` (relative to the
    current working directory) and that file is packed into
    ``<args.data_dir>/model/proc_model.tar.gz``.

    Args:
        df: Data to fit on. It is expected to contain every column named in
            the transformer definition below (presumably a pandas
            DataFrame -- confirm against the caller).
        args: Object exposing a ``data_dir`` attribute; the archive is
            written under its ``model/`` sub-directory.

    Returns:
        The fitted column transformer.
    """
    # NOTE(review): the tuples are ordered (columns, transformer). Recent
    # scikit-learn releases appear to expect (transformer, columns); kept
    # as-is on the assumption that the project pins an older release --
    # confirm before upgrading.
    preprocess = make_column_transformer(
        (
            ["age", "num persons worked for employer"],
            KBinsDiscretizer(encode="onehot-dense", n_bins=10),
        ),
        (
            ["capital gains", "capital losses", "dividends from stocks"],
            StandardScaler(),
        ),
        (
            ["education", "major industry code", "class of worker"],
            # NOTE(review): `sparse` looks to have been renamed to
            # `sparse_output` in newer scikit-learn -- verify against the
            # pinned version.
            OneHotEncoder(sparse=False),
        ),
    )
    print("Creating preprocessing and feature engineering transformations")
    preprocess.fit(df)

    # The serialized transformer is intentionally left in the working
    # directory, exactly as before, in case other steps pick it up there.
    joblib.dump(preprocess, "./model.joblib")

    model_output_directory = os.path.join(args.data_dir, "model/proc_model.tar.gz")
    # Make sure the target directory exists; otherwise tarfile.open() raises
    # FileNotFoundError after the (potentially expensive) fit has completed.
    os.makedirs(os.path.dirname(model_output_directory), exist_ok=True)

    print(f"Saving model to {model_output_directory}")
    with tarfile.open(model_output_directory, mode="w:gz") as archive:
        archive.add("./model.joblib", recursive=True)
    return preprocess