in sagemaker/containers/model/src/training.py [0:0]
def train_fn(args):
    """Train and persist the preprocessing + LightGBM classification model.

    Loads the train/test splits described by the JSON schemas in
    ``args.schemas``, fits a preprocessor + ``LGBMClassifier`` pipeline
    with cross-validation, evaluates it on the test split, and writes the
    fitted components and schemas to ``args.model_dir``.

    Args:
        args: Parsed arguments providing dataset paths (``data_train``,
            ``label_train``, ``data_test``, ``label_test``), the schema
            path (``schemas``), LightGBM hyperparameters (``tree_*``),
            the number of CV splits (``cv_splits``), and the output
            directory (``model_dir``).
    """
    # Load data according to the declared schemas.
    data_schema, label_schema = load_schemas(args.schemas)
    X_train = datasets.read_json_dataset(args.data_train, data_schema)
    y_train = datasets.read_json_dataset(args.label_train, label_schema)
    X_test = datasets.read_json_dataset(args.data_test, data_schema)
    y_test = datasets.read_json_dataset(args.label_test, label_schema)
    # Labels arrive as an (n, 1) column vector; flatten to a 1-D int array
    # as scikit-learn-style estimators expect.
    y_train = y_train[:, 0].astype('int')
    y_test = y_test[:, 0].astype('int')
    # Create model components.
    preprocessor = create_preprocessor(data_schema)
    classifier = LGBMClassifier(
        max_depth=args.tree_max_depth,
        num_leaves=args.tree_num_leaves,
        boosting_type=args.tree_boosting_type,
        min_child_samples=args.tree_min_child_samples,
        n_estimators=args.tree_n_estimators
    )
    # Chain preprocessing and classification so both are fit together.
    pipeline = Pipeline(
        [("preprocessor", preprocessor), ("classifier", classifier)]
    )
    train_pipeline(pipeline, X_train, y_train, args.cv_splits)
    # NOTE(review): called after training, so the preprocessor is fitted by
    # this point — presumably transform_schema needs a fitted transformer;
    # confirm before reordering.
    features_schema = transform_schema(preprocessor, data_schema)
    test_pipeline(pipeline, X_test, y_test)
    # Persist the fitted components and schemas for the inference container.
    model_dir = Path(args.model_dir)
    model_dir.mkdir(exist_ok=True, parents=True)
    joblib.dump(preprocessor, model_dir / "preprocessor.joblib")
    joblib.dump(classifier, model_dir / "classifier.joblib")
    data_schema.save(model_dir / "data.schema.json")
    features_schema.save(model_dir / "features.schema.json")