# train_fn
#
# Excerpt from sagemaker_studio/containers/model/src/training.py


def train_fn(args):
    """Fit a preprocess + LightGBM classification pipeline and persist its parts.

    Loads train/test features and labels (as described by the schemas in
    ``args.schemas``), trains the pipeline with cross-validation, evaluates it
    on the held-out split, and writes the fitted components plus the input and
    post-transform schemas under ``args.model_dir``.
    """
    # Load the dataset schemas, then the four JSON datasets they describe.
    data_schema, label_schema = load_schemas(args.schemas)
    train_features = datasets.read_json_dataset(args.data_train, data_schema)
    train_labels = datasets.read_json_dataset(args.label_train, label_schema)
    test_features = datasets.read_json_dataset(args.data_test, data_schema)
    test_labels = datasets.read_json_dataset(args.label_test, label_schema)

    # Labels arrive as a single-column matrix; flatten each to a 1-D int array.
    train_labels = train_labels[:, 0].astype('int')
    test_labels = test_labels[:, 0].astype('int')

    # Build the two pipeline stages from the CLI hyper-parameters.
    preprocessor = create_preprocessor(data_schema)
    classifier = LGBMClassifier(
        max_depth=args.tree_max_depth,
        num_leaves=args.tree_num_leaves,
        boosting_type=args.tree_boosting_type,
        min_child_samples=args.tree_min_child_samples,
        n_estimators=args.tree_n_estimators
    )

    # Train, then derive the post-transform feature schema from the (now
    # fitted) preprocessor before evaluating on the test split.
    steps = [("preprocessor", preprocessor), ("classifier", classifier)]
    pipeline = Pipeline(steps)
    train_pipeline(pipeline, train_features, train_labels, args.cv_splits)
    features_schema = transform_schema(preprocessor, data_schema)
    test_pipeline(pipeline, test_features, test_labels)

    # Persist fitted components and both schemas for inference-time loading.
    output_dir = Path(args.model_dir)
    output_dir.mkdir(exist_ok=True, parents=True)
    joblib.dump(preprocessor, Path(output_dir, "preprocessor.joblib"))
    joblib.dump(classifier, Path(output_dir, "classifier.joblib"))
    data_schema.save(Path(output_dir, "data.schema.json"))
    features_schema.save(Path(output_dir, "features.schema.json"))