def train_with_grid_search()

in src/baselines/lgbm.py
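
Randomized hyper-parameter search for the LightGBM baseline: the dev set serves as a single predefined validation fold, the best candidate is refit on train+dev, and the resulting estimator and its configuration are saved as a checkpoint.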


import json
import logging
import os

import joblib
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import PredefinedSplit, RandomizedSearchCV

# get_model_param and save_model are project-local helpers defined elsewhere
logger = logging.getLogger(__name__)


def train_with_grid_search(args, train_dataset, dev_dataset, log_fp=None):
    parameters = {
        "num_leaves": [8, 12, 16, 20],
        "max_depth": [9, 15, 17, 20, 25, 30, 35, 39],
        "min_split_gain": [0, 0.05, 0.07, 0.09, 0.1, 0.3, 0.5, 0.7, 0.9, 1],
        "subsample": [0.6, 0.7, 0.8, 0.8, 1],
        "n_estimators": [50, 100, 200, 250, 300],
        "learning_rate": [0.001, 0.01, 0.02, 0.05, 0.1, 0.15, 0.2],
        "feature_fraction": [0.6, 0.7, 0.8, 0.9, 0.95],
        "bagging_fraction": [0.6, 0.7, 0.8, 0.9, 0.95],
        "bagging_freq": [2, 4, 5, 6, 8],
        "lambda_l1": [0, 0.1, 0.4, 0.5, 0.6],
        "lambda_l2": [0, 10, 15, 35, 40],
        "colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1],
        "reg_alpha": [0, 0.01, 0.02, 0.05, 0.09, 0.1, 1],
        "reg_lambda": [0, 0.1, 0.5, 1],
        "cat_smooth": [1, 10, 15, 20, 35]
    }
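    # NOTE: the full Cartesian product of this grid runs into the billions of
    # combinations, which is why RandomizedSearchCV (below) samples candidates
    # instead of enumerating them exhaustively with GridSearchCV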
    # map the requested metric onto a scikit-learn scorer name
    scoring_map = {
        "acc": "accuracy",
        "prec": "precision",
        "recall": "recall",
        "f1": "f1_macro",
        "pr_auc": "average_precision",
        "roc_auc": "roc_auc",
    }
    feval = scoring_map.get(args.max_metric_type, "f1_macro")
    param = get_model_param(args)
    if args.output_mode in ["binary_class", "binary-class"]:
        # up-weight the positive class for imbalanced binary tasks
        class_weight = {0: 1, 1: args.pos_weight}
    else:
        class_weight = None
    gbm = LGBMClassifier(
        boosting_type=param["boosting_type"],
        num_leaves=param["num_leaves"],
        max_depth=param["max_depth"],
        learning_rate=param["learning_rate"],
        n_estimators=100,
        subsample_for_bin=200000,
        objective=param["objective"],
        num_class=1 if args.num_labels == 2 else args.num_labels,  # binary objective expects num_class=1
        class_weight=class_weight,
        min_split_gain=0,
        min_child_weight=1e-3,
        min_child_samples=20,
        subsample=1,
        subsample_freq=0,
        colsample_bytree=1.0,
        reg_alpha=0.0,
        reg_lambda=0.0,
        random_state=args.seed,
        n_jobs=-1,
        silent="warn",  # lightgbm 3.x sklearn API; this parameter was removed in 4.0
        importance_type="split"
    )
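    # the constructor values above are starting defaults only: every key listed
    # in `parameters` is overridden when the search draws a candidate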

    # combine train and dev so the search can score on a predefined split:
    # the dev set acts as the single validation fold instead of k-fold CV
    train_val_features = np.concatenate((train_dataset.get_data(), dev_dataset.get_data()), axis=0)
    train_val_labels = np.concatenate((train_dataset.get_label(), dev_dataset.get_label()), axis=0)

    # PredefinedSplit: -1 marks samples that never appear in a validation fold
    # (the training portion); the remaining zeros mark the dev samples as fold 0
    dev_fold = np.zeros(train_val_features.shape[0])
    dev_fold[:train_dataset.get_data().shape[0]] = -1
    ps = PredefinedSplit(test_fold=dev_fold)

    # gsearch = GridSearchCV(gbm, param_grid=parameters, scoring=feval, cv=ps)
    gsearch = RandomizedSearchCV(gbm, param_distributions=parameters, scoring=feval, cv=ps,
                                 random_state=args.seed)  # seed the sampler for reproducibility
    gsearch.fit(train_val_features,
                train_val_labels,
                eval_set=[(dev_dataset.get_data(), dev_dataset.get_label())],
                eval_metric=['auc', 'binary_logloss'],
                # early_stopping_rounds/verbose as fit kwargs require lightgbm < 4.0;
                # on 4.x pass callbacks=[lightgbm.early_stopping(...), lightgbm.log_evaluation(...)]
                early_stopping_rounds=args.early_stopping_rounds if args.early_stopping_rounds > 0 else None,
                verbose=1)
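    # refit=True (the RandomizedSearchCV default) retrains the best candidate on
    # the combined train+dev data, so best_estimator_ below is fully fitted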

    logger.info("Best score[%s]: %0.6f" % (args.max_metric_type, gsearch.best_score_))
    logger.info("Best parameters set:")
    log_fp.write("Best score[%s]: %0.6f\n" % (args.max_metric_type, gsearch.best_score_))
    log_fp.write("Best parameters set:\n")
    best_parameters = gsearch.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        logger.info("\t%s: %r" % (param_name, best_parameters[param_name]))
        log_fp.write("%s: %r\n" % (param_name, best_parameters[param_name]))
    log_fp.write("#" * 50 + "\n")
    # the best n_estimators doubles as the "global step" used for checkpoint naming
    global_step = best_parameters["n_estimators"]
    prefix = "checkpoint-{}".format(global_step)
    checkpoint = os.path.join(args.output_dir, prefix)
    os.makedirs(checkpoint, exist_ok=True)
    # persist the whole search object (cv_results_, best params, refit estimator)
    joblib.dump(gsearch, os.path.join(args.output_dir, "lgbm_gridsearch.pkl"))

    if log_fp:
        log_fp.write(str(gsearch.best_estimator_) + "\n" + "#" * 50 + "\n")

    tr_loss = 0  # placeholder; the sklearn-style fit does not expose a training loss
    max_metric_model_info = {"global_step": global_step}
    save_model(gsearch.best_estimator_, os.path.join(checkpoint, "lgbm_model.txt"))
    with open(os.path.join(checkpoint, "config.json"), "w") as f:
        json.dump(best_parameters, f)
    return global_step, tr_loss, max_metric_model_info
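

Below is a minimal usage sketch. ToyDataset is a hypothetical stand-in for the project's dataset class (only the get_data()/get_label() accessors used above are assumed), and the argparse.Namespace fields mirror the attributes this function reads; note that the project-local get_model_param(args) may expect additional fields not shown here.

import argparse
import numpy as np


class ToyDataset:
    # hypothetical stand-in exposing the two accessors the function relies on
    def __init__(self, features, labels):
        self._features, self._labels = features, labels

    def get_data(self):
        return self._features

    def get_label(self):
        return self._labels


rng = np.random.default_rng(0)
train = ToyDataset(rng.normal(size=(200, 16)), rng.integers(0, 2, size=200))
dev = ToyDataset(rng.normal(size=(50, 16)), rng.integers(0, 2, size=50))

args = argparse.Namespace(
    max_metric_type="f1",
    output_mode="binary_class",
    pos_weight=2.0,
    num_labels=2,
    seed=42,
    early_stopping_rounds=10,
    output_dir="./output",
)
global_step, tr_loss, info = train_with_grid_search(args, train, dev)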