in src/baselines/lgbm.py [0:0]
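# Module-level imports are not shown in this excerpt; the set below is inferred
# from the calls inside the function. `logger`, `get_model_param`, and
# `save_model` are project helpers defined elsewhere in the repository.
import json
import os

import joblib
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV, PredefinedSplit, RandomizedSearchCV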
def train_with_grid_search(args, train_dataset, dev_dataset, log_fp=None):
    """Tune an LGBMClassifier with a randomized hyper-parameter search, using the
    dev set as a fixed validation fold, then persist the best estimator."""
    parameters = {
        "num_leaves": [8, 12, 16, 20],
        "max_depth": [9, 15, 17, 20, 25, 30, 35, 39],
        "min_split_gain": [0, 0.05, 0.07, 0.09, 0.1, 0.3, 0.5, 0.7, 0.9, 1],
"subsample": [0.6, 0.7, 0.8, 0.8, 1],
"n_estimators": [50, 100, 200, 250, 300],
"learning_rate": [0.001, 0.01, 0.02, 0.05, 0.1, 0.15, 0.2],
"feature_fraction": [0.6, 0.7, 0.8, 0.9, 0.95],
"bagging_fraction": [0.6, 0.7, 0.8, 0.9, 0.95],
"bagging_freq": [2, 4, 5, 6, 8],
"lambda_l1": [0, 0.1, 0.4, 0.5, 0.6],
"lambda_l2": [0, 10, 15, 35, 40],
"colsample_bytree": [0.6, 0.7, 0.8, 0.9, 1],
"reg_alpha": [0, 0.01, 0.02, 0.05, 0.09, 0.1, 1],
"reg_lambda": [0, 0.1, 0.5, 1],
"cat_smooth": [1, 10, 15, 20, 35]
}
    # map the configured target metric onto a scikit-learn scorer name for the search
    if args.max_metric_type == "acc":
        feval = "accuracy"
    elif args.max_metric_type == "prec":
        feval = "precision"
    elif args.max_metric_type == "recall":
        feval = "recall"
    elif args.max_metric_type == "f1":
        feval = "f1_macro"
    elif args.max_metric_type == "pr_auc":
        feval = "average_precision"
    elif args.max_metric_type == "roc_auc":
        feval = "roc_auc"
    else:
        feval = "f1_macro"
    param = get_model_param(args)
    if args.output_mode in ["binary_class", "binary-class"]:
        # up-weight the positive class for imbalanced binary tasks
        class_weight = {0: 1, 1: args.pos_weight}
    else:
        class_weight = None
    # base estimator; values sampled from `parameters` override these defaults
    gbm = LGBMClassifier(
        boosting_type=param["boosting_type"],
        num_leaves=param["num_leaves"],
        max_depth=param["max_depth"],
        learning_rate=param["learning_rate"],
        n_estimators=100,
        subsample_for_bin=200000,
        objective=param["objective"],
        num_class=1 if args.num_labels == 2 else args.num_labels,
        class_weight=class_weight,
        min_split_gain=0,
        min_child_weight=1e-3,
        min_child_samples=20,
        subsample=1,
        subsample_freq=0,
        colsample_bytree=1.0,
        reg_alpha=0.0,
        reg_lambda=0.0,
        random_state=args.seed,
        n_jobs=-1,
        silent="warn",
        importance_type="split"
    )
    # search over train + dev with the dev rows as a fixed validation fold:
    # PredefinedSplit treats -1 as "never in the test fold", so only the dev
    # rows (fold 0) are used for scoring candidate parameter sets
    train_val_features = np.concatenate((train_dataset.get_data(), dev_dataset.get_data()), axis=0)
    train_val_labels = np.concatenate((train_dataset.get_label(), dev_dataset.get_label()), axis=0)
    dev_fold = np.zeros(train_val_features.shape[0])
    dev_fold[:train_dataset.get_data().shape[0]] = -1
    ps = PredefinedSplit(test_fold=dev_fold)
    # gsearch = GridSearchCV(gbm, param_grid=parameters, scoring=feval, cv=ps)
    gsearch = RandomizedSearchCV(gbm, param_distributions=parameters, scoring=feval, cv=ps)
    gsearch.fit(train_val_features,
                train_val_labels,
                eval_set=[(dev_dataset.get_data(), dev_dataset.get_label())],
                eval_metric=['auc', 'binary_logloss'],
                early_stopping_rounds=args.early_stopping_rounds if args.early_stopping_rounds > 0 else None,
                verbose=1)
logger.info("Best score[%s]: %0.6f" % (args.max_metric_type, gsearch.best_score_))
logger.info("Best parameters set:")
log_fp.write("Best score[%s]: %0.6f\n" % (args.max_metric_type, gsearch.best_score_))
log_fp.write("Best parameters set:\n")
best_parameters = gsearch.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
logger.info("\t%s: %r" % (param_name, best_parameters[param_name]))
log_fp.write("%s: %r\n" % (param_name, best_parameters[param_name]))
log_fp.write("#" * 50 + "\n")
global_step = best_parameters["n_estimators"]
prefix = "checkpoint-{}".format(global_step)
checkpoint = os.path.join(args.output_dir, prefix)
if not os.path.exists(checkpoint):
os.makedirs(checkpoint)
# save gridsearch
joblib.dump(gsearch, os.path.join(args.output_dir, "lgbm_gridsearch.pkl"))
log_fp.write(str(gsearch.best_estimator_) + "\n" + "#" * 50 + "\n")
tr_loss = 0
max_metric_model_info = {"global_step": global_step}
save_model(gsearch.best_estimator_, os.path.join(checkpoint, "lgbm_model.txt"))
json.dump(best_parameters, open(os.path.join(checkpoint, "config.json"), "w"))
return global_step, tr_loss, max_metric_model_info
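
# --- Usage sketch (not part of the original file) ---------------------------
# A minimal example, under the assumption that the pickle written by
# joblib.dump() above is the artifact consumed downstream; `output_dir` and
# `features` are hypothetical stand-ins for the training output directory and
# an (n_samples, n_features) array.
def load_search_and_predict(output_dir, features):
    search = joblib.load(os.path.join(output_dir, "lgbm_gridsearch.pkl"))
    # RandomizedSearchCV refits the best estimator on the full data by default,
    # so it can be used for prediction directly.
    best = search.best_estimator_
    return best.predict(features), best.predict_proba(features)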