def run()

in src/baselines/xgb.py


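The function below is an excerpt; the surrounding module is assumed to provide roughly the following imports and module-level logger (a sketch inferred from usage — helpers such as main, load_dataset, trainer, train_with_grid_search, evaluate, predict, load_model and save_labels come from elsewhere in the repo):

import os
import json
import logging

import numpy as np
import torch
import xgboost

logger = logging.getLogger(__name__)
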
def run():
    args = main()  # main() is assumed to parse and return the command-line arguments
    logging.basicConfig(format="%(asctime)s-%(levelname)s-%(name)s | %(message)s",
                        datefmt="%Y/%m/%d %H:%M:%S",
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    # refuse to overwrite a non-empty output dir unless --overwrite_output_dir is set
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
    else:
        os.makedirs(args.output_dir, exist_ok=True)
    # create logger dir
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
    log_fp = open(os.path.join(args.log_dir, "logs.txt"), "w")
    # create tensorboard logger dir
    if not os.path.exists(args.tb_log_dir):
        os.makedirs(args.tb_log_dir)

    X_train, y_train, label_list = load_dataset(args, "train")
    feature_size = X_train.shape[1]
    feature_names = ["f_%d" % idx for idx in range(feature_size)]
    train_dataset = xgboost.DMatrix(X_train, y_train,
                                    feature_names=feature_names, feature_types=["float"] * feature_size)
    X_dev, y_dev, _ = load_dataset(args, "dev")
    dev_dataset = xgboost.DMatrix(X_dev, y_dev,
                                  feature_names=feature_names, feature_types=["float"] * feature_size)

    X_test, y_test, _ = load_dataset(args, "test")
    test_dataset = xgboost.DMatrix(X_test, y_test,
                                   feature_names=feature_names, feature_types=["float"] * feature_size)

    # infer the number of classes from the unique training labels
    args.num_labels = len(np.unique(y_train))
    # output training/evaluation hyperparameters into logger
    logger.info("==== Training/Evaluation Parameters: =====")
    for attr, value in sorted(args.__dict__.items()):
        logger.info("\t{}={}".format(attr, value))
    logger.info("==== Parameters End =====\n")

    # dump the arguments to the log file, skipping "device" (typically a torch.device, which is not JSON-serializable)
    args_dict = {}
    for attr, value in sorted(args.__dict__.items()):
        if attr != "device":
            args_dict[attr] = value
    log_fp.write(json.dumps(args_dict, ensure_ascii=False) + "\n")
    log_fp.write("#" * 50 + "\n")

    if args.task_type in ["binary_class", "binary-class"]:
        n_train_pos = y_train.sum()
        n_dev_pos = y_dev.sum()
        n_test_pos = y_test.sum()
        logger.info("Train data, positive: {}, negative: {}, positive ratio: {:.2f}".format(n_train_pos, len(y_train) - n_train_pos, n_train_pos / len(y_train)))
        logger.info("Dev data, positive: {}, negative: {}, positive ratio: {:.2f}".format(n_dev_pos, len(y_dev) - n_train_pos, n_dev_pos / len(y_dev)))
        logger.info("Test data, positive: {}, negative: {}, positive ratio: {:.2f}".format(n_test_pos, len(y_test) - n_test_pos, n_test_pos / len(y_test)))
        log_fp.write("Train data, positive: {}, negative: {}, positive ratio: {:.2f}\n".format(n_train_pos, len(y_train) - n_train_pos, n_train_pos / len(y_train)))
        log_fp.write("Dev data, positive: {}, negative: {}, positive ratio: {:.2f}\n".format(n_dev_pos, len(y_dev) - n_train_pos, n_dev_pos / len(y_dev)))
        log_fp.write("Test data, positive: {}, negative: {}, positive ratio: {:.2f}\n".format(n_test_pos, len(y_test) - n_test_pos, n_test_pos / len(y_test)))
        log_fp.write("#" * 50 + "\n")
    log_fp.write("num labels: %d\n" % args.num_labels)
    log_fp.write("#" * 50 + "\n")
    log_fp.flush()

    max_metric_model_info = None
    if args.do_train:
        logger.info("++++++++++++Training+++++++++++++")
        if args.grid_search:
            global_step, tr_loss, max_metric_model_info = train_with_grid_search(args, X_train, y_train, X_dev, y_dev, log_fp=log_fp)
        else:
            global_step, tr_loss, max_metric_model_info = trainer(args, train_dataset, dev_dataset, log_fp=log_fp)
        logger.info("global_step = %s, average loss = %s", global_step, tr_loss)

    #  save
    if args.do_train:
        logger.info("++++++++++++Save Model+++++++++++++")
        # Create output directory if needed
        global_step = max_metric_model_info["global_step"]
        logger.info("max %s global step: %d" % (args.max_metric_type, global_step))
        log_fp.write("max %s global step: %d\n" % (args.max_metric_type, global_step))
        prefix = "checkpoint-{}".format(global_step)
        checkpoint = os.path.join(args.output_dir, prefix)
        logger.info("Saving model checkpoint to %s", checkpoint)
        torch.save(args, os.path.join(checkpoint, "training_args.bin"))
        save_labels(os.path.join(checkpoint, "label.txt"), label_list)

    # evaluation (relies on max_metric_model_info produced by training above)
    if args.do_eval and args.local_rank in [-1, 0]:
        logger.info("++++++++++++Validation+++++++++++++")
        log_fp.write("++++++++++++Validation+++++++++++++\n")
        global_step = max_metric_model_info["global_step"]
        logger.info("max %s global step: %d" % (args.max_metric_type, global_step))
        log_fp.write("max %s global step: %d\n" % (args.max_metric_type, global_step))
        prefix = "checkpoint-{}".format(global_step)
        checkpoint = os.path.join(args.output_dir, prefix)
        logger.info("checkpoint path: %s" % checkpoint)
        log_fp.write("checkpoint path: %s\n" % checkpoint)
        xgb_model = load_model(os.path.join(checkpoint, "xgb_model.txt"))
        result = evaluate(args, xgb_model, dev_dataset, X_dev, y_dev, prefix=prefix, log_fp=log_fp)
        result = dict(("evaluation_" + k + "_{}".format(global_step), v) for k, v in result.items())
        logger.info(json.dumps(result, ensure_ascii=False))
        log_fp.write(json.dumps(result, ensure_ascii=False) + "\n")

    # Testing
    if args.do_predict and args.local_rank in [-1, 0]:
        logger.info("++++++++++++Testing+++++++++++++")
        log_fp.write("++++++++++++Testing+++++++++++++\n")
        global_step = max_metric_model_info["global_step"]
        logger.info("max %s global step: %d" % (args.max_metric_type, global_step))
        log_fp.write("max %s global step: %d\n" % (args.max_metric_type, global_step))
        prefix = "checkpoint-{}".format(global_step)
        checkpoint = os.path.join(args.output_dir, prefix)
        logger.info("checkpoint path: %s" % checkpoint)
        log_fp.write("checkpoint path: %s\n" % checkpoint)
        xgb_model = load_model(os.path.join(checkpoint, "xgb_model.txt"))
        result = predict(args, xgb_model, test_dataset, X_test, y_test, prefix=prefix, log_fp=log_fp)
        result = dict(("evaluation_" + k + "_{}".format(global_step), v) for k, v in result.items())
        logger.info(json.dumps(result, ensure_ascii=False))
        log_fp.write(json.dumps(result, ensure_ascii=False) + "\n")
    log_fp.close()
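
In the full module, run() is presumably invoked from a standard entry-point guard; a minimal sketch (the actual invocation in xgb.py may differ):

if __name__ == "__main__":
    run()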