def train_xgb()

in scripts/interpret.py [0:0]


def train_xgb(topn=10):
    """Train an XGBoost model on the non-text features and plot their importance.

    Reads the feature names from ``one_hot_feature_names.json`` and the
    ``train.csv`` / ``labels.csv`` / ``test.csv`` files from ``model_dir``,
    keeps only the leading columns that correspond to the numerical and
    categorical feature names, fits an ``XGBClassifier`` on them and shows a
    horizontal bar chart of the ``topn`` highest-importance features.

    Args:
        topn: Number of highest-importance features to plot. Defaults to 10.
            If it exceeds the number of features, all features are plotted.

    Returns:
        A list with one entry per row of ``test.csv``: the second column
        (index 1) of ``predict_proba`` for that row.

    Raises:
        ValueError: If ``topn`` is smaller than 1.
    """
    # A slice of the form ``[-0:]`` would select *every* feature, so reject
    # non-positive values instead of silently plotting the wrong thing.
    if topn < 1:
        raise ValueError(f"topn must be >= 1, got {topn}")

    # load one-hot feature names
    filepath = Path(model_dir, "one_hot_feature_names.json")
    numerical_feature_names, categorical_feature_names, _ = preprocess.load_feature_names(filepath)
    one_hot_feature_names = numerical_feature_names + categorical_feature_names
    n_features = len(one_hot_feature_names)

    # load train/test data and exclude text embeddings
    # NOTE(review): assumes the first ``n_features`` columns of train.csv and
    # test.csv are the numerical + categorical features, in the same order as
    # ``one_hot_feature_names`` -- confirm against the preprocessing step.
    train_notext = pd.read_csv(Path(model_dir, "train.csv")).iloc[:, :n_features]
    labels = pd.read_csv(Path(model_dir, "labels.csv"))
    test_notext = pd.read_csv(Path(model_dir, "test.csv")).iloc[:, :n_features]

    # train XGBoost model
    model = XGBClassifier()
    model.fit(train_notext, labels)

    # keep only the probability stored at column index 1 for each test row
    y_pred = [row[1] for row in model.predict_proba(test_notext)]

    # plot important features
    # Read the importances once and reuse them for both sorting and plotting.
    importances = model.feature_importances_
    # argsort is ascending, so the last ``topn`` indices are the most important
    # ones; barh then draws the largest bar at the top of the chart.
    top_idx = importances.argsort()[-topn:]
    plt.barh(np.array(one_hot_feature_names)[top_idx], importances[top_idx])
    plt.title("Xgboost Feature Importance")
    plt.show()

    return y_pred