in scripts/interpret.py [0:0]
def train_xgb(topn=10):
    """Train an XGBoost model on the non-text (one-hot + numerical) features
    and plot the top feature importances.

    Loads the preprocessed train/test splits from ``model_dir``, drops the
    text-embedding columns (everything past the one-hot feature columns),
    fits a binary classifier, and shows a horizontal bar chart of the
    ``topn`` most important features.

    Args:
        topn: Number of top features to display in the importance plot.

    Returns:
        list[float]: Predicted positive-class probabilities for the test set.
    """
    # Load one-hot feature names so we know how many leading columns are
    # structured (non-embedding) features.
    filepath = Path(model_dir, "one_hot_feature_names.json")
    numerical_feature_names, categorical_feature_names, _ = preprocess.load_feature_names(filepath)
    one_hot_feature_names = numerical_feature_names + categorical_feature_names

    # Load train/test data and keep only the structured feature columns,
    # excluding the text embeddings appended after them.
    train = pd.read_csv(Path(model_dir, "train.csv"))
    train_notext = train.iloc[:, :len(one_hot_feature_names)]
    labels = pd.read_csv(Path(model_dir, "labels.csv"))
    test = pd.read_csv(Path(model_dir, "test.csv"))
    test_notext = test.iloc[:, :len(one_hot_feature_names)]
    # NOTE(review): labels_test is loaded but not used below — kept for
    # parity with the other splits; confirm whether evaluation was intended.
    labels_test = pd.read_csv(Path(model_dir, "labels_test.csv"))

    # Train the XGBoost classifier. Ravel the labels to a 1-D array to avoid
    # the column-vector DataConversionWarning from a one-column DataFrame.
    xgb = XGBClassifier()
    xgb.fit(train_notext, np.ravel(labels))

    # Positive-class probability (column 1 of predict_proba); assumes a
    # binary classification problem. Returned as a plain list for callers.
    y_pred = xgb.predict_proba(test_notext)[:, 1].tolist()

    # Plot the topn most important features (importances sorted ascending,
    # so the last `topn` indices are the largest).
    sorted_idx = xgb.feature_importances_.argsort()
    plt.barh(np.array(one_hot_feature_names)[sorted_idx[-topn:]],
             xgb.feature_importances_[sorted_idx[-topn:]])
    plt.title("Xgboost Feature Importance")
    plt.xlabel("Importance")
    plt.show()
    return y_pred