in src/mlmax/monitoring.py [0:0]
def generate_statistic(X_train: pd.DataFrame, y_train: pd.Series, args=None):
    """Build per-feature baseline statistics for ``X_train``.

    Starts from the per-column statistics returned by
    ``get_dataframe_stats(X_train)`` and adds a ``"distribution"`` entry for
    every column of ``X_train``:

    * ``object`` columns get the result of ``get_cat_counts``;
    * every other column gets the result of ``get_num_distribution`` plus
      ``"skew"`` and ``"kurtosis"`` computed with pandas.

    Examples of X_train:
                age                   education  major industry code ...
        26873    28            5th or 6th grade  Business and repair services
        179865   31  Bachelors degree(BA AB BS)  Finance insurance and real estate

    Args:
        X_train: Feature frame to profile.
        y_train: Currently unused by this function.
        args: Currently unused by this function.

    Returns:
        A dict of the form
        ``{"features": [{"name": <column>, "statistic": <stats>}, ...]}``,
        with one entry per key of the mapping returned by
        ``get_dataframe_stats``.
    """
    cat_cols, num_cols = get_cols_types(X_train)
    logger.info(f"Categorical cols: {cat_cols}")
    logger.info(f"Numerical cols: {num_cols}")

    # NOTE(review): relies on get_dataframe_stats returning a mapping keyed by
    # column name whose values are mutable dicts -- confirm against the helper.
    stats_dict = get_dataframe_stats(X_train)

    for col, dtype in X_train.dtypes.to_dict().items():
        # The first positional argument of logger.debug is the format string;
        # passing the column name there with dtype as a stray %-argument makes
        # logging report a formatting error once DEBUG is enabled.
        logger.debug("%s: %s", col, dtype)

        column = X_train[col]
        if dtype == "object":
            dist = get_cat_counts(column)
        else:
            dist = get_num_distribution(column)
            # Shape statistics are only computed for non-object columns.
            stats_dict[col]["skew"] = column.skew()
            stats_dict[col]["kurtosis"] = column.kurtosis()
        stats_dict[col]["distribution"] = dist

    # Reformat the {column: stats} mapping into a list of named records.
    features = [
        {"name": name, "statistic": stats} for name, stats in stats_dict.items()
    ]
    return {"features": features}