def generate_statistic()

in src/mlmax/monitoring.py [0:0]


def generate_statistic(X_train: pd.DataFrame, y_train: pd.Series, args=None):
    """Collect per-feature statistics and distributions for ``X_train``.

    For every column of ``X_train`` the base statistics returned by
    ``get_dataframe_stats`` are extended with a ``"distribution"`` entry.
    Columns whose dtype is ``object`` get their distribution from
    ``get_cat_counts``; all other columns get it from
    ``get_num_distribution`` and additionally receive ``"skew"`` and
    ``"kurtosis"`` entries computed with pandas.

    Examples of X_train:

            age                    education                 major industry code ...
    26873    28             5th or 6th grade        Business and repair services
    179865   31   Bachelors degree(BA AB BS)   Finance insurance and real estate

    Args:
        X_train: Feature frame to profile.
        y_train: Not used by this function; kept in the signature for callers.
        args: Not used by this function; kept in the signature for callers.

    Returns:
        A dict of the form
        ``{"features": [{"name": <column>, "statistic": <stats dict>}, ...]}``
        with one entry per key of the mapping returned by
        ``get_dataframe_stats``.
    """
    cat_cols, num_cols = get_cols_types(X_train)
    logger.info(f"Categorical cols: {cat_cols}")
    logger.info(f"Numerical cols: {num_cols}")

    # NOTE(review): assumes get_dataframe_stats returns a dict keyed by column
    # name with a mutable dict per column -- confirm against that helper.
    stats_dict = get_dataframe_stats(X_train)

    for col, dtype in X_train.dtypes.items():
        # Explicit format string: the first positional argument of
        # logger.debug is the message template, so passing the column name
        # there (with dtype as a format argument) breaks message formatting.
        logger.debug("%s: %s", col, dtype)
        column = X_train[col]
        if dtype == "object":
            dist = get_cat_counts(column)
        else:
            dist = get_num_distribution(column)
            # Shape statistics are only recorded for non-object columns.
            stats_dict[col]["skew"] = column.skew()
            stats_dict[col]["kurtosis"] = column.kurtosis()

        stats_dict[col]["distribution"] = dist

    # Reformat the per-column mapping into a list of named feature records.
    features_list = [
        {"name": name, "statistic": stats} for name, stats in stats_dict.items()
    ]
    return {"features": features_list}