def compute_nan_stats()

in src/utils.py [0:0]


def compute_nan_stats(ref_df, df, col_type, *, alpha=0.05):
    """Compare the share of missing values in ``df`` against ``ref_df``.

    Args:
        ref_df: Reference data. Presumably a single column (pandas Series),
            since ``ks_2samp`` is called on it directly -- TODO confirm
            against callers.
        df: Data to compare with the reference; same shape assumption.
        col_type: ``ColType.NUMERICAL`` or ``ColType.CATEGORICAL``; selects
            which statistical test decides ``drift_na``.
        alpha: Significance threshold applied to every p-value. Defaults to
            0.05, the value that was previously hard-coded.

    Returns:
        A tuple ``(nan_diff, drift_na)``:
        ``nan_diff`` is the absolute difference between the two NaN
        fractions, multiplied by 100 (percentage points);
        ``drift_na`` is the boolean outcome of the test described below.

    Raises:
        ValueError: If ``col_type`` is neither of the two supported values.
    """
    df_count = len(df)
    df_nan_frac = df.isna().sum() / df_count
    ref_nan_frac = ref_df.isna().sum() / len(ref_df)

    nan_diff = abs(df_nan_frac - ref_nan_frac) * 100

    if col_type == ColType.NUMERICAL:
        # NOTE(review): recent SciPy releases appear to propagate NaN from
        # ks_2samp when the input contains NaN, which would make the first
        # comparison below always False -- confirm against the pinned
        # SciPy version.
        p_value_with_nan = ks_2samp(ref_df, df)[1]
        p_value_without_nan = ks_2samp(ref_df.dropna(), df.dropna())[1]

        # XOR: flag only when keeping the NaNs flips the test's verdict.
        drift_na = (p_value_with_nan < alpha) ^ (p_value_without_nan < alpha)
    elif col_type == ColType.CATEGORICAL:
        # Expected counts are the reference proportions rescaled to the size
        # of ``df`` so that both frequency vectors sum to the same total.
        expected = [(1 - ref_nan_frac) * df_count, ref_nan_frac * df_count]
        observed = [(1 - df_nan_frac) * df_count, df_nan_frac * df_count]
        # chisquare's signature is (f_obs, f_exp); pass by keyword so the
        # reference distribution is unambiguously the expectation.
        drift_na = chisquare(f_obs=observed, f_exp=expected)[1] < alpha
    else:
        raise ValueError('Column type is neither numerical or categorical')

    return nan_diff, drift_na