in src/utils.py [0:0]
def compute_nan_stats(ref_df, df, col_type, p_value_threshold=0.05):
    """Compare the share of missing values in ``df`` against ``ref_df``.

    Args:
        ref_df: Reference data. Must support ``len()``, ``.isna()`` and
            ``.dropna()`` (presumably a single pandas Series/column --
            verify against callers).
        df: Current data to compare with the reference; same requirements
            as ``ref_df``.
        col_type: ``ColType.NUMERICAL`` or ``ColType.CATEGORICAL``; selects
            which statistical test decides ``drift_na``.
        p_value_threshold: Significance level below which a test result is
            treated as significant. Defaults to 0.05.

    Returns:
        A ``(nan_diff, drift_na)`` tuple. ``nan_diff`` is the absolute
        difference between the NaN fractions of ``df`` and ``ref_df``,
        multiplied by 100. ``drift_na`` is the truth value of the drift
        test described below.

    Raises:
        ValueError: If ``col_type`` is neither of the two supported values.
    """
    df_count = len(df)
    df_nan_ratio = df.isna().sum() / df_count
    ref_nan_ratio = ref_df.isna().sum() / len(ref_df)
    nan_diff = abs(df_nan_ratio - ref_nan_ratio) * 100

    if col_type == ColType.NUMERICAL:
        # Run the two-sample KS test with and without the missing values.
        # Drift is reported only when the two verdicts disagree (XOR), i.e.
        # when including/excluding NaN flips the significance of the test.
        # NOTE(review): how ks_2samp treats NaN depends on the installed
        # SciPy version (newer releases expose ``nan_policy``) -- confirm
        # that the "with NaN" p-value is meaningful in this environment.
        p_value_with_nan = ks_2samp(ref_df, df)[1]
        p_value_without_nan = ks_2samp(ref_df.dropna(), df.dropna())[1]
        significant_with_nan = p_value_with_nan < p_value_threshold
        significant_without_nan = p_value_without_nan < p_value_threshold
        drift_na = significant_with_nan ^ significant_without_nan
    elif col_type == ColType.CATEGORICAL:
        # Goodness-of-fit on the [non-NaN, NaN] counts. The observed counts
        # come from the current data; the expected counts are the reference
        # proportions scaled to the current sample size so both sum to
        # ``df_count``.
        observed_freq = [(1 - df_nan_ratio) * df_count, df_nan_ratio * df_count]
        expected_freq = [(1 - ref_nan_ratio) * df_count, ref_nan_ratio * df_count]
        # chisquare's signature is (f_obs, f_exp): current data first.
        drift_na = chisquare(observed_freq, expected_freq)[1] < p_value_threshold
    else:
        raise ValueError('Column type is neither numerical or categorical')
    return nan_diff, drift_na