in src/utils.py [0:0]
def get_prob_dist_func(ref_df, df, col_type):
    """Build aligned probability distributions for a reference and a current column.

    For ``ColType.NUMERICAL`` columns the deciles of ``ref_df`` are used as bin
    edges and both columns are bucketed with those same edges. Values of
    ``df`` outside the reference range fall in no bin and are not counted.
    For any other column type the relative frequency of each distinct value
    is used, with NaN counted under the label ``'NaN'``.

    Args:
        ref_df: Reference column. Expected to behave like a pandas Series
            (``value_counts`` / ``pd.cut`` are applied to it).
        df: Column to compare against the reference; same expectations.
        col_type: Column type; only equality with ``ColType.NUMERICAL`` is
            checked.

    Returns:
        A 3-tuple ``(ref_probs, probs, label_count_diff)`` where ``ref_probs``
        and ``probs`` are lists of probabilities for the labels present in
        both distributions (same label order in both lists), and
        ``label_count_diff`` is the absolute difference between the number of
        labels in the two distributions.
    """
    if col_type == ColType.NUMERICAL:
        # Deciles of the reference data are the bin edges, so each bin
        # represents one probability event of the reference distribution.
        deciles = [scoreatpercentile(ref_df, q) for q in np.linspace(0, 100, 11)]
        ref_df_prob = _binned_distribution(ref_df, deciles)
        df_prob = _binned_distribution(df, deciles)
    else:
        ref_df_prob = _label_distribution(ref_df)
        df_prob = _label_distribution(df)

    # Only labels present in both data sets can be compared.
    common_labels = _sort_labels(set(df_prob.keys()) & set(ref_df_prob.keys()))
    df_filter_prob = [df_prob[k] for k in common_labels]
    ref_df_filter_prob = [ref_df_prob[k] for k in common_labels]
    label_count_diff = abs(len(df_prob.keys()) - len(ref_df_prob.keys()))
    return ref_df_filter_prob, df_filter_prob, label_count_diff


def _binned_distribution(values, bin_edges):
    """Return the normalised per-bin frequencies of ``values``, ordered by bin."""
    # include_lowest=True: bins are right-closed, so without it every sample
    # equal to the lowest edge (the reference minimum) would be discarded.
    binned = pd.cut(values, bin_edges, duplicates='drop', include_lowest=True)
    return binned.value_counts(normalize=True).sort_index()


def _label_distribution(values):
    """Return the normalised frequency of each distinct value, NaN labelled ``'NaN'``."""
    prob = values.value_counts(normalize=True, dropna=False)
    labels = list(prob.index)
    if any(_is_nan(label) for label in labels):
        # NaN never compares equal to itself, so a ``{np.nan: ...}`` rename
        # only matches labels that are the very same object as ``np.nan``.
        # Test each label explicitly so the NaN bucket is always relabelled
        # and can be matched between the two data sets.
        relabelled = ['NaN' if _is_nan(label) else label for label in labels]
        prob.index = pd.Index(relabelled, name=prob.index.name, tupleize_cols=False)
    return prob


def _is_nan(label):
    """Return True when ``label`` is a floating-point NaN."""
    return isinstance(label, (float, np.floating)) and bool(np.isnan(label))


def _sort_labels(labels):
    """Return ``labels`` as a deterministically ordered list."""
    try:
        return sorted(labels)
    except TypeError:
        # Mixed label types (e.g. numbers next to the 'NaN' placeholder) are
        # not mutually orderable; fall back to a type-name/string ordering,
        # which is enough to keep both probability lists aligned.
        return sorted(labels, key=lambda label: (type(label).__name__, str(label)))