def get_prob_dist_func()

in src/utils.py [0:0]


def _is_missing_label(label):
    """Return True when *label* is a scalar missing value (NaN, None, NaT, NA)."""
    flag = pd.isna(label)
    # pd.isna returns an array for list-like labels (e.g. tuples); such a
    # label is never a single missing value.
    return isinstance(flag, (bool, np.bool_)) and bool(flag)


def _label_probabilities(prob):
    """Map each label of a normalized ``value_counts`` result to its probability.

    Missing labels are folded into the string ``'NaN'``. The check uses
    ``pd.isna`` rather than a dict lookup on ``np.nan``: NaN never compares
    equal to itself, so a lookup only matches the identical ``np.nan`` object
    and misses the NaN labels of float-typed indexes.
    """
    label_probs = {}
    for label, p in prob.items():
        key = 'NaN' if _is_missing_label(label) else label
        label_probs[key] = label_probs.get(key, 0.0) + p
    return label_probs


def get_prob_dist_func(ref_df, df, col_type):
    """Build aligned discrete probability distributions for two samples.

    For ``ColType.NUMERICAL`` columns the events are the decile bins of the
    reference sample; for any other column type the events are the distinct
    values, with missing values grouped under the label ``'NaN'``.

    Args:
        ref_df: Reference sample. Despite the name it is used as a 1-D
            column: it must support ``value_counts`` and be accepted by
            ``pd.cut`` (presumably a ``pandas.Series`` -- confirm with callers).
        df: Sample to compare against the reference; same expectations.
        col_type: Column type; compared against ``ColType.NUMERICAL``.

    Returns:
        A 3-tuple ``(ref_probs, probs, label_count_diff)`` where
        ``ref_probs`` and ``probs`` are lists of probabilities for the labels
        present in both samples (in sorted label order), and
        ``label_count_diff`` is the absolute difference between the number of
        labels in each sample.
    """
    if col_type == ColType.NUMERICAL:
        # Deciles of the reference sample define the probability events.
        deciles = [scoreatpercentile(ref_df, q) for q in np.linspace(0, 100, 11)]
        # include_lowest=True closes the first bin on the left; otherwise the
        # observations equal to the reference minimum (the 0th percentile
        # edge) fall outside every bin and are silently dropped.
        ref_counts = pd.cut(
            ref_df, deciles, duplicates='drop', include_lowest=True
        ).value_counts(normalize=True)
        df_counts = pd.cut(
            df, deciles, duplicates='drop', include_lowest=True
        ).value_counts(normalize=True)
    else:
        ref_counts = ref_df.value_counts(normalize=True, dropna=False)
        df_counts = df.value_counts(normalize=True, dropna=False)

    ref_df_prob = _label_probabilities(ref_counts)
    df_prob = _label_probabilities(df_counts)

    # Find the intersection of labels from both data sets.
    shared = set(df_prob) & set(ref_df_prob)
    try:
        common_labels = sorted(shared)
    except TypeError:
        # Labels of mixed types (e.g. numbers plus the 'NaN' string) have no
        # natural ordering; fall back to a deterministic type-aware order.
        common_labels = sorted(
            shared, key=lambda label: (type(label).__name__, str(label))
        )

    df_filter_prob = [df_prob[label] for label in common_labels]
    ref_df_filter_prob = [ref_df_prob[label] for label in common_labels]

    return ref_df_filter_prob, df_filter_prob, abs(len(df_prob) - len(ref_df_prob))