def get_features_distribution_by_segment_group()

in bindings/python-compute/feature_differentiation.py [0:0]


    def get_features_distribution_by_segment_group(self):
        """Rank features by how differently they are distributed across segment groups.

        For every column of ``self.feature_df`` the per-feature result of
        ``self.get_feature_distribution_by_segment_group`` is fetched, features
        with an unusable number of categories are dropped, and a
        ``'divergence'`` score is stored on each remaining result dict.

        The score is ``entropy(p, q)`` of entries 1 and 2 of the
        ``'distribution'`` value, each shifted by ``1 / len(self.target)``.
        NOTE(review): assuming ``entropy`` is ``scipy.stats.entropy``, this is
        the KL divergence of the two smoothed distributions, and the shift
        keeps zero entries from producing an infinite score -- confirm against
        the file's imports.

        Returns:
            list of dict: the per-feature dicts, each extended with a
            ``'divergence'`` key, sorted by descending divergence. The list is
            empty when ``self.target`` has no rows.
        """
        target_size = len(self.target)
        if target_size == 0:
            # No observations to compare; also avoids dividing by zero below.
            return []

        # Smoothing term added to every entry of both distributions.
        buffer = 1. / float(target_size)
        # Upper bound on the category count; constant across features, so it
        # is computed once instead of on every iteration.
        max_categories = max(target_size / 10., 100)

        distribution_list = []
        for feature_name in self.feature_df.columns:
            distribution_dict = self.get_feature_distribution_by_segment_group(feature_name)
            distribution = distribution_dict['distribution']

            # ignore features with too many categories, like uuid; ignore features with only one category
            # (entry 0 is presumably the list of category labels -- verify against the helper)
            category_count = len(distribution[0])
            if category_count <= 1 or category_count > max_categories:
                continue

            distribution_dict['divergence'] = entropy(
                np.array(distribution[1]) + buffer,
                np.array(distribution[2]) + buffer
            )
            distribution_list.append(distribution_dict)
        return sorted(distribution_list, key=lambda x: x['divergence'], reverse=True)