in bindings/python-compute/feature_differentiation.py [0:0]
def get_features_distribution_by_segment_group(self):
    """Rank the columns of ``self.feature_df`` by divergence between segment groups.

    For every column, ``self.get_feature_distribution_by_segment_group`` is
    called and its result dict is annotated with a ``'divergence'`` entry:
    ``entropy(d[1] + s, d[2] + s)`` where ``d`` is the dict's
    ``'distribution'`` value and ``s = 1 / len(self.target)`` is a smoothing
    term. (Assuming ``entropy`` is ``scipy.stats.entropy``, this is the
    relative entropy between the two smoothed, normalised vectors -- confirm
    against the file's imports.)

    NOTE(review): ``d[0]`` is treated as the list of categories and
    ``d[1]`` / ``d[2]`` as the per-category values of the two segment
    groups; verify against ``get_feature_distribution_by_segment_group``.

    Features are skipped when they have at most one category, or more than
    ``max(len(self.target) / 10, 100)`` categories (e.g. uuid-like columns).

    Returns:
        A list of the annotated distribution dicts, sorted by
        ``'divergence'`` in descending order. An empty list is returned when
        ``self.target`` is empty.
    """
    sample_count = len(self.target)
    if sample_count == 0:
        # Nothing to compare; the smoothing term below would also divide by zero.
        return []

    # Small constant added to every bin so neither vector passed to
    # ``entropy`` contains a zero.
    smoothing = 1.0 / sample_count
    # Loop-invariant upper bound on the number of categories a feature may have.
    max_categories = max(sample_count / 10.0, 100)

    distribution_list = []
    for feature_name in self.feature_df.columns:
        distribution_dict = self.get_feature_distribution_by_segment_group(feature_name)
        distribution = distribution_dict['distribution']
        category_count = len(distribution[0])
        # Ignore features with too many categories, like uuid; ignore
        # features with only one category.
        if category_count <= 1 or category_count > max_categories:
            continue
        distribution_dict['divergence'] = entropy(
            np.array(distribution[1]) + smoothing,
            np.array(distribution[2]) + smoothing
        )
        distribution_list.append(distribution_dict)
    return sorted(distribution_list, key=lambda x: x['divergence'], reverse=True)