bindings/python-compute/feature_differentiation.py (122 lines of code) (raw):
import pandas as pd
import numpy as np
from scipy.stats.kde import gaussian_kde
from scipy.stats import entropy
from numpy.linalg import LinAlgError
from .data_manager import DUMMY_PREFIX_SEP
from .constants import RANGE_FILTER
# NOTE(review): CLUSTER_COL and GROUP_ID_COL are not referenced in the visible part of this
# file; presumably they are column names consumed by other modules -- confirm before changing.
CLUSTER_COL = 'clusters'
GROUP_ID_COL = 'clusterGroupId'
# number of evenly spaced points (the `num` argument of np.linspace) at which the density of
# every numerical feature is sampled
NUMERICAL_DOMAIN_INTERVAL = 100
def nearest_index(arr, val):
    """Return the index of the element of ``arr`` that is nearest to ``val``.

    ``arr`` must support element-wise subtraction of ``val`` (e.g. a numpy
    array). When several elements are equally near, the first one wins,
    because that is what ``argmin`` returns.
    """
    return (np.abs(arr - val)).argmin()
def get_single_value_distribution(domain, raw_values):
    """Build a distribution over ``domain`` that is 0 everywhere except one point.

    The first unique value of ``raw_values`` is located on ``domain`` (nearest
    grid point) and that single position is set to 1.

    :param domain: numpy array of sample points.
    :param raw_values: object exposing ``unique()`` (e.g. a pandas Series);
        only its first unique value is used.
    :return: float array with the same shape as ``domain``.
    """
    first_value = raw_values.unique()[0]
    one_hot = np.zeros(domain.shape)
    peak_position = nearest_index(domain, first_value)
    one_hot[peak_position] = 1.
    return one_hot
class FeatureDifferentiation(object):
    """Compare per-feature distributions between two groups of segments.

    On construction the feature frame is copied, the categorical features are
    identified, and an overall distribution is pre-computed for every feature.
    After ``set_params`` has labelled each row with a group, the
    ``get_feature(s)_distribution_by_segment_group`` methods return, per
    feature, the distribution in group 0 and in group 1.
    """

    def __init__(self, feature_df, categorical_features=None):
        """
        :param feature_df: pandas DataFrame with one column per feature; it is
            copied, so the caller's frame is not retained.
        :param categorical_features: optional list of column names to treat as
            categorical. When ``None`` they are inferred, see
            ``compute_categorical_features_dict``.
        """
        self.feature_df = feature_df.copy()
        self.categorical_features = categorical_features
        self.cat_dict = self.compute_categorical_features_dict()
        self.features_meta_data = self.compute_features_meta_data()

    def set_params(self, segment_group_0, segment_group_1, segment_ids):
        """Label every entry of ``segment_ids`` with its group and store it as ``self.target``.

        An id found in ``segment_group_0`` gets label 0, otherwise an id found
        in ``segment_group_1`` gets label 1, anything else gets label 2.

        NOTE(review): ``self.target`` has a default integer index and is later
        used as a boolean mask on feature columns, so ``segment_ids``
        presumably has one entry per row of ``feature_df`` in row order, and
        ``feature_df`` presumably has a default index -- confirm with callers.
        """
        self.target = pd.Series([0 if i in segment_group_0 else (1 if i in segment_group_1 else 2)
                                 for i in segment_ids])

    def compute_categorical_features_dict(self):
        """Group categorical column names by their parent feature name.

        When no categorical features were given to the constructor, they are
        inferred (and stored on ``self.categorical_features``) as every column
        that has fewer than 7 unique values or has dtype ``object``.

        :return: dict mapping the part of each column name before
            ``DUMMY_PREFIX_SEP`` to the list of full column names sharing it.
        """
        if self.categorical_features is None:
            # if data type is non-number or has small number of unique values
            self.categorical_features = [c for c in self.feature_df.columns
                                         if len(self.feature_df[c].unique()) < 7
                                         or self.feature_df.dtypes[c] == 'object']
        cat_dict = {}
        # create a dict of list, dict fields are categorical parent features,
        # and list elements are parent feature names suffixed with category names
        for c in self.categorical_features:
            # in case of features are already one-hot encoded
            parent_name = c.split(DUMMY_PREFIX_SEP)[0]
            cat_dict.setdefault(parent_name, []).append(c)
        return cat_dict

    @staticmethod
    def _fit_kde(values):
        """Return a callable density for ``values``: a gaussian KDE, or a single-value spike
        when the KDE cannot be fitted (``LinAlgError``, e.g. all values identical)."""
        try:
            return gaussian_kde(values, bw_method=0.1)
        except LinAlgError:
            return lambda domain: get_single_value_distribution(domain, values)

    @staticmethod
    def _has_unusable_cardinality(n_values, n_rows):
        """Tell whether a distribution with ``n_values`` entries should be skipped: it has at
        most one entry, or more than ``max(n_rows / 10, 100)`` entries (e.g. uuid-like)."""
        return n_values <= 1 or n_values > max(n_rows / 10., 100)

    # todo: merge histogram computation in compute_features_meta_data, compute_split_cat_count, compute_split_kde
    def compute_features_meta_data(self):
        """Compute the overall (not split by group) distribution of every feature.

        Categorical features (column name is a key of ``self.cat_dict``) get
        ``[values, counts]`` from ``value_counts(dropna=False)``. Other features
        get ``[x, 10000 * density(x)]`` where ``x`` is an evenly spaced grid
        of ``NUMERICAL_DOMAIN_INTERVAL`` points between the column min and max.
        Features whose distribution has an unusable number of entries are
        left out.

        :return: list of dicts with keys ``name``, ``type`` and ``distribution``.
        """
        features_list = []
        for feature_name in self.feature_df.columns:
            col = self.feature_df[feature_name]
            is_categorical = feature_name in self.cat_dict
            if is_categorical:
                value_counts = col.value_counts(dropna=False)
                distribution = np.stack([value_counts.index.values, value_counts.values]).tolist()
            else:
                x = np.linspace(np.min(col), np.max(col), num=NUMERICAL_DOMAIN_INTERVAL)
                kde = self._fit_kde(col)
                distribution = np.stack([x, 10000 * kde(x)]).tolist()
            if self._has_unusable_cardinality(len(distribution[0]), len(col)):
                continue
            features_list.append({
                'name': feature_name,
                'type': 'categorical' if is_categorical else 'numerical',
                'distribution': distribution
            })
        return features_list

    def compute_split_cat_count(self, col, exclude_outlier=False):
        """Compute the relative frequency of each category of ``col`` in group 0 and group 1.

        Requires ``set_params`` to have been called (reads ``self.target``).

        :param col: pandas Series holding a categorical feature. It is never
            modified.
        :param exclude_outlier: when true, categories that cover less than
            0.5% of the rows are merged into ``'OTHER_CATEGORY'`` first.
        :return: array with three rows: the categories (missing values shown
            as ``'NO_CATEGORY'``), ``10000 *`` frequency in group 0 and
            ``10000 *`` frequency in group 1, ordered by ascending smoothed
            ratio of group-0 to group-1 frequency.
        """
        if exclude_outlier:
            cc = col.value_counts()
            mask = col.isin(cc.index[cc < len(col) * .005])
            # build a new Series instead of assigning through .loc, so the caller's data
            # (possibly a column of self.feature_df) is left untouched; object dtype lets the
            # placeholder string be stored whatever the original dtype was
            col = col.astype(object).where(~mask, 'OTHER_CATEGORY')
        cc0 = col[self.target == 0].value_counts(normalize=True, dropna=False)
        cc1 = col[self.target == 1].value_counts(normalize=True, dropna=False)
        count_df = pd.concat([cc0, cc1], axis=1).fillna(0)
        count_df.columns = [0, 1]
        count_df.index = count_df.index.fillna('NO_CATEGORY')
        # additive smoothing so a category absent from group 1 does not divide by zero
        buffer = 1. / float(len(self.target))
        count_df['ratio'] = (count_df[0] + buffer) / (count_df[1] + buffer)
        count_df = count_df.sort_values('ratio')
        return np.stack((count_df.index.values, 10000 * count_df[0].values, 10000 * count_df[1].values))

    def compute_split_kde(self, col, exclude_outlier=True):
        """Estimate the density of a numerical feature separately in group 0 and group 1.

        Requires ``set_params`` to have been called (reads ``self.target``).
        The evaluation range is ``RANGE_FILTER[col.name]`` when that lookup
        succeeds; otherwise the 1st-99th percentile of ``col`` when
        ``exclude_outlier`` is true, or its min-max when false.

        :param col: pandas Series holding a numerical feature.
        :param exclude_outlier: see above.
        :return: array with three rows: the grid ``x`` of
            ``NUMERICAL_DOMAIN_INTERVAL`` points, ``10000 *`` density of
            group 0 on ``x`` and ``10000 *`` density of group 1 on ``x``.
        """
        try:
            col_value_range = RANGE_FILTER[col.name]
        except (LookupError, TypeError):
            # no predefined range for this feature: derive one from the data
            if exclude_outlier:
                col_value_range = [np.percentile(col, 1), np.percentile(col, 99)]
            else:
                col_value_range = [np.min(col), np.max(col)]
        x = np.linspace(col_value_range[0], col_value_range[1], num=NUMERICAL_DOMAIN_INTERVAL)
        # could also use pd.Series.value_counts, (use bins)
        # if unique values in array <=1, kde will have exception
        kde0 = self._fit_kde(col[self.target == 0])
        kde1 = self._fit_kde(col[self.target == 1])
        return np.stack((x, 10000 * kde0(x), 10000 * kde1(x)))

    def get_features_meta_data(self):
        """Return the per-feature overall distributions computed at construction."""
        return self.features_meta_data

    def get_feature_distribution_by_segment_group(self, feature_name):
        """Return the group-0 / group-1 distribution of one feature.

        The part of ``feature_name`` before ``DUMMY_PREFIX_SEP`` is used both
        as the reported name and as the column looked up in ``feature_df``.
        NOTE(review): this means a one-hot encoded column is only supported if
        its parent name is itself a column of ``feature_df`` -- confirm.

        :return: dict with keys ``name``, ``distribution`` (nested list, see
            ``compute_split_cat_count`` / ``compute_split_kde``) and ``type``
            (``'categorical'`` or ``'numerical'``).
        """
        distribution_dict = {}
        main_name = feature_name.split(DUMMY_PREFIX_SEP)[0]
        distribution_dict['name'] = main_name
        if main_name in self.cat_dict:
            distribution_dict['distribution'] = self.compute_split_cat_count(self.feature_df[main_name]).tolist()
            distribution_dict['type'] = 'categorical'
        else:
            distribution_dict['distribution'] = self.compute_split_kde(self.feature_df[main_name]).tolist()
            distribution_dict['type'] = 'numerical'
        return distribution_dict

    def get_features_distribution_by_segment_group(self):
        """Return the group-0 / group-1 distributions of all features, most divergent first.

        Each dict from ``get_feature_distribution_by_segment_group`` gets an
        extra ``divergence`` key: ``scipy.stats.entropy`` of the group-0
        distribution relative to the group-1 distribution, both smoothed by
        ``1 / len(self.target)``. Features with an unusable number of
        distribution entries are left out.
        """
        distribution_list = []
        buffer = 1. / float(len(self.target))
        for feature_name in self.feature_df.columns:
            distribution_dict = self.get_feature_distribution_by_segment_group(feature_name)
            distribution = distribution_dict['distribution']
            # ignore features with too many categories, like uuid; ignore features with only one category
            if self._has_unusable_cardinality(len(distribution[0]), len(self.target)):
                continue
            distribution_dict['divergence'] = entropy(
                np.array(distribution[1]) + buffer,
                np.array(distribution[2]) + buffer
            )
            distribution_list.append(distribution_dict)
        return sorted(distribution_list, key=lambda x: x['divergence'], reverse=True)