# bindings/python-compute/performance_comparison.py
import numpy as np
import pandas as pd
from scipy.stats import gaussian_kde  # scipy.stats.kde is a private path removed in modern SciPy
from sklearn.cluster import AgglomerativeClustering, KMeans

from .utils import compute_filter
# Percentiles reported for every model/segment combination.
percentile_list = [1, 10, 25, 50, 75, 90, 99]


def percentiles_func(x):
    """Return the values of *x* at each percentile in ``percentile_list``.

    PEP 8 (E731): a named lambda is replaced by a ``def`` so tracebacks
    show a real function name. Behavior is unchanged.
    """
    return [np.percentile(x, p) for p in percentile_list]
def density_func(col):
    """Estimate the density of *col* with a Gaussian KDE.

    Returns a two-element list ``[grid, density]`` where ``grid`` is 100
    evenly spaced points spanning [min(col), max(col)] and ``density`` is
    the KDE evaluated on that grid, both as plain Python lists.
    """
    grid = np.linspace(np.min(col), np.max(col), num=100)
    estimate = gaussian_kde(col)(grid)
    return [grid.tolist(), estimate.tolist()]
def get_independent_preds(df_columns, models=None):
    """Return the prediction columns that are mutually independent.

    Columns come in two flavors:
      * loss_df:  'model_<modelId>'                (two '_'-separated parts)
      * pred_df:  'modelClass_<modelId>_<classId>' (three parts)

    For pred_df, the per-model class probabilities sum to 1, so class 0 is
    redundant: for N1 models over N2 classes there are N1 * (N2 - 1)
    independent columns (every class except '_0').

    models: optional list of model ids; when non-empty, restrict the result
    to those models. Returns [] when no model columns are present
    (previously this raised IndexError on ``columns[0]``).
    """
    columns = [c for c in df_columns
               if c.startswith('model_') or c.startswith('modelClass_')]
    if not columns:
        return []
    if len(columns[0].split('_')) < 3:
        # loss_df: one loss column per model, all independent.
        indep_cols_all = columns
    else:
        # pred_df: drop the redundant class-0 probability columns unless
        # every column is class 0 (single-class edge case).
        non_class0 = [c for c in columns if not c.endswith('_0')]
        if non_class0:
            indep_cols_all = [c for c in columns
                              if c.startswith('modelClass_') and not c.endswith('_0')]
        else:
            indep_cols_all = columns
    if models:
        return [c for c in indep_cols_all if int(c.split('_')[1]) in models]
    return indep_cols_all
class PerformanceComparison(object):
    """Compare per-model performance across data segments.

    Segments come either from KMeans clustering of the per-model metric
    columns, or from explicit user-supplied filters.
    """

    def __init__(self, pred_df, loss_df, feature_df, uuid, model_meta):
        """
        pred_df:    per-model class probability columns ('modelClass_<m>_<c>')
        loss_df:    per-model loss columns ('model_<m>')
        feature_df: raw feature columns (used by explicit segment filters)
        uuid:       array-like of data-point ids, row-aligned with the dataframes
        model_meta: mapping 'model_<m>' -> human-readable model name
        """
        self.feature_df = feature_df
        self.pred_df = pred_df
        self.loss_df = loss_df
        self.uuid = uuid
        self.model_meta = model_meta
        # The fields below are populated by set_params().
        self.metric = None
        self.metric_df = None
        self.n_clusters = None
        self.n_segments = None
        self.segment_ids = None
        self.clustering_columns = None
        self.segment_filters = None

    def set_params(self, n_clusters=None, metric='performance', base_models=None, segment_filters=None):
        """Configure segmentation and (re)compute segment assignments.

        A non-empty segment_filters switches to manual segmentation;
        otherwise rows are clustered into n_clusters segments.
        """
        is_manual = bool(segment_filters)
        if self.should_compute_metric(metric):
            self.metric = metric
            metric_df = self.loss_df.copy() if self.metric == 'performance' else self.pred_df.copy()
            self.ipd = get_independent_preds(metric_df.columns)
            # todo: consider cases with more than one class
            self.metric_df = metric_df[self.ipd].rename(
                columns={cc: 'model_' + cc.split('_')[1] for cc in self.ipd})
        if is_manual:
            self.segment_filters = segment_filters
            self.n_segments = len(segment_filters)
            self.compute_explicit_segments()
        else:
            self.n_clusters = n_clusters
            self.n_segments = n_clusters
            self.clustering_columns = get_independent_preds(self.metric_df.columns, base_models)
            # todo: no need to recompute each time; reuse the fitted model's children_
            self.compute_clusters()

    def should_compute_metric(self, metric):
        """Return True when the metric dataframe must be (re)built for *metric*."""
        if self.metric_df is None:
            return True
        return self.metric != metric

    def compute_clusters(self):
        """Cluster metric_df rows into n_clusters segments with KMeans.

        Cluster ids are then remapped so they ascend with the cluster's
        median 'model_0' metric, making segment ids comparable across runs.
        """
        # self.clustering_model = AgglomerativeClustering(
        #     n_clusters=self.n_clusters, affinity='euclidean', linkage='ward', compute_full_tree=True)
        # NOTE: precompute_distances was deprecated in scikit-learn 0.23 and
        # later removed; KMeans now manages that internally.
        self.clustering_model = KMeans(n_clusters=self.n_clusters, random_state=0)
        segment_ids = self.clustering_model.fit_predict(self.metric_df[self.clustering_columns])
        self.metric_df['clusters'] = segment_ids
        # Columns are already named 'model_<id>' (renamed in set_params), so
        # the per-cluster medians can be taken directly.
        median_df = self.metric_df.groupby('clusters').median()
        sorted_ind = median_df['model_0'].sort_values(ascending=True).index
        cluster_id_map = {old: new for new, old in enumerate(sorted_ind)}
        self.metric_df['clusters'] = self.metric_df['clusters'].map(cluster_id_map)
        self.segment_ids = self.metric_df['clusters']

    def compute_explicit_segments(self):
        """Assign each row a segment id from segment_filters.

        Rows matching no filter stay in segment 0; when filters overlap,
        the last matching filter wins.
        """
        full_df = pd.concat([self.pred_df, self.loss_df, self.feature_df], axis=1)
        self.segment_ids = np.zeros(full_df.shape[0])
        for i, filters in enumerate(self.segment_filters):
            filter_for_segment = compute_filter(filters, full_df)
            self.segment_ids[filter_for_segment] = i

    def get_segment_ids(self):
        """Return the per-row segment assignment (None before set_params)."""
        return self.segment_ids

    def get_models_performance_by_segment(self):
        """Return a list of per-segment dicts with per-model stats.

        Each entry carries the segment id, its size, the member data ids,
        and for every model the metric percentiles and a KDE density curve.
        """
        # The model columns are the same for every segment; compute once.
        model_columns = [c for c in self.metric_df.columns if c.startswith('model_')]
        segments_list = []
        for s in range(self.n_segments):
            segment_mask = (self.segment_ids == s)
            segment_dict = {
                'segmentId': 'segment_' + str(s),
                'numDataPoints': segment_mask.sum(),
                'dataIds': self.uuid[segment_mask].tolist()
            }
            models_df = self.metric_df.loc[segment_mask, model_columns]
            models_list = []
            for m in model_columns:
                models_list.append({
                    'modelId': m,
                    'modelName': self.model_meta[m],
                    'percentiles': percentiles_func(models_df[m]),
                    'density': density_func(models_df[m])
                })
            segment_dict['modelsPerformance'] = models_list
            segments_list.append(segment_dict)
        return segments_list