in src/drift_detector.py [0:0]
def compute_drift(train_df, infer_df):
    """Build a per-feature drift report between training and inference data.

    For every column of ``train_df`` the matching column of ``infer_df`` is
    compared using the helpers in ``utils`` plus a type-specific statistical
    test:

    * numerical columns: two-sample Kolmogorov-Smirnov test and Wasserstein
      distance, both computed on the output of ``utils.normalize``;
    * categorical columns: chi-square test on the probability distributions
      scaled to the inference row count.

    Args:
        train_df: Baseline data; its columns drive the iteration
            (assumed to be a pandas DataFrame).
        infer_df: Data to compare against the baseline; it is indexed with
            every column name of ``train_df``.

    Returns:
        A ``pandas.DataFrame`` with one row per column of ``train_df``.
        Keys that are only set for one column type (``'KS'``,
        ``'wasserstein_distance'``, ``'chisquare'``, ``'p-value'``) are left
        missing for the other rows.
    """
    col_type = utils.get_column_types(train_df)
    infer_count = len(infer_df)
    drift_data = []

    for col in train_df.columns:
        kind = col_type[col]
        train_col, infer_col = train_df[col], infer_df[col]
        drift_col = {'Feature': col, 'Column Type': kind}

        train_prob, infer_prob, compute_unique_count_drift = \
            utils.get_prob_dist_func(train_col, infer_col, kind)
        drift_col['drift_score'] = utils.compute_drift_score(train_prob, infer_prob)
        drift_col['NaN % Diff'], drift_col['is_nan_signif'] = \
            utils.compute_nan_stats(train_col, infer_col, kind)

        if kind == ColType.NUMERICAL:
            train_norm_col, infer_norm_col = utils.normalize(train_col, infer_col)
            drift_col['KS'], drift_col['p-value'] = ks_2samp(train_norm_col, infer_norm_col)
            drift_col['wasserstein_distance'] = wasserstein_distance(train_norm_col, infer_norm_col)
        elif kind == ColType.CATEGORICAL:
            # Chi-square works on frequencies (ideally larger than 5), so the
            # probabilities are scaled by the size of the inference data set.
            # The scaled values are kept as floats: truncating each one to int
            # independently makes the two totals drift apart, and
            # scipy.stats.chisquare rejects inputs whose sums disagree.
            # NOTE(review): assumes both probability vectors sum to the same
            # total - confirm against utils.get_prob_dist_func.
            train_freq = [p * infer_count for p in train_prob]
            infer_freq = [p * infer_count for p in infer_prob]
            # Positional order is (f_obs, f_exp): training frequencies are the
            # observed values, inference frequencies the expected ones.
            drift_col['chisquare'], drift_col['p-value'] = chisquare(train_freq, infer_freq)

        # NOTE(review): recorded for every column type here - confirm that this
        # value is not meant to be limited to categorical columns.
        drift_col['Unique Count Drift'] = compute_unique_count_drift
        drift_col['jensenshannon'] = jensenshannon(train_prob, infer_prob)
        drift_data.append(drift_col)

    return pd.DataFrame(drift_data)