in next_steps/data_science/diagnose/diagnose.py [0:0]
def compute_distribution_shift(index, df_wgt, Y, X, method, hist_len, freq=None, tic=0):
    """Score, row by row, how far the observed data X drifts from the target Y.

    Y is the target (unobserved) and X is the data (observed). Both are passed
    through ``_normalize_distribution`` and then compared with one of four
    measures, each reduced with ``sum(axis=1)`` so there is one loss per row.

    Args:
        index: Index given to the resulting ``pd.Series`` of per-row losses.
        df_wgt: Weights for averaging the per-row losses; only ``.values`` is
            read here, and the object is returned unchanged.
        Y: Target matrix; ``Y.shape[1]`` is used as the number of categories N.
        X: Observed matrix.
            NOTE(review): the normalized Y and X must support ``.multiply``
            and ``.log1p()`` -- presumably scipy sparse matrices; confirm
            against ``_normalize_distribution``.
        method: Case-insensitive measure name:
            'kl' / 'kl-divergence', 'ce' / 'cross-entropy',
            'oov' / 'out-sample items', 'tv' / 'total variation'.
        hist_len: Only echoed in the printed summary.
        freq: Only echoed in the printed summary.
        tic: Start timestamp subtracted from ``time.time()`` for the printed
            elapsed time. With the default of 0 the printed value is simply
            the current epoch time in seconds.

    Returns:
        Tuple ``(temporal_loss, df_wgt, avg_loss, loss_fmt)``: the per-row loss
        as a ``pd.Series`` on ``index``, the weights as passed in, the weighted
        average of the per-row losses, and the format string suited to the
        chosen measure ('{:.2f}' for kl/ce, '{:.1%}' for oov/tv).

    Raises:
        NotImplementedError: If ``method`` is not one of the names above
            (raised after both inputs have been normalized).
    """
    num_categories = Y.shape[1]
    target = _normalize_distribution(Y)
    observed = _normalize_distribution(X)

    measure = method.lower()

    if measure in ('kl', 'kl-divergence'):
        # log1p(d * scale) == log((1 - EPS_GREEDY) * d + EPS_GREEDY / N)
        #                     - log(EPS_GREEDY / N),
        # i.e. the log of an epsilon-smoothed mixture up to a constant. That
        # constant cancels in the difference below.
        scale = (1-EPS_GREEDY) / (EPS_GREEDY / num_categories)
        log_gap = (target * scale).log1p() - (observed * scale).log1p()
        row_loss = target.multiply(log_gap).sum(axis=1)
        loss_fmt = '{:.2f}'
    elif measure in ('ce', 'cross-entropy'):
        scale = (1-EPS_GREEDY) / (EPS_GREEDY / num_categories)
        weighted_log = target.multiply((observed * scale).log1p()).sum(axis=1)
        # Add back the log(EPS_GREEDY / N) constant dropped by the log1p form.
        # This equals the smoothed cross-entropy when each target row sums
        # to 1 (assumed of _normalize_distribution -- TODO confirm).
        row_loss = -(weighted_log + np.log(EPS_GREEDY/num_categories))
        loss_fmt = '{:.2f}'
    elif measure in ('oov', 'out-sample items'):
        # One minus the target mass that sits on entries where observed > 0.
        covered_mass = target.multiply(observed > 0).sum(axis=1)
        row_loss = 1.0 - covered_mass
        loss_fmt = '{:.1%}'
    elif measure in ('tv', 'total variation'):
        # Sum of (target - observed) restricted to entries where the target
        # exceeds the observed value.
        excess = target - observed
        row_loss = excess.multiply(target > observed).sum(axis=1)
        loss_fmt = '{:.1%}'
    else:
        raise NotImplementedError

    # Flatten the per-row reduction into a 1-D series aligned with `index`.
    temporal_loss = pd.Series(np.ravel(row_loss), index=index)
    avg_loss = np.average(temporal_loss.values, weights=df_wgt.values)

    shown_loss = loss_fmt.format(avg_loss)
    elapsed = time.time() - tic
    summary = 'temporal {}, freq={}, hist_len={}, avg_loss={}, time={:.1f}s'.format(
        method, freq, hist_len, shown_loss, elapsed,
    )
    print(summary)

    return temporal_loss, df_wgt, avg_loss, loss_fmt