def compute_distribution_shift()

in next_steps/data_science/diagnose/diagnose.py [0:0]


def compute_distribution_shift(index, df_wgt, Y, X, method, hist_len, freq=None, tic=0):
    """Score, row by row, how far the observed data X is from the target Y.

    Y is the target (unobserved) and X is the data (observed). Both are passed
    through ``_normalize_distribution`` and the results (``p`` for Y, ``q`` for
    X) are compared with the requested ``method``.

    Supported ``method`` values (matched case-insensitively):
        'kl'  / 'kl-divergence'    : sum over columns of p * (log_p - log_q)
        'ce'  / 'cross-entropy'    : -(sum over columns of p * log_q + log(EPS_GREEDY / N))
        'oov' / 'out-sample items' : 1 - sum over columns of p where q > 0
        'tv'  / 'total variation'  : sum over columns of (p - q) where p > q
    where log_p / log_q are ``log1p`` of the distribution scaled by
    ``(1 - EPS_GREEDY) / (EPS_GREEDY / N)`` and N is ``Y.shape[1]``.

    Args:
        index: index given to the returned per-row ``pd.Series``.
        df_wgt: object whose ``.values`` are used as the averaging weights;
            it is returned unchanged.
        Y: target (unobserved); must expose ``.shape``.
        X: data (observed).
        method: name of the loss, see the list above.
        hist_len: only echoed in the printed summary.
        freq: only echoed in the printed summary.
        tic: start timestamp subtracted from ``time.time()`` in the summary.
            NOTE: with the default ``tic=0`` the printed "time" is the raw
            epoch timestamp, not an elapsed duration.

    Returns:
        Tuple ``(temporal_loss, df_wgt, avg_loss, loss_fmt)``: the per-row
        loss as a ``pd.Series``, the weights as passed in, the weighted
        average of the loss, and the format string used to print it.

    Raises:
        NotImplementedError: if ``method`` is not one of the supported names.
    """
    # Resolve the method before touching the data so that a mistyped name
    # fails immediately, without paying for normalization first.
    aliases = {
        'kl': 'kl', 'kl-divergence': 'kl',
        'ce': 'ce', 'cross-entropy': 'ce',
        'oov': 'oov', 'out-sample items': 'oov',
        'tv': 'tv', 'total variation': 'tv',
    }
    kind = aliases.get(method.lower())
    if kind is None:
        raise NotImplementedError(
            'unsupported method {!r}; expected one of {}'.format(method, sorted(aliases))
        )

    N = Y.shape[1]
    # NOTE(review): the .log1p() / .multiply() calls below suggest p and q are
    # scipy.sparse matrices -- confirm against _normalize_distribution.
    p = _normalize_distribution(Y)
    q = _normalize_distribution(X)

    if kind == 'kl':
        # log1p(d * eps_ratio) == log((1-EPS_GREEDY)*d + EPS_GREEDY/N) - log(EPS_GREEDY/N);
        # the constant offset cancels in the log_p - log_q difference.
        eps_ratio = (1-EPS_GREEDY) / (EPS_GREEDY / N)
        log_p = (p * eps_ratio).log1p()
        log_q = (q * eps_ratio).log1p()
        temporal_loss = (p.multiply(log_p - log_q)).sum(axis=1)
        loss_fmt = '{:.2f}'

    elif kind == 'ce':
        eps_ratio = (1-EPS_GREEDY) / (EPS_GREEDY / N)
        log_q = (q * eps_ratio).log1p()
        # The log(EPS_GREEDY/N) offset is added back once per row, which
        # presumably relies on every row of p summing to 1 -- TODO confirm.
        temporal_loss = -((p.multiply(log_q)).sum(axis=1) + np.log(EPS_GREEDY/N))
        loss_fmt = '{:.2f}'

    elif kind == 'oov':
        # Share of p that sits on columns where q is zero.
        temporal_loss = 1.0 - (p.multiply(q > 0)).sum(axis=1)
        loss_fmt = '{:.1%}'

    else:  # kind == 'tv'
        # Only the positive part of (p - q) is summed.
        temporal_loss = (p-q).multiply(p > q).sum(axis=1)
        loss_fmt = '{:.1%}'

    # Flatten the per-row sums into a 1-D Series aligned with `index`.
    temporal_loss = pd.Series(np.ravel(temporal_loss), index=index)

    avg_loss = np.average(temporal_loss.values, weights=df_wgt.values)

    print('temporal {}, freq={}, hist_len={}, avg_loss={}, time={:.1f}s'.format(
        method, freq, hist_len, loss_fmt.format(avg_loss), time.time() - tic,
    ))
    return temporal_loss, df_wgt, avg_loss, loss_fmt