# attacked_EXP3_P — from isoexp/mab/smab_algs.py

def attacked_EXP3_P(T, MAB, target_arm, eta=None, gamma=None, delta=0.99, constant_attack=False):
    """Run an EXP3(.P)-style bandit learner for T rounds under a reward-poisoning attack.

    The attacker perturbs the reward the learner observes whenever the learner
    pulls an arm other than ``target_arm``, so as to make the target arm look
    optimal to the learner.

    Parameters
    ----------
    T : int
        Horizon (number of rounds).
    MAB : sequence
        Arms; each arm exposes ``.sample()`` (returns a reward, squeezable to a
        scalar) and ``.mean`` (used only by the constant attack).
    target_arm : int
        Index of the arm the attacker wants the learner to play.
    eta : float, optional
        Learning rate; defaults to sqrt(log(K+1) / (K*T)).
    gamma : float, optional
        Uniform-exploration mixing coefficient; defaults to 0.
    delta : float
        Confidence parameter for the attack's confidence-bound term.
    constant_attack : bool
        If True, use a fixed shift based on true means; otherwise use the
        empirical-mean + confidence-bound attack.

    Returns
    -------
    (rewards, draws, attacks, time_of_attacks) : four arrays of length T
        True (pre-attack) rewards, chosen arms, attack magnitudes, and a 0/1
        indicator of rounds where the attacker intervened.
    """
    K = len(MAB)
    rewards = np.zeros((T,))
    draws = np.zeros((T,))
    arms = np.arange(K)
    attacks = np.zeros((T,))
    time_of_attacks = np.zeros((T,))
    # Attacker-side statistics over *true* rewards.
    N = np.ones((K,))   # observation counts (initialized to 1 to avoid division by zero)
    S = np.zeros((K,))  # cumulative true reward per arm
    # Learner-side cumulative importance-weighted scores.  The EXP3 weights are
    # exp(eta * estimated_S); we keep only the log-domain scores and use a
    # max-shifted softmax below, which is mathematically identical to tracking
    # the weights multiplicatively but cannot overflow for large T.
    estimated_S = np.zeros((K,))
    if eta is None:
        eta = np.sqrt(np.log(K + 1) / (K * T))
    if gamma is None:
        gamma = 0
    for t in range(T):
        # Stable softmax over the scores, mixed with uniform exploration.
        z = eta * estimated_S
        w = np.exp(z - np.max(z))
        P = (1 - gamma) * w / np.sum(w) + gamma / K
        if t < K:
            # Initialization: pull each arm once, no attack.
            action = t
            attack_t = 0
        else:
            action = np.random.choice(arms, p=P)
            if action != target_arm:
                time_of_attacks[t] = 1
                if constant_attack:
                    # Fixed shift using the true mean gap (oracle attacker).
                    attack_t = -2 * np.maximum(0, MAB[action].mean - MAB[target_arm].mean)
                else:
                    # Confidence radius beta_i = sqrt(log(pi^2 K N_i^2 / (3 delta)) / (2 N_i)).
                    beta = np.sqrt(np.log(np.pi ** 2 * K * N ** 2 / (3 * delta)) / (2 * N))
                    mu_hat = S / N
                    # Push the observed reward down just enough that the pulled arm
                    # looks no better than the target arm (never push it up).
                    attack_t = -np.maximum(mu_hat[action] - mu_hat[target_arm]
                                           + beta[action] + beta[target_arm], 0)
            else:
                attack_t = 0
        attacks[t] = attack_t
        true_X = 1 * MAB[action].sample().squeeze()
        X = true_X + attack_t  # reward as seen by the learner
        # EXP3 loss-based update: add 1 to every score, subtract the
        # importance-weighted loss (1 - X) / P[action] from the chosen arm.
        estimated_S = estimated_S + 1
        estimated_S[action] -= (1 - X) / P[action]
        rewards[t] = true_X
        draws[t] = action
        N[action] += 1
        S[action] += true_X

    return rewards, draws, attacks, time_of_attacks