def epsGREEDY()

in isoexp/mab/smab_algs.py [0:0]


def epsGREEDY(T, MAB, epsilon=0.1):
    """Run the epsilon-greedy strategy on a stochastic multi-armed bandit.

    Each arm is pulled once, in index order, during the first ``len(MAB)``
    rounds (or fewer if the horizon is shorter). Afterwards, at every round a
    uniformly random arm is played with probability ``epsilon``; otherwise the
    arm with the highest empirical mean reward is played, ties (up to
    ``np.isclose`` tolerance) being broken uniformly at random.

    Randomness comes from the global ``np.random`` state, so callers can make
    a run reproducible with ``np.random.seed``.

    Args:
        T (int): horizon
        MAB (list): list of available MAB models; each element must expose a
            ``sample()`` method returning the reward of one pull (stored in a
            float array, so presumably a scalar -- confirm against the arm
            classes)
        epsilon (float): probability of exploring (playing a uniformly random
            arm) at each round after the initialisation phase
    Returns:
        rewards (array-like): observed rewards
        draws (array-like): indexes of selected arms
    Raises:
        ValueError: if ``MAB`` is empty while ``T`` is positive
    """
    K = len(MAB)
    rewards = np.zeros((T,))
    draws = np.zeros((T,))  # float dtype kept for backward compatibility

    if K == 0 and T > 0:
        raise ValueError("MAB must contain at least one arm")

    # Counts start at zero so that S / N is the true empirical mean. The
    # initialisation phase guarantees N >= 1 for every arm before any division.
    N = np.zeros((K,))  # number of observations of each arm
    S = np.zeros((K,))  # sum of rewards for each arm

    for t in range(T):
        if t < K:
            # initialisation: play every arm once (stops early when T < K)
            a = t
        else:
            # select the arm
            means = S / N

            # draw the exploration coin first to keep the RNG call order stable
            rnd = np.random.rand()
            if rnd <= epsilon:
                a = int(np.random.choice(K))
            else:
                idxs = np.flatnonzero(np.isclose(means, means.max()))
                # np.asscalar was removed in NumPy 1.23; int() is equivalent
                a = int(np.random.choice(idxs))

        r = MAB[a].sample()

        # update quantities
        rewards[t] = r
        draws[t] = a
        S[a] += r
        N[a] += 1

    return rewards, draws