def ExploreThenExploit()

in isoexp/mab/smab_algs.py [0:0]


def ExploreThenExploit(T, MAB, T1):
    """Run an explore-then-exploit (explore-then-commit style) bandit strategy.

    During the first ``ceil(T1)`` rounds an arm is drawn uniformly at random.
    In every remaining round the arm with the highest empirical mean reward
    is played, with ties (up to ``np.isclose`` tolerance) broken uniformly at
    random. The statistics keep being updated during the exploitation phase.

    Parameters
    ----------
    T : int
        Total number of rounds (length of the returned arrays).
    MAB : sequence
        The arms. ``len(MAB)`` gives the number of arms and each ``MAB[a]``
        must provide a ``sample()`` method returning a scalar reward.
    T1 : int or float
        Length of the exploration phase; rounded up to an integer. Values
        above ``T`` mean every round is exploratory, values ``<= 0`` mean
        there is no exploration at all.

    Returns
    -------
    rewards : numpy.ndarray, shape (T,)
        Reward obtained at each round.
    draws : numpy.ndarray, shape (T,)
        Index of the arm played at each round (stored as floats).
    """
    K = len(MAB)
    rewards = np.zeros((T,))
    draws = np.zeros((T,))

    # Counts start at 1 (not 0) so that S / N is always defined; an arm that
    # was never played therefore has an estimated mean of 0.
    N = np.ones((K,))
    S = np.zeros((K,))  # sum of rewards for each arm

    # Built-in int() replaces the removed ``np.int`` alias.
    n_explore = int(np.ceil(T1))

    for t in range(T):
        if t < n_explore:
            # exploration: uniform random arm
            a = np.random.choice(K)
        else:
            # exploitation: greedy on the empirical means, random tie-break
            means = S / N
            best = np.flatnonzero(np.isclose(means, means.max()))
            # int() replaces the removed ``np.asscalar``.
            a = int(np.random.choice(best))

        r = MAB[a].sample()

        # update quantities
        rewards[t] = r
        draws[t] = a
        S[a] += r
        N[a] += 1

    return rewards, draws