in isoexp/mab/smab_algs.py [0:0]
def ExploreThenExploit(T, MAB, T1):
    """Run an explore-then-exploit strategy on a multi-armed bandit.

    During the first ``T1`` rounds an arm is drawn uniformly at random.
    In every remaining round the arm with the highest empirical mean
    reward is played, with ties broken uniformly at random.

    Args:
        T: Total number of rounds (length of the returned arrays).
        MAB: Sequence of arms; each arm must expose a ``sample()`` method
            returning a numeric reward.
        T1: Length of the exploration phase. Non-integer values are
            rounded up; the result is clamped to the range ``[0, T]``.

    Returns:
        A tuple ``(rewards, draws)`` of two float arrays of shape ``(T,)``:
        the reward collected at each round and the index of the arm
        played at each round.
    """
    K = len(MAB)
    rewards = np.zeros((T,))
    draws = np.zeros((T,))
    # NOTE(review): counts start at one rather than zero, so the
    # "empirical mean" S / N is S / (pulls + 1). Presumably this avoids a
    # division by zero for arms never pulled during exploration; kept as-is.
    N = np.ones((K,))  # number of observations of each arm (offset by one)
    S = np.zeros((K,))  # sum of rewards for each arm

    # Builtin int replaces the removed ``np.int`` alias. Clamping keeps the
    # exploration phase inside the horizon so indexing can neither run past
    # the end of the arrays nor wrap around through negative indices.
    T1 = min(max(int(np.ceil(T1)), 0), T)

    for t in range(T):
        if t < T1:
            # Exploration: uniform random arm.
            a = np.random.choice(K)
        else:
            # Exploitation: greedy arm w.r.t. the empirical means,
            # ties broken uniformly at random.
            means = S / N
            best = np.flatnonzero(np.isclose(means, means.max()))
            a = int(np.random.choice(best))
        r = MAB[a].sample()
        # update quantities
        rewards[t] = r
        draws[t] = a
        S[a] += r
        N[a] += 1
    return rewards, draws