in isoexp/mab/smab_algs.py [0:0]
def epsGREEDY(T, MAB, epsilon=0.1):
    """Run the epsilon-greedy strategy on a stochastic multi-armed bandit.

    Each arm is first pulled once, in index order, to initialise its
    empirical mean (only the first ``T`` arms when ``T < len(MAB)``).
    At every following round a uniform draw in [0, 1) is compared with
    ``epsilon``: if it is <= ``epsilon`` an arm is chosen uniformly at
    random, otherwise the arm with the highest empirical mean is played,
    with ties (up to ``np.isclose`` tolerance) broken uniformly at random.

    Args:
        T (int): horizon
        MAB (list): list of available MAB models; each element must
            provide a ``sample()`` method returning a scalar reward
        epsilon (float): threshold on the uniform draw that triggers a
            random (exploratory) arm choice
    Returns:
        rewards (array-like): observed rewards, shape ``(T,)``
        draws (array-like): indexes of selected arms, shape ``(T,)``
            (stored in a float array)
    """
    K = len(MAB)
    rewards = np.zeros((T,))
    draws = np.zeros((T,))

    # Counts start at zero so that S / N is the plain empirical mean.
    N = np.zeros((K,))  # number of observations of each arm
    S = np.zeros((K,))  # sum of rewards for each arm

    def _pull(t, a):
        # Sample arm `a` at round `t` and update the running statistics.
        r = MAB[a].sample()
        rewards[t] = r
        draws[t] = a
        S[a] += r
        N[a] += 1

    # Initialisation: one pull per arm, never writing past the horizon.
    for k in range(min(K, T)):
        _pull(k, k)

    # This loop only runs when T > K, i.e. once every arm has been pulled,
    # so N has no zero entry and the division below is well defined.
    for t in range(K, T):
        means = S / N
        if np.random.rand() <= epsilon:
            # explore: uniformly random arm
            a = np.random.choice(K)
        else:
            # exploit: best empirical mean, random tie-breaking
            idxs = np.flatnonzero(np.isclose(means, means.max()))
            a = int(np.random.choice(idxs))
        _pull(t, a)

    return rewards, draws