in isoexp/mab/smab_algs.py [0:0]
def SoftMAB(T, MAB, temp=1.0):
    """Run softmax (Boltzmann) exploration on a multi-armed bandit.

    Every arm is first pulled once, in index order, to initialise its
    empirical mean (this phase is truncated when ``T`` is smaller than the
    number of arms). At each remaining round an arm is drawn at random with
    probability proportional to ``exp(empirical_mean / temp)``.

    Args:
        T (int): horizon, i.e. total number of pulls
        MAB (list): list of available MAB models; each one is only used
            through its ``sample()`` method, which must return the reward
        temp (float): softmax temperature, must be non-zero. Positive values
            closer to zero concentrate the probability mass on the arms with
            the highest empirical mean; large values approach uniform
            sampling.
    Returns:
        rewards (array-like): observed rewards, one entry per round
        draws (array-like): indexes of selected arms, one entry per round
    Raises:
        ValueError: if ``temp`` is zero.
    """
    if temp == 0:
        raise ValueError("temp must be non-zero")

    K = len(MAB)
    rewards = np.zeros((T,))
    draws = np.zeros((T,))
    # Start the counters at zero so that S / N is the true empirical mean
    # (starting at one would divide by "observations + 1").
    N = np.zeros((K,))  # number of observations of each arm
    S = np.zeros((K,))  # sum of rewards for each arm

    def pull(t, a):
        # Sample arm `a` at round `t` and update all running statistics.
        r = MAB[a].sample()
        rewards[t] = r
        draws[t] = a
        S[a] += r
        N[a] += 1

    # Initialisation: one pull per arm, never writing past the horizon.
    for k in range(min(K, T)):
        pull(k, k)

    for t in range(K, T):
        # Every arm has been observed at least once here, so N > 0.
        logits = (S / N) / temp
        # Shift by the maximum before exponentiating: the resulting
        # distribution is unchanged but np.exp can no longer overflow.
        logits = logits - np.max(logits)
        proba = np.exp(logits)
        proba = proba / np.sum(proba)
        pull(t, np.random.choice(K, p=proba))

    return rewards, draws