in isoexp/mab/smab_algs.py [0:0]
def BootstrapedUCB(T, MAB, delta = 0.1, b_rep = 200):
    """Run a bootstrapped UCB strategy on a multi-armed bandit for ``T`` rounds.

    Each arm is pulled once, in index order, during the first ``len(MAB)``
    rounds. After that, every round pulls the arm with the largest index
    ``S / N + bootstrap_quantile + sqrt(1 / N) * phi``, breaking ties
    uniformly at random.

    Parameters
    ----------
    T : int
        Number of rounds to play. If ``T`` is smaller than the number of
        arms, only the first ``T`` arms are pulled (once each).
    MAB : sequence
        Arms of the bandit. Each arm must expose ``sample()`` returning an
        object with a ``squeeze()`` method (e.g. a NumPy array).
    delta : float, optional
        Forwarded to the module-level ``quantile`` helper as part of the
        level ``(1 - delta) * alpha``.
    b_rep : int, optional
        Forwarded to ``quantile`` as ``B`` (presumably the number of
        bootstrap repetitions -- confirm against ``quantile``).

    Returns
    -------
    rewards : numpy.ndarray, shape (T,)
        Reward collected at each round.
    draws : numpy.ndarray, shape (T,)
        Index of the arm pulled at each round (stored as floats).
    """
    K = len(MAB)
    rewards = np.zeros((T,))
    draws = np.zeros((T,))
    N = np.zeros((K,))  # number of pulls per arm
    S = np.zeros((K,))  # cumulated reward per arm
    rewards_arm = {}    # arm index -> list of rewards observed on that arm

    def pull(t, a):
        # Sample arm `a` at round `t` and update every running statistic.
        r = 1 * MAB[a].sample().squeeze()
        rewards[t] = r
        draws[t] = a
        rewards_arm.setdefault(a, []).append(r)
        S[a] += r
        N[a] += 1

    # Initialisation: one pull per arm. Capped at T so that a horizon shorter
    # than the number of arms does not index past the end of the logs.
    for k in range(min(K, T)):
        pull(k, k)

    for t in range(K, T):
        alpha = 1 / (t + 1)
        bootstrap_quantile = quantile((1 - delta) * alpha, S, N, rewards_arm, B = b_rep)
        phi = np.sqrt(2 * np.log(1 / alpha) / N)
        # The theoretical index would be
        #   S / N + (bootstrap_quantile + np.sqrt(np.log(2/(delta*alpha))/N)*phi)
        # The variant below is the one used in practice.
        ucb = S / N + (bootstrap_quantile + np.sqrt(1 / N) * phi)
        # Uniform random tie-breaking among the (numerically) maximal arms.
        idxs = np.flatnonzero(np.isclose(ucb, ucb.max()))
        # np.asscalar was removed in NumPy 1.23; int() gives the same index.
        a = int(np.random.choice(idxs))
        pull(t, a)
    return rewards, draws