class BaseTSAgent(object):
    """Shared state and update rule for Thompson-sampling bandit agents.

    Tracks, per arm, how many times it was pulled (``counts``) and the
    accumulated reward (``wins``).  Subclasses implement ``get_arm``.
    """

    def __init__(self, n_arms=None):
        """Initialize per-arm statistics.

        Parameters
        ----------
        n_arms : int, optional
            Number of arms.  When omitted, falls back to the module-level
            ``n_arms`` global at call time (backward compatible with the
            original free-variable lookup).
        """
        if n_arms is None:
            # Resolve lazily so the module global may be (re)defined after
            # this class is created, matching the original behavior.
            n_arms = globals()["n_arms"]
        self.counts = [0] * n_arms  # pulls per arm
        self.wins = [0] * n_arms    # cumulative reward per arm

    def sample(self, arm, reward):
        """Record one observation: ``arm`` was pulled and yielded ``reward``."""
        self.counts[arm] += 1
        self.wins[arm] += reward
class BernouliTSAgent(BaseTSAgent):
    """Thompson sampling for Bernoulli rewards.

    Draws one sample per arm from the Beta posterior
    ``Beta(wins + 1, losses + 1)`` (uniform prior) and plays the arm with
    the largest draw.
    """

    def get_arm(self):
        """Return the index of the arm with the highest posterior draw."""
        samples = [
            np.random.beta(self.wins[i] + 1, self.counts[i] - self.wins[i] + 1)
            for i in range(n_arms)
        ]
        # First-maximum tie-breaking, same as list.index(max(...)).
        return max(range(n_arms), key=lambda i: samples[i])
class NormalTSAgent(BaseTSAgent):
    """Thompson sampling with a Gaussian approximation to the posterior.

    Each arm's reward distribution is approximated as
    ``N(p, sqrt(p * (1 - p) / n))`` where ``p = wins / counts`` and
    ``n = counts``; unvisited arms use ``N(0, 1)``.
    """

    def get_arm(self):
        """Sample one value per arm from its Gaussian; return the argmax index."""
        samples = []
        for i in range(n_arms):
            n = self.counts[i]
            if n > 0:
                p = self.wins[i] / n
                mu = p
                sd = np.sqrt(p * (1 - p) / n)
            else:
                # No data yet: standard-normal draw encourages exploration.
                mu = 0.0
                sd = 1.0
            samples.append(np.random.normal(mu, sd))
        # First-maximum tie-breaking, same as list.index(max(...)).
        return max(range(n_arms), key=lambda i: samples[i])
class LinTSAgent(object):
    """Linear Thompson sampling via Bayesian linear regression.

    Models reward as ``r = phi(arm).T w + eps`` with ``eps ~ N(0, sigma^2)``
    and a standard-normal prior on the weights ``w``.  The posterior is
    ``w ~ N(inv_A.dot(b), inv_A)``, maintained incrementally.
    """

    def __init__(self):
        # One feature column per arm: phi(arm) = [x, y, 1] (bias term).
        # NOTE(review): relies on module-level ``arms`` (iterable of 2-element
        # coordinates) being defined before construction — confirm.
        self.phis = np.array([[arm[0], arm[1], 1] for arm in arms]).T
        self.alpha = 1  # exploration scale applied to the posterior std-dev
        self.sigma = 1  # observation-noise std-dev
        # Posterior covariance; starts at the identity prior covariance.
        self.inv_A = np.identity(self.phis.shape[0])
        # Precision-weighted sum of observations, so mean = inv_A.dot(b).
        self.b = np.zeros((self.phis.shape[0], 1))

    def get_arm(self):
        """Draw one predicted reward per arm from the posterior; return argmax."""
        post_mean = self.inv_A.dot(self.b)
        post_var = self.inv_A
        pred_mean = self.phis.T.dot(post_mean)
        pred_var = self.phis.T.dot(post_var).dot(self.phis)
        result = [np.random.normal(pred_mean[i],
                                   self.alpha * np.sqrt(pred_var[i, i]))
                  for i in range(n_arms)]
        return np.argmax(result)

    def sample(self, arm_index, reward):
        """Update the posterior with one observed (arm, reward) pair.

        Applies the Sherman-Morrison identity to update ``inv_A`` in place of
        the rank-one precision update ``A <- A + phi phi^T / sigma^2``.
        """
        phi = self.phis[:, [arm_index]]
        iAppTiA = self.inv_A.dot(phi).dot(phi.T).dot(self.inv_A)
        s2_pTiAp = self.sigma ** 2 + phi.T.dot(self.inv_A).dot(phi)
        self.inv_A = self.inv_A - iAppTiA / s2_pTiAp
        # BUG FIX: b must accumulate reward * phi / sigma^2 to match the
        # precision update above (original multiplied by sigma^2 instead).
        # Identical at the default sigma == 1, which masked the bug.
        self.b = self.b + reward * phi / (self.sigma ** 2)