in fairdiplomacy/agents/ce1p_agent.py [0:0]
def get_orders(self, game, power) -> List[str]:
# CFR data structures
self.sigma: Dict[Tuple[Power, Action], float] = {}
self.cum_sigma: Dict[Tuple[Power, Action], float] = defaultdict(float)
# self.cum_regrets: Dict[Tuple[Power, Action], float] = defaultdict(float)
# self.last_regrets: Dict[Tuple[Power, Action], float] = defaultdict(float)
self.cum_swap_regrets: Dict[Tuple[Power, Action, Action], float] = defaultdict(
float
) # for swap regret minimization
self.last_swap_regrets: Dict[Tuple[Power, Action, Action], float] = defaultdict(float)
self.swap_sigma: Dict[
Tuple[Power, Action, Action], float
] = {} # for swap regret minimization
# TODO: parallelize these calls
power_plausible_orders = self.get_plausible_orders(game, limit=self.n_plausible_orders)
power_plausible_orders = {p: sorted(v) for p, v in power_plausible_orders.items()}
logging.info(f"power_plausible_orders: {power_plausible_orders}")
if len(power_plausible_orders[power]) == 1:
# only one plausible action for this power: no search needed
return list(power_plausible_orders[power][0])
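# Each of the self.n_rollouts iterations below is one step of optimistic linear
# CFR with swap-regret minimization: sample a joint action from the current
# policies, roll out every single-power deviation, and update the per-power
# swap-regret tables and cumulative (average) strategies.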
# linear-CFR iteration counter: on iteration t the cumulative tables are
# discounted by (t - 1) / t, which weights iteration t's contribution proportionally to t
iter = 0.000001
for _ in range(self.n_rollouts):
iter += 1.0
discount_factor = (iter - 1.0) / iter
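# apply the linear-CFR discount to each power's cumulative average strategy
# and cumulative swap regrets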
for pwr, actions in power_plausible_orders.items():
if len(actions) == 0:
continue
for action in actions:
# self.cum_regrets[(pwr, action)] *= discount_factor
self.cum_sigma[(pwr, action)] *= discount_factor
for swap_action in actions:
self.cum_swap_regrets[(pwr, swap_action, action)] *= discount_factor
# get policy probs for all powers
power_action_ps: Dict[Power, List[float]] = {
pwr: self.strategy(pwr, actions)
for (pwr, actions) in power_plausible_orders.items()
}
# sample policy for all powers
idxs = {
pwr: np.random.choice(range(len(action_ps)), p=action_ps)
for pwr, action_ps in power_action_ps.items()
if len(action_ps) > 0
}
power_sampled_orders: Dict[Power, Tuple[Action, float]] = {
pwr: (
(power_plausible_orders[pwr][idxs[pwr]], action_ps[idxs[pwr]])
if pwr in idxs
else ((), 1.0)
)
for pwr, action_ps in power_action_ps.items()
}
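# powers with no plausible orders fall back to the empty action with probability 1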
# logging.info(f"power_sampled_orders: {power_sampled_orders}")
# for each power: compare all actions against sampled opponent action
set_orders_dicts = [
{**{p: a for p, (a, _) in power_sampled_orders.items()}, pwr: action}
for pwr, actions in power_plausible_orders.items()
for action in actions
]
all_rollout_results = self.distribute_rollouts(game, set_orders_dicts)
for pwr, actions in power_plausible_orders.items():
if len(actions) == 0:
continue
# pop this power's results; rollout results come back in the same order as
# set_orders_dicts, so the next len(actions) entries belong to this power
results, all_rollout_results = (
all_rollout_results[: len(actions)],
all_rollout_results[len(actions) :],
)
action_utilities: List[float] = [r[1][pwr] for r in results]
state_utility = np.dot(power_action_ps[pwr], action_utilities)
action_regrets = [(u - state_utility) for u in action_utilities]
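# state_utility is the EV of the current policy; action_regrets feed only the
# commented-out external-regret update and are unused by the swap-regret path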
if pwr == power:
old_avg_strategy = self.avg_strategy(power, actions)
# update cfr data structures
# for action, regret, s in zip(actions, action_regrets, power_action_ps[pwr]):
# self.cum_regrets[(pwr, action)] += regret
# self.last_regrets[(pwr, action)] = regret
# self.cum_sigma[(pwr, action)] += s
# pos_regrets = [max(0, self.cum_regrets[(pwr, a)]) for a in actions] # Normal Linear CFR
# # pos_regrets = [max(0, self.cum_regrets[(pwr, a)] + self.last_regrets[(pwr, a)]) for a in actions] # Optimistic Linear CFR
# sum_pos_regrets = sum(pos_regrets)
# for action, pos_regret in zip(actions, pos_regrets):
# if sum_pos_regrets == 0:
# self.sigma[(pwr, action)] = 1.0 / len(actions)
# else:
# self.sigma[(pwr, action)] = pos_regret / sum_pos_regrets
# Swap-regret minimization update below; the external-regret linear-CFR
# update above is kept commented out for reference.
for action, regret, s in zip(actions, action_regrets, power_action_ps[pwr]):
# self.cum_regrets[(pwr, action)] += regret
# self.last_regrets[(pwr, action)] = regret
self.cum_sigma[(pwr, action)] += s
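# regret matching on the (optimistic) swap regrets: rebuild the swap policy
# Q[swap_action -> action] row by row before measuring this iteration's regrets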
for swap_action in actions:
# pos_regrets = [max(0, self.cum_swap_regrets[(pwr, swap_action, a)]) for a in actions] # Normal Linear CFR
pos_regrets = [
max(
0,
self.cum_swap_regrets[(pwr, swap_action, a)]
+ self.last_swap_regrets[(pwr, swap_action, a)],
)
for a in actions
] # Optimistic Linear CFR
sum_pos_regrets = sum(pos_regrets)
for action, pos_regret in zip(actions, pos_regrets):
if sum_pos_regrets == 0:
self.swap_sigma[(pwr, swap_action, action)] = 1.0 / len(actions)
else:
self.swap_sigma[(pwr, swap_action, action)] = (
pos_regret / sum_pos_regrets
)
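# recover the actually-played policy sigma as the stationary distribution of Q
# (p = pQ), approximated by a fixed 10 steps of power iteration from uniform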
temp_sigma = [(1.0 / len(actions)) for action in actions]
new_temp_sigma = [(1.0 / len(actions)) for action in actions]
for x in range(10):
# Compute transition
for i in range(len(actions)):
action = actions[i]
new_temp_sigma[i] = 0
for swap_i in range(len(actions)):
swap_action = actions[swap_i]
new_temp_sigma[i] += (
temp_sigma[swap_i] * self.swap_sigma[(pwr, swap_action, action)]
)
# Normalize
sigma_sum = sum(new_temp_sigma)
assert sigma_sum > 0
for i in range(len(actions)):
temp_sigma[i] = new_temp_sigma[i] / sigma_sum
for i in range(len(actions)):
action = actions[i]
self.sigma[(pwr, action)] = temp_sigma[i]
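# swap-regret update: swap_state_utility is the expected utility of row
# Q[swap_action, .]; each action's regret against that row is accumulated,
# scaled by sigma(swap_action), the mass the stationary policy puts on that row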
for swap_i in range(len(actions)):
swap_action = actions[swap_i]
swap_state_utility = 0
# logging.info("swap action={}".format(swap_action))
for i in range(len(actions)):
action = actions[i]
# logging.info("action={}".format(action))
# logging.info("action utility={}".format(action_utilities[i]))
# logging.info(f"swap sigma={self.swap_sigma[(pwr,swap_action,action)]}")
swap_state_utility += (
self.swap_sigma[(pwr, swap_action, action)] * action_utilities[i]
)
# logging.info(f"swap state utility={swap_state_utility}")
for i in range(len(actions)):
action = actions[i]
self.cum_swap_regrets[(pwr, swap_action, action)] += self.sigma[
(pwr, swap_action)
] * (action_utilities[i] - swap_state_utility)
self.last_swap_regrets[(pwr, swap_action, action)] = self.sigma[
(pwr, swap_action)
] * (action_utilities[i] - swap_state_utility)
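# rebuild Q from the updated (optimistic) swap regrets so sigma can be refreshed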
for swap_action in actions:
# pos_regrets = [max(0, self.cum_swap_regrets[(pwr, swap_action, a)]) for a in actions] # Normal Linear CFR
pos_regrets = [
max(
0,
self.cum_swap_regrets[(pwr, swap_action, a)]
+ self.last_swap_regrets[(pwr, swap_action, a)],
)
for a in actions
] # Optimistic Linear CFR
sum_pos_regrets = sum(pos_regrets)
for action, pos_regret in zip(actions, pos_regrets):
if sum_pos_regrets == 0:
self.swap_sigma[(pwr, swap_action, action)] = 1.0 / len(actions)
else:
self.swap_sigma[(pwr, swap_action, action)] = (
pos_regret / sum_pos_regrets
)
# true policy is p s.t. p = pQ, where Q is the swap policy matrix
# for action in actions:
# self.sigma[(pwr, action)] = 1.0 / len(actions)
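# as above: approximate the fixed point p = pQ with 10 power-iteration steps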
temp_sigma = [(1.0 / len(actions)) for action in actions]
new_temp_sigma = [(1.0 / len(actions)) for action in actions]
for x in range(10):
# Compute transition
for i in range(len(actions)):
action = actions[i]
new_temp_sigma[i] = 0
for swap_i in range(len(actions)):
swap_action = actions[swap_i]
new_temp_sigma[i] += (
temp_sigma[swap_i] * self.swap_sigma[(pwr, swap_action, action)]
)
# Normalize
sigma_sum = sum(new_temp_sigma)
assert sigma_sum > 0
for i in range(len(actions)):
temp_sigma[i] = new_temp_sigma[i] / sigma_sum
for i in range(len(actions)):
action = actions[i]
self.sigma[(pwr, action)] = temp_sigma[i]
if pwr == power:
new_avg_strategy = self.avg_strategy(power, actions)
logging.debug(
"old_avg_strat= {} new_avg_strat= {} mse= {}".format(
old_avg_strategy,
new_avg_strategy,
sum((a - b) ** 2 for a, b in zip(old_avg_strategy, new_avg_strategy)),
)
)
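# The block kept commented out below periodically estimates NashConv by Monte
# Carlo: sample opponent actions from the average policies, evaluate every
# action for every power, and sum each power's best-response gain.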
# if (iter > 25 and iter < 25.5) or (iter > 50 and iter < 50.5) or (iter > 100 and iter < 100.5) or (iter > 200 and iter < 200.5) or (iter > 400 and iter < 400.5):
# # Compute NashConv. Specifically, for each power, compute EV of each action assuming opponent ave policies
# # get policy probs for all powers
# power_action_ps: Dict[Power, List[float]] = {
# pwr: self.avg_strategy(pwr, actions)
# for (pwr, actions) in power_plausible_orders.items()
# }
# logging.info(
# "EV computation on iter {} power_sampled_orders: {}".format(
# iter,
# power_sampled_orders,
# )
# )
# logging.info("Policies: {}".format(power_action_ps))
# total_action_utilities: Dict[Tuple[Power, Action], float] = defaultdict(float)
# temp_action_utilities: Dict[Tuple[Power, Action], float] = defaultdict(float)
# total_state_utility: Dict[Power, float] = defaultdict(float)
# max_state_utility: Dict[Power, float] = defaultdict(float)
# for pwr, actions in power_plausible_orders.items():
# total_action_utilities[(pwr,action)] = 0
# total_state_utility[pwr] = 0
# max_state_utility[pwr] = 0
# # total_state_utility = [0 for u in idxs]
# nash_conv = 0
# for _ in range(100):
# # sample policy for all powers
# idxs = {
# pwr: np.random.choice(range(len(action_ps)), p=action_ps)
# for pwr, action_ps in power_action_ps.items()
# if len(action_ps) > 0
# }
# power_sampled_orders: Dict[Power, Tuple[Action, float]] = {
# pwr: (
# (power_plausible_orders[pwr][idxs[pwr]], action_ps[idxs[pwr]])
# if pwr in idxs
# else ((), 1.0)
# )
# for pwr, action_ps in power_action_ps.items()
# }
# # for each power: compare all actions against sampled opponent action
# set_orders_dicts = [
# {**{p: a for p, (a, _) in power_sampled_orders.items()}, pwr: action}
# for pwr, actions in power_plausible_orders.items()
# for action in actions
# ]
# all_rollout_results = self.distribute_rollouts(game, set_orders_dicts)
# for pwr, actions in power_plausible_orders.items():
# if len(actions) == 0:
# continue
# # pop this power's results
# results, all_rollout_results = (
# all_rollout_results[: len(actions)],
# all_rollout_results[len(actions) :],
# )
# for r in results:
# action = r[0][pwr]
# val = r[1][pwr]
# temp_action_utilities[(pwr,action)] = val
# total_action_utilities[(pwr,action)] += val
# # logging.info("results for power={}".format(pwr))
# # for i in range(len(power_plausible_orders[pwr])):
# # action = power_plausible_orders[pwr][i]
# # util = action_utilities[i]
# # logging.info("{} {} = {}".format(pwr,action,util))
# # for action in power_plausible_orders[pwr]:
# # logging.info("{} {} = {}".format(pwr,action,action_utilities))
# # logging.info("action utilities={}".format(action_utilities))
# #logging.info("Results={}".format(results))
# #state_utility = np.dot(power_action_ps[pwr], action_utilities)
# # action_regrets = [(u - state_utility) for u in action_utilities]
# # logging.info("Action utilities={}".format(temp_action_utilities))
# # for action in actions:
# # total_action_utilities[(pwr,action)] += temp_action_utilities[(pwr,action)]
# # logging.info("Total action utilities={}".format(total_action_utilities))
# # total_state_utility[pwr] += state_utility
# # total_state_utility[:] = [x / 100 for x in total_state_utility]
# for pwr, actions in power_plausible_orders.items():
# #ps = self.avg_strategy(pwr, power_plausible_orders[pwr])
# for i in range(len(actions)):
# action = actions[i]
# total_action_utilities[(pwr,action)] /= 100.0
# if total_action_utilities[(pwr,action)] > max_state_utility[pwr]:
# max_state_utility[pwr] = total_action_utilities[(pwr,action)]
# total_state_utility[pwr] += total_action_utilities[(pwr,action)] * power_action_ps[pwr][i]
# for pwr, actions in power_plausible_orders.items():
# logging.info(
# "results for power={} value={} diff={}".format(
# pwr,
# total_state_utility[pwr],
# (max_state_utility[pwr] - total_state_utility[pwr])
# )
# )
# nash_conv += max_state_utility[pwr] - total_state_utility[pwr]
# for i in range(len(actions)):
# action = actions[i]
# logging.info(
# "{} {} = {} (prob {})".format(
# pwr,
# action,
# total_action_utilities[(pwr,action)],
# power_action_ps[pwr][i],
# )
# )
# logging.info(f"Nash Convergence on iter {iter} = {nash_conv}")
# # logging.info(
# # "total_state_utility= {} total_action_utilities= {}".format(
# # total_state_utility,
# # total_action_utilities,
# # )
# # )
logging.info("cum_strats= {}".format(self.cum_sigma))
# return best order: sample from average policy
ps = self.avg_strategy(power, power_plausible_orders[power])
idx = np.random.choice(range(len(ps)), p=ps)
return list(power_plausible_orders[power][idx])
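# Both 10-step loops above approximate the stationary distribution of the swap
# policy matrix. A minimal standalone sketch of that step (the helper name and
# numpy usage are illustrative only, not part of this agent):
#
# import numpy as np
#
# def stationary_distribution(Q: np.ndarray, n_steps: int = 10) -> np.ndarray:
#     """Approximate p with p = pQ by power iteration from the uniform distribution."""
#     p = np.full(Q.shape[0], 1.0 / Q.shape[0])
#     for _ in range(n_steps):
#         p = p @ Q
#         p = p / p.sum()
#     return p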