in agents/compare.py [0:0]
def compare(agent_paths):
    """Print per-eval-setup score statistics and pairwise significance tests.

    The directory layout read by this function is
    ``<agent_dir>/<eval_setup>/<seed>/results.json``. The eval setups and the
    reference set of seeds are taken from the first entry of ``agent_paths``.
    From every ``results.json`` the entry at index 100 of
    ``metrics.independent_solved_by_aucs`` (reported as AUCCESS) and of
    ``metrics.independent_solved_by`` (reported as "% Independently solved at
    100 attempts") is collected.

    For each agent the mean and standard deviation of both metrics are
    printed, together with a one-sided Wilcoxon signed-rank test
    (``alternative='greater'``) of its AUCCESS against every other agent,
    flagged as significant when the p-value is below 0.01.

    Args:
        agent_paths: Sequence of ``pathlib.Path``-like directories (they must
            support ``iterdir()``, ``/`` and ``.stem``), one per agent.

    Returns:
        None. All results are written to stdout. Any error raised while
        handling one eval setup is printed and that setup is skipped.
    """
    # Sorted so that the report order is reproducible between runs.
    eval_setups = sorted(each.stem for each in agent_paths[0].iterdir())
    for eval_setup in eval_setups:
        try:
            auccess = {agent_dir: [] for agent_dir in agent_paths}
            ind_solved_by = {agent_dir: [] for agent_dir in agent_paths}
            seeds = set(
                [each.stem for each in (agent_paths[0] / eval_setup).iterdir()])
            for agent_dir in agent_paths:
                # The Wilcoxon signed-rank test below is a *paired* test: the
                # i-th score of every agent has to come from the same seed.
                # iterdir() yields entries in arbitrary order, so sort them.
                seed_dirs = sorted(
                    (agent_dir / eval_setup).iterdir(),
                    key=lambda seed_dir: seed_dir.stem,
                )
                for seed in seed_dirs:
                    # Explicit raise instead of assert: still enforced when
                    # Python runs with -O.
                    if seed.stem not in seeds:
                        raise ValueError(f'Seed {seed}, not in {seeds}')
                    with open(seed / 'results.json', encoding='utf-8') as f:
                        results = json.load(f)
                    metrics = results['metrics']
                    auccess[agent_dir].append(
                        metrics['independent_solved_by_aucs'][100])
                    ind_solved_by[agent_dir].append(
                        metrics['independent_solved_by'][100])
            print(f'\n\n-----------{eval_setup}----------------')
            print(f'Evaluated on {len(seeds)} seeds: {seeds}')
            # Ordered de-duplication keeps the comparison order deterministic.
            unique_agents = list(dict.fromkeys(agent_paths))
            for agent_dir in agent_paths:
                print('\nPath for agent is', agent_dir)
                agent_auccess = np.array(auccess[agent_dir])
                agent_ind_solved_by = np.array(ind_solved_by[agent_dir])
                print('AUCCESS')
                print(
                    f'\tMean: {round(agent_auccess.mean(), 3)}\n',
                    f'\tSTD: {round(agent_auccess.std(), 3)}',
                )
                for other_agent in unique_agents:
                    if other_agent == agent_dir:
                        continue
                    sig_test = scipy.stats.wilcoxon(
                        agent_auccess,
                        y=np.array(auccess[other_agent]),
                        alternative='greater',
                    )
                    print(
                        '\tIs this agent\'s AUCCESS significantly higher than',
                        f'{other_agent}?\n\t\t{sig_test.pvalue < 0.01},',
                        f'p-value: {round(sig_test.pvalue, 4)}')
                print('% Independently solved at 100 attempts')
                print(
                    f'\tMean: {round(agent_ind_solved_by.mean(), 3)}\n',
                    f'\tSTD: {round(agent_ind_solved_by.std(), 3)}',
                )
        except Exception as e:
            # Best-effort report: a broken eval setup must not prevent the
            # remaining setups from being compared.
            print('Error comparing results for', eval_setup)
            print(e)
            continue