in evals/elsuite/twenty_questions/eval.py:
def run(self, recorder: Recorder) -> Dict[str, Union[float, int]]:
    samples = self.get_samples()
    self.rng.shuffle(samples)
    # Optionally cap the run at n_samples games.
    samples = samples[: self.n_samples] if self.n_samples else samples

    # In the shortlist variant, the sampled words double as the candidate
    # shortlist used during the games.
    if self.shortlist_variant:
        self.shortlist = [sample["word"] for sample in samples]

    self.eval_all_samples(recorder, samples)

    # Each completed game records one "match" event; pull the per-game
    # metrics back out of the recorder for aggregation.
    events = recorder.get_events("match")
    scores = [event.data["score"] for event in events]
    num_guesses = [event.data["num_guesses"] for event in events]
    num_questions = [event.data["num_questions"] for event in events]
    num_violations = [event.data["num_violations"] for event in events]
    num_gamemaster_refusals = [event.data["num_gamemaster_refusals"] for event in events]
    incorrect_guesses = [event.data["incorrect_guesses"] for event in events]
    word_difficulties = [event.data["word_difficulty"] for event in events]

    # Report run-level averages. incorrect_guesses is a list per game, so
    # its *length* is what gets averaged.
    return {
        "score": sum(scores) / len(scores),
        "accuracy": evals.metrics.get_accuracy(events),
        "bootstrap_std": evals.metrics.get_bootstrap_accuracy_std(events),
        "average_num_guesses": sum(num_guesses) / len(num_guesses),
        "average_num_questions": sum(num_questions) / len(num_questions),
        "average_num_violations": sum(num_violations) / len(num_violations),
        "average_num_gamemaster_refusals": sum(num_gamemaster_refusals)
        / len(num_gamemaster_refusals),
        "average_num_incorrect_guesses": sum(len(ig) for ig in incorrect_guesses)
        / len(incorrect_guesses),
        "average_word_difficulty": sum(word_difficulties) / len(word_difficulties),
    }
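
For reference, every field read out of event.data above must be written by the per-game recording call. Below is a minimal sketch of that call, assuming each game ends by emitting one "match" event via evals.record.record_match (the wrapper function and its exact signature here are hypothetical; the keyword names mirror the fields run() reads back):

import evals.record

def record_game_result(
    correct: bool,               # consumed by get_accuracy / get_bootstrap_accuracy_std
    score: float,
    num_guesses: int,
    num_questions: int,
    num_violations: int,
    num_gamemaster_refusals: int,
    incorrect_guesses: list,     # run() averages len(incorrect_guesses) per game
    word_difficulty: int,
) -> None:
    # Extra keyword arguments to record_match land on event.data, which is
    # exactly what recorder.get_events("match") hands back to run().
    evals.record.record_match(
        correct,
        score=score,
        num_guesses=num_guesses,
        num_questions=num_questions,
        num_violations=num_violations,
        num_gamemaster_refusals=num_gamemaster_refusals,
        incorrect_guesses=incorrect_guesses,
        word_difficulty=word_difficulty,
    )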
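
One sharp edge worth noting: every average in the returned dict divides by len(events), so a run that produces no match events (for example, every sample erroring out before recording) raises ZeroDivisionError. If that case needed hardening, a small helper would cover it; this is a suggested guard, not something the source includes:

def _mean(values) -> float:
    # Average that returns 0.0 for an empty run instead of raising
    # ZeroDivisionError when building run()'s report.
    return sum(values) / len(values) if values else 0.0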