def run()

in evals/elsuite/twenty_questions/eval.py [0:0]

Runs the full eval: shuffles the samples, optionally truncates them to n_samples, plays every game, then averages the per-game statistics recorded as "match" events.

    def run(self, recorder: Recorder) -> Dict[str, Union[float, int]]:
        """Evaluate every sample, then aggregate the recorded match events."""
        samples = self.get_samples()
        self.rng.shuffle(samples)
        # Optionally cap the run at n_samples after shuffling.
        samples = samples[: self.n_samples] if self.n_samples else samples

        if self.shortlist_variant:
            # The shortlist variant restricts the answer space to the words in this run's samples.
            self.shortlist = [sample["word"] for sample in samples]

        self.eval_all_samples(recorder, samples)
        # Each completed game records one "match" event carrying its per-game statistics.
        events = recorder.get_events("match")

        # Pull each per-game metric out of the recorded event data.
        scores = [event.data["score"] for event in events]
        num_guesses = [event.data["num_guesses"] for event in events]
        num_questions = [event.data["num_questions"] for event in events]
        num_violations = [event.data["num_violations"] for event in events]
        num_gamemaster_refusals = [event.data["num_gamemaster_refusals"] for event in events]
        incorrect_guesses = [event.data["incorrect_guesses"] for event in events]
        word_difficulties = [event.data["word_difficulty"] for event in events]

        # Average each metric over all recorded events.
        return {
            "score": sum(scores) / len(scores),
            "accuracy": evals.metrics.get_accuracy(events),
            "bootstrap_std": evals.metrics.get_bootstrap_accuracy_std(events),
            "average_num_guesses": sum(num_guesses) / len(num_guesses),
            "average_num_questions": sum(num_questions) / len(num_questions),
            "average_num_violations": sum(num_violations) / len(num_violations),
            "average_num_gamemaster_refusals": sum(num_gamemaster_refusals)
            / len(num_gamemaster_refusals),
            "average_num_incorrect_guesses": sum((len(ig) for ig in incorrect_guesses))
            / len(incorrect_guesses),
            "average_word_difficulty": sum(word_difficulties) / len(word_difficulties),
        }
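
For context, the "match" events consumed above are produced during eval_all_samples. A minimal sketch of what that recording call could look like, assuming the standard evals.record.record_match helper; the locals sample, score, guess, and the various counters are hypothetical names from the per-sample game loop, which is not shown in this file:

import evals.record

# Hedged sketch: one "match" event per completed game, carrying the fields
# that run() aggregates. Extra keyword arguments end up in event.data.
evals.record.record_match(
    correct=score > 0,                     # assumption: a positive score means solved
    expected=sample["word"],
    picked=guess,                          # hypothetical: the model's final guess
    score=score,
    num_guesses=num_guesses,
    num_questions=num_questions,
    num_violations=num_violations,
    num_gamemaster_refusals=num_gamemaster_refusals,
    incorrect_guesses=incorrect_guesses,
    word_difficulty=sample["difficulty"],  # assumption: difficulty comes from the sample
)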
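
Note that every sum(...) / len(...) in the return dict raises ZeroDivisionError when no events were recorded (e.g. an empty sample set). A defensive variant, purely as a sketch; the _mean helper is not part of the eval:

from typing import Sequence

def _mean(values: Sequence[float]) -> float:
    """Average a metric list, returning 0.0 when nothing was recorded."""
    return sum(values) / len(values) if values else 0.0

# e.g. "score": _mean(scores), "average_num_questions": _mean(num_questions), ...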