Path Lines of Code LICENSE.md 17 README.md 14 project/nanoeval/README.md 84 project/nanoeval/nanoeval/examples/_gpqa.py 31 project/nanoeval/nanoeval/examples/bundled_evals_test.py 12 project/nanoeval/nanoeval/examples/gpqa_api.py 22 project/nanoeval/nanoeval/examples/gpqa_mock.py 21 project/nanoeval/nanoeval/examples/gpqa_mock_mp.py 23 project/nanoeval/nanoeval/examples/gpqa_simple.py 9 project/nanoeval/nanoeval/solvers/computer_tasks/demo/__init__.py 1 project/nanoeval/nanoeval/solvers/computer_tasks/demo/_demo_task.py 22 project/nanoeval/nanoeval/solvers/computer_tasks/demo/agent.py 14 project/nanoeval/nanoeval/solvers/computer_tasks/demo/runtime.py 22 project/nanoeval_alcatraz/README.md 2 project/paperbench/README.md 175 project/paperbench/data/judge_eval/all-in-one/0/grading/expected_result.json 2634 project/paperbench/data/judge_eval/rice/0/grading/expected_result.json 5507 project/paperbench/data/judge_eval/semantic-self-consistency/0/grading/expected_result.json 1145 project/paperbench/data/judge_eval/stay-on-topic-with-classifier-free-guidance/0/grading/expected_result.json 1744 project/paperbench/data/papers/adaptive-pruning/addendum.md 3 project/paperbench/data/papers/adaptive-pruning/blacklist.txt 1 project/paperbench/data/papers/adaptive-pruning/paper.md 3 project/paperbench/data/papers/adaptive-pruning/rubric.json 1253 project/paperbench/data/papers/all-in-one/addendum.md 3 project/paperbench/data/papers/all-in-one/blacklist.txt 1 project/paperbench/data/papers/all-in-one/paper.md 3 project/paperbench/data/papers/all-in-one/rubric.json 1698 project/paperbench/data/papers/bam/addendum.md 3 project/paperbench/data/papers/bam/blacklist.txt 1 project/paperbench/data/papers/bam/paper.md 3 project/paperbench/data/papers/bam/rubric.json 7379 project/paperbench/data/papers/bbox/addendum.md 3 project/paperbench/data/papers/bbox/blacklist.txt 1 project/paperbench/data/papers/bbox/paper.md 3 project/paperbench/data/papers/bbox/rubric.json 3097 project/paperbench/data/papers/bridging-data-gaps/addendum.md 3 project/paperbench/data/papers/bridging-data-gaps/blacklist.txt 1 project/paperbench/data/papers/bridging-data-gaps/paper.md 3 project/paperbench/data/papers/bridging-data-gaps/rubric.json 1484 project/paperbench/data/papers/fre/addendum.md 3 project/paperbench/data/papers/fre/blacklist.txt 1 project/paperbench/data/papers/fre/paper.md 3 project/paperbench/data/papers/fre/rubric.json 4651 project/paperbench/data/papers/ftrl/addendum.md 3 project/paperbench/data/papers/ftrl/blacklist.txt 1 project/paperbench/data/papers/ftrl/paper.md 3 project/paperbench/data/papers/ftrl/rubric.json 1686 project/paperbench/data/papers/lbcs/addendum.md 3 project/paperbench/data/papers/lbcs/blacklist.txt 1 project/paperbench/data/papers/lbcs/judge.addendum.md 3 project/paperbench/data/papers/lbcs/paper.md 3 project/paperbench/data/papers/lca-on-the-line/addendum.md 3 project/paperbench/data/papers/lca-on-the-line/blacklist.txt 1 project/paperbench/data/papers/lca-on-the-line/paper.md 3 project/paperbench/data/papers/lca-on-the-line/rubric.json 7565 project/paperbench/data/papers/mechanistic-understanding/addendum.md 3 project/paperbench/data/papers/mechanistic-understanding/blacklist.txt 1 project/paperbench/data/papers/mechanistic-understanding/paper.md 3 project/paperbench/data/papers/mechanistic-understanding/rubric.json 928 project/paperbench/data/papers/pinn/addendum.md 3 project/paperbench/data/papers/pinn/blacklist.txt 1 project/paperbench/data/papers/pinn/paper.md 3 project/paperbench/data/papers/rice/addendum.md 3 project/paperbench/data/papers/rice/blacklist.txt 1 project/paperbench/data/papers/rice/judge.addendum.md 3 project/paperbench/data/papers/rice/judge/jsrl/paper.md 3 project/paperbench/data/papers/rice/judge/statemask/paper.md 3 project/paperbench/data/papers/rice/paper.md 3 project/paperbench/data/papers/rice/rubric.json 3551 project/paperbench/data/papers/robust-clip/addendum.md 3 project/paperbench/data/papers/robust-clip/blacklist.txt 1 project/paperbench/data/papers/robust-clip/paper.md 3 project/paperbench/data/papers/robust-clip/rubric.json 1062 project/paperbench/data/papers/sample-specific-masks/addendum.md 3 project/paperbench/data/papers/sample-specific-masks/blacklist.txt 1 project/paperbench/data/papers/sample-specific-masks/paper.md 3 project/paperbench/data/papers/sample-specific-masks/rubric.json 2837 project/paperbench/data/papers/sapg/addendum.md 3 project/paperbench/data/papers/sapg/blacklist.txt 1 project/paperbench/data/papers/sapg/paper.md 3 project/paperbench/data/papers/sapg/rubric.json 2026 project/paperbench/data/papers/self-composing-policies/addendum.md 3 project/paperbench/data/papers/self-composing-policies/blacklist.txt 1 project/paperbench/data/papers/self-composing-policies/paper.md 3 project/paperbench/data/papers/self-composing-policies/rubric.json 2581 project/paperbench/data/papers/self-expansion/addendum.md 3 project/paperbench/data/papers/self-expansion/blacklist.txt 1 project/paperbench/data/papers/self-expansion/paper.md 3 project/paperbench/data/papers/self-expansion/rubric.json 2651 project/paperbench/data/papers/semantic-self-consistency/addendum.md 3 project/paperbench/data/papers/semantic-self-consistency/blacklist.txt 1 project/paperbench/data/papers/semantic-self-consistency/judge.addendum.md 3 project/paperbench/data/papers/semantic-self-consistency/paper.md 3 project/paperbench/data/papers/semantic-self-consistency/rubric.json 723 project/paperbench/data/papers/sequential-neural-score-estimation/addendum.md 3 project/paperbench/data/papers/sequential-neural-score-estimation/blacklist.txt 1 project/paperbench/data/papers/sequential-neural-score-estimation/paper.md 3 project/paperbench/data/papers/sequential-neural-score-estimation/rubric.json 892 project/paperbench/data/papers/stay-on-topic-with-classifier-free-guidance/addendum.md 3 project/paperbench/data/papers/stay-on-topic-with-classifier-free-guidance/blacklist.txt 1 project/paperbench/data/papers/stay-on-topic-with-classifier-free-guidance/judge.addendum.md 3 project/paperbench/data/papers/stay-on-topic-with-classifier-free-guidance/paper.md 3 project/paperbench/data/papers/stay-on-topic-with-classifier-free-guidance/rubric.json 1367 project/paperbench/data/papers/stochastic-interpolants/addendum.md 3 project/paperbench/data/papers/stochastic-interpolants/blacklist.txt 1 project/paperbench/data/papers/stochastic-interpolants/paper.md 3 project/paperbench/data/papers/stochastic-interpolants/rubric.json 683 project/paperbench/data/papers/test-time-model-adaptation/addendum.md 3 project/paperbench/data/papers/test-time-model-adaptation/blacklist.txt 1 project/paperbench/data/papers/test-time-model-adaptation/paper.md 3 project/paperbench/data/papers/test-time-model-adaptation/rubric.json 1725 project/paperbench/data/papers/what-will-my-model-forget/addendum.md 3 project/paperbench/data/papers/what-will-my-model-forget/blacklist.txt 1 project/paperbench/data/papers/what-will-my-model-forget/paper.md 3 project/paperbench/data/papers/what-will-my-model-forget/rubric.json 8247 project/paperbench/experiments/splits/all.txt 20 project/paperbench/experiments/splits/debug.txt 1 project/paperbench/experiments/splits/dev.txt 3 project/paperbench/experiments/splits/human.txt 5 project/paperbench/experiments/splits/lite.txt 5 project/paperbench/paperbench/agents/aisi-basic-agent/requirements.txt 6 project/paperbench/paperbench/agents/dummy/requirements.txt 2 project/paperbench/paperbench/agents/instructions/code_only_instructions.txt 17 project/paperbench/paperbench/agents/instructions/instructions.txt 54 project/paperbench/paperbench/agents/instructions/instructions_iterative.txt 22 project/paperbench/paperbench/agents/launch.json 13 project/paperbench/paperbench/judge/judge_eval/README.md 92 project/paperbench/paperbench/nano/README.md 26 project/paperbench/pytest.ini 5 pyrightconfig.json 11 pytest.ini 2