path # lines of code scripts/modelgraded_generator.py 185 scripts/pattern_identification_generator.py 48 scripts/battle_generator.py 49 MANIFEST.in 4 pyproject.toml 64 evals/base.py 51 evals/solvers/prompts/cot.py 4 evals/solvers/prompts/hhh.py 99 evals/solvers/postprocessors/base.py 6 evals/solvers/postprocessors/postprocessors.py 20 evals/solvers/solver.py 125 evals/solvers/nested/fewshot_solver.py 91 evals/solvers/nested/hhh_solver.py 33 evals/solvers/nested/cot_solver.py 61 evals/solvers/nested/self_consistency_solver.py 118 evals/solvers/memory.py 50 evals/solvers/utils.py 37 evals/solvers/providers/together/together_solver.py 68 evals/solvers/providers/google/gemini_solver.py 157 evals/solvers/providers/openai/openai_assistants_solver.py 186 evals/solvers/providers/openai/openai_solver.py 181 evals/solvers/providers/anthropic/anthropic_solver.py 89 evals/solvers/human_cli_solver.py 29 evals/task_state.py 13 evals/utils/log_utils.py 53 evals/utils/misc.py 19 evals/utils/snowflake.py 100 evals/utils/api_utils.py 15 evals/__init__.py 14 evals/elsuite/error_recovery/scripts/dataset_creation.py 106 evals/elsuite/error_recovery/scripts/make_plots.py 446 evals/elsuite/error_recovery/defaults.py 12 evals/elsuite/error_recovery/eval.py 204 evals/elsuite/lambada.py 39 evals/elsuite/translate.py 66 evals/elsuite/sandbagging/scripts/sandbagging_subset_plots.py 46 evals/elsuite/sandbagging/scripts/consistency_plots.py 88 evals/elsuite/sandbagging/scripts/sandbagging_all_plots.py 27 evals/elsuite/sandbagging/scripts/utils.py 100 evals/elsuite/sandbagging/solvers.py 152 evals/elsuite/sandbagging/sandbagging_eval.py 68 evals/elsuite/sandbagging/mmlu_eval.py 60 evals/elsuite/sandbagging/defaults.py 12 evals/elsuite/sandbagging/utils.py 47 evals/elsuite/multistep_web_tasks/solvers/webarena_solvers/webarena_prompts.py 12 evals/elsuite/multistep_web_tasks/solvers/webarena_solvers/webarena_solvers.py 94 evals/elsuite/multistep_web_tasks/solvers/strong_solver/strong_solver.py 173 evals/elsuite/multistep_web_tasks/solvers/strong_solver/strong_prompts.py 4 evals/elsuite/multistep_web_tasks/session.py 416 evals/elsuite/multistep_web_tasks/constants.py 70 evals/elsuite/multistep_web_tasks/webarena/bash_browser_env/bash_browser_utils.py 11 evals/elsuite/multistep_web_tasks/webarena/bash_browser_env/bash_browser_env.py 89 evals/elsuite/multistep_web_tasks/webarena/bash_env/actions.py 17 evals/elsuite/multistep_web_tasks/webarena/bash_env/__init__.py 1 evals/elsuite/multistep_web_tasks/webarena/bash_env/bash_utils.py 15 evals/elsuite/multistep_web_tasks/webarena/bash_env/basic_bash_env.py 163 evals/elsuite/multistep_web_tasks/webarena/browser_env/processors.py 495 evals/elsuite/multistep_web_tasks/webarena/browser_env/actions.py 1014 evals/elsuite/multistep_web_tasks/webarena/browser_env/__init__.py 1 evals/elsuite/multistep_web_tasks/webarena/browser_env/browser_utils.py 78 evals/elsuite/multistep_web_tasks/webarena/browser_env/auto_login.py 100 evals/elsuite/multistep_web_tasks/webarena/browser_env/helper_functions.py 129 evals/elsuite/multistep_web_tasks/webarena/browser_env/env_config.py 29 evals/elsuite/multistep_web_tasks/webarena/browser_env/constants.py 282 evals/elsuite/multistep_web_tasks/webarena/browser_env/basic_browser_env.py 191 evals/elsuite/multistep_web_tasks/webarena/evaluation_harness/evaluators.py 273 evals/elsuite/multistep_web_tasks/webarena/evaluation_harness/__init__.py 1 evals/elsuite/multistep_web_tasks/webarena/evaluation_harness/helper_functions.py 110 evals/elsuite/multistep_web_tasks/webarena/core/playwright_api.py 279 evals/elsuite/multistep_web_tasks/webarena/core/env.py 75 evals/elsuite/multistep_web_tasks/webarena/core/utils.py 188 evals/elsuite/multistep_web_tasks/webarena/eval_run.py 277 evals/elsuite/multistep_web_tasks/webarena/task_description.py 2 evals/elsuite/multistep_web_tasks/eval.py 51 evals/elsuite/multistep_web_tasks/utils.py 33 evals/elsuite/multistep_web_tasks/reproducibility/run_environments.py 29 evals/elsuite/multistep_web_tasks/reproducibility/make_task_jsonl.py 38 evals/elsuite/multistep_web_tasks/reproducibility/make_plots.py 94 evals/elsuite/multistep_web_tasks/docker/flask-playwright/app.py 165 evals/elsuite/multistep_web_tasks/docker/homepage/templates/index.html 108 evals/elsuite/multistep_web_tasks/docker/homepage/templates/scratchpad.html 105 evals/elsuite/multistep_web_tasks/docker/homepage/templates/calculator.html 106 evals/elsuite/multistep_web_tasks/docker/homepage/app.py 16 evals/elsuite/theory_of_mind/scripts/data_generation.py 66 evals/elsuite/theory_of_mind/scripts/make_plots.py 90 evals/elsuite/multiple_choice.py 84 evals/elsuite/bluff/strategy_solver.py 88 evals/elsuite/bluff/scripts/make_plots.py 103 evals/elsuite/bluff/bluff/players.py 107 evals/elsuite/bluff/bluff/cards.py 206 evals/elsuite/bluff/bluff/__init__.py 1 evals/elsuite/bluff/bluff/game.py 48 evals/elsuite/bluff/bluff/round.py 37 evals/elsuite/bluff/bluff/task_description.py 1 evals/elsuite/bluff/solver_player.py 82 evals/elsuite/bluff/eval.py 164 evals/elsuite/bluff/prompts.py 6 evals/elsuite/make_me_pay/solvers/lm_con_artist_solver.py 64 evals/elsuite/make_me_pay/solvers/prompts.py 18 evals/elsuite/make_me_pay/scripts/make_plots.py 102 evals/elsuite/make_me_pay/makemepay.py 222 evals/elsuite/make_me_pay/eval.py 126 evals/elsuite/make_me_pay/utils.py 47 evals/elsuite/make_me_pay/task_description.py 57 evals/elsuite/self_prompting/solvers/custom_cot_solver.py 57 evals/elsuite/self_prompting/solvers/baselines.py 45 evals/elsuite/self_prompting/scripts/dataset/compile_data.py 67 evals/elsuite/self_prompting/scripts/dataset/eval_list.py 52 evals/elsuite/self_prompting/scripts/make_plots.py 118 evals/elsuite/self_prompting/eval.py 210 evals/elsuite/self_prompting/task_description.py 2 evals/elsuite/already_said_that/scripts/gen_data.py 50 evals/elsuite/already_said_that/scripts/make_plots.py 263 evals/elsuite/already_said_that/solvers.py 29 evals/elsuite/already_said_that/eval.py 130 evals/elsuite/already_said_that/prompts.py 1 evals/elsuite/already_said_that/utils.py 104 evals/elsuite/already_said_that/distractors.py 81 evals/elsuite/ballots/scripts/make_plots.py 233 evals/elsuite/ballots/eval.py 161 evals/elsuite/ballots/prompts.py 44 evals/elsuite/ballots/utils.py 118 evals/elsuite/basic/json_validator.py 41 evals/elsuite/basic/includes.py 48 evals/elsuite/basic/json_match.py 77 evals/elsuite/basic/match.py 57 evals/elsuite/basic/match_with_solvers.py 65 evals/elsuite/basic/fuzzy_match.py 49 evals/elsuite/cant_do_that_anymore/chess/notation.py 61 evals/elsuite/cant_do_that_anymore/chess/board.py 162 evals/elsuite/cant_do_that_anymore/chess/move_variants.py 113 evals/elsuite/cant_do_that_anymore/chess/pieces.py 203 evals/elsuite/cant_do_that_anymore/chess/utils.py 75 evals/elsuite/cant_do_that_anymore/scripts/dataset_creation.py 235 evals/elsuite/cant_do_that_anymore/scripts/diagonal_dataset_creation.py 216 evals/elsuite/cant_do_that_anymore/scripts/make_plots.py 100 evals/elsuite/cant_do_that_anymore/defaults.py 2 evals/elsuite/cant_do_that_anymore/eval.py 170 evals/elsuite/cant_do_that_anymore/utils.py 178 evals/elsuite/bugged_tools/tools.py 497 evals/elsuite/bugged_tools/scripts/plot_experiments.py 107 evals/elsuite/bugged_tools/bugged_tools.py 132 evals/elsuite/bugged_tools/eval.py 210 evals/elsuite/bugged_tools/utils.py 48 evals/elsuite/bugged_tools/task_description.py 9 evals/elsuite/identifying_variables/renderers/base.py 11 evals/elsuite/identifying_variables/renderers/__init__.py 10 evals/elsuite/identifying_variables/renderers/corrset.py 216 evals/elsuite/identifying_variables/renderers/templates.py 26 evals/elsuite/identifying_variables/renderers/tabular.py 125 evals/elsuite/identifying_variables/scripts/plotting_utils.py 128 evals/elsuite/identifying_variables/scripts/gen_data.py 319 evals/elsuite/identifying_variables/scripts/table_utils.py 36 evals/elsuite/identifying_variables/scripts/make_plots.py 325 evals/elsuite/identifying_variables/structs.py 18 evals/elsuite/identifying_variables/latent_funcs.py 30 evals/elsuite/identifying_variables/solvers.py 27 evals/elsuite/identifying_variables/constants.py 10 evals/elsuite/identifying_variables/metrics.py 54 evals/elsuite/identifying_variables/eval.py 227 evals/elsuite/identifying_variables/prompts.py 8 evals/elsuite/identifying_variables/utils.py 72 evals/elsuite/identifying_variables/graph_utils.py 130 evals/elsuite/steganography/scripts/dataset/custom_datasets.py 197 evals/elsuite/steganography/scripts/dataset/complexity_metrics.py 29 evals/elsuite/steganography/scripts/dataset/csv2jsonl.py 16 evals/elsuite/steganography/scripts/dataset/utils.py 29 evals/elsuite/steganography/scripts/dataset/dataset.py 106 evals/elsuite/steganography/scripts/make_plots.py 75 evals/elsuite/steganography/reconstruction_metrics.py 30 evals/elsuite/steganography/steganography.py 84 evals/elsuite/steganography/monitor.py 52 evals/elsuite/steganography/eval.py 69 evals/elsuite/steganography/prompts.py 55 evals/elsuite/solver_tools_convo.py 181 evals/elsuite/hr_ml_agent_bench/solvers/baseline.py 90 evals/elsuite/hr_ml_agent_bench/scripts/plot_experiments.py 307 evals/elsuite/hr_ml_agent_bench/scripts/run_experiments.py 60 evals/elsuite/hr_ml_agent_bench/actions.py 37 evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/scripts/prepare.py 22 evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/scripts/grade.py 26 evals/elsuite/hr_ml_agent_bench/benchmarks/spaceship_titanic/env/train.py 34 evals/elsuite/hr_ml_agent_bench/benchmarks/__init__.py 1 evals/elsuite/hr_ml_agent_bench/benchmarks/pong/scripts/grade.py 44 evals/elsuite/hr_ml_agent_bench/benchmarks/pong/env/train.py 28 evals/elsuite/hr_ml_agent_bench/benchmarks/pong/baselines/naive.py 28 evals/elsuite/hr_ml_agent_bench/benchmarks/pong/baselines/human.py 43 evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/scripts/grade.py 31 evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/env/train.py 27 evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/baselines/naive.py 27 evals/elsuite/hr_ml_agent_bench/benchmarks/bipedal_walker/baselines/human.py 57 evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/prepare.py 4 evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/scripts/grade.py 53 evals/elsuite/hr_ml_agent_bench/benchmarks/ogbn_arxiv/env/train.py 126 evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/scripts/prepare.py 98 evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/scripts/grade.py 22 evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/env/train.py 124 evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/scripts/human_baseline.py 83 evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/scripts/grade.py 64 evals/elsuite/hr_ml_agent_bench/benchmarks/vectorization/env/train.py 118 evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/scripts/prepare.py 20 evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/scripts/grade.py 33 evals/elsuite/hr_ml_agent_bench/benchmarks/house_price/env/train.py 41 evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/scripts/grade.py 28 evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/env/train.py 28 evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/baselines/naive.py 28 evals/elsuite/hr_ml_agent_bench/benchmarks/inverted_pendulum/baselines/human.py 31 evals/elsuite/hr_ml_agent_bench/benchmarks/imdb/scripts/grade.py 27 evals/elsuite/hr_ml_agent_bench/benchmarks/imdb/env/train.py 21 evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/scripts/grade.py 45 evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/env/train.py 28 evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/baselines/naive.py 28 evals/elsuite/hr_ml_agent_bench/benchmarks/pusher/baselines/human.py 29 evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/scripts/prepare.py 24 evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/scripts/grade.py 36 evals/elsuite/hr_ml_agent_bench/benchmarks/feedback/env/train.py 66 evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/scripts/grade.py 40 evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/env/train.py 27 evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/baselines/naive.py 27 evals/elsuite/hr_ml_agent_bench/benchmarks/cartpole/baselines/human.py 30 evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/scripts/grade.py 45 evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/env/train.py 28 evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/baselines/naive.py 28 evals/elsuite/hr_ml_agent_bench/benchmarks/humanoid/baselines/human.py 33 evals/elsuite/hr_ml_agent_bench/benchmarks/ant/scripts/grade.py 43 evals/elsuite/hr_ml_agent_bench/benchmarks/ant/env/train.py 28 evals/elsuite/hr_ml_agent_bench/benchmarks/ant/baselines/naive.py 28 evals/elsuite/hr_ml_agent_bench/benchmarks/ant/baselines/human.py 36 evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/prepare.py 5 evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/scripts/grade.py 35 evals/elsuite/hr_ml_agent_bench/benchmarks/cifar10/env/train.py 105 evals/elsuite/hr_ml_agent_bench/__init__.py 1 evals/elsuite/hr_ml_agent_bench/environment.py 283 evals/elsuite/hr_ml_agent_bench/high_level_actions.py 191 evals/elsuite/hr_ml_agent_bench/auto_marking.py 53 evals/elsuite/hr_ml_agent_bench/autoeval.py 172 evals/elsuite/hr_ml_agent_bench/schema.py 46 evals/elsuite/hr_ml_agent_bench/eval.py 91 evals/elsuite/hr_ml_agent_bench/prompts.py 26 evals/elsuite/hr_ml_agent_bench/utils.py 112 evals/elsuite/hr_ml_agent_bench/low_level_actions.py 304 evals/elsuite/hr_ml_agent_bench/prepare_task.py 40 evals/elsuite/make_me_say/autoeval.py 116 evals/elsuite/make_me_say/defaults.py 34 evals/elsuite/make_me_say/core.py 223 evals/elsuite/make_me_say/eval.py 48 evals/elsuite/make_me_say/utils.py 34 evals/elsuite/modelgraded/base.py 16 evals/elsuite/modelgraded/classify.py 97 evals/elsuite/modelgraded/classify_utils.py 145 evals/elsuite/incontext_rl/anti-cot_solver.py 29 evals/elsuite/incontext_rl/scripts/plot_experiments.py 233 evals/elsuite/incontext_rl/defaults.py 11 evals/elsuite/incontext_rl/eval.py 246 evals/elsuite/incontext_rl/env_setup.py 7 evals/elsuite/incontext_rl/baselines.py 93 evals/elsuite/track_the_stat/prompts/median.py 2 evals/elsuite/track_the_stat/prompts/__init__.py 11 evals/elsuite/track_the_stat/prompts/mode.py 2 evals/elsuite/track_the_stat/scripts/make_plots.py 235 evals/elsuite/track_the_stat/solvers.py 72 evals/elsuite/track_the_stat/eval.py 80 evals/elsuite/track_the_stat/utils.py 47 evals/elsuite/utils.py 150 evals/elsuite/skill_acquisition/scraping/scrape_distractor_articles.py 76 evals/elsuite/skill_acquisition/scraping/scrape_miskito.py 106 evals/elsuite/skill_acquisition/scripts/make_plots.py 157 evals/elsuite/skill_acquisition/solvers.py 13 evals/elsuite/skill_acquisition/eval.py 313 evals/elsuite/skill_acquisition/utils.py 115 evals/elsuite/skill_acquisition/task_description.py 1 evals/elsuite/mmmu/eval.py 159 evals/elsuite/schelling_point/eval.py 73 evals/elsuite/schelling_point/prompts.py 25 evals/elsuite/schelling_point/utils.py 61 evals/elsuite/text_compression/scripts/dataset/custom_datasets.py 119 evals/elsuite/text_compression/scripts/dataset/complexity_metrics.py 29 evals/elsuite/text_compression/scripts/dataset/csv2jsonl.py 16 evals/elsuite/text_compression/scripts/dataset/utils.py 29 evals/elsuite/text_compression/scripts/dataset/dataset.py 63 evals/elsuite/text_compression/scripts/make_plots.py 63 evals/elsuite/text_compression/reconstruction_metrics.py 30 evals/elsuite/text_compression/compression.py 83 evals/elsuite/text_compression/eval.py 52 evals/elsuite/text_compression/prompts.py 46 evals/elsuite/twenty_questions/scripts/make_plots.py 111 evals/elsuite/twenty_questions/eval.py 169 evals/elsuite/twenty_questions/utils.py 47 evals/elsuite/function_deduction/scripts/dataset/create_dataset.py 44 evals/elsuite/function_deduction/scripts/make_plots.py 195 evals/elsuite/function_deduction/solvers.py 140 evals/elsuite/function_deduction/eval.py 244 evals/elsuite/function_deduction/prompts.py 6 evals/elsuite/function_deduction/baselines.py 91 evals/data.py 148 evals/registry.py 242 evals/registry/solvers/track_the_stat.yaml 75 evals/registry/solvers/function_deduction.yaml 174 evals/registry/solvers/bluff.yaml 80 evals/registry/solvers/together.yaml 85 evals/registry/solvers/self_prompting.yaml 96 evals/registry/solvers/skill_acquisition.yaml 267 evals/registry/solvers/twenty_questions.yaml 75 evals/registry/solvers/cant_do_that_anymore.yaml 16 evals/registry/solvers/identifying_variables.yaml 4 evals/registry/solvers/anthropic.yaml 90 evals/registry/solvers/hr-ml-agent-bench.yaml 37 evals/registry/solvers/theory_of_mind.yaml 394 evals/registry/solvers/multistep_web_tasks.yaml 56 evals/registry/solvers/already_said_that.yaml 75 evals/registry/solvers/error_recovery.yaml 33 evals/registry/solvers/gemini.yaml 15 evals/registry/solvers/sandbagging.yaml 109 evals/registry/solvers/incontext_rl.yaml 24 evals/registry/solvers/defaults.yaml 294 evals/registry/solvers/make-me-pay.yaml 101 evals/registry/data/hindi_words/samples.jsonl 3 evals/registry/data/integer-sequence-predictions/obscure-sequences.jsonl 3 evals/registry/data/integer-sequence-predictions/misc-and-recent-sequences.jsonl 3 evals/registry/data/integer-sequence-predictions/samples.jsonl 3 evals/registry/data/integer-sequence-predictions/notable-sequences.jsonl 3 evals/registry/data/dutch-lexicon/samples.jsonl 3 evals/registry/data/chinese-lantern-riddles/samples.jsonl 3 evals/registry/data/reasoning_with_contradictory_statements/samples.jsonl 3 evals/registry/data/invert_word_wise/invert.jsonl 3 evals/registry/data/next-val-series/next-val-series.jsonl 3 evals/registry/data/sort_numeric/samples.jsonl 3 evals/registry/data/thirty_six_stratagems/samples.jsonl 3 evals/registry/data/korean_yaminjeongeum/samples.jsonl 3 evals/registry/data/shared_border/samples.jsonl 3 evals/registry/data/code_combination/samples.jsonl 3 evals/registry/data/non-compound-names/samples.jsonl 3 evals/registry/data/non-compound-names/samples_meta.jsonl 3 evals/registry/data/convert-hex-hsl-lightness/samples.jsonl 3 evals/registry/data/hebrew_plurals/samples.jsonl 3 evals/registry/data/japanese_mahjong_discard_tile/samples.jsonl 3 evals/registry/data/shape_in_shape/shape_in_shape.jsonl 3 evals/registry/data/icelandic-inflection-medium/samples.jsonl 3 evals/registry/data/error_recovery/medium.jsonl 3 evals/registry/data/error_recovery/small.jsonl 3 evals/registry/data/error_recovery/main.jsonl 3 evals/registry/data/decrypt_caesar_cipher/samples.jsonl 3 evals/registry/data/svg_alphabet/samples.jsonl 3 evals/registry/data/belarusian_word_analogy_inflection/samples.jsonl 3 evals/registry/data/medmcqa/convert.js 44 evals/registry/data/medmcqa/samples.jsonl 3 evals/registry/data/code_progress/samples.jsonl 3 evals/registry/data/finance_calc/samples.jsonl 3 evals/registry/data/recurrence-relation/samples.jsonl 3 evals/registry/data/korean-postposition/samples.jsonl 3 evals/registry/data/countries/samples.jsonl 3 evals/registry/data/relative_orientations/samples.jsonl 3 evals/registry/data/vigenere/samples.jsonl 3 evals/registry/data/dutch-rhymes/samples.jsonl 3 evals/registry/data/find-letter/samples.jsonl 3 evals/registry/data/rare-and-loanwords-dutch-lexicon/samples.jsonl 3 evals/registry/data/food/samples.jsonl 3 evals/registry/data/spanish-lexicon/samples.jsonl 3 evals/registry/data/unsolvable_questions/convert.js 51 evals/registry/data/unsolvable_questions/findFailures.js 43 evals/registry/data/unsolvable_questions/samples.jsonl 3 evals/registry/data/coq-editing/samples.jsonl 3 evals/registry/data/coq-editing/labeled-samples.jsonl 3 evals/registry/data/medication_dose/samples.jsonl 3 evals/registry/data/chinese_chu_ci/samples.jsonl 3 evals/registry/data/nepali_numerals/samples.jsonl 3 evals/registry/data/matrix_mult_rows/samples.jsonl 3 evals/registry/data/belarusian_russian_translation/samples.jsonl 3 evals/registry/data/russian-verse/samples.jsonl 3 evals/registry/data/number_reading/number_reading.jsonl 3 evals/registry/data/chinese_hard_translations/samples.jsonl 3 evals/registry/data/product-ie/zeroshot/product_ie_zero_shot_samples.jsonl 3 evals/registry/data/product-ie/fewshot/product_ie_one_shot_samples.jsonl 3 evals/registry/data/financial-derivatives/questions.jsonl 3 evals/registry/data/map-electronic-component-part-to-fact/samples.jsonl 3 evals/registry/data/game_theory/samples.jsonl 3 evals/registry/data/german-part-of-speech/buildDataDe.py 69 evals/registry/data/german-part-of-speech/samples.jsonl 3 evals/registry/data/german-part-of-speech/parsePosDe.py 172 evals/registry/data/turkish-exams-qa/samples.jsonl 3 evals/registry/data/probability_questions/probability_questions.jsonl 3 evals/registry/data/macedonian-exams-qa/samples.jsonl 3 evals/registry/data/bias_detection/samples.jsonl 3 evals/registry/data/sandbagging/samples-all-large.jsonl 3 evals/registry/data/sandbagging/samples-ast.jsonl 3 evals/registry/data/sandbagging/samples-non-ast.jsonl 3 evals/registry/data/sandbagging/samples-all.jsonl 3 evals/registry/data/ascii-digit-recognition/samples.jsonl 3 evals/registry/data/spanish-exams-qa/samples.jsonl 3 evals/registry/data/complex-analogies-en-ru/samples.jsonl 3 evals/registry/data/brazilian_laws/samples.jsonl 3 evals/registry/data/find-thirukkural/samples.jsonl 3 evals/registry/data/italian_big_math_expression/samples.jsonl 3 evals/registry/data/tetris/tetris.jsonl 3 evals/registry/data/raven-matrices/symbolic/distribute_nine.jsonl 3 evals/registry/data/raven-matrices/symbolic/center_single.jsonl 3 evals/registry/data/raven-matrices/symbolic/up_center_single_down_center_single.jsonl 3 evals/registry/data/raven-matrices/symbolic/in_distribute_four_out_center_single.jsonl 3 evals/registry/data/raven-matrices/symbolic/distribute_four.jsonl 3 evals/registry/data/raven-matrices/symbolic/left_center_single_right_center_single.jsonl 3 evals/registry/data/raven-matrices/symbolic/in_center_single_out_center_single.jsonl 3 evals/registry/data/raven-matrices/text/distribute_nine.jsonl 3 evals/registry/data/raven-matrices/text/center_single.jsonl 3 evals/registry/data/raven-matrices/text/up_center_single_down_center_single.jsonl 3 evals/registry/data/raven-matrices/text/in_distribute_four_out_center_single.jsonl 3 evals/registry/data/raven-matrices/text/distribute_four.jsonl 3 evals/registry/data/raven-matrices/text/left_center_single_right_center_single.jsonl 3 evals/registry/data/raven-matrices/text/in_center_single_out_center_single.jsonl 3 evals/registry/data/raven-matrices/symbolic-open/distribute_nine.jsonl 3 evals/registry/data/raven-matrices/symbolic-open/center_single.jsonl 3 evals/registry/data/raven-matrices/symbolic-open/up_center_single_down_center_single.jsonl 3 evals/registry/data/raven-matrices/symbolic-open/in_distribute_four_out_center_single.jsonl 3 evals/registry/data/raven-matrices/symbolic-open/distribute_four.jsonl 3 evals/registry/data/raven-matrices/symbolic-open/left_center_single_right_center_single.jsonl 3 evals/registry/data/raven-matrices/symbolic-open/in_center_single_out_center_single.jsonl 3 evals/registry/data/raven-matrices/text-open/distribute_nine.jsonl 3 evals/registry/data/raven-matrices/text-open/center_single.jsonl 3 evals/registry/data/raven-matrices/text-open/up_center_single_down_center_single.jsonl 3 evals/registry/data/raven-matrices/text-open/in_distribute_four_out_center_single.jsonl 3 evals/registry/data/raven-matrices/text-open/distribute_four.jsonl 3 evals/registry/data/raven-matrices/text-open/left_center_single_right_center_single.jsonl 3 evals/registry/data/raven-matrices/text-open/in_center_single_out_center_single.jsonl 3 evals/registry/data/arithmetical_puzzles/arithmetical_puzzles.jsonl 3 evals/registry/data/hebrew_homophones/samples.jsonl 3 evals/registry/data/music_theory_scale_modes/samples.jsonl 3 evals/registry/data/superficialpatterns/samples.jsonl 3 evals/registry/data/beam_analysis/beam-analysis.jsonl 3 evals/registry/data/chess/match.jsonl 3 evals/registry/data/dna_melting_calculation/samples.jsonl 3 evals/registry/data/rhetorical_devices/samples.jsonl 3 evals/registry/data/stock_options/stock_options_iron_condor_spread.jsonl 3 evals/registry/data/stock_options/stock_options_bear_call_spread.jsonl 3 evals/registry/data/stock_options/stock_options_inverse_iron_condor_spread.jsonl 3 evals/registry/data/stock_options/stock_option_terms_iron_condor_spread.jsonl 3 evals/registry/data/stock_options/stock_option_terms_bull_call_spread.jsonl 3 evals/registry/data/stock_options/stock_options_bull_call_spread.jsonl 3 evals/registry/data/stock_options/stock_option_terms_iron_butterfly_spread.jsonl 3 evals/registry/data/stock_options/stock_options_inverse_iron_butterfly_spread.jsonl 3 evals/registry/data/stock_options/stock_options_iron_butterfly_spread.jsonl 3 evals/registry/data/stock_options/stock_option_terms_inverse_iron_condor_spread.jsonl 3 evals/registry/data/stock_options/stock_option_terms_inverse_iron_butterfly_spread.jsonl 3 evals/registry/data/stock_options/stock_option_terms_bear_call_spread.jsonl 3 evals/registry/data/korean-consonant-vowel-combination/samples.jsonl 3 evals/registry/data/abstract-causal-reasoning/symbolic_samples.jsonl 3 evals/registry/data/abstract-causal-reasoning/text_samples.jsonl 3 evals/registry/data/chinese_poem/samples.jsonl 3 evals/registry/data/sql/co_sql.jsonl 3 evals/registry/data/sql/spider_sql.jsonl 3 evals/registry/data/chess_piece_count/fuzzy_match.jsonl 3 evals/registry/data/chinese_homophonic/chinese_homophonic.jsonl 3 evals/registry/data/lunar_calendar/iso_to_lunar_calendar.jsonl 3 evals/registry/data/lunar_calendar/lunar_calendar_to_iso.jsonl 3 evals/registry/data/directions/samples.jsonl 3 evals/registry/data/pararule-plus-multi-step-deductive-reasoning/pararule-plus-multi-step-deductive-reasoning.jsonl 3 evals/registry/data/numbers_game/samples.jsonl 3 evals/registry/data/seating_arrangements/samples.jsonl 3 evals/registry/data/ascii_wordart/ascii_wordart.jsonl 3 evals/registry/data/job_listing_title_for_a_caregiver_in_japan/samples.jsonl 3 evals/registry/data/rubiks-colors/samples.jsonl 3 evals/registry/data/chinese_zodiac/samples.jsonl 3 evals/registry/data/poker_hand_ranks/full_samples.jsonl 3 evals/registry/data/polish-lexicon/samples.jsonl 3 evals/registry/data/bulgarian-lexicon/samples.jsonl 3 evals/registry/data/sindarin_fluency/sindarin_nouns.jsonl 3 evals/registry/data/GPT-model-text-detection/samples.jsonl 3 evals/registry/data/backgammon/generate_samples.ipynb 1349 evals/registry/data/backgammon/backgammon-can-hit.jsonl 3 evals/registry/data/backgammon/backgammon-illegal-move.jsonl 3 evals/registry/data/italian-new-words/samples.jsonl 3 evals/registry/data/belarusian_grammar/samples.jsonl 3 evals/registry/data/ru_rhyming_phrases/samples.jsonl 3 evals/registry/data/brazilian-lexicon/samples.jsonl 3 evals/registry/data/internal_representations/samples.jsonl 3 evals/registry/data/polish-exams-qa/samples.jsonl 3 evals/registry/data/logic-riddles/samples.jsonl 3 evals/registry/data/swedish_sat/samples.jsonl 3 evals/registry/data/reverse_string/reverse_string.jsonl 3 evals/registry/data/cube-pack/samples.jsonl 3 evals/registry/data/vietnamese-exams-qa/samples.jsonl 3 evals/registry/data/theory_of_mind/hitom/hitom-multiple-choice.jsonl 3 evals/registry/data/theory_of_mind/hitom/hitom.jsonl 3 evals/registry/data/russian-english-homonym-context-resolution/samples.jsonl 3 evals/registry/data/italian-exams-qa/samples.jsonl 3 evals/registry/data/aime_evaluation/samples.jsonl 3 evals/registry/data/viewport_to_grid_size/samples.jsonl 3 evals/registry/data/math_logic_operations/samples.jsonl 3 evals/registry/data/portuguese-syllable-count/samples.jsonl 3 evals/registry/data/numerical-cabbala-casanova/samples.jsonl 3 evals/registry/data/compare-countries-area/samples.jsonl 3 evals/registry/data/algebra_word_problems/samples.jsonl 3 evals/registry/data/reverse-polish-notation/questions.jsonl 3 evals/registry/data/date-calculator/samples.jsonl 3 evals/registry/data/mazes/3x3-mazes.jsonl 3 evals/registry/data/mazes/nxn_maze_eval_generator.py 162 evals/registry/data/mazes/4x4-mazes.jsonl 3 evals/registry/data/mazes/4x4-mazes-singlemove.jsonl 3 evals/registry/data/mazes/3x3-mazes-singlemove.jsonl 3 evals/registry/data/mazes/10x10-mazes-singlemove.jsonl 3 evals/registry/data/mazes/10x10-mazes.jsonl 3 evals/registry/data/Japanese_onomatopoeia/samples.jsonl 3 evals/registry/data/swedish-spelling/samples.jsonl 3 evals/registry/data/balance_chemical_equation/samples.jsonl 3 evals/registry/data/which_is_heavier/which_is_heavier.jsonl 3 evals/registry/data/mapping_to_matricies/data_generator.py 30 evals/registry/data/mapping_to_matricies/samples.jsonl 3 evals/registry/data/hindi_upsc/samples.jsonl 3 evals/registry/data/math_equations/math_equations.jsonl 3 evals/registry/data/tracking-shuffled-objects/samples.jsonl 3 evals/registry/data/chinese_homonym/samples.jsonl 3 evals/registry/data/heart-disease/samples.jsonl 3 evals/registry/data/japanese_driving_license/samples.jsonl 3 evals/registry/data/osm_mapping/osm_mapping_one_way.jsonl 3 evals/registry/data/cricket_situations/samples.jsonl 3 evals/registry/data/premature-conclusions/samples.jsonl 3 evals/registry/data/dhammapada-reference/samples.jsonl 3 evals/registry/data/singapore_data_protection_decisions/samples.jsonl 3 evals/registry/data/comprehensive-graph-reasoning/samples.jsonl 3 evals/registry/data/bulgarian-exams-qa/samples.jsonl 3 evals/registry/data/portuguese-exams-qa/samples.jsonl 3 evals/registry/data/ral_to_hex/samples.jsonl 3 evals/registry/data/ner_finance/samples.jsonl 3 evals/registry/data/german-exams-qa/samples.jsonl 3 evals/registry/data/formal_logic/formal_logic_expressions.jsonl 3 evals/registry/data/wkt_understanding/samples.jsonl 3 evals/registry/data/hebrew_talmud_suka/samples.jsonl 3 evals/registry/data/japanese_approval/samples.jsonl 3 evals/registry/data/infiniteloop-match/infiniteloop-match.jsonl 3 evals/registry/data/manga-translation/pages.jsonl 3 evals/registry/data/manga-translation/panels.jsonl 3 evals/registry/data/manga-translation/bubbles.jsonl 3 evals/registry/data/points_on_line/eval_generator.py 73 evals/registry/data/points_on_line/points_on_line.jsonl 3 evals/registry/data/2d_movement/samples.jsonl 3 evals/registry/data/russian_sarcasm/samples.jsonl 3 evals/registry/data/abstract2title/samples.jsonl 3 evals/registry/data/norwegian-rhymes/samples.jsonl 3 evals/registry/data/self_prompting/samples.jsonl 3 evals/registry/data/korean-honorific/samples.jsonl 3 evals/registry/data/path_enclosed_area/samples.jsonl 3 evals/registry/data/imperial_date_to_string/samples.jsonl 3 evals/registry/data/russian-lexicon/samples.jsonl 3 evals/registry/data/nepali-song-singer/nepali-song-singer.jsonl 3 evals/registry/data/hard_russian_computer_science_tasks/samples.jsonl 3 evals/registry/data/turkish_characters/samples.jsonl 3 evals/registry/data/urdu-lexicon/samples.jsonl 3 evals/registry/data/poker_analysis/samples.jsonl 3 evals/registry/data/poker_analysis/poker_analysis_sample_generator.py 95 evals/registry/data/determinant/samples.jsonl 3 evals/registry/data/vintage_phone_keyboard_decode/samples.jsonl 3 evals/registry/data/day-of-week-from-date/samples.jsonl 3 evals/registry/data/diagrammatic_logic/samples.jsonl 3 evals/registry/data/3d_object_manipulation/samples.jsonl 3 evals/registry/data/anagrams/samples.jsonl 3 evals/registry/data/anagrams/fewshot.jsonl 3 evals/registry/data/logic-statements/logic-statements.jsonl 3 evals/registry/data/nutrition/facts.jsonl 3 evals/registry/data/rock-climbing/samples.jsonl 3 evals/registry/data/iambic-pentameter/samples.jsonl 3 evals/registry/data/finnish-rhyme/samples.jsonl 3 evals/registry/data/multi-step-equations/samples.jsonl 3 evals/registry/data/arabic-literature-qa/samples.jsonl 3 evals/registry/data/belarusian_orthography/samples.jsonl 3 evals/registry/data/already_said_that/500_100.jsonl 3 evals/registry/data/irish_plural_nouns/samples.jsonl 3 evals/registry/data/convert-bwt-num-and-chinese-num/n_to_c_upper_samples_few_shot.jsonl 3 evals/registry/data/convert-bwt-num-and-chinese-num/n_to_c_lower_samples_few_shot.jsonl 3 evals/registry/data/convert-bwt-num-and-chinese-num/c_lower_to_n_samples_few_shot.jsonl 3 evals/registry/data/convert-bwt-num-and-chinese-num/c_upper_to_n_samples_few_shot.jsonl 3 evals/registry/data/belarusian_antonyms/samples.jsonl 3 evals/registry/data/Chinese_character_riddles/samples.jsonl 3 evals/registry/data/cardinal-directions/samples.jsonl 3 evals/registry/data/hindi_shuddha/samples.jsonl 3 evals/registry/data/benjaminmoore_to_hex/samples.jsonl 3 evals/registry/data/romanian_homonyms/samples.jsonl 3 evals/registry/data/korean-phonetics/samples.jsonl 3 evals/registry/data/spanish_feminine_noun_masculine_article/samples.jsonl 3 evals/registry/data/pattern_identification/samples.v0.jsonl 3 evals/registry/data/ballots/samples25.jsonl 3 evals/registry/data/ballots/samples02.jsonl 3 evals/registry/data/illinois-law/samples.jsonl 3 evals/registry/data/coqa/match.jsonl 3 evals/registry/data/coqa/samples.jsonl 3 evals/registry/data/squares-gpt/square-samples.jsonl 3 evals/registry/data/french_homonym_and_homograph/samples.jsonl 3 evals/registry/data/word_association/related_words_5.jsonl 3 evals/registry/data/word_association/related_words_4.jsonl 3 evals/registry/data/word_association/corpus_tools/sample_generators.py 154 evals/registry/data/word_association/corpus_tools/logger_config.py 8 evals/registry/data/word_association/corpus_tools/corpus.py 58 evals/registry/data/word_association/corpus_tools/__init__.py 1 evals/registry/data/word_association/corpus_tools/validators.py 151 evals/registry/data/word_association/corpus_tools/pipelines.py 14 evals/registry/data/word_association/corpus_tools/related_words.py 64 evals/registry/data/word_association/corpus_tools/processor.py 36 evals/registry/data/word_association/related_words_2.jsonl 3 evals/registry/data/word_association/related_words_3.jsonl 3 evals/registry/data/canto_wu_pronunciation/samples_zero.jsonl 3 evals/registry/data/canto_wu_pronunciation/samples_few.jsonl 3 evals/registry/data/canto_wu_pronunciation/csv_to_json.py 55 evals/registry/data/simple-charting/samples.jsonl 3 evals/registry/data/music_theory/music_theory_chord_notes.jsonl 3 evals/registry/data/music_theory/music_theory_chord_names.jsonl 3 evals/registry/data/css-selectors/verbal.jsonl 3 evals/registry/data/css-selectors/explain.jsonl 3 evals/registry/data/body_movement/body_movement.jsonl 3 evals/registry/data/logical_reasoning_letter_series_test/samples.jsonl 3 evals/registry/data/naughty_strings/samples.jsonl 3 evals/registry/data/naughty_strings/security.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_comparison.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_fluency_style.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_prep.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_fluency_other.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_partvoice.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_number.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_aspect.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_tense.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_ungrammaticalstructure.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_case.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_fluency_repetition.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_verbvoice.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_gender.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_other.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_verbaform.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_fluency_calque.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_grammar_conjunction.jsonl 3 evals/registry/data/ukraine_gec/ukraine_gec_fluency_poorflow.jsonl 3 evals/registry/data/portuguese-kinship-riddles/samples.jsonl 3 evals/registry/data/korean_date_counting/samples.jsonl 3 evals/registry/data/belarusian_lexicon/samples.jsonl 3 evals/registry/data/atpl_exams/samples.jsonl 3 evals/registry/data/pure_korean/samples.jsonl 3 evals/registry/data/french-lexicon/samples.jsonl 3 evals/registry/data/color_theory/complementary.jsonl 3 evals/registry/data/consensus_summary/samples.jsonl 3 evals/registry/data/greek_nt_manuscripts/codes-sigla-centuries.jsonl 3 evals/registry/data/banking77/samples.jsonl 3 evals/registry/data/simple-knowledge-mongolian/samples.v0.jsonl 3 evals/registry/data/japanese-national-medical-exam02/japanese-national-medical-exam02.jsonl 3 evals/registry/data/chinese_shi_jing/samples.jsonl 3 evals/registry/data/human-safety/human-safety.jsonl 3 evals/registry/data/chinese-remainder-theorem/samples.jsonl 3 evals/registry/data/cant_do_that_anymore/diagonal_moves_dataset.jsonl 3 evals/registry/data/cant_do_that_anymore/special_moves_dataset.jsonl 3 evals/registry/data/cant_do_that_anymore/gpt-4-0314_dataset.jsonl 3 evals/registry/data/cant_do_that_anymore/gpt-4-0125-preview_dataset.jsonl 3 evals/registry/data/cant_do_that_anymore/gpt-3.5-turbo-instruct_dataset.jsonl 3 evals/registry/data/cant_do_that_anymore/gpt-3.5-turbo-0125_dataset.jsonl 3 evals/registry/data/security_guide/samples.jsonl 3 evals/registry/data/taxes/samples.jsonl 3 evals/registry/data/guess_the_singer/samples.jsonl 3 evals/registry/data/gpt-protocol-buffers/samples.jsonl 3 evals/registry/data/bugged_tools/main_small.jsonl 3 evals/registry/data/bugged_tools/main.jsonl 3 evals/registry/data/logic_and_probability/logic_and_probability.jsonl 3 evals/registry/data/building_floorplan/samples.jsonl 3 evals/registry/data/norwegian-lexicon/samples.jsonl 3 evals/registry/data/identifying_variables/balanced_ctrl_vars.jsonl 3 evals/registry/data/identifying_variables/balanced_hypotheses.jsonl 3 evals/registry/data/jee-math/samples.jsonl 3 evals/registry/data/japanese-itpassport-exam01/japanese-itpassport-exam01.jsonl 3 evals/registry/data/logic-container/samples.jsonl 3 evals/registry/data/categorize_with_distractors/samples.jsonl 3 evals/registry/data/interlingual-homograph/samples.jsonl 3 evals/registry/data/connect4/samples.jsonl 3 evals/registry/data/korean_foreign_words/samples.jsonl 3 evals/registry/data/alternate_numeral_systems/samples.jsonl 3 evals/registry/data/track_objects/samples.jsonl 3 evals/registry/data/chinese_ancient_poetry/samples.jsonl 3 evals/registry/data/count_intersections_polynomial/samples.jsonl 3 evals/registry/data/simple_math/simple_math.jsonl 3 evals/registry/data/russian_medical/samples.jsonl 3 evals/registry/data/japanese-national-medical-exam01/japanese-national-medical-exam01.jsonl 3 evals/registry/data/hebrew_bible/samples.jsonl 3 evals/registry/data/context-free-grammar/samples.jsonl 3 evals/registry/data/largest_country/samples.jsonl 3 evals/registry/data/simple-visual-understanding/simple-visual-understanding.jsonl 3 evals/registry/data/portuguese-sarcasm/samples.jsonl 3 evals/registry/data/sarcasm/few_shot.jsonl 3 evals/registry/data/sarcasm/samples.jsonl 3 evals/registry/data/svg_understanding/samples.jsonl 3 evals/registry/data/chinese_modern_poem_identification/samples.jsonl 3 evals/registry/data/belarusian_rhyme/samples.jsonl 3 evals/registry/data/asl-classifiers/samples.jsonl 3 evals/registry/data/french-part-of-speech/samples.jsonl 3 evals/registry/data/croatian-exams-qa/samples.jsonl 3 evals/registry/data/afrikaans-lexicon/samples.jsonl 3 evals/registry/data/regex-match/samples.jsonl 3 evals/registry/data/smiles_to_formula/samples.jsonl 3 evals/registry/data/steganography/samples.jsonl 3 evals/registry/data/hebrew_grammar/samples.jsonl 3 evals/registry/data/corr2cause/corr2cause.jsonl 3 evals/registry/data/linear-regression/samples.jsonl 3 evals/registry/data/linear-regression/labeled-samples.jsonl 3 evals/registry/data/research-question-extraction/research-question-extraction-samples.jsonl 3 evals/registry/data/actors-sequence/samples.jsonl 3 evals/registry/data/logical-black-scholes/samples.jsonl 3 evals/registry/data/formal-grammar-to-regex/formal-grammar-to-regex.jsonl 3 evals/registry/data/solve-for-variable/tools/solve.py 231 evals/registry/data/solve-for-variable/tools/tester.py 94 evals/registry/data/solve-for-variable/tools/template.jsonl 3 evals/registry/data/solve-for-variable/tools/main.py 65 evals/registry/data/solve-for-variable/tools/problem.py 90 evals/registry/data/solve-for-variable/samples.jsonl 3 evals/registry/data/qa/q_and_a.jsonl 3 evals/registry/data/monthly_metric_comparison/samples.jsonl 3 evals/registry/data/find_country_from_svg/samples.jsonl 3 evals/registry/data/islands/japanese_remote_island_to_prefecture.jsonl 3 evals/registry/data/csharp-linq/questions.jsonl 3 evals/registry/data/japanese_populer_video_game_title_and_the_publisher/samples.jsonl 3 evals/registry/data/polish-syllable-count/samples.jsonl 3 evals/registry/data/bigrams/samples.jsonl 3 evals/registry/data/iqbal-poetry-translation/samples.jsonl 3 evals/registry/data/iqbal-poetry-translation/labeled-samples.jsonl 3 evals/registry/data/number_series_test/samples.jsonl 3 evals/registry/data/unwanted-rhyming/samples.jsonl 3 evals/registry/data/syntax-check/samples.jsonl 3 evals/registry/data/korean_spelling/samples.jsonl 3 evals/registry/data/syllables_long_words/long_word_samples.jsonl 3 evals/registry/data/chinese_famous_novel/samples.jsonl 3 evals/registry/data/resource_id_extraction/samples.jsonl 3 evals/registry/data/korean_romanization/samples.jsonl 3 evals/registry/data/crepe/samples.jsonl 3 evals/registry/data/japanese_city_name_pronunciation/samples.jsonl 3 evals/registry/data/hr_ml_agent_bench/bipedal-walker.jsonl 3 evals/registry/data/hr_ml_agent_bench/spaceship_titanic/spaceship-titanic.jsonl 3 evals/registry/data/hr_ml_agent_bench/pong/gpu.jsonl 3 evals/registry/data/hr_ml_agent_bench/pong/cpu.jsonl 3 evals/registry/data/hr_ml_agent_bench/inverted-pendulum.jsonl 3 evals/registry/data/hr_ml_agent_bench/ogbn_arxiv/ogbn-arxiv.jsonl 3 evals/registry/data/hr_ml_agent_bench/parkinsons_disease/dataset/public_timeseries_testing_util.py 48 evals/registry/data/hr_ml_agent_bench/parkinsons_disease/parkinsons-disease.jsonl 3 evals/registry/data/hr_ml_agent_bench/imdb.jsonl 3 evals/registry/data/hr_ml_agent_bench/house_price/house-price.jsonl 3 evals/registry/data/hr_ml_agent_bench/pusher.jsonl 3 evals/registry/data/hr_ml_agent_bench/cartpole.jsonl 3 evals/registry/data/hr_ml_agent_bench/vectorization.jsonl 3 evals/registry/data/hr_ml_agent_bench/feedback/feedback.jsonl 3 evals/registry/data/hr_ml_agent_bench/humanoid/gpu.jsonl 3 evals/registry/data/hr_ml_agent_bench/humanoid/cpu.jsonl 3 evals/registry/data/hr_ml_agent_bench/ant/gpu.jsonl 3 evals/registry/data/hr_ml_agent_bench/ant/cpu.jsonl 3 evals/registry/data/hr_ml_agent_bench/cifar10.jsonl 3 evals/registry/data/european_date_format_challenge/samples.jsonl 3 evals/registry/data/complex_replace_characters/samples.jsonl 3 evals/registry/data/dice-rotation-sequence/samples.jsonl 3 evals/registry/data/count_token_freq_dna/samples.jsonl 3 evals/registry/data/sexagenary_cycle_calculation/samples.jsonl 3 evals/registry/data/utah_real_estate/samples.jsonl 3 evals/registry/data/direct-speech-tag/samples.jsonl 3 evals/registry/data/make_me_say/medium.jsonl 3 evals/registry/data/make_me_say/very-hard.jsonl 3 evals/registry/data/make_me_say/medium-and-hard.jsonl 3 evals/registry/data/make_me_say/easy.jsonl 3 evals/registry/data/make_me_say/hard.jsonl 3 evals/registry/data/music-theory/triads-few-shot.jsonl 3 evals/registry/data/music-theory/tetrads-samples.jsonl 3 evals/registry/data/music-theory/tetrads-few-shot.jsonl 3 evals/registry/data/music-theory/triads-samples.jsonl 3 evals/registry/data/fcc_amateur_extra/samples.jsonl 3 evals/registry/data/polish-proverbs/samples.jsonl 3 evals/registry/data/adultery-state-laws/samples.jsonl 3 evals/registry/data/mandaliof-table/samples.jsonl 3 evals/registry/data/emoji_riddle/fuzzy_match.jsonl 3 evals/registry/data/tricky-word-problems/samples.jsonl 3 evals/registry/data/passing-balls/passing-balls.jsonl 3 evals/registry/data/tokyo-station-number/samples.jsonl 3 evals/registry/data/reverse-shell/samples.jsonl 3 evals/registry/data/first-letters/samples.jsonl 3 evals/registry/data/crontab/samples.jsonl 3 evals/registry/data/base64_decode/base64_decode.jsonl 3 evals/registry/data/json_patch_object/samples.jsonl 3 evals/registry/data/linear_equations/samples.jsonl 3 evals/registry/data/japanese_prime_minister/samples.jsonl 3 evals/registry/data/product-matching/rules/samples.jsonl 3 evals/registry/data/product-matching/zeroshot/samples.jsonl 3 evals/registry/data/product-matching/fewshot/samples.jsonl 3 evals/registry/data/urdu-transliteration/samples.jsonl 3 evals/registry/data/isosceles-right-triangle/samples.jsonl 3 evals/registry/data/math_polish/samples.jsonl 3 evals/registry/data/indonesian_numbers/indonesian_numbers.jsonl 3 evals/registry/data/3d_globe_movement/samples.jsonl 3 evals/registry/data/cissp-study-questions/few_shot.jsonl 3 evals/registry/data/cissp-study-questions/samples.jsonl 3 evals/registry/data/override-system-instruction/samples.jsonl 3 evals/registry/data/loss_logic/samples.jsonl 3 evals/registry/data/korean_spaces/samples.jsonl 3 evals/registry/data/south-african-bands/south-african-bands.jsonl 3 evals/registry/data/icelandic-inflection-hard/samples.jsonl 3 evals/registry/data/utility_price_parsing/samples.jsonl 3 evals/registry/data/incontext_rl/samples.jsonl 3 evals/registry/data/incontext_rl/samples_gymnasium_only.jsonl 3 evals/registry/data/incontext_rl/samples_dev.jsonl 3 evals/registry/data/ab/samples.jsonl 3 evals/registry/data/moral_exceptQA/samples.jsonl 3 evals/registry/data/icelandic-sentences-gec/samples.jsonl 3 evals/registry/data/unified_patch/samples.jsonl 3 evals/registry/data/hebrew_same_noun_gender/samples.jsonl 3 evals/registry/data/finger-tracking/samples.jsonl 3 evals/registry/data/russian-rhyme/samples.jsonl 3 evals/registry/data/blackfoot-numerals-modern/samples.jsonl 3 evals/registry/data/soc_codes/samples.jsonl 3 evals/registry/data/counterfactual-reasoning/counterfactual_reasoning_samples.jsonl 3 evals/registry/data/geometry_puzzle/samples.jsonl 3 evals/registry/data/shopping_discount_comparison/samples.jsonl 3 evals/registry/data/logic-liar-paradox/samples.jsonl 3 evals/registry/data/pointer-value-retrieval/hard_many_examples.jsonl 3 evals/registry/data/pointer-value-retrieval/medium_many_examples.jsonl 3 evals/registry/data/pointer-value-retrieval/easy_many_examples.jsonl 3 evals/registry/data/pointer-value-retrieval/hard_few_examples.jsonl 3 evals/registry/data/pointer-value-retrieval/easy_few_examples.jsonl 3 evals/registry/data/pointer-value-retrieval/medium_few_examples.jsonl 3 evals/registry/data/korean_dialects/samples.jsonl 3 evals/registry/data/chinese_tang_poetries/sample.jsonl 3 evals/registry/data/word_vector_over_reliance/word_vector_over_reliance_samples.jsonl 3 evals/registry/data/astro_eval/samples.jsonl 3 evals/registry/data/italian_rhyme/samples.jsonl 3 evals/registry/data/simple-block-puzzles/block-puzzles.v1.jsonl 3 evals/registry/data/arc/samples.jsonl 3 evals/registry/data/serbian-exams-qa/samples.jsonl 3 evals/registry/data/rot13/rot13.jsonl 3 evals/registry/data/split_chinese_characters/samples.jsonl 3 evals/registry/data/prompt-injection/samples.jsonl 3 evals/registry/data/logic/samples.jsonl 3 evals/registry/data/simple_physics_engine/solver.py 90 evals/registry/data/simple_physics_engine/samples.jsonl 3 evals/registry/data/simple_physics_engine/samples_generator.py 59 evals/registry/data/simple_physics_engine/wave_function_collapse.py 157 evals/registry/data/ambiguous-sentences/samples.jsonl 3 evals/registry/data/emotional-intelligence/samples.jsonl 3 evals/registry/data/aba_mrpc_true_false/samples.jsonl 3 evals/registry/data/tempo_to_measure_count/samples.jsonl 3 evals/registry/data/greek_vocabulary/samples.jsonl 3 evals/registry/data/multistep-web-tasks/all_tasks.jsonl 3 evals/registry/data/multistep-web-tasks/task_7.jsonl 3 evals/registry/data/multistep-web-tasks/task_5.jsonl 3 evals/registry/data/multistep-web-tasks/task_4.jsonl 3 evals/registry/data/multistep-web-tasks/simple.jsonl 3 evals/registry/data/multistep-web-tasks/task_3.jsonl 3 evals/registry/data/multistep-web-tasks/medium_tasks.jsonl 3 evals/registry/data/multistep-web-tasks/task_1.jsonl 3 evals/registry/data/multistep-web-tasks/task_9.jsonl 3 evals/registry/data/multistep-web-tasks/easy_tasks.jsonl 3 evals/registry/data/multistep-web-tasks/task_8.jsonl 3 evals/registry/data/multistep-web-tasks/task_2.jsonl 3 evals/registry/data/multistep-web-tasks/task_6.jsonl 3 evals/registry/data/multistep-web-tasks/hard_tasks.jsonl 3 evals/registry/data/reasoning/samples.jsonl 3 evals/registry/data/nfl-point-combinations/samples.jsonl 3 evals/registry/data/nfl-point-combinations/combinations_generator.py 25 evals/registry/data/proofreader/samples.jsonl 3 evals/registry/data/multistep-word-problems/samples.jsonl 3 evals/registry/data/belarusian_numerals/samples.jsonl 3 evals/registry/data/list_comparison_missing_name/samples.jsonl 3 evals/registry/data/invoice_due_date_leap_day_adjustment/samples.jsonl 3 evals/registry/data/irrelevant-negative-diversion/irrelevant-negative-diversion.jsonl 3 evals/registry/data/ukraine_eit/samples.jsonl 3 evals/registry/data/born_first/born_first.jsonl 3 evals/registry/data/forth_stack_sim/detailed_samples.jsonl 3 evals/registry/data/forth_stack_sim/basic_samples.jsonl 3 evals/registry/data/forth_stack_sim/samples.jsonl 3 evals/registry/data/logic-grid/logic-grid.jsonl 3 evals/registry/data/math-derivatives/questions.jsonl 3 evals/registry/data/russian-nlp-tasks/samples.jsonl 3 evals/registry/data/chinese_ancient_masterpieces_dynasty/samples.jsonl 3 evals/registry/data/mate-in-one/samples.jsonl 3 evals/registry/data/belarusian_syllable_count/samples.jsonl 3 evals/registry/data/knot-theory/knot-theory-unknotting-numbers.jsonl 3 evals/registry/data/knot-theory/knot-theory-unknotting-problems.jsonl 3 evals/registry/data/knot-theory/knot-theory-code-conversions.jsonl 3 evals/registry/data/skill_acquisition/miskito/knowledge_base/human_rights_miskito.jsonl 3 evals/registry/data/skill_acquisition/miskito/knowledge_base/honduras.jsonl 3 evals/registry/data/skill_acquisition/miskito/knowledge_base/miskito_people.jsonl 3 evals/registry/data/skill_acquisition/miskito/knowledge_base/mosquito.jsonl 3 evals/registry/data/skill_acquisition/miskito/knowledge_base/miskito_language.jsonl 3 evals/registry/data/skill_acquisition/miskito/knowledge_base/nicaragua.jsonl 3 evals/registry/data/skill_acquisition/miskito/knowledge_base/miskito_lessons.jsonl 3 evals/registry/data/skill_acquisition/miskito/knowledge_base/mosquito_coast.jsonl 3 evals/registry/data/skill_acquisition/miskito/qa_pairs_by_lesson.jsonl 3 evals/registry/data/skill_acquisition/miskito/variants/miskito_test_translation_fewshot.jsonl 3 evals/registry/data/skill_acquisition/miskito/variants/miskito_test_manipulation_fewshot.jsonl 3 evals/registry/data/skill_acquisition/miskito/variants/miskito_test_all.jsonl 3 evals/registry/data/skill_acquisition/miskito/variants/miskito_test_all_fewshot.jsonl 3 evals/registry/data/skill_acquisition/miskito/variants/miskito_test_manipulation.jsonl 3 evals/registry/data/skill_acquisition/miskito/variants/miskito_train_translation.jsonl 3 evals/registry/data/skill_acquisition/miskito/variants/miskito_train_manipulation.jsonl 3 evals/registry/data/skill_acquisition/miskito/variants/miskito_test_translation.jsonl 3 evals/registry/data/skill_acquisition/miskito/variants/miskito_train_all.jsonl 3 evals/registry/data/marxist_philosophy_exam/fuzzy_match.jsonl 3 evals/registry/data/math_for_5th-grader/samples.jsonl 3 evals/registry/data/coq-proof-step/match.jsonl 3 evals/registry/data/historical-kana-orthography-reading/samples.jsonl 3 evals/registry/data/belarusian_synonyms/samples.jsonl 3 evals/registry/data/irony/samples.jsonl 3 evals/registry/data/icelandic-inflection-easy/samples.jsonl 3 evals/registry/data/logiqa/logiqa.jsonl 3 evals/registry/data/imo_exact_answers/samples.jsonl 3 evals/registry/data/schelling_point/owt_5.jsonl 3 evals/registry/data/schelling_point/wikipedia_5.jsonl 3 evals/registry/data/schelling_point/mix.jsonl 3 evals/registry/data/schelling_point/random_numbers_10_3.jsonl 3 evals/registry/data/schelling_point/random_words_10.jsonl 3 evals/registry/data/belarusian_proverbs/samples.jsonl 3 evals/registry/data/japanese-station/samples.jsonl 3 evals/registry/data/lithuanian-exams-qa/samples.jsonl 3 evals/registry/data/french-exams-qa/samples.jsonl 3 evals/registry/data/newsology/samples.jsonl 3 evals/registry/data/kanji-idioms/samples.jsonl 3 evals/registry/data/text_compression/samples.jsonl 3 evals/registry/data/three-pt-mapping/three_pt_mapping.jsonl 3 evals/registry/data/romanian-logic/romanian-logic.jsonl 3 evals/registry/data/missing_operators/samples.jsonl 3 evals/registry/data/mendelian_inheritance/samples.jsonl 3 evals/registry/data/hungarian-exams-qa/samples.jsonl 3 evals/registry/data/japanese_romantic_context/samples.jsonl 3 evals/registry/data/logical_counting/samples.jsonl 3 evals/registry/data/unique_combinations/samples.jsonl 3 evals/registry/data/physics-interaction/samples.jsonl 3 evals/registry/data/rucola/few_shot.jsonl 3 evals/registry/data/rucola/samples.jsonl 3 evals/registry/data/ph_calculation/samples.jsonl 3 evals/registry/data/detect-hshd/detect-hshd.jsonl 3 evals/registry/data/japanese_number_reading/japanese_number_reading.jsonl 3 evals/registry/data/lat_long_identify/samples.jsonl 3 evals/registry/data/irish-lexicon/samples.jsonl 3 evals/registry/data/automata-and-complexity/samples.jsonl 3 evals/registry/data/seo_keywords/samples.jsonl 3 evals/registry/data/ukraine_electronic_petitions/samples.jsonl 3 evals/registry/data/partially_solved_crossword_clues/samples.jsonl 3 evals/registry/data/polish_rhymes_generation/samples.jsonl 3 evals/registry/data/numeral-type-comparisons/samples.jsonl 3 evals/registry/data/confusing_korean/samples.jsonl 3 evals/registry/data/ordered-history-events/samples.jsonl 3 evals/registry/data/allergen-information/samples.jsonl 3 evals/registry/data/GOL/samples.jsonl 3 evals/registry/data/swap-words/samples.jsonl 3 evals/registry/data/python_list_comprehension/samples.jsonl 3 evals/registry/data/arithmetic-expression/samples.jsonl 3 evals/registry/data/arithmetic-expression/labeled-samples.jsonl 3 evals/registry/data/escher_sentences/samples.jsonl 3 evals/registry/data/twenty_questions/dataset.jsonl 3 evals/registry/data/twenty_questions/lexicon_nouns.jsonl 3 evals/registry/data/invoices/match.jsonl 3 evals/registry/data/event_categories/samples.jsonl 3 evals/registry/data/gujarati_numerals/samples.jsonl 3 evals/registry/data/rectangles/samples.jsonl 3 evals/registry/data/persian-kinship-riddles/samples.jsonl 3 evals/registry/data/date-booking/samples.jsonl 3 evals/registry/data/bitwise/samples.jsonl 3 evals/registry/data/phonetics-identify-words-needing-missing-gpcs/samples.jsonl 3 evals/registry/data/euler_problems/euler_problems.jsonl 3 evals/registry/data/arabic-exams-qa/samples.jsonl 3 evals/registry/data/resistor_ohm_calculator/samples.jsonl 3 evals/registry/data/positive-binary-operations/samples.jsonl 3 evals/registry/data/positive-binary-operations/fewshot.jsonl 3 evals/registry/data/accounting_audit/samples.jsonl 3 evals/registry/data/albanian-exams-qa/samples.jsonl 3 evals/registry/data/finance/credit.jsonl 3 evals/registry/data/russe/few_shot.jsonl 3 evals/registry/data/russe/samples.jsonl 3 evals/registry/data/us_tort_law/few_shot.jsonl 3 evals/registry/data/us_tort_law/samples.jsonl 3 evals/registry/data/gregorian-to-hebrew-date/samples.jsonl 3 evals/registry/data/cybersecurity/filepaths.jsonl 3 evals/registry/data/chinese_idioms/samples.jsonl 3 evals/registry/data/latin_grammar/samples.jsonl 3 evals/registry/data/parable-to-moral-match/parable-to-moral-match-zh.jsonl 3 evals/registry/data/parable-to-moral-match/parable-to-moral-match-en.jsonl 3 evals/registry/data/reverse-sort-words-eng/samples.jsonl 3 evals/registry/data/japanese-decimal-units/samples.jsonl 3 evals/registry/data/ordering_randomised_versionlist/samples.jsonl 3 evals/registry/data/gears_rotation/samples.jsonl 3 evals/registry/data/logiqa-logical-reasoning-plus/logiqav2-logical-reasoning-plus.jsonl 3 evals/registry/data/logiqa-logical-reasoning-plus/reclor-logical-reasoning-plus.jsonl 3 evals/registry/data/logiqa-logical-reasoning-plus/logiqa-logical-reasoning-plus.jsonl 3 evals/registry/data/svg_to_text/samples.jsonl 3 evals/registry/data/quartz/few_shot.jsonl 3 evals/registry/data/quartz/samples.jsonl 3 evals/registry/data/singlestore-vectorsearch/samples.jsonl 3 evals/registry/data/function_deduction/data.jsonl 3 evals/registry/data/last_word_nth/samples.jsonl 3 evals/registry/data/Unfamiliar-Chinese-Character/samples.jsonl 3 evals/registry/data/diabetes/samples.jsonl 3 evals/registry/data/number_pattern/samples.jsonl 3 evals/registry/data/population_span_extraction/samples.jsonl 3 evals/registry/data/probabilities-word-problems/samples.jsonl 3 evals/registry/data/pantone_to_hex/samples.jsonl 3 evals/registry/data/chinese_song_ci/samples.jsonl 3 evals/registry/data/hebrew_rhyme/samples.jsonl 3 evals/registry/modelgraded/research-question-extraction.yaml 19 evals/registry/modelgraded/possible.yaml 18 evals/registry/modelgraded/best.yaml 10 evals/registry/modelgraded/fact.yaml 19 evals/registry/modelgraded/keywords.yaml 20 evals/registry/modelgraded/battle.yaml 21 evals/registry/modelgraded/singlestore.yaml 24 evals/registry/modelgraded/arithmetic-expression.yaml 24 evals/registry/modelgraded/diversity.yaml 13 evals/registry/modelgraded/onomatopoeia.yaml 30 evals/registry/modelgraded/humor.yaml 17 evals/registry/modelgraded/closedqa.yaml 21 evals/registry/modelgraded/sql.yaml 24 evals/registry/modelgraded/iambic_pentameter.yaml 14 evals/registry/modelgraded/translation.yaml 19 evals/registry/modelgraded/regression-equation.yaml 26 evals/registry/modelgraded/security.yaml 14 evals/registry/modelgraded/rhyming.yaml 13 evals/registry/eval_sets/chinese-numbers.yaml 6 evals/registry/eval_sets/schelling_point.yaml 6 evals/registry/eval_sets/mmmu.yaml 3 evals/registry/eval_sets/raven-matrices.yaml 30 evals/registry/eval_sets/exams-all.yaml 18 evals/registry/eval_sets/test-basic.yaml 6 evals/registry/eval_sets/hr-ml-agent-bench.yaml 34 evals/registry/eval_sets/test-modelgraded.yaml 14 evals/registry/eval_sets/coqa-ex.yaml 7 evals/registry/eval_sets/stock-options.yaml 14 evals/registry/eval_sets/manga-translation.yaml 5 evals/registry/eval_sets/logiqa-logical-reasoning-plus.yaml 5 evals/registry/eval_sets/css-selectors.yaml 4 evals/registry/eval_sets/ukraine-gec.yaml 20 evals/registry/eval_sets/test-all.yaml 21 evals/registry/eval_sets/word-associations.yaml 6 evals/registry/eval_sets/pointer-value-retrieval.yaml 8 evals/registry/eval_sets/mazes.yaml 8 evals/registry/completion_fns/cot.yaml 16 evals/registry/completion_fns/langchain_chains.yaml 2 evals/registry/completion_fns/langchain_llms.yaml 24 evals/registry/evals/language.yaml 15 evals/registry/evals/irrelevant-negative-diversion.yaml 3 evals/registry/evals/chinese_homonym.yaml 3 evals/registry/evals/infiniteloop-match.yaml 3 evals/registry/evals/simple_math.yaml 8 evals/registry/evals/chinese_song_ci.yaml 7 evals/registry/evals/Unfamiliar-Chinese-Character.yaml 9 evals/registry/evals/resource_id_extraction.yaml 8 evals/registry/evals/superficial-patterns.yaml 9 evals/registry/evals/actors-sequence.yaml 8 evals/registry/evals/track_the_stat.yaml 20 evals/registry/evals/reverse-sort-words-eng.yaml 8 evals/registry/evals/svg_alphabet.yaml 8 evals/registry/evals/dutch-lexicon.yaml 3 evals/registry/evals/emotional-intelligence.yaml 3 evals/registry/evals/word-association.yaml 32 evals/registry/evals/cybersecurity-filepaths.yaml 8 evals/registry/evals/product-matching.yaml 46 evals/registry/evals/islands.yaml 8 evals/registry/evals/rock-climbing.yaml 7 evals/registry/evals/tempo_to_measure_count.yaml 3 evals/registry/evals/greek-nt-manuscripts.yaml 3 evals/registry/evals/number-pattern.yaml 7 evals/registry/evals/spanish_feminine_noun_masculine_article.yaml 8 evals/registry/evals/test-modelgraded-battle.yaml 36 evals/registry/evals/code_progress.yaml 8 evals/registry/evals/belarusian-rhyme.yaml 3 evals/registry/evals/day-of-week-from-date.yaml 7 evals/registry/evals/research-question-extraction.yaml 11 evals/registry/evals/finnish-rhyme.yaml 8 evals/registry/evals/python_list_comprehension.yaml 8 evals/registry/evals/geometry_puzzle.yaml 3 evals/registry/evals/allergen-information.yaml 3 evals/registry/evals/bluff.yaml 39 evals/registry/evals/phonetics-identify-words-needing-missing-gpcs.yaml 8 evals/registry/evals/time-zone-conversion.yaml 8 evals/registry/evals/korean_date_counting.yaml 3 evals/registry/evals/polish-proverbs.yaml 2 evals/registry/evals/russe.yaml 10 evals/registry/evals/rare-and-loanwords-dutch-lexicon.yaml 3 evals/registry/evals/ner_finance.yaml 9 evals/registry/evals/belarusian-antonyms.yaml 3 evals/registry/evals/italian-rhyme.yaml 8 evals/registry/evals/indonesian_numbers.yaml 7 evals/registry/evals/russian-lexicon.yaml 3 evals/registry/evals/japanese_city_name_pronuciation.yaml 3 evals/registry/evals/finance.yaml 3 evals/registry/evals/schelling_point.yaml 66 evals/registry/evals/self_prompting.yaml 19 evals/registry/evals/gears_rotation.yaml 3 evals/registry/evals/knot-theory.yaml 21 evals/registry/evals/portuguese-syllable-count.yaml 9 evals/registry/evals/simple-charting.yaml 3 evals/registry/evals/singlestore-vectorsearch.yaml 10 evals/registry/evals/logic-grid-eval.yaml 11 evals/registry/evals/naughty_strings.yaml 36 evals/registry/evals/mandaliof-table.yaml 3 evals/registry/evals/prompt-injection.yaml 3 evals/registry/evals/numbers_game.yaml 3 evals/registry/evals/building_floorplan.yaml 8 evals/registry/evals/count_intersections_polynomial.yaml 8 evals/registry/evals/icelandic-inflection-medium.yaml 3 evals/registry/evals/numerical-cabbala-casanova.yaml 10 evals/registry/evals/adultery_state_laws.yaml 8 evals/registry/evals/chinese_ancient_masterpieces_dynasty.yaml 8 evals/registry/evals/hebrew_talmud_suka.yaml 8 evals/registry/evals/gpt-protocol-buffers.yaml 7 evals/registry/evals/skill_acquisition.yaml 94 evals/registry/evals/nfl-point-combinations.yaml 3 evals/registry/evals/bulgarian-lexicon.yaml 3 evals/registry/evals/human-safety.yaml 8 evals/registry/evals/mmlu.yaml 399 evals/registry/evals/twenty_questions.yaml 54 evals/registry/evals/isosceles-right-triangle.yaml 7 evals/registry/evals/belarusian-grammar.yaml 3 evals/registry/evals/abstract-causal-reasoning.yaml 16 evals/registry/evals/corr2cause.yaml 3 evals/registry/evals/anagrams.yaml 10 evals/registry/evals/3d_object_manipulation.yaml 8 evals/registry/evals/finance_calc.yaml 8 evals/registry/evals/chinese-lantern-riddles.yaml 3 evals/registry/evals/unsolvable_questions.yaml 7 evals/registry/evals/accounting_audit.yaml 7 evals/registry/evals/utility_price_parsing.yaml 7 evals/registry/evals/resistor-ohm-calculator.yaml 3 evals/registry/evals/partially_solved_crossword_clues.yaml 7 evals/registry/evals/arithmetical_puzzles.yaml 3 evals/registry/evals/russian_sarcasm.yaml 8 evals/registry/evals/word_vector_over_reliance.yaml 9 evals/registry/evals/directions.yaml 8 evals/registry/evals/pure_korean.yaml 8 evals/registry/evals/unified-patch.yaml 7 evals/registry/evals/coq-editing.yaml 3 evals/registry/evals/diagrammatic_logic.yaml 7 evals/registry/evals/marxist_philosophy_exam.yaml 3 evals/registry/evals/swedish-spelling.yaml 3 evals/registry/evals/mmmu.yaml 390 evals/registry/evals/cant_do_that_anymore.yaml 20 evals/registry/evals/utah_real_estate.yaml 8 evals/registry/evals/dhammapada-reference.yaml 8 evals/registry/evals/brazilian_laws.yaml 8 evals/registry/evals/korean-consonant-vowel-combination.yaml 3 evals/registry/evals/decrypt-caesar-cipher.yaml 7 evals/registry/evals/nepali-song-singer.yaml 3 evals/registry/evals/find-letter.yaml 7 evals/registry/evals/chinese_tang_poetries.yaml 4 evals/registry/evals/reasoning_with_contradictory_statements.yaml 9 evals/registry/evals/belarusian-proverbs.yaml 2 evals/registry/evals/qa.yaml 3 evals/registry/evals/points-on-line.yaml 3 evals/registry/evals/russian-english-homonym-context-resolution.yaml 7 evals/registry/evals/korean_foreign_words.yaml 8 evals/registry/evals/stats-tests.yaml 7 evals/registry/evals/portuguese-sarcasm.yaml 8 evals/registry/evals/parable-to-moral-match.yaml 14 evals/registry/evals/physics-interaction.yaml 3 evals/registry/evals/portuguese-kinship-riddles.yaml 8 evals/registry/evals/chinese_chu_ci.yaml 8 evals/registry/evals/override-system-instruction.yaml 7 evals/registry/evals/comprehensive-graph-reasoning.yaml 3 evals/registry/evals/raven-matrices.yaml 224 evals/registry/evals/quartz.yaml 10 evals/registry/evals/norwegian-lexicon.yaml 3 evals/registry/evals/test-comp-sci.yaml 9 evals/registry/evals/logic-container.yaml 10 evals/registry/evals/iqbal-poetry-translation.yaml 24 evals/registry/evals/forth-stack-sim.yaml 19 evals/registry/evals/identifying_variables.yaml 123 evals/registry/evals/korean-phonetics.yaml 8 evals/registry/evals/icelandic-inflection-easy.yaml 3 evals/registry/evals/swedish_sat.yaml 3 evals/registry/evals/hebrew-homophones.yaml 8 evals/registry/evals/medmcqa.yaml 7 evals/registry/evals/belarusian-syllable-count.yaml 3 evals/registry/evals/poker_analysis.yaml 3 evals/registry/evals/regex-match.yaml 7 evals/registry/evals/japanese_driving_license.yaml 3 evals/registry/evals/music-theory-chord-notes.yaml 3 evals/registry/evals/linear-regression.yaml 21 evals/registry/evals/map-electronic-component-part-to-fact.yaml 7 evals/registry/evals/pantone_to_hex.yaml 7 evals/registry/evals/number-reading.yaml 3 evals/registry/evals/sexagenary-cycle-calculation.yaml 7 evals/registry/evals/taxes.yaml 7 evals/registry/evals/tricky-word-problems.yaml 3 evals/registry/evals/simple-block-puzzles.yaml 3 evals/registry/evals/syllables_long_words.yaml 7 evals/registry/evals/invoice_due_date_leap_day_adjustment.yaml 7 evals/registry/evals/gregorian-to-hebrew-date.yaml 9 evals/registry/evals/multistep-word-problems.yaml 3 evals/registry/evals/south-african-bands.yaml 3 evals/registry/evals/italian_big_math_expression.yaml 3 evals/registry/evals/monthly_metric_comparison.yaml 8 evals/registry/evals/cricket_situations.yaml 8 evals/registry/evals/greek-vocabulary.yaml 7 evals/registry/evals/shopping_discount_comparison.yaml 3 evals/registry/evals/japanese_populer_video_game_title_and_the_publisher.yaml 3 evals/registry/evals/internal_representations.yaml 7 evals/registry/evals/latin_grammar.yaml 3 evals/registry/evals/cube-pack.yaml 7 evals/registry/evals/korean-postposition.yaml 8 evals/registry/evals/ordering_randomised_versionlist.yaml 8 evals/registry/evals/moral_exceptQA.yaml 8 evals/registry/evals/illinois-law.yaml 3 evals/registry/evals/complex-analogies-en-ru.yaml 7 evals/registry/evals/connect-4.yaml 7 evals/registry/evals/pattern_identification.yaml 7 evals/registry/evals/hindi_words.yaml 7 evals/registry/evals/thirty_six_stratagems.yaml 3 evals/registry/evals/hebrew-rhyme.yaml 8 evals/registry/evals/proofreader.yaml 8 evals/registry/evals/logic-liar-paradox.yaml 3 evals/registry/evals/convert-hex-hsl-lightness.yaml 8 evals/registry/evals/food.yaml 7 evals/registry/evals/sort-numeric.yaml 8 evals/registry/evals/simple-knowledge-mongolian.yaml 3 evals/registry/evals/population_span_extraction.yaml 10 evals/registry/evals/ral_to_hex.yaml 7 evals/registry/evals/ru_rhymes.yaml 11 evals/registry/evals/norwegian-rhymes.yaml 8 evals/registry/evals/aba-mrpc-true-false.yaml 7 evals/registry/evals/persian-kinship-riddles.yaml 8 evals/registry/evals/unwanted-rhyming.yaml 9 evals/registry/evals/invert_word_wise.yaml 8 evals/registry/evals/vigenere.yaml 3 evals/registry/evals/code_combination.yaml 3 evals/registry/evals/dice-rotation-sequence.yaml 3 evals/registry/evals/next-val-series.yaml 3 evals/registry/evals/test-basic.yaml 32 evals/registry/evals/guess-the-singer.yaml 2 evals/registry/evals/italian-new-words.yaml 3 evals/registry/evals/atpl_exams.yaml 7 evals/registry/evals/steganography.yaml 3 evals/registry/evals/alternate-numeral-systems.yaml 3 evals/registry/evals/music-theory-chord-names.yaml 3 evals/registry/evals/logical-black-scholes.yaml 8 evals/registry/evals/wkt_understanding.yaml 8 evals/registry/evals/passing-balls.yaml 3 evals/registry/evals/spanish-lexicon.yaml 3 evals/registry/evals/tokyo-station-number.yaml 8 evals/registry/evals/hr-ml-agent-bench.yaml 137 evals/registry/evals/japanese_romantic_context.yaml 7 evals/registry/evals/ukraine-eit.yaml 7 evals/registry/evals/convert_bwt_num_and_chinese_num.yaml 32 evals/registry/evals/irish-lexicon.yaml 3 evals/registry/evals/reverse-polish-notation.yaml 3 evals/registry/evals/gujarati_numerals.yaml 3 evals/registry/evals/test-modelgraded.yaml 81 evals/registry/evals/imperial_date_to_string.yaml 7 evals/registry/evals/cardinal-directions.yaml 7 evals/registry/evals/belarusian-synonyms.yaml 3 evals/registry/evals/automata-and-complexity.yaml 3 evals/registry/evals/astro_eval.yaml 8 evals/registry/evals/belarusian-lexicon.yaml 3 evals/registry/evals/japanese_onomatopoeia.yaml 9 evals/registry/evals/coqa-ex.yaml 55 evals/registry/evals/syntax-check.yaml 3 evals/registry/evals/test-modelgraded-generated.yaml 9 evals/registry/evals/seo_keywords.yaml 3 evals/registry/evals/ascii-digit-recognition.yaml 8 evals/registry/evals/ab.yaml 3 evals/registry/evals/chinese_hard_translations.yaml 4 evals/registry/evals/theory_of_mind.yaml 48 evals/registry/evals/body-movement.yaml 3 evals/registry/evals/german-part-of-speech.yaml 3 evals/registry/evals/exams.yaml 112 evals/registry/evals/singapore_data_protection_decisions.yaml 8 evals/registry/evals/soc_codes.yaml 10 evals/registry/evals/swap-words.yaml 7 evals/registry/evals/security_guide.yaml 10 evals/registry/evals/jee-math.yaml 7 evals/registry/evals/multistep_web_tasks.yaml 60 evals/registry/evals/mendelian_inheritance.yaml 7 evals/registry/evals/iambic-pentameter.yaml 10 evals/registry/evals/solve-for-variable.yaml 8 evals/registry/evals/logic.yaml 9 evals/registry/evals/banking77.yaml 8 evals/registry/evals/shared-borders.yaml 8 evals/registry/evals/already_said_that.yaml 45 evals/registry/evals/finger-tracking.yaml 8 evals/registry/evals/invoices.yaml 3 evals/registry/evals/korean-honorific.yaml 8 evals/registry/evals/newsology.yaml 8 evals/registry/evals/find_country_from_svg.yaml 3 evals/registry/evals/asl-classifiers.yaml 3 evals/registry/evals/us-tort-law.yaml 11 evals/registry/evals/hebrew_grammar.yaml 8 evals/registry/evals/bias_detection.yaml 3 evals/registry/evals/complex-replace-characters.yaml 8 evals/registry/evals/bigrams.yaml 7 evals/registry/evals/balance-chemical-equation.yaml 7 evals/registry/evals/sarcasm.yaml 10 evals/registry/evals/job_listing_title_for_a_caregiver_in_japan.yaml 9 evals/registry/evals/squares-gpt.yaml 4 evals/registry/evals/arithmetic-expression.yaml 21 evals/registry/evals/gol.yaml 3 evals/registry/evals/chinese_ancient_poetry.yaml 3 evals/registry/evals/japanese-itpassport-exam01.yaml 8 evals/registry/evals/polish-lexicon.yaml 3 evals/registry/evals/music_theory_scale_modes.yaml 3 evals/registry/evals/blackfoot-numerals-modern.yaml 3 evals/registry/evals/lunar-calendar.yaml 20 evals/registry/evals/icelandic-sentences-gec.yaml 3 evals/registry/evals/nepali-numerals.yaml 3 evals/registry/evals/crepe.yaml 7 evals/registry/evals/math_equations.yaml 3 evals/registry/evals/dutch-rhymes.yaml 8 evals/registry/evals/determinant.yaml 7 evals/registry/evals/urdu-transliteration.yaml 3 evals/registry/evals/compare-countries-area.yaml 3 evals/registry/evals/chess-piece-count.yaml 3 evals/registry/evals/algebra-word-problems.yaml 9 evals/registry/evals/japanese-national-medical-exam02.yaml 7 evals/registry/evals/sindarin-fluency.yaml 3 evals/registry/evals/relative-orientations.yaml 7 evals/registry/evals/korean_dialects.yaml 3 evals/registry/evals/japanese-decimal-units.yaml 8 evals/registry/evals/interlingual-homograph.yaml 3 evals/registry/evals/chinese_homophonic.yaml 9 evals/registry/evals/math-derivatives.yaml 3 evals/registry/evals/canto_wu_pronunciation_fewshot.yaml 3 evals/registry/evals/ascii-wordart.yaml 3 evals/registry/evals/chinese_zodiac.yaml 7 evals/registry/evals/math_for_5th-grader.yaml 3 evals/registry/evals/rot13.yaml 3 evals/registry/evals/stock-options.yaml 96 evals/registry/evals/math_logic_operations.yaml 3 evals/registry/evals/ballots.yaml 34 evals/registry/evals/error_recovery.yaml 31 evals/registry/evals/logic-riddles.yaml 9 evals/registry/evals/japanese_approval.yaml 8 evals/registry/evals/Chinese_character_riddles.yaml 3 evals/registry/evals/sandbagging.yaml 54 evals/registry/evals/music-theory.yaml 20 evals/registry/evals/ambiguous-sentences.yaml 8 evals/registry/evals/hard_russian_computer_science_tasks.yaml 10 evals/registry/evals/loss-logic.yaml 9 evals/registry/evals/rubiks-colors.yaml 8 evals/registry/evals/premature-conclusions.yaml 3 evals/registry/evals/counterfactual-reasoning.yaml 8 evals/registry/evals/arc.yaml 7 evals/registry/evals/irish-plural-nouns.yaml 3 evals/registry/evals/french-lexicon.yaml 3 evals/registry/evals/missing-operators.yaml 9 evals/registry/evals/incontext_rl.yaml 53 evals/registry/evals/context-free-grammar.yaml 8 evals/registry/evals/bugged_tools.yaml 28 evals/registry/evals/first-letters.yaml 7 evals/registry/evals/financial-derivatives.yaml 8 evals/registry/evals/afrikaans-lexicon.yaml 3 evals/registry/evals/diabetes.yaml 7 evals/registry/evals/coq-proof-step.yaml 4 evals/registry/evals/function-deduction.yaml 4 evals/registry/evals/simple_physics_engine.yaml 3 evals/registry/evals/historical-kana-orthography-reading.yaml 3 evals/registry/evals/romanian-logic.yaml 3 evals/registry/evals/event-categories.yaml 10 evals/registry/evals/find-thirukkural.yaml 8 evals/registry/evals/pararule-plus-multi-step-deductive-reasoning.yaml 8 evals/registry/evals/belarusian-orthography.yaml 3 evals/registry/evals/date-booking.yaml 7 evals/registry/evals/game-theory.yaml 9 evals/registry/evals/brazilian-lexicon.yaml 3 evals/registry/evals/direct-speech-tag.yaml 7 evals/registry/evals/russian_medical.yaml 7 evals/registry/evals/manga-translation.yaml 21 evals/registry/evals/vintage_phone_keyboard_decode.yaml 8 evals/registry/evals/benjaminmoore_to_hex.yaml 7 evals/registry/evals/mate-in-one.yaml 8 evals/registry/evals/logiqa-logical-reasoning-plus.yaml 24 evals/registry/evals/consensus_summary.yaml 3 evals/registry/evals/numeral-type-comparisons.yaml 3 evals/registry/evals/confusing_korean.yaml 3 evals/registry/evals/positive-binary-operations.yaml 10 evals/registry/evals/GPT-model-text-detection.yaml 4 evals/registry/evals/russian-rhyme.yaml 8 evals/registry/evals/russian-verse.yaml 8 evals/registry/evals/korean_spaces.yaml 3 evals/registry/evals/chinese_poem.yaml 7 evals/registry/evals/belarusian-russian-translation.yaml 3 evals/registry/evals/svg_understanding.yaml 8 evals/registry/evals/chinese_modern_poem_identification.yaml 7 evals/registry/evals/probability_questions.yaml 3 evals/registry/evals/logic_and_probability.yaml 8 evals/registry/evals/chinese_shi_jing.yaml 7 evals/registry/evals/css-selectors.yaml 3 evals/registry/evals/which-is-heavier.yaml 3 evals/registry/evals/hebrew-same-noun-gender.yaml 8 evals/registry/evals/formal_logic.yaml 3 evals/registry/evals/viewport_to_grid_size.yaml 8 evals/registry/evals/french_homonym_and_homograph.yaml 8 evals/registry/evals/osm_mapping_one_way.yaml 3 evals/registry/evals/russian-nlp-tasks.yaml 7 evals/registry/evals/csharp-linq.yaml 8 evals/registry/evals/ukraine-gec.yaml 180 evals/registry/evals/list_comparison_missing_name.yaml 3 evals/registry/evals/last-word-nth.yaml 3 evals/registry/evals/hebrew_plurals.yaml 8 evals/registry/evals/belarusian-numerals.yaml 3 evals/registry/evals/sql.yaml 5 evals/registry/evals/heart-disease.yaml 3 evals/registry/evals/chess.yaml 3 evals/registry/evals/urdu-lexicon.yaml 3 evals/registry/evals/simple-visual-understanding.yaml 3 evals/registry/evals/json_patch_object.yaml 3 evals/registry/evals/chinese_famous_novel.yaml 8 evals/registry/evals/tetris.yaml 8 evals/registry/evals/multi-step-equations.yaml 7 evals/registry/evals/formal-grammar-to-regex.yaml 7 evals/registry/evals/make-me-say.yaml 25 evals/registry/evals/mapping_to_matricies.yaml 7 evals/registry/evals/japanese_prime_minister.yaml 7 evals/registry/evals/logic-statements.yaml 7 evals/registry/evals/count_token_freq_dna.yaml 3 evals/registry/evals/euler_problems.yaml 10 evals/registry/evals/hindi_upsc.yaml 7 evals/registry/evals/fcc_amateur_extra.yaml 9 evals/registry/evals/countries.yaml 7 evals/registry/evals/co-sql.yaml 4 evals/registry/evals/rucola.yaml 10 evals/registry/evals/medication_dose.yaml 3 evals/registry/evals/recurrence-relation.yaml 8 evals/registry/evals/chinese_idioms.yaml 3 evals/registry/evals/emoji-riddle.yaml 3 evals/registry/evals/polish_rhymes_generation.yaml 8 evals/registry/evals/kanji-idioms.yaml 3 evals/registry/evals/seating_arrangements.yaml 3 evals/registry/evals/belarusian-word-analogy-inflection.yaml 3 evals/registry/evals/bitwise.yaml 3 evals/registry/evals/product-ie.yaml 28 evals/registry/evals/hindi_shuddha.yaml 7 evals/registry/evals/abstract2title.yaml 4 evals/registry/evals/reverse-string.yaml 3 evals/registry/evals/smiles_to_formula.yaml 7 evals/registry/evals/canto_wu_pronunciation.yaml 3 evals/registry/evals/matrix-mult-rows.yaml 3 evals/registry/evals/ph_calculation.yaml 3 evals/registry/evals/born-first.yaml 3 evals/registry/evals/color_theory_complementary.yaml 3 evals/registry/evals/ukraine_electronic_petitions.yaml 9 evals/registry/evals/three-pt-mapping.yaml 3 evals/registry/evals/imo_exact_answers.yaml 8 evals/registry/evals/icelandic-inflection-hard.yaml 3 evals/registry/evals/date-calculator.yaml 7 evals/registry/evals/path_enclosed_area.yaml 8 evals/registry/evals/european-date-format-challenge.yaml 3 evals/registry/evals/nutrition.yaml 3 evals/registry/evals/korean_spelling.yaml 7 evals/registry/evals/cissp-study-questions.yaml 10 evals/registry/evals/2d_movement.yaml 3 evals/registry/evals/categorize_with_distractors.yaml 3 evals/registry/evals/poker_hand_ranks.yaml 7 evals/registry/evals/crontab.yaml 7 evals/registry/evals/lat_long_identify.yaml 7 evals/registry/evals/polish-syllable-count.yaml 7 evals/registry/evals/rectangles.yaml 7 evals/registry/evals/math_polish.yaml 3 evals/registry/evals/svg_to_text.yaml 8 evals/registry/evals/korean_yaminjeongeum.yaml 8 evals/registry/evals/unique_combinations.yaml 8 evals/registry/evals/largest_country.yaml 8 evals/registry/evals/japanese-station.yaml 7 evals/registry/evals/japanese-national-medical-exam01.yaml 7 evals/registry/evals/probabilities-word-problems.yaml 3 evals/registry/evals/romanian_homonyms.yaml 3 evals/registry/evals/3d_globe_movement.yaml 3 evals/registry/evals/japanese_mahjong_discard_tile.yaml 3 evals/registry/evals/escher-sentences.yaml 7 evals/registry/evals/irony.yaml 8 evals/registry/evals/french-part-of-speech.yaml 3 evals/registry/evals/rhetorical-devices.yaml 3 evals/registry/evals/turkish_characters.yaml 8 evals/registry/evals/reverse-shell.yaml 3 evals/registry/evals/logical_counting.yaml 7 evals/registry/evals/make-me-pay.yaml 59 evals/registry/evals/logiqa.yaml 7 evals/registry/evals/text_compression.yaml 3 evals/registry/evals/beam-analysis.yaml 3 evals/registry/evals/backgammon.yaml 16 evals/registry/evals/tracking-shuffled-objects.yaml 7 evals/registry/evals/chinese-remainder-theorem.yaml 3 evals/registry/evals/dna-melting-calculation.yaml 3 evals/registry/evals/hebrew-bible.yaml 8 evals/registry/evals/linear-equations.yaml 7 evals/registry/evals/base64-decode.yaml 3 evals/registry/evals/pointer-value-retrieval.yaml 48 evals/registry/evals/split_chinese_characters.yaml 7 evals/registry/evals/integer-sequence-predictions.yaml 28 evals/registry/evals/arabic-literature-qa.yaml 7 evals/registry/evals/detect-hshd.yaml 7 evals/registry/evals/aime_evaluation.yaml 3 evals/registry/evals/korean_romanization.yaml 3 evals/registry/evals/shape-in-shape.yaml 3 evals/registry/evals/mazes.yaml 48 evals/registry/evals/ordered-history-events.yaml 7 evals/registry/evals/track_objects.yaml 3 evals/registry/evals/japanese_number_reading.yaml 3 evals/formatting.py 14 evals/cli/oaievalset.py 111 evals/cli/oaieval.py 253 evals/metrics.py 52 evals/prompt/base.py 64 evals/eval.py 170 evals/completion_fns/langchain_math.py 21 evals/completion_fns/retrieval.py 68 evals/completion_fns/cot.py 47 evals/completion_fns/__init__.py 1 evals/completion_fns/solver_completion_fn.py 47 evals/completion_fns/openai.py 147 evals/completion_fns/langchain_llm.py 70 evals/record.py 450 evals/api.py 61