evals/elsuite/multistep_web_tasks/webarena/eval_run.py (8 lines):
  - line 118: # TODO: maybe make this config editable or put inside generate trajectory
  - line 142: # TODO: make some kind of FailedAction so we can ask for a retry
  - line 143: # TODO: early stop when too many action parsing errors occur
  - line 175: # TODO: clean this up somehow, so I don't have to check
  - line 259: """TODO: add more features to this, such as creating a render
  - line 270: """TODO: move this and constituent functions to separate file/dir"""
  - line 271: # TODO: change to match-case statement in Python 3.10
  - line 337: # TODO: implement parsing failure early stopping

evals/elsuite/multistep_web_tasks/webarena/browser_env/basic_browser_env.py (7 lines):
  - line 62: # TODO: make Space[Action] = ActionSpace
  - line 92: """Possible TODO: move the setup logic from the API to this function so
  - line 97: # TODO: work out if registering/attaching the container should happen inside `session.setup_container`
  - line 105: # TODO: work out if this should happen in PageForwarder.__init__ or here or .setup()
  - line 152: # TODO: work out if we should be resetting through to the Gym base class
  - line 154: # TODO: clean up the container and reuse it rather than tearing down and making a new one
  - line 185: raise NotImplementedError("TODO: traces with flask-playwright api")

evals/elsuite/multistep_web_tasks/session.py (6 lines):
  - line 47: # TODO: maybe clean up/parallelise this
  - line 96: TODO: Work out if the network should be handled some other way"""
  - line 198: # TODO: work out if there's a more flexible way to wait for redis to be running rather than sleeping 5s
  - line 229: # TODO: work out if there's a more flexible way to wait for redis to be running
  - line 431: # TODO: work out if this can/should be cleaned up
  - line 445: TODO: maybe allow some to stay, esp. if they're stateless?"""

evals/elsuite/multistep_web_tasks/webarena/evaluation_harness/evaluators.py (5 lines):
  - line 232: time.sleep(3) # TODO [shuyanzh]: fix this hard-coded sleep
  - line 276: # TODO: work out why it's done this way
  - line 307: # TODO: add 'eval' and maybe others to the experiment_config base class
  - line 327: # TODO: add 'eval' and maybe others to the experiment_config base class
  - line 343: # TODO: add 'eval' and maybe others to the experiment_config base class

evals/elsuite/hr_ml_agent_bench/benchmarks/imdb/env/train.py (3 lines):
  - line 8: # TODO: pre-process data
  - line 10: model: torch.nn.Module = None # TODO: define model here
  - line 12: # TODO: train model

evals/elsuite/incontext_rl/eval.py (3 lines):
  - line 77: action_space_n=env.action_space.n, # TODO might not be available for all envs, check when adding a continuous env
  - line 78: observation_space_n=env.observation_space.n, # TODO might not be available for all envs, check when adding a continuous env
  - line 229: ): # TODO this might not work for non-discrete action spaces, check with more complex env

evals/elsuite/multistep_web_tasks/webarena/browser_env/processors.py (3 lines):
  - line 504: # TODO: support multiple tabs, e.g. something like:
  - line 574: raise NotImplementedError("TODO: Images with flask-playwright api")
  - line 605: # TODO: stop hardcoding AccTree here
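The two session.py entries above (lines 198 and 229) both flag the fixed 5-second sleep used to wait for redis. A more flexible alternative is to poll the server until PING succeeds. Below is a minimal sketch using the redis-py client; the host, port, timeout, and function name are placeholders, not values taken from session.py:

```python
import time

import redis  # redis-py client, assumed to be available


def wait_for_redis(host: str = "localhost", port: int = 6379, timeout: float = 30.0) -> None:
    """Poll the server until PING succeeds instead of sleeping a fixed 5s."""
    client = redis.Redis(host=host, port=port, socket_connect_timeout=1)
    deadline = time.monotonic() + timeout
    while True:
        try:
            if client.ping():  # True once the server accepts commands
                return
        except redis.exceptions.ConnectionError:
            pass  # server not up yet; retry below
        if time.monotonic() > deadline:
            raise TimeoutError(f"redis at {host}:{port} not ready after {timeout}s")
        time.sleep(0.25)  # short poll interval instead of one long sleep
```

This returns as soon as the server is up (often well under 5s) and fails loudly if it never comes up, instead of continuing on a stale assumption.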
evals/elsuite/multistep_web_tasks/webarena/core/env.py (3 lines):
  - line 15: # TODO: work out if these should be implemented differently -
  - line 57: TODO: work out if truncated and info are strictly necessary"""
  - line 79: """TODO: improve the way this string is built"""

evals/solvers/providers/openai/openai_assistants_solver.py (2 lines):
  - line 161: # TODO: Handle content.text.annotations ?
  - line 169: # TODO: The Assistant also reports Run Steps which detail logs for tool use

evals/elsuite/multistep_web_tasks/webarena/bash_env/basic_bash_env.py (2 lines):
  - line 46: # TODO: work out if there's a better way to wait on the container
  - line 153: # TODO: work out if registering/attaching the container should happen inside `session.setup_container`

evals/solvers/providers/anthropic/anthropic_solver.py (2 lines):
  - line 51: # TODO: handle context length limit; possible once anthropic tokenizer is available
  - line 137: # TODO: make this format of dict a dataclass type to be reused througout lib?

evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/env/train.py (2 lines):
  - line 103: ## TODO: data cleaning and feature engineering
  - line 176: # TODO - make your predictions here by modifying 'rating' sample_submission dataframe

evals/elsuite/skill_acquisition/scraping/scrape_miskito.py (2 lines):
  - line 9: # TODO: make sure italicised text is crawled properly and that hints are excluded from answers.
  - line 10: # TODO: Split any multi-part questions into individual questions.

evals/solvers/solver.py (2 lines):
  - line 140: # TODO: Should we allow nested solvers to (also) take Solver classes instead of SolverSpecs?
  - line 194: solver_copy = deepcopy(self) # TODO: We should deepcopy without copying the cache

evals/elsuite/skill_acquisition/eval.py (2 lines):
  - line 206: # TODO: decide which metric to report – propagated standard deviation
  - line 270: # TODO: more general file format.

evals/solvers/providers/google/gemini_solver.py (2 lines):
  - line 44: # TODO: Could we just use google's own types?
  - line 136: # TODO: Why does this error ever occur and how can we handle it better?

evals/elsuite/multistep_web_tasks/webarena/bash_browser_env/bash_browser_env.py (2 lines):
  - line 73: TODO: work out what observation to return
  - line 102: TODO: work out if it matters that stop actions will always be parsed

evals/elsuite/multistep_web_tasks/webarena/core/utils.py (2 lines):
  - line 101: """TODO: add the hardcoded args to from_dict if we want to change them"""
  - line 118: """TODO: add the hardcoded args to to_dict if we want to record them"""

evals/elsuite/multistep_web_tasks/docker/flask-playwright/app.py (2 lines):
  - line 29: # TODO: pass this instead of hardcoding it
  - line 55: storage_state=None, # TODO: pass this if needed (how to handle auth?)

evals/registry/solvers/error_recovery.yaml (1 line):
  - line 1: # TODO: use default solvers once they are versioned

evals/elsuite/hr_ml_agent_bench/high_level_actions.py (1 line):
  - line 76: # TODO: handle long file editing

evals/elsuite/error_recovery/scripts/dataset_creation.py (1 line):
  - line 90: TODO (ian): think about renaming

evals/elsuite/function_deduction/prompts.py (1 line):
  - line 25: # TODO: Include in the task description whether we're in the easy or hard mode?
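For the solver.py entry at line 194 (`solver_copy = deepcopy(self)` currently copies the cache along with everything else), Python's `__deepcopy__` hook lets a class copy every field except its cache. A sketch against a hypothetical `CachingSolver`, not the actual evals `Solver` class:

```python
import copy


class CachingSolver:
    """Illustrative stand-in for a solver that carries a large cache."""

    def __init__(self) -> None:
        self._cache: dict = {}  # potentially large; copies shouldn't duplicate it

    def __deepcopy__(self, memo: dict) -> "CachingSolver":
        cls = self.__class__
        new = cls.__new__(cls)
        memo[id(self)] = new  # register early to handle reference cycles
        for name, value in self.__dict__.items():
            if name == "_cache":
                new._cache = self._cache  # share the cache instead of deep-copying it
            else:
                setattr(new, name, copy.deepcopy(value, memo))
        return new
```

Sharing `self._cache` keeps cached results visible to the copy; assigning `{}` on that branch would instead give each copy a fresh, empty cache.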
evals/record.py (1 line):
  - line 501: # TODO: model_name -> completion_fns

evals/elsuite/multistep_web_tasks/webarena/browser_env/actions.py (1 line):
  - line 354: ]: # TODO: can be further optimized

evals/elsuite/multistep_web_tasks/solvers/strong_solver/strong_solver.py (1 line):
  - line 91: # TODO: use as many previous observations as will fit in the context, rather than just 3

pyproject.toml (1 line):
  - line 69: # TODO: This should be brought down in the future

evals/elsuite/function_deduction/solvers.py (1 line):
  - line 94: # TODO: Once we have solvers with proper built-in support for output postprocessors,

evals/elsuite/incontext_rl/baselines.py (1 line):
  - line 83: # TODO these might not be true if environment is not discrete

evals/elsuite/function_deduction/eval.py (1 line):
  - line 251: # TODO: add more complexity-related metrics, such as correlation or linear regression coefficient.

evals/elsuite/error_recovery/scripts/make_plots.py (1 line):
  - line 123: # TODO: work out how to order a variable set of models

evals/elsuite/twenty_questions/eval.py (1 line):
  - line 177: # TODO: Maybe make the guesser retry here?

evals/elsuite/error_recovery/eval.py (1 line):
  - line 202: TODO (ian): Work out whether to add mistake notification to 'no reasoning' baseline

evals/solvers/nested/fewshot_solver.py (1 line):
  - line 12: train_jsonl: str, # TODO: move this to be handled eval-side

evals/elsuite/multistep_web_tasks/webarena/core/playwright_api.py (1 line):
  - line 30: # TODO: find a way to dynamically wait

evals/registry/data/hr_ml_agent_bench/parkinsons_disease/dataset/public_timeseries_testing_util.py (1 line):
  - line 25: # TODO: uncomment and fill in the following three variables

evals/solvers/nested/cot_solver.py (1 line):
  - line 17: private_interaction_length: int = 3, # TODO: do this better

evals/elsuite/self_prompting/eval.py (1 line):
  - line 142: ): # TODO: Ideally we would check that the tasks are the same

evals/registry/solvers/skill_acquisition.yaml (1 line):
  - line 133: # TODO: refactor few-shot solver so that train_jsonl is not parameterised here to reduce verbosity.

evals/registry/data/backgammon/generate_samples.ipynb (1 line):
  - line 897: " # TODO: check how to phrase this\n",

evals/elsuite/skill_acquisition/scripts/make_plots.py (1 line):
  - line 143: # TODO: report directly as 'average_correct_calls' in future and remove this rename.
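The strong_solver.py entry (line 91) wants to include as many previous observations as fit in the context window, rather than a fixed 3. A greedy newest-first pass over the history under a token budget is one common approach; this sketch assumes the tiktoken tokenizer, and the function name, signature, and budget handling are invented for illustration:

```python
import tiktoken


def fit_observations(observations: list[str], budget_tokens: int, model: str = "gpt-4") -> list[str]:
    """Keep the longest recent suffix of `observations` within a token budget."""
    enc = tiktoken.encoding_for_model(model)
    kept: list[str] = []
    used = 0
    for obs in reversed(observations):  # walk newest-first
        n = len(enc.encode(obs))
        if used + n > budget_tokens:
            break  # everything older would only push us further over budget
        kept.append(obs)
        used += n
    return list(reversed(kept))  # restore chronological order
```

For example, `fit_observations(history, budget_tokens=4000)` returns the most recent observations whose encodings total at most 4000 tokens, degrading gracefully to fewer (or zero) when individual observations are very long.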