evals/elsuite/multistep_web_tasks/webarena/eval_run.py (8 lines):
  - line 118: # TODO: maybe make this config editable or put inside generate trajectory
  - line 142: # TODO: make some kind of FailedAction so we can ask for a retry
  - line 143: # TODO: early stop when too many action parsing errors occur
  - line 175: # TODO: clean this up somehow, so I don't have to check
  - line 259: """TODO: add more features to this, such as creating a render
  - line 270: """TODO: move this and constituent functions to separate file/dir"""
  - line 271: # TODO: change to match-case statement in Python 3.10
  - line 337: # TODO: implement parsing failure early stopping

evals/elsuite/multistep_web_tasks/webarena/browser_env/basic_browser_env.py (7 lines):
  - line 62: # TODO: make Space[Action] = ActionSpace
  - line 92: """Possible TODO: move the setup logic from the API to this function so
  - line 97: # TODO: work out if registering/attaching the container should happen inside `session.setup_container`
  - line 105: # TODO: work out if this should happen in PageForwarder.__init__ or here or .setup()
  - line 152: # TODO: work out if we should be resetting through to the Gym base class
  - line 154: # TODO: clean up the container and reuse it rather than tearing down and making a new one
  - line 185: raise NotImplementedError("TODO: traces with flask-playwright api")

evals/elsuite/multistep_web_tasks/session.py (6 lines):
  - line 47: # TODO: maybe clean up/parallelise this
  - line 96: TODO: Work out if the network should be handled some other way"""
  - line 198: # TODO: work out if there's a more flexible way to wait for redis to be running rather than sleeping 5s
  - line 229: # TODO: work out if there's a more flexible way to wait for redis to be running
  - line 431: # TODO: work out if this can/should be cleaned up
  - line 445: TODO: maybe allow some to stay, esp. if they're stateless?"""

evals/elsuite/multistep_web_tasks/webarena/evaluation_harness/evaluators.py (5 lines):
  - line 232: time.sleep(3) # TODO [shuyanzh]: fix this hard-coded sleep
  - line 276: # TODO: work out why it's done this way
  - line 307: # TODO: add 'eval' and maybe others to the experiment_config base class
  - line 327: # TODO: add 'eval' and maybe others to the experiment_config base class
  - line 343: # TODO: add 'eval' and maybe others to the experiment_config base class

evals/elsuite/hr_ml_agent_bench/benchmarks/imdb/env/train.py (3 lines):
  - line 8: # TODO: pre-process data
  - line 10: model: torch.nn.Module = None # TODO: define model here
  - line 12: # TODO: train model

evals/elsuite/incontext_rl/eval.py (3 lines):
  - line 77: action_space_n=env.action_space.n, # TODO might not be available for all envs, check when adding a continuous env
  - line 78: observation_space_n=env.observation_space.n, # TODO might not be available for all envs, check when adding a continuous env
  - line 229: ): # TODO this might not work for non-discrete action spaces, check with more complex env

evals/elsuite/multistep_web_tasks/webarena/browser_env/processors.py (3 lines):
  - line 504: # TODO: support multiple tabs, e.g. something like:
  - line 574: raise NotImplementedError("TODO: Images with flask-playwright api")
  - line 605: # TODO: stop hardcoding AccTree here
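The two session.py entries above (lines 198 and 229) both flag the fixed 5-second sleep used to wait for redis. A more flexible alternative is to poll the server until PING succeeds. Below is a minimal sketch using the redis-py client; the host, port, timeout, and function name are placeholders, not values taken from session.py:

```python
import time

import redis  # redis-py client, assumed to be available


def wait_for_redis(host: str = "localhost", port: int = 6379, timeout: float = 30.0) -> None:
    """Poll the server until PING succeeds instead of sleeping a fixed 5s."""
    client = redis.Redis(host=host, port=port, socket_connect_timeout=1)
    deadline = time.monotonic() + timeout
    while True:
        try:
            if client.ping():  # True once the server accepts commands
                return
        except redis.exceptions.ConnectionError:
            pass  # server not up yet; retry below
        if time.monotonic() > deadline:
            raise TimeoutError(f"redis at {host}:{port} not ready after {timeout}s")
        time.sleep(0.25)  # short poll interval instead of one long sleep
```

This returns as soon as the server is up (often well under 5s) and fails loudly if it never comes up, instead of continuing on a stale assumption.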
evals/elsuite/multistep_web_tasks/webarena/core/env.py (3 lines):
  - line 15: # TODO: work out if these should be implemented differently -
  - line 57: TODO: work out if truncated and info are strictly necessary"""
  - line 79: """TODO: improve the way this string is built"""

evals/solvers/providers/openai/openai_assistants_solver.py (2 lines):
  - line 161: # TODO: Handle content.text.annotations ?
  - line 169: # TODO: The Assistant also reports Run Steps which detail logs for tool use

evals/elsuite/multistep_web_tasks/webarena/bash_env/basic_bash_env.py (2 lines):
  - line 46: # TODO: work out if there's a better way to wait on the container
  - line 153: # TODO: work out if registering/attaching the container should happen inside `session.setup_container`

evals/solvers/providers/anthropic/anthropic_solver.py (2 lines):
  - line 51: # TODO: handle context length limit; possible once anthropic tokenizer is available
  - line 137: # TODO: make this format of dict a dataclass type to be reused througout lib?

evals/elsuite/hr_ml_agent_bench/benchmarks/parkinsons_disease/env/train.py (2 lines):
  - line 103: ## TODO: data cleaning and feature engineering
  - line 176: # TODO - make your predictions here by modifying 'rating' sample_submission dataframe

evals/elsuite/skill_acquisition/scraping/scrape_miskito.py (2 lines):
  - line 9: # TODO: make sure italicised text is crawled properly and that hints are excluded from answers.
  - line 10: # TODO: Split any multi-part questions into individual questions.

evals/solvers/solver.py (2 lines):
  - line 140: # TODO: Should we allow nested solvers to (also) take Solver classes instead of SolverSpecs?
  - line 194: solver_copy = deepcopy(self) # TODO: We should deepcopy without copying the cache

evals/elsuite/skill_acquisition/eval.py (2 lines):
  - line 206: # TODO: decide which metric to report – propagated standard deviation
  - line 270: # TODO: more general file format.

evals/solvers/providers/google/gemini_solver.py (2 lines):
  - line 44: # TODO: Could we just use google's own types?
  - line 136: # TODO: Why does this error ever occur and how can we handle it better?

evals/elsuite/multistep_web_tasks/webarena/bash_browser_env/bash_browser_env.py (2 lines):
  - line 73: TODO: work out what observation to return
  - line 102: TODO: work out if it matters that stop actions will always be parsed

evals/elsuite/multistep_web_tasks/webarena/core/utils.py (2 lines):
  - line 101: """TODO: add the hardcoded args to from_dict if we want to change them"""
  - line 118: """TODO: add the hardcoded args to to_dict if we want to record them"""

evals/elsuite/multistep_web_tasks/docker/flask-playwright/app.py (2 lines):
  - line 29: # TODO: pass this instead of hardcoding it
  - line 55: storage_state=None, # TODO: pass this if needed (how to handle auth?)

evals/registry/solvers/error_recovery.yaml (1 line):
  - line 1: # TODO: use default solvers once they are versioned

evals/elsuite/hr_ml_agent_bench/high_level_actions.py (1 line):
  - line 76: # TODO: handle long file editing

evals/elsuite/error_recovery/scripts/dataset_creation.py (1 line):
  - line 90: TODO (ian): think about renaming

evals/elsuite/function_deduction/prompts.py (1 line):
  - line 25: # TODO: Include in the task description whether we're in the easy or hard mode?
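For the solver.py entry at line 194 (`solver_copy = deepcopy(self)` currently copies the cache along with everything else), Python's `__deepcopy__` hook lets a class copy every field except its cache. A sketch against a hypothetical `CachingSolver`, not the actual evals `Solver` class:

```python
import copy


class CachingSolver:
    """Illustrative stand-in for a solver that carries a large cache."""

    def __init__(self) -> None:
        self._cache: dict = {}  # potentially large; copies shouldn't duplicate it

    def __deepcopy__(self, memo: dict) -> "CachingSolver":
        cls = self.__class__
        new = cls.__new__(cls)
        memo[id(self)] = new  # register early to handle reference cycles
        for name, value in self.__dict__.items():
            if name == "_cache":
                new._cache = self._cache  # share the cache instead of deep-copying it
            else:
                setattr(new, name, copy.deepcopy(value, memo))
        return new
```

Sharing `self._cache` keeps cached results visible to the copy; assigning `{}` on that branch would instead give each copy a fresh, empty cache.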
evals/record.py (1 line):
  - line 501: # TODO: model_name -> completion_fns

evals/elsuite/multistep_web_tasks/webarena/browser_env/actions.py (1 line):
  - line 354: ]: # TODO: can be further optimized

evals/elsuite/multistep_web_tasks/solvers/strong_solver/strong_solver.py (1 line):
  - line 91: # TODO: use as many previous observations as will fit in the context, rather than just 3

pyproject.toml (1 line):
  - line 69: # TODO: This should be brought down in the future

evals/elsuite/function_deduction/solvers.py (1 line):
  - line 94: # TODO: Once we have solvers with proper built-in support for output postprocessors,

evals/elsuite/incontext_rl/baselines.py (1 line):
  - line 83: # TODO these might not be true if environment is not discrete

evals/elsuite/function_deduction/eval.py (1 line):
  - line 251: # TODO: add more complexity-related metrics, such as correlation or linear regression coefficient.

evals/elsuite/error_recovery/scripts/make_plots.py (1 line):
  - line 123: # TODO: work out how to order a variable set of models

evals/elsuite/twenty_questions/eval.py (1 line):
  - line 177: # TODO: Maybe make the guesser retry here?

evals/elsuite/error_recovery/eval.py (1 line):
  - line 202: TODO (ian): Work out whether to add mistake notification to 'no reasoning' baseline

evals/solvers/nested/fewshot_solver.py (1 line):
  - line 12: train_jsonl: str, # TODO: move this to be handled eval-side

evals/elsuite/multistep_web_tasks/webarena/core/playwright_api.py (1 line):
  - line 30: # TODO: find a way to dynamically wait

evals/registry/data/hr_ml_agent_bench/parkinsons_disease/dataset/public_timeseries_testing_util.py (1 line):
  - line 25: # TODO: uncomment and fill in the following three variables

evals/solvers/nested/cot_solver.py (1 line):
  - line 17: private_interaction_length: int = 3, # TODO: do this better

evals/elsuite/self_prompting/eval.py (1 line):
  - line 142: ): # TODO: Ideally we would check that the tasks are the same

evals/registry/solvers/skill_acquisition.yaml (1 line):
  - line 133: # TODO: refactor few-shot solver so that train_jsonl is not parameterised here to reduce verbosity.

evals/registry/data/backgammon/generate_samples.ipynb (1 line):
  - line 897: " # TODO: check how to phrase this\n",

evals/elsuite/skill_acquisition/scripts/make_plots.py (1 line):
  - line 143: # TODO: report directly as 'average_correct_calls' in future and remove this rename.
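The strong_solver.py entry (line 91) wants to include as many previous observations as fit in the context window, rather than a fixed 3. A greedy newest-first pass over the history under a token budget is one common approach; this sketch assumes the tiktoken tokenizer, and the function name, signature, and budget handling are invented for illustration:

```python
import tiktoken


def fit_observations(observations: list[str], budget_tokens: int, model: str = "gpt-4") -> list[str]:
    """Keep the longest recent suffix of `observations` within a token budget."""
    enc = tiktoken.encoding_for_model(model)
    kept: list[str] = []
    used = 0
    for obs in reversed(observations):  # walk newest-first
        n = len(enc.encode(obs))
        if used + n > budget_tokens:
            break  # everything older would only push us further over budget
        kept.append(obs)
        used += n
    return list(reversed(kept))  # restore chronological order
```

For example, `fit_observations(history, budget_tokens=4000)` returns the most recent observations whose encodings total at most 4000 tokens, degrading gracefully to fewer (or zero) when individual observations are very long.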