lmms_eval/api/task.py (16 lines):
- line 67: fewshot_split: str = None # TODO: assert that this not None if num_fewshot > 0. (?) assert if this is same split as one evaling (?)
- line 123: # TODO: how to make TaskConfigs be de- and re-serializable, even when using the !function constructor?
- line 139: # TODO: should any default value in the TaskConfig not be printed?
- line 147: # TODO: this should handle Promptsource template objects as a separate case?
- line 371: # sample fewshot context #TODO: need to offset doc_id by rank now!
- line 374: # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
- line 400: TODO: update this docstring
- line 507: # TODO: this should only return the overrides applied to a non-YAML task's configuration.
- line 517: def __init__(self, model_name) -> None: # TODO no super() call here
- line 635: # TODO: handle this in TaskConfig.__post_init__ ?
- line 949: # TODO: we should raise a warning telling users this will at most ~2x runtime.
- line 1035: # TODO: this gets score of 0 on arc_challenge for pythia-70m. need to test that this works properly
- line 1070: # TODO: this may break for multipLe_target, non zero-or-1 metrics
- line 1083: except TypeError: # TODO: this is hacky and I don't want to do it
- line 1086: # TODO: this handles the case where HF evaluate returns a dict.
- line 1103: # TODO: this handles the case where HF evaluate returns a dict.

lmms_eval/evaluator.py (4 lines):
- line 81: torch.manual_seed(1234) # TODO: this may affect training runs that are run with evaluation mid-run.
- line 298: # TODO: del model here, maybe (idea: allow user to specify device of e.g. reward model separately)
- line 315: # TODO: make it possible to use a different metric per filter
- line 484: # TODO: There should be a way for users

lmms_eval/api/samplers.py (4 lines):
- line 28: # TODO: should we just stop people from using fewshot from same split as evaluating?
- line 34: # TODO: is separating doc_to_text and doc_to_target by one space always desired?
- line 71: TODO: this should return approximately class-balanced samples from our fewshot examples.
- line 72: TODO: what order should they be in? maybe random?

lmms_eval/filters/__init__.py (2 lines):
- line 16: # TODO: implement this filter. either it should take in an arbitrary "scoring"/reward function
- line 40: f = get_filter(function)(**kwargs) # TODO: pass kwargs to filters properly

lmms_eval/tasks/__init__.py (2 lines):
- line 127: # TODO: scrap this
- line 132: # TODO: pass num_fewshot and other cmdline overrides in a better way

lmms_eval/models/llava.py (1 line):
- line 357: # TODO: pay attention to this major generation step...

lmms_eval/api/model.py (1 line):
- line 57: # TODO: Add an optional max length

lmms_eval/api/registry.py (1 line):
- line 81: # TODO: do we want to enforce a certain interface to registered metrics?

lmms_eval/filters/decontamination.py (1 line):
- line 14: TODO: make sure only ever run one time on the train set (should this be cached as a class var? keyed by value for "path").

lmms_eval/utils.py (1 line):
- line 273: # TODO: overhaul reorderer. It currently grouped requests by content but we don't want this

lmms_eval/api/metrics.py (1 line):
- line 92: Higher is better # TODO I think

lmms_eval/api/instance.py (1 line):
- line 10: metadata: Tuple[str, int, int] = field(default_factory=lambda: (None, None, None)) # TODO: better typehints here
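
The TODO at lmms_eval/evaluator.py line 81 worries that calling torch.manual_seed(1234) for evaluation could perturb a training run that evaluates mid-run. One way to address it is to snapshot the RNG state before seeding and restore it afterwards. This is a minimal sketch; `seeded_eval` is a hypothetical helper, not part of lmms_eval, and CUDA generators would need the same treatment.

```python
import random
from contextlib import contextmanager

import numpy as np
import torch


@contextmanager
def seeded_eval(seed: int = 1234):
    """Seed Python/NumPy/PyTorch for evaluation, then restore the
    previous RNG state so a surrounding training run is unaffected."""
    py_state = random.getstate()
    np_state = np.random.get_state()
    torch_state = torch.random.get_rng_state()
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    try:
        yield
    finally:
        random.setstate(py_state)
        np.random.set_state(np_state)
        torch.random.set_rng_state(torch_state)
```

Evaluation would then run inside `with seeded_eval():` instead of seeding the global generators unconditionally.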
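
For the lmms_eval/api/task.py line 123 TODO about making TaskConfigs de- and re-serializable despite the `!function` YAML constructor, a common approach is to dump callables as dotted import paths and re-import them on load. The helpers below are a hypothetical sketch; the tag format and the round-trip scheme are assumptions, not the project's actual behavior.

```python
import importlib
from typing import Any


def dump_config_value(value: Any) -> Any:
    """Replace a callable with a '!function module.qualname' string so
    the containing config can be written back out as YAML/JSON."""
    if callable(value):
        return f"!function {value.__module__}.{value.__qualname__}"
    return value


def load_config_value(value: Any) -> Any:
    """Inverse of dump_config_value: re-import '!function module.attr'."""
    if isinstance(value, str) and value.startswith("!function "):
        module_name, _, attr = value[len("!function "):].rpartition(".")
        return getattr(importlib.import_module(module_name), attr)
    return value
```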
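
The lmms_eval/api/samplers.py TODOs at lines 71-72 ask for approximately class-balanced fewshot samples and raise the question of ordering. The sketch below assumes `doc_to_target(doc)` returns a hashable label; `sample_class_balanced` is a hypothetical helper, not an existing sampler in the codebase.

```python
import random
from collections import defaultdict
from typing import Callable, Hashable, List, Optional, Sequence


def sample_class_balanced(
    docs: Sequence,
    doc_to_target: Callable[[object], Hashable],
    k: int,
    rnd: Optional[random.Random] = None,
) -> List:
    """Draw up to k fewshot docs, cycling over target labels so each
    class is represented roughly equally, then shuffle the result."""
    rnd = rnd or random.Random(1234)
    by_label = defaultdict(list)
    for doc in docs:
        by_label[doc_to_target(doc)].append(doc)

    picked: List = []
    labels = list(by_label)
    while len(picked) < k and any(by_label.values()):
        for label in labels:
            bucket = by_label[label]
            if bucket and len(picked) < k:
                picked.append(bucket.pop(rnd.randrange(len(bucket))))
    rnd.shuffle(picked)  # random order, answering the second TODO
    return picked
```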
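
The lmms_eval/filters/decontamination.py TODO at line 14 suggests loading the train set only once, cached as a class variable keyed by "path". A minimal sketch of that idea, assuming the file is a plain-text list of n-grams (the real format may differ):

```python
from typing import ClassVar, Dict, Set


class TrainSetCache:
    """Load each decontamination file at most once per process,
    keyed by its path."""

    _cache: ClassVar[Dict[str, Set[str]]] = {}

    @classmethod
    def load(cls, path: str) -> Set[str]:
        if path not in cls._cache:
            with open(path, "r", encoding="utf-8") as f:
                cls._cache[path] = {line.strip() for line in f}
        return cls._cache[path]
```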
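
For the lmms_eval/api/instance.py TODO about better type hints on the metadata tuple, one option is a small dataclass with named Optional fields instead of a bare `Tuple[str, int, int]` defaulting to Nones. The field names here are assumptions inferred from the tuple's shape, not the project's definition.

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class InstanceMetadata:
    """Named replacement for the anonymous (str, int, int) tuple."""

    task_name: Optional[str] = None
    doc_id: Optional[int] = None
    repeats: Optional[int] = None
```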
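
The lmms_eval/api/registry.py TODO at line 81 asks whether registered metrics should be held to a fixed interface. A `typing.Protocol` is one lightweight way to express that; the signature below (per-item values in, aggregated float out) is an assumption about what registered metrics look like, not the library's contract.

```python
from typing import Protocol, Sequence


class AggregatedMetric(Protocol):
    """Interface a registered metric could be checked against."""

    def __call__(self, items: Sequence) -> float:
        ...
```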