lm_eval/api/task.py (17 lines):
- line 71: None # TODO: assert that this is not None if num_fewshot > 0 (?); assert whether this is the same split we are evaluating on (?)
- line 151: # TODO: should any default value in the TaskConfig not be printed?
- line 445: # sample fewshot context # TODO: need to offset doc_id by rank now!
- line 455: # TODO: we should override self.config.repeats if doing greedy gen so users don't waste time+compute
- line 501: TODO: update this docstring
- line 627: # TODO: this should only return the overrides applied to a non-YAML task's configuration.
- line 711: ) -> None: # TODO: no super() call here
- line 751: # TODO: handle this in TaskConfig.__post_init__?
- line 1311: # TODO: we should raise a warning telling users this will at most ~2x runtime.
- line 1428: # TODO: this gets a score of 0 on arc_challenge for pythia-70m; need to test that this works properly
- line 1433: # TODO: use keyword arguments to the metric?
- line 1474: # TODO: this may break for multiple_target, non zero-or-1 metrics
- line 1498: ): # TODO: this is hacky and I don't want to do it
- line 1503: # TODO: this handles the case where HF evaluate returns a dict.
- line 1520: # TODO: this handles the case where HF evaluate returns a dict.
- line 1560: # TODO: add mutual info here?
- line 1575: ] # only retain loglikelihoods and discard is_greedy. TODO: do we need is_greedy anywhere?
lm_eval/models/huggingface.py (9 lines):
- line 192: # TODO: include in the warning that `load_in_8bit` etc. affect this too
- line 195: # TODO: update this to be less of a hack once subfolder is fixed in HF
- line 245: # TODO: can we remove this whole snippet, except in the mps case, perhaps?
- line 536: device_map_option, # TODO: phase out device_map_option?
- line 923: # TODO: right now we pass a single EOT token to the encoder and the full context to the decoder in the seq2seq case
- line 982: # TODO: implement some kind of efficient-request middleware that lumps together requests with the same context
- line 1087: # TODO: left-shift these?
- line 1088: # TODO: our code assumes we never end up truncating conts for either model type
- line 1119: # TODO: left-pad encoder inps and mask?
scripts/regression.py (6 lines):
- line 36: # TODO: implement num_fewshot and limit per task, e.g. task1:5,task2:1:100,task3::1000
- line 39: # TODO: implement hf-auto to pick between causal and seq2seq models so we don't need this
- line 67: # TODO: split_and_pad_windows in AutoSeq2SeqLM doesn't exist, #527
- line 73: # TODO: OOM with auto for seq2seq models; can also OOM with llama
- line 163: # TODO: implement proper timing for each task
- line 164: # TODO: reduce IO by sharing tasks between models?
lm_eval/api/samplers.py (4 lines):
- line 42: # TODO: should we just stop people from using fewshot examples from the same split they are evaluating on?
- line 83: # TODO: should we just stop people from using fewshot examples from the same split they are evaluating on?
- line 141: TODO: this should return approximately class-balanced samples from our fewshot examples.
- line 142: TODO: what order should they be in? maybe random? (see the sketch below)
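The two questions in samplers.py lines 141-142 suggest a concrete shape. Below is a minimal sketch of what such a sampler might look like; `balanced_sample`, the `label_fn` parameter, and the round-robin strategy are all assumptions for illustration, not existing harness code.

```python
import random
from collections import defaultdict

def balanced_sample(docs, n, label_fn, rng=None):
    # Hypothetical sketch: draw approximately class-balanced fewshot
    # examples (samplers.py line 141) and return them in random order,
    # one possible answer to the ordering question on line 142.
    rng = rng or random.Random(1234)
    by_label = defaultdict(list)
    for doc in docs:
        by_label[label_fn(doc)].append(doc)
    labels = list(by_label)
    picked = []
    # Round-robin over classes so each contributes roughly n / len(labels) docs.
    while len(picked) < n and any(by_label[label] for label in labels):
        for label in labels:
            if by_label[label] and len(picked) < n:
                idx = rng.randrange(len(by_label[label]))
                picked.append(by_label[label].pop(idx))
    rng.shuffle(picked)  # randomize the final order
    return picked
```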
lm_eval/tasks/__init__.py (4 lines):
- line 273: # very scuffed: set the task name here. TODO: fix me?
- line 439: # TODO: remove group in next release
- line 487: # TODO: remove group in next release
- line 538: # TODO: scrap this
lm_eval/models/anthropic_llms.py (4 lines):
- line 143: REQ_CHUNK_SIZE = 20 # TODO: not used
- line 242: temperature=temperature, # TODO: implement non-greedy sampling for Anthropic
- line 275: REQ_CHUNK_SIZE = 20 # TODO: not used
- line 346: temperature=temperature, # TODO: implement non-greedy sampling for Anthropic
lm_eval/evaluator.py (3 lines):
- line 480: # TODO: del model here, maybe (idea: allow the user to specify the device of e.g. a reward model separately)
- line 487: # TODO: make it possible to use a different metric per filter
- line 588: # TODO: clean this up; unify with the metric_list loop below?
lm_eval/evaluator_utils.py (3 lines):
- line 105: self.sample_len = len(items) # TODO: same sample size for each metric?
- line 457: ] # TODO: copy?
- line 487: # TODO: calculate groups' metrics using arbitrary aggregation fns
lm_eval/models/neuron_optimum.py (3 lines):
- line 234: # TODO: update this to be less of a hack once subfolder is fixed in HF
- line 475: # TODO: right now we pass a single EOT token to the encoder and the full context to the decoder in the seq2seq case
- line 510: # TODO: implement some kind of efficient-request middleware that lumps together requests with the same context
lm_eval/models/openai_completions.py (2 lines):
- line 201: # TODO: the logic is much simpler if we just look at the length of continuation tokens
- line 327: # TODO: right now we pass a single EOT token to the encoder and the full context to the decoder in the seq2seq case
lm_eval/prompts/__init__.py (2 lines):
- line 101: # TODO: allow multiple prompt names
- line 119: # TODO: need a way to process doc_to_choice
lm_eval/api/metrics.py (2 lines):
- line 96: Higher is better # TODO: I think
- line 562: # TODO: does not hold for non-mean aggregations
lm_eval/tasks/hendrycks_ethics/justice.yaml (1 line):
- line 7: # TODO: implement exact match for this and deontology
lm_eval/filters/selection.py (1 line):
- line 7: # TODO: implement an "arg_max" filter. either it should take in an arbitrary "scoring"/reward function
lm_eval/api/registry.py (1 line):
- line 95: # TODO: do we want to enforce a certain interface for registered metrics?
lm_eval/api/model.py (1 line):
- line 98: # TODO: add an optional max length
lm_eval/models/__init__.py (1 line):
- line 17: # TODO: implement __all__
lm_eval/decontamination/janitor.py (1 line):
- line 109: # FIXME delete_chars: should anything else go here? special chars?
lm_eval/api/group.py (1 line):
- line 61: # TODO: should any default value in the TaskConfig not be printed?
lm_eval/tasks/nq_open/nq_open.yaml (1 line):
- line 8: doc_to_target: "{{answer}}" # TODO: should be multi-target
lm_eval/models/textsynth.py (1 line):
- line 124: # TODO: The TextSynth API does not support tokenized inputs so we cannot
lm_eval/tasks/hendrycks_ethics/deontology.yaml (1 line):
- line 9: # TODO: implement exact-match metric for this subset
lm_eval/filters/decontamination.py (1 line):
- line 16: TODO: make sure this is only ever run once on the train set (should this be cached as a class var, keyed by the value of "path"?)
lm_eval/utils.py (1 line):
- line 275: # TODO: overhaul the reorderer; it currently groups requests by content, but we don't want this (see the sketch below)
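For the last item, one possible direction is to keep the sort-and-restore behavior while dropping the grouping of identical contents. The sketch below is an assumption about the intended fix; `SimpleReorderer` and its method names are hypothetical, not the existing `lm_eval.utils` API.

```python
class SimpleReorderer:
    # Hypothetical sketch for utils.py line 275: reorder items by a sort
    # key (e.g. context length) WITHOUT grouping identical contents, then
    # undo the permutation once results come back.
    def __init__(self, items, sort_key):
        # Tag each item with its original index before sorting.
        self._tagged = sorted(enumerate(items), key=lambda pair: sort_key(pair[1]))

    def get_reordered(self):
        return [item for _, item in self._tagged]

    def get_original(self, results):
        # `results` must be aligned with get_reordered(); restore input order.
        restored = [None] * len(self._tagged)
        for (orig_idx, _), res in zip(self._tagged, results):
            restored[orig_idx] = res
        return restored
```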