optimum/tpu/modeling_llama.py (8 lines):
    - line 155: self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: this may break with compilation
    - line 456: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
    - line 501: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
    - line 572: # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
    - line 664: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
    - line 1094: # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
    - line 1363: # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects
    - line 1401: # TODO: use `next_tokens` directly instead.

optimum/tpu/modeling_gemma.py (7 lines):
    - line 366: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
    - line 411: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
    - line 482: # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in GemmaFlashAttention2 __init__.
    - line 575: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
    - line 1015: # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static
    - line 1270: # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects
    - line 1308: # TODO: use `next_tokens` directly instead.

optimum/tpu/modeling_mistral.py (4 lines):
    - line 347: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
    - line 514: # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
    - line 642: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
    - line 1063: # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static

text-generation-inference/server/text_generation_server/generator.py (3 lines):
    - line 169: # TODO: watermark
    - line 172: # TODO: stop_sequences, ignore_eos_token
    - line 852: # TODO: maybe model_config can be removed from mailbox

text-generation-inference/server/text_generation_server/jetstream_pt_support/generator.py (2 lines):
    - line 125: # TODO: watermark
    - line 128: # TODO: stop_sequences, ignore_eos_token

text-generation-inference/server/text_generation_server/jetstream_pt_support/models/llama_model_exportable_hf.py (1 line):
    - line 12: # TODO: it would be better to have RoPE scaling code in Jetstream Pytorch, but until that is not done,

optimum/tpu/static_cache_xla.py (1 line):
    - line 50: # TODO: deprecate this function in favor of `cache_position`

text-generation-inference/server/text_generation_server/cli.py (1 line):
    - line 64: # TODO: these two parameters are used when the server is started, but they are not used yet, so just inform the
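
The TODO at optimum/tpu/modeling_llama.py line 155 concerns the RoPE inverse-frequency buffer being registered as non-persistent. A minimal sketch of that PyTorch pattern follows; the class name, dimensions, and base are assumptions for illustration, not the actual optimum-tpu module:

```python
import torch
from torch import nn


class RotaryEmbedding(nn.Module):
    """Illustrative only: names and defaults are assumptions, not the optimum-tpu code."""

    def __init__(self, dim: int = 64, base: float = 10000.0):
        super().__init__()
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        # persistent=False keeps inv_freq out of the state_dict, so it is rebuilt
        # from the config rather than loaded from a checkpoint; the TODO notes this
        # buffer handling may break under compilation.
        self.register_buffer("inv_freq", inv_freq, persistent=False)


print(RotaryEmbedding().inv_freq.shape)  # torch.Size([32])
```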
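
The transpose TODOs at modeling_llama.py line 501 and modeling_gemma.py line 411 both describe the same layout mismatch between the KV cache and Flash Attention. A hedged sketch of that mismatch, with made-up shapes:

```python
import torch

batch_size, num_heads, seq_len, head_dim = 2, 8, 16, 64

# The KV cache and eager attention path keep tensors in
# [batch_size, num_heads, seq_len, head_dim] ...
query_states = torch.randn(batch_size, num_heads, seq_len, head_dim)

# ... while Flash Attention requires [batch_size, sequence_length, num_heads, head_dim],
# hence the extra transposes the TODOs flag as inefficient.
query_states = query_states.transpose(1, 2)
assert query_states.shape == (batch_size, seq_len, num_heads, head_dim)
```

Storing the KV cache directly in the Flash Attention layout would remove these transposes, which is the refactor the TODOs suggest.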