src/transformers/generation/utils.py (11 lines):
- line 328: # TODO (joao): remove the equivalent classes and typing shortcuts below in v5
- line 692: # 8. Remove unexpected `generate` inputs (TODO @joao: fix trainer and examples)
- line 973: # TODO (joao): remove output/input mismatch when these old models (xlnet, reformer) are deprecated
- line 1263: # TODO (joao): find a strategy to specify the order of the processors
- line 1584: # TODO: A better way to handle this.
- line 1720: # TODO (joao): per-model generation config classes.
- line 2422: # TODO (joao): generalize this check with other types of inputs
- line 3648: # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
- line 3703: TODO: standardize cache formats and make all models compatible with `Cache`. It would remove the need
- line 3822: # TODO (joao): This function should take an optional beam scorer function, to manipulate the scores after
- line 3986: # TODO (joao): standardize special cases

src/transformers/cache_utils.py (10 lines):
- line 160: # TODO: deprecate this function in favor of `cache_position`
- line 564: # TODO: deprecate this function in favor of `cache_position`
- line 709: # TODO (tmanlaibaatar) This won't be needed in torch 2.7.
- line 1071: # TODO (joao, manuel): Remove this class in v4.59.0
- line 1209: # TODO: deprecate this function in favor of `cache_position`
- line 1703: # TODO: deprecate this function in favor of `cache_position`
- line 1912: # TODO: deprecate this function in favor of `cache_position`
- line 1975: # TODO (joao): to enable this cache on multiple devicesuse the pattern from `OffloadedCache`, which keeps
- line 2110: # TODO (joao): add layer_device_map arg and update code in `generate` accordingly
- line 2234: # TODO (joao): to enable this cache on multiple devicesuse the pattern from `OffloadedCache`, which keeps

src/transformers/modeling_tf_utils.py (9 lines):
- line 1480: # TODO Matt: This is a workaround for older versions of datasets that are missing the `cols_to_retain`
- line 2009: # TODO (joao): flagged for replacement (by `_v2_resized_token_embeddings`) due to embeddings refactor
- line 2049: # TODO (joao): flagged for detection due to embeddings refactor
- line 2080: # TODO (joao): flagged for replacement (by `_v2_resize_token_embeddings`) due to embeddings refactor
- line 2117: # TODO (joao): this one probably needs a v2 version with other models
- line 2140: # TODO (joao): flagged for replacement (by `_v2_get_resized_lm_head_bias`) due to embeddings refactor
- line 2267: # TODO (joao): flagged for replacement (by `_v2_get_resized_embeddings`) due to embeddings refactor
- line 2938: # TODO Matt: This is a temporary workaround to allow weight renaming, but requires a method
- line 3315: # TODO (joao): flagged for detection due to embeddings refactor

src/transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py (8 lines):
- line 500: # TODO cyril: modular
- line 511: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 560: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
- line 622: # TODO cyril: modular
- line 643: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
- line 1366: # TODO: @eustlb, this should be standardized
- line 1394: # TODO: @eustlb, this should be standardized
- line 1411: # TODO: @eustlb, we should have per-batch-idx values

src/transformers/utils/import_utils.py (8 lines):
- line 44: # TODO: This doesn't work for all packages (`bs4`, `faiss`, etc.) Talk to Sylvain to see how to do with it better.
- line 51: # TODO: Once python 3.9 support is dropped, `importlib.metadata.packages_distributions()`
- line 74: # TODO: remove once `importlib.metadata.packages_distributions()` is supported.
- line 427: # TODO check if some bugs cause push backs on the exact version
- line 619: # TODO: more precise exception matching, if possible.
- line 643: # TODO: more precise exception matching, if possible.
- line 1115: # TODO: Bump the requirement to 2.1.0 once released in https://github.com/ROCmSoftwarePlatform/flash-attention
- line 1136: # TODO: Check for a minimum version when FA3 is stable

src/transformers/models/mimi/modeling_mimi.py (7 lines):
- line 704: # TODO cyril: modular
- line 715: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 759: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
- line 821: # TODO cyril: modular
- line 842: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
- line 1477: # TODO: @eustlb, let's make the encoder support padding_mask so that batched inputs are supported.
- line 1480: # TODO: @eustlb, convert the padding mask to attention mask.

src/transformers/trainer.py (7 lines):
- line 267: # TODO: @AjayP13, @younesbelkada replace this check with version check at the next `accelerate` release
- line 1532: # TODO Change dtypes back to M=FP32, Var = BF16, Kahan = False once they can be cast together in torchdistx.
- line 2870: # TODO: in the future support only specific min PEFT versions
- line 2960: # TODO: in the future support only specific min PEFT versions
- line 3707: # TODO Matt: This syntax is deprecated and the preferred version is
- line 3838: # TODO: this needs to be fixed and made cleaner later.
- line 4625: # TODO: this needs to be fixed and made cleaner later.

src/transformers/models/nemotron/modeling_nemotron.py (6 lines):
- line 280: # TODO cyril: modular
- line 291: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 338: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
- line 401: # TODO cyril: modular
- line 423: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
- line 867: # TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->NEMOTRON,Llama->Nemotron,llama->nemotron

utils/check_repo.py (6 lines):
- line 126: "TFRobertaForMultipleChoice", # TODO: fix
- line 127: "TFRobertaPreLayerNormForMultipleChoice", # TODO: fix
- line 153: "MllamaTextModel", # Building part of bigger (tested) model. # TODO: add tests
- line 154: "MllamaVisionModel", # Building part of bigger (tested) model. # TODO: add tests
- line 155: "Llama4TextModel", # Building part of bigger (tested) model. # TODO: add tests
- line 156: "Llama4VisionModel", # Building part of bigger (tested) model. # TODO: add tests

utils/notification_service.py (5 lines):
- line 618: # TODO: Improve the condition here.
- line 664: # TODO: We should NOT assume it's always Nvidia CI, but it's the case at this moment.
- line 779: # TODO: Make sure we always have a valid job link (or at least a way not to break the report sending)
- line 1206: # TODO: ???
- line 1225: # TODO: How to deal wit this

src/transformers/modeling_rope_utils.py (5 lines):
- line 71: self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation
- line 196: # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
- line 341: # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling
- line 494: # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`
- line 546: # TODO (joao): update logic for the inclusion of `original_max_position_embeddings`

src/transformers/models/moshi/modeling_moshi.py (5 lines):
- line 499: # TODO cyril: modular
- line 510: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 559: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
- line 621: # TODO cyril: modular
- line 642: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.

src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py (5 lines):
- line 321: # `sqrt` in order to prevent NaNs during training in bfloat16. TODO a bit annoying
- line 338: # TODO refactor
- line 632: if use_cache and inputs_embeds.shape[1] != 1: # TODO let's maybe only call in the `generate`?
- line 696: # TODO: re-enable check: Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->RECURRENTGEMMA,Llama->RecurrentGemma,llama->gemma
- line 784: # Soft-cap the logits TODO remove if always done.

src/transformers/modeling_utils.py (5 lines):
- line 3784: # TODO: fix safe_serialization for tied weights
- line 4421: # TODO: we can relax this check when we support taking tp_plan from a json file, for example.
- line 4439: # TODO: make device_mesh support multiple dimensions
- line 4509: # TODO Cyril: raise an error instead of the warning in v4.53 (and change the test to check for raise instead of success)
- line 5630: if "llama4" in self.config.model_type: # TODO try to enable for FULL COMPILE HYBRID CACHE SUPPORT

src/transformers/models/detr/image_processing_detr.py (4 lines):
- line 638: # TODO - (Amy) make compatible with other frameworks
- line 661: # TODO - (Amy) make compatible with other frameworks
- line 1042: # TODO (Amy) - update to use `rescale_factor` instead of `scale`
- line 1503: # POSTPROCESSING METHODS - TODO: add support for other frameworks

src/transformers/pipelines/base.py (4 lines):
- line 1085: # TODO (joao): no PT model should reach this line. However, some audio models with complex
- line 1394: # TODO hack by collating feature_extractor and image_processor
- line 1437: # TODO make the get_iterator work also for `tf` (and `flax`).
- line 1512: # TODO hack by collating feature_extractor and image_processor

src/transformers/generation/tf_utils.py (4 lines):
- line 1629: # TODO (Joao): fix cache format or find programmatic way to detect cache index
- line 1913: # TODO (Joao): fix cache format or find programmatic way to detect cache index
- line 2256: # TODO (Joao): fix cache format or find programmatic way to detect cache index
- line 2791: # TODO (Joao): fix cache format or find programmatic way to detect cache index

src/transformers/utils/fx.py (4 lines):
- line 199: # TODO: add support for them as it should be quite easy to do so (small blocking issues).
- line 391: # TODO: infer shape without performing the computation, this might be quite hard.
- line 584: # TODO: infer shape without performing the computation.
- line 1341: # TODO: solves GraphModule creation.

utils/check_config_attributes.py (4 lines):
- line 61: # generation configs (TODO joao)
- line 284: # TODO (ydshieh): Check the failing cases, try to fix them or move some cases to the above block once we are sure
- line 313: # TODO: @Arthur (for `alignment_head` and `alignment_layer`)
- line 315: # TODO: @Younes (for `is_decoder`)

src/transformers/models/falcon/modeling_falcon.py (4 lines):
- line 232: # TODO (raushan): remove in v4.46 (RoPE is computed in the model, not in the decoder layers)
- line 438: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 477: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
- line 892: # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static

src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py (4 lines):
- line 388: self.t2u_variance_predictor_embed_dim = t2u_variance_predictor_embed_dim # TODO: add to docstrings
- line 389: self.t2u_variance_predictor_hidden_dim = t2u_variance_predictor_hidden_dim # TODO: add to docstrings
- line 390: self.t2u_variance_predictor_kernel_size = t2u_variance_predictor_kernel_size # TODO: add to docstrings
- line 391: self.t2u_variance_pred_dropout = t2u_variance_pred_dropout # TODO: add to docstrings

src/transformers/models/fuyu/processing_fuyu.py (4 lines):
- line 128: # TODO Remove this logic in a subsequent release since subsequences are not supported.
- line 361: self.max_position_embeddings = 16384 # TODO Can't derive this from model files: where to set it?
- line 441: # FIXME max_tokens_to_generate is embedded into this processor's call.
- line 555: # FIXME - We hard code "pt" here because the rest of the processing assumes torch tensors

src/transformers/models/qwen2_moe/modeling_qwen2_moe.py (4 lines):
- line 367: # TODO cyril: modular
- line 380: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 483: # TODO cyril: modular
- line 504: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
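Several entries above repeat the same note (e.g. modeling_falcon.py line 477, and the matching comments in the Kyutai, Mimi, Moshi and Nemotron models): these attention modules keep tensors as [batch_size, num_heads, sequence_length, head_dim], while Flash Attention wants [batch_size, sequence_length, num_heads, head_dim], so they transpose on every call. A minimal illustration of that layout conversion (shapes and variable names are mine, not from the repository):

```python
import torch

# Cache/attention layout used by these models: [batch_size, num_heads, seq_len, head_dim]
batch_size, num_heads, seq_len, head_dim = 2, 8, 16, 64
query = torch.randn(batch_size, num_heads, seq_len, head_dim)

# Flash Attention expects [batch_size, seq_len, num_heads, head_dim]; transpose(1, 2)
# swaps the head and sequence axes, and .contiguous() materializes the copy that the
# "quite inefficient" comments are complaining about.
query_fa = query.transpose(1, 2).contiguous()
assert query_fa.shape == (batch_size, seq_len, num_heads, head_dim)
```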
src/transformers/integrations/tensor_parallel.py (4 lines):
- line 406: # TODO: figure out dynamo support for instance method and switch this to instance method
- line 433: # TODO: figure out dynamo support for instance method and switch this to instance method
- line 475: # TODO: figure out dynamo support for instance method and switch this to instance method
- line 763: # TODO: this logic should be wrapped in a function, this is copied from corresponding tp classes.

src/transformers/pipelines/document_question_answering.py (4 lines):
- line 104: # TODO: Update task_summary docs to include an example with document QA and then update the first sentence
- line 405: # TODO: check why slower `LayoutLMTokenizer` and `LayoutLMv2Tokenizer` don't have this key in outputs
- line 406: # FIXME: ydshieh and/or Narsil
- line 493: # TODO: A lot of this logic is specific to Donut and should probably be handled in the tokenizer

src/transformers/modeling_attn_mask_utils.py (3 lines):
- line 275: # TODO: When tracing with TorchDynamo with fullgraph=True, the model is recompiled depending on the input
- line 303: # TODO: maybe revisit this with https://github.com/pytorch/pytorch/pull/114823 in PyTorch 2.3.
- line 381: # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400).

src/transformers/models/dia/processing_dia.py (3 lines):
- line 177: # TODO: dac with batching is currently broken, but non-batch is working
- line 329: # TODO: see above, dac doesn't work in batches yet
- line 383: # TODO: @eustlb, this should be in AudioProcessor

src/transformers/models/llama4/modeling_llama4.py (3 lines):
- line 816: # TODO there is a different RoPE for vision encoder, defined as below
- line 886: scaling=None, # TODO Might be enforced here for TP compatibility as scaling is not just sqrt(head_dim)
- line 974: freqs_ci: torch.Tensor, # TODO move this to an attribute instead of keeping it around

src/transformers/models/deprecated/deta/modeling_deta.py (3 lines):
- line 424: # TODO fix this
- line 517: # TODO find a better way of exposing other arguments
- line 1153: # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36

src/transformers/models/dbrx/modeling_dbrx.py (3 lines):
- line 322: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 376: # TODO: These transpose are quite inefficient but Flash Attention requires the layout
- line 454: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.

src/transformers/masking_utils.py (3 lines):
- line 711: # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
- line 797: # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it
- line 891: # TODO: cyril -> probably revisit and remove this, but a lot of tests rely on it

src/transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py (3 lines):
- line 441: # TODO: @eustlb, this should be standardized
- line 469: # TODO: @eustlb, this should be standardized
- line 486: # TODO: @eustlb, we should have per-batch-idx values

src/transformers/models/jetmoe/modeling_jetmoe.py (3 lines):
- line 571: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
- line 644: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 699: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache

utils/create_dummy_models.py (3 lines):
- line 413: # TODO: More fine grained control of the desired tester class.
- line 628: # TODO: Try to improve `build_processor`'s definition and/or usage to avoid the above situation in the first place.
- line 853: # TODO: We need this information?

src/transformers/models/speech_to_text/modeling_speech_to_text.py (3 lines):
- line 251: # TODO: we need a refactor so that the different attention modules can get their specific kwargs
- line 401: # TODO: change copy when applying cache class
- line 529: # TODO: tests would need a rewrite to check for correct implementation

src/transformers/models/stablelm/modeling_stablelm.py (3 lines):
- line 318: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
- line 423: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 487: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache

src/transformers/models/olmoe/modeling_olmoe.py (3 lines):
- line 362: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 406: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
- line 485: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.

src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py (3 lines):
- line 53: # TODO: Could have better fused kernels depending on scaling, dropout and head mask.
- line 279: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 515: # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.

src/transformers/models/olmo/convert_olmo_weights_to_hf.py (3 lines):
- line 101: # TODO: Layernorm stuff
- line 102: # TODO: multi query attention
- line 130: # TODO: Deal with weight-tying

src/transformers/models/olmo2/convert_olmo2_weights_to_hf.py (3 lines):
- line 117: # TODO: Layernorm stuff
- line 118: # TODO: multi query attention
- line 154: # TODO: Deal with weight-tying

src/transformers/models/diffllama/modeling_diffllama.py (3 lines):
- line 240: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 294: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
- line 675: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/training_args.py (2 lines):
- line 209: # TODO: `TrainingArguments` users rely on it being fully mutable. In the future see if we can narrow this to a few keys: https://github.com/huggingface/transformers/pull/25903
- line 2137: # those deprecated arguments are removed from TrainingArguments. (TODO: v5)

src/transformers/models/conditional_detr/modeling_conditional_detr.py (2 lines):
- line 411: # TODO find a better way of exposing other arguments
- line 1825: # FIXME h_boxes takes the last one computed, keep this in mind

src/transformers/models/auto/image_processing_auto.py (2 lines):
- line 443: # TODO: @yoni, change in v4.48 (use_fast set to True by default)
- line 508: # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default)

src/transformers/models/blenderbot_small/configuration_blenderbot_small.py (2 lines):
- line 189: # TODO: figure this case out.
- line 288: # TODO: test this.

src/transformers/models/funnel/modeling_funnel.py (2 lines):
- line 884: # TODO: deal with head_mask
- line 951: # TODO: deal with head_mask

src/transformers/models/unispeech_sat/modeling_unispeech_sat.py (2 lines):
- line 348: # TODO: we need a refactor so that the different attention modules can get their specific kwargs
- line 1225: >>> # TODO: Add full pretraining example

src/transformers/testing_utils.py (2 lines):
- line 1328: # TODO: Remove once eetq releases a fix and this release is used in CI
- line 1637: # TODO (if possible): Avoid exceptional cases

src/transformers/models/bart/configuration_bart.py (2 lines):
- line 204: # TODO: figure this case out.
- line 303: # TODO: test this.

src/transformers/models/esm/openfold_utils/residue_constants.py (2 lines):
- line 364: # TODO: ^ interpret this
- line 416: # TODO: this file should be downloaded in a setup script

src/transformers/models/jamba/modeling_jamba.py (2 lines):
- line 374: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 484: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.

src/transformers/models/mbart/configuration_mbart.py (2 lines):
- line 189: # TODO: figure this case out.
- line 288: # TODO: test this.

src/transformers/models/paligemma/convert_paligemma_weights_to_hf.py (2 lines):
- line 41: # TODO add sequence length variations here
- line 164: # TODO verify correctness of layer norm loading

src/transformers/models/deepseek_v3/modeling_deepseek_v3.py (2 lines):
- line 280: TODO let's just use the original freqcis computation to not have the view
- line 588: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

benchmark/benchmark.py (2 lines):
- line 108: # TODO: Give warnings.
- line 273: # TODO: not hardcoded

src/transformers/models/longt5/modeling_longt5.py (2 lines):
- line 60: # TODO: Update before the merge
- line 1254: _supports_static_cache = False # TODO: @raushan more involved due to local/global attn

src/transformers/models/marian/configuration_marian.py (2 lines):
- line 189: # TODO: figure this case out.
- line 289: # TODO: test this.

src/transformers/models/detr/modeling_detr.py (2 lines):
- line 403: # TODO find a better way of exposing other arguments
- line 1530: # FIXME h_boxes takes the last one computed, keep this in mind

src/transformers/models/deformable_detr/modeling_deformable_detr.py (2 lines):
- line 474: # TODO find a better way of exposing other arguments
- line 1017: # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36

src/transformers/models/led/modeling_tf_led.py (2 lines):
- line 467: # TODO: This code is most likely not very efficient and should be improved
- line 2473: # TODO (Joao): investigate why LED has numerical issues in XLA generate

src/transformers/models/csm/modeling_csm.py (2 lines):
- line 789: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
- line 973: # TODO: @eustlb, this should be batched !!!

src/transformers/models/xlnet/modeling_tf_xlnet.py (2 lines):
- line 526: qlen: TODO Lysandre didn't fill
- line 527: mlen: TODO Lysandre didn't fill

src/transformers/models/time_series_transformer/modeling_time_series_transformer.py (2 lines):
- line 363: # TODO: we need a refactor so that the different attention modules can get their specific kwargs
- line 636: # TODO: tests would need a rewrite to check for correct implementation

src/transformers/models/led/modeling_led.py (2 lines):
- line 263: # TODO: remove the redundant computation
- line 378: # TODO replace this with

src/transformers/image_processing_base.py (2 lines):
- line 51: # TODO: Move BatchFeature to be imported by both image_processing_utils and image_processing_utils_fast
- line 68: # TODO: (Amy) - factor out the common parts of this and the feature extractor

src/transformers/models/bark/modeling_bark.py (2 lines):
- line 200: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 1590: # TODO (joao):workaround until nested generation config is compatible with PreTrained Model

src/transformers/models/instructblipvideo/modeling_instructblipvideo.py (2 lines):
- line 1191: _keep_in_fp32_modules = ["query_tokens"] # TODO @ArthurZucker I don't know why this is required for FP8
- line 1389: _keep_in_fp32_modules = ["query_tokens"] # TODO @ArthurZucker I don't know why this is required for FP8

src/transformers/models/longformer/modeling_longformer.py (2 lines):
- line 621: # TODO: remove the redundant computation
- line 736: # TODO replace this with

src/transformers/models/instructblip/modeling_instructblip.py (2 lines):
- line 1195: _keep_in_fp32_modules = ["query_tokens"] # TODO @ArthurZucker I don't know why this is required for FP8
- line 1383: _keep_in_fp32_modules = ["query_tokens"] # TODO @ArthurZucker I don't know why this is required for FP8

src/transformers/models/unispeech/modeling_unispeech.py (2 lines):
- line 343: # TODO: we need a refactor so that the different attention modules can get their specific kwargs
- line 1213: >>> # TODO: Add full pretraining example

src/transformers/utils/hub.py (2 lines):
- line 99: # TODO: clean this for v5?
- line 905: ) # TODO: This is only used for testing and should be removed once save_jinja_files becomes the default

src/transformers/models/csm/generation_csm.py (2 lines):
- line 265: # TODO (joao): this OP throws "skipping cudagraphs due to ['incompatible ops']", find solution
- line 472: # TODO: @eustlb, this should be batched !!!

src/transformers/models/diffllama/modular_diffllama.py (2 lines):
- line 173: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 227: # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache

src/transformers/generation/tf_logits_process.py (2 lines):
- line 370: # TODO (Joao): this function might trigger XLA retracing as `cur_len` increases. Fix it if it becomes
- line 427: # TODO (joao): enable XLA on this logits processor. See discussion and attempts in

src/transformers/models/cohere/tokenization_cohere_fast.py (2 lines):
- line 150: # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly
- line 499: # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers

src/transformers/models/nllb_moe/modeling_nllb_moe.py (2 lines):
- line 549: # TODO: we need a refactor so that the different attention modules can get their specific kwargs
- line 853: # TODO: If anyone is up to it to make sure tests pass etc

src/transformers/onnx/config.py (2 lines):
- line 519: # TODO: should we set seq_length = 1 when self.use_past = True?
- line 705: # TODO: test this.

src/transformers/models/musicgen/modeling_musicgen.py (2 lines):
- line 228: # TODO: we need a refactor so that the different attention modules can get their specific kwargs
- line 341: # TODO: change to new cache class

src/transformers/models/maskformer/image_processing_maskformer.py (2 lines):
- line 270: # TODO: (Amy) Move to image_transforms
- line 664: # TODO: (Amy)

src/transformers/models/deprecated/nat/modeling_nat.py (2 lines):
- line 216: # TODO: Support arbitrary patch sizes.
- line 931: # TODO can we simplify this?

src/transformers/models/gpt_neo/modeling_gpt_neo.py (2 lines):
- line 278: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
- line 496: _supports_static_cache = False # TODO: needs a HybridCache

src/transformers/tokenization_utils.py (2 lines):
- line 545: # TODO this is fairly slow to improve!
- line 1103: # TODO @ArthurZ in version 5, special tokens should be handled in convert_tokens_to_string, while _convert_tokens_to_string

src/transformers/models/granite_speech/modeling_granite_speech.py (2 lines):
- line 166: # TODO (@avihu111) find a fast alternative to einsum
- line 387: # TODO (@alex-jw-brooks) add an example to this docstring once models are released

src/transformers/utils/args_doc.py (2 lines):
- line 1388: elif param_type == "" and False: # TODO: Enforce typing for all parameters
- line 1862: # TODO (Yoni): Add support for Attributes section in docs

src/transformers/configuration_utils.py (2 lines):
- line 267: # Tokenizer arguments TODO: eventually tokenizer and models should share the same config
- line 406: # TODO (joao): this should be an exception if the user has modified the loaded config. See #33886

utils/check_model_tester.py (2 lines):
- line 28: # TODO: deal with TF/Flax too
- line 37: # TODO: deal this better

src/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py (2 lines):
- line 211: # TODO: figure this case out.
- line 310: # TODO: test this.

src/transformers/models/paligemma/convert_paligemma2_weights_to_hf.py (2 lines):
- line 42: # TODO add sequence length variations here
- line 207: # TODO verify correctness of layer norm loading

src/transformers/models/mask2former/image_processing_mask2former.py (2 lines):
- line 264: # TODO: (Amy) Move to image_transforms
- line 661: # TODO: (Amy)

src/transformers/models/siglip2/convert_siglip2_to_hf.py (1 line):
- line 385: # TODO: update with more checkpoints

src/transformers/tf_utils.py (1 line):
- line 70: # TODO: When the issue linked above gets sorted, add a check on TF version here and use the original function if

src/transformers/models/perceiver/tokenization_perceiver.py (1 line):
- line 182: # TODO @ArthurZ refactor this as well....
src/transformers/models/unispeech_sat/modular_unispeech_sat.py (1 line):
- line 396: >>> # TODO: Add full pretraining example

src/transformers/models/oneformer/image_processing_oneformer.py (1 line):
- line 662: # TODO: (Amy)

src/transformers/models/bamba/modular_bamba.py (1 line):
- line 254: # FIXME:

src/transformers/models/fuyu/image_processing_fuyu.py (1 line):
- line 579: # TODO refer to https://github.com/ArthurZucker/transformers/blob/0f0a3fe5ca5697ee58faeb5b53f049af720b5e98/src/transformers/models/vit_mae/modeling_vit_mae.py#L871

src/transformers/models/longformer/modeling_tf_longformer.py (1 line):
- line 1031: # TODO: This code is most likely not very efficient and should be improved

src/transformers/pipelines/__init__.py (1 line):
- line 1057: # TODO: we need to make `NO_IMAGE_PROCESSOR_TASKS` and `NO_FEATURE_EXTRACTOR_TASKS` more robust to avoid such issue.

src/transformers/models/esm/openfold_utils/protein.py (1 line):
- line 86: seq[i] = "X" # FIXME: strings are immutable

src/transformers/models/dinat/modeling_dinat.py (1 line):
- line 171: # TODO: Support arbitrary patch sizes.

src/transformers/trainer_seq2seq.py (1 line):
- line 330: # TODO: remove this hack when the legacy code that initializes generation_config from a model config is

src/transformers/models/unispeech_sat/convert_unispeech_sat_original_pytorch_checkpoint_to_pytorch.py (1 line):
- line 127: # TODO: don't match quantizer.weight_proj

src/transformers/models/pegasus/tokenization_pegasus.py (1 line):
- line 34: # TODO ArthurZ refactor this to only use the added_tokens_encoder

src/transformers/models/unispeech/convert_unispeech_original_pytorch_checkpoint_to_pytorch.py (1 line):
- line 137: # TODO: don't match quantizer.weight_proj

utils/patch_helper.py (1 line):
- line 21: Potential TODO: automatically cherry-picks them.

src/transformers/models/m2m_100/modeling_m2m_100.py (1 line):
- line 263: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/dpt/image_processing_dpt.py (1 line):
- line 610: # TODO: add support for other frameworks

src/transformers/integrations/executorch.py (1 line):
- line 485: # TODO: The default inputs only work for text models. We need to add support for vision/audio models.

src/transformers/models/lightglue/image_processing_lightglue.py (1 line):
- line 72: Converts an image to grayscale format using the NTSC formula. Only support numpy and PIL Image. TODO support torch

src/transformers/models/flava/modeling_flava.py (1 line):
- line 597: # TODO: Check fp32 layer norm possibility

src/transformers/models/granite_speech/processing_granite_speech.py (1 line):
- line 68: # TODO (@alex-jw-brooks); we should add a util to get_num_audio_tokens

src/transformers/models/camembert/modeling_camembert.py (1 line):
- line 294: # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
src/transformers/generation/logits_process.py (1 line):
- line 29: # TODO (joao): We shouldn't need this, but there would be a circular import

src/transformers/models/musicgen_melody/modeling_musicgen_melody.py (1 line):
- line 235: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/bark/generation_configuration_bark.py (1 line):
- line 244: # TODO (joao): nested from_dict

src/transformers/models/camembert/tokenization_camembert.py (1 line):
- line 196: # TODO decode outputs do not match between fast and slow

src/transformers/models/olmo/modeling_olmo.py (1 line):
- line 379: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/patchtst/modeling_patchtst.py (1 line):
- line 111: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/deit/modeling_deit.py (1 line):
- line 533: # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)

src/transformers/models/deprecated/tapex/tokenization_tapex.py (1 line):
- line 1345: # TODO (Qian): is it possible to revert the original cell if it is in the final answer?

src/transformers/data/datasets/language_modeling.py (1 line):
- line 210: # TODO: randomness could apply a random seed, ex. rng = random.Random(random_seed)

src/transformers/models/ijepa/modeling_ijepa.py (1 line):
- line 523: # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)

src/transformers/models/gemma3n/feature_extraction_gemma3n.py (1 line):
- line 266: # TODO: The filtered mask is always exactly 3 elements longer than the mel_spectrogram. Why???

src/transformers/models/gpt_sw3/tokenization_gpt_sw3.py (1 line):
- line 215: # TODO: Check if this is needed, as it ensures that decode(encode(doc)) != doc by adding extra whitespace in the decoded document

src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py (1 line):
- line 45: # TODO (raushan): processor can be removed after v5 release. Kept for backwards compatibility

src/transformers/quantizers/quantizer_awq.py (1 line):
- line 130: model._awq_is_fused = True # TODO: consider storing this flag in model.config instead

src/transformers/models/plbart/modeling_plbart.py (1 line):
- line 394: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/data2vec/modeling_data2vec_audio.py (1 line):
- line 251: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py (1 line):
- line 1415: # TODO (Joao): investigate why Speech2Text has numerical issues in XLA generate

src/transformers/generation/configuration_utils.py (1 line):
- line 533: # TODO joao: find out a way of not depending on external fields (e.g. `assistant_model`), then make this a

src/transformers/models/data2vec/convert_data2vec_audio_original_pytorch_checkpoint_to_pytorch.py (1 line):
- line 132: # TODO: don't match quantizer.weight_proj

src/transformers/models/mbart/modeling_mbart.py (1 line):
- line 208: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/wav2vec2/convert_wav2vec2_original_pytorch_checkpoint_to_pytorch.py (1 line):
- line 192: # TODO: don't match quantizer.weight_proj

src/transformers/pipelines/image_to_text.py (1 line):
- line 221: # FIXME: We need to pop here due to a difference in how `generation.py` and `generation.tf_utils.py`

src/transformers/models/arcee/modeling_arcee.py (1 line):
- line 401: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py (1 line):
- line 629: # TODO (joao): the `TFBaseModelOutput` wrapper should not be needed after the generate refactor is complete

src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py (1 line):
- line 260: # TODO check if the t5/llama PR also applies here

src/transformers/models/aria/modeling_aria.py (1 line):
- line 784: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/quantizers/quantizer_bnb_8bit.py (1 line):
- line 287: # TODO: consider bringing replace_with_bnb_linear() code from ..integrations/bitsandbyter.py to here

src/transformers/models/csm/processing_csm.py (1 line):
- line 169: # TODO: @eustlb, this should be in AudioProcessor

src/transformers/onnx/convert.py (1 line):
- line 142: # TODO: Check when exporting QA we provide "is_pair=True"

src/transformers/models/llama/tokenization_llama_fast.py (1 line):
- line 237: # TODO ArthurZ let's rely on the template processor instead, refactor all fast tokenizers

src/transformers/models/cohere/modeling_cohere.py (1 line):
- line 435: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py (1 line):
- line 208: # TODO: Convert dataset to Parquet

src/transformers/models/pixtral/modeling_pixtral.py (1 line):
- line 83: # TODO maybe make it torch compatible later on. We can also just slice

src/transformers/models/beit/image_processing_beit.py (1 line):
- line 489: # TODO: add support for other frameworks

src/transformers/models/hgnet_v2/modular_hgnet_v2.py (1 line):
- line 38: # TODO: Modular conversion for resnet must be fixed as

src/transformers/quantizers/quantizer_hqq.py (1 line):
- line 205: # TODO: This is a compatibility hack. HQQ-quantized linear layers do not have a `weight` attribute,

src/transformers/models/gemma3n/modular_gemma3n.py (1 line):
- line 1977: # TODO (raushan): Fix this after RoPE refactor. For now we hack it by

src/transformers/models/bert/modeling_bert.py (1 line):
- line 343: # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.

src/transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py (1 line):
- line 388: # TODO: Should we use the pre-trained projection as well ?
src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py (1 line):
- line 4058: # TODO: raushan, defaults should be saved in generation config

src/transformers/models/d_fine/configuration_d_fine.py (1 line):
- line 30: # TODO: Attribute map assignment logic should be fixed in modular

src/transformers/models/bloom/tokenization_bloom_fast.py (1 line):
- line 113: # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly

src/transformers/image_utils.py (1 line):
- line 938: # TODO raise a warning here instead of simply logging?

src/transformers/models/superpoint/image_processing_superpoint.py (1 line):
- line 66: Converts an image to grayscale format using the NTSC formula. Only support numpy and PIL Image. TODO support torch

src/transformers/models/blenderbot/modeling_blenderbot.py (1 line):
- line 198: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/dots1/configuration_dots1.py (1 line):
- line 109: base_model_tp_plan = { # TODO: only replicate attention layers when > first_k_dense_replace

src/transformers/models/blip_2/modeling_blip_2.py (1 line):
- line 2225: # TODO (joao, raushan): refactor `generate` to avoid these operations with VLMs

src/transformers/models/gemma3n/convert_gemma3n_weights.py (1 line):
- line 491: # TODO: ryanmullins - support multimodal norms and projections

src/transformers/models/segformer/image_processing_segformer.py (1 line):
- line 456: # TODO: add support for other frameworks

src/transformers/models/esm/modeling_esmfold.py (1 line):
- line 1979: # TODO Add information to the docstring about any methods that convert to PDB format, or otherwise prepare

src/transformers/models/deepseek_v3/configuration_deepseek_v3.py (1 line):
- line 135: base_model_tp_plan = { # TODO: only replicate attention layers when > first_k_dense_replace

src/transformers/models/zoedepth/image_processing_zoedepth.py (1 line):
- line 233: # TODO support align_corners=True in image_transforms.resize

src/transformers/models/bloom/configuration_bloom.py (1 line):
- line 157: # TODO: how to do that better?

src/transformers/models/videomae/modeling_videomae.py (1 line):
- line 87: # TODO: make it with torch instead of numpy

src/transformers/models/pegasus/modeling_pegasus.py (1 line):
- line 197: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

utils/check_bad_commit.py (1 line):
- line 185: # TODO: make this script able to deal with both `single-gpu` and `multi-gpu` via a new argument.

src/transformers/loss/loss_for_object_detection.py (1 line):
- line 197: # TODO use valid to mask invalid areas due to padding in loss

src/transformers/models/t5gemma/modeling_t5gemma.py (1 line):
- line 574: # TODO: support intialization for encoders and decoders separately(?)

src/transformers/models/fuyu/modeling_fuyu.py (1 line):
- line 343: # don't pass kwargs because Persimmon-backbone doesn't accept FA2 kwargs yet, TODO: raushan

src/transformers/models/mobilevit/image_processing_mobilevit.py (1 line):
- line 459: # TODO: add support for other frameworks

src/transformers/models/dots1/modeling_dots1.py (1 line):
- line 507: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py (1 line):
- line 621: # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)
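The grayscale entries for the SuperPoint and LightGlue image processors above (and the identical SuperGlue entry further down) refer to the NTSC / ITU-R BT.601 luma weights 0.299, 0.587 and 0.114. A hedged sketch of what the requested torch support could look like (the function name and the channel-last assumption are hypothetical, not taken from the repository):

```python
import torch

def rgb_to_grayscale_ntsc(image: torch.Tensor) -> torch.Tensor:
    """Convert a channel-last RGB tensor of shape (..., H, W, 3) to grayscale (..., H, W).

    Uses the NTSC luma formula Y = 0.299 R + 0.587 G + 0.114 B mentioned in the
    image-processor docstrings; illustrative helper only, not the library's code.
    """
    weights = torch.tensor([0.299, 0.587, 0.114], dtype=image.dtype, device=image.device)
    return (image * weights).sum(dim=-1)  # weighted sum over the RGB channel axis

# Example with a random 480x640 RGB image
gray = rgb_to_grayscale_ntsc(torch.rand(480, 640, 3))
```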
src/transformers/models/swin2sr/convert_swin2sr_original_to_pytorch.py (1 line):
- line 211: # TODO values didn't match exactly here

src/transformers/models/owlvit/image_processing_owlvit.py (1 line):
- line 464: # TODO: (amy) add support for other frameworks

src/transformers/models/grounding_dino/processing_grounding_dino.py (1 line):
- line 319: # TODO: @pavel, set labels to None since v4.51.0 or find a way to extract ids

src/transformers/pipelines/automatic_speech_recognition.py (1 line):
- line 88: # TODO Use a faster algorithm this can probably be done in O(n)

templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py (1 line):
- line 889: # TODO Get the proper metric function

src/transformers/models/wav2vec2_conformer/convert_wav2vec2_conformer_original_pytorch_checkpoint_to_pytorch.py (1 line):
- line 154: # TODO: don't match quantizer.weight_proj

benchmark/llama.py (1 line):
- line 191: # TODO use decode_one_token(model, input_id.clone(), cache_position) for verification

src/transformers/models/phimoe/modeling_phimoe.py (1 line):
- line 443: # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.

src/transformers/generation/continuous_batching.py (1 line):
- line 710: # TODO don't apply on prefill splitting

src/transformers/models/informer/modeling_informer.py (1 line):
- line 453: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/roberta/modeling_roberta.py (1 line):
- line 293: # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.

src/transformers/models/bloom/modeling_bloom.py (1 line):
- line 62: TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.

src/transformers/models/table_transformer/modeling_table_transformer.py (1 line):
- line 361: # TODO find a better way of exposing other arguments

src/transformers/image_transforms.py (1 line):
- line 780: # TODO (Amy): Accept 1/3/4 channel numpy array as input and return np.array as default

src/transformers/models/yolos/image_processing_yolos.py (1 line):
- line 1439: # POSTPROCESSING METHODS - TODO: add support for other frameworks

src/transformers/models/falcon_h1/modular_falcon_h1.py (1 line):
- line 335: # FIXME:

src/transformers/models/marian/modeling_marian.py (1 line):
- line 198: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/falcon_h1/modeling_falcon_h1.py (1 line):
- line 536: # FIXME:

src/transformers/models/xlm/tokenization_xlm.py (1 line):
- line 415: # TODO: make sure we are using `FacebookAI/xlm-mlm-enro-1024`, since XLM-100 doesn't have this step

src/transformers/quantizers/quantizer_eetq.py (1 line):
- line 62: # TODO: Update message once eetq releases a fix

src/transformers/models/olmo2/modeling_olmo2.py (1 line):
- line 385: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/bridgetower/configuration_bridgetower.py (1 line):
- line 279: # TODO: remove this once the Hub files are updated.
src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py (1 line):
- line 1262: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/conditional_detr/image_processing_conditional_detr.py (1 line):
- line 1530: # POSTPROCESSING METHODS - TODO: add support for other frameworks

src/transformers/models/esm/configuration_esm.py (1 line):
- line 26: # TODO Update this

src/transformers/models/bart/modeling_bart.py (1 line):
- line 199: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/owlvit/image_processing_owlvit_fast.py (1 line):
- line 71: # TODO: (amy) add support for other frameworks

src/transformers/models/bitnet/modeling_bitnet.py (1 line):
- line 398: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/mluke/tokenization_mluke.py (1 line):
- line 346: # TODO check if the t5/llama PR also applies here

src/transformers/models/groupvit/modeling_tf_groupvit.py (1 line):
- line 2127: # TODO: As is this currently fails with saved_model=True, because

src/transformers/models/gptj/modeling_gptj.py (1 line):
- line 274: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.

src/transformers/models/sam/modeling_tf_sam.py (1 line):
- line 693: # TODO Matt: What is going on here? Why is a non-trainable weight randomly initialized?

src/transformers/integrations/flex_attention.py (1 line):
- line 108: # TODO: deprecate / rename to make_flex_block_mask for clarity as it's not only causal anymore

src/transformers/models/fsmt/modeling_fsmt.py (1 line):
- line 98: # TODO:

src/transformers/models/patchtsmixer/modeling_patchtsmixer.py (1 line):
- line 314: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/utils/attention_visualizer.py (1 line):
- line 200: if "token_type_ids" in inputs: # TODO inspect signature of update causal mask

src/transformers/models/chameleon/modeling_chameleon.py (1 line):
- line 133: self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: this may break with compilation

src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py (1 line):
- line 58: # TODO: add support for other frameworks

src/transformers/models/blenderbot_small/modeling_blenderbot_small.py (1 line):
- line 182: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/superglue/image_processing_superglue.py (1 line):
- line 75: Converts an image to grayscale format using the NTSC formula. Only support numpy and PIL Image. TODO support torch

src/transformers/models/wav2vec2/modeling_wav2vec2.py (1 line):
- line 535: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/dpt/image_processing_dpt_fast.py (1 line):
- line 338: # TODO: add support for other frameworks

src/transformers/models/wav2vec2/modeling_tf_wav2vec2.py (1 line):
- line 461: # TODO Matt: Assigning to attributes in call() is deeply sinful in TensorFlow, as it should be idempotent.

src/transformers/models/instructblip/convert_instructblip_original_to_pytorch.py (1 line):
- line 263: # TODO add this in the generate method?
src/transformers/models/maskformer/convert_maskformer_resnet_to_pytorch.py (1 line):
- line 38: # TODO add support for ResNet-C backbone, which uses a "deeplab" stem

src/transformers/models/mllama/processing_mllama.py (1 line):
- line 261: TODO: add aspect_ratio_ids and aspect_ratio_mask and cross_attention_mask

src/transformers/models/whisper/modeling_whisper.py (1 line):
- line 296: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/byt5/tokenization_byt5.py (1 line):
- line 97: additional_special_tokens=additional_special_tokens, # TODO extra ids are not used :sweatywmile:

src/transformers/models/qwen2/modular_qwen2.py (1 line):
- line 142: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/pipelines/text_generation.py (1 line):
- line 335: "max_length": max_length, # TODO: name clash -- this is broken, `max_length` is also a `generate` arg

src/transformers/models/barthez/tokenization_barthez.py (1 line):
- line 35: # TODO this class is useless. This is the most standard sentencpiece model. Let's find which one is closest and nuke this.

utils/modular_model_converter.py (1 line):
- line 1106: # TODO we only use single assign might cause issues

src/transformers/models/codegen/configuration_codegen.py (1 line):
- line 160: # TODO: how to do that better?

utils/check_docstrings.py (1 line):
- line 1435: # TODO (Yoni): The functions in check_auto_docstrings rely on direct code parsing, which is prone to

src/transformers/models/nougat/tokenization_nougat_fast.py (1 line):
- line 537: # TODO Come up with footnote formatting inside a table

src/transformers/models/glm/modeling_glm.py (1 line):
- line 415: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/esm/modeling_esm.py (1 line):
- line 415: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.

src/transformers/models/qwen2/modeling_qwen2.py (1 line):
- line 381: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/beit/image_processing_beit_fast.py (1 line):
- line 255: # TODO: add support for other frameworks

src/transformers/models/whisper/modeling_tf_whisper.py (1 line):
- line 1672: # TODO: Implement `WhisperTimeStampLogitsProcessor`.

src/transformers/models/rt_detr/modeling_rt_detr.py (1 line):
- line 46: # TODO: Replace all occurrences of the checkpoint with the final one

src/transformers/models/bamba/convert_mamba_ssm_checkpoint.py (1 line):
- line 221: # FIXME: allow other parameters to pass in

src/transformers/models/glm4/modeling_glm4.py (1 line):
- line 423: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/image_processing_utils_fast.py (1 line):
- line 286: # TODO: remove this once the bug is fixed (detected with torch==2.7.0+git1fee196, torchvision==0.22.0+9eb57cd)

src/transformers/pipelines/mask_generation.py (1 line):
- line 215: # TODO: Identifying the model by the type of its returned embeddings is brittle.

src/transformers/utils/quantization_config.py (1 line):
- line 1977: # TODO: Remove this check once configuration version is handled natively by Quark.
src/transformers/models/mistral/modeling_mistral.py (1 line):
- line 376: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/helium/modeling_helium.py (1 line):
- line 400: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/gemma3/modeling_gemma3.py (1 line):
- line 471: # TODO: raushan fix this after RoPE refactor. For now we hack it by reassigning thetas

src/transformers/models/instructblipvideo/convert_instructblipvideo_original_to_pytorch.py (1 line):
- line 265: # TODO add this in the generate method?

src/transformers/models/smolvlm/image_processing_smolvlm.py (1 line):
- line 227: # FIXME Amy: make a more general crop function that isn't just centre crop

src/transformers/models/idefics3/image_processing_idefics3.py (1 line):
- line 230: # FIXME Amy: make a more general crop function that isn't just centre crop

src/transformers/models/qwen2_vl/configuration_qwen2_vl.py (1 line):
- line 250: # TODO: @raushan update config in the hub

src/transformers/models/whisper/tokenization_whisper.py (1 line):
- line 1013: # TODO Handle when language is different from the previous

src/transformers/models/dpt/convert_dpt_beit_to_hf.py (1 line):
- line 246: # TODO there's still a small difference with the original logits

src/transformers/models/qwen3/modeling_qwen3.py (1 line):
- line 407: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/model_debugging_utils.py (1 line):
- line 247: # summary-only version for readability - traversing the tree again #TODO optimize?

src/transformers/models/csm/modular_csm.py (1 line):
- line 587: # TODO: @eustlb, this should be batched !!!

src/transformers/models/gpt2/configuration_gpt2.py (1 line):
- line 203: # TODO: how to do that better?

src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py (1 line):
- line 328: # TODO: add support for other frameworks

src/transformers/models/phi3/modeling_phi3.py (1 line):
- line 431: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/t5gemma/modular_t5gemma.py (1 line):
- line 547: # TODO: support intialization for encoders and decoders separately(?)

src/transformers/models/deepseek_v3/modular_deepseek_v3.py (1 line):
- line 42: TODO let's just use the original freqcis computation to not have the view

src/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py (1 line):
- line 158: # TODO ArthurZ fairseq_ids_to_tokens should be removed

src/transformers/models/clip/modeling_tf_clip.py (1 line):
- line 1446: # TODO: As is this currently fails with saved_model=True, because

src/transformers/models/gemma3/modular_gemma3.py (1 line):
- line 549: # TODO: raushan fix this after RoPE refactor. For now we hack it by reassigning thetas

src/transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py (1 line):
- line 3752: # TODO: raushan, defaults should be saved in generation config

src/transformers/models/fuyu/convert_fuyu_model_weights_to_hf.py (1 line):
- line 37: Sample usage: # TODO fix clone links from persimmon to fuyu

src/transformers/models/gptj/configuration_gptj.py (1 line):
- line 149: # TODO: how to do that better?

src/transformers/models/distilbert/modeling_distilbert.py (1 line):
- line 246: # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
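The recurring "remove this exception in v4.56 -- it exists for users that try to pass a legacy cache" entries (Mistral, Helium, Qwen3, Phi-3 and many of the other decoder models listed here) are about the old tuple-of-tuples `past_key_values` format. A small sketch of the conversion those code paths perform, assuming a transformers version that exposes `DynamicCache.from_legacy_cache` (shapes are arbitrary):

```python
import torch
from transformers.cache_utils import DynamicCache

# Legacy cache: one (key, value) pair per layer, each shaped
# [batch_size, num_heads, seq_len, head_dim].
legacy_past = tuple(
    (torch.zeros(1, 8, 4, 64), torch.zeros(1, 8, 4, 64)) for _ in range(2)
)

# Convert to the `Cache` object the modeling code expects ...
cache = DynamicCache.from_legacy_cache(legacy_past)
print(cache.get_seq_length())  # 4

# ... and back, for callers that still consume the tuple format.
legacy_again = cache.to_legacy_cache()
```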
src/transformers/pipelines/text_classification.py (1 line):
- line 164: # TODO try and retrieve it in a nicer way from _sanitize_parameters.

src/transformers/models/deformable_detr/image_processing_deformable_detr.py (1 line):
- line 1528: # POSTPROCESSING METHODS - TODO: add support for other frameworks

src/transformers/models/idefics2/image_processing_idefics2.py (1 line):
- line 126: # FIXME Amy: merge this function with the one in image_transforms.py

src/transformers/audio_utils.py (1 line):
- line 469: # TODO This method does not support batching yet as we are mainly focused on inference.

src/transformers/models/deberta_v2/tokenization_deberta_v2.py (1 line):
- line 327: # TODO add a deprecation cycle as this can have different behaviour from our API

src/transformers/models/pegasus_x/modeling_pegasus_x.py (1 line):
- line 218: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/sew_d/modeling_sew_d.py (1 line):
- line 572: # TODO: We should check if the opset_version being used to export

src/transformers/commands/add_new_model_like.py (1 line):
- line 696: # TODO: Find some kind of fallback if there is no _CHECKPOINT_FOR_DOC in any of the modeling file.

src/transformers/models/wavlm/convert_wavlm_original_pytorch_checkpoint_to_pytorch.py (1 line):
- line 126: # TODO: don't match quantizer.weight_proj

src/transformers/models/llama/modeling_llama.py (1 line):
- line 400: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/m2m_100/configuration_m2m_100.py (1 line):
- line 275: # TODO: test this.

src/transformers/models/rt_detr_v2/convert_rt_detr_v2_weights_to_hf.py (1 line):
- line 62: # TODO: check this not working

src/transformers/processing_utils.py (1 line):
- line 1344: # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default)

src/transformers/models/gemma3n/modeling_gemma3n.py (1 line):
- line 1534: # TODO (raushan): Fix this after RoPE refactor. For now we hack it by

src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py (1 line):
- line 261: # TODO: @raushan update config in the hub

src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py (1 line):
- line 291: # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.

src/transformers/modeling_flash_attention_utils.py (1 line):
- line 480: # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1.

src/transformers/models/rwkv/modeling_rwkv.py (1 line):
- line 270: # TODO: maybe jit, otherwise move inside forward

src/transformers/models/biogpt/modeling_biogpt.py (1 line):
- line 176: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/hubert/modeling_hubert.py (1 line):
- line 311: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

utils/process_bad_commit_report.py (1 line):
- line 35: # TODO: extend

src/transformers/models/grounding_dino/modeling_grounding_dino.py (1 line):
- line 1488: # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36

src/transformers/models/xlm_roberta/modeling_xlm_roberta.py (1 line):
- line 294: # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented.
src/transformers/models/unispeech/modular_unispeech.py (1 line):
- line 377: >>> # TODO: Add full pretraining example

src/transformers/models/janus/convert_janus_weights_to_hf.py (1 line):
- line 442: # TODO: warning about weights not being tied is raised here regardless of model.tie_weights() above

src/transformers/models/d_fine/modular_d_fine.py (1 line):
- line 49: # TODO: Attribute map assignment logic should be fixed in modular

utils/process_test_artifacts.py (1 line):
- line 26: MAX_PARALLEL_NODES = 8 # TODO create a mapping!

src/transformers/models/bamba/modeling_bamba.py (1 line):
- line 455: # FIXME:

src/transformers/models/hubert/modeling_tf_hubert.py (1 line):
- line 431: # TODO Matt: Assigning to attributes in call() is deeply sinful in TensorFlow, as it should be idempotent.

src/transformers/models/granite_speech/feature_extraction_granite_speech.py (1 line):
- line 81: # TODO (@alex-jw-brooks): Currently input_features_mask is not

src/transformers/models/hgnet_v2/configuration_hgnet_v2.py (1 line):
- line 27: # TODO: Modular conversion for resnet must be fixed as

src/transformers/models/canine/modeling_canine.py (1 line):
- line 382: # TODO add support for MLM

src/transformers/models/smollm3/modeling_smollm3.py (1 line):
- line 411: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/vit/modeling_vit.py (1 line):
- line 540: # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)

src/transformers/models/sew/modeling_sew.py (1 line):
- line 304: # TODO: we need a refactor so that the different attention modules can get their specific kwargs

src/transformers/models/emu3/modeling_emu3.py (1 line):
- line 1242: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/models/mistral/modular_mistral.py (1 line):
- line 137: # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache

src/transformers/quantizers/quantizer_bnb_4bit.py (1 line):
- line 324: # TODO: consider bringing replace_with_bnb_linear() code from ..integrations/bitsandbyter.py to here

src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py (1 line):
- line 382: # FIXME:

src/transformers/sagemaker/training_args_sm.py (1 line):
- line 29: # TODO: should be moved to `utils` after refactoring of SageMakerTrainer

src/transformers/tokenization_utils_base.py (1 line):
- line 866: # TODO clean this up at some point (probably by switching to fast tokenizers)

src/transformers/modeling_gguf_pytorch_utils.py (1 line):
- line 382: # FIXME: Currently this implementation is only for flan-t5 architecture.

src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py (1 line):
- line 669: # TODO (joao): the `TFBaseModelOutput` wrapper should not be needed after the generate refactor is complete