server/custom_kernels/custom_kernels/fused_attention_cuda.cu (8 lines):
- line 67: // TODO @thomasw21 with more memory we can probably compute a much faster `max-reduce` in parallel O(ln(n)) operations in each memory slot
- line 87: // TODO @thomasw21 with more memory we can probably compute a much faster `sum-reduce` in parallel O(ln(n)) operations in each memory slot
- line 158: // TODO @thomasw21: it's easier to think of attention_scores as 2D tensors
- line 169: // TODO @thomas21: change by to this as it's cleaner when pytorch 1.13 comes out
- line 189: // TODO @thomasw21 figure out everything warp related:
- line 191: // TODO @thomas21 check why everyone is setting 1024 when officially it's 2048
- line 193: // TODO @thomasw21 figure out how to have longer sequences, currently the maximum is `max_kv_length = MAX_THREADS_PER_SM * MIN_KV_LENGTH_SHARD_SIZE_PER_THREAD`
- line 229: // TODO @thomasw21 Figure out how to get minimum value

server/text_generation_server/layers/attention/flash_attn_triton.py (8 lines):
- line 52: # TODO: use tl.randint for better performance
- line 128: # TODO: This can be optimized to only be true for the padded block.
- line 284: # TODO: This config fails with head_size not pow2 with data mismatches.
- line 428: # TODO: Should dropout and return encoded softmax be handled here?
- line 493: # TODO: Fix encoded softmax. It currently uses just h_q in the base offset.
- line 664: # TODO: Do the boundary check optionally.
- line 693: # TODO: Change assert if we support qkl f8 and v f16
- line 695: # TODO: Fix assert to check head size <=256 once supported

server/custom_kernels/custom_kernels/fused_bloom_attention_cuda.cu (8 lines):
- line 67: // TODO @thomasw21 with more memory we can probably compute a much faster `max-reduce` in parallel O(ln(n)) operations in each memory slot
- line 87: // TODO @thomasw21 with more memory we can probably compute a much faster `sum-reduce` in parallel O(ln(n)) operations in each memory slot
- line 158: // TODO @thomasw21: it's easier to think of attention_scores as 2D tensors
- line 169: // TODO @thomas21: change by to this as it's cleaner when pytorch 1.13 comes out
- line 189: // TODO @thomasw21 figure out everything warp related:
- line 191: // TODO @thomas21 check why everyone is setting 1024 when officially it's 2048
- line 193: // TODO @thomasw21 figure out how to have longer sequences, currently the maximum is `max_kv_length = MAX_THREADS_PER_SM * MIN_KV_LENGTH_SHARD_SIZE_PER_THREAD`
- line 229: // TODO @thomasw21 Figure out how to get minimum value

server/text_generation_server/models/__init__.py (8 lines):
- line 632: # TODO: fix how we determine model type for Mamba
- line 1151: # TODO: once implemented in transformers, use the config class
- line 1189: # TODO: once implemented in transformers, use the config class
- line 1525: # TODO: Fix bug in rust image_text_replacement implementation
- line 1528: # TODO: Uncomment when transformers is refactored
- line 1558: # TODO: Fix bug in rust image_text_replacement implementation
- line 1561: # TODO: Uncomment when transformers is refactored
- line 1591: # TODO: Uncomment when transformers is refactored and cross attn is added

launcher/src/main.rs (5 lines):
- line 312: // TODO handle quantization
- line 321: // TODO handle quantization
- line 322: // TODO This calculation depends on the actual implementation
- line 347: // TODO handle quantization
- line 2116: // TODO: remove this when we correctly esimate the flops for VLMs
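The repeated TODO at lines 67 and 87 of `fused_attention_cuda.cu` (and its Bloom twin above) alludes to replacing a serial per-slot scan with a logarithmic tree reduction in shared memory. A minimal sketch of that general pattern, assuming a power-of-two block size and illustrative names (`block_max_reduce`, `BLOCK_SIZE`), rather than the kernel actually used in this repo:

```cuda
#include <cuda_runtime.h>
#include <cfloat>

// Hypothetical block size for illustration; must be a power of two here.
constexpr int BLOCK_SIZE = 256;

// Tree reduction: each of BLOCK_SIZE threads stages one value in shared memory,
// then the maximum is folded in O(log2(BLOCK_SIZE)) steps by halving the number
// of active threads each iteration.
__global__ void block_max_reduce(const float* __restrict__ in,
                                 float* __restrict__ out,
                                 int n) {
    __shared__ float smem[BLOCK_SIZE];
    const int tid = threadIdx.x;
    const int idx = blockIdx.x * BLOCK_SIZE + tid;

    // Load one element per thread, padding out-of-range lanes with -FLT_MAX.
    smem[tid] = (idx < n) ? in[idx] : -FLT_MAX;
    __syncthreads();

    // Stride halves every step, so the loop runs log2(BLOCK_SIZE) times.
    for (int stride = BLOCK_SIZE / 2; stride > 0; stride >>= 1) {
        if (tid < stride) {
            smem[tid] = fmaxf(smem[tid], smem[tid + stride]);
        }
        __syncthreads();
    }

    // Thread 0 writes the per-block maximum.
    if (tid == 0) {
        out[blockIdx.x] = smem[0];
    }
}
```

Swapping `fmaxf` for addition turns the same logarithmic loop into the `sum-reduce` mentioned at line 87; in both cases the extra shared memory is what buys the O(ln(n)) step count over a serial scan.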
backends/gaudi/server/text_generation_server/layers/attention/kv_cache.py (4 lines):
- line 51: ## TODO FP8 kv cache support
- line 94: ## TODO FP8 kv cache support
- line 126: ## TODO FP8 kv cache support
- line 162: ## TODO FP8 kv cache support

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama4_modeling.py (4 lines):
- line 615: ) # flash does not support chunked attn TODO support flash
- line 894: ) # TODO: check if we need to apply activation again
- line 949: # TODO there is a different RoPE for vision encoder, defined as below
- line 1119: freqs_ci: torch.Tensor, # TODO move this to an attribute instead of keeping it around

backends/gaudi/server/text_generation_server/models/flash_causal_lm.py (3 lines):
- line 1411: # TODO Huge hack
- line 2036: # TODO not support adapter now, need the add in the future
- line 2084: # TODO speculative decoding handling missing

backends/gaudi/server/text_generation_server/models/flash_vlm_causal_lm.py (3 lines):
- line 130: # TODO: implement this in a more general way
- line 169: # TODO: get correct number of features via reviewing the Gemma3 architecture
- line 218: # TODO: check if this is needed

server/text_generation_server/models/flash_causal_lm.py (3 lines):
- line 307: # FIXME: speculate is not supported for context chunking at the moment
- line 1244: # TODO Huge hack
- line 1980: # FIXME: use true number of accepted tokens instead of 1

server/text_generation_server/models/vlm_causal_lm.py (3 lines):
- line 121: # TODO: implement this in a more general way
- line 160: # TODO: get correct number of features via reviewing the Gemma3 architecture
- line 209: # TODO: check if this is needed

server/text_generation_server/utils/quantization.py (3 lines):
- line 14: # TODO: Split this config to have a single config type per quant method
- line 139: # TODO: improve check once we have one config type per quantize value
- line 195: # TODO: improve check once we have one config type per quantize value

backends/gaudi/server/text_generation_server/layers/fp8.py (3 lines):
- line 238: # TODO: don't do this when we have to use the Torch kernel.
- line 353: # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
- line 415: # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet

router/src/validation.rs (3 lines):
- line 339: // TODO: we should build the FSM here and pass the compiled FSM instead of the grammar
- line 571: // TODO Remove this clone
- line 766: // TODO: prefer using the config to determine the number of features

backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_vl.py (2 lines):
- line 239: # TODO: replace with static positional embeddings once implemented
- line 287: # TODO: revisit to see if we can avoid some of these reshapes

server/text_generation_server/models/idefics_causal_lm.py (2 lines):
- line 136: # TODO Check impact on idefics
- line 161: # TODO Check impact on idefics

backends/trtllm/src/looper.rs (2 lines):
- line 130: // TODO : Expose actual real starting time for a request on FFI layer
- line 295: // TODO: Is it really needed? How can it be validated before?

backends/gaudi/server/text_generation_server/cli.py (2 lines):
- line 86: # TODO: enable lora with cuda graphs. for now disable cuda graphs if lora is enabled
- line 171: # TODO: maybe reverse the default value of merge_lora?
backends/v3/src/radix.rs (2 lines):
- line 368: // TODO: add some bookkeeping in the future to check whether we can
- line 429: // TODO: in the future we may want to check that the blocks match for

backends/gaudi/server/text_generation_server/models/__init__.py (2 lines):
- line 829: # TODO: Fix bug in rust image_text_replacement implementation
- line 846: # TODO: Fix bug in rust image_text_replacement implementation

backends/gaudi/server/text_generation_server/models/custom_modeling/bloom_modeling.py (2 lines):
- line 105: TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.
- line 295: ### TODO @thomasw21: this takes quite a bit of time, how do I accelerate that?

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py (2 lines):
- line 286: # TODO: This is a hotfix to be removed & properly refactored.
- line 464: # TODO dirty hack for idefics2.

server/text_generation_server/layers/fp8.py (2 lines):
- line 155: # TODO: don't do this when we have to use the Torch kernel.
- line 290: # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet

server/text_generation_server/models/custom_modeling/qwen2_vl.py (2 lines):
- line 274: # TODO: replace with static positional embeddings once implemented
- line 322: # TODO: revisit to see if we can avoid some of these reshapes

backends/gaudi/server/text_generation_server/utils/quantization.py (2 lines):
- line 12: # TODO: Split this config to have a single config type per quant method
- line 136: # TODO: improve check once we have one config type per quantize value

server/text_generation_server/models/custom_modeling/idefics_image_processing.py (2 lines):
- line 177: # TODO: this converts to torch tensors - switch to convert_to_tensors once it becomes available
- line 293: # TODO 4.32

server/text_generation_server/models/custom_modeling/bloom_modeling.py (2 lines):
- line 105: TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly.
- line 295: ### TODO @thomasw21: this takes quite a bit of time, how do I accelerate that?

backends/gaudi/server/text_generation_server/layers/compressed_tensors/w8an_fp.py (2 lines):
- line 131: # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet
- line 178: # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet

backends/llamacpp/src/backend.rs (2 lines):
- line 87: // TODO: macro
- line 512: // TODO remove this

server/text_generation_server/cli.py (2 lines):
- line 94: # TODO: enable lora with cuda graphs. for now disable cuda graphs if lora is enabled
- line 174: # TODO: maybe reverse the default value of merge_lora?

server/text_generation_server/models/custom_modeling/flash_llama_modeling.py (2 lines):
- line 374: # TODO: This is a hotfix to be removed & properly refactored.
- line 387: != 16384 # TODO: Temporary workaround for `LLMM_Silu` kernel not working with LLama3.1 405B; needs refactoring once fixed.
backends/gaudi/server/text_generation_server/models/custom_modeling/qwen2_5_vl.py (2 lines):
- line 375: # TODO: @raushan update config in the hub
- line 681: # TODO: revisit to see if we can avoid some of these reshapes

backends/gaudi/server/text_generation_server/models/mllama_causal_lm.py (2 lines):
- line 132: # TODO unsure about BOS
- line 605: # TODO list

server/text_generation_server/models/custom_modeling/qwen2_5_vl.py (2 lines):
- line 369: # TODO: @raushan update config in the hub
- line 714: # TODO: revisit to see if we can avoid some of these reshapes

server/text_generation_server/models/custom_modeling/flash_mistral_modeling.py (2 lines):
- line 304: # TODO: This is a hotfix to be removed & properly refactored.
- line 487: # TODO dirty hack for idefics2.

backends/gaudi/server/text_generation_server/utils/tokens.py (2 lines):
- line 206: # TODO Hack because eos_token_id cannot be what we want.
- line 262: # TODO: enable watermark with FP8 quantization

backends/gaudi/server/text_generation_server/layers/gptq/quantize.py (1 line):
- line 779: # TODO: perform packing on GPU

backends/v3/src/backend.rs (1 line):
- line 192: // TODO: temporarily disable to avoid incorrect deallocation +

server/text_generation_server/models/custom_modeling/mpt_modeling.py (1 line):
- line 797: # TODO: reimplement mixed device initialization

server/text_generation_server/layers/attention/cuda.py (1 line):
- line 92: # TODO fixme when flash contains the fix.

server/text_generation_server/models/model.py (1 line):
- line 50: # TODO report this to transformers.

router/src/server.rs (1 line):
- line 181: created: 0, // TODO: determine how to get this

server/text_generation_server/layers/compressed_tensors/w8an_fp.py (1 line):
- line 119: # FIXME: Force to_device to false as fp8 weights do not support torch.cat on device yet

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py (1 line):
- line 382: # TODO: This is a hotfix to be removed & properly refactored.

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py (1 line):
- line 449: # TODO: This is a hotfix to be removed & properly refactored.

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py (1 line):
- line 109: # TODO This is odd but apparently pali gemma position ids start at 1.

server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py (1 line):
- line 130: # TODO: remove this hack to support local sliding window

server/text_generation_server/models/custom_modeling/flash_pali_gemma_modeling.py (1 line):
- line 112: # TODO This is odd but apparently pali gemma position ids start at 1.

server/text_generation_server/utils/tokens.py (1 line):
- line 199: # TODO Hack because eos_token_id cannot be what we want.

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_gemma3_modeling.py (1 line):
- line 138: # TODO: remove this hack to support local sliding window

server/text_generation_server/models/custom_modeling/clip.py (1 line):
- line 31: # TODO Should we TP this ?

server/text_generation_server/models/transformers_flash_vlm.py (1 line):
- line 146: # TODO: implement

backends/gaudi/server/text_generation_server/models/custom_modeling/flash_llama_modeling.py (1 line):
- line 360: # TODO: This is a hotfix to be removed & properly refactored.
server/text_generation_server/utils/logits_process.py (1 line):
- line 524: # TODO: move grammar compilation into the router

backends/gaudi/server/text_generation_server/server.py (1 line):
- line 59: # TODO: The inferecemode set messes up the autograd op dispatch. And results in aten::matmul

server/text_generation_server/models/mllama_causal_lm.py (1 line):
- line 110: # TODO unsure about BOS

server/text_generation_server/layers/lora.py (1 line):
- line 104: # TODO: error with [-1, 0], but not [0, -1]

server/text_generation_server/layers/gptq/quantize.py (1 line):
- line 779: # TODO: perform packing on GPU

server/exllama_kernels/exllama_kernels/cuda_func/q4_matrix.cu (1 line):
- line 108: cudaMalloc(&cuda_x_map, height * sizeof(uint32_t)); // TODO: Should probably be allocated in PyTorch

backends/gaudi/server/text_generation_server/models/model.py (1 line):
- line 43: # TODO report this to transformers.

server/text_generation_server/models/custom_modeling/vlm.py (1 line):
- line 60: # TODO: ensure that using the prefix doesn't break any existing models

server/text_generation_server/models/mamba.py (1 line):
- line 481: # TODO: implement warmup for Mamba if needed

server/text_generation_server/models/custom_modeling/flash_deepseek_v3_modeling.py (1 line):
- line 394: # TODO: This is a hotfix to be removed & properly refactored.

server/text_generation_server/models/custom_modeling/flash_deepseek_v2_modeling.py (1 line):
- line 394: # TODO: This is a hotfix to be removed & properly refactored.

backends/gaudi/server/text_generation_server/models/custom_modeling/clip.py (1 line):
- line 31: # TODO Should we TP this ?

server/text_generation_server/models/metadata_kernels.py (1 line):
- line 21: # FIXME: it seems that has_triton_torch is bugged on RocM

server/exllama_kernels/exllama_kernels/matrix.cuh (1 line):
- line 83: // TODO: Rewrite all these dot product functions using functors or something, move to q4_matmul.cu

server/text_generation_server/models/custom_modeling/gemma3/image_processing_gemma3.py (1 line):
- line 145: Pan and Scan and image, whatever it means. TODO: write-up docs

backends/gaudi/server/text_generation_server/layers/gptq/hpu.py (1 line):
- line 86: # TODO: Support group indexing and remove the check

backends/gaudi/server/text_generation_server/models/custom_modeling/vlm.py (1 line):
- line 59: # TODO: ensure that using the prefix doesn't break any existing models

backends/gaudi/server/text_generation_server/utils/logits_process.py (1 line):
- line 518: # TODO: move grammar compilation into the router

backends/gaudi/server/text_generation_server/layers/lora.py (1 line):
- line 98: # TODO: error with [-1, 0], but not [0, -1]
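The TODO at line 108 of `q4_matrix.cu` above suggests letting PyTorch own the `cuda_x_map` buffer instead of calling `cudaMalloc` directly. A hedged sketch of that idea using the libtorch C++ API (the `X_Map` wrapper and its field names are invented for illustration and are not code from this repo):

```cuda
// Sketch only: allocate the device buffer through PyTorch's caching allocator
// and keep the tensor alive for as long as the raw pointer is used.
#include <torch/torch.h>
#include <cstdint>

struct X_Map {
    torch::Tensor storage;    // owns the device memory via PyTorch's allocator
    uint32_t* ptr = nullptr;  // raw pointer handed to the CUDA kernels

    explicit X_Map(int64_t height) {
        // One 32-bit slot per row; stored as int32 and reinterpreted as uint32_t
        // on the kernel side, since the payload is just raw indices.
        storage = torch::empty(
            {height},
            torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA));
        ptr = reinterpret_cast<uint32_t*>(storage.data_ptr<int32_t>());
    }
    // No destructor needed: dropping `storage` returns the memory to the caching
    // allocator, unlike the cudaMalloc path, which requires an explicit cudaFree.
};
```

Routing the allocation through a tensor this way means the memory is freed automatically with the object and is visible to PyTorch's own accounting (e.g. `torch.cuda.memory_allocated()`), which is presumably what the comment is after.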