megatron_patch/model/deepseek_v2/transformer_layer.py (5 lines):
- line 108: # TODO: add pytorch only layernorm
- line 162: # TODO how to set the gpt_layer_spec.py when we have moe_frequency > 1,
- line 317: # TODO: could we move `bias_dropout_add_exec_handler` itself
- line 341: # TODO: could we move `bias_dropout_add_exec_handler` itself
- line 357: # TODO: could we move `bias_dropout_add_exec_handler` itself

megatron_patch/model/llama3/transformer_legacy.py (5 lines):
- line 231: # TODO pre allocate memory
- line 247: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 263: # TODO (rprenger) TODO this could be made easier to read
- line 1143: # TODO: better redesign with inference param
- line 1161: # TODO: better redesign with inference param

megatron_patch/model/qwen2/transformer_layer.py (5 lines):
- line 85: # TODO: add pytorch only layernorm
- line 126: # TODO how to set the gpt_layer_spec.py when we have moe_frequency > 1,
- line 196: # TODO: could we move `bias_dropout_add_exec_handler` itself
- line 220: # TODO: could we move `bias_dropout_add_exec_handler` itself
- line 236: # TODO: could we move `bias_dropout_add_exec_handler` itself

megatron_patch/model/starcoder/transformer.py (4 lines):
- line 167: # TODO (rprenger) TODO this could be made easier to read
- line 176: #TODO (rprenger) This does each expert in serial, but it could be parallelized
- line 390: # TODO: ideally, alibi would have the shape: (1, num_heads * sq, sk)
- line 554: # TODO: Find a way to merge the query and key-value computations?

megatron_patch/model/llava/transformer.py (3 lines):
- line 224: # TODO pre allocate memory
- line 240: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 256: # TODO (rprenger) TODO this could be made easier to read

megatron_patch/model/qwen_vl/transformer.py (3 lines):
- line 224: # TODO pre allocate memory
- line 240: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 256: # TODO (rprenger) TODO this could be made easier to read

megatron_patch/lm_evaluate.py (3 lines):
- line 119: # TODO: Implement caching once we've confirmed the perplexity implementation
- line 120: # TODO: automatic batch size detection for vectorization
- line 134: # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that

megatron_patch/model/llama2/transformer.py (3 lines):
- line 224: # TODO pre allocate memory
- line 240: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 256: # TODO (rprenger) TODO this could be made easier to read

megatron_patch/model/qwen/transformer.py (3 lines):
- line 218: # TODO pre allocate memory
- line 234: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 250: # TODO (rprenger) TODO this could be made easier to read

megatron_patch/model/baichuan2/transformer.py (3 lines):
- line 219: # TODO pre allocate memory
- line 235: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 251: # TODO (rprenger) TODO this could be made easier to read

megatron_patch/model/mistral/transformer.py (3 lines):
- line 224: # TODO pre allocate memory
- line 240: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 256: # TODO (rprenger) TODO this could be made easier to read

megatron_patch/model/baichuan/transformer.py (2 lines):
- line 188: # TODO (rprenger) TODO this could be made easier to read
- line 197: #TODO (rprenger) This does each expert in serial, but it could be parallelized

megatron_patch/model/falcon/transformer.py (2 lines):
- line 393: # TODO: ideally, alibi would have the shape: (1, num_heads * sq, sk)
- line 604: # TODO: Find a way to merge the query and key-value computations?

megatron_patch/model/llava/language_model.py (2 lines):
- line 346: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.
- line 493: # FIXME: this is a hacky fix, for deepspeed zero3 to work

megatron_patch/model/mixtral/transformer_config.py (2 lines):
- line 173: bias_dropout_fusion: bool = False # TODO: this should be bias_dropout_add_fusion?
- line 288: moe_token_dropping: bool = False # TODO: Support token dropping.

megatron_patch/model/llava_mcore/llava_model.py (2 lines):
- line 293: TODO: This function should adjust the attention mask too.
- line 572: # TODO: Support batched inference.

megatron_patch/data/dataset_helpers.py (2 lines):
- line 182: # TODO: modify get_visual_transform to add more augmentations
- line 341: # TODO: add args

megatron_patch/data/json_sft.py (1 line):
- line 157: # TODO: update get_batch_on_this_tp_rank_original and replace the following line with

megatron_patch/model/galactica/language_model.py (1 line):
- line 374: # TODO: passing share_word_embeddings=False

megatron_patch/model/mixtral/model.py (1 line):
- line 80: # TODO: remove this dependency ?

megatron_patch/model/llama2/language_model.py (1 line):
- line 343: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

toolkits/auto_configurator/report_theoretical_memory.py (1 line):
- line 113: # TODO: This function needs to take into account query_projection_size potentially being

megatron_patch/model/qwen/language_model.py (1 line):
- line 343: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

megatron_patch/model/mixtral_bak/moe/token_dispatcher.py (1 line):
- line 115: # TODO pre allocate memory

megatron_patch/model/qwen2/transformer_block.py (1 line):
- line 175: # # TODO: add back standalone_embedding_stage

megatron_patch/model/qwen_vl/language_model.py (1 line):
- line 344: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

megatron_patch/training.py (1 line):
- line 589: # TODO: Remove this once we move DDP to Core.

toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen2_moe.py (1 line):
- line 84: # TODO: support other orders

megatron_patch/model/falcon/language_model.py (1 line):
- line 343: # TODO: passing share_word_embeddings=False

megatron_patch/model/llama3_1/model.py (1 line):
- line 97: # TODO: remove this dependency ?

megatron_patch/model/qwen1_5/model.py (1 line):
- line 82: # TODO: remove this dependency ?

rlhf/deepspeed-chat/utils.py (1 line):
- line 230: # TODO: use prefiltering for efficiency

megatron_patch/model/glm130b/language_model.py (1 line):
- line 358: # TODO: passing share_word_embeddings=False will not work correctly

megatron_patch/model/qwen3_moe/moe/moe_utils.py (1 line):
- line 93: # TODO Try using element-wise operations instead of scatter?

megatron_patch/initialize.py (1 line):
- line 65: # TODO is this still a necessary option?

megatron_patch/model/deepseek_v2/model.py (1 line):
- line 92: # TODO: remove this dependency ?

megatron_patch/model/llama3/model.py (1 line):
- line 82: # TODO: remove this dependency ?

megatron_patch/model/qwen2_5_vl/transformer_block.py (1 line):
- line 285: # @TODO: add back account_for_embedding_in_pipeline_split (see issue #293)

megatron_patch/model/qwen1_5/moe/token_dispatcher.py (1 line):
- line 407: # TODO Optimize EP=1 case

megatron_patch/model/qwen2_vl/visionmodel.py (1 line):
- line 106: # TODO: Follow-up changes will make pre and post_process configurable. They are needed for supporting pipeline parallelism.

megatron_patch/model/qwen2_5_vl/visionmodel.py (1 line):
- line 111: # TODO: Follow-up changes will make pre and post_process configurable. They are needed for supporting pipeline parallelism.

megatron_patch/model/mistral/language_model.py (1 line):
- line 344: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

megatron_patch/model/deepseek_v2/moe/experts.py (1 line):
- line 248: # TODO: write a generic implementation to cover both cases with and without GLU

megatron_patch/model/llama/language_model.py (1 line):
- line 343: # TODO: passing share_word_embeddings=False

megatron_patch/model/falcon40b/language_model.py (1 line):
- line 343: # TODO: passing share_word_embeddings=False

megatron_patch/model/llava_mcore/vision/clip_vit_model.py (1 line):
- line 125: # TODO: Make pre_process and post_process configurable.

toolkits/model_checkpoints_convertor/deepseek/hf2mcore_deepseek_v3_moe.py (1 line):
- line 76: # TODO: support other orders

megatron_patch/model/baichuan2/language_model.py (1 line):
- line 353: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

megatron_patch/model/qwen2/model.py (1 line):
- line 83: # TODO: remove this dependency ?

megatron_patch/model/llama3/language_model.py (1 line):
- line 338: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

megatron_patch/model/mixtral_bak/model.py (1 line):
- line 79: # TODO: remove this dependency ?

megatron_patch/template/helper.py (1 line):
- line 42: # TODO: this is pretty hacky, find a better way

toolkits/model_checkpoints_convertor/glm/checkpoint_reshaping_and_interoperability.py (1 line):
- line 464: #TODO: fit megatron

megatron_patch/model/mixtral/moe/experts.py (1 line):
- line 249: # TODO: write a generic implementation to cover both cases with and without GLU

megatron_patch/model/deepseek_v2/transformer_block.py (1 line):
- line 246: # @TODO: add back standalone_embedding_stage (see issue #293)

megatron_patch/model/qwen2/moe/router.py (1 line):
- line 62: # TODO: Add Pre softmax.

megatron_patch/model/baichuan/language_model.py (1 line):
- line 399: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

megatron_patch/model/qwen1_5_megablocks/language_model.py (1 line):
- line 342: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.
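The inventory above reads like the output of a simple marker scan over the repository. As a minimal sketch only (the tool actually used to generate this report is not named here, and the `SKIP_DIRS` set and exact output formatting are assumptions), a standalone script along these lines reproduces the same grouping and count-descending ordering from a local checkout:

```python
#!/usr/bin/env python3
"""Scan a repository for TODO/FIXME comments and print them grouped by file."""
import re
from collections import defaultdict
from pathlib import Path

# Directories to skip while walking the tree (hypothetical defaults).
SKIP_DIRS = {".git", "__pycache__", "build", "dist"}
MARKER = re.compile(r"\b(?:TODO|FIXME)\b")


def collect_todos(root: str) -> dict:
    """Return {relative_path: [(line_number, stripped_line), ...]}."""
    todos = defaultdict(list)
    root_path = Path(root)
    for path in root_path.rglob("*.py"):
        if any(part in SKIP_DIRS for part in path.parts):
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        for lineno, line in enumerate(text.splitlines(), start=1):
            if MARKER.search(line):
                todos[str(path.relative_to(root_path))].append((lineno, line.strip()))
    return todos


def print_report(todos: dict) -> None:
    # Files with the most hits first, then alphabetical for a stable order.
    for path, hits in sorted(todos.items(), key=lambda kv: (-len(kv[1]), kv[0])):
        plural = "line" if len(hits) == 1 else "lines"
        print(f"{path} ({len(hits)} {plural}):")
        for lineno, text in hits:
            print(f"- line {lineno}: {text}")
        print()


if __name__ == "__main__":
    print_report(collect_todos("."))
```

Run from the repository root (`python todo_report.py`, a hypothetical file name), it prints one block per file in the same "file (N lines): / - line N: comment" layout used above.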