megatron_patch/model/deepseek_v2/transformer_layer.py (5 lines):
- line 108: # TODO: add pytorch only layernorm
- line 162: # TODO how to set the gpt_layer_spec.py when we have moe_frequency > 1,
- line 317: # TODO: could we move `bias_dropout_add_exec_handler` itself
- line 341: # TODO: could we move `bias_dropout_add_exec_handler` itself
- line 357: # TODO: could we move `bias_dropout_add_exec_handler` itself

megatron_patch/model/llama3/transformer_legacy.py (5 lines):
- line 231: # TODO pre allocate memory
- line 247: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 263: # TODO (rprenger) TODO this could be made easier to read
- line 1143: # TODO: better redesign with inference param
- line 1161: # TODO: better redesign with inference param

megatron_patch/model/qwen2/transformer_layer.py (5 lines):
- line 85: # TODO: add pytorch only layernorm
- line 126: # TODO how to set the gpt_layer_spec.py when we have moe_frequency > 1,
- line 196: # TODO: could we move `bias_dropout_add_exec_handler` itself
- line 220: # TODO: could we move `bias_dropout_add_exec_handler` itself
- line 236: # TODO: could we move `bias_dropout_add_exec_handler` itself

megatron_patch/model/starcoder/transformer.py (4 lines):
- line 167: # TODO (rprenger) TODO this could be made easier to read
- line 176: #TODO (rprenger) This does each expert in serial, but it could be parallelized
- line 390: # TODO: ideally, alibi would have the shape: (1, num_heads * sq, sk)
- line 554: # TODO: Find a way to merge the query and key-value computations?

megatron_patch/model/llava/transformer.py (3 lines):
- line 224: # TODO pre allocate memory
- line 240: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 256: # TODO (rprenger) TODO this could be made easier to read

megatron_patch/model/qwen_vl/transformer.py (3 lines):
- line 224: # TODO pre allocate memory
- line 240: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 256: # TODO (rprenger) TODO this could be made easier to read

megatron_patch/lm_evaluate.py (3 lines):
- line 119: # TODO: Implement caching once we've confirmed the perplexity implementation
- line 120: # TODO: automatic batch size detection for vectorization
- line 134: # TODO: extract out this call so it only gets called once and also somehow figure out partial caching for that

megatron_patch/model/llama2/transformer.py (3 lines):
- line 224: # TODO pre allocate memory
- line 240: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 256: # TODO (rprenger) TODO this could be made easier to read

megatron_patch/model/qwen/transformer.py (3 lines):
- line 218: # TODO pre allocate memory
- line 234: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 250: # TODO (rprenger) TODO this could be made easier to read

megatron_patch/model/baichuan2/transformer.py (3 lines):
- line 219: # TODO pre allocate memory
- line 235: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 251: # TODO (rprenger) TODO this could be made easier to read

megatron_patch/model/mistral/transformer.py (3 lines):
- line 224: # TODO pre allocate memory
- line 240: # TODO (rprenger) Right now we're just using the sinkhorn algorithm
- line 256: # TODO (rprenger) TODO this could be made easier to read

megatron_patch/model/baichuan/transformer.py (2 lines):
- line 188: # TODO (rprenger) TODO this could be made easier to read
- line 197: #TODO (rprenger) This does each expert in serial, but it could be parallelized

megatron_patch/model/falcon/transformer.py (2 lines):
- line 393: # TODO: ideally, alibi would have the shape: (1, num_heads * sq, sk)
- line 604: # TODO: Find a way to merge the query and key-value computations?

megatron_patch/model/llava/language_model.py (2 lines):
- line 346: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.
- line 493: # FIXME: this is a hacky fix, for deepspeed zero3 to work

megatron_patch/model/mixtral/transformer_config.py (2 lines):
- line 173: bias_dropout_fusion: bool = False # TODO: this should be bias_dropout_add_fusion?
- line 288: moe_token_dropping: bool = False # TODO: Support token dropping.

megatron_patch/model/llava_mcore/llava_model.py (2 lines):
- line 293: TODO: This function should adjust the attention mask too.
- line 572: # TODO: Support batched inference.

megatron_patch/data/dataset_helpers.py (2 lines):
- line 182: # TODO: modify get_visual_transform to add more augmentations
- line 341: # TODO: add args

megatron_patch/data/json_sft.py (1 line):
- line 157: # TODO: update get_batch_on_this_tp_rank_original and replace the following line with

megatron_patch/model/galactica/language_model.py (1 line):
- line 374: # TODO: passing share_word_embeddings=False

megatron_patch/model/mixtral/model.py (1 line):
- line 80: # TODO: remove this dependency ?

megatron_patch/model/llama2/language_model.py (1 line):
- line 343: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

toolkits/auto_configurator/report_theoretical_memory.py (1 line):
- line 113: # TODO: This function needs to take into account query_projection_size potentially being

megatron_patch/model/qwen/language_model.py (1 line):
- line 343: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

megatron_patch/model/mixtral_bak/moe/token_dispatcher.py (1 line):
- line 115: # TODO pre allocate memory

megatron_patch/model/qwen2/transformer_block.py (1 line):
- line 175: # # TODO: add back standalone_embedding_stage

megatron_patch/model/qwen_vl/language_model.py (1 line):
- line 344: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

megatron_patch/training.py (1 line):
- line 589: # TODO: Remove this once we move DDP to Core.

toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen2_moe.py (1 line):
- line 84: # TODO: support other orders

megatron_patch/model/falcon/language_model.py (1 line):
- line 343: # TODO: passing share_word_embeddings=False

megatron_patch/model/llama3_1/model.py (1 line):
- line 97: # TODO: remove this dependency ?

megatron_patch/model/qwen1_5/model.py (1 line):
- line 82: # TODO: remove this dependency ?

rlhf/deepspeed-chat/utils.py (1 line):
- line 230: # TODO: use prefiltering for efficiency

megatron_patch/model/glm130b/language_model.py (1 line):
- line 358: # TODO: passing share_word_embeddings=False will not work correctly

megatron_patch/model/qwen3_moe/moe/moe_utils.py (1 line):
- line 93: # TODO Try using element-wise operations instead of scatter?

megatron_patch/initialize.py (1 line):
- line 65: # TODO is this still a necessary option?

megatron_patch/model/deepseek_v2/model.py (1 line):
- line 92: # TODO: remove this dependency ?

megatron_patch/model/llama3/model.py (1 line):
- line 82: # TODO: remove this dependency ?

megatron_patch/model/qwen2_5_vl/transformer_block.py (1 line):
- line 285: # @TODO: add back account_for_embedding_in_pipeline_split (see issue #293)

megatron_patch/model/qwen1_5/moe/token_dispatcher.py (1 line):
- line 407: # TODO Optimize EP=1 case

megatron_patch/model/qwen2_vl/visionmodel.py (1 line):
- line 106: # TODO: Follow-up changes will make pre and post_process configurable. They are needed for supporting pipeline parallelism.

megatron_patch/model/qwen2_5_vl/visionmodel.py (1 line):
- line 111: # TODO: Follow-up changes will make pre and post_process configurable. They are needed for supporting pipeline parallelism.

megatron_patch/model/mistral/language_model.py (1 line):
- line 344: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

megatron_patch/model/deepseek_v2/moe/experts.py (1 line):
- line 248: # TODO: write a generic implementation to cover both cases with and without GLU

megatron_patch/model/llama/language_model.py (1 line):
- line 343: # TODO: passing share_word_embeddings=False

megatron_patch/model/falcon40b/language_model.py (1 line):
- line 343: # TODO: passing share_word_embeddings=False

megatron_patch/model/llava_mcore/vision/clip_vit_model.py (1 line):
- line 125: # TODO: Make pre_process and post_process configurable.

toolkits/model_checkpoints_convertor/deepseek/hf2mcore_deepseek_v3_moe.py (1 line):
- line 76: # TODO: support other orders

megatron_patch/model/baichuan2/language_model.py (1 line):
- line 353: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

megatron_patch/model/qwen2/model.py (1 line):
- line 83: # TODO: remove this dependency ?

megatron_patch/model/llama3/language_model.py (1 line):
- line 338: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

megatron_patch/model/mixtral_bak/model.py (1 line):
- line 79: # TODO: remove this dependency ?

megatron_patch/template/helper.py (1 line):
- line 42: # TODO: this is pretty hacky, find a better way

toolkits/model_checkpoints_convertor/glm/checkpoint_reshaping_and_interoperability.py (1 line):
- line 464: #TODO: fit megatron

megatron_patch/model/mixtral/moe/experts.py (1 line):
- line 249: # TODO: write a generic implementation to cover both cases with and without GLU

megatron_patch/model/deepseek_v2/transformer_block.py (1 line):
- line 246: # @TODO: add back standalone_embedding_stage (see issue #293)

megatron_patch/model/qwen2/moe/router.py (1 line):
- line 62: # TODO: Add Pre softmax.

megatron_patch/model/baichuan/language_model.py (1 line):
- line 399: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.

megatron_patch/model/qwen1_5_megablocks/language_model.py (1 line):
- line 342: # TODO: passing share_embeddings_and_output_weights=False will not work correctly for T5 and embeddings will not be synced. Fix later for T5.
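The inventory above reads like the output of a simple marker scan over the repository. As a minimal sketch only (the tool actually used to generate this report is not named here, and the `SKIP_DIRS` set and exact output formatting are assumptions), a standalone script along these lines reproduces the same grouping and count-descending ordering from a local checkout:

```python
#!/usr/bin/env python3
"""Scan a repository for TODO/FIXME comments and print them grouped by file."""
import re
from collections import defaultdict
from pathlib import Path

# Directories to skip while walking the tree (hypothetical defaults).
SKIP_DIRS = {".git", "__pycache__", "build", "dist"}
MARKER = re.compile(r"\b(?:TODO|FIXME)\b")


def collect_todos(root: str) -> dict:
    """Return {relative_path: [(line_number, stripped_line), ...]}."""
    todos = defaultdict(list)
    root_path = Path(root)
    for path in root_path.rglob("*.py"):
        if any(part in SKIP_DIRS for part in path.parts):
            continue
        try:
            text = path.read_text(encoding="utf-8", errors="replace")
        except OSError:
            continue
        for lineno, line in enumerate(text.splitlines(), start=1):
            if MARKER.search(line):
                todos[str(path.relative_to(root_path))].append((lineno, line.strip()))
    return todos


def print_report(todos: dict) -> None:
    # Files with the most hits first, then alphabetical for a stable order.
    for path, hits in sorted(todos.items(), key=lambda kv: (-len(kv[1]), kv[0])):
        plural = "line" if len(hits) == 1 else "lines"
        print(f"{path} ({len(hits)} {plural}):")
        for lineno, text in hits:
            print(f"- line {lineno}: {text}")
        print()


if __name__ == "__main__":
    print_report(collect_todos("."))
```

Run from the repository root (`python todo_report.py`, a hypothetical file name), it prints one block per file in the same "file (N lines): / - line N: comment" layout used above.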