Path Lines of Code megatron_patch/__init__.py 1 megatron_patch/arguments.py 449 megatron_patch/data/__init__.py 88 megatron_patch/data/dataset_helpers.py 381 megatron_patch/data/energon/chatml.py 46 megatron_patch/data/image_processing.py 67 megatron_patch/data/json_sft.py 106 megatron_patch/data/utils.py 318 megatron_patch/finetune_utils.py 202 megatron_patch/generation/api.py 170 megatron_patch/generation/generation.py 317 megatron_patch/generation/gpt_predictor.py 74 megatron_patch/generation/tokenization.py 76 megatron_patch/initialize.py 91 megatron_patch/lm_evaluate.py 139 megatron_patch/model/__init__.py 1 megatron_patch/model/baichuan/__init__.py 1 megatron_patch/model/baichuan/gpt_model.py 106 megatron_patch/model/baichuan/language_model.py 515 megatron_patch/model/baichuan/transformer.py 1179 megatron_patch/model/baichuan2/gpt_model.py 97 megatron_patch/model/baichuan2/language_model.py 450 megatron_patch/model/baichuan2/layers.py 134 megatron_patch/model/baichuan2/transformer.py 1292 megatron_patch/model/bloom/__init__.py 1 megatron_patch/model/bloom/gpt_model.py 81 megatron_patch/model/bloom/language_model.py 411 megatron_patch/model/bloom/layers.py 87 megatron_patch/model/bloom/positional_embeddings.py 122 megatron_patch/model/bloom/transformer.py 811 megatron_patch/model/chatglm/__init__.py 1 megatron_patch/model/chatglm/gpt_model.py 82 megatron_patch/model/chatglm/language_model.py 473 megatron_patch/model/chatglm/positional_embeddings.py 60 megatron_patch/model/chatglm/transformer.py 604 megatron_patch/model/deepseek_v2/__init__.py 1 megatron_patch/model/deepseek_v2/layer_specs.py 120 megatron_patch/model/deepseek_v2/mlp.py 196 megatron_patch/model/deepseek_v2/model.py 181 megatron_patch/model/deepseek_v2/moe/experts.py 676 megatron_patch/model/deepseek_v2/moe/moe_layer.py 118 megatron_patch/model/deepseek_v2/moe/shared_experts.py 180 megatron_patch/model/deepseek_v2/multi_latent_attention.py 276 megatron_patch/model/deepseek_v2/transformer_block.py 377 megatron_patch/model/deepseek_v2/transformer_config.py 42 megatron_patch/model/deepseek_v2/transformer_layer.py 226 megatron_patch/model/falcon/__init__.py 1 megatron_patch/model/falcon/gpt_model.py 94 megatron_patch/model/falcon/language_model.py 491 megatron_patch/model/falcon/transformer.py 845 megatron_patch/model/falcon40b/__init__.py 1 megatron_patch/model/falcon40b/gpt_model.py 94 megatron_patch/model/falcon40b/language_model.py 491 megatron_patch/model/falcon40b/transformer.py 683 megatron_patch/model/galactica/__init__.py 1 megatron_patch/model/galactica/gpt_model.py 94 megatron_patch/model/galactica/language_model.py 501 megatron_patch/model/galactica/transformer.py 570 megatron_patch/model/glm130b/__init__.py 1 megatron_patch/model/glm130b/gpt_model.py 80 megatron_patch/model/glm130b/language_model.py 434 megatron_patch/model/glm130b/transformer.py 875 megatron_patch/model/llama/__init__.py 1 megatron_patch/model/llama/gpt_model.py 92 megatron_patch/model/llama/language_model.py 501 megatron_patch/model/llama/positional_embeddings.py 54 megatron_patch/model/llama/transformer.py 715 megatron_patch/model/llama2/__init__.py 1 megatron_patch/model/llama2/gpt_model.py 88 megatron_patch/model/llama2/language_model.py 454 megatron_patch/model/llama2/rotary_pos_embedding.py 56 megatron_patch/model/llama2/transformer.py 1296 megatron_patch/model/llama3/__init__.py 1 megatron_patch/model/llama3/gpt_model.py 88 megatron_patch/model/llama3/language_model.py 438 megatron_patch/model/llama3/layer_specs.py 85 megatron_patch/model/llama3/model.py 144 megatron_patch/model/llama3/rms_norm.py 13 megatron_patch/model/llama3/transformer/attention.py 402 megatron_patch/model/llama3/transformer/mlp.py 150 megatron_patch/model/llama3/transformer_config.py 8 megatron_patch/model/llama3/transformer_legacy.py 1252 megatron_patch/model/llama3_1/__init__.py 1 megatron_patch/model/llama3_1/layer_specs.py 85 megatron_patch/model/llama3_1/model.py 201 megatron_patch/model/llama3_1/rms_norm.py 13 megatron_patch/model/llama3_1/transformer_config.py 9 megatron_patch/model/llava/__init__.py 1 megatron_patch/model/llava/clip_encoder.py 75 megatron_patch/model/llava/gpt_model.py 89 megatron_patch/model/llava/language_model.py 507 megatron_patch/model/llava/mm_projector_builder.py 37 megatron_patch/model/llava/rotary_pos_embedding.py 54 megatron_patch/model/llava/transformer.py 1292 megatron_patch/model/llava_mcore/__init__.py 1 megatron_patch/model/llava_mcore/layer_specs.py 99 megatron_patch/model/llava_mcore/llava_model.py 424 megatron_patch/model/llava_mcore/llava_spec.py 75 megatron_patch/model/llava_mcore/transformer_config.py 133 megatron_patch/model/llava_mcore/vision/__init__.py 1 megatron_patch/model/llava_mcore/vision/clip_vit_model.py 130 megatron_patch/model/llava_mcore/vision/multimodal_projector.py 41 megatron_patch/model/llava_mcore/vision/vit_layer_specs.py 79 megatron_patch/model/mistral/__init__.py 1 megatron_patch/model/mistral/gpt_model.py 88 megatron_patch/model/mistral/language_model.py 466 megatron_patch/model/mistral/modeling_attn_mask_utils.py 121 megatron_patch/model/mistral/rotary_pos_embedding.py 36 megatron_patch/model/mistral/transformer.py 1292 megatron_patch/model/mixtral/__init__.py 1 megatron_patch/model/mixtral/layer_specs.py 129 megatron_patch/model/mixtral/model.py 177 megatron_patch/model/mixtral/moe/__init__.py 1 megatron_patch/model/mixtral/moe/experts.py 676 megatron_patch/model/mixtral/moe/moe_layer.py 113 megatron_patch/model/mixtral/moe/router.py 171 megatron_patch/model/mixtral/moe/token_dispatcher.py 303 megatron_patch/model/mixtral/transformer/attention.py 517 megatron_patch/model/mixtral/transformer/mlp.py 193 megatron_patch/model/mixtral/transformer_config.py 285 megatron_patch/model/mixtral_bak/__init__.py 1 megatron_patch/model/mixtral_bak/layer_specs.py 86 megatron_patch/model/mixtral_bak/model.py 162 megatron_patch/model/mixtral_bak/moe/__init__.py 1 megatron_patch/model/mixtral_bak/moe/experts.py 136 megatron_patch/model/mixtral_bak/moe/grouped_gemm_util.py 12 megatron_patch/model/mixtral_bak/moe/moe_layer.py 57 megatron_patch/model/mixtral_bak/moe/moe_utils.py 39 megatron_patch/model/mixtral_bak/moe/router.py 113 megatron_patch/model/mixtral_bak/moe/token_dispatcher.py 172 megatron_patch/model/mixtral_bak/transformer/attention.py 322 megatron_patch/model/mixtral_bak/transformer/mlp.py 131 megatron_patch/model/mixtral_bak/transformer_config.py 142 megatron_patch/model/qwen/__init__.py 1 megatron_patch/model/qwen/gpt_model.py 88 megatron_patch/model/qwen/language_model.py 440 megatron_patch/model/qwen/transformer.py 1243 megatron_patch/model/qwen1_5/__init__.py 1 megatron_patch/model/qwen1_5/layer_specs.py 92 megatron_patch/model/qwen1_5/model.py 144 megatron_patch/model/qwen1_5/moe/__init__.py 1 megatron_patch/model/qwen1_5/moe/experts.py 188 megatron_patch/model/qwen1_5/moe/moe_layer.py 78 megatron_patch/model/qwen1_5/moe/router.py 139 megatron_patch/model/qwen1_5/moe/token_dispatcher.py 282 megatron_patch/model/qwen1_5/transformer/attention.py 402 megatron_patch/model/qwen1_5/transformer/mlp.py 164 megatron_patch/model/qwen1_5/transformer_config.py 7 megatron_patch/model/qwen1_5_megablocks/__init__.py 1 megatron_patch/model/qwen1_5_megablocks/gpt_model.py 88 megatron_patch/model/qwen1_5_megablocks/language_model.py 453 megatron_patch/model/qwen1_5_megablocks/rotary_pos_embedding.py 56 megatron_patch/model/qwen1_5_megablocks/transformer.py 1184 megatron_patch/model/qwen2/layer_specs.py 108 megatron_patch/model/qwen2/model.py 146 megatron_patch/model/qwen2/moe/__init__.py 1 megatron_patch/model/qwen2/moe/experts.py 316 megatron_patch/model/qwen2/moe/moe_layer.py 114 megatron_patch/model/qwen2/moe/router.py 206 megatron_patch/model/qwen2/moe/token_dispatcher.py 327 megatron_patch/model/qwen2/rms_norm.py 13 megatron_patch/model/qwen2/transformer/attention.py 407 megatron_patch/model/qwen2/transformer/mlp.py 258 megatron_patch/model/qwen2/transformer_block.py 323 megatron_patch/model/qwen2/transformer_config.py 14 megatron_patch/model/qwen2/transformer_layer.py 147 megatron_patch/model/qwen2_5_vl/model.py 191 megatron_patch/model/qwen2_5_vl/transformer_block.py 434 megatron_patch/model/qwen2_5_vl/transformer_config.py 55 megatron_patch/model/qwen2_5_vl/visionmodel.py 224 megatron_patch/model/qwen2_moe/__init__.py 1 megatron_patch/model/qwen2_moe/layer_specs.py 281 megatron_patch/model/qwen2_moe/transformer_config.py 55 megatron_patch/model/qwen2_vl/attention.py 530 megatron_patch/model/qwen2_vl/attention_vision.py 529 megatron_patch/model/qwen2_vl/gpt_model.py 143 megatron_patch/model/qwen2_vl/language_model_embedding.py 98 megatron_patch/model/qwen2_vl/language_module.py 104 megatron_patch/model/qwen2_vl/layer_specs.py 95 megatron_patch/model/qwen2_vl/model.py 191 megatron_patch/model/qwen2_vl/rope_utils.py 108 megatron_patch/model/qwen2_vl/rotary_pos_embedding.py 145 megatron_patch/model/qwen2_vl/transformer_config.py 53 megatron_patch/model/qwen2_vl/visionmodel.py 160 megatron_patch/model/qwen3_moe/gpt_layer_specs.py 347 megatron_patch/model/qwen3_moe/moe/moe_layer.py 70 megatron_patch/model/qwen3_moe/moe/moe_utils.py 79 megatron_patch/model/qwen3_moe/moe/router.py 111 megatron_patch/model/qwen3_moe/moe_module_specs.py 61 megatron_patch/model/qwen_vl/__init__.py 1 megatron_patch/model/qwen_vl/gpt_model.py 89 megatron_patch/model/qwen_vl/language_model.py 481 megatron_patch/model/qwen_vl/transformer.py 1292 megatron_patch/model/qwen_vl/visual.py 296 megatron_patch/model/starcoder/__init__.py 1 megatron_patch/model/starcoder/enums.py 19 megatron_patch/model/starcoder/glu_activations.py 32 megatron_patch/model/starcoder/gpt_model.py 83 megatron_patch/model/starcoder/language_model.py 387 megatron_patch/model/starcoder/transformer.py 848 megatron_patch/template/helper.py 115 megatron_patch/tensor_parallel.py 66 megatron_patch/tokenizer/icetk_glm130b_tokenizer.py 273 megatron_patch/tokenizer/jiebabpe_tokenizer.py 53 megatron_patch/tokenizer/tokenization_baichuan.py 139 megatron_patch/tokenizer/tokenization_qwen_vl.py 441 megatron_patch/tokenizer/tokenization_yi.py 166 megatron_patch/training.py 612