Path	Lines of Code
megatron_patch/__init__.py	1
megatron_patch/arguments.py	449
megatron_patch/data/__init__.py	88
megatron_patch/data/dataset_helpers.py	381
megatron_patch/data/energon/chatml.py	46
megatron_patch/data/image_processing.py	67
megatron_patch/data/json_sft.py	106
megatron_patch/data/utils.py	318
megatron_patch/finetune_utils.py	202
megatron_patch/generation/api.py	170
megatron_patch/generation/generation.py	317
megatron_patch/generation/gpt_predictor.py	74
megatron_patch/generation/tokenization.py	76
megatron_patch/initialize.py	91
megatron_patch/lm_evaluate.py	139
megatron_patch/model/__init__.py	1
megatron_patch/model/baichuan/__init__.py	1
megatron_patch/model/baichuan/gpt_model.py	106
megatron_patch/model/baichuan/language_model.py	515
megatron_patch/model/baichuan/transformer.py	1179
megatron_patch/model/baichuan2/gpt_model.py	97
megatron_patch/model/baichuan2/language_model.py	450
megatron_patch/model/baichuan2/layers.py	134
megatron_patch/model/baichuan2/transformer.py	1292
megatron_patch/model/bloom/__init__.py	1
megatron_patch/model/bloom/gpt_model.py	81
megatron_patch/model/bloom/language_model.py	411
megatron_patch/model/bloom/layers.py	87
megatron_patch/model/bloom/positional_embeddings.py	122
megatron_patch/model/bloom/transformer.py	811
megatron_patch/model/chatglm/__init__.py	1
megatron_patch/model/chatglm/gpt_model.py	82
megatron_patch/model/chatglm/language_model.py	473
megatron_patch/model/chatglm/positional_embeddings.py	60
megatron_patch/model/chatglm/transformer.py	604
megatron_patch/model/deepseek_v2/__init__.py	1
megatron_patch/model/deepseek_v2/layer_specs.py	120
megatron_patch/model/deepseek_v2/mlp.py	196
megatron_patch/model/deepseek_v2/model.py	181
megatron_patch/model/deepseek_v2/moe/experts.py	676
megatron_patch/model/deepseek_v2/moe/moe_layer.py	118
megatron_patch/model/deepseek_v2/moe/shared_experts.py	180
megatron_patch/model/deepseek_v2/multi_latent_attention.py	276
megatron_patch/model/deepseek_v2/transformer_block.py	377
megatron_patch/model/deepseek_v2/transformer_config.py	42
megatron_patch/model/deepseek_v2/transformer_layer.py	226
megatron_patch/model/falcon/__init__.py	1
megatron_patch/model/falcon/gpt_model.py	94
megatron_patch/model/falcon/language_model.py	491
megatron_patch/model/falcon/transformer.py	845
megatron_patch/model/falcon40b/__init__.py	1
megatron_patch/model/falcon40b/gpt_model.py	94
megatron_patch/model/falcon40b/language_model.py	491
megatron_patch/model/falcon40b/transformer.py	683
megatron_patch/model/galactica/__init__.py	1
megatron_patch/model/galactica/gpt_model.py	94
megatron_patch/model/galactica/language_model.py	501
megatron_patch/model/galactica/transformer.py	570
megatron_patch/model/glm130b/__init__.py	1
megatron_patch/model/glm130b/gpt_model.py	80
megatron_patch/model/glm130b/language_model.py	434
megatron_patch/model/glm130b/transformer.py	875
megatron_patch/model/llama/__init__.py	1
megatron_patch/model/llama/gpt_model.py	92
megatron_patch/model/llama/language_model.py	501
megatron_patch/model/llama/positional_embeddings.py	54
megatron_patch/model/llama/transformer.py	715
megatron_patch/model/llama2/__init__.py	1
megatron_patch/model/llama2/gpt_model.py	88
megatron_patch/model/llama2/language_model.py	454
megatron_patch/model/llama2/rotary_pos_embedding.py	56
megatron_patch/model/llama2/transformer.py	1296
megatron_patch/model/llama3/__init__.py	1
megatron_patch/model/llama3/gpt_model.py	88
megatron_patch/model/llama3/language_model.py	438
megatron_patch/model/llama3/layer_specs.py	85
megatron_patch/model/llama3/model.py	144
megatron_patch/model/llama3/rms_norm.py	13
megatron_patch/model/llama3/transformer/attention.py	402
megatron_patch/model/llama3/transformer/mlp.py	150
megatron_patch/model/llama3/transformer_config.py	8
megatron_patch/model/llama3/transformer_legacy.py	1252
megatron_patch/model/llama3_1/__init__.py	1
megatron_patch/model/llama3_1/layer_specs.py	85
megatron_patch/model/llama3_1/model.py	201
megatron_patch/model/llama3_1/rms_norm.py	13
megatron_patch/model/llama3_1/transformer_config.py	9
megatron_patch/model/llava/__init__.py	1
megatron_patch/model/llava/clip_encoder.py	75
megatron_patch/model/llava/gpt_model.py	89
megatron_patch/model/llava/language_model.py	507
megatron_patch/model/llava/mm_projector_builder.py	37
megatron_patch/model/llava/rotary_pos_embedding.py	54
megatron_patch/model/llava/transformer.py	1292
megatron_patch/model/llava_mcore/__init__.py	1
megatron_patch/model/llava_mcore/layer_specs.py	99
megatron_patch/model/llava_mcore/llava_model.py	424
megatron_patch/model/llava_mcore/llava_spec.py	75
megatron_patch/model/llava_mcore/transformer_config.py	133
megatron_patch/model/llava_mcore/vision/__init__.py	1
megatron_patch/model/llava_mcore/vision/clip_vit_model.py	130
megatron_patch/model/llava_mcore/vision/multimodal_projector.py	41
megatron_patch/model/llava_mcore/vision/vit_layer_specs.py	79
megatron_patch/model/mistral/__init__.py	1
megatron_patch/model/mistral/gpt_model.py	88
megatron_patch/model/mistral/language_model.py	466
megatron_patch/model/mistral/modeling_attn_mask_utils.py	121
megatron_patch/model/mistral/rotary_pos_embedding.py	36
megatron_patch/model/mistral/transformer.py	1292
megatron_patch/model/mixtral/__init__.py	1
megatron_patch/model/mixtral/layer_specs.py	129
megatron_patch/model/mixtral/model.py	177
megatron_patch/model/mixtral/moe/__init__.py	1
megatron_patch/model/mixtral/moe/experts.py	676
megatron_patch/model/mixtral/moe/moe_layer.py	113
megatron_patch/model/mixtral/moe/router.py	171
megatron_patch/model/mixtral/moe/token_dispatcher.py	303
megatron_patch/model/mixtral/transformer/attention.py	517
megatron_patch/model/mixtral/transformer/mlp.py	193
megatron_patch/model/mixtral/transformer_config.py	285
megatron_patch/model/mixtral_bak/__init__.py	1
megatron_patch/model/mixtral_bak/layer_specs.py	86
megatron_patch/model/mixtral_bak/model.py	162
megatron_patch/model/mixtral_bak/moe/__init__.py	1
megatron_patch/model/mixtral_bak/moe/experts.py	136
megatron_patch/model/mixtral_bak/moe/grouped_gemm_util.py	12
megatron_patch/model/mixtral_bak/moe/moe_layer.py	57
megatron_patch/model/mixtral_bak/moe/moe_utils.py	39
megatron_patch/model/mixtral_bak/moe/router.py	113
megatron_patch/model/mixtral_bak/moe/token_dispatcher.py	172
megatron_patch/model/mixtral_bak/transformer/attention.py	322
megatron_patch/model/mixtral_bak/transformer/mlp.py	131
megatron_patch/model/mixtral_bak/transformer_config.py	142
megatron_patch/model/qwen/__init__.py	1
megatron_patch/model/qwen/gpt_model.py	88
megatron_patch/model/qwen/language_model.py	440
megatron_patch/model/qwen/transformer.py	1243
megatron_patch/model/qwen1_5/__init__.py	1
megatron_patch/model/qwen1_5/layer_specs.py	92
megatron_patch/model/qwen1_5/model.py	144
megatron_patch/model/qwen1_5/moe/__init__.py	1
megatron_patch/model/qwen1_5/moe/experts.py	188
megatron_patch/model/qwen1_5/moe/moe_layer.py	78
megatron_patch/model/qwen1_5/moe/router.py	139
megatron_patch/model/qwen1_5/moe/token_dispatcher.py	282
megatron_patch/model/qwen1_5/transformer/attention.py	402
megatron_patch/model/qwen1_5/transformer/mlp.py	164
megatron_patch/model/qwen1_5/transformer_config.py	7
megatron_patch/model/qwen1_5_megablocks/__init__.py	1
megatron_patch/model/qwen1_5_megablocks/gpt_model.py	88
megatron_patch/model/qwen1_5_megablocks/language_model.py	453
megatron_patch/model/qwen1_5_megablocks/rotary_pos_embedding.py	56
megatron_patch/model/qwen1_5_megablocks/transformer.py	1184
megatron_patch/model/qwen2/layer_specs.py	108
megatron_patch/model/qwen2/model.py	146
megatron_patch/model/qwen2/moe/__init__.py	1
megatron_patch/model/qwen2/moe/experts.py	316
megatron_patch/model/qwen2/moe/moe_layer.py	114
megatron_patch/model/qwen2/moe/router.py	206
megatron_patch/model/qwen2/moe/token_dispatcher.py	327
megatron_patch/model/qwen2/rms_norm.py	13
megatron_patch/model/qwen2/transformer/attention.py	407
megatron_patch/model/qwen2/transformer/mlp.py	258
megatron_patch/model/qwen2/transformer_block.py	323
megatron_patch/model/qwen2/transformer_config.py	14
megatron_patch/model/qwen2/transformer_layer.py	147
megatron_patch/model/qwen2_5_vl/model.py	191
megatron_patch/model/qwen2_5_vl/transformer_block.py	434
megatron_patch/model/qwen2_5_vl/transformer_config.py	55
megatron_patch/model/qwen2_5_vl/visionmodel.py	224
megatron_patch/model/qwen2_moe/__init__.py	1
megatron_patch/model/qwen2_moe/layer_specs.py	281
megatron_patch/model/qwen2_moe/transformer_config.py	55
megatron_patch/model/qwen2_vl/attention.py	530
megatron_patch/model/qwen2_vl/attention_vision.py	529
megatron_patch/model/qwen2_vl/gpt_model.py	143
megatron_patch/model/qwen2_vl/language_model_embedding.py	98
megatron_patch/model/qwen2_vl/language_module.py	104
megatron_patch/model/qwen2_vl/layer_specs.py	95
megatron_patch/model/qwen2_vl/model.py	191
megatron_patch/model/qwen2_vl/rope_utils.py	108
megatron_patch/model/qwen2_vl/rotary_pos_embedding.py	145
megatron_patch/model/qwen2_vl/transformer_config.py	53
megatron_patch/model/qwen2_vl/visionmodel.py	160
megatron_patch/model/qwen3_moe/gpt_layer_specs.py	347
megatron_patch/model/qwen3_moe/moe/moe_layer.py	70
megatron_patch/model/qwen3_moe/moe/moe_utils.py	79
megatron_patch/model/qwen3_moe/moe/router.py	111
megatron_patch/model/qwen3_moe/moe_module_specs.py	61
megatron_patch/model/qwen_vl/__init__.py	1
megatron_patch/model/qwen_vl/gpt_model.py	89
megatron_patch/model/qwen_vl/language_model.py	481
megatron_patch/model/qwen_vl/transformer.py	1292
megatron_patch/model/qwen_vl/visual.py	296
megatron_patch/model/starcoder/__init__.py	1
megatron_patch/model/starcoder/enums.py	19
megatron_patch/model/starcoder/glu_activations.py	32
megatron_patch/model/starcoder/gpt_model.py	83
megatron_patch/model/starcoder/language_model.py	387
megatron_patch/model/starcoder/transformer.py	848
megatron_patch/template/helper.py	115
megatron_patch/tensor_parallel.py	66
megatron_patch/tokenizer/icetk_glm130b_tokenizer.py	273
megatron_patch/tokenizer/jiebabpe_tokenizer.py	53
megatron_patch/tokenizer/tokenization_baichuan.py	139
megatron_patch/tokenizer/tokenization_qwen_vl.py	441
megatron_patch/tokenizer/tokenization_yi.py	166
megatron_patch/training.py	612