Path	Lines of Code
megatron_patch/__init__.py	1
megatron_patch/arguments.py	449
megatron_patch/data/__init__.py	88
megatron_patch/data/dataset_helpers.py	381
megatron_patch/data/energon/chatml.py	46
megatron_patch/data/image_processing.py	67
megatron_patch/data/json_sft.py	106
megatron_patch/data/utils.py	318
megatron_patch/finetune_utils.py	202
megatron_patch/generation/api.py	170
megatron_patch/generation/generation.py	317
megatron_patch/generation/gpt_predictor.py	74
megatron_patch/generation/tokenization.py	76
megatron_patch/initialize.py	91
megatron_patch/lm_evaluate.py	139
megatron_patch/model/__init__.py	1
megatron_patch/model/baichuan/__init__.py	1
megatron_patch/model/baichuan/gpt_model.py	106
megatron_patch/model/baichuan/language_model.py	515
megatron_patch/model/baichuan/transformer.py	1179
megatron_patch/model/baichuan2/gpt_model.py	97
megatron_patch/model/baichuan2/language_model.py	450
megatron_patch/model/baichuan2/layers.py	134
megatron_patch/model/baichuan2/transformer.py	1292
megatron_patch/model/bloom/__init__.py	1
megatron_patch/model/bloom/gpt_model.py	81
megatron_patch/model/bloom/language_model.py	411
megatron_patch/model/bloom/layers.py	87
megatron_patch/model/bloom/positional_embeddings.py	122
megatron_patch/model/bloom/transformer.py	811
megatron_patch/model/chatglm/__init__.py	1
megatron_patch/model/chatglm/gpt_model.py	82
megatron_patch/model/chatglm/language_model.py	473
megatron_patch/model/chatglm/positional_embeddings.py	60
megatron_patch/model/chatglm/transformer.py	604
megatron_patch/model/deepseek_v2/__init__.py	1
megatron_patch/model/deepseek_v2/layer_specs.py	120
megatron_patch/model/deepseek_v2/mlp.py	196
megatron_patch/model/deepseek_v2/model.py	181
megatron_patch/model/deepseek_v2/moe/experts.py	676
megatron_patch/model/deepseek_v2/moe/moe_layer.py	118
megatron_patch/model/deepseek_v2/moe/shared_experts.py	180
megatron_patch/model/deepseek_v2/multi_latent_attention.py	276
megatron_patch/model/deepseek_v2/transformer_block.py	377
megatron_patch/model/deepseek_v2/transformer_config.py	42
megatron_patch/model/deepseek_v2/transformer_layer.py	226
megatron_patch/model/falcon/__init__.py	1
megatron_patch/model/falcon/gpt_model.py	94
megatron_patch/model/falcon/language_model.py	491
megatron_patch/model/falcon/transformer.py	845
megatron_patch/model/falcon40b/__init__.py	1
megatron_patch/model/falcon40b/gpt_model.py	94
megatron_patch/model/falcon40b/language_model.py	491
megatron_patch/model/falcon40b/transformer.py	683
megatron_patch/model/galactica/__init__.py	1
megatron_patch/model/galactica/gpt_model.py	94
megatron_patch/model/galactica/language_model.py	501
megatron_patch/model/galactica/transformer.py	570
megatron_patch/model/glm130b/__init__.py	1
megatron_patch/model/glm130b/gpt_model.py	80
megatron_patch/model/glm130b/language_model.py	434
megatron_patch/model/glm130b/transformer.py	875
megatron_patch/model/llama/__init__.py	1
megatron_patch/model/llama/gpt_model.py	92
megatron_patch/model/llama/language_model.py	501
megatron_patch/model/llama/positional_embeddings.py	54
megatron_patch/model/llama/transformer.py	715
megatron_patch/model/llama2/__init__.py	1
megatron_patch/model/llama2/gpt_model.py	88
megatron_patch/model/llama2/language_model.py	454
megatron_patch/model/llama2/rotary_pos_embedding.py	56
megatron_patch/model/llama2/transformer.py	1296
megatron_patch/model/llama3/__init__.py	1
megatron_patch/model/llama3/gpt_model.py	88
megatron_patch/model/llama3/language_model.py	438
megatron_patch/model/llama3/layer_specs.py	85
megatron_patch/model/llama3/model.py	144
megatron_patch/model/llama3/rms_norm.py	13
megatron_patch/model/llama3/transformer/attention.py	402
megatron_patch/model/llama3/transformer/mlp.py	150
megatron_patch/model/llama3/transformer_config.py	8
megatron_patch/model/llama3/transformer_legacy.py	1252
megatron_patch/model/llama3_1/__init__.py	1
megatron_patch/model/llama3_1/layer_specs.py	85
megatron_patch/model/llama3_1/model.py	201
megatron_patch/model/llama3_1/rms_norm.py	13
megatron_patch/model/llama3_1/transformer_config.py	9
megatron_patch/model/llava/__init__.py	1
megatron_patch/model/llava/clip_encoder.py	75
megatron_patch/model/llava/gpt_model.py	89
megatron_patch/model/llava/language_model.py	507
megatron_patch/model/llava/mm_projector_builder.py	37
megatron_patch/model/llava/rotary_pos_embedding.py	54
megatron_patch/model/llava/transformer.py	1292
megatron_patch/model/llava_mcore/__init__.py	1
megatron_patch/model/llava_mcore/layer_specs.py	99
megatron_patch/model/llava_mcore/llava_model.py	424
megatron_patch/model/llava_mcore/llava_spec.py	75
megatron_patch/model/llava_mcore/transformer_config.py	133
megatron_patch/model/llava_mcore/vision/__init__.py	1
megatron_patch/model/llava_mcore/vision/clip_vit_model.py	130
megatron_patch/model/llava_mcore/vision/multimodal_projector.py	41
megatron_patch/model/llava_mcore/vision/vit_layer_specs.py	79
megatron_patch/model/mistral/__init__.py	1
megatron_patch/model/mistral/gpt_model.py	88
megatron_patch/model/mistral/language_model.py	466
megatron_patch/model/mistral/modeling_attn_mask_utils.py	121
megatron_patch/model/mistral/rotary_pos_embedding.py	36
megatron_patch/model/mistral/transformer.py	1292
megatron_patch/model/mixtral/__init__.py	1
megatron_patch/model/mixtral/layer_specs.py	129
megatron_patch/model/mixtral/model.py	177
megatron_patch/model/mixtral/moe/__init__.py	1
megatron_patch/model/mixtral/moe/experts.py	676
megatron_patch/model/mixtral/moe/moe_layer.py	113
megatron_patch/model/mixtral/moe/router.py	171
megatron_patch/model/mixtral/moe/token_dispatcher.py	303
megatron_patch/model/mixtral/transformer/attention.py	517
megatron_patch/model/mixtral/transformer/mlp.py	193
megatron_patch/model/mixtral/transformer_config.py	285
megatron_patch/model/mixtral_bak/__init__.py	1
megatron_patch/model/mixtral_bak/layer_specs.py	86
megatron_patch/model/mixtral_bak/model.py	162
megatron_patch/model/mixtral_bak/moe/__init__.py	1
megatron_patch/model/mixtral_bak/moe/experts.py	136
megatron_patch/model/mixtral_bak/moe/grouped_gemm_util.py	12
megatron_patch/model/mixtral_bak/moe/moe_layer.py	57
megatron_patch/model/mixtral_bak/moe/moe_utils.py	39
megatron_patch/model/mixtral_bak/moe/router.py	113
megatron_patch/model/mixtral_bak/moe/token_dispatcher.py	172
megatron_patch/model/mixtral_bak/transformer/attention.py	322
megatron_patch/model/mixtral_bak/transformer/mlp.py	131
megatron_patch/model/mixtral_bak/transformer_config.py	142
megatron_patch/model/qwen/__init__.py	1
megatron_patch/model/qwen/gpt_model.py	88
megatron_patch/model/qwen/language_model.py	440
megatron_patch/model/qwen/transformer.py	1243
megatron_patch/model/qwen1_5/__init__.py	1
megatron_patch/model/qwen1_5/layer_specs.py	92
megatron_patch/model/qwen1_5/model.py	144
megatron_patch/model/qwen1_5/moe/__init__.py	1
megatron_patch/model/qwen1_5/moe/experts.py	188
megatron_patch/model/qwen1_5/moe/moe_layer.py	78
megatron_patch/model/qwen1_5/moe/router.py	139
megatron_patch/model/qwen1_5/moe/token_dispatcher.py	282
megatron_patch/model/qwen1_5/transformer/attention.py	402
megatron_patch/model/qwen1_5/transformer/mlp.py	164
megatron_patch/model/qwen1_5/transformer_config.py	7
megatron_patch/model/qwen1_5_megablocks/__init__.py	1
megatron_patch/model/qwen1_5_megablocks/gpt_model.py	88
megatron_patch/model/qwen1_5_megablocks/language_model.py	453
megatron_patch/model/qwen1_5_megablocks/rotary_pos_embedding.py	56
megatron_patch/model/qwen1_5_megablocks/transformer.py	1184
megatron_patch/model/qwen2/layer_specs.py	108
megatron_patch/model/qwen2/model.py	146
megatron_patch/model/qwen2/moe/__init__.py	1
megatron_patch/model/qwen2/moe/experts.py	316
megatron_patch/model/qwen2/moe/moe_layer.py	114
megatron_patch/model/qwen2/moe/router.py	206
megatron_patch/model/qwen2/moe/token_dispatcher.py	327
megatron_patch/model/qwen2/rms_norm.py	13
megatron_patch/model/qwen2/transformer/attention.py	407
megatron_patch/model/qwen2/transformer/mlp.py	258
megatron_patch/model/qwen2/transformer_block.py	323
megatron_patch/model/qwen2/transformer_config.py	14
megatron_patch/model/qwen2/transformer_layer.py	147
megatron_patch/model/qwen2_5_vl/model.py	191
megatron_patch/model/qwen2_5_vl/transformer_block.py	434
megatron_patch/model/qwen2_5_vl/transformer_config.py	55
megatron_patch/model/qwen2_5_vl/visionmodel.py	224
megatron_patch/model/qwen2_moe/__init__.py	1
megatron_patch/model/qwen2_moe/layer_specs.py	281
megatron_patch/model/qwen2_moe/transformer_config.py	55
megatron_patch/model/qwen2_vl/attention.py	530
megatron_patch/model/qwen2_vl/attention_vision.py	529
megatron_patch/model/qwen2_vl/gpt_model.py	143
megatron_patch/model/qwen2_vl/language_model_embedding.py	98
megatron_patch/model/qwen2_vl/language_module.py	104
megatron_patch/model/qwen2_vl/layer_specs.py	95
megatron_patch/model/qwen2_vl/model.py	191
megatron_patch/model/qwen2_vl/rope_utils.py	108
megatron_patch/model/qwen2_vl/rotary_pos_embedding.py	145
megatron_patch/model/qwen2_vl/transformer_config.py	53
megatron_patch/model/qwen2_vl/visionmodel.py	160
megatron_patch/model/qwen3_moe/gpt_layer_specs.py	347
megatron_patch/model/qwen3_moe/moe/moe_layer.py	70
megatron_patch/model/qwen3_moe/moe/moe_utils.py	79
megatron_patch/model/qwen3_moe/moe/router.py	111
megatron_patch/model/qwen3_moe/moe_module_specs.py	61
megatron_patch/model/qwen_vl/__init__.py	1
megatron_patch/model/qwen_vl/gpt_model.py	89
megatron_patch/model/qwen_vl/language_model.py	481
megatron_patch/model/qwen_vl/transformer.py	1292
megatron_patch/model/qwen_vl/visual.py	296
megatron_patch/model/starcoder/__init__.py	1
megatron_patch/model/starcoder/enums.py	19
megatron_patch/model/starcoder/glu_activations.py	32
megatron_patch/model/starcoder/gpt_model.py	83
megatron_patch/model/starcoder/language_model.py	387
megatron_patch/model/starcoder/transformer.py	848
megatron_patch/template/helper.py	115
megatron_patch/tensor_parallel.py	66
megatron_patch/tokenizer/icetk_glm130b_tokenizer.py	273
megatron_patch/tokenizer/jiebabpe_tokenizer.py	53
megatron_patch/tokenizer/tokenization_baichuan.py	139
megatron_patch/tokenizer/tokenization_qwen_vl.py	441
megatron_patch/tokenizer/tokenization_yi.py	166
megatron_patch/training.py	612
rlhf/deepspeed-chat/rm_main.py	319
rlhf/deepspeed-chat/utils.py	204
rlhf/trlx/reward_model_bloom.py	81
rlhf/trlx/train_reward_model_bloom.py	151
rlhf/trlx/trlx_bloom_rlhf.py	178
toolkits/auto_configurator/report_auto_config.py	124
toolkits/auto_configurator/report_theoretical_memory.py	147
toolkits/distributed_checkpoints_convertor/impl/convert.py	80
toolkits/distributed_checkpoints_convertor/impl/deepseek_v3/__init__.py	6
toolkits/distributed_checkpoints_convertor/impl/deepseek_v3/h2m_synchronizer.py	46
toolkits/distributed_checkpoints_convertor/impl/deepseek_v3/m2h_synchronizer.py	42
toolkits/distributed_checkpoints_convertor/impl/deepseek_v3/patch.py	25
toolkits/distributed_checkpoints_convertor/impl/general/__init__.py	6
toolkits/distributed_checkpoints_convertor/impl/general/h2m_synchronizer.py	259
toolkits/distributed_checkpoints_convertor/impl/general/m2h_synchronizer.py	499
toolkits/distributed_checkpoints_convertor/impl/general/synchronizer.py	119
toolkits/model_checkpoints_convertor/baichuan/checkpoint_reshaping_and_interoperability.py	649
toolkits/model_checkpoints_convertor/baichuan/configuration_baichuan.py	43
toolkits/model_checkpoints_convertor/baichuan/hf2te.py	378
toolkits/model_checkpoints_convertor/baichuan2/checkpoint_reshaping_and_interoperability.py	638
toolkits/model_checkpoints_convertor/baichuan2/configuration_baichuan.py	43
toolkits/model_checkpoints_convertor/baichuan2/hf2te.py	360
toolkits/model_checkpoints_convertor/bloom/checkpoint_reshaping_and_interoperability.py	572
toolkits/model_checkpoints_convertor/bloom/deepspeed_to_megatron.py	149
toolkits/model_checkpoints_convertor/bloom/deepspeed_to_megatron_ori.py	149
toolkits/model_checkpoints_convertor/bloom/reward_model_to_megatron.py	573
toolkits/model_checkpoints_convertor/chatglm/checkpoint_reshaping_and_interoperability.py	396
toolkits/model_checkpoints_convertor/deepseek/fp8_cast_bf16.py	88
toolkits/model_checkpoints_convertor/deepseek/hf2mcore_deepseek_v2_moe.py	454
toolkits/model_checkpoints_convertor/deepseek/hf2mcore_deepseek_v3_moe.py	578
toolkits/model_checkpoints_convertor/falcon/checkpoint_reshaping_and_interoperability.py	612
toolkits/model_checkpoints_convertor/falcon/configuration_RW.py	101
toolkits/model_checkpoints_convertor/falcon40b/checkpoint_reshaping_and_interoperability.py	583
toolkits/model_checkpoints_convertor/falcon40b/configuration_RW.py	50
toolkits/model_checkpoints_convertor/galactica/checkpoint_reshaping_and_interoperability.py	454
toolkits/model_checkpoints_convertor/glm/checkpoint_reshaping_and_interoperability.py	378
toolkits/model_checkpoints_convertor/glm130b/checkpoint_reshaping_and_interoperability.py	363
toolkits/model_checkpoints_convertor/glm130b/merge_130b_ckpts.py	96
toolkits/model_checkpoints_convertor/llama/hf2mcore.py	674
toolkits/model_checkpoints_convertor/llama/hf2mcore_70b.py	577
toolkits/model_checkpoints_convertor/llama/hf2mcore_llama3_1.py	710
toolkits/model_checkpoints_convertor/llama/hf2megatron.py	808
toolkits/model_checkpoints_convertor/llama/hf_llama_moe/llama_moe.py	19
toolkits/model_checkpoints_convertor/llava/hf2mcore_llava.py	669
toolkits/model_checkpoints_convertor/mistral/hf2mcore.py	468
toolkits/model_checkpoints_convertor/mistral/hf2mcore_mixtral.py	672
toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen1.5_dense_gqa.py	593
toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen1.5_dense_mha.py	280
toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen1.5_dense_mha_to_moe.py	227
toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen1.5_moe.py	479
toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen2.5_vl.py	607
toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen2_dense_and_moe_gqa.py	821
toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen2_moe.py	555
toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen2_vl.py	616
toolkits/model_checkpoints_convertor/qwen/hf2megablocks_qwen1.5.py	546
toolkits/model_checkpoints_convertor/qwen/hf2megatron_qwen1.0.py	725
toolkits/model_checkpoints_convertor/qwen/hf2megatron_qwen1.5.py	810
toolkits/model_checkpoints_convertor/starcoder/checkpoint_reshaping_and_interoperability.py	583
toolkits/model_checkpoints_convertor/utils/__init__.py	146
toolkits/model_checkpoints_convertor/yi/checkpoint_reshaping_and_interoperability.py	468
toolkits/multimodal_data_preprocessing/build_llava_frame_dataset.py	123
toolkits/multimodal_data_preprocessing/convert_custom_dataset_to_wds_chatml.py	98
toolkits/multimodal_data_preprocessing/convert_llava_pretrain_to_wds.py	25
toolkits/multimodal_data_preprocessing/replace_llava_image_key.py	29
toolkits/pretrain_data_preprocessing/clean_raw_text.py	69
toolkits/pretrain_data_preprocessing/convert_json_to_list.py	10
toolkits/pretrain_data_preprocessing/preprocess_data.py	198
toolkits/pretrain_data_preprocessing/preprocess_data_megatron.py	360
toolkits/pretrain_data_preprocessing/preprocess_wudao2.py	76
toolkits/pretrain_data_preprocessing/qwen_hf_preprocess_datasets.py	86
toolkits/sft_data_preprocessing/build_idxmap_sft_dataset.py	319
toolkits/sft_data_preprocessing/sample_stats.py	23