Path Lines of Code megatron_patch/__init__.py 1 megatron_patch/arguments.py 449 megatron_patch/data/__init__.py 88 megatron_patch/data/dataset_helpers.py 381 megatron_patch/data/energon/chatml.py 46 megatron_patch/data/image_processing.py 67 megatron_patch/data/json_sft.py 106 megatron_patch/data/utils.py 318 megatron_patch/finetune_utils.py 202 megatron_patch/generation/api.py 170 megatron_patch/generation/generation.py 317 megatron_patch/generation/gpt_predictor.py 74 megatron_patch/generation/tokenization.py 76 megatron_patch/initialize.py 91 megatron_patch/lm_evaluate.py 139 megatron_patch/model/__init__.py 1 megatron_patch/model/baichuan/__init__.py 1 megatron_patch/model/baichuan/gpt_model.py 106 megatron_patch/model/baichuan/language_model.py 515 megatron_patch/model/baichuan/transformer.py 1179 megatron_patch/model/baichuan2/gpt_model.py 97 megatron_patch/model/baichuan2/language_model.py 450 megatron_patch/model/baichuan2/layers.py 134 megatron_patch/model/baichuan2/transformer.py 1292 megatron_patch/model/bloom/__init__.py 1 megatron_patch/model/bloom/gpt_model.py 81 megatron_patch/model/bloom/language_model.py 411 megatron_patch/model/bloom/layers.py 87 megatron_patch/model/bloom/positional_embeddings.py 122 megatron_patch/model/bloom/transformer.py 811 megatron_patch/model/chatglm/__init__.py 1 megatron_patch/model/chatglm/gpt_model.py 82 megatron_patch/model/chatglm/language_model.py 473 megatron_patch/model/chatglm/positional_embeddings.py 60 megatron_patch/model/chatglm/transformer.py 604 megatron_patch/model/deepseek_v2/__init__.py 1 megatron_patch/model/deepseek_v2/layer_specs.py 120 megatron_patch/model/deepseek_v2/mlp.py 196 megatron_patch/model/deepseek_v2/model.py 181 megatron_patch/model/deepseek_v2/moe/experts.py 676 megatron_patch/model/deepseek_v2/moe/moe_layer.py 118 megatron_patch/model/deepseek_v2/moe/shared_experts.py 180 megatron_patch/model/deepseek_v2/multi_latent_attention.py 276 megatron_patch/model/deepseek_v2/transformer_block.py 377 megatron_patch/model/deepseek_v2/transformer_config.py 42 megatron_patch/model/deepseek_v2/transformer_layer.py 226 megatron_patch/model/falcon/__init__.py 1 megatron_patch/model/falcon/gpt_model.py 94 megatron_patch/model/falcon/language_model.py 491 megatron_patch/model/falcon/transformer.py 845 megatron_patch/model/falcon40b/__init__.py 1 megatron_patch/model/falcon40b/gpt_model.py 94 megatron_patch/model/falcon40b/language_model.py 491 megatron_patch/model/falcon40b/transformer.py 683 megatron_patch/model/galactica/__init__.py 1 megatron_patch/model/galactica/gpt_model.py 94 megatron_patch/model/galactica/language_model.py 501 megatron_patch/model/galactica/transformer.py 570 megatron_patch/model/glm130b/__init__.py 1 megatron_patch/model/glm130b/gpt_model.py 80 megatron_patch/model/glm130b/language_model.py 434 megatron_patch/model/glm130b/transformer.py 875 megatron_patch/model/llama/__init__.py 1 megatron_patch/model/llama/gpt_model.py 92 megatron_patch/model/llama/language_model.py 501 megatron_patch/model/llama/positional_embeddings.py 54 megatron_patch/model/llama/transformer.py 715 megatron_patch/model/llama2/__init__.py 1 megatron_patch/model/llama2/gpt_model.py 88 megatron_patch/model/llama2/language_model.py 454 megatron_patch/model/llama2/rotary_pos_embedding.py 56 megatron_patch/model/llama2/transformer.py 1296 megatron_patch/model/llama3/__init__.py 1 megatron_patch/model/llama3/gpt_model.py 88 megatron_patch/model/llama3/language_model.py 438 megatron_patch/model/llama3/layer_specs.py 85 megatron_patch/model/llama3/model.py 144 megatron_patch/model/llama3/rms_norm.py 13 megatron_patch/model/llama3/transformer/attention.py 402 megatron_patch/model/llama3/transformer/mlp.py 150 megatron_patch/model/llama3/transformer_config.py 8 megatron_patch/model/llama3/transformer_legacy.py 1252 megatron_patch/model/llama3_1/__init__.py 1 megatron_patch/model/llama3_1/layer_specs.py 85 megatron_patch/model/llama3_1/model.py 201 megatron_patch/model/llama3_1/rms_norm.py 13 megatron_patch/model/llama3_1/transformer_config.py 9 megatron_patch/model/llava/__init__.py 1 megatron_patch/model/llava/clip_encoder.py 75 megatron_patch/model/llava/gpt_model.py 89 megatron_patch/model/llava/language_model.py 507 megatron_patch/model/llava/mm_projector_builder.py 37 megatron_patch/model/llava/rotary_pos_embedding.py 54 megatron_patch/model/llava/transformer.py 1292 megatron_patch/model/llava_mcore/__init__.py 1 megatron_patch/model/llava_mcore/layer_specs.py 99 megatron_patch/model/llava_mcore/llava_model.py 424 megatron_patch/model/llava_mcore/llava_spec.py 75 megatron_patch/model/llava_mcore/transformer_config.py 133 megatron_patch/model/llava_mcore/vision/__init__.py 1 megatron_patch/model/llava_mcore/vision/clip_vit_model.py 130 megatron_patch/model/llava_mcore/vision/multimodal_projector.py 41 megatron_patch/model/llava_mcore/vision/vit_layer_specs.py 79 megatron_patch/model/mistral/__init__.py 1 megatron_patch/model/mistral/gpt_model.py 88 megatron_patch/model/mistral/language_model.py 466 megatron_patch/model/mistral/modeling_attn_mask_utils.py 121 megatron_patch/model/mistral/rotary_pos_embedding.py 36 megatron_patch/model/mistral/transformer.py 1292 megatron_patch/model/mixtral/__init__.py 1 megatron_patch/model/mixtral/layer_specs.py 129 megatron_patch/model/mixtral/model.py 177 megatron_patch/model/mixtral/moe/__init__.py 1 megatron_patch/model/mixtral/moe/experts.py 676 megatron_patch/model/mixtral/moe/moe_layer.py 113 megatron_patch/model/mixtral/moe/router.py 171 megatron_patch/model/mixtral/moe/token_dispatcher.py 303 megatron_patch/model/mixtral/transformer/attention.py 517 megatron_patch/model/mixtral/transformer/mlp.py 193 megatron_patch/model/mixtral/transformer_config.py 285 megatron_patch/model/mixtral_bak/__init__.py 1 megatron_patch/model/mixtral_bak/layer_specs.py 86 megatron_patch/model/mixtral_bak/model.py 162 megatron_patch/model/mixtral_bak/moe/__init__.py 1 megatron_patch/model/mixtral_bak/moe/experts.py 136 megatron_patch/model/mixtral_bak/moe/grouped_gemm_util.py 12 megatron_patch/model/mixtral_bak/moe/moe_layer.py 57 megatron_patch/model/mixtral_bak/moe/moe_utils.py 39 megatron_patch/model/mixtral_bak/moe/router.py 113 megatron_patch/model/mixtral_bak/moe/token_dispatcher.py 172 megatron_patch/model/mixtral_bak/transformer/attention.py 322 megatron_patch/model/mixtral_bak/transformer/mlp.py 131 megatron_patch/model/mixtral_bak/transformer_config.py 142 megatron_patch/model/qwen/__init__.py 1 megatron_patch/model/qwen/gpt_model.py 88 megatron_patch/model/qwen/language_model.py 440 megatron_patch/model/qwen/transformer.py 1243 megatron_patch/model/qwen1_5/__init__.py 1 megatron_patch/model/qwen1_5/layer_specs.py 92 megatron_patch/model/qwen1_5/model.py 144 megatron_patch/model/qwen1_5/moe/__init__.py 1 megatron_patch/model/qwen1_5/moe/experts.py 188 megatron_patch/model/qwen1_5/moe/moe_layer.py 78 megatron_patch/model/qwen1_5/moe/router.py 139 megatron_patch/model/qwen1_5/moe/token_dispatcher.py 282 megatron_patch/model/qwen1_5/transformer/attention.py 402 megatron_patch/model/qwen1_5/transformer/mlp.py 164 megatron_patch/model/qwen1_5/transformer_config.py 7 megatron_patch/model/qwen1_5_megablocks/__init__.py 1 megatron_patch/model/qwen1_5_megablocks/gpt_model.py 88 megatron_patch/model/qwen1_5_megablocks/language_model.py 453 megatron_patch/model/qwen1_5_megablocks/rotary_pos_embedding.py 56 megatron_patch/model/qwen1_5_megablocks/transformer.py 1184 megatron_patch/model/qwen2/layer_specs.py 108 megatron_patch/model/qwen2/model.py 146 megatron_patch/model/qwen2/moe/__init__.py 1 megatron_patch/model/qwen2/moe/experts.py 316 megatron_patch/model/qwen2/moe/moe_layer.py 114 megatron_patch/model/qwen2/moe/router.py 206 megatron_patch/model/qwen2/moe/token_dispatcher.py 327 megatron_patch/model/qwen2/rms_norm.py 13 megatron_patch/model/qwen2/transformer/attention.py 407 megatron_patch/model/qwen2/transformer/mlp.py 258 megatron_patch/model/qwen2/transformer_block.py 323 megatron_patch/model/qwen2/transformer_config.py 14 megatron_patch/model/qwen2/transformer_layer.py 147 megatron_patch/model/qwen2_5_vl/model.py 191 megatron_patch/model/qwen2_5_vl/transformer_block.py 434 megatron_patch/model/qwen2_5_vl/transformer_config.py 55 megatron_patch/model/qwen2_5_vl/visionmodel.py 224 megatron_patch/model/qwen2_moe/__init__.py 1 megatron_patch/model/qwen2_moe/layer_specs.py 281 megatron_patch/model/qwen2_moe/transformer_config.py 55 megatron_patch/model/qwen2_vl/attention.py 530 megatron_patch/model/qwen2_vl/attention_vision.py 529 megatron_patch/model/qwen2_vl/gpt_model.py 143 megatron_patch/model/qwen2_vl/language_model_embedding.py 98 megatron_patch/model/qwen2_vl/language_module.py 104 megatron_patch/model/qwen2_vl/layer_specs.py 95 megatron_patch/model/qwen2_vl/model.py 191 megatron_patch/model/qwen2_vl/rope_utils.py 108 megatron_patch/model/qwen2_vl/rotary_pos_embedding.py 145 megatron_patch/model/qwen2_vl/transformer_config.py 53 megatron_patch/model/qwen2_vl/visionmodel.py 160 megatron_patch/model/qwen3_moe/gpt_layer_specs.py 347 megatron_patch/model/qwen3_moe/moe/moe_layer.py 70 megatron_patch/model/qwen3_moe/moe/moe_utils.py 79 megatron_patch/model/qwen3_moe/moe/router.py 111 megatron_patch/model/qwen3_moe/moe_module_specs.py 61 megatron_patch/model/qwen_vl/__init__.py 1 megatron_patch/model/qwen_vl/gpt_model.py 89 megatron_patch/model/qwen_vl/language_model.py 481 megatron_patch/model/qwen_vl/transformer.py 1292 megatron_patch/model/qwen_vl/visual.py 296 megatron_patch/model/starcoder/__init__.py 1 megatron_patch/model/starcoder/enums.py 19 megatron_patch/model/starcoder/glu_activations.py 32 megatron_patch/model/starcoder/gpt_model.py 83 megatron_patch/model/starcoder/language_model.py 387 megatron_patch/model/starcoder/transformer.py 848 megatron_patch/template/helper.py 115 megatron_patch/tensor_parallel.py 66 megatron_patch/tokenizer/icetk_glm130b_tokenizer.py 273 megatron_patch/tokenizer/jiebabpe_tokenizer.py 53 megatron_patch/tokenizer/tokenization_baichuan.py 139 megatron_patch/tokenizer/tokenization_qwen_vl.py 441 megatron_patch/tokenizer/tokenization_yi.py 166 megatron_patch/training.py 612 rlhf/deepspeed-chat/rm_main.py 319 rlhf/deepspeed-chat/utils.py 204 rlhf/trlx/reward_model_bloom.py 81 rlhf/trlx/train_reward_model_bloom.py 151 rlhf/trlx/trlx_bloom_rlhf.py 178 toolkits/auto_configurator/report_auto_config.py 124 toolkits/auto_configurator/report_theoretical_memory.py 147 toolkits/distributed_checkpoints_convertor/impl/convert.py 80 toolkits/distributed_checkpoints_convertor/impl/deepseek_v3/__init__.py 6 toolkits/distributed_checkpoints_convertor/impl/deepseek_v3/h2m_synchronizer.py 46 toolkits/distributed_checkpoints_convertor/impl/deepseek_v3/m2h_synchronizer.py 42 toolkits/distributed_checkpoints_convertor/impl/deepseek_v3/patch.py 25 toolkits/distributed_checkpoints_convertor/impl/general/__init__.py 6 toolkits/distributed_checkpoints_convertor/impl/general/h2m_synchronizer.py 259 toolkits/distributed_checkpoints_convertor/impl/general/m2h_synchronizer.py 499 toolkits/distributed_checkpoints_convertor/impl/general/synchronizer.py 119 toolkits/model_checkpoints_convertor/baichuan/checkpoint_reshaping_and_interoperability.py 649 toolkits/model_checkpoints_convertor/baichuan/configuration_baichuan.py 43 toolkits/model_checkpoints_convertor/baichuan/hf2te.py 378 toolkits/model_checkpoints_convertor/baichuan2/checkpoint_reshaping_and_interoperability.py 638 toolkits/model_checkpoints_convertor/baichuan2/configuration_baichuan.py 43 toolkits/model_checkpoints_convertor/baichuan2/hf2te.py 360 toolkits/model_checkpoints_convertor/bloom/checkpoint_reshaping_and_interoperability.py 572 toolkits/model_checkpoints_convertor/bloom/deepspeed_to_megatron.py 149 toolkits/model_checkpoints_convertor/bloom/deepspeed_to_megatron_ori.py 149 toolkits/model_checkpoints_convertor/bloom/reward_model_to_megatron.py 573 toolkits/model_checkpoints_convertor/chatglm/checkpoint_reshaping_and_interoperability.py 396 toolkits/model_checkpoints_convertor/deepseek/fp8_cast_bf16.py 88 toolkits/model_checkpoints_convertor/deepseek/hf2mcore_deepseek_v2_moe.py 454 toolkits/model_checkpoints_convertor/deepseek/hf2mcore_deepseek_v3_moe.py 578 toolkits/model_checkpoints_convertor/falcon/checkpoint_reshaping_and_interoperability.py 612 toolkits/model_checkpoints_convertor/falcon/configuration_RW.py 101 toolkits/model_checkpoints_convertor/falcon40b/checkpoint_reshaping_and_interoperability.py 583 toolkits/model_checkpoints_convertor/falcon40b/configuration_RW.py 50 toolkits/model_checkpoints_convertor/galactica/checkpoint_reshaping_and_interoperability.py 454 toolkits/model_checkpoints_convertor/glm/checkpoint_reshaping_and_interoperability.py 378 toolkits/model_checkpoints_convertor/glm130b/checkpoint_reshaping_and_interoperability.py 363 toolkits/model_checkpoints_convertor/glm130b/merge_130b_ckpts.py 96 toolkits/model_checkpoints_convertor/llama/hf2mcore.py 674 toolkits/model_checkpoints_convertor/llama/hf2mcore_70b.py 577 toolkits/model_checkpoints_convertor/llama/hf2mcore_llama3_1.py 710 toolkits/model_checkpoints_convertor/llama/hf2megatron.py 808 toolkits/model_checkpoints_convertor/llama/hf_llama_moe/llama_moe.py 19 toolkits/model_checkpoints_convertor/llava/hf2mcore_llava.py 669 toolkits/model_checkpoints_convertor/mistral/hf2mcore.py 468 toolkits/model_checkpoints_convertor/mistral/hf2mcore_mixtral.py 672 toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen1.5_dense_gqa.py 593 toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen1.5_dense_mha.py 280 toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen1.5_dense_mha_to_moe.py 227 toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen1.5_moe.py 479 toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen2.5_vl.py 607 toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen2_dense_and_moe_gqa.py 821 toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen2_moe.py 555 toolkits/model_checkpoints_convertor/qwen/hf2mcore_qwen2_vl.py 616 toolkits/model_checkpoints_convertor/qwen/hf2megablocks_qwen1.5.py 546 toolkits/model_checkpoints_convertor/qwen/hf2megatron_qwen1.0.py 725 toolkits/model_checkpoints_convertor/qwen/hf2megatron_qwen1.5.py 810 toolkits/model_checkpoints_convertor/starcoder/checkpoint_reshaping_and_interoperability.py 583 toolkits/model_checkpoints_convertor/utils/__init__.py 146 toolkits/model_checkpoints_convertor/yi/checkpoint_reshaping_and_interoperability.py 468 toolkits/multimodal_data_preprocessing/build_llava_frame_dataset.py 123 toolkits/multimodal_data_preprocessing/convert_custom_dataset_to_wds_chatml.py 98 toolkits/multimodal_data_preprocessing/convert_llava_pretrain_to_wds.py 25 toolkits/multimodal_data_preprocessing/replace_llava_image_key.py 29 toolkits/pretrain_data_preprocessing/clean_raw_text.py 69 toolkits/pretrain_data_preprocessing/convert_json_to_list.py 10 toolkits/pretrain_data_preprocessing/preprocess_data.py 198 toolkits/pretrain_data_preprocessing/preprocess_data_megatron.py 360 toolkits/pretrain_data_preprocessing/preprocess_wudao2.py 76 toolkits/pretrain_data_preprocessing/qwen_hf_preprocess_datasets.py 86 toolkits/sft_data_preprocessing/build_idxmap_sft_dataset.py 319 toolkits/sft_data_preprocessing/sample_stats.py 23