Path Lines of Code README.md 52 README_CN.md 50 examples/__init__.py 1 examples/fsdp/configs/grpo/base.yaml 3 examples/fsdp/configs/grpo/grpo.yaml 54 examples/fsdp/configs/grpo/log.yaml 9 examples/fsdp/configs/grpo/policy_trainer.yaml 9 examples/fsdp/configs/grpo/reference.yaml 3 examples/fsdp/configs/grpo/vllm_policy_inference.yaml 28 examples/fsdp/data/data_preprocess/gsm8k.py 52 examples/fsdp/data/data_preprocess/math_lighteval.py 79 examples/fsdp/data/prompt_dataset.py 33 examples/fsdp/entry/train_grpo.py 89 examples/fsdp/models/grpo/__init__.py 1 examples/fsdp/models/grpo/loss_gallery.py 22 examples/fsdp/models/grpo/policy_trainer.py 136 examples/fsdp/models/rule_reward.py 43 examples/fsdp/models/vllm_policy_inference.py 72 examples/fsdp/scripts/base_env.sh 28 examples/fsdp/scripts/train_grpo_qwen2_5.sh 41 examples/fsdp/scripts/train_grpo_qwen3.sh 42 examples/fsdp/utils/__init__.py 1 examples/fsdp/utils/rule_reward_score/__init__.py 1 examples/fsdp/utils/rule_reward_score/math.py 156 examples/huggingface/configs/qwen2/base.yaml 3 examples/huggingface/configs/qwen2/dpo.yaml 33 examples/huggingface/configs/qwen2/policy_trainer.yaml 9 examples/huggingface/configs/qwen2/reference.yaml 16 examples/huggingface/data/preprocess_data_chatml.py 46 examples/huggingface/data/reward_dataset.py 161 examples/huggingface/entry/train_dpo.py 21 examples/huggingface/models/dpo/__init__.py 1 examples/huggingface/models/dpo/policy_trainer.py 83 examples/huggingface/models/dpo/reference_model.py 22 examples/huggingface/models/dpo/utils.py 80 examples/huggingface/models/utils.py 296 examples/huggingface/scripts/base_env.sh 29 examples/huggingface/scripts/train_dpo_qwen.sh 18 examples/megatron/configs/gpt/base.yaml 41 examples/megatron/configs/gpt/base_inference.yaml 14 examples/megatron/configs/gpt/base_train.yaml 8 examples/megatron/configs/gpt/old_policy_inference.yaml 10 examples/megatron/configs/gpt/old_value_inference.yaml 4 examples/megatron/configs/gpt/policy_shared.yaml 7 examples/megatron/configs/gpt/ppo_policy.yaml 24 examples/megatron/configs/gpt/ppo_value.yaml 22 examples/megatron/configs/gpt/reference.yaml 5 examples/megatron/configs/gpt/reward_inference.yaml 8 examples/megatron/configs/gpt/reward_shared.yaml 12 examples/megatron/configs/gpt/rlhf.yaml 86 examples/megatron/configs/gpt/test_policy.yaml 22 examples/megatron/configs/llama2/base.yaml 56 examples/megatron/configs/llama2/base_inference.yaml 14 examples/megatron/configs/llama2/base_train.yaml 7 examples/megatron/configs/llama2/data.yaml 2 examples/megatron/configs/llama2/dpo.yaml 42 examples/megatron/configs/llama2/eval.yaml 33 examples/megatron/configs/llama2/eval_vllm.yaml 33 examples/megatron/configs/llama2/grpo_math_vllm.yaml 62 examples/megatron/configs/llama2/math_reward.yaml 4 examples/megatron/configs/llama2/old_policy_inference.yaml 12 examples/megatron/configs/llama2/old_value_inference.yaml 4 examples/megatron/configs/llama2/online_dpo.yaml 57 examples/megatron/configs/llama2/online_dpo_vllm.yaml 57 examples/megatron/configs/llama2/policy_shared.yaml 8 examples/megatron/configs/llama2/ppo_policy.yaml 30 examples/megatron/configs/llama2/ppo_value.yaml 23 examples/megatron/configs/llama2/reference.yaml 6 examples/megatron/configs/llama2/reward_inference.yaml 5 examples/megatron/configs/llama2/reward_shared.yaml 11 examples/megatron/configs/llama2/rlhf.yaml 76 examples/megatron/configs/llama2/rlhf_param_sync.yaml 47 examples/megatron/configs/llama2/test_policy.yaml 22 examples/megatron/configs/llama2/test_reward.yaml 17 examples/megatron/configs/llama2/test_vllm_policy.yaml 22 examples/megatron/configs/llama2/vllm_param_sync.yaml 47 examples/megatron/configs/llama2/vllm_policy_inference.yaml 39 examples/megatron/configs/llama2/vllm_rlhf.yaml 77 examples/megatron/data/__init__.py 1 examples/megatron/data/prepare_data_alignment.py 23 examples/megatron/data/prepare_data_math.py 15 examples/megatron/data/prepare_data_reward.py 23 examples/megatron/data/prepare_data_sft.py 18 examples/megatron/data/prompt_dataset.py 123 examples/megatron/data/reward_dataset.py 138 examples/megatron/data/sft_dataset.py 57 examples/megatron/entry/train_dpo.py 18 examples/megatron/entry/train_grpo_math.py 94 examples/megatron/entry/train_online_dpo.py 43 examples/megatron/entry/train_reward.py 148 examples/megatron/entry/train_rlhf.py 45 examples/megatron/entry/train_sft.py 105 examples/megatron/models/__init__.py 12 examples/megatron/models/base_trainer.py 183 examples/megatron/models/constants.py 85 examples/megatron/models/eval_post_process.py 26 examples/megatron/models/forward_step.py 112 examples/megatron/models/mcore_policy_model.py 245 examples/megatron/models/mcore_reward_model.py 159 examples/megatron/models/mcore_value_model.py 100 examples/megatron/models/old_policy_inference.py 288 examples/megatron/models/old_value_inference.py 46 examples/megatron/models/policy_model.py 201 examples/megatron/models/policy_trainer.py 424 examples/megatron/models/reference.py 192 examples/megatron/models/reward_inference.py 431 examples/megatron/models/reward_math.py 72 examples/megatron/models/reward_model.py 95 examples/megatron/models/rm_sys/__init__.py 1 examples/megatron/models/rm_sys/math_rule_rm.py 39 examples/megatron/models/rm_sys/math_utils/__init__.py 1 examples/megatron/models/rm_sys/math_utils/grader.py 257 examples/megatron/models/rm_sys/math_utils/parser.py 592 examples/megatron/models/utils.py 323 examples/megatron/models/value_model.py 105 examples/megatron/models/value_trainer.py 146 examples/megatron/models/vllm_policy_inference.py 103 examples/megatron/scripts/base_env.sh 134 examples/megatron/scripts/convert_hf_to_megatron.sh 49 examples/megatron/scripts/convert_megatron_to_hf.sh 38 examples/megatron/scripts/train_dpo_llama.sh 31 examples/megatron/scripts/train_grpo_math_llama.sh 41 examples/megatron/scripts/train_online_dpo_llama.sh 63 examples/megatron/scripts/train_reward_llama.sh 125 examples/megatron/scripts/train_rlhf_gpt.sh 122 examples/megatron/scripts/train_rlhf_llama.sh 100 examples/megatron/scripts/train_sft_llama.sh 148 examples/megatron/tests/get_eval_reward.py 37 examples/megatron/tests/get_eval_reward.sh 35 examples/megatron/tests/run_policy_generation.sh 52 examples/megatron/tests/test_parameter_sync.py 25 examples/megatron/tests/test_parameter_sync.sh 54 examples/megatron/tests/test_policy_generation.py 44 examples/megatron/tests/test_reward.sh 17 examples/megatron/tests/test_reward_forward.py 43 examples/megatron/tests/test_unbalanced_param_sync.py 37 examples/megatron/tests/test_unbalanced_param_sync.sh 45 examples/tests/barrier.py 18 examples/tests/benchmark_vllm.py 272 examples/tests/benchmark_vllm.sh 85 requirements.txt 20