compute-nest-best-practice/opensource/chatglm-finetune/oos/finetune.yaml (137 lines of code) (raw):

# OOS template: launch a ChatGLM fine-tuning run on the ECS instances that
# belong to a Compute Nest service instance.
#
# Flow:
#   1. getInstance - select the running ECS instances tagged with the given
#      service instance id.
#   2. runCommand  - on each selected instance, start the fine-tuning script
#      in the background and append its output to finetune_log.log.
FormatVersion: OOS-2019-06-01
Description: 执行指定微调脚本运维操作

Parameters:
  # Region used both to look up the instances and to run the command.
  regionId:
    Type: String
    Label:
      en: RegionId
      zh-cn: 地域ID
    AssociationProperty: RegionId
    Default: '{{ ACS::RegionId }}'
  # Service instance whose ECS instances are targeted (matched by tag).
  serviceInstanceId:
    Type: String
    Label:
      en: TargetInstance
      zh-cn: 目标实例
    AssociationProperty: ALIYUN::ComputeNest::ServiceInstance::ServiceInstanceId
    AssociationPropertyMetadata:
      Disabled: true
  # Directory handed to RunCommand as workingDir; the relative script path
  # and the relative log file below are presumably resolved against it.
  workingDir:
    Type: String
    Default: /root/ChatGLM-Efficient-Tuning
    Description: 脚本执行路径
  # Script passed to the Python interpreter in the command below.
  finetuneScriptPath:
    Type: String
    Default: src/train_bash.py
    Description: 微调脚本路径
  # Forwarded verbatim to --dataset.
  dataset:
    Type: String
    Default: alpaca_gpt4_zh
    Description: 数据集名称
  # Forwarded verbatim to --model_name_or_path.
  modelName:
    Type: String
    Default: THUDM/chatglm-6b
    AllowedValues:
      - THUDM/chatglm-6b
      - THUDM/chatglm2-6b
  # Forwarded verbatim to --stage.
  stage:
    Type: String
    Description: 微调阶段
    Default: sft
    AllowedValues:
      - sft
      - ppo
      - rm
  # Forwarded verbatim to --finetuning_type.
  # NOTE(review): confirm the training script spells this option `p-tuning`
  # and not `p_tuning`; the value is not translated before it is passed on.
  finetuneType:
    Type: String
    Description: 微调类型
    Default: lora
    AllowedValues:
      - lora
      - p-tuning
      - full
  # Forwarded verbatim to --num_train_epochs.
  trainingEpoch:
    Type: Number
    Description: 训练轮次
    Default: 3
  # Emitted as a bare flag, i.e. `--fp16`, `--fp32` or `--fp64`.
  # NOTE(review): confirm the training script accepts --fp32 and --fp64;
  # if it does not, selecting them will make the run fail at argument parsing.
  precision:
    Type: String
    Description: 训练精度
    Default: fp16
    AllowedValues:
      - fp16
      - fp32
      - fp64
  # Forwarded verbatim to --output_dir.
  outputDir:
    Type: String
    Description: 模型输出地址
    Default: path_to_sft_checkpoint
  # Forwarded to the RunCommand timeout property. The training process is
  # started with `nohup ... &`, so the shell command itself returns right away.
  timeout:
    Label:
      en: Timeout
      zh-cn: 超时时间
    Type: Number
    Default: 600

Tasks:
  # Resolve the target ECS instance ids from the service-instance tag.
  - Name: getInstance
    Description:
      en: Views the ECS instances
      zh-cn: 获取ECS实例
    Action: ACS::SelectTargets
    Properties:
      ResourceType: ALIYUN::ECS::Instance
      RegionId: '{{ regionId }}'
      Filters:
        - Type: All
          RegionId: '{{regionId}}'
          Parameters:
            RegionId: '{{regionId}}'
            Status: Running
            Tags:
              - Key: 'acs:computenest:serviceInstanceId'
                Value: '{{serviceInstanceId}}'
    Outputs:
      instanceIds:
        Type: List
        ValueSelector: Instances.Instance[].InstanceId

  # Start the fine-tuning job on every instance returned by getInstance.
  - Name: runCommand
    Action: ACS::ECS::RunCommand
    Description: 执行云助手命令
    Properties:
      # Use the same region as the instance lookup so that the command is
      # not sent to the execution region when the operator chose another one.
      regionId: '{{ regionId }}'
      # Template parameters are substituted textually into this script, so
      # free-form values (dataset, outputDir, finetuneScriptPath) must not
      # contain shell metacharacters.
      commandContent: |-
        #!/bin/bash
        source /root/anaconda3/bin/activate chatglm_etuning
        CUDA_VISIBLE_DEVICES=0 nohup /root/anaconda3/envs/chatglm_etuning/bin/python {{finetuneScriptPath}} \
            --model_name_or_path {{modelName}} \
            --stage {{stage}} \
            --do_train \
            --dataset {{dataset}} \
            --finetuning_type {{finetuneType}} \
            --output_dir {{outputDir}} \
            --per_device_train_batch_size 4 \
            --gradient_accumulation_steps 4 \
            --lr_scheduler_type cosine \
            --logging_steps 10 \
            --save_steps 1000 \
            --learning_rate 5e-5 \
            --num_train_epochs {{trainingEpoch}} \
            --plot_loss \
            --{{precision}} >> finetune_log.log 2>&1 &
      workingDir: '{{workingDir}}'
      instanceId: '{{ ACS::TaskLoopItem }}'
      commandType: RunShellScript
      timeout: '{{timeout}}'
    Loop:
      # One iteration per selected instance; per-iteration outputs are
      # collected into a single list.
      Items: '{{ getInstance.instanceIds }}'
      Outputs:
        commandOutputs:
          AggregateType: Fn::ListJoin
          AggregateField: commandOutput
    Outputs:
      commandOutput:
        Type: String
        ValueSelector: invocationOutput

Outputs:
  # Aggregated command output of every loop iteration.
  commandOutputs:
    Type: List
    Value: '{{ runCommand.commandOutputs }}'