megatron_patch/arguments.py

# Copyright (c) 2023 Alibaba PAI Team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
from typing import Union


def patch_if_not_exist(
    group_or_parser: Union[argparse._ArgumentGroup, argparse.ArgumentParser],
    keyname,
    type=None,
    default=None,
    choices=None,
    help=None,
):
    """Add ``keyname`` only if no existing store action on the parser/group already defines it."""
    has_keyname = False
    for action in vars(group_or_parser)["_actions"]:
        if isinstance(action, argparse._StoreAction):
            if keyname in action.option_strings:
                has_keyname = True
    if not has_keyname:
        return group_or_parser.add_argument(
            keyname,
            type=type,
            default=default,
            choices=choices,
            help=help,
        )
    return None


def get_patch_args(parser):
    """Register Megatron-Patch specific command-line arguments on ``parser``."""
    group = parser.add_argument_group(title="patch")

    # Adjust defaults/choices of upstream Megatron arguments if they are present.
    for action in vars(group)["_actions"]:
        if isinstance(action, argparse._StoreAction):
            if "--tokenizer-type" in action.option_strings:
                action.default = "NullTokenizer"
    for action in vars(group)["_actions"]:
        if isinstance(action, argparse._StoreAction):
            if "--vocab-size" in action.option_strings:
                action.default = -1
    for action in vars(group)["_actions"]:
        if isinstance(action, argparse._StoreAction):
            if "--optimizer" in action.option_strings:
                action.choices.append("hybridadam")
    for action in vars(group)["_actions"]:
        if isinstance(action, argparse._StoreAction):
            if "--position-embedding-type" in action.option_strings:
                action.choices.append("none")

    patch_if_not_exist(
        group,
        "--rotary-base",
        type=int,
        default=10000,
        help="Base to use for rotary positional embeddings, default 10000.",
    )
    patch_if_not_exist(
        group,
        "--local-rank",
        type=int,
        default=None,
        help="Local rank passed from the distributed launcher.",
    )
    patch_if_not_exist(group, "--spatial-merge-size", type=int, default=2)
    patch_if_not_exist(group, "--temporal-patch-size", type=int, default=2)
    patch_if_not_exist(group, "--patch-size", type=int, default=14)
    patch_if_not_exist(
        group,
        "--rope-type",
        type=str,
        default="yarn",
        choices=["yarn", "rope"],
        help="RoPE type for MLA attention.",
    )
    group.add_argument("--n-head-kv", type=int, default=None, help="Number of key/value heads.")
    group.add_argument(
        "--transformer-type", type=str, default="megatron", help="Transformer implementation to use."
    )
    group.add_argument("--max-padding-length", type=int, default=None, help="Maximum padding length.")
    group.add_argument("--dataset", type=str, default=None, help="Dataset to use.")
    group.add_argument(
        "--epochs",
        type=int,
        default=None,
        help="Number of finetuning epochs. Zero results in evaluation only.",
    )
    group.add_argument("--intermediate-size", type=int, default=None, help="Intermediate (FFN) hidden size.")
    group.add_argument("--extra-vocab-size", type=int, default=0, help="Extra vocabulary size.")
    group.add_argument(
        "--keep-last",
        action="store_true",
        help="Keep the last (possibly incomplete) batch in the data loader.",
    )
    group.add_argument("--data-dir", default=None, help="Data directory.")
    group.add_argument(
        "--train-data",
        nargs="+",
        default=None,
        help="Whitespace-separated paths or corpora names for training.",
    )
    group.add_argument(
        "--valid-data",
        nargs="+",
        default=None,
        help="Path(s) to the validation data.",
    )
) group.add_argument("--patch-tokenizer-type", type=str, help="patch-tokenizer-type") group.add_argument( "--use-alibi-mask", action="store_true", help="use alibi mask for baichuan model", ) group.add_argument("--use-normhead", action="store_true", help="use-normhead") group.add_argument("--glu-activation", type=str, help="GLU activations to use.") group.add_argument( "--attention-head-type", type=str, default=None, choices=["multihead", "multiquery"], help="Type of attention heads. `multihead` is the standard multi-head attention." "`multiquery` shares the values and keys across attention heads", ) group.add_argument( "--transformer-timers", action="store_true", help="If set, activate the timers within the transformer layers." "Only for debugging, as this slows down the model.", ) group.add_argument("--text-generate-input-file", type=str, default="") group.add_argument("--text-generate-output-file", type=str, default="") group.add_argument("--text-generate-gt-file", type=str, default="") group.add_argument( "--time", action="store_true", help="measure end to end text generation average time", ) group.add_argument("--eval-dev", action="store_true") group.add_argument( "--input-len", type=int, default=1, help="input lenth for measure end to end text generation average time", ) group.add_argument( "--generation-length", type=int, default=None, help="generation-seq-len" ) group.add_argument("--top-p", type=float, default=0.0, help="Top p sampling.") group.add_argument("--top-k", type=int, default=0, help="Top k sampling.") group.add_argument( "--out-seq-length", type=int, default=1024, help="Size of the output generated text.", ) group.add_argument( "--temperature", type=float, default=1.0, help="Sampling temperature." ) group.add_argument( "--repetition_penalty", type=float, default=1.1, help="Repetition_penalty." ) group.add_argument( "--embed-layernorm", action="store_true", help="use layernorm for embedding" ) group.add_argument( "--repetition-penalty", type=float, default=1.2, help="Repetition_penalty." ) group.add_argument( "--source-seq-len", type=int, default=None, help="source-seq-len" ) group.add_argument( "--target-seq-len", type=int, default=None, help="target-seq-len" ) group.add_argument( "--position-encoding-2d", action="store_true", help="position-encoding-2d" ) group.add_argument( "--z-loss-weight", type=float, default=0.0, help="the max-z weight for baichuan2", ) group.add_argument( "--use-llama2-rotary-position-embeddings", action="store_true", help="Use llama2 rotary positional embeddings or not. " "Deprecated: use --position-embedding-type", ) group.add_argument( "--use-mistral-rotary-position-embeddings", action="store_true", help="Use llama2 rotary positional embeddings or not. 
" "Deprecated: use --position-embedding-type", ) group.add_argument("--mm-use-im-start-end", action="store_true") group.add_argument("--mm-use-im-patch-token", action="store_true") group.add_argument("--tune-mm-mlp-adapter", action="store_true") group.add_argument("--freeze-clip-vision-tower", action="store_true") group.add_argument("--freeze-llm", action="store_true") group.add_argument("--image-folder", type=str, default="") group.add_argument("--mm-vision-select-layer", type=int, default=None) group.add_argument("--vision-tower", type=str, default="") group.add_argument("--image-aspect-ratio", type=str, default="square") group.add_argument("--version", type=str, default="plain") group.add_argument("--mm-projector-type", type=str, default=None) group.add_argument("--image-size", type=int, default=None, help="image-size") group.add_argument("--sliding-window", type=int, default=None) group.add_argument("--rotary-scale-factor", type=int, default=1) group.add_argument("--cvcuda-image-processing", action="store_true") group.add_argument( "--expert-interval", type=int, default=2, help='Use experts in every "expert-interval" layers', ) group.add_argument("--moe", action="store_true") group.add_argument("--moe-topk", type=int, default=1, help="moe-topk") group.add_argument( "--moe-expert-parallel-size", type=int, default=None, help="Degree of the MoE expert parallelism. By default, " "the size of this value will be automatically determined.", ) group.add_argument( "--moe-train-capacity-factor", type=float, default=1.0, help="The capacity of the MoE expert at training time", ) group.add_argument( "--moe-eval-capacity-factor", type=float, default=1.0, help="The capacity of the MoE expert at eval time.", ) group.add_argument( "--moe-min-capacity", type=int, default=4, help="The minimum capacity per MoE expert regardless of the capacity_factor.", ) group.add_argument( "--moe-loss-coeff", type=float, default=0.01, help="Scaling coefficient for adding MoE loss to model loss", ) group.add_argument( "--use-tutel", action="store_true", help="Use Tutel optimization for MoE" ) group.add_argument( "--router-type", type=str, default="topk", choices=["topk", "expert_choice"], help="Options for router type, support top1 & top2 and expert_choice", ) group.add_argument( "--moe-input-feature-slicing", action="store_true", help="Enable moe all2all performance optimization.", ) group.add_argument( "--disable-bias-linear-fc", action="store_false", help="Disable bias in the linear layers", dest="add_bias_linear_fc", ) group.add_argument( "--disable-bias-attn-fc", action="store_false", help="Disable bias in the linear layers", dest="add_bias_attn_fc", ) group.add_argument( "--disable-parallel-output", action="store_false", help="Disable parallel-output", dest="enable_parallel_output", ) group.add_argument( "--task-list", type=str, default="all", help='Either "all" or comma separated list of tasks.', ) group.add_argument( "--verbosity", type=str, default="INFO", help="Logging verbosity", ) group.add_argument( "--adaptive-seq-len", default=False, action="store_true", help="Should the sequence length be adapted to the batch during evaluation," " if in fp16 the results will be slightly different due to numerical" " errors but greatly speed up evaluation.", ) group.add_argument( "--eval-fp32", default=False, action="store_true", help="Should the evaluation run in fp32", ) group.add_argument("--num-fewshot", type=int, default=None, help="num fewshot") group.add_argument( 
"--convert-checkpoint-from-megatron-to-transformers", action="store_true", help=( "If True, convert a Megatron checkpoint to a Transformers checkpoint. " "If False, convert a Transformers checkpoint to a Megatron checkpoint." ), ) patch_if_not_exist( group, "--moe-ffn-hidden-size", type=int, default=None ) group.add_argument("--shared-moe-ffn-hidden-size", type=int, default=None) group.add_argument( "--enable-shared-expert", action="store_true", help="enable-shared-expert" ) patch_if_not_exist( group, "--q-lora-rank", type=int, default=None ) patch_if_not_exist( group, "--kv-lora-rank", type=int, default=None ) patch_if_not_exist( group, "--v-head-dim", type=int, default=None ) group.add_argument("--qk-nope-head-dim", type=int, default=None) group.add_argument("--qk-rope-head-dim", type=int, default=None) group.add_argument("--num-shared-experts", type=int, default=None) patch_if_not_exist( group, "--moe-layer-freq", type=int, default=1 ) patch_if_not_exist( group, "--rotary-scaling-factor", type=int, default=1 ) group.add_argument( "--optimizer-offload-policy", default="static", type=str, help="Optimizer Offload Policy used by OffloadDistributedOptimizer, " "valid if base optimizer is HybridAdam.", ) patch_if_not_exist( group, "--optimizer-offload-fraction", type=float, default=0.5 ) group.add_argument( "--train-mode", default="pretrain", type=str, help="pretrain or finetune" ) group.add_argument( "--optimizer-offload-auto-threshold", type=int, default=2048 * 1024 * 1024, help="Optimizer Offload Threshold currently used by auto policy, " "tune larger if OOM occurs", ) group.add_argument( "--optimizer-offload-chunk-size", type=int, default=32 * 1024 * 1024, help="Chunk size of Chunk Manager in Optimizer Offload," "keep zero to search for a optimal size", ) group.add_argument( "--cpu-offloading", default=False, action="store_true", help="Use activation checkpointing.", ) group.add_argument( "--cpu-offloading-num-layers", type=int, default=0, help="The num of layers to be moved to CPU", ) group.add_argument('--dataset-config', type=str, default=None) group.add_argument("--prompt-path", type=str, default=None) group.add_argument('--freeze-LM', action='store_true', default=False) group.add_argument('--freeze-ViT', action='store_true', default=False) group.add_argument('--language-model-type', type=str, required=False) group.add_argument('--vision-model-type', type=str, default="clip") group.add_argument("--disable-vision-class-token", action="store_true", default=False) group.add_argument( "--allow-missing-vision-projection-checkpoint", action="store_true", default=False ) group.add_argument("--use-te", action="store_true", default=False) group.add_argument( "--dataloader-save", type=str, default=None, help="Energon dataloader state save path" ) group.add_argument( "--use-tiling", action="store_true", default=False, help="Use input image tiling" ) group.add_argument("--max-num-tiles", type=int, default=1, help="Maximum number of image tiles") group.add_argument( "--use-thumbnail", action="store_true", default=False, help="Add image thumbnail as a tile" ) group.add_argument( "--dataloader-seq-length", type=int, help="Make dataloader to produce sequences of specific length.", ) group.add_argument( "--num-frames", type=int, default=1, help="Number of frames to regularly sample from the video as input to the model.", ) group.add_argument( "--online-evaluation-config", type=str, help="Config file for online evaluation." 
) group.add_argument( "--tokenizer-prompt-format", type=str, choices=["mistral", "llama3", "chatml"], required=False, help="Prompt format to use with the tokenizer.", ) group.add_argument( "--special-tokens", nargs="*", default=["<image>"], help="Special tokens used in the multimodal model", ) group.add_argument( "--image-tag-type", type=str, choices=["nvlm", "internvl", ""], default="", # Default: Image tag not used. help="Surround image tokens with tags.", ) return parser