in modules/SwissArmyTransformer/sat/arguments.py [0:0]
def get_args(args_list=None, parser=None):
"""Parse all the args."""
if parser is None:
parser = argparse.ArgumentParser(description='sat')
else:
assert isinstance(parser, argparse.ArgumentParser)
parser = add_model_config_args(parser)
parser = add_training_args(parser)
parser = add_evaluation_args(parser)
parser = add_data_args(parser)
parser = add_tokenization_args(parser)
parser = add_text_generate_args(parser)
# Include DeepSpeed configuration arguments
import deepspeed
parser = deepspeed.add_config_arguments(parser)
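    # Parse args_list if given (argparse falls back to sys.argv); everything below post-processes the resulting namespace.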
    args = parser.parse_args(args_list)
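    # iterable_dataset_eval inherits the training-time iterable_dataset setting when unset; otherwise its string value is interpreted with eval().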
    if not args.iterable_dataset_eval:
        args.iterable_dataset_eval = args.iterable_dataset
    else:
        args.iterable_dataset_eval = eval(args.iterable_dataset_eval)
    if not args.train_data:
        print_rank0('No training data specified', level='WARNING')

    assert (args.train_iters is None) or (args.epochs is None), 'at most one of train_iters and epochs may be set.'
    if args.train_iters is None and args.epochs is None:
        args.train_iters = 10000  # default 10k iters
        print_rank0('Neither train_iters (recommended) nor epochs specified; using the default of 10k iters.', level='WARNING')
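
    # Detect hardware and the distributed environment; RANK / WORLD_SIZE / LOCAL_RANK are set by launchers such as torchrun.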
    args.cuda = torch.cuda.is_available()
    args.rank = int(os.getenv('RANK', '0'))
    args.world_size = int(os.getenv("WORLD_SIZE", '1'))
    if args.local_rank is None:
        args.local_rank = int(os.getenv("LOCAL_RANK", '0'))  # torchrun
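    # Pick the device: CPU when no GPU is visible, otherwise the local rank (or rank modulo GPU count as a fallback).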
    if args.device == -1:  # not set manually
        if torch.cuda.device_count() == 0:
            args.device = 'cpu'
        elif args.local_rank is not None:
            args.device = args.local_rank
        else:
            args.device = args.rank % torch.cuda.device_count()
    # local rank should be consistent with device in DeepSpeed
    if args.local_rank != args.device and args.mode != 'inference':
        raise ValueError(
            'LOCAL_RANK (default 0) and args.device are inconsistent. '
            'Such a mismatch is only allowed in inference mode. '
            'Please use CUDA_VISIBLE_DEVICES=x for single-GPU training.'
        )
    # args.model_parallel_size = min(args.model_parallel_size, args.world_size)
    if args.rank == 0:
        print_rank0('using world size: {} and model-parallel size: {} '.format(
            args.world_size, args.model_parallel_size))
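    # When a vocabulary size is given, _adjust_vocab_size presumably rounds/pads it (see its definition); per-dataset weights must match train_data one-to-one.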
    if args.vocab_size > 0:
        _adjust_vocab_size(args)

    if args.train_data_weights is not None:
        assert len(args.train_data_weights) == len(args.train_data), f'{args.train_data_weights} vs {args.train_data}'
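
    # Any non-inference mode trains with DeepSpeed; without an explicit JSON config, fall back to a bundled ZeRO config and fill it from the args further below.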
    if args.mode != 'inference':  # training with deepspeed
        args.deepspeed = True
        if args.deepspeed_config is None:  # not specified
            args.deepspeed_config = os.path.join(os.path.dirname(__file__), 'training', f'deepspeed_zero{args.zero_stage}.json')
            override_deepspeed_config = True
        else:
            override_deepspeed_config = False
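    # fp16 and bf16 are mutually exclusive; when ZeRO is enabled and neither is set, fp16 is turned on by default.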
    assert not (args.fp16 and args.bf16), 'cannot specify both fp16 and bf16.'
    if args.zero_stage > 0 and not args.fp16 and not args.bf16:
        print_rank0('Automatically set fp16=True to use ZeRO.')
        args.fp16 = True
        args.bf16 = False
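    # Mirror the checkpoint_activations flag into DeepSpeed's activation-checkpointing switch.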
    if args.deepspeed:
        if args.checkpoint_activations:
            args.deepspeed_activation_checkpointing = True
        else:
            args.deepspeed_activation_checkpointing = False
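        # Load the DeepSpeed JSON config and reconcile it with the parsed args: a bundled default is filled from the args, while a user-supplied config overrides them.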
        if args.deepspeed_config is not None:
            with open(args.deepspeed_config) as file:
                deepspeed_config = json.load(file)
            if override_deepspeed_config:  # deepspeed_config not specified: fill the default config from the args
                if args.fp16:
                    deepspeed_config["fp16"]["enabled"] = True
                elif args.bf16:
                    deepspeed_config["bf16"]["enabled"] = True
                    deepspeed_config["fp16"]["enabled"] = False
                else:
                    deepspeed_config["fp16"]["enabled"] = False
                deepspeed_config["train_micro_batch_size_per_gpu"] = args.batch_size
                deepspeed_config["gradient_accumulation_steps"] = args.gradient_accumulation_steps
                optimizer_params_config = deepspeed_config["optimizer"]["params"]
                optimizer_params_config["lr"] = args.lr
                optimizer_params_config["weight_decay"] = args.weight_decay
            else:  # override args with values in deepspeed_config
                if args.rank == 0:
                    print_rank0('Will override arguments with manually specified deepspeed_config!')
                if "fp16" in deepspeed_config and deepspeed_config["fp16"]["enabled"]:
                    args.fp16 = True
                else:
                    args.fp16 = False
                if "bf16" in deepspeed_config and deepspeed_config["bf16"]["enabled"]:
                    args.bf16 = True
                else:
                    args.bf16 = False
                if "train_micro_batch_size_per_gpu" in deepspeed_config:
                    args.batch_size = deepspeed_config["train_micro_batch_size_per_gpu"]
                if "gradient_accumulation_steps" in deepspeed_config:
                    args.gradient_accumulation_steps = deepspeed_config["gradient_accumulation_steps"]
                else:
                    args.gradient_accumulation_steps = None
                if "optimizer" in deepspeed_config:
                    optimizer_params_config = deepspeed_config["optimizer"].get("params", {})
                    args.lr = optimizer_params_config.get("lr", args.lr)
                    args.weight_decay = optimizer_params_config.get("weight_decay", args.weight_decay)
            args.deepspeed_config = deepspeed_config
    # if args.sandwich_ln: # removed in v0.3
    #     args.layernorm_order = 'sandwich'
    # initialize distributed and random seed because it always seems to be necessary.
    initialize_distributed(args)
    set_random_seed(args.seed)
    return args
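
# Illustrative usage (not part of the original file; flag spellings are defined by the
# add_*_args helpers above and may differ):
#     from sat.arguments import get_args
#     args = get_args()                                    # parse sys.argv at startup
#     args = get_args(args_list=['--mode', 'inference'])   # or pass flags explicitly, e.g. in tests
# After the call, fields such as args.device, args.fp16 / args.bf16 and args.deepspeed_config are
# fully resolved, and distributed setup plus seeding have already run (initialize_distributed,
# set_random_seed above).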