in utils/utils.py [0:0]
def get_params():
parser = argparse.ArgumentParser()
parser.add_argument(
"--frame",
help="communication framework",
choices=["Megatron", "DeepSpeed", "collective_test"],
default="Megatron",
)
parser.add_argument("--gpu_type", type=str, default=None),
parser.add_argument("--world_size", type=int, default=1,
help="Number of GPUs")
parser.add_argument("--tensor_model_parallel_size", type=int, default=1,
help='Degree of tensor model parallelism.')
parser.add_argument("--pipeline_model_parallel", type=int, default=1,
help='Degree of pipeline model parallelism.')
parser.add_argument('--context-parallel-size', type=int, default=1,
help='Degree of context parallelism.')
parser.add_argument("--pp_rank", type=int, default=-1,
help='Rank where encoder and decoder should be split.')
parser.add_argument("--global_batch", type=int, default=4,
help='Training batch size. If set, it should be a '
'multiple of micro-batch-size times data-parallel-size. '
'If this value is None, then '
'use micro-batch-size * data-parallel-size as the '
'global batch size. This choice will result in 1 for '
'number of micro-batches.')
parser.add_argument("--micro_batch", type=int, default=1,
help='Batch size per model instance (local batch size). '
'Global batch size is local batch size times data '
'parallel size times number of micro batches.'
)
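    # Illustrative relationship between the batch-size flags (values assumed,
    # not defaults of this script):
    #   global_batch = micro_batch * data_parallel_size * num_microbatches
    #   e.g. micro_batch=1, dp=4, num_microbatches=16  ->  global_batch=64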
parser.add_argument("--epoch_num", type=int, default=1,
help="Number of iterations")
parser.add_argument("--computation_enable", action="store_true", help="Enable computation")
parser.add_argument("--dtype", default="bfloat16")
parser.add_argument(
"--ffn_hidden_size",
type=int,
default=None,
help="Transformer Feed-Forward Network hidden size. "
"This is set to 4*hidden-size if not provided",
)
parser.add_argument(
"--enable_visual",
action="store_true",
help="Enable visualization",
)
parser.add_argument("--workload_only", action="store_true", help="Only generate workload")
get_model_params(parser)
get_ds_params(parser)
get_megatron_params(parser)
get_collective_test_params(parser)
get_moe_params(parser)
get_simAI_workload_params(parser)
get_aiob_params(parser)
args = parser.parse_args()
assert (
args.world_size % (args.tensor_model_parallel_size * args.pipeline_model_parallel) == 0
), f"world size: {args.world_size}, tp: {args.tensor_model_parallel_size}, pp: {args.pipeline_model_parallel}"
    if args.moe_enable:
        assert (
            args.enable_sequence_parallel
        ), "MoE must be enabled together with sequence parallelism"
args.dp_num = args.world_size // (args.tensor_model_parallel_size * args.pipeline_model_parallel)
# assert args.global_batch % (args.dp_num * args.micro_batch) == 0, \
# f"global_batch: {args.global_batch}, dp: {args.dp_num}, micro_batch: {args.micro_batch}"
args.num_microbatches = args.global_batch // (args.dp_num * args.micro_batch)
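    # Worked example of the derivation above (assumed values):
    #   world_size=16, tensor_model_parallel_size=2, pipeline_model_parallel=2
    #     -> dp_num = 16 // (2 * 2) = 4
    #   global_batch=64, micro_batch=1 -> num_microbatches = 64 // (4 * 1) = 16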
if args.aiob_enable and not args.computation_enable:
args.computation_enable = True
if args.num_attention_heads is None:
args.num_attention_heads = args.num_layers
args.padded_vocab_size = get_padded_vocab_size(args)
if args.ffn_hidden_size is None:
        if args.swiglu:
            # Reduce the dimension of the MLP since the projection happens on
            # two linear layers; this keeps the number of parameters in
            # the same ballpark as the counterpart with 4*h size.
            # We keep it a multiple of 64, which means the actual tensor size
            # will be a multiple of 64 / tp_size.
            args.ffn_hidden_size = int((4 * args.hidden_size * 2 / 3) / 64) * 64
else:
args.ffn_hidden_size = 4 * args.hidden_size
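    # Example of the sizing above (hidden_size assumed to be 4096):
    #   SwiGLU:  int((4 * 4096 * 2 / 3) / 64) * 64 = int(170.67) * 64 = 10880
    #   default: 4 * 4096 = 16384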
if args.swiglu:
args.gated_linear_unit = True
args.bias_gelu_fusion = False
# Expert parallelism check
    if args.expert_model_parallel_size > 1:
        assert args.num_experts is not None, \
            "num_experts must not be None when using expert model parallelism"
        assert args.num_experts % args.expert_model_parallel_size == 0, \
            "Number of experts should be a multiple of expert_model_parallel_size."
        assert args.dtype != "float16", \
            "Expert parallelism is not supported with fp16 training."
if args.moe_grouped_gemm:
assert args.dtype == "bfloat16", 'Currently GroupedGEMM for MoE only supports bf16 dtype.'
    if args.pipeline_model_parallel > 1:
        args.num_layers = int(args.num_layers // args.pipeline_model_parallel)
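        # Example (assumed values): num_layers=32, pipeline_model_parallel=4
        #   -> each pipeline stage keeps 32 // 4 = 8 layers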
return args
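
# Minimal usage sketch (not part of the original module): how the parsed and
# derived fields might be consumed. The script name and flag values below are
# assumptions for illustration only; the helper parsers (get_model_params,
# get_megatron_params, etc.) must be importable for get_params() to run.
#
#   python run.py --frame Megatron --world_size 16 \
#       --tensor_model_parallel_size 2 --pipeline_model_parallel 2 \
#       --global_batch 64 --micro_batch 1
#
#   args = get_params()
#   print(args.dp_num, args.num_microbatches)   # e.g. 4 16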