in utils/utils.py [0:0]
def get_params():
parser = argparse.ArgumentParser()
parser.add_argument(
"--frame",
help="communication framework",
choices=["Megatron", "DeepSpeed", "collective_test"],
default="Megatron",
)
parser.add_argument("--gpu_type", type=str, default=None),
parser.add_argument("--world_size", type=int, default=1,
help="Number of GPUs")
parser.add_argument("--tensor_model_parallel_size", type=int, default=1,
help='Degree of tensor model parallelism.')
parser.add_argument("--pipeline_model_parallel", type=int, default=1,
help='Degree of pipeline model parallelism.')
parser.add_argument('--context-parallel-size', type=int, default=1,
help='Degree of context parallelism.')
parser.add_argument("--pp_rank", type=int, default=-1,
help='Rank where encoder and decoder should be split.')
parser.add_argument("--global_batch", type=int, default=4,
help='Training batch size. If set, it should be a '
'multiple of micro-batch-size times data-parallel-size. '
'If this value is None, then '
'use micro-batch-size * data-parallel-size as the '
'global batch size. This choice will result in 1 for '
'number of micro-batches.')
parser.add_argument("--micro_batch", type=int, default=1,
help='Batch size per model instance (local batch size). '
'Global batch size is local batch size times data '
'parallel size times number of micro batches.'
)
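    # Illustrative relationship between the batch-size flags (values assumed,
    # not defaults of this script):
    #   global_batch = micro_batch * data_parallel_size * num_microbatches
    #   e.g. micro_batch=1, dp=4, num_microbatches=16  ->  global_batch=64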
parser.add_argument("--epoch_num", type=int, default=1,
help="Number of iterations")
parser.add_argument("--computation_enable", action="store_true", help="Enable computation")
parser.add_argument("--dtype", default="bfloat16")
parser.add_argument(
"--ffn_hidden_size",
type=int,
default=None,
help="Transformer Feed-Forward Network hidden size. "
"This is set to 4*hidden-size if not provided",
)
parser.add_argument(
"--enable_visual",
action="store_true",
help="Enable visualization",
)
parser.add_argument("--workload_only", action="store_true", help="Only generate workload")
get_model_params(parser)
get_ds_params(parser)
get_megatron_params(parser)
get_collective_test_params(parser)
get_moe_params(parser)
get_simAI_workload_params(parser)
get_aiob_params(parser)
args = parser.parse_args()
assert (
args.world_size % (args.tensor_model_parallel_size * args.pipeline_model_parallel) == 0
), f"world size: {args.world_size}, tp: {args.tensor_model_parallel_size}, pp: {args.pipeline_model_parallel}"
    if args.moe_enable:
        assert (
            args.enable_sequence_parallel
        ), "MoE must be enabled together with sequence parallelism"
args.dp_num = args.world_size // (args.tensor_model_parallel_size * args.pipeline_model_parallel)
# assert args.global_batch % (args.dp_num * args.micro_batch) == 0, \
# f"global_batch: {args.global_batch}, dp: {args.dp_num}, micro_batch: {args.micro_batch}"
args.num_microbatches = args.global_batch // (args.dp_num * args.micro_batch)
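    # Worked example of the derivation above (assumed values):
    #   world_size=16, tensor_model_parallel_size=2, pipeline_model_parallel=2
    #     -> dp_num = 16 // (2 * 2) = 4
    #   global_batch=64, micro_batch=1 -> num_microbatches = 64 // (4 * 1) = 16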
if args.aiob_enable and not args.computation_enable:
args.computation_enable = True
if args.num_attention_heads is None:
args.num_attention_heads = args.num_layers
args.padded_vocab_size = get_padded_vocab_size(args)
if args.ffn_hidden_size is None:
        if args.swiglu:
            # Reduce the dimension of the MLP since the projection happens on
            # two linear layers; this keeps the number of parameters in
            # the same ballpark as the counterpart with 4*h size.
            # We keep it a multiple of 64, which means the actual tensor size
            # will be a multiple of 64 / tp_size.
            args.ffn_hidden_size = int((4 * args.hidden_size * 2 / 3) / 64) * 64
else:
args.ffn_hidden_size = 4 * args.hidden_size
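    # Example of the sizing above (hidden_size assumed to be 4096):
    #   SwiGLU:  int((4 * 4096 * 2 / 3) / 64) * 64 = int(170.67) * 64 = 10880
    #   default: 4 * 4096 = 16384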
if args.swiglu:
args.gated_linear_unit = True
args.bias_gelu_fusion = False
# Expert parallelism check
    if args.expert_model_parallel_size > 1:
        assert args.num_experts is not None, \
            "num_experts must not be None when using expert model parallelism"
        assert args.num_experts % args.expert_model_parallel_size == 0, \
            "Number of experts should be a multiple of expert_model_parallel_size."
        assert args.dtype != "float16", \
            "Expert parallelism is not supported with fp16 training."
if args.moe_grouped_gemm:
assert args.dtype == "bfloat16", 'Currently GroupedGEMM for MoE only supports bf16 dtype.'
    if args.pipeline_model_parallel > 1:
        args.num_layers = int(args.num_layers // args.pipeline_model_parallel)
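        # Example (assumed values): num_layers=32, pipeline_model_parallel=4
        #   -> each pipeline stage keeps 32 // 4 = 8 layers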
return args
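
# Minimal usage sketch (not part of the original module): how the parsed and
# derived fields might be consumed. The script name and flag values below are
# assumptions for illustration only; the helper parsers (get_model_params,
# get_megatron_params, etc.) must be importable for get_params() to run.
#
#   python run.py --frame Megatron --world_size 16 \
#       --tensor_model_parallel_size 2 --pipeline_model_parallel 2 \
#       --global_batch 64 --micro_batch 1
#
#   args = get_params()
#   print(args.dp_num, args.num_microbatches)   # e.g. 4 16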