in arctic_inference/vllm/args.py [0:0]
def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
    """Register the Arctic Inference command-line options.

    The incoming parser is first passed through
    ``EngineArgsPatch._orig_add_cli_args`` (presumably the un-patched
    engine-args registration -- confirm against the patch setup). The
    parser it returns is then extended with an "Arctic Inference"
    argument group holding three options:

    * ``--ulysses-sequence-parallel-size`` (int)
    * ``--enable-shift-parallel`` (``store_true`` flag)
    * ``--shift-parallel-threshold`` (int)

    Defaults for the two integer options are read from the matching
    ``ArcticEngineArgs`` class attributes.

    Args:
        parser: Parser handed to the original ``add_cli_args``.

    Returns:
        The parser returned by the original ``add_cli_args``, with the
        Arctic Inference group added.
    """
    extended = EngineArgsPatch._orig_add_cli_args(parser)

    # All Arctic-specific flags live in their own group so they are listed
    # together in the generated help output.
    group = extended.add_argument_group(
        title="Arctic Inference",
        description="Arctic Inference configuration.",
    )

    group.add_argument(
        "--ulysses-sequence-parallel-size",
        type=int,
        default=ArcticEngineArgs.ulysses_sequence_parallel_size,
        help="Number of Ulysses sequence parallel replicas",
    )

    group.add_argument(
        "--enable-shift-parallel",
        action="store_true",
        help="If True, enable shift parallelism.",
    )

    threshold_help = (
        "Ulysses sequence parallel if batch size > threshold, otherwise "
        "tensor parallel across the whole world size"
    )
    group.add_argument(
        "--shift-parallel-threshold",
        type=int,
        default=ArcticEngineArgs.shift_parallel_threshold,
        help=threshold_help,
    )

    return extended