in fastchat/serve/cli.py [0:0]
def main(args):
if args.gpus:
if len(args.gpus.split(",")) < args.num_gpus:
raise ValueError(
f"Larger --num-gpus ({args.num_gpus}) than --gpus {args.gpus}!"
)
os.environ["CUDA_VISIBLE_DEVICES"] = args.gpus
os.environ["XPU_VISIBLE_DEVICES"] = args.gpus
if args.enable_exllama:
exllama_config = ExllamaConfig(
max_seq_len=args.exllama_max_seq_len,
gpu_split=args.exllama_gpu_split,
cache_8bit=args.exllama_cache_8bit,
)
else:
exllama_config = None
if args.enable_xft:
xft_config = XftConfig(
max_seq_len=args.xft_max_seq_len,
data_type=args.xft_dtype,
)
if args.device != "cpu":
print("xFasterTransformer now is only support CPUs. Reset device to CPU")
args.device = "cpu"
else:
xft_config = None
if args.style == "simple":
chatio = SimpleChatIO(args.multiline)
elif args.style == "rich":
chatio = RichChatIO(args.multiline, args.mouse)
elif args.style == "programmatic":
chatio = ProgrammaticChatIO()
else:
raise ValueError(f"Invalid style for console: {args.style}")
try:
chat_loop(
args.model_path,
args.device,
args.num_gpus,
args.max_gpu_memory,
str_to_torch_dtype(args.dtype),
args.load_8bit,
args.cpu_offloading,
args.conv_template,
args.conv_system_msg,
args.temperature,
args.repetition_penalty,
args.max_new_tokens,
chatio,
gptq_config=GptqConfig(
ckpt=args.gptq_ckpt or args.model_path,
wbits=args.gptq_wbits,
groupsize=args.gptq_groupsize,
act_order=args.gptq_act_order,
),
awq_config=AWQConfig(
ckpt=args.awq_ckpt or args.model_path,
wbits=args.awq_wbits,
groupsize=args.awq_groupsize,
),
exllama_config=exllama_config,
xft_config=xft_config,
revision=args.revision,
judge_sent_end=args.judge_sent_end,
debug=args.debug,
history=not args.no_history,
)
except KeyboardInterrupt:
print("exit...")