bench/generation/evaluate_model.py
import argparse

import torch

# Assumed import: the evaluate() helper called below is expected to be defined
# in a sibling module of this benchmark script.
from evaluate import evaluate


def main():
    parser = argparse.ArgumentParser(description="Evaluate quantized model metrics")
    parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/opt-350m",
        help="The name of the trained model.",
    )
    parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
    parser.add_argument(
        "--metric",
        type=str,
        default="prediction",
        choices=["latency", "prediction", "perplexity"],
        help="The metric to evaluate.",
    )
    parser.add_argument(
        "--quantizer",
        type=str,
        default="quanto",
        choices=["quanto", "awq", "bnb", "hqq"],
        help="The quantization library to use.",
    )
    parser.add_argument(
        "--weights",
        type=str,
        default="none",
        choices=["none", "int4", "int8", "float8"],
        help="The data type of the quantized weights.",
    )
    parser.add_argument(
        "--activations",
        type=str,
        default="none",
        choices=["none", "int8", "float8"],
        help="The data type of the quantized activations.",
    )
    parser.add_argument("--batch_size", type=int, default=32, help="The batch size during evaluation.")
    parser.add_argument(
        "--dtype",
        type=str,
        default="none",
        choices=["none", "fp16", "bf16"],
        help="The torch dtype used to load the model.",
    )
    args = parser.parse_args()
    torch.manual_seed(args.seed)
    # Pick the best available accelerator when no device is specified.
    if args.device is None:
        if torch.cuda.is_available():
            device = torch.device("cuda")
        elif torch.backends.mps.is_available():
            device = torch.device("mps")
        elif torch.xpu.is_available():
            device = torch.device("xpu")
        else:
            device = torch.device("cpu")
    else:
        device = torch.device(args.device)
    # Map the CLI flag to a torch dtype; None keeps the checkpoint's dtype.
    dtype = {"none": None, "fp16": torch.float16, "bf16": torch.bfloat16}[args.dtype]
    evaluate(args.model, args.metric, args.quantizer, args.weights, args.activations, args.batch_size, device, dtype)


if __name__ == "__main__":
    main()
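
# Example invocation (model name and flag values are illustrative, not exhaustive):
#   python bench/generation/evaluate_model.py \
#       --model facebook/opt-350m --metric perplexity \
#       --quantizer quanto --weights int8 --dtype fp16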