bench/generation/evaluate_configurations.py
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import json

import torch
from evaluate_model import evaluate
from gen_barchart import gen_barchart
from transformers import AutoConfig


def evaluate_model_configurations(
    model_id: str, metric: str, device: torch.device, batch_size: int = 32, dtype: torch.dtype = torch.float16
):
    """Evaluate the model under each quantization configuration and return results keyed by configuration name."""
    # Weight quantization types to evaluate
    weights = [
        "int4",
        "int8",
        "float8",
    ]
    # Activation quantization types to evaluate ("none" keeps activations in the model dtype)
    activations = [
        "none",
        "float8",
    ]

    def short_name(qtype: str):
        # Map a quantization type name to the short label used in configuration names
        return {
            "none": "f16" if dtype == torch.float16 else "bf16",
            "int4": "i4",
            "int8": "i8",
            "float8": "f8",
        }[qtype]

results = {}
# Evaluate float16/bfloat16 model
config_name = f"W{short_name('none')}A{short_name('none')}"
print(f"{model_id}[{config_name}]:")
results[config_name] = evaluate(model_id, metric, "quanto", "none", "none", batch_size, device, dtype)
# Evaluate quantized models
for w in weights:
for a in activations:
config_name = f"W{short_name(w)}A{short_name(a)}"
print(f"{model_id}[{config_name}]:")
results[config_name] = evaluate(model_id, metric, "quanto", w, a, batch_size, device, dtype)
return results
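
# Illustrative example (hypothetical metric values): for a float16 model evaluated on the
# "prediction" metric, evaluate_model_configurations returns a dict keyed by configuration
# name, e.g.
#   {"Wf16Af16": 0.71, "Wi4Af16": 0.69, "Wi4Af8": 0.68, "Wi8Af16": 0.70, "Wf8Af8": 0.67, ...}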


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate a model under several quantization configurations (latency, prediction, perplexity)."
    )
parser.add_argument("--seed", type=int, default=1, metavar="S", help="random seed (default: 1)")
    parser.add_argument(
        "--model",
        type=str,
        default="facebook/opt-350m",
        help="The name of the model to evaluate.",
    )
parser.add_argument("--device", type=str, default=None, help="The device to use for generation.")
parser.add_argument("--metric", type=str, default="prediction", choices=["latency", "prediction", "perplexity"])
parser.add_argument("--batch_size", type=int, default=32, help="The batch size during evaluation.")
parser.add_argument("--dtype", type=str, help="Use the following dtype to load the model.")
parser.add_argument("--json", action="store_true", help="Dump the results to a json file.")
parser.add_argument("--png", action="store_true", help="Generate a PNG.")
args = parser.parse_args()
torch.manual_seed(args.seed)
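    # Pick the best available accelerator when no device is specified: CUDA, then MPS, then XPU, then CPU.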
if args.device is None:
if torch.cuda.is_available():
device = torch.device("cuda")
elif torch.backends.mps.is_available():
device = torch.device("mps")
elif torch.xpu.is_available():
device = torch.device("xpu")
else:
device = torch.device("cpu")
else:
device = torch.device(args.device)
    # Infer the dtype from the model configuration unless one was passed explicitly.
    if args.dtype is None:
        config = AutoConfig.from_pretrained(args.model)
        dtype = getattr(config, "torch_dtype", None) or torch.float16
    else:
        dtype = torch.float16 if args.dtype == "fp16" else torch.bfloat16
results = evaluate_model_configurations(args.model, args.metric, device, batch_size=args.batch_size, dtype=dtype)
if args.json:
model_name = args.model.split("/")[-1]
json_path = f"{model_name}-{args.metric}.json"
with open(json_path, "w") as fp:
json.dump({model_name: results}, fp, indent=4)
if args.png:
if args.metric == "latency":
title = f"{args.model}: Mean latency per token"
label = "Latency (ms)"
elif args.metric == "prediction":
title = f"{args.model}: Prediction accuracy on Lambada dataset"
label = "Accuracy"
elif args.metric == "perplexity":
title = f"{args.model}: Perplexity evaluated on WikiText dataset"
label = "Perplexity"
gen_barchart(args.model, title, label, results, dtype)


if __name__ == "__main__":
main()
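
# Example invocations (illustrative; adjust the model, metric and flags as needed):
#
#   python bench/generation/evaluate_configurations.py --model facebook/opt-350m --metric prediction --png
#   python bench/generation/evaluate_configurations.py --model facebook/opt-350m --metric latency --device cuda --json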