in optimum/commands/onnxruntime/quantize.py
def run(self):
    from ...onnxruntime.configuration import AutoQuantizationConfig, ORTConfig
    from ...onnxruntime.quantization import ORTQuantizer

    # Quantized models must not be written into the directory holding the original ONNX files.
    if self.args.output == self.args.onnx_model:
        raise ValueError("The output directory must be different from the directory hosting the ONNX model.")
    save_dir = self.args.output

    use_external_data_format = False
    # Build one quantizer per ONNX file found in the model directory, so that
    # multi-file exports (e.g. separate encoder/decoder models) are all quantized.
    quantizers = [
        ORTQuantizer.from_pretrained(self.args.onnx_model, file_name=model.name)
        for model in self.args.onnx_model.glob("*.onnx")
    ]
    # Pick a predefined dynamic-quantization config for the target instruction set,
    # or fall back to a user-provided ORTConfig file.
    if self.args.arm64:
        qconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=self.args.per_channel)
    elif self.args.avx2:
        qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=self.args.per_channel)
    elif self.args.avx512:
        qconfig = AutoQuantizationConfig.avx512(is_static=False, per_channel=self.args.per_channel)
    elif self.args.avx512_vnni:
        qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=self.args.per_channel)
    elif self.args.tensorrt:
        raise ValueError(
            "TensorRT quantization relies on static quantization, which requires calibration and is"
            " currently not supported through optimum-cli. Please adapt the Optimum static quantization"
            " examples to run static quantization for TensorRT:"
            " https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/quantization"
        )
    else:
        # No instruction-set flag given: load the quantization settings from an ORTConfig file.
        config = ORTConfig.from_pretrained(self.args.config)
        qconfig = config.quantization
        use_external_data_format = config.use_external_data_format
    # Apply the same quantization config to every discovered ONNX file.
    for q in quantizers:
        q.quantize(
            save_dir=save_dir, quantization_config=qconfig, use_external_data_format=use_external_data_format
        )
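
For reference, the same dynamic quantization can be run directly from Python without the CLI wrapper. The minimal sketch below mirrors the avx512 branch above; the directory names ("model_onnx", "model_quantized") and the assumption of a single exported model.onnx are hypothetical, not taken from the source.

from optimum.onnxruntime import ORTQuantizer
from optimum.onnxruntime.configuration import AutoQuantizationConfig

# Hypothetical input directory containing a single exported model.onnx.
quantizer = ORTQuantizer.from_pretrained("model_onnx", file_name="model.onnx")

# Same call the avx512 branch makes: dynamic (is_static=False) quantization.
qconfig = AutoQuantizationConfig.avx512(is_static=False, per_channel=False)

# Writes the quantized model into the (hypothetical) "model_quantized" directory.
quantizer.quantize(save_dir="model_quantized", quantization_config=qconfig)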