optimum/commands/onnxruntime/quantize.py

# coding=utf-8
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Quantization with ONNX Runtime command-line interface class."""

from pathlib import Path
from typing import TYPE_CHECKING

from optimum.commands import BaseOptimumCLICommand


if TYPE_CHECKING:
    from argparse import ArgumentParser


def parse_args_onnxruntime_quantize(parser: "ArgumentParser"):
    required_group = parser.add_argument_group("Required arguments")
    required_group.add_argument(
        "--onnx_model",
        type=Path,
        required=True,
        help="Path to the directory containing the ONNX models to quantize.",
    )
    required_group.add_argument(
        "-o",
        "--output",
        type=Path,
        required=True,
        help="Path to the directory where the generated ONNX models will be stored.",
    )

    optional_group = parser.add_argument_group("Optional arguments")
    optional_group.add_argument(
        "--per_channel",
        action="store_true",
        help="Compute the quantization parameters on a per-channel basis.",
    )

    level_group = parser.add_mutually_exclusive_group(required=True)
    level_group.add_argument("--arm64", action="store_true", help="Quantization for the ARM64 architecture.")
    level_group.add_argument("--avx2", action="store_true", help="Quantization with AVX-2 instructions.")
    level_group.add_argument("--avx512", action="store_true", help="Quantization with AVX-512 instructions.")
    level_group.add_argument(
        "--avx512_vnni", action="store_true", help="Quantization with AVX-512 and VNNI instructions."
    )
    level_group.add_argument("--tensorrt", action="store_true", help="Quantization for the NVIDIA TensorRT optimizer.")
    level_group.add_argument(
        "-c",
        "--config",
        type=Path,
        help="`ORTConfig` file to use to quantize the model.",
    )


class ONNXRuntimeQuantizeCommand(BaseOptimumCLICommand):
    @staticmethod
    def parse_args(parser: "ArgumentParser"):
        return parse_args_onnxruntime_quantize(parser)

    def run(self):
        from ...onnxruntime.configuration import AutoQuantizationConfig, ORTConfig
        from ...onnxruntime.quantization import ORTQuantizer

        if self.args.output == self.args.onnx_model:
            raise ValueError("The output directory must be different from the directory hosting the ONNX model.")

        save_dir = self.args.output
        use_external_data_format = False

        # Build one quantizer per ONNX file found in the input directory.
        quantizers = [
            ORTQuantizer.from_pretrained(self.args.onnx_model, file_name=model.name)
            for model in self.args.onnx_model.glob("*.onnx")
        ]

        # Pick the dynamic quantization configuration matching the requested target.
        if self.args.arm64:
            qconfig = AutoQuantizationConfig.arm64(is_static=False, per_channel=self.args.per_channel)
        elif self.args.avx2:
            qconfig = AutoQuantizationConfig.avx2(is_static=False, per_channel=self.args.per_channel)
        elif self.args.avx512:
            qconfig = AutoQuantizationConfig.avx512(is_static=False, per_channel=self.args.per_channel)
        elif self.args.avx512_vnni:
            qconfig = AutoQuantizationConfig.avx512_vnni(is_static=False, per_channel=self.args.per_channel)
        elif self.args.tensorrt:
            raise ValueError(
                "TensorRT quantization relies on static quantization, which requires calibration and is currently "
                "not supported through optimum-cli. Please adapt the Optimum static quantization examples to run "
                "static quantization for TensorRT: "
                "https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/quantization"
            )
        else:
            # Fall back to a user-provided `ORTConfig` file.
            config = ORTConfig.from_pretrained(self.args.config)
            qconfig = config.quantization
            use_external_data_format = config.use_external_data_format

        for quantizer in quantizers:
            quantizer.quantize(
                save_dir=save_dir, quantization_config=qconfig, use_external_data_format=use_external_data_format
            )
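
# A minimal usage sketch: with the argument parser above registered under
# `optimum-cli onnxruntime quantize`, dynamic AVX-512 quantization of every
# *.onnx file in a directory would look like this (the input and output paths
# here are hypothetical):
#
#   optimum-cli onnxruntime quantize --onnx_model ./onnx_models --avx512 -o ./quantized_models
#
# Exactly one target flag (--arm64, --avx2, --avx512, --avx512_vnni, --tensorrt,
# or -c/--config) must be given, since they form a required mutually exclusive group.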