in distilvit/quantize.py [0:0]
import json
import os

import onnx
from onnxruntime.quantization import QuantType, quantize_dynamic
from tqdm import tqdm

# NOTE: get_operators() is assumed to be defined elsewhere in this module; it
# returns the set of operator types used in an ONNX model's graph.


def quantize(model_names_or_paths, **quantize_kwargs):
"""
Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU
Uses unsigned ints for activation values, signed ints for weights, per
https://onnxruntime.ai/docs/performance/quantization.html#data-type-selection
it is faster on most CPU architectures
Args:
onnx_model_path: Path to location the exported ONNX model is stored
Returns: The Path generated for the quantized
"""
quantize_config = dict(**quantize_kwargs, per_model_config={})
for model in tqdm(model_names_or_paths, desc="Quantizing"):
directory_path = os.path.dirname(model)
file_name_without_extension = os.path.splitext(os.path.basename(model))[0]
# NOTE:
# As of 2023/04/20, the current latest version of onnxruntime-web is 1.14.0, and does not support INT8 weights for Conv layers.
# For this reason, we choose model weight types to ensure compatibility with onnxruntime-web.
#
        # As per the docs, the signed weight type (QInt8) is faster on most CPUs,
        # so we use it unless the model contains a Conv layer.
# For more information, see:
# - https://github.com/microsoft/onnxruntime/issues/3130#issuecomment-1105200621
# - https://github.com/microsoft/onnxruntime/issues/2339
loaded_model = onnx.load_model(model)
op_types = get_operators(loaded_model)
weight_type = QuantType.QUInt8 if "Conv" in op_types else QuantType.QInt8
quantize_dynamic(
model_input=model,
model_output=os.path.join(
directory_path, f"{file_name_without_extension}_quantized.onnx"
),
weight_type=weight_type,
optimize_model=False,
# TODO allow user to specify these
# op_types_to_quantize=['MatMul', 'Add', 'Conv'],
extra_options=dict(EnableSubgraph=True),
**quantize_kwargs,
)
quantize_config["per_model_config"][file_name_without_extension] = dict(
op_types=list(op_types),
weight_type=str(weight_type),
)
# Save quantization config
with open(os.path.join(directory_path, "quantize_config.json"), "w") as fp:
json.dump(quantize_config, fp, indent=4)
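

# The helper below is a minimal sketch, assuming get_operators() simply walks
# the graph (including control-flow subgraphs such as If/Loop bodies) and
# collects every node's op_type. The real helper is expected to be defined
# elsewhere in this module; this illustrative version shows why a Conv anywhere
# in the graph forces the QUInt8 weight type above.
def _get_operators_sketch(model):
    op_types = set()

    def traverse(graph):
        for node in graph.node:
            op_types.add(node.op_type)
            # Recurse into subgraph attributes (e.g. If/Loop bodies).
            for attr in node.attribute:
                if attr.type == onnx.AttributeProto.GRAPH:
                    traverse(attr.g)
                elif attr.type == onnx.AttributeProto.GRAPHS:
                    for subgraph in attr.graphs:
                        traverse(subgraph)

    traverse(model.graph)
    return op_types


# Hypothetical usage: the paths below are placeholders, not files shipped with
# this repo. Extra keyword arguments are forwarded to quantize_dynamic.
if __name__ == "__main__":
    quantize(
        [
            "models/encoder_model.onnx",
            "models/decoder_model_merged.onnx",
        ],
        per_channel=False,
        reduce_range=False,
    )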