in scripts/quantize.py
# External imports used below; helpers such as QuantMode, QuantizationArguments,
# QUANTIZE_SUFFIX_MAPPING, QUINT8_OPS, get_operators and the quantize_* functions
# are defined elsewhere in this file.
import os

import onnx
from onnxruntime.quantization import QuantType
from onnxruntime.quantization.matmul_bnb4_quantizer import MatMulBnb4Quantizer
from tqdm import tqdm


def quantize(input_folder, output_folder, quantization_args: QuantizationArguments):
    # (Step 1) Validate the arguments
    if not quantization_args.modes:
        raise ValueError("At least one quantization mode must be specified")
    if not os.path.exists(input_folder):
        raise ValueError(f"Input folder {input_folder} does not exist")

    model_names_or_paths = [
        os.path.join(input_folder, file)
        for file in os.listdir(input_folder)
        if file.endswith(".onnx")
    ]
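    # NOTE: os.listdir is non-recursive, so only .onnx files at the top level
    # of input_folder are picked up; models in subfolders are ignored.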
    if not model_names_or_paths:
        raise ValueError(f"No .onnx models found in {input_folder}")

    os.makedirs(output_folder, exist_ok=True)

    # (Step 2) Quantize the models
    for model_path in (progress_models := tqdm(model_names_or_paths)):
        progress_models.set_description(f"Processing {model_path}")

        file_name_without_extension = os.path.splitext(os.path.basename(model_path))[0]
        for mode in (progress := tqdm(quantization_args.modes)):
            progress.set_description(f" - Quantizing to {mode}")
            mode = QuantMode(mode)
            suffix = QUANTIZE_SUFFIX_MAPPING.get(mode, mode.value)
            save_path = os.path.join(
                output_folder,
                f"{file_name_without_extension}_{suffix}.onnx",
            )
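
            # Illustrative example: with mode "q4" and no override in
            # QUANTIZE_SUFFIX_MAPPING, "decoder_model.onnx" is written to
            # "<output_folder>/decoder_model_q4.onnx".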

            # NOTE: Unfortunately, we need to reload the model for each quantization mode,
            # which is memory inefficient. This is because the quantization functions
            # modify the model in-place, and we need to keep the original model for each mode.
            model = onnx.load_model(model_path)
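
            # FP16: convert the whole model to float16. Op types listed in
            # op_block_list are excluded from the conversion (kept in float32),
            # the usual escape hatch for ops that overflow in half precision.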
            if mode == QuantMode.FP16:
                quantize_fp16(
                    model,
                    save_path,
                    quantization_args.op_block_list,
                )
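
            # Q4 / Q4F16: 4-bit block quantization of the MatMul weights. For
            # Q4F16, the 4-bit model is kept in memory (save_path=None) so the
            # fp16 pass below can convert the remaining float tensors before
            # the final model is written.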
            elif mode in (QuantMode.Q4, QuantMode.Q4F16):
                block_size = quantization_args.block_size or 32
                q4_model = quantize_q4(
                    model,
                    save_path=None if mode == QuantMode.Q4F16 else save_path,
                    block_size=block_size,
                    is_symmetric=quantization_args.is_symmetric,
                    accuracy_level=quantization_args.accuracy_level,
                )
                if mode == QuantMode.Q4F16:
                    quantize_fp16(
                        q4_model,
                        save_path,
                        quantization_args.op_block_list,
                    )
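
            # BNB4: bitsandbytes-style 4-bit MatMul quantization via
            # onnxruntime's MatMulBnb4Quantizer, defaulting to the NF4 data
            # type and a larger 64-element block size when not specified.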
            elif mode == QuantMode.BNB4:
                quantize_bnb4(
                    model,
                    save_path,
                    block_size=quantization_args.block_size or 64,
                    quant_type=(
                        quantization_args.quant_type
                        if quantization_args.quant_type is not None
                        else MatMulBnb4Quantizer.NF4
                    ),
                )
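
            # Q8 / QI8 / QU8: 8-bit weight quantization. QI8 and QU8 force
            # signed and unsigned weights respectively; plain Q8 picks unsigned
            # if the graph contains any op listed in QUINT8_OPS (defined
            # earlier in this file) and signed otherwise.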
            elif mode in (QuantMode.Q8, QuantMode.QI8, QuantMode.QU8):
                if mode == QuantMode.Q8:
                    op_types = get_operators(model)
                    weight_type = (
                        QuantType.QUInt8
                        if any(x in QUINT8_OPS for x in op_types)
                        else QuantType.QInt8
                    )
                elif mode == QuantMode.QI8:
                    weight_type = QuantType.QInt8
                else:  # mode == QuantMode.QU8
                    weight_type = QuantType.QUInt8
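                # With the weight type resolved, quantize_q8 (defined earlier
                # in this file) performs the actual 8-bit quantization.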
                quantize_q8(
                    model,
                    save_path,
                    per_channel=quantization_args.per_channel,
                    reduce_range=quantization_args.reduce_range,
                    weight_type=weight_type,
                    op_block_list=quantization_args.op_block_list,
                )
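

# Usage sketch (illustrative; not part of the original file). It assumes
# QuantizationArguments is a dataclass whose fields match the attributes
# accessed above; the folder paths and mode strings are placeholders.
#
#   args = QuantizationArguments(
#       modes=["fp16", "q4", "q8"],  # must be valid QuantMode values
#       block_size=None,             # per-mode defaults apply (32 for Q4, 64 for BNB4)
#       is_symmetric=True,
#       accuracy_level=None,
#       per_channel=True,
#       reduce_range=False,
#       quant_type=None,             # BNB4 then falls back to NF4
#       op_block_list=None,
#   )
#   quantize("models/my-model/onnx", "models/my-model/onnx-quantized", args)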