def quantize()

in scripts/quantize.py

Iterates over every .onnx model in input_folder and, for each mode in
quantization_args.modes, writes a quantized copy with a mode-specific
suffix to output_folder.

def quantize(input_folder, output_folder, quantization_args: QuantizationArguments):
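    # NOTE: This listing relies on module-level context in scripts/quantize.py
    # (not shown here): the `os`, `onnx`, and `tqdm` imports; `QuantType` from
    # onnxruntime.quantization and `MatMulBnb4Quantizer` from
    # onnxruntime.quantization.matmul_bnb4_quantizer; plus the local
    # `QuantMode`, `QUANTIZE_SUFFIX_MAPPING`, `QUINT8_OPS`, `get_operators`,
    # and `quantize_*` helpers defined elsewhere in the same file.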

    # (Step 1) Validate the arguments
    if not quantization_args.modes:
        raise ValueError("At least one quantization mode must be specified")

    if not os.path.exists(input_folder):
        raise ValueError(f"Input folder {input_folder} does not exist")

    model_names_or_paths = [
        os.path.join(input_folder, file)
        for file in os.listdir(input_folder)
        if file.endswith(".onnx")
    ]
    if not model_names_or_paths:
        raise ValueError(f"No .onnx models found in {input_folder}")

    os.makedirs(output_folder, exist_ok=True)

    # (Step 2) Quantize the models
    for model_path in (progress_models := tqdm(model_names_or_paths)):
        progress_models.set_description(f"Processing {model_path}")

        file_name_without_extension = os.path.splitext(os.path.basename(model_path))[0]

        for mode in (progress := tqdm(quantization_args.modes)):
            progress.set_description(f" - Quantizing to {mode}")
            mode = QuantMode(mode)
            suffix = QUANTIZE_SUFFIX_MAPPING.get(mode, mode.value)
            save_path = os.path.join(
                output_folder,
                f"{file_name_without_extension}_{suffix}.onnx",
            )

            # NOTE: Unfortunately, we need to reload the model for each quantization
            # mode, which is memory-inefficient. This is because the quantization
            # functions modify the model in-place, so each mode must start from a
            # fresh copy of the original model.
            model = onnx.load_model(model_path)

            if mode == QuantMode.FP16:
                # Convert the model to float16; ops listed in op_block_list
                # are kept at full precision.
                quantize_fp16(
                    model,
                    save_path,
                    quantization_args.op_block_list,
                )

            elif mode in (QuantMode.Q4, QuantMode.Q4F16):
                block_size = quantization_args.block_size or 32

                # For Q4F16, defer saving: quantize to 4-bit first, then run
                # the fp16 conversion pass on the result and save once.
                q4_model = quantize_q4(
                    model,
                    save_path=None if mode == QuantMode.Q4F16 else save_path,
                    block_size=block_size,
                    is_symmetric=quantization_args.is_symmetric,
                    accuracy_level=quantization_args.accuracy_level,
                )
                if mode == QuantMode.Q4F16:
                    quantize_fp16(
                        q4_model,
                        save_path,
                        quantization_args.op_block_list,
                    )

            elif mode == QuantMode.BNB4:
                # 4-bit MatMul quantization via onnxruntime's
                # MatMulBnb4Quantizer, defaulting to the NF4 quant type.
                quantize_bnb4(
                    model,
                    save_path,
                    block_size=quantization_args.block_size or 64,
                    quant_type=(
                        quantization_args.quant_type
                        if quantization_args.quant_type is not None
                        else MatMulBnb4Quantizer.NF4
                    ),
                )

            elif mode in (QuantMode.Q8, QuantMode.QI8, QuantMode.QU8):
                if mode == QuantMode.Q8:
                    # Q8 auto-selects the weight type: if the graph contains
                    # any op listed in QUINT8_OPS, use unsigned 8-bit weights;
                    # otherwise use signed.
                    op_types = get_operators(model)
                    weight_type = (
                        QuantType.QUInt8
                        if any(x in QUINT8_OPS for x in op_types)
                        else QuantType.QInt8
                    )

                elif mode == QuantMode.QI8:
                    weight_type = QuantType.QInt8

                else:  # mode == QuantMode.QU8
                    weight_type = QuantType.QUInt8

                quantize_q8(
                    model,
                    save_path,
                    per_channel=quantization_args.per_channel,
                    reduce_range=quantization_args.reduce_range,
                    weight_type=weight_type,
                    op_block_list=quantization_args.op_block_list,
                )
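
Below is a hypothetical usage sketch. The QuantizationArguments dataclass here is a stand-in reconstructed from the attributes quantize() reads (modes, block_size, is_symmetric, accuracy_level, quant_type, per_channel, reduce_range, op_block_list); the real dataclass is defined elsewhere in scripts/quantize.py and may differ in defaults, and the mode strings are assumed to match the QuantMode enum values.

# Hypothetical usage sketch: the dataclass below is a stand-in; the real
# QuantizationArguments lives elsewhere in scripts/quantize.py.
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class QuantizationArguments:
    modes: List[str] = field(default_factory=list)  # assumed QuantMode values, e.g. ["fp16", "q8"]
    block_size: Optional[int] = None        # Q4 falls back to 32, BNB4 to 64
    is_symmetric: bool = True               # Q4: symmetric weight quantization
    accuracy_level: Optional[int] = None    # Q4: onnxruntime accuracy level
    quant_type: Optional[int] = None        # BNB4: defaults to MatMulBnb4Quantizer.NF4
    per_channel: bool = False               # Q8: quantize weights per channel
    reduce_range: bool = False              # Q8: 7-bit range (helps on non-VNNI CPUs)
    op_block_list: Optional[List[str]] = None  # ops kept at full precision

quantize(
    input_folder="models/my-model/onnx",    # must contain at least one .onnx file
    output_folder="models/my-model/onnx",   # created if it does not exist
    quantization_args=QuantizationArguments(modes=["fp16", "q8"]),
)

One file is written per (model, mode) pair, named {model}_{suffix}.onnx, where the suffix comes from QUANTIZE_SUFFIX_MAPPING and falls back to the mode's enum value.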