optimum/commands/export/openvino.py

# Copyright 2023 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Defines the command line for the export with OpenVINO.""" import json import logging import sys from pathlib import Path from typing import TYPE_CHECKING, Optional from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE from ...exporters import TasksManager from ...intel.utils.import_utils import DIFFUSERS_IMPORT_ERROR, is_diffusers_available, is_nncf_available from ...intel.utils.modeling_utils import _infer_library_from_model_name_or_path from ...utils.save_utils import maybe_load_preprocessors from ..base import BaseOptimumCLICommand, CommandInfo logger = logging.getLogger(__name__) if TYPE_CHECKING: from argparse import ArgumentParser, Namespace, _SubParsersAction def parse_args_openvino(parser: "ArgumentParser"): required_group = parser.add_argument_group("Required arguments") required_group.add_argument( "-m", "--model", type=str, required=True, help="Model ID on huggingface.co or path on disk to load model from." ) required_group.add_argument( "output", type=Path, help="Path indicating the directory where to store the generated OV model." ) optional_group = parser.add_argument_group("Optional arguments") optional_group.add_argument( "--task", default="auto", help=( "The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among:" f" {str(TasksManager.get_all_tasks())}. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder." ), ) optional_group.add_argument( "--framework", type=str, choices=["pt", "tf"], default=None, help=( "The framework to use for the export. If not provided, will attempt to use the local checkpoint's original framework or what is available in the environment." ), ) optional_group.add_argument( "--trust-remote-code", action="store_true", help=( "Allows to use custom code for the modeling hosted in the model repository. This option should only be set for repositories you trust and in which " "you have read the code, as it will execute on your local machine arbitrary code present in the model repository." ), ) optional_group.add_argument( "--weight-format", type=str, choices=["fp32", "fp16", "int8", "int4", "mxfp4", "nf4"], default=None, help="The weight format of the exported model.", ) optional_group.add_argument( "--quant-mode", type=str, choices=["int8", "f8e4m3", "f8e5m2", "nf4_f8e4m3", "nf4_f8e5m2", "int4_f8e4m3", "int4_f8e5m2"], default=None, help=( "Quantization precision mode. This is used for applying full model quantization including activations. " ), ) optional_group.add_argument( "--library", type=str, choices=["transformers", "diffusers", "timm", "sentence_transformers", "open_clip"], default=None, help="The library used to load the model before export. If not provided, will attempt to infer the local checkpoint's library", ) optional_group.add_argument( "--cache_dir", type=str, default=HUGGINGFACE_HUB_CACHE, help="The path to a directory in which the downloaded model should be cached if the standard cache should not be used.", ) optional_group.add_argument( "--pad-token-id", type=int, default=None, help=( "This is needed by some models, for some tasks. If not provided, will attempt to use the tokenizer to guess it." ), ) optional_group.add_argument( "--variant", type=str, default=None, help=("If specified load weights from variant filename."), ) optional_group.add_argument( "--ratio", type=float, default=None, help=( "A parameter used when applying 4-bit quantization to control the ratio between 4-bit and 8-bit quantization. If set to 0.8, 80%% of the layers will be quantized to int4 " "while 20%% will be quantized to int8. This helps to achieve better accuracy at the sacrifice of the model size and inference latency. Default value is 1.0. " "Note: If dataset is provided, and the ratio is less than 1.0, then data-aware mixed precision assignment will be applied." ), ) optional_group.add_argument( "--sym", action="store_true", default=None, help=( "Whether to apply symmetric quantization. This argument is related to integer-typed --weight-format and --quant-mode options. " "In case of full or mixed quantization (--quant-mode) symmetric quantization will be applied to weights in any case, so only activation quantization " "will be affected by --sym argument. For weight-only quantization (--weight-format) --sym argument does not affect backup precision. " "Examples: (1) --weight-format int8 --sym => int8 symmetric quantization of weights; " "(2) --weight-format int4 => int4 asymmetric quantization of weights; " "(3) --weight-format int4 --sym --backup-precision int8_asym => int4 symmetric quantization of weights with int8 asymmetric backup precision; " "(4) --quant-mode int8 --sym => weights and activations are quantized to int8 symmetric data type; " "(5) --quant-mode int8 => activations are quantized to int8 asymmetric data type, weights -- to int8 symmetric data type; " "(6) --quant-mode int4_f8e5m2 --sym => activations are quantized to f8e5m2 data type, weights -- to int4 symmetric data type." ), ) optional_group.add_argument( "--group-size", type=int, default=None, help=("The group size to use for quantization. Recommended value is 128 and -1 uses per-column quantization."), ) optional_group.add_argument( "--backup-precision", type=str, choices=["none", "int8_sym", "int8_asym"], default=None, help=( "Defines a backup precision for mixed-precision weight compression. Only valid for 4-bit weight formats. " "If not provided, backup precision is int8_asym. 'none' stands for original floating-point precision of " "the model weights, in this case weights are retained in their original precision without any " "quantization. 'int8_sym' stands for 8-bit integer symmetric quantization without zero point. 'int8_asym' " "stands for 8-bit integer asymmetric quantization with zero points per each quantization group." ), ) optional_group.add_argument( "--dataset", type=str, default=None, help=( "The dataset used for data-aware compression or quantization with NNCF. " "For language models you can use the one from the list ['auto','wikitext2','c4','c4-new']. With 'auto' the " "dataset will be collected from model's generations. " "For diffusion models it should be on of ['conceptual_captions'," "'laion/220k-GPT4Vision-captions-from-LIVIS','laion/filtered-wit']. " "For visual language models the dataset must be set to 'contextual'. " "Note: if none of the data-aware compression algorithms are selected and ratio parameter is omitted or " "equals 1.0, the dataset argument will not have an effect on the resulting model." ), ) optional_group.add_argument( "--all-layers", action="store_true", default=None, help=( "Whether embeddings and last MatMul layers should be compressed to INT4. If not provided an weight " "compression is applied, they are compressed to INT8." ), ) optional_group.add_argument( "--awq", action="store_true", default=None, help=( "Whether to apply AWQ algorithm. AWQ improves generation quality of INT4-compressed LLMs. If dataset is " "provided, a data-aware activation-based version of the algorithm will be executed, which requires " "additional time. Otherwise, data-free AWQ will be applied which relies on per-column magnitudes of " "weights instead of activations. Note: it is possible that there will be no matching patterns in the model " "to apply AWQ, in such case it will be skipped." ), ) optional_group.add_argument( "--scale-estimation", action="store_true", default=None, help=( "Indicates whether to apply a scale estimation algorithm that minimizes the L2 error between the original " "and compressed layers. Providing a dataset is required to run scale estimation. Please note, that " "applying scale estimation takes additional memory and time." ), ) optional_group.add_argument( "--gptq", action="store_true", default=None, help=( "Indicates whether to apply GPTQ algorithm that optimizes compressed weights in a layer-wise fashion to " "minimize the difference between activations of a compressed and original layer. Please note, that " "applying GPTQ takes additional memory and time." ), ) optional_group.add_argument( "--lora-correction", action="store_true", default=None, help=( "Indicates whether to apply LoRA Correction algorithm. When enabled, this algorithm introduces low-rank " "adaptation layers in the model that can recover accuracy after weight compression at some cost of " "inference latency. Please note, that applying LoRA Correction algorithm takes additional memory and time." ), ) optional_group.add_argument( "--sensitivity-metric", type=str, default=None, help=( "The sensitivity metric for assigning quantization precision to layers. It can be one of the following: " "['weight_quantization_error', 'hessian_input_activation', 'mean_activation_variance', " "'max_activation_variance', 'mean_activation_magnitude']." ), ) optional_group.add_argument( "--num-samples", type=int, default=None, help="The maximum number of samples to take from the dataset for quantization.", ) optional_group.add_argument( "--disable-stateful", action="store_true", help=( "Disable stateful converted models, stateless models will be generated instead. Stateful models are produced by default when this key is not used. " "In stateful models all kv-cache inputs and outputs are hidden in the model and are not exposed as model inputs and outputs. " "If --disable-stateful option is used, it may result in sub-optimal inference performance. " "Use it when you intentionally want to use a stateless model, for example, to be compatible with existing " "OpenVINO native inference code that expects KV-cache inputs and outputs in the model." ), ) optional_group.add_argument( "--disable-convert-tokenizer", action="store_true", help="Do not add converted tokenizer and detokenizer OpenVINO models.", ) optional_group.add_argument( "--smooth-quant-alpha", type=float, default=None, help=( "SmoothQuant alpha parameter that improves the distribution of activations before MatMul layers and " "reduces quantization error. Valid only when activations quantization is enabled." ), ) optional_group.add_argument( "--model-kwargs", type=json.loads, help=("Any kwargs passed to the model forward, or used to customize the export for a given model."), ) def no_compression_parameter_provided(args): return all( ( it is None for it in ( args.ratio, args.group_size, args.sym, args.all_layers, args.dataset, args.num_samples, args.awq, args.scale_estimation, args.gptq, args.lora_correction, args.sensitivity_metric, args.backup_precision, ) ) ) def no_quantization_parameter_provided(args): return all( ( it is None for it in ( args.sym, args.dataset, args.num_samples, args.smooth_quant_alpha, ) ) ) class OVExportCommand(BaseOptimumCLICommand): COMMAND = CommandInfo(name="openvino", help="Export PyTorch models to OpenVINO IR.") def __init__( self, subparsers: "_SubParsersAction", args: Optional["Namespace"] = None, command: Optional["CommandInfo"] = None, from_defaults_factory: bool = False, parser: Optional["ArgumentParser"] = None, ): super().__init__( subparsers, args=args, command=command, from_defaults_factory=from_defaults_factory, parser=parser ) self.args_string = " ".join(sys.argv[3:]) @staticmethod def parse_args(parser: "ArgumentParser"): return parse_args_openvino(parser) def run(self): from ...exporters.openvino.__main__ import infer_task, main_export, maybe_convert_tokenizers from ...exporters.openvino.utils import save_preprocessors from ...intel.openvino.configuration import _DEFAULT_4BIT_WQ_CONFIG, OVConfig, get_default_quantization_config if self.args.library is None: # TODO: add revision, subfolder and token to args library_name = _infer_library_from_model_name_or_path( model_name_or_path=self.args.model, cache_dir=self.args.cache_dir ) if library_name == "sentence_transformers": logger.warning( "Library name is not specified. There are multiple possible variants: `sentence_transformers`, `transformers`." "`transformers` will be selected. If you want to load your model with the `sentence-transformers` library instead, please set --library sentence_transformers" ) library_name = "transformers" else: library_name = self.args.library if self.args.weight_format is None and self.args.quant_mode is None: ov_config = None if not no_compression_parameter_provided(self.args): raise ValueError( "Some compression parameters are provided, but the weight format is not specified. " "Please provide it with --weight-format argument." ) if not no_quantization_parameter_provided(self.args): raise ValueError( "Some quantization parameters are provided, but the quantization mode is not specified. " "Please provide it with --quant-mode argument." ) elif self.args.weight_format in {"fp16", "fp32"}: ov_config = OVConfig(dtype=self.args.weight_format) else: if not is_nncf_available(): raise ImportError("Applying quantization requires nncf, please install it with `pip install nncf`") default_quantization_config = get_default_quantization_config( self.args.model, self.args.weight_format, self.args.quant_mode ) if self.args.weight_format is not None: # For int4 quantization if no parameter is provided, then use the default config if exists if no_compression_parameter_provided(self.args) and self.args.weight_format == "int4": if default_quantization_config is not None: quantization_config = default_quantization_config log_message = ( f"Applying the default quantization config for {self.args.model}: {quantization_config}." ) else: quantization_config = _DEFAULT_4BIT_WQ_CONFIG log_message = f"Applying a default quantization config: {quantization_config}." logger.info(log_message) else: quantization_config = prepare_wc_config(self.args, _DEFAULT_4BIT_WQ_CONFIG) else: if no_quantization_parameter_provided(self.args) and default_quantization_config is not None: quantization_config = default_quantization_config logger.info( f"Applying the default quantization config for {self.args.model}: {quantization_config}." ) else: if self.args.dataset is None: raise ValueError( "Dataset is required for full quantization. Please provide it with --dataset argument." ) if self.args.quant_mode in ["nf4_f8e4m3", "nf4_f8e5m2", "int4_f8e4m3", "int4_f8e5m2"]: if library_name == "diffusers": raise NotImplementedError("Mixed precision quantization isn't supported for diffusers.") wc_config = prepare_wc_config(self.args, _DEFAULT_4BIT_WQ_CONFIG) wc_dtype, q_dtype = self.args.quant_mode.split("_") wc_config["dtype"] = wc_dtype q_config = prepare_q_config(self.args) q_config["dtype"] = q_dtype quantization_config = { "weight_quantization_config": wc_config, "full_quantization_config": q_config, "num_samples": self.args.num_samples, "dataset": self.args.dataset, } else: quantization_config = prepare_q_config(self.args) quantization_config["trust_remote_code"] = self.args.trust_remote_code ov_config = OVConfig(quantization_config=quantization_config) quantization_config = ov_config.quantization_config if ov_config else None quantize_with_dataset = quantization_config and getattr(quantization_config, "dataset", None) is not None task = infer_task(self.args.task, self.args.model, library_name=library_name) # in some cases automatic task detection for multimodal models gives incorrect results if self.args.task == "auto" and library_name == "transformers": from transformers import AutoConfig from ...exporters.openvino.utils import MULTI_MODAL_TEXT_GENERATION_MODELS config = AutoConfig.from_pretrained( self.args.model, cache_dir=self.args.cache_dir, trust_remote_code=self.args.trust_remote_code, ) if getattr(config, "model_type", "").replace("_", "-") in MULTI_MODAL_TEXT_GENERATION_MODELS: task = "image-text-to-text" if library_name == "diffusers" and quantize_with_dataset: if not is_diffusers_available(): raise ValueError(DIFFUSERS_IMPORT_ERROR.format("Export of diffusers models")) from diffusers import DiffusionPipeline diffusers_config = DiffusionPipeline.load_config(self.args.model) class_name = diffusers_config.get("_class_name", None) if class_name == "LatentConsistencyModelPipeline": from optimum.intel import OVLatentConsistencyModelPipeline model_cls = OVLatentConsistencyModelPipeline elif class_name == "StableDiffusionXLPipeline": from optimum.intel import OVStableDiffusionXLPipeline model_cls = OVStableDiffusionXLPipeline elif class_name == "StableDiffusionPipeline": from optimum.intel import OVStableDiffusionPipeline model_cls = OVStableDiffusionPipeline elif class_name == "StableDiffusion3Pipeline": from optimum.intel import OVStableDiffusion3Pipeline model_cls = OVStableDiffusion3Pipeline elif class_name == "FluxPipeline": from optimum.intel import OVFluxPipeline model_cls = OVFluxPipeline elif class_name == "SanaPipeline": from optimum.intel import OVSanaPipeline model_cls = OVSanaPipeline elif class_name == "SaneSprintPipeline": from optimum.intel import OVSanaSprintPipeline model_cls = OVSanaSprintPipeline else: raise NotImplementedError(f"Quantization isn't supported for class {class_name}.") model = model_cls.from_pretrained(self.args.model, export=True, quantization_config=quantization_config) model.save_pretrained(self.args.output) if not self.args.disable_convert_tokenizer: maybe_convert_tokenizers(library_name, self.args.output, model, task=task) elif ( quantize_with_dataset and ( task in ["fill-mask", "zero-shot-image-classification"] or task.startswith("text-generation") or task.startswith("automatic-speech-recognition") or task.startswith("feature-extraction") ) or (task == "image-text-to-text" and quantization_config is not None) ): if task.startswith("text-generation"): from optimum.intel import OVModelForCausalLM model_cls = OVModelForCausalLM elif task == "image-text-to-text": from optimum.intel import OVModelForVisualCausalLM model_cls = OVModelForVisualCausalLM elif "automatic-speech-recognition" in task: from optimum.intel import OVModelForSpeechSeq2Seq model_cls = OVModelForSpeechSeq2Seq elif task.startswith("feature-extraction") and library_name == "transformers": from ...intel import OVModelForFeatureExtraction model_cls = OVModelForFeatureExtraction elif task.startswith("feature-extraction") and library_name == "sentence_transformers": from ...intel import OVSentenceTransformer model_cls = OVSentenceTransformer elif task == "fill-mask": from ...intel import OVModelForMaskedLM model_cls = OVModelForMaskedLM elif task == "zero-shot-image-classification": from ...intel import OVModelForZeroShotImageClassification model_cls = OVModelForZeroShotImageClassification else: raise NotImplementedError( f"Unable to find a matching model class for the task={task} and library_name={library_name}." ) # In this case, to apply quantization an instance of a model class is required model = model_cls.from_pretrained( self.args.model, export=True, quantization_config=quantization_config, stateful=not self.args.disable_stateful, trust_remote_code=self.args.trust_remote_code, variant=self.args.variant, cache_dir=self.args.cache_dir, ) model.save_pretrained(self.args.output) preprocessors = maybe_load_preprocessors(self.args.model, trust_remote_code=self.args.trust_remote_code) save_preprocessors(preprocessors, model.config, self.args.output, self.args.trust_remote_code) if not self.args.disable_convert_tokenizer: maybe_convert_tokenizers(library_name, self.args.output, preprocessors=preprocessors, task=task) else: # TODO : add input shapes main_export( model_name_or_path=self.args.model, output=self.args.output, task=self.args.task, framework=self.args.framework, cache_dir=self.args.cache_dir, trust_remote_code=self.args.trust_remote_code, pad_token_id=self.args.pad_token_id, ov_config=ov_config, stateful=not self.args.disable_stateful, convert_tokenizer=not self.args.disable_convert_tokenizer, library_name=library_name, variant=self.args.variant, model_kwargs=self.args.model_kwargs, # **input_shapes, ) def prepare_wc_config(args, default_configs): is_int8 = args.weight_format == "int8" return { "bits": 8 if is_int8 else 4, "ratio": 1.0 if is_int8 else (args.ratio or default_configs["ratio"]), "sym": args.sym or False, "group_size": -1 if is_int8 else args.group_size, "all_layers": None if is_int8 else args.all_layers, "dataset": args.dataset, "num_samples": args.num_samples, "quant_method": "awq" if args.awq else "default", "sensitivity_metric": args.sensitivity_metric, "scale_estimation": args.scale_estimation, "gptq": args.gptq, "lora_correction": args.lora_correction, "dtype": args.weight_format, "backup_precision": args.backup_precision, } def prepare_q_config(args): return { "dtype": args.quant_mode, "bits": 8, "sym": args.sym or False, "dataset": args.dataset, "num_samples": args.num_samples, "smooth_quant_alpha": args.smooth_quant_alpha, "trust_remote_code": args.trust_remote_code, }

optimum/commands/export/openvino.py (514 lines of code) (raw):