optimum/exporters/openvino/model_configs.py (3,636 lines of code) (raw):

# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import enum from copy import deepcopy from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union from packaging import version from transformers import AutoConfig, PretrainedConfig, PreTrainedModel, TFPreTrainedModel from transformers.utils import is_tf_available from optimum.exporters.onnx.base import ConfigBehavior from optimum.exporters.onnx.config import OnnxConfig, TextDecoderOnnxConfig, TextDecoderWithPositionIdsOnnxConfig from optimum.exporters.onnx.model_configs import ( BartOnnxConfig, BlenderbotOnnxConfig, BlenderbotSmallOnnxConfig, BloomOnnxConfig, CLIPOnnxConfig, CLIPTextOnnxConfig, CLIPTextWithProjectionOnnxConfig, CLIPVisionModelOnnxConfig, CodeGenOnnxConfig, FalconOnnxConfig, GemmaOnnxConfig, GPTBigCodeOnnxConfig, GPTJOnnxConfig, GPTNeoOnnxConfig, GPTNeoXOnnxConfig, IBertOnnxConfig, LlamaOnnxConfig, MarianOnnxConfig, MistralOnnxConfig, MPTOnnxConfig, PegasusOnnxConfig, PhiOnnxConfig, SpeechT5OnnxConfig, T5OnnxConfig, UNetOnnxConfig, VaeDecoderOnnxConfig, VaeEncoderOnnxConfig, VisionOnnxConfig, WhisperOnnxConfig, ) from optimum.exporters.onnx.model_patcher import ModelPatcher from optimum.exporters.tasks import TasksManager from optimum.utils import DEFAULT_DUMMY_SHAPES from optimum.utils.input_generators import ( DTYPE_MAPPER, DummyInputGenerator, DummyPastKeyValuesGenerator, DummySeq2SeqDecoderTextInputGenerator, DummySeq2SeqPastKeyValuesGenerator, DummyTextInputGenerator, DummyTimestepInputGenerator, DummyVisionInputGenerator, FalconDummyPastKeyValuesGenerator, GemmaDummyPastKeyValuesGenerator, MistralDummyPastKeyValuesGenerator, ) from optimum.utils.normalized_config import NormalizedConfig, NormalizedTextConfig, NormalizedVisionConfig from ...intel.utils.import_utils import ( _transformers_version, is_diffusers_available, is_diffusers_version, is_transformers_version, ) from .model_patcher import ( AquilaModelPatcher, ArcticModelPatcher, BaichuanModelPatcher, BlenderbotModelPatcher, BlenderbotSmallModelPatcher, BlenderbotSmallStatefulSeq2SeqDecoderPatcher, BlenderbotStatefulSeq2SeqDecoderPatcher, BloomModelPatcher, ChatGLMModelPatcher, CodeGenModelPatcher, CommonImageEmbeddingsModelPatcher, DBRXModelPatcher, DeciLMModelPatcher, DeepseekPatcher, FalconModelPatcher, FluxTransfromerModelPatcher, Gemma2ModelPatcher, Gemma3LMModelPatcher, GptBigCodeModelPatcher, GptJModelPatcher, GptNeoModelPatcher, GptNeoxJapaneseModelPatcher, GptNeoxModelPatcher, GraniteMoEModelPatcher, IBertModelPatcher, Idefics3ImageEmbeddingsModelPatcher, InputEmbeddingPatcher, InternLM2Patcher, InternLMModelPatcher, InternVL2ChatLangModelPatcher, InternVLChatImageEmbeddingModelPatcher, JaisModelPatcher, Llama4ImageEmbeddingsModelPatcher, Llama4TextModelPatcher, LlamaModelPatcher, LlavaImageEmbeddingModelPatcher, LlavaNextVideoImageEmbeddingModelPatcher, LlavaQwen2ImageEmbeddingsModelPatcher, MairaImageEmbeddingModelPatcher, MarianModelPatcher, MarianStatefulSeq2SeqDecoderPatcher, MiniCPM3Patcher, 
MiniCPMModelPatcher, MiniCPMVImageEmbeddingsModelPatcher, MiniCPMVResamplerModelPatcher, MistralModelPatcher, MixtralModelPatcher, MPTModelPatcher, OVSpeechT5ModelPatcher, PegasusModelPatcher, PegasusStatefulSeq2SeqDecoderPatcher, PersimmonModelPatcher, Phi3ModelPatcher, Phi3VisionImageEmbeddingsPatcher, Phi4MMAudioEncoderPatcher, Phi4MMAudioForwardEmbeddingsPatcher, Phi4MMLanguageModelPatcher, Phi4MMVisionEmbeddingsPatcher, PhiMoEModelPatcher, Qwen2_5_VLVisionEmbMergerPatcher, Qwen2MoEPatcher, Qwen2VLLanguageModelPatcher, Qwen2VLVisionEmbMergerPatcher, QwenModelPatcher, RotaryEmbPatcher, SanaTextEncoderModelPatcher, StatefulSeq2SeqDecoderPatcher, UpdateCausalMaskModelPatcher, XverseModelPatcher, ) def init_model_configs(): if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES: TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {} TasksManager._CUSTOM_CLASSES[("pt", "llava", "image-text-to-text")] = ( "transformers", "LlavaForConditionalGeneration", ) TasksManager._CUSTOM_CLASSES[("pt", "llava-next", "image-text-to-text")] = ( "transformers", "LlavaNextForConditionalGeneration", ) TasksManager._CUSTOM_CLASSES[("pt", "qwen2-vl", "image-text-to-text")] = ( "transformers", "Qwen2VLForConditionalGeneration", ) TasksManager._CUSTOM_CLASSES[("pt", "qwen2-5-vl", "image-text-to-text")] = ( "transformers", "AutoModelForImageTextToText", ) TasksManager._CUSTOM_CLASSES[("pt", "llava-next-video", "image-text-to-text")] = ( "transformers", "AutoModelForVision2Seq", ) TasksManager._CUSTOM_CLASSES[("pt", "gemma3", "image-text-to-text")] = ( "transformers", "Gemma3ForConditionalGeneration", ) TasksManager._CUSTOM_CLASSES[("pt", "idefics3", "image-text-to-text")] = ( "transformers", "AutoModelForImageTextToText", ) TasksManager._CUSTOM_CLASSES[("pt", "smolvlm", "image-text-to-text")] = ( "transformers", "AutoModelForImageTextToText", ) TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "image-text-to-text")] = ("transformers", "AutoModelForCausalLM") TasksManager._CUSTOM_CLASSES[("pt", "phi4mm", "automatic-speech-recognition")] = ( "transformers", "AutoModelForCausalLM", ) TasksManager._CUSTOM_CLASSES[("pt", "phi4-multimodal", "image-text-to-text")] = ( "transformers", "AutoModelForCausalLM", ) TasksManager._CUSTOM_CLASSES[("pt", "phi4-multimodal", "automatic-speech-recognition")] = ( "transformers", "AutoModelForCausalLM", ) TasksManager._CUSTOM_CLASSES[("pt", "llama4", "image-text-to-text")] = ( "transformers", "AutoModelForImageTextToText", ) TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS[ "image-text-to-text" ] = TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["text-generation"] TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS["video-text-to-text"] = "AutoModelForVision2Seq" if is_diffusers_available() and "fill" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS: TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["fill"] = "FluxFillPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["fill"] = {"flux": "FluxFillPipeline"} TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS["text-to-image"] = ("AutoPipelineForText2Image", "SanaPipeline") TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana"] = "SanaPipeline" TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-image"]["sana-sprint"] = "SanaSprintPipeline" if is_diffusers_available() and "text-to-video" not in TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS: TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-video"] = {} TasksManager._DIFFUSERS_TASKS_TO_MODEL_MAPPINGS["text-to-video"]["ltx-video"] = 
"LTXPipeline" supported_model_types = [ "_SUPPORTED_MODEL_TYPE", "_DIFFUSERS_SUPPORTED_MODEL_TYPE", "_TIMM_SUPPORTED_MODEL_TYPE", "_SENTENCE_TRANSFORMERS_SUPPORTED_MODEL_TYPE", ] for supported_models_config in supported_model_types: supported_models = getattr(TasksManager, supported_models_config) for model, export_configs in supported_models.items(): if "onnx" not in export_configs: continue onnx_config = export_configs["onnx"] supported_models[model]["openvino"] = deepcopy(onnx_config) setattr(TasksManager, supported_models_config, supported_models) init_model_configs() if TYPE_CHECKING: from transformers.modeling_utils import PreTrainedModel # noqa: F811 from optimum.exporters.onnx.model_patcher import ModelPatcher # noqa: F811 if is_tf_available(): from transformers.modeling_tf_utils import TFPreTrainedModel # noqa: F811 register_in_tasks_manager = TasksManager.create_register("openvino", overwrite_existing=True) @register_in_tasks_manager("baichuan", *["text-generation", "text-generation-with-past"], library_name="transformers") class BaichaunOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" ) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return BaichuanModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "qwen2", *[ "text-generation", "text-generation-with-past", "feature-extraction", "feature-extraction-with-past", "text-classification", "token-classification", ], library_name="transformers", ) class Qwen2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return UpdateCausalMaskModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("qwen2-moe", *["text-generation", "text-generation-with-past"], library_name="transformers") class Qwen2MoEOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return Qwen2MoEPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("qwen3", *["text-generation", "text-generation-with-past"], library_name="transformers") @register_in_tasks_manager("qwen3-moe", *["text-generation", "text-generation-with-past"], library_name="transformers") class Qwen3OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): MIN_TRANSFORMERS_VERSION = "4.51.0" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: 
Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return UpdateCausalMaskModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("minicpm", *["text-generation", "text-generation-with-past"], library_name="transformers") class MiniCPMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return MiniCPMModelPatcher(self, model, model_kwargs=model_kwargs) class OVMiniCPM3DummyPastKeyValuesGenerator(MistralDummyPastKeyValuesGenerator): def __init__( self, task: str, normalized_config: NormalizedTextConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], random_batch_size_range: Optional[Tuple[int, int]] = None, random_sequence_length_range: Optional[Tuple[int, int]] = None, **kwargs, ): super().__init__( task=task, normalized_config=normalized_config, batch_size=batch_size, sequence_length=sequence_length, random_batch_size_range=random_batch_size_range, random_sequence_length_range=random_sequence_length_range, **kwargs, ) self.v_head_dim = getattr(normalized_config, "v_head_dim", self.hidden_size // self.num_attention_heads) self.k_head_dim = normalized_config.qk_nope_head_dim + normalized_config.qk_rope_head_dim def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): v_shape = ( self.batch_size, self.num_key_value_heads, self.sequence_length, self.v_head_dim, ) k_shape = (self.batch_size, self.num_key_value_heads, self.sequence_length, self.k_head_dim) return [ ( self.random_float_tensor(k_shape, framework=framework, dtype=float_dtype), self.random_float_tensor(v_shape, framework=framework, dtype=float_dtype), ) for _ in range(self.num_layers) ] @register_in_tasks_manager("minicpm3", *["text-generation", "text-generation-with-past"], library_name="transformers") class MiniCPM3OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, OVMiniCPM3DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = OVMiniCPM3DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return MiniCPM3Patcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("stablelm", *["text-generation", "text-generation-with-past"], library_name="transformers") class StableLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return UpdateCausalMaskModelPatcher(self, model, model_kwargs=model_kwargs) class ChatGLM2DummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): def __init__( self, task: str, normalized_config: NormalizedTextConfig, batch_size: int = 
DEFAULT_DUMMY_SHAPES["batch_size"], sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], random_batch_size_range: Optional[Tuple[int, int]] = None, random_sequence_length_range: Optional[Tuple[int, int]] = None, **kwargs, ): super().__init__( task=task, normalized_config=normalized_config, batch_size=batch_size, sequence_length=sequence_length, random_batch_size_range=random_batch_size_range, random_sequence_length_range=random_sequence_length_range, ) self.multi_query_group_num = normalized_config.multi_query_group_num self.head_dim = normalized_config.kv_channels self.standart_cache_layout = hasattr(normalized_config, "rope_ratio") def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if not self.standart_cache_layout: pkv_shape = ( self.sequence_length, self.batch_size, self.multi_query_group_num, self.head_dim, ) else: pkv_shape = ( self.batch_size, self.multi_query_group_num, self.sequence_length, self.head_dim, ) return [ ( self.random_float_tensor(pkv_shape, framework=framework, dtype=float_dtype), self.random_float_tensor(pkv_shape, framework=framework, dtype=float_dtype), ) for _ in range(self.num_layers) ] @register_in_tasks_manager("chatglm", *["text-generation", "text-generation-with-past"], library_name="transformers") class ChatGLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(vocab_size="padded_vocab_size", num_layers="num_layers") DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, ChatGLM2DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = ChatGLM2DummyPastKeyValuesGenerator def generate_dummy_inputs(self, framework: str = "pt", **kwargs): dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs) dummy_inputs = {} input_names = [key for key in self.inputs.keys() if not key.startswith("past_key_values")] if self.use_past_in_inputs and self.use_cache_branch is not False: input_names.append("past_key_values") for input_name in input_names: input_was_inserted = False for dummy_input_gen in dummy_inputs_generators: if dummy_input_gen.supports_input(input_name): dummy_inputs[input_name] = self.overwrite_shape_and_generate_input( dummy_input_gen, input_name, framework, input_shapes=kwargs, ) input_was_inserted = True break if not input_was_inserted: raise RuntimeError( f'Could not generate dummy input for "{input_name}". Try adding a proper dummy input generator to the model ONNX config.' ) # refer to https://github.com/huggingface/optimum/pull/764 if ( self.use_past_in_inputs and self.PAD_ATTENTION_MASK_TO_PAST and self.use_cache_branch is not False and "attention_mask" in dummy_inputs ): # Obtain the past sequence length from the value instead of the key (Bloom). ChatGLM has seq_len in 0 dim instead of -2 seq_len_dim = 0 if not hasattr(self._normalized_config, "rope_ratio") else -2 past_present_length = ( dummy_inputs["input_ids"].shape[1] + dummy_inputs["past_key_values"][0][1].shape[seq_len_dim] ) dummy_inputs["attention_mask"] = DummyInputGenerator.pad_input_on_dim( dummy_inputs["attention_mask"], desired_length=past_present_length, dim=1, dtype=dummy_inputs["attention_mask"].dtype, ) return dummy_inputs def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): """ Fills `input_or_outputs` mapping with past_key_values dynamic axes considering the direction. Args: inputs_or_outputs (`Dict[str, Dict[int, str]]`): The mapping to fill. 
direction (`str`): either "inputs" or "outputs", it specifies whether `input_or_outputs` is the input mapping or the output mapping, this is important for axes naming. """ if direction not in ["inputs", "outputs"]: raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') if direction == "inputs": decoder_sequence_name = "past_sequence_length" name = "past_key_values" else: decoder_sequence_name = "past_sequence_length + present_length" name = "present" is_v4 = hasattr(self._normalized_config, "rope_ratio") for i in range(self._normalized_config.num_layers): inputs_or_outputs[f"{name}.{i}.key"] = ( {1: "batch_size", 0: decoder_sequence_name} if not is_v4 else {0: "batch_size", 2: decoder_sequence_name} ) inputs_or_outputs[f"{name}.{i}.value"] = ( {1: "batch_size", 0: decoder_sequence_name} if not is_v4 else {0: "batch_size", 2: decoder_sequence_name} ) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return ChatGLMModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("mixtral", *["text-generation", "text-generation-with-past"], library_name="transformers") class MixtralOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): # This is because of the patching of torch.triu in AttentionMaskConverter, that exists from transformers>=4.35 MIN_TRANSFORMERS_VERSION = version.parse("4.34.99") # The ONNX export of this architecture needs the Trilu operator support, available since opset 14 DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = ( MistralDummyPastKeyValuesGenerator, ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return MixtralModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "gemma", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class GemmaOpenVINOConfig(GemmaOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return LlamaModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "llama", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class LlamaOpenVINOConfig(LlamaOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return LlamaModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "exaone", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class ExaoneOpenVINOConfig(LlamaOpenVINOConfig): pass class QwenDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): def __init__( self, task: str, normalized_config: NormalizedTextConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], sequence_length: int = 
DEFAULT_DUMMY_SHAPES["sequence_length"], random_batch_size_range: Optional[Tuple[int, int]] = None, random_sequence_length_range: Optional[Tuple[int, int]] = None, **kwargs, ): super().__init__( task=task, normalized_config=normalized_config, batch_size=batch_size, sequence_length=sequence_length, random_batch_size_range=random_batch_size_range, random_sequence_length_range=random_sequence_length_range, ) self.kv_channels = normalized_config.kv_channels def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): past_key_shape = (self.batch_size, self.sequence_length, self.num_attention_heads, self.kv_channels) past_value_shape = (self.batch_size, self.sequence_length, self.num_attention_heads, self.kv_channels) return [ ( self.random_float_tensor(past_key_shape, framework=framework, dtype=float_dtype), self.random_float_tensor(past_value_shape, framework=framework, dtype=float_dtype), ) for _ in range(self.num_layers) ] @register_in_tasks_manager("qwen", *["text-generation", "text-generation-with-past"]) class QwenOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" ) DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, QwenDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = QwenDummyPastKeyValuesGenerator no_position_ids = False def generate_dummy_inputs(self, framework: str = "pt", **kwargs): dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs) dummy_inputs = {} input_names = [key for key in self.inputs.keys() if not key.startswith("past_key_values")] if self.use_past_in_inputs and self.use_cache_branch is not False: input_names.append("past_key_values") for input_name in input_names: input_was_inserted = False for dummy_input_gen in dummy_inputs_generators: if dummy_input_gen.supports_input(input_name): dummy_inputs[input_name] = self.overwrite_shape_and_generate_input( dummy_input_gen, input_name, framework, input_shapes=kwargs, ) input_was_inserted = True break if not input_was_inserted: raise RuntimeError( f'Could not generate dummy input for "{input_name}". Try adding a proper dummy input generator to the model ONNX config.' ) # refer to https://github.com/huggingface/optimum/pull/764 if ( self.use_past_in_inputs and self.PAD_ATTENTION_MASK_TO_PAST and self.use_cache_branch is not False and "attention_mask" in dummy_inputs ): # Obtain the past sequence length from the value instead of the key (Bloom). Qwen has seq_len in 1 dim instead of -2 past_present_length = dummy_inputs["input_ids"].shape[1] + dummy_inputs["past_key_values"][0][1].shape[1] dummy_inputs["attention_mask"] = DummyInputGenerator.pad_input_on_dim( dummy_inputs["attention_mask"], desired_length=past_present_length, dim=1, dtype=dummy_inputs["attention_mask"].dtype, ) return dummy_inputs def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): """ Fills `input_or_outputs` mapping with past_key_values dynamic axes considering the direction. Args: inputs_or_outputs (`Dict[str, Dict[int, str]]`): The mapping to fill. direction (`str`): either "inputs" or "outputs", it specifies whether `input_or_outputs` is the input mapping or the output mapping, this is important for axes naming. 
""" if direction not in ["inputs", "outputs"]: raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') if direction == "inputs": decoder_sequence_name = "past_sequence_length" name = "past_key_values" else: decoder_sequence_name = "past_sequence_length + 1" name = "present" for i in range(self._normalized_config.num_layers): inputs_or_outputs[f"{name}.{i}.key"] = {0: "batch_size", 1: decoder_sequence_name} inputs_or_outputs[f"{name}.{i}.value"] = {0: "batch_size", 1: decoder_sequence_name} def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return QwenModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "starcoder2", *["text-generation", "text-generation-with-past"], library_name="transformers" ) class Starcoder2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return UpdateCausalMaskModelPatcher(self, model, model_kwargs=model_kwargs) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return RotaryEmbPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("internlm2", *["text-generation", "text-generation-with-past"], library_name="transformers") class InternLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return InternLM2Patcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("orion", *["text-generation", "text-generation-with-past"], library_name="transformers") class OrionOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @register_in_tasks_manager("olmo", *["text-generation", "text-generation-with-past"], library_name="transformers") class OlmoOpenVINOConfig(LlamaOpenVINOConfig): DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @register_in_tasks_manager( "mpt", *["text-generation", "text-generation-with-past", "text-classification"], library_name="transformers" ) class MPTOpenVINOConfig(MPTOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return MPTModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "phi3", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class Phi3OpenVINOConfig(PhiOnnxConfig): 
DUMMY_INPUT_GENERATOR_CLASSES = ( MistralDummyPastKeyValuesGenerator, ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return Phi3ModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "phimoe", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class PhiMoEOpenVINOConfig(Phi3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return PhiMoEModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "phi", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class PhiOpenVINOConfig(PhiOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return UpdateCausalMaskModelPatcher(self, model, model_kwargs=model_kwargs) class OVFalconDummyPastKeyValuesGenerator(FalconDummyPastKeyValuesGenerator): def __init__( self, task: str, normalized_config: NormalizedTextConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], random_batch_size_range: Optional[Tuple[int, int]] = None, random_sequence_length_range: Optional[Tuple[int, int]] = None, **kwargs, ): super().__init__( task=task, normalized_config=normalized_config, batch_size=batch_size, sequence_length=sequence_length, random_batch_size_range=random_batch_size_range, random_sequence_length_range=random_sequence_length_range, **kwargs, ) if normalized_config.new_decoder_architecture: self.num_kv_heads = normalized_config.num_attention_heads else: self.num_kv_heads = normalized_config.num_kv_heads if not normalized_config.multi_query else 1 self.head_dim = self.hidden_size // self.num_attention_heads @register_in_tasks_manager( "falcon", *[ "feature-extraction", "feature-extraction-with-past", "question-answering", "text-generation", "text-generation-with-past", "token-classification", ], library_name="transformers", ) class FalconOpenVINOConfig(FalconOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = ( OVFalconDummyPastKeyValuesGenerator, ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES DUMMY_PKV_GENERATOR_CLASS = OVFalconDummyPastKeyValuesGenerator def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return FalconModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "persimmon", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class PersimmonOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, 
Any]] = None ) -> "ModelPatcher": return PersimmonModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("biogpt", *["text-generation", "text-generation-with-past"], library_name="transformers") class BioGPTOpenVINOConfig( TextDecoderWithPositionIdsOnnxConfig if is_transformers_version(">=", "4.52.0") else TextDecoderOnnxConfig ): # BioGPT does not require position_ids input. DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @register_in_tasks_manager( "gpt-neox-japanese", *["text-generation", "text-generation-with-past"], library_name="transformers" ) class GPTNeoxJapaneseOpenVINOConfig(TextDecoderOnnxConfig): # GPTNeoxJapanese does not require position_ids input. DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return GptNeoxJapaneseModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "gpt-neo", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class GPTNeoOpenVINOConfig(GPTNeoOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return GptNeoModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "gptj", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class GPTJOpenVINOConfig(GPTJOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return GptJModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "bloom", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", "token-classification", ], library_name="transformers", ) class BloomOpenVINOConfig(BloomOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return BloomModelPatcher(self, model, model_kwargs=model_kwargs) def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): if is_transformers_version(">=", "4.44"): super().add_past_key_values(inputs_or_outputs, direction) else: if direction not in ["inputs", "outputs"]: raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') if direction == "inputs": decoder_sequence_name = "past_sequence_length" name = "past_key_values" else: decoder_sequence_name = "past_sequence_length + 1" name = "present" for i in range(self._normalized_config.num_layers): inputs_or_outputs[f"{name}.{i}.key"] = { 0: "batch_size x num_heads", 2: decoder_sequence_name, } inputs_or_outputs[f"{name}.{i}.value"] = { 0: "batch_size x num_heads", 1: decoder_sequence_name, } @register_in_tasks_manager( "cohere", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class CohereOpenVINOConfig(LlamaOpenVINOConfig): pass @register_in_tasks_manager("xglm", *["text-generation", "text-generation-with-past"], 
library_name="transformers") class XGLMConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_attention_heads="attention_heads", hidden_size="d_model" ) class AquilaDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): def __init__( self, task: str, normalized_config: NormalizedTextConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], random_batch_size_range: Optional[Tuple[int, int]] = None, random_sequence_length_range: Optional[Tuple[int, int]] = None, **kwargs, ): super().__init__( task, normalized_config, batch_size, sequence_length, random_batch_size_range, random_sequence_length_range, **kwargs, ) self.num_key_value_heads = getattr( normalized_config, "num_key_value_heads", normalized_config.num_attention_heads ) def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): shape = ( self.batch_size, self.num_key_value_heads, self.sequence_length, self.hidden_size // self.num_attention_heads, ) return [ ( self.random_float_tensor(shape, framework=framework, dtype=float_dtype), self.random_float_tensor(shape, framework=framework, dtype=float_dtype), ) for _ in range(self.num_layers) ] @register_in_tasks_manager("aquila", *["text-generation", "text-generation-with-past"], library_name="transformers") class AquilaMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, AquilaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = AquilaDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args(num_key_value_heads="num_key_value_heads", allow_new=True) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return AquilaModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("xverse", *["text-generation", "text-generation-with-past"], library_name="transformers") class XverseMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return XverseModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("internlm", *["text-generation", "text-generation-with-past"], library_name="transformers") class InternLMOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return InternLMModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "codegen", *["feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past"], library_name="transformers", ) class CodeGenOpenVINOConfig(CodeGenOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", 
"TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return CodeGenModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "dbrx", *["text-generation", "text-generation-with-past"], library_name="transformers", ) class DBRXOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_attention_heads="n_heads", hidden_size="d_model", num_layers="n_layers", num_key_value_heads="attn_config.kv_n_heads", allow_new=True, ) DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return DBRXModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "jais", *["text-generation", "text-generation-with-past"], library_name="transformers", ) class JaisOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DummyPastKeyValuesGenerator def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return JaisModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("arctic", *["text-generation", "text-generation-with-past"], library_name="transformers") class ArcticOpenVINOConfig(MixtralOpenVINOConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": if is_transformers_version("<=", "4.36.0"): raise ValueError( f"Model patching for Arctic models only available for transformers >= v4.37.0, found {_transformers_version}" ) return ArcticModelPatcher(self, model, model_kwargs=model_kwargs) class OVMistralDummyPastKeyValuesGenerator(MistralDummyPastKeyValuesGenerator): def __init__( self, task: str, normalized_config: NormalizedTextConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], random_batch_size_range: Optional[Tuple[int, int]] = None, random_sequence_length_range: Optional[Tuple[int, int]] = None, **kwargs, ): super().__init__( task=task, normalized_config=normalized_config, batch_size=batch_size, sequence_length=sequence_length, random_batch_size_range=random_batch_size_range, random_sequence_length_range=random_sequence_length_range, **kwargs, ) self.head_dim = getattr(normalized_config, "head_dim", self.hidden_size // self.num_attention_heads) def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): shape = ( self.batch_size, self.num_key_value_heads, self.sequence_length, self.head_dim, ) return [ ( self.random_float_tensor(shape, framework=framework, dtype=float_dtype), self.random_float_tensor(shape, framework=framework, dtype=float_dtype), ) for _ in range(self.num_layers) ] @register_in_tasks_manager( "mistral", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class MistralOpenVINOConfig(MistralOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = ( 
OVMistralDummyPastKeyValuesGenerator, ) + TextDecoderOnnxConfig.DUMMY_INPUT_GENERATOR_CLASSES DUMMY_PKV_GENERATOR_CLASS = OVMistralDummyPastKeyValuesGenerator def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return MistralModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "gpt-neox", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class GPTNeoxOpenVINOConfig(GPTNeoXOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return GptNeoxModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "gemma2", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class Gemma2OpenVINOConfig(GemmaOnnxConfig): MIN_TRANSFORMERS_VERSION = version.parse("4.43.0") def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return Gemma2ModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "gemma3-text", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class Gemma3TextOpenVINOConfig(Gemma2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = version.parse("4.50.0") class DeciDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): def __init__( self, task: str, normalized_config: NormalizedTextConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], random_batch_size_range: Optional[Tuple[int, int]] = None, random_sequence_length_range: Optional[Tuple[int, int]] = None, **kwargs, ): super().__init__( task=task, normalized_config=normalized_config, batch_size=batch_size, sequence_length=sequence_length, random_batch_size_range=random_batch_size_range, random_sequence_length_range=random_sequence_length_range, ) self.num_key_value_heads_per_layer = normalized_config.num_key_value_heads_per_layer def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): past_key_values = [] for layer_id in range(self.num_layers): shape = ( self.batch_size, self.num_key_value_heads_per_layer[layer_id], self.sequence_length, self.hidden_size // self.num_attention_heads, ) past_key_values.append( ( self.random_float_tensor(shape, framework=framework, dtype=float_dtype), self.random_float_tensor(shape, framework=framework, dtype=float_dtype), ) ) return past_key_values @register_in_tasks_manager("deci", *["text-generation", "text-generation-with-past"], library_name="transformers") class DeciOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, DeciDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = DeciDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return DeciLMModelPatcher(self, model, model_kwargs=model_kwargs) 
@register_in_tasks_manager("clip", *["zero-shot-image-classification"], library_name="open_clip") class OpenCLIPOpenVINOConfig(CLIPOnnxConfig): DEFAULT_ONNX_OPSET = 14 @property def inputs(self) -> Dict[str, Dict[int, str]]: return { "input_ids": {0: "text_batch_size"}, "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"}, "attention_mask": {0: "text_batch_size"}, } @property def outputs(self) -> Dict[str, Dict[int, str]]: return { "text_features": {0: "text_batch_size"}, "image_features": {0: "image_batch_size"}, } def rename_ambiguous_inputs(self, inputs): model_inputs = {} model_inputs["image"] = inputs["pixel_values"] model_inputs["text"] = inputs["input_ids"] return model_inputs def generate_dummy_inputs(self, framework: str = "pt", **kwargs): # override sequence_length shape here in the kwargs kwargs["sequence_length"] = self._config.text_config.context_length return super().generate_dummy_inputs(framework, **kwargs) def generate_dummy_inputs_for_validation( self, reference_model_inputs: Dict[str, Any], onnx_input_names: Optional[List[str]] = None ) -> Dict[str, Any]: if "attention_mask" in reference_model_inputs: reference_model_inputs.pop("attention_mask") if "image" in onnx_input_names and "pixel_values" in reference_model_inputs: reference_model_inputs["image"] = reference_model_inputs.pop("pixel_values") if "text" in onnx_input_names and "input_ids" in reference_model_inputs: reference_model_inputs["text"] = reference_model_inputs.pop("input_ids") return super().generate_dummy_inputs_for_validation(reference_model_inputs) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return ModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="open_clip") class OpenCLIPTextOpenVINOConfig(CLIPTextOnnxConfig): DEFAULT_ONNX_OPSET = 14 @property def inputs(self) -> Dict[str, Dict[int, str]]: return { "input_ids": {0: "text_batch_size"}, "attention_mask": {0: "text_batch_size"}, } @property def outputs(self) -> Dict[str, Dict[int, str]]: return { "text_features": {0: "text_batch_size"}, } def rename_ambiguous_inputs(self, inputs): model_inputs = {} model_inputs["text"] = inputs["input_ids"] # model_inputs["attn_mask"] = inputs["attention_mask"] return model_inputs def generate_dummy_inputs(self, framework: str = "pt", **kwargs): # override sequence_length shape here in the kwargs kwargs["sequence_length"] = self._config.context_length dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs) return dummy_inputs def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return ModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("clip-vision-model", *["feature-extraction"], library_name="open_clip") class OpenCLIPVisualOpenVINOConfig(VisionOnnxConfig): DEFAULT_ONNX_OPSET = 14 NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig @property def inputs(self) -> Dict[str, Dict[int, str]]: return { "pixel_values": {0: "image_batch_size", 1: "num_channels", 2: "height", 3: "width"}, } @property def outputs(self) -> Dict[str, Dict[int, str]]: return { "image_features": {0: "image_batch_size"}, } def rename_ambiguous_inputs(self, inputs): model_inputs = {} model_inputs["x"] = inputs["pixel_values"] return model_inputs @register_in_tasks_manager( "clip", 
*["feature-extraction", "zero-shot-image-classification"], library_name="transformers" ) class CLIPOpenVINOConfig(CLIPOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return ModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="transformers") @register_in_tasks_manager("clip-text-model", *["feature-extraction"], library_name="diffusers") @register_in_tasks_manager("clip-text", *["feature-extraction"], library_name="diffusers") class CLIPTextOpenVINOConfig(CLIPTextOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return ModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("clip-text-with-projection", *["feature-extraction"], library_name="transformers") @register_in_tasks_manager("clip-text-with-projection", *["feature-extraction"], library_name="diffusers") class CLIPTextWithProjectionOpenVINOConfig(CLIPTextWithProjectionOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return ModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("clip-vision-model", *["feature-extraction"], library_name="transformers") class CLIPVisionModelOpenVINOConfig(CLIPVisionModelOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return ModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "ibert", *[ "feature-extraction", "fill-mask", "text-classification", "multiple-choice", "token-classification", "question-answering", ], library_name="transformers", ) class IBertOpenVINOConfig(IBertOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return IBertModelPatcher(self, model, model_kwargs=model_kwargs) class LMInputEmbedsConfigHelper(TextDecoderWithPositionIdsOnnxConfig): def __init__(self, export_config, patcher_cls=None, dummy_input_generator=None, inputs_update=None): self.orig_export_config = export_config if dummy_input_generator is not None: export_config.DUMMY_INPUT_GENERATOR_CLASSES = ( dummy_input_generator, ) + export_config.DUMMY_INPUT_GENERATOR_CLASSES self.DUMMY_INPUT_GENERATOR_CLASSES = export_config.DUMMY_INPUT_GENERATOR_CLASSES self.DEFAULT_ONNX_OPSET = export_config.DEFAULT_ONNX_OPSET self.DUMMY_PKV_GENERATOR_CLASS = export_config.DUMMY_PKV_GENERATOR_CLASS self._config = export_config._config self._normalized_config = export_config._normalized_config self.use_past = export_config.use_past self.patcher_cls = patcher_cls self.input_info_upd = inputs_update def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": model_kwargs = model_kwargs or {} model_kwargs["use_cache"] = True if self.patcher_cls is not None: return self.patcher_cls(self, model, model_kwargs=model_kwargs) # Refer to DecoderModelPatcher. 
return self.orig_export_config.patch_model_for_export(model, model_kwargs=model_kwargs) @property def outputs(self) -> Dict[str, Dict[int, str]]: return self.orig_export_config.outputs @property def inputs(self) -> Dict[str, Dict[int, str]]: orig_inputs = self.orig_export_config.inputs input_ids_config = orig_inputs.pop("input_ids") orig_inputs["inputs_embeds"] = input_ids_config if self.input_info_upd is not None: orig_inputs.update(self.input_info_upd) return orig_inputs def generate_dummy_inputs(self, framework: str = "pt", **kwargs): dummy_inputs = self.orig_export_config.generate_dummy_inputs(framework, **kwargs) input_ids = dummy_inputs.pop("input_ids") inputs_embed_shape = (input_ids.shape[0], input_ids.shape[1], self._normalized_config.hidden_size) inputs_embeds = self.orig_export_config.DUMMY_INPUT_GENERATOR_CLASSES[0].random_float_tensor( inputs_embed_shape ) dummy_inputs["inputs_embeds"] = inputs_embeds if "token_type_ids" in self.inputs: dummy_inputs["token_type_ids"] = self.orig_export_config.DUMMY_INPUT_GENERATOR_CLASSES[ 0 ].random_int_tensor(input_ids.shape, min_value=0, max_value=2) return dummy_inputs class InputEmbedOpenvVINOConfig(TextDecoderOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig @property def inputs(self): return {"input_ids": {0: "batch_size", 1: "sequence_length"}} @property def outputs(self): return {"inputs_embeds": {0: "batch_size", 1: "sequence_length"}} def rename_ambiguous_inputs(self, inputs): model_inputs = {} model_inputs["input"] = inputs["input_ids"] return model_inputs def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": # making 16bit tracable overrides embeedings input signature these changes required to prevent this issue return InputEmbeddingPatcher(self, model, model_kwargs) def get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype): model_type = model_type.replace("_", "-") if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: raise ValueError( f"Unsupported language model type provided `{model_type}`. Please define custom export config" ) if "text-generation-with-past" not in TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]: raise ValueError( f"Export config for text generation for `{model_type}` is not available. 
Please define custom export config" ) export_config_class = TasksManager._SUPPORTED_MODEL_TYPE[model_type]["openvino"]["text-generation-with-past"] export_config = export_config_class( model_config, use_past=True, use_past_in_inputs=True, int_dtype=int_dtype, float_dtype=float_dtype, ) return export_config def get_vlm_text_embeddings_config(model_type, model_config, int_dtype, float_dtype): internal_export_config = get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype) InputEmbedOpenvVINOConfig.NORMALIZED_CONFIG_CLASS = internal_export_config.NORMALIZED_CONFIG_CLASS export_config = InputEmbedOpenvVINOConfig( model_config, task="feature-extraction", int_dtype=int_dtype, float_dtype=float_dtype, ) return export_config def get_vlm_text_generation_config( model_type, model_config, int_dtype, float_dtype, model_patcher=None, dummy_input_generator=None, inputs_update=None, ): internal_export_config = get_vlm_internal_text_generation_config(model_type, model_config, int_dtype, float_dtype) export_config = LMInputEmbedsConfigHelper( internal_export_config, patcher_cls=model_patcher, dummy_input_generator=dummy_input_generator, inputs_update=inputs_update, ) export_config._normalized_config = internal_export_config._normalized_config return export_config class VLMConfigBehavior(str, enum.Enum): VISION_EMBEDDINGS = "vision_embeddings" TEXT_EMBEDDINGS = "text_embeddings" LANGUAGE = "language" class BaseVLMOpenVINOConfig(OnnxConfig): SUPPORTED_BEHAVIORS = [model_type.value for model_type in VLMConfigBehavior] NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,) SUPPORTS_PAST = True def __init__( self, config: "PretrainedConfig", task: str = "feature-extraction", int_dtype: str = "int64", float_dtype: str = "fp32", behavior: VLMConfigBehavior = VLMConfigBehavior.VISION_EMBEDDINGS, preprocessors: Optional[List[Any]] = None, **kwargs, ): super().__init__( config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors, ) self._behavior = behavior @property def inputs(self) -> Dict[str, Dict[int, str]]: if not self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return {} return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}} @property def outputs(self) -> Dict[str, Dict[int, str]]: if not self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return {} return {"last_hidden_state": {0: "batch_size"}} def with_behavior( self, behavior: Union[str, VLMConfigBehavior], ): """ Creates a config for different behaviour. Args: behavior ([`ConfigBehavior`]): The behavior to use for the new instance. 
""" if isinstance(behavior, str) and not isinstance(behavior, VLMConfigBehavior): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: model_type = self._orig_config.text_config.model_type return get_vlm_text_embeddings_config( model_type, self._orig_config.text_config, self.int_dtype, self.float_dtype ) if behavior == VLMConfigBehavior.LANGUAGE: model_type = self._orig_config.text_config.model_type return get_vlm_text_generation_config( model_type, self._orig_config.text_config, self.int_dtype, self.float_dtype ) if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return self.__class__( self._orig_config, task=self.task, int_dtype=self.int_dtype, float_dtype=self.float_dtype, behavior=behavior, preprocessors=self._preprocessors, ) def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]): if isinstance(behavior, str) and not isinstance(behavior, VLMConfigBehavior): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: return model.language_model if not hasattr(model, "lm_head") else model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.get_input_embeddings() text_embedding.config = model.language_model.config return text_embedding def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: return super().patch_model_for_export(model, model_kwargs) return CommonImageEmbeddingsModelPatcher(self, model, model_kwargs) @register_in_tasks_manager("llava", *["image-text-to-text"], library_name="transformers") class LlavaOpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = version.parse("4.37.2") def __init__( self, config: "PretrainedConfig", task: str = "feature-extraction", int_dtype: str = "int64", float_dtype: str = "fp32", behavior: VLMConfigBehavior = VLMConfigBehavior.VISION_EMBEDDINGS, preprocessors: Optional[List[Any]] = None, **kwargs, ): super().__init__( config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors, ) self._orig_config = config if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): self._config = config.vision_config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: return super().patch_model_for_export(model, model_kwargs) return LlavaImageEmbeddingModelPatcher(self, model, model_kwargs) def generate_dummy_inputs(self, framework: str = "pt", **kwargs) -> Dict: if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS and self._config.model_type == "pixtral": kwargs["batch_size"] = 1 return super().generate_dummy_inputs(framework, **kwargs) @register_in_tasks_manager("llava-next", *["image-text-to-text"], library_name="transformers") class LlavaNextOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = version.parse("4.40.0") class DummyLLavaMultiModalProjectorInputGenerator(DummyInputGenerator): SUPPORTED_INPUT_NAMES = ["image_features"] def __init__( self, task: str, normalized_config: NormalizedTextConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], random_batch_size_range: 
Optional[Tuple[int, int]] = None, **kwargs, ): self.task = task self.batch_size = batch_size self.hidden_size = normalized_config.hidden_size self.num_patches = (normalized_config.image_size // normalized_config.patch_size) ** 2 self.normalized_config = normalized_config def generate( self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32", ): shape = [self.batch_size, self.num_patches, self.hidden_size] return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) class LLavaMultimodalProjectorOpenVINOConfig(OnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyLLavaMultiModalProjectorInputGenerator,) NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig @property def inputs(self) -> Dict[str, Dict[int, str]]: return {"image_features": {0: "batch_size", 1: "sequence_length"}} @property def outputs(self) -> Dict[str, Dict[int, str]]: return {"hidden_states": {0: "batch_size", 1: "sequence_length"}} class LlavaNextVideoConfigBehavior(str, enum.Enum): LANGUAGE = "language" VISION_EMBEDDINGS = "vision_embeddings" VISION_RESAMPLER = "vision_resampler" MULTI_MODAL_PROJECTOR = "multi_modal_projector" TEXT_EMBEDDINGS = "text_embeddings" @register_in_tasks_manager( "llava-next-video", *["image-text-to-text", "video-text-to-text"], library_name="transformers" ) class LlavaNextVideoOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = version.parse("4.42.0") SUPPORTED_BEHAVIORS = [model_type.value for model_type in LlavaNextVideoConfigBehavior] def with_behavior( self, behavior: Union[str, LlavaNextVideoConfigBehavior], ): """ Creates a config for different behaviour. Args: behavior ([`ConfigBehavior`]): The behavior to use for the new instance. """ if isinstance(behavior, str) and not isinstance(behavior, LlavaNextVideoConfigBehavior): behavior = LlavaNextVideoConfigBehavior(behavior) if behavior == LlavaNextVideoConfigBehavior.MULTI_MODAL_PROJECTOR: export_config = LLavaMultimodalProjectorOpenVINOConfig( self._orig_config.vision_config, task="feature-extraction", int_dtype=self.int_dtype, float_dtype=self.float_dtype, ) return export_config if behavior == LlavaNextVideoConfigBehavior.VISION_RESAMPLER: export_config = LLavaMultimodalProjectorOpenVINOConfig( self._orig_config.vision_config, task="feature-extraction", int_dtype=self.int_dtype, float_dtype=self.float_dtype, ) return export_config return super().with_behavior(behavior) def get_model_for_behavior(self, model, behavior: Union[str, LlavaNextVideoConfigBehavior]): if isinstance(behavior, str) and not isinstance(behavior, LlavaNextVideoConfigBehavior): behavior = LlavaNextVideoConfigBehavior(behavior) if behavior == LlavaNextVideoConfigBehavior.MULTI_MODAL_PROJECTOR: return ( model.multi_modal_projector if hasattr(model, "multi_model_projector") else model.model.multi_modal_projector ) if behavior == LlavaNextVideoConfigBehavior.VISION_RESAMPLER: return model.vision_resampler if hasattr(model, "vision_resampler") else model.model.vision_resampler return super().get_model_for_behavior(model, behavior) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if self._behavior != LlavaNextVideoConfigBehavior.VISION_EMBEDDINGS: return super().patch_model_for_export(model, model_kwargs) return LlavaNextVideoImageEmbeddingModelPatcher(self, model, model_kwargs) @register_in_tasks_manager( "maira2", *["image-text-to-text", "text-generation", "text-generation-with-past"], 
library_name="transformers" ) class MairaOpenVINOConfig(LlavaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = version.parse("4.46.0") SUPPORTS_PAST = True def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: return super().patch_model_for_export(model, model_kwargs) return MairaImageEmbeddingModelPatcher(self, model, model_kwargs) def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]): if isinstance(behavior, str) and not isinstance(behavior, VLMConfigBehavior): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.language_model.get_input_embeddings() text_embedding.config = model.language_model.config return text_embedding return super().get_model_for_behavior(model, behavior) @register_in_tasks_manager("internvl-chat", *["image-text-to-text"], library_name="transformers") class InternVLChatOpenVINOConfig(BaseVLMOpenVINOConfig): def __init__( self, config: "PretrainedConfig", task: str = "feature-extraction", int_dtype: str = "int64", float_dtype: str = "fp32", behavior: VLMConfigBehavior = VLMConfigBehavior.VISION_EMBEDDINGS, preprocessors: Optional[List[Any]] = None, ): super().__init__( config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors, ) self._behavior = behavior self._orig_config = config if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): self._config = config.vision_config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) def with_behavior( self, behavior: Union[str, VLMConfigBehavior], ): """ Creates a config for different behaviour. Args: behavior ([`ConfigBehavior`]): The behavior to use for the new instance. 
""" if isinstance(behavior, str) and not isinstance(behavior, VLMConfigBehavior): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: model_type = self._orig_config.llm_config.model_type return get_vlm_text_embeddings_config( model_type, self._orig_config.llm_config, self.int_dtype, self.float_dtype ) if behavior == VLMConfigBehavior.LANGUAGE: model_type = self._orig_config.llm_config.model_type return get_vlm_text_generation_config( model_type, self._orig_config.llm_config, self.int_dtype, self.float_dtype, InternVL2ChatLangModelPatcher, ) if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return self.__class__( self._orig_config, task=self.task, int_dtype=self.int_dtype, float_dtype=self.float_dtype, behavior=behavior, preprocessors=self._preprocessors, ) @staticmethod def get_model_for_behavior(model, behavior: Union[str, VLMConfigBehavior]): if isinstance(behavior, str) and not isinstance(behavior, VLMConfigBehavior): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: return model.language_model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.language_model.get_input_embeddings() text_embedding.config = model.language_model.config return text_embedding def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: return super().patch_model_for_export(model, model_kwargs) return InternVLChatImageEmbeddingModelPatcher(self, model, model_kwargs) @register_in_tasks_manager( "llava-qwen2", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers" ) class LlavaQwen2OpenVINOConfig(BaseVLMOpenVINOConfig): SUPPORTS_PAST = True MIN_TRANSFORMERS_VERSION = version.parse("4.40.0") def __init__( self, config: "PretrainedConfig", task: str = "feature-extraction", int_dtype: str = "int64", float_dtype: str = "fp32", behavior: VLMConfigBehavior = VLMConfigBehavior.VISION_EMBEDDINGS, preprocessors: Optional[List[Any]] = None, use_past: bool = False, ): self._behavior = behavior self._orig_config = config if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS: config = AutoConfig.from_pretrained(config.mm_vision_tower, trust_remote_code=True) if hasattr(config, "vision_config"): config = config.vision_config super().__init__( config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors, ) @property def inputs(self) -> Dict[str, Dict[int, str]]: if not self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return {} return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}} @property def outputs(self) -> Dict[str, Dict[int, str]]: if not self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return {} return {"last_hidden_state": {0: "batch_size"}} @staticmethod def get_model_for_behavior(model, behavior: Union[str, VLMConfigBehavior]): if isinstance(behavior, str) and not isinstance(behavior, VLMConfigBehavior): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: model.forward = super(type(model), model).forward return model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.model.embed_tokens text_embedding.config = model.model.config return text_embedding def with_behavior( 
self, behavior: Union[str, VLMConfigBehavior], ): """ Creates a config for different behaviour. Args: behavior ([`ConfigBehavior`]): The behavior to use for the new instance. """ if isinstance(behavior, str) and not isinstance(behavior, VLMConfigBehavior): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: model_type = self._orig_config.model_type.replace("llava-", "") return get_vlm_text_embeddings_config(model_type, self._orig_config, self.int_dtype, self.float_dtype) if behavior == VLMConfigBehavior.LANGUAGE: model_type = self._orig_config.model_type.replace("llava-", "") return get_vlm_text_generation_config(model_type, self._orig_config, self.int_dtype, self.float_dtype) if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return self.__class__( self._orig_config, task=self.task, int_dtype=self.int_dtype, float_dtype=self.float_dtype, behavior=behavior, preprocessors=self._preprocessors, ) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: return super().patch_model_for_export(model, model_kwargs) return LlavaQwen2ImageEmbeddingsModelPatcher(self, model, model_kwargs) def rename_ambiguous_inputs(self, inputs): if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS: model_inputs = {} model_inputs["images"] = inputs["pixel_values"] return model_inputs return super().rename_ambiguous_inputs(inputs) class PooledProjectionsDummyInputGenerator(DummyInputGenerator): SUPPORTED_INPUT_NAMES = ["pooled_projections"] def __init__( self, task: str, normalized_config: NormalizedConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], random_batch_size_range: Optional[Tuple[int, int]] = None, **kwargs, ): self.task = task self.batch_size = batch_size self.pooled_projection_dim = normalized_config.config.pooled_projection_dim def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): shape = [self.batch_size, self.pooled_projection_dim] return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) class DummyTransformerTimestpsInputGenerator(DummyTimestepInputGenerator): SUPPORTED_INPUT_NAMES = ("timestep", "text_embeds", "time_ids", "timestep_cond", "guidance") def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name in ["timestep", "guidance"]: shape = [self.batch_size] return self.random_float_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) return super().generate(input_name, framework, int_dtype, float_dtype) class DummyUnetVisionInputGenerator(DummyVisionInputGenerator): def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name not in ["sample", "latent_sample"]: return super().generate(input_name, framework, int_dtype, float_dtype) # add height and width discount for enable any resolution generation return self.random_float_tensor( shape=[self.batch_size, self.num_channels, self.height - 1, self.width - 1], framework=framework, dtype=float_dtype, ) class DummyUnetTimestepInputGenerator(DummyTimestepInputGenerator): def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name != "timestep": return super().generate(input_name, framework, int_dtype, float_dtype) shape = 
[self.batch_size] return self.random_int_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=int_dtype) class DummySanaTimestepInputGenerator(DummyTimestepInputGenerator): def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name != "timestep": return super().generate(input_name, framework, int_dtype, float_dtype) shape = [self.batch_size] return self.random_int_tensor(shape, max_value=self.vocab_size, framework=framework, dtype=float_dtype) class DummyUnetEncoderInputGenerator(DummySeq2SeqDecoderTextInputGenerator): def __init__( self, task: str, normalized_config: NormalizedTextConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], num_choices: int = DEFAULT_DUMMY_SHAPES["num_choices"], random_batch_size_range: Optional[Tuple[int, int]] = None, random_sequence_length_range: Optional[Tuple[int, int]] = None, random_num_choices_range: Optional[Tuple[int, int]] = None, **kwargs, ): super().__init__( task, normalized_config, batch_size=batch_size, sequence_length=sequence_length, num_choices=num_choices, random_batch_size_range=random_batch_size_range, random_sequence_length_range=random_sequence_length_range, random_num_choices_range=random_num_choices_range, **kwargs, ) if hasattr(normalized_config.config, "model_max_length"): self.sequence_length = normalized_config.config.model_max_length @register_in_tasks_manager("unet", *["semantic-segmentation"], library_name="diffusers") @register_in_tasks_manager("unet-2d-condition", *["semantic-segmentation"], library_name="diffusers") class UNetOpenVINOConfig(UNetOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = ( DummyUnetVisionInputGenerator, DummyUnetTimestepInputGenerator, DummyUnetEncoderInputGenerator, ) @property def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = super().inputs common_inputs["timestep"] = {0: "batch_size"} if hasattr(self._normalized_config.config, "model_max_length"): common_inputs["encoder_hidden_states"] = {0: "batch_size"} return common_inputs @register_in_tasks_manager("sd3-transformer", *["semantic-segmentation"], library_name="diffusers") @register_in_tasks_manager("sd3-transformer-2d", *["semantic-segmentation"], library_name="diffusers") class SD3TransformerOpenVINOConfig(UNetOpenVINOConfig): DUMMY_INPUT_GENERATOR_CLASSES = ( (DummyTransformerTimestpsInputGenerator,) + UNetOpenVINOConfig.DUMMY_INPUT_GENERATOR_CLASSES + (PooledProjectionsDummyInputGenerator,) ) NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( image_size="sample_size", num_channels="in_channels", hidden_size="joint_attention_dim", vocab_size="attention_head_dim", allow_new=True, ) @property def inputs(self): common_inputs = super().inputs common_inputs["pooled_projections"] = {0: "batch_size"} return common_inputs def rename_ambiguous_inputs(self, inputs): # The input name in the model signature is `x, hence the export input name is updated. 
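        # Illustrative example (hypothetical tensor names): a dummy input dict such as
        # {"sample": latents, "timestep": t, "encoder_hidden_states": txt, "pooled_projections": pooled}
        # has its "sample" entry re-keyed to "hidden_states"; all other entries pass through unchanged.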
hidden_states = inputs.pop("sample", None) if hidden_states is not None: inputs["hidden_states"] = hidden_states return inputs @register_in_tasks_manager("t5-encoder-model", *["feature-extraction"], library_name="diffusers") @register_in_tasks_manager("t5-encoder", *["feature-extraction"], library_name="diffusers") class T5EncoderOpenVINOConfig(CLIPTextOpenVINOConfig): pass @register_in_tasks_manager("gemma2-text-encoder", *["feature-extraction"], library_name="diffusers") class Gemma2TextEncoderOpenVINOConfig(CLIPTextOpenVINOConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: return { "input_ids": {0: "batch_size", 1: "sequence_length"}, "attention_mask": {0: "batch_size", 1: "sequence_length"}, } def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return SanaTextEncoderModelPatcher(self, model, model_kwargs) class DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator(DummySeq2SeqDecoderTextInputGenerator): SUPPORTED_INPUT_NAMES = ( "decoder_input_ids", "decoder_attention_mask", "encoder_outputs", "encoder_hidden_states", "encoder_attention_mask", ) class DummySanaTransformerVisionInputGenerator(DummyUnetVisionInputGenerator): SUPPORTED_INPUT_NAMES = ( "pixel_values", "pixel_mask", "sample", "latent_sample", "guidance", ) def __init__( self, task: str, normalized_config: NormalizedVisionConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], width: int = DEFAULT_DUMMY_SHAPES["width"] // 8, height: int = DEFAULT_DUMMY_SHAPES["height"] // 8, # Reduce img shape by 4 for FLUX to reduce memory usage on conversion **kwargs, ): super().__init__(task, normalized_config, batch_size, num_channels, width=width, height=height, **kwargs) def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "guidance": return self.random_float_tensor([self.batch_size], framework=framework, dtype=float_dtype) return super().generate(input_name, framework, int_dtype, float_dtype) @register_in_tasks_manager("sana-transformer", *["semantic-segmentation"], library_name="diffusers") class SanaTransformerOpenVINOConfig(UNetOpenVINOConfig): NORMALIZED_CONFIG_CLASS = NormalizedConfig.with_args( image_size="sample_size", num_channels="in_channels", hidden_size="caption_channels", vocab_size="attention_head_dim", allow_new=True, ) DUMMY_INPUT_GENERATOR_CLASSES = ( DummySanaTransformerVisionInputGenerator, DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator, DummySanaTimestepInputGenerator, ) @property def inputs(self): common_inputs = super().inputs common_inputs["encoder_attention_mask"] = {0: "batch_size", 1: "sequence_length"} if getattr(self._normalized_config.config, "guidance_embeds", False): common_inputs["guidance"] = {0: "batch_size"} return common_inputs def rename_ambiguous_inputs(self, inputs): # The input name in the model signature is `x, hence the export input name is updated. 
hidden_states = inputs.pop("sample", None) if hidden_states is not None: inputs["hidden_states"] = hidden_states return inputs @register_in_tasks_manager("dcae-encoder", *["semantic-segmentation"], library_name="diffusers") class DcaeEncoderOpenVINOConfig(VaeEncoderOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: return { "sample": {0: "batch_size", 2: "height", 3: "width"}, } @property def outputs(self) -> Dict[str, Dict[int, str]]: return { "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, } @register_in_tasks_manager("dcae-decoder", *["semantic-segmentation"], library_name="diffusers") class DcaeDecoderOpenVINOConfig(VaeDecoderOnnxConfig): @property def inputs(self) -> Dict[str, Dict[int, str]]: return { "latent_sample": {0: "batch_size", 2: "height_latent", 3: "width_latent"}, } @property def outputs(self) -> Dict[str, Dict[int, str]]: return { "sample": {0: "batch_size", 2: "height", 3: "width"}, } class DummyFluxTransformerInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ( "pixel_values", "pixel_mask", "sample", "latent_sample", "hidden_states", "img_ids", ) def __init__( self, task: str, normalized_config: NormalizedVisionConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], width: int = DEFAULT_DUMMY_SHAPES["width"] // 4, height: int = DEFAULT_DUMMY_SHAPES["height"] // 4, # Reduce img shape by 4 for FLUX to reduce memory usage on conversion **kwargs, ): super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs) if getattr(normalized_config, "in_channels", None): self.num_channels = normalized_config.in_channels // 4 def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name in ["hidden_states", "sample"]: shape = [self.batch_size, (self.height // 2) * (self.width // 2), self.num_channels * 4] return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) if input_name == "img_ids": img_ids_height = self.height // 2 img_ids_width = self.width // 2 return self.random_int_tensor( ( [self.batch_size, img_ids_height * img_ids_width, 3] if is_diffusers_version("<", "0.31.0") else [img_ids_height * img_ids_width, 3] ), min_value=0, max_value=min(img_ids_height, img_ids_width), framework=framework, dtype=float_dtype, ) return super().generate(input_name, framework, int_dtype, float_dtype) class DummyFluxTextInputGenerator(DummySeq2SeqDecoderTextInputGenerator): SUPPORTED_INPUT_NAMES = ( "decoder_input_ids", "decoder_attention_mask", "encoder_outputs", "encoder_hidden_states", "txt_ids", ) def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "txt_ids": import torch shape = ( [self.batch_size, self.sequence_length, 3] if is_diffusers_version("<", "0.31.0") else [self.sequence_length, 3] ) dtype = DTYPE_MAPPER.pt(float_dtype) return torch.full(shape, 0, dtype=dtype) return super().generate(input_name, framework, int_dtype, float_dtype) @register_in_tasks_manager("flux-transformer", *["semantic-segmentation"], library_name="diffusers") @register_in_tasks_manager("flux-transformer-2d", *["semantic-segmentation"], library_name="diffusers") class FluxTransformerOpenVINOConfig(SD3TransformerOpenVINOConfig): DUMMY_INPUT_GENERATOR_CLASSES = ( DummyTransformerTimestpsInputGenerator, DummyFluxTransformerInputGenerator, DummyFluxTextInputGenerator, 
PooledProjectionsDummyInputGenerator, ) @property def inputs(self): common_inputs = super().inputs common_inputs.pop("sample", None) common_inputs["hidden_states"] = {0: "batch_size", 1: "packed_height_width"} common_inputs["txt_ids"] = ( {0: "batch_size", 1: "sequence_length"} if is_diffusers_version("<", "0.31.0") else {0: "sequence_length"} ) common_inputs["img_ids"] = ( {0: "batch_size", 1: "packed_height_width"} if is_diffusers_version("<", "0.31.0") else {0: "packed_height_width"} ) if getattr(self._normalized_config, "guidance_embeds", False): common_inputs["guidance"] = {0: "batch_size"} return common_inputs def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return FluxTransfromerModelPatcher(self, model, model_kwargs=model_kwargs) class LTXVaeDummyInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ("pixel_values", "pixel_mask", "sample", "latent_sample", "timestep") def __init__( self, task: str, normalized_config: NormalizedVisionConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], width: int = DEFAULT_DUMMY_SHAPES["width"], height: int = DEFAULT_DUMMY_SHAPES["height"], num_frames: int = 2, **kwargs, ): super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs) self.num_frames = num_frames def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name in ["sample", "latent_sample"]: return self.random_float_tensor( [self.batch_size, self.num_channels, self.num_frames, self.height, self.width] ) if input_name == "timestep": return self.random_int_tensor([1], max_value=20, min_value=1, framework=framework, dtype=int_dtype) return super().generate(input_name, framework, int_dtype, float_dtype) @register_in_tasks_manager("ltx-vae-encoder", *["semantic-segmentation"], library_name="diffusers") class LTXVaeEncoderOpenVINOConfig(VaeEncoderOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (LTXVaeDummyInputGenerator,) @property def inputs(self) -> Dict[str, Dict[int, str]]: return { "sample": {0: "batch_size", 2: "num_frames", 3: "height", 4: "width"}, } @property def outputs(self) -> Dict[str, Dict[int, str]]: return { "latent_parameters": {0: "batch_size", 2: "num_frames", 3: "height_latent", 4: "width_latent"}, } @register_in_tasks_manager("ltx-vae-decoder", *["semantic-segmentation"], library_name="diffusers") class LTXVaeDecoderOpenVINOConfig(VaeDecoderOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (LTXVaeDummyInputGenerator,) @property def inputs(self) -> Dict[str, Dict[int, str]]: base_input = { "latent_sample": {0: "batch_size", 2: "num_frames", 3: "latent_height", 4: "latent_width"}, } if self._normalized_config.config.timestep_conditioning: base_input["timestep"] = {} return base_input @property def outputs(self) -> Dict[str, Dict[int, str]]: return { "sample": {0: "batch_size", 2: "num_frames", 3: "height", 4: "width"}, } class LTXTransformerDummyInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ("hidden_states", "width", "height", "num_frames", "rope_interpolation_scale") def __init__( self, task: str, normalized_config: NormalizedVisionConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], width: int = 16, height: int = 8, num_frames: int = 2, frame_rate: int = 10, **kwargs, ): super().__init__(task, normalized_config, 
batch_size, num_channels, width, height, **kwargs) self.num_frames = num_frames self.frame_rate = frame_rate self.vae_spatial_compression_ratio = normalized_config.config.vae_spatial_compression_ratio self.vae_temporal_compression_ratio = normalized_config.config.vae_temporal_compression_ratio def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): import torch if input_name == "hidden_states": return self.random_float_tensor( [self.batch_size, self.num_frames * self.height * self.width, self.num_channels] ) if input_name == "width": return torch.tensor(self.width) if input_name == "height": return torch.tensor(self.height) if input_name == "num_frames": return torch.tensor(self.num_frames) if input_name == "rope_interpolation_scale": import torch return torch.tensor( [ self.vae_temporal_compression_ratio / self.frame_rate, self.vae_spatial_compression_ratio, self.vae_spatial_compression_ratio, ] ) return super().generate(input_name, framework, int_dtype, float_dtype) @register_in_tasks_manager("ltx-video-transformer", *["semantic-segmentation"], library_name="diffusers") class LTXVideoTransformerOpenVINOConfig(SanaTransformerOpenVINOConfig): DUMMY_INPUT_GENERATOR_CLASSES = ( LTXTransformerDummyInputGenerator, DummySanaSeq2SeqDecoderTextWithEncMaskInputGenerator, DummySanaTimestepInputGenerator, ) @property def inputs(self): return { "hidden_states": {0: "batch_size", 1: "video_sequence_length"}, "encoder_hidden_states": {0: "batch_size", 1: "sequence_length"}, "encoder_attention_mask": {0: "batch_size", 1: "sequence_length"}, "width": {}, "height": {}, "num_frames": {}, "timestep": {0: "batch_size"}, "rope_interpolation_scale": {}, } @property def outputs(self) -> Dict[str, Dict[int, str]]: return { "out_sample": {0: "batch_size", 1: "video_sequence_length"}, } class DummyMiniCPMVImageInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ("pixel_values", "patch_attention_mask", "position_ids") def __init__( self, task: str, normalized_config: NormalizedVisionConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], width: int = DEFAULT_DUMMY_SHAPES["width"], height: int = DEFAULT_DUMMY_SHAPES["height"], **kwargs, ): super().__init__(task, normalized_config, batch_size, num_channels, width, height) self.patch_size = normalized_config.config.patch_size def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "pixel_values": return self.random_float_tensor( shape=[ self.batch_size, self.num_channels, self.patch_size, (self.height * self.width) // self.patch_size, ], framework=framework, dtype=float_dtype, ) if input_name == "patch_attention_mask": return self.random_int_tensor( shape=[self.batch_size, 1, (self.height // self.patch_size) * (self.width // self.patch_size)], framework=framework, dtype=float_dtype, min_value=0, max_value=2, ) if input_name == "position_ids": return self.random_int_tensor( shape=[self.batch_size, (self.height // self.patch_size) * (self.width // self.patch_size)], max_value=self.patch_size, ) class DummyMiniCPMVResampleInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ("image_feature", "pos_embed", "key_padding_mask") def __init__( self, task: str, normalized_config: NormalizedVisionConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], width: int = DEFAULT_DUMMY_SHAPES["width"], 
height: int = DEFAULT_DUMMY_SHAPES["height"], **kwargs, ): super().__init__(task, normalized_config, batch_size, num_channels, width, height) self.patch_size = normalized_config.config.patch_size self.hidden_size = normalized_config.config.hidden_size self.img_hidden_size = normalized_config.config.vision_config.hidden_size self.feat_size = (normalized_config.config.vision_config.image_size // self.patch_size) * ( normalized_config.config.vision_config.image_size // self.patch_size ) def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "image_feature": return self.random_float_tensor( shape=[self.batch_size, self.feat_size, self.img_hidden_size], framework=framework, dtype=float_dtype ) if input_name == "key_padding_mask": return self.constant_tensor( shape=[self.batch_size, self.feat_size], framework=framework, value=1, dtype=DTYPE_MAPPER.pt(float_dtype), ) if input_name == "pos_embed": return self.random_float_tensor(shape=[self.feat_size, self.batch_size, self.hidden_size]) class MiniCPMVConfigBehavior(str, enum.Enum): RESAMPLER = "resampler" LANGUAGE = "language" VISION_EMBEDDINGS = "vision_embeddings" TEXT_EMBEDDINGS = "text_embeddings" @register_in_tasks_manager("minicpmv", *["image-text-to-text"], library_name="transformers") class MiniCPMVOpenVINOConfig(BaseVLMOpenVINOConfig): SUPPORTED_BEHAVIORS = [model_type.value for model_type in MiniCPMVConfigBehavior] NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig DUMMY_INPUT_GENERATOR_CLASSES = () def __init__( self, config: "PretrainedConfig", task: str = "feature-extraction", int_dtype: str = "int64", float_dtype: str = "fp32", behavior: MiniCPMVConfigBehavior = MiniCPMVConfigBehavior.VISION_EMBEDDINGS, preprocessors: Optional[List[Any]] = None, ): super().__init__( config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors, ) self._behavior = behavior self._orig_config = config if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): self._config = config.vision_config self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVImageInputGenerator,) if self._behavior == MiniCPMVConfigBehavior.RESAMPLER: self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyMiniCPMVResampleInputGenerator,) self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) @property def inputs(self) -> Dict[str, Dict[int, str]]: if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: return { "pixel_values": {0: "batch_size", 2: "height", 3: "width"}, "patch_attention_mask": {0: "batch_size", 1: "num_patches", 2: "patch_size"}, "position_ids": {0: "batch_size", 1: "patch_size"}, } if self._behavior == MiniCPMVConfigBehavior.RESAMPLER: return { "image_feature": {0: "batch_size", 1: "patch_height", 2: "patch_width"}, "pos_embed": {0: "patch_size", 1: "batch_size", 2: "num_patches"}, "key_padding_mask": {0: "batch_size", 1: "patch_size"}, } return {} @property def outputs(self) -> Dict[str, Dict[int, str]]: if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: return {"last_hidden_state": {0: "batch_size", 1: "patch_height", 2: "patch_width"}} if self._behavior == MiniCPMVConfigBehavior.RESAMPLER: return {"last_hidden_state": {0: "batch_size"}} return {} def with_behavior( self, behavior: Union[str, MiniCPMVConfigBehavior], ): """ Creates a config for different behaviour. Args: behavior ([`ConfigBehavior`]): The behavior to use for the new instance. 
""" if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior): behavior = MiniCPMVConfigBehavior(behavior) if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS: return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) if behavior == MiniCPMVConfigBehavior.LANGUAGE: return get_vlm_text_generation_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: return self.__class__( self._orig_config, task=self.task, int_dtype=self.int_dtype, float_dtype=self.float_dtype, behavior=behavior, preprocessors=self._preprocessors, ) if behavior == MiniCPMVConfigBehavior.RESAMPLER: return self.__class__( self._orig_config, task=self.task, int_dtype=self.int_dtype, float_dtype=self.float_dtype, behavior=behavior, preprocessors=self._preprocessors, ) @staticmethod def get_model_for_behavior(model, behavior: Union[str, MiniCPMVConfigBehavior]): if isinstance(behavior, str) and not isinstance(behavior, MiniCPMVConfigBehavior): behavior = MiniCPMVConfigBehavior(behavior) if behavior == MiniCPMVConfigBehavior.LANGUAGE: return model.llm if behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: return model.vpm if behavior == MiniCPMVConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.get_input_embeddings() text_embedding.config = model.llm.config return text_embedding if behavior == MiniCPMVConfigBehavior.RESAMPLER: model.resampler.config = model.vpm.config return model.resampler def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if self._behavior == MiniCPMVConfigBehavior.VISION_EMBEDDINGS: return MiniCPMVImageEmbeddingsModelPatcher(self, model, model_kwargs) if self._behavior == MiniCPMVConfigBehavior.RESAMPLER: return MiniCPMVResamplerModelPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) class Phi3VisionConfigBehavior(str, enum.Enum): LANGUAGE = "language" VISION_PROJECTION = "vision_projection" VISION_EMBEDDINGS = "vision_embeddings" TEXT_EMBEDDINGS = "text_embeddings" class DummyPhi3VisionProjectionInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ("input",) def __init__( self, task: str, normalized_config: NormalizedVisionConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], width: int = 336, height: int = 336, crop_size=336, **kwargs, ): self.batch_size = batch_size self._embed_layer_realization = ( normalized_config.config.embd_layer["embedding_cls"] if hasattr(normalized_config.config, "embd_layer") else "image_audio" ) if not hasattr(normalized_config.config, "vision_config"): self.image_dim_out = ( normalized_config.config.img_processor.get( "image_dim_out", normalized_config.config.img_processor.get("hidden_size") ) if normalized_config.config.img_processor is not None else 1152 ) if "image_embd_layer" in normalized_config.config.embd_layer: self.crop_size = normalized_config.config.embd_layer["image_embd_layer"].get("crop_size", crop_size) else: self.crop_size = normalized_config.config.embd_layer.get("crop_size", crop_size) else: self.image_dim_out = normalized_config.config.vision_config.hidden_size self.crop_size = normalized_config.config.vision_config.crop_size self.height = height self.width = width def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): h = 
self.height // self.crop_size w = self.width // self.crop_size feat_size = (h * w + 1) * 144 + 1 + (h + 1) * 12 if self._embed_layer_realization in ["linear", "image_audio"]: shape = [self.batch_size, feat_size, self.image_dim_out] else: shape = [self.batch_size, feat_size, self.image_dim_out * 4] return self.random_float_tensor(shape, framework=framework, dtype=float_dtype) @register_in_tasks_manager("phi3-v", *["image-text-to-text"], library_name="transformers") class Phi3VisionOpenVINOConfig(BaseVLMOpenVINOConfig): SUPPORTED_BEHAVIORS = [model_type.value for model_type in Phi3VisionConfigBehavior] NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,) MIN_TRANSFORMERS_VERSION = version.parse("4.40.0") def __init__( self, config: "PretrainedConfig", task: str = "feature-extraction", int_dtype: str = "int64", float_dtype: str = "fp32", behavior: Phi3VisionConfigBehavior = Phi3VisionConfigBehavior.VISION_EMBEDDINGS, preprocessors: Optional[List[Any]] = None, ): super().__init__( config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors, ) self._behavior = behavior self._orig_config = config if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "img_processor"): self._config = AutoConfig.from_pretrained( config.img_processor["model_name"], trust_remote_code=True ).vision_config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,) if self._behavior == Phi3VisionConfigBehavior.VISION_PROJECTION and hasattr(config, "img_processor"): self._config = config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyPhi3VisionProjectionInputGenerator,) @property def inputs(self) -> Dict[str, Dict[int, str]]: if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS: return {"pixel_values": {0: "batch_size", 2: "height", 3: "width"}} if self._behavior == Phi3VisionConfigBehavior.VISION_PROJECTION: return {"input": {0: "batch_size", 1: "img_feat_size"}} @property def outputs(self) -> Dict[str, Dict[int, str]]: if self._behavior in [Phi3VisionConfigBehavior.VISION_EMBEDDINGS, Phi3VisionConfigBehavior.VISION_PROJECTION]: return {"last_hidden_state": {0: "batch_size", 1: "height_width_projection"}} return {} def with_behavior( self, behavior: Union[str, Phi3VisionConfigBehavior], ): """ Creates a config for different behaviour. Args: behavior ([`ConfigBehavior`]): The behavior to use for the new instance. 
""" if isinstance(behavior, str) and not isinstance(behavior, Phi3VisionConfigBehavior): behavior = Phi3VisionConfigBehavior(behavior) if behavior == Phi3VisionConfigBehavior.TEXT_EMBEDDINGS: return get_vlm_text_embeddings_config("phi3", self._orig_config, self.int_dtype, self.float_dtype) if behavior == Phi3VisionConfigBehavior.LANGUAGE: return get_vlm_text_generation_config("phi3", self._orig_config, self.int_dtype, self.float_dtype) if behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS: return self.__class__( self._orig_config, task=self.task, int_dtype=self.int_dtype, float_dtype=self.float_dtype, behavior=behavior, preprocessors=self._preprocessors, ) if behavior == Phi3VisionConfigBehavior.VISION_PROJECTION: return self.__class__( self._orig_config, task=self.task, int_dtype=self.int_dtype, float_dtype=self.float_dtype, behavior=behavior, preprocessors=self._preprocessors, ) @staticmethod def get_model_for_behavior(model, behavior: Union[str, Phi3VisionConfigBehavior]): if isinstance(behavior, str) and not isinstance(behavior, Phi3VisionConfigBehavior): behavior = Phi3VisionConfigBehavior(behavior) if behavior == Phi3VisionConfigBehavior.LANGUAGE: return model if behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS: vision_embeddings = model.model.vision_embed_tokens vision_embeddings.config = model.config return vision_embeddings if behavior == Phi3VisionConfigBehavior.VISION_PROJECTION: projection = model.model.vision_embed_tokens.img_projection projection.config = model.config return projection if behavior == Phi3VisionConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.model.embed_tokens text_embedding.config = model.config return text_embedding def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if self._behavior == Phi3VisionConfigBehavior.VISION_EMBEDDINGS: return Phi3VisionImageEmbeddingsPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) class DummyAudioPhi4MMInputGenerator(DummyInputGenerator): SUPPORTED_INPUT_NAMES = ("audio_input", "audio_feature", "audio_mask") def __init__( self, task: str, normalized_config: NormalizedVisionConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], signal_length=498, **kwargs, ): self.signal_length = signal_length if hasattr(normalized_config.config, "audio_processor"): self.audio_chunk_size = ( signal_length // normalized_config.config.audio_processor["config"]["time_reduction"] + 1 ) self.input_size = normalized_config.config.audio_processor["config"]["input_size"] self.attention_dim = normalized_config.config.audio_processor["config"]["attention_dim"] else: self.audio_chunk_size = signal_length // normalized_config.config.audio_config.time_reduction + 1 self.input_size = normalized_config.config.audio_config.input_size self.attention_dim = normalized_config.config.audio_config.hidden_size self.batch_size = batch_size self.task = task self.normalized_config = normalized_config def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "audio_input": return self.random_float_tensor( [self.batch_size, self.signal_length, self.input_size], framework=framework, dtype=float_dtype ) if input_name == "audio_feature": return self.random_float_tensor( [self.batch_size, self.audio_chunk_size, self.attention_dim], framework=framework, dtype=float_dtype ) if input_name == "audio_mask": return 
self.random_int_tensor(
            [self.batch_size, self.audio_chunk_size, self.audio_chunk_size],
            max_value=2,
            framework=framework,
            dtype="bool",
        )


class DummyVisionPositionIdsPhi4InputGenerator(DummyVisionInputGenerator):
    SUPPORTED_INPUT_NAMES = ("patch_position_ids", "patch_attention_mask")

    def __init__(
        self,
        task: str,
        normalized_config: NormalizedVisionConfig,
        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
        num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"],
        width: int = DEFAULT_DUMMY_SHAPES["width"],
        height: int = DEFAULT_DUMMY_SHAPES["height"],
        **kwargs,
    ):
        super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs)
        if hasattr(normalized_config.config, "vision_config"):
            self.patch_size = getattr(normalized_config.config.vision_config, "patch_size", 14)
        else:
            self.patch_size = 14
        self.num_patches_per_side = self.height // self.patch_size

    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
        if input_name == "patch_position_ids":
            return self.get_vision_position_ids()
        if input_name == "patch_attention_mask":
            return self.random_int_tensor(
                [self.batch_size, self.height // self.patch_size, self.width // self.patch_size],
                framework=framework,
                dtype="bool",
                max_value=2,
            )
        return super().generate(input_name, framework, int_dtype, float_dtype)

    def get_vision_position_ids(self):
        # Adapted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L494-L512
        import torch

        batch_size = self.batch_size
        max_im_h, max_im_w = self.height, self.width
        max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
        position_ids = torch.full(
            size=(
                batch_size,
                max_nb_patches_h * max_nb_patches_w,
            ),
            fill_value=0,
        )
        patch_attention_mask = torch.ones(
            [self.batch_size, self.height // self.patch_size, self.width // self.patch_size], dtype=torch.int64
        )
        patch_attention_mask[0, self.height - 2 :] = 0
        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
            nb_patches_h = p_attn_mask[:, 0].sum()
            nb_patches_w = p_attn_mask[0].sum()
            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
            position_ids[batch_idx][p_attn_mask.view(-1)] = pos_ids
        return position_ids


class Phi4MMConfigBehavior(str, enum.Enum):
    AUDIO_EMBEDDINGS = "audio_embeddings"
    AUDIO_ENCODER = "audio_encoder"
    AUDIO_FORWARD_EMBEDDINGS = "audio_forward_embeddings"
    AUDIO_VISION_PROJECTION = "audio_vision_projection"
    AUDIO_SPEECH_PROJECTION = "audio_speech_projection"
    LANGUAGE = "language"
    TEXT_EMBEDDINGS = "text_embeddings"
    VISION_PROJECTION = "vision_projection"
    VISION_EMBEDDINGS = "vision_embeddings"


@register_in_tasks_manager(
    "phi4mm", *["image-text-to-text", "automatic-speech-recognition"], library_name="transformers"
)
@register_in_tasks_manager(
    "phi4-multimodal", *["image-text-to-text", "automatic-speech-recognition"], library_name="transformers"
)
class Phi4MMOpenVINOConfig(BaseVLMOpenVINOConfig):
    SUPPORTED_BEHAVIORS = [model_type.value for model_type in Phi4MMConfigBehavior]
    NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig
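    # Note: the DUMMY_INPUT_GENERATOR_CLASSES tuple below is only the class-level default;
    # __init__ replaces it depending on the selected behavior (vision embeddings, vision
    # projection, and the audio embedding/encoder/projection sub-modules).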
DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator,) MIN_TRANSFORMERS_VERSION = version.parse("4.51.0") def __init__( self, config: "PretrainedConfig", task: str = "feature-extraction", int_dtype: str = "int64", float_dtype: str = "fp32", behavior: Phi4MMConfigBehavior = Phi4MMConfigBehavior.VISION_EMBEDDINGS, preprocessors: Optional[List[Any]] = None, ): super().__init__( config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors, ) self._behavior = behavior self._orig_config = config if self._behavior == Phi4MMConfigBehavior.VISION_EMBEDDINGS: if hasattr(self._config, "vision_config"): self._config = self._config.vision_config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) else: self._config.image_size = self._config.embd_layer.get("image_embd_layer", {}).get("crop_size", 448) self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator, DummyVisionPositionIdsPhi4InputGenerator) if self._behavior == Phi4MMConfigBehavior.VISION_PROJECTION: self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyPhi3VisionProjectionInputGenerator,) if self._behavior in ( Phi4MMConfigBehavior.AUDIO_EMBEDDINGS, Phi4MMConfigBehavior.AUDIO_FORWARD_EMBEDDINGS, Phi4MMConfigBehavior.AUDIO_ENCODER, Phi4MMConfigBehavior.AUDIO_SPEECH_PROJECTION, Phi4MMConfigBehavior.AUDIO_VISION_PROJECTION, ): self.DUMMY_INPUT_GENERATOR_CLASSES = (DummyAudioPhi4MMInputGenerator,) @property def inputs(self) -> Dict[str, Dict[int, str]]: if self._behavior == Phi4MMConfigBehavior.VISION_EMBEDDINGS: return { "pixel_values": {0: "batch_size", 2: "height", 3: "width"}, "patch_attention_mask": {0: "batch_size", 1: "patch_height", 2: "patch_width"}, "patch_position_ids": {0: "batch_size", 1: "patch_size"}, } if self._behavior == Phi4MMConfigBehavior.VISION_PROJECTION: return {"input": {0: "batch_size", 1: "img_feat_size"}} if self._behavior in [Phi4MMConfigBehavior.AUDIO_EMBEDDINGS, Phi4MMConfigBehavior.AUDIO_FORWARD_EMBEDDINGS]: return {"audio_input": {0: "batch_size", 1: "audio_length"}} if self._behavior == Phi4MMConfigBehavior.AUDIO_ENCODER: return { "audio_feature": {0: "batch_size", 1: "audio_length"}, "audio_mask": {0: "batch_size", 1: "audio_length", 2: "audio_length"}, } if self._behavior in [ Phi4MMConfigBehavior.AUDIO_SPEECH_PROJECTION, Phi4MMConfigBehavior.AUDIO_VISION_PROJECTION, ]: return {"audio_feature": {0: "batch_size", 1: "audio_length"}} return {} @property def outputs(self) -> Dict[str, Dict[int, str]]: if self._behavior in [ Phi4MMConfigBehavior.VISION_EMBEDDINGS, Phi4MMConfigBehavior.VISION_PROJECTION, Phi4MMConfigBehavior.AUDIO_EMBEDDINGS, Phi4MMConfigBehavior.AUDIO_FORWARD_EMBEDDINGS, Phi4MMConfigBehavior.AUDIO_ENCODER, Phi4MMConfigBehavior.AUDIO_SPEECH_PROJECTION, Phi4MMConfigBehavior.AUDIO_VISION_PROJECTION, ]: return {"last_hidden_state": {0: "batch_size", 1: "projection_size"}} return {} def with_behavior( self, behavior: Union[str, Phi4MMConfigBehavior], ): """ Creates a config for different behaviour. Args: behavior ([`ConfigBehavior`]): The behavior to use for the new instance. 
""" if isinstance(behavior, str) and not isinstance(behavior, Phi4MMConfigBehavior): behavior = Phi4MMConfigBehavior(behavior) if behavior == Phi4MMConfigBehavior.TEXT_EMBEDDINGS: return get_vlm_text_embeddings_config("phi3", self._orig_config, self.int_dtype, self.float_dtype) if behavior == Phi4MMConfigBehavior.LANGUAGE: return get_vlm_text_generation_config( "phi3", self._orig_config, self.int_dtype, self.float_dtype, model_patcher=Phi4MMLanguageModelPatcher ) return self.__class__( self._orig_config, task=self.task, int_dtype=self.int_dtype, float_dtype=self.float_dtype, behavior=behavior, preprocessors=self._preprocessors, ) @staticmethod def get_model_for_behavior(model, behavior: Union[str, Phi4MMConfigBehavior]): if isinstance(behavior, str) and not isinstance(behavior, Phi4MMConfigBehavior): behavior = Phi4MMConfigBehavior(behavior) if behavior == Phi4MMConfigBehavior.LANGUAGE: return model if behavior == Phi4MMConfigBehavior.VISION_EMBEDDINGS: vision_embeddings = model.model.embed_tokens_extend.image_embed vision_embeddings.config = model.model.embed_tokens_extend.image_embed.img_processor.config return model.model.embed_tokens_extend.image_embed if behavior == Phi4MMConfigBehavior.VISION_PROJECTION: vision_model = model.model.embed_tokens_extend.image_embed if hasattr(vision_model, "img_projection"): projection = vision_model.img_projection else: import torch projection = torch.nn.Sequential( *[vision_model.img_projection_up, torch.nn.GELU(), vision_model.img_projection_down] ) projection.config = vision_model.img_processor.config return projection if behavior == Phi4MMConfigBehavior.TEXT_EMBEDDINGS: if hasattr(model.model, "_require_grads_hook"): model.model.disable_input_require_grads() text_embedding = model.model.embed_tokens text_embedding.config = model.config return text_embedding if behavior == Phi4MMConfigBehavior.AUDIO_EMBEDDINGS: audio_embeddings = model.model.embed_tokens_extend.audio_embed.encoder.encoder_embedding audio_embeddings.config = model.config return audio_embeddings if behavior == Phi4MMConfigBehavior.AUDIO_ENCODER: audio_encoder = model.model.embed_tokens_extend.audio_embed.encoder audio_encoder.config = model.config return audio_encoder if behavior == Phi4MMConfigBehavior.AUDIO_FORWARD_EMBEDDINGS: audio_encoder = model.model.embed_tokens_extend.audio_embed.encoder audio_encoder.config = model.config return audio_encoder if behavior == Phi4MMConfigBehavior.AUDIO_SPEECH_PROJECTION: if hasattr(model.model.embed_tokens_extend.audio_embed, "audio_projection"): audio_projection = model.model.embed_tokens_extend.audio_embed.audio_projection["speech"] audio_projection.config = model.config return audio_projection else: import torch audio_projection = torch.nn.Sequential( *[ model.model.embed_tokens_extend.audio_embed.up_proj_for_speech, torch.nn.GELU(), model.model.embed_tokens_extend.audio_embed.down_proj_for_speech, ] ) audio_projection.config = model.config return audio_projection if behavior == Phi4MMConfigBehavior.AUDIO_VISION_PROJECTION: if hasattr(model.model.embed_tokens_extend.audio_embed, "audio_projection"): audio_projection = model.model.embed_tokens_extend.audio_embed.audio_projection["vision"] audio_projection.config = model.config return audio_projection else: import torch audio_projection = torch.nn.Sequential( *[ model.model.embed_tokens_extend.audio_embed.up_proj_for_vision_speech, torch.nn.GELU(), model.model.embed_tokens_extend.audio_embed.down_proj_for_vision_speech, ] ) audio_projection.config = model.config return audio_projection def 
patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if self._behavior == Phi4MMConfigBehavior.VISION_EMBEDDINGS: return Phi4MMVisionEmbeddingsPatcher(self, model, model_kwargs) if self._behavior == Phi4MMConfigBehavior.AUDIO_FORWARD_EMBEDDINGS: return Phi4MMAudioForwardEmbeddingsPatcher(self, model, model_kwargs) if self._behavior == Phi4MMConfigBehavior.AUDIO_ENCODER: return Phi4MMAudioEncoderPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) def rename_ambiguous_inputs(self, inputs): if self._behavior == Phi4MMConfigBehavior.AUDIO_EMBEDDINGS: input_info = inputs.pop("audio_input") inputs["input_" if hasattr(self._normalized_config.config, "audio_processor") else "x"] = input_info if self._behavior in [ Phi4MMConfigBehavior.AUDIO_SPEECH_PROJECTION, Phi4MMConfigBehavior.AUDIO_VISION_PROJECTION, ]: input_info = inputs.pop("audio_feature") inputs["input"] = input_info return inputs class DummyQwen2VLLMInputGenerator(DummyTextInputGenerator): def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): generated_input = super().generate(input_name, framework, int_dtype, float_dtype) if input_name == "position_ids": return generated_input.unsqueeze(0).expand(3, -1, -1) return generated_input class DummyQwen2VLVisionEmbedInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ( "hidden_states", "attention_mask", "window_attention_mask", "window_index", "rotary_pos_emb", ) def __init__( self, task: str, normalized_config: NormalizedVisionConfig, batch_size: int = 1, num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], width: int = 420, height: int = 420, **kwargs, ): self.batch_size = batch_size self.height = height self.width = width self.num_channels = num_channels self.temporal_patch_size = normalized_config.config.temporal_patch_size self.patch_size = normalized_config.config.patch_size if normalized_config.use_embed_dim: self.embed_dim = ( normalized_config.config.embed_dim if hasattr(normalized_config.config, "embed_dim") else normalized_config.hidden_size ) else: self.embed_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size self.num_heads = normalized_config.config.num_heads self.spatial_merge_size = None if hasattr(normalized_config.config, "spatial_merge_size"): self.spatial_merge_size = normalized_config.config.spatial_merge_size def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size grid_t = self.batch_size if input_name == "hidden_states": return self.random_float_tensor( [grid_t * grid_h * grid_w, self.embed_dim], framework=framework, dtype=float_dtype ) if input_name in ["attention_mask", "window_attention_mask"]: return self.random_mask_tensor( [1, grid_t * grid_h * grid_w, grid_t * grid_h * grid_w], framework=framework, dtype=float_dtype ) if input_name == "rotary_pos_emb": dim = self.embed_dim // self.num_heads // 2 return self.random_float_tensor([grid_h * grid_t * grid_w, dim], framework=framework, dtype=float_dtype) if input_name == "window_index": if self.spatial_merge_size is None: raise ValueError( "`spatial_merge_size` parameter is not found in model config. 
Can not generate dummy input data for `window_index` input" ) spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size hidden_size = (grid_t * grid_h * grid_w) // spatial_merge_unit return self.random_int_tensor([hidden_size], max_value=hidden_size) class Qwen2VLConfigBehavior(str, enum.Enum): LANGUAGE = "language" VISION_EMBEDDINGS = "vision_embeddings" VISION_EMBEDDINGS_MERGER = "vision_embeddings_merger" TEXT_EMBEDDINGS = "text_embeddings" @register_in_tasks_manager("qwen2-vl", *["image-text-to-text", "video-text-to-text"], library_name="transformers") class Qwen2VLOpenVINOConfig(BaseVLMOpenVINOConfig): SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen2VLConfigBehavior] NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen2VLVisionEmbedInputGenerator,) MIN_TRANSFORMERS_VERSION = version.parse("4.45.0") def __init__( self, config: "PretrainedConfig", task: str = "feature-extraction", int_dtype: str = "int64", float_dtype: str = "fp32", behavior: Qwen2VLConfigBehavior = Qwen2VLConfigBehavior.VISION_EMBEDDINGS, preprocessors: Optional[List[Any]] = None, ): super().__init__( config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors, ) self._behavior = behavior self._orig_config = config if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): self._config = config.vision_config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) self._normalized_config.use_embed_dim = False if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER and hasattr(config, "vision_config"): self._config = config.vision_config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) self._normalized_config.use_embed_dim = True @staticmethod def get_model_for_behavior(model, behavior: Union[str, Qwen2VLConfigBehavior]): if isinstance(behavior, str) and not isinstance(behavior, Qwen2VLConfigBehavior): behavior = Qwen2VLConfigBehavior(behavior) if behavior == Qwen2VLConfigBehavior.LANGUAGE: return model if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: vision_embeddings = model.visual.patch_embed vision_embeddings.config = model.config.vision_config return vision_embeddings if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: vision_emb_merger = model.visual vision_emb_merger.config = model.config.vision_config return vision_emb_merger if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS: text_embedding = ( model.model.embed_tokens if hasattr(model.model, "embed_tokens") else model.language_model.embed_tokens ) text_embedding.config = model.config return text_embedding def with_behavior( self, behavior: Union[str, Qwen2VLConfigBehavior], ): """ Creates a config for different behaviour. Args: behavior ([`ConfigBehavior`]): The behavior to use for the new instance. 
""" if isinstance(behavior, str) and not isinstance(behavior, Qwen2VLConfigBehavior): behavior = Qwen2VLConfigBehavior(behavior) if behavior == Qwen2VLConfigBehavior.TEXT_EMBEDDINGS: return get_vlm_text_embeddings_config("qwen2", self._orig_config, self.int_dtype, self.float_dtype) if behavior == Qwen2VLConfigBehavior.LANGUAGE: return get_vlm_text_generation_config( "qwen2", self._orig_config, self.int_dtype, self.float_dtype, model_patcher=Qwen2VLLanguageModelPatcher, dummy_input_generator=DummyQwen2VLLMInputGenerator, inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, ) if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: return self.__class__( self._orig_config, task=self.task, int_dtype=self.int_dtype, float_dtype=self.float_dtype, behavior=behavior, preprocessors=self._preprocessors, ) if behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: return self.__class__( self._orig_config, task=self.task, int_dtype=self.int_dtype, float_dtype=self.float_dtype, behavior=behavior, preprocessors=self._preprocessors, ) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: return Qwen2VLVisionEmbMergerPatcher(self, model, model_kwargs) if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: return ModelPatcher(self, model, model_kwargs=model_kwargs) return super().patch_model_for_export(model, model_kwargs) @property def inputs(self) -> Dict[str, Dict[int, str]]: if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS: return {"hidden_states": {0: "patch_thw_grid", 1: "patch_temporal_channels"}} if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: return { "hidden_states": {0: "sequence_length"}, "attention_mask": {1: "sequence_length", 2: "sequence_length"}, "rotary_pos_emb": {0: "sequence_length"}, } @property def outputs(self) -> Dict[str, Dict[int, str]]: if self._behavior in [Qwen2VLConfigBehavior.VISION_EMBEDDINGS, Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER]: return {"last_hidden_state": {0: "seq_len"}} return {} @register_in_tasks_manager("qwen2-5-vl", *["image-text-to-text", "video-text-to-text"], library_name="transformers") class Qwen2_5_VLOpenVINOConfig(Qwen2VLOpenVINOConfig): MIN_TRANSFORMERS_VERSION = version.parse("4.49.0") @property def inputs(self) -> Dict[str, Dict[int, str]]: if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: return { "hidden_states": {0: "sequence_length"}, "attention_mask": {1: "sequence_length", 2: "sequence_length"}, "window_attention_mask": {1: "sequence_length", 2: "sequence_length"}, "window_index": {0: "unit_sequence_length"}, "rotary_pos_emb": {0: "sequence_length"}, } return super().inputs def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: return Qwen2_5_VLVisionEmbMergerPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) @register_in_tasks_manager( "glm", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class GLMOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.46.0" @register_in_tasks_manager( "glm4", *[ "feature-extraction", 
"feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class GLM4OpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.51.3" @register_in_tasks_manager( "granite", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class GraniteOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.45.0" @register_in_tasks_manager( "granitemoe", *["text-generation", "text-generation-with-past"], library_name="transformers" ) class GraniteMoEOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.45.0" def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: return GraniteMoEModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "gpt-bigcode", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text-classification", ], library_name="transformers", ) class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return GptBigCodeModelPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager( "whisper", *[ "feature-extraction", "feature-extraction-with-past", "audio-classification", "automatic-speech-recognition", "automatic-speech-recognition-with-past", ], library_name="transformers", ) class WhisperOpenVINOConfig(WhisperOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: return StatefulSeq2SeqDecoderPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) @property def inputs(self): common_inputs = super().inputs if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: common_inputs["decoder_input_ids"] = {0: "batch_size", 1: "decoder_sequence_length"} if self._behavior is not ConfigBehavior.ENCODER and self.use_past_in_inputs: if is_transformers_version(">=", "4.43.0"): # since https://github.com/huggingface/transformers/pull/31166 common_inputs["cache_position"] = {0: "decoder_sequence_length"} return common_inputs @register_in_tasks_manager( "t5", *["feature-extraction", "feature-extraction-with-past", "text2text-generation", "text2text-generation-with-past"], library_name="transformers", ) class T5OpenVINOConfig(T5OnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: return StatefulSeq2SeqDecoderPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) @property def inputs(self): common_inputs = super().inputs if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: common_inputs["decoder_input_ids"] = {0: "batch_size", 1: "decoder_sequence_length"} return common_inputs @register_in_tasks_manager( "mt5", *["feature-extraction", "feature-extraction-with-past", "text2text-generation", "text2text-generation-with-past"], 
library_name="transformers", ) class MT5OpenVINOConfig(T5OpenVINOConfig): pass @register_in_tasks_manager( "longt5", *["feature-extraction", "feature-extraction-with-past", "text2text-generation", "text2text-generation-with-past"], library_name="transformers", ) class LongT5OpenVINOConfig(T5OpenVINOConfig): pass @register_in_tasks_manager( "bart", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text2text-generation", "text2text-generation-with-past", "text-classification", "question-answering", ], library_name="transformers", ) class BartOpenVINOConfig(BartOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> ModelPatcher: if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: return StatefulSeq2SeqDecoderPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) @property def inputs(self): common_inputs = super().inputs if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: common_inputs["decoder_input_ids"] = {0: "batch_size", 1: "decoder_sequence_length"} return common_inputs @register_in_tasks_manager( "mbart", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text2text-generation", "text2text-generation-with-past", "text-classification", "question-answering", ], library_name="transformers", ) class MBartOpenVINOConfig(BartOpenVINOConfig): pass @register_in_tasks_manager( "m2m-100", *["feature-extraction", "feature-extraction-with-past", "text2text-generation", "text2text-generation-with-past"], library_name="transformers", ) class M2M100OpenVINOConfig(BartOpenVINOConfig): pass @register_in_tasks_manager( "deepseek-v3", *["text-generation", "text-generation-with-past"], library_name="transformers" ) @register_in_tasks_manager( "deepseek-v2", *["text-generation", "text-generation-with-past"], library_name="transformers" ) @register_in_tasks_manager("deepseek", *["text-generation", "text-generation-with-past"], library_name="transformers") class DeepseekOpenVINOConfig(MiniCPM3OpenVINOConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return DeepseekPatcher(self, model, model_kwargs=model_kwargs) @register_in_tasks_manager("got-ocr2", *["image-to-text", "image-text-to-text"], library_name="transformers") class GotOCR2OpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.49.0" def __init__( self, config: "PretrainedConfig", task: str = "feature-extraction", int_dtype: str = "int64", float_dtype: str = "fp32", behavior: VLMConfigBehavior = VLMConfigBehavior.VISION_EMBEDDINGS, preprocessors: Optional[List[Any]] = None, **kwargs, ): super().__init__( config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors, ) self._orig_config = config if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): self._config = config.vision_config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) @register_in_tasks_manager("gemma3", *["image-text-to-text"], library_name="transformers") class Gemma3OpenVINOConfig(BaseVLMOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" def __init__( self, config: "PretrainedConfig", task: str = "feature-extraction", int_dtype: str = "int64", float_dtype: str 
= "fp32", behavior: VLMConfigBehavior = VLMConfigBehavior.VISION_EMBEDDINGS, preprocessors: Optional[List[Any]] = None, **kwargs, ): super().__init__( config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors, ) self._orig_config = config if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): self._config = config.vision_config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) def with_behavior( self, behavior: Union[str, VLMConfigBehavior], ): """ Creates a config for different behaviour. Args: behavior ([`ConfigBehavior`]): The behavior to use for the new instance. """ if isinstance(behavior, str) and not isinstance(behavior, VLMConfigBehavior): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: model_type = self._orig_config.text_config.model_type return get_vlm_text_generation_config( model_type, self._orig_config.text_config, self.int_dtype, self.float_dtype, model_patcher=Gemma3LMModelPatcher, inputs_update={"token_type_ids": {0: "batch_size", 1: "sequence_length"}}, ) return super().with_behavior(behavior) class DummyVisionPositionIdsInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ("patch_attention_mask", "patch_position_ids") def __init__( self, task: str, normalized_config: NormalizedVisionConfig, batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], width: int = DEFAULT_DUMMY_SHAPES["width"], height: int = DEFAULT_DUMMY_SHAPES["height"], **kwargs, ): super().__init__(task, normalized_config, batch_size, num_channels, width, height, **kwargs) self.patch_size = normalized_config.config.patch_size def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name == "patch_attention_mask": shape = [self.batch_size, self.height // self.patch_size, self.width // self.patch_size] return self.random_int_tensor(shape, max_value=2, framework=framework, dtype="bool") if input_name == "patch_position_ids": max_nb_patches_h, max_nb_patches_w = self.height // self.patch_size, self.width // self.patch_size shape = [self.batch_size, max_nb_patches_h * max_nb_patches_w] return self.random_int_tensor( shape, max_value=min(max_nb_patches_h, max_nb_patches_w), framework=framework, dtype=int_dtype ) return super().generate(input_name, framework, int_dtype, float_dtype) @register_in_tasks_manager("idefics3", *["image-text-to-text", "video-text-to-text"], library_name="transformers") class Idefics3OpenVINOConfig(BaseVLMOpenVINOConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyVisionInputGenerator, DummyVisionPositionIdsInputGenerator) MIN_TRANSFORMERS_VERSION = "4.46.0" def __init__( self, config: "PretrainedConfig", task: str = "feature-extraction", int_dtype: str = "int64", float_dtype: str = "fp32", behavior: VLMConfigBehavior = VLMConfigBehavior.VISION_EMBEDDINGS, preprocessors: Optional[List[Any]] = None, **kwargs, ): super().__init__( config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, preprocessors=preprocessors, ) self._orig_config = config if self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): self._config = config.vision_config self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if 
self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: return super().patch_model_for_export(model, model_kwargs) return Idefics3ImageEmbeddingsModelPatcher(self, model, model_kwargs) @property def inputs(self) -> Dict[str, Dict[int, str]]: if not self._behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return {} return { "pixel_values": {0: "batch_size", 2: "height", 3: "width"}, "patch_attention_mask": {0: "batch_size", 1: "num_height_patches", 2: "num_width_patches"}, "patch_position_ids": {0: "batch_size", 1: "num_patches"}, } def get_model_for_behavior(self, model, behavior: Union[str, VLMConfigBehavior]): if isinstance(behavior, str) and not isinstance(behavior, VLMConfigBehavior): behavior = VLMConfigBehavior(behavior) if behavior == VLMConfigBehavior.LANGUAGE: return model if behavior == VLMConfigBehavior.VISION_EMBEDDINGS: return model.model if behavior == VLMConfigBehavior.TEXT_EMBEDDINGS: text_embedding = model.model.text_model.get_input_embeddings() text_embedding.config = model.model.text_model.config return text_embedding @register_in_tasks_manager("smolvlm", *["image-text-to-text", "video-text-to-text"], library_name="transformers") class SmolVLMOpenVINOConfig(Idefics3OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.50.0" @register_in_tasks_manager( "blenderbot", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text2text-generation", "text2text-generation-with-past", ], library_name="transformers", ) class BlenderbotOpenVINOConfig(BlenderbotOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: return BlenderbotStatefulSeq2SeqDecoderPatcher(self, model, model_kwargs) return BlenderbotModelPatcher(self, model, model_kwargs=model_kwargs) @property def inputs(self): common_inputs = super().inputs if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: common_inputs["decoder_input_ids"] = {0: "batch_size", 1: "decoder_sequence_length"} return common_inputs @register_in_tasks_manager( "blenderbot-small", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text2text-generation", "text2text-generation-with-past", ], library_name="transformers", ) class BlenderbotSmallOpenVINOConfig(BlenderbotSmallOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: return BlenderbotSmallStatefulSeq2SeqDecoderPatcher(self, model, model_kwargs) return BlenderbotSmallModelPatcher(self, model, model_kwargs=model_kwargs) @property def inputs(self): common_inputs = super().inputs if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: common_inputs["decoder_input_ids"] = {0: "batch_size", 1: "decoder_sequence_length"} return common_inputs @register_in_tasks_manager( "pegasus", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text2text-generation", "text2text-generation-with-past", ], library_name="transformers", ) class PegasusOpenVINOConfig(PegasusOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> 
"ModelPatcher": if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: return PegasusStatefulSeq2SeqDecoderPatcher(self, model, model_kwargs) return PegasusModelPatcher(self, model, model_kwargs=model_kwargs) @property def inputs(self): common_inputs = super().inputs if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: common_inputs["decoder_input_ids"] = {0: "batch_size", 1: "decoder_sequence_length"} return common_inputs @register_in_tasks_manager( "marian", *[ "feature-extraction", "feature-extraction-with-past", "text-generation", "text-generation-with-past", "text2text-generation", "text2text-generation-with-past", ], library_name="transformers", ) class MarianOpenVINOConfig(MarianOnnxConfig): def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: return MarianStatefulSeq2SeqDecoderPatcher(self, model, model_kwargs) return MarianModelPatcher(self, model, model_kwargs=model_kwargs) @property def inputs(self): common_inputs = super().inputs if getattr(self, "stateful", False) and self._behavior == ConfigBehavior.DECODER: common_inputs["decoder_input_ids"] = {0: "batch_size", 1: "decoder_sequence_length"} return common_inputs class DummySpeechT5OpenVINOInputGenerator(DummyInputGenerator): SUPPORTED_INPUT_NAMES = ( "inputs_embeds", "output_sequence", "speaker_embeddings", "spectrogram", "raw_spectrogram", "encoder_hidden_states", ) def __init__( self, task: str, normalized_config: NormalizedConfig, sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], **kwargs, ): self.task = task self.batch_size = 1 self.sequence_length = sequence_length self.speaker_embedding_dim = normalized_config.speaker_embedding_dim self.num_mel_bins = normalized_config.num_mel_bins self.reduction_factor = normalized_config.config.reduction_factor self.hidden_size = normalized_config.config.hidden_size def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): if input_name in ["output_sequence", "inputs_embeds"]: shape = [self.batch_size, self.sequence_length, self.num_mel_bins] elif input_name == "speaker_embeddings": shape = [self.batch_size, self.speaker_embedding_dim] elif input_name == "raw_spectrogram": shape = [self.sequence_length, self.batch_size, self.reduction_factor, self.num_mel_bins] elif input_name == "encoder_hidden_states": shape = [self.batch_size, self.sequence_length, self.hidden_size] elif input_name == "spectrogram": shape = [self.batch_size, self.sequence_length, self.num_mel_bins] else: raise ValueError(f"Unsupported input {input_name} for DummySpeechT5InputGenerator") return self.random_float_tensor( shape=shape, min_value=0, max_value=1, framework=framework, dtype=float_dtype, ) class SpeechT5ConfigBehavior(str, enum.Enum): ENCODER = "encoder" DECODER = "decoder" POSTNET = "postnet" VOCODER = "vocoder" @register_in_tasks_manager( "speecht5", *["text-to-audio", "text-to-audio-with-past"], library_name="transformers", ) class SpeechT5OpenVINOConfig(SpeechT5OnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = ( DummyTextInputGenerator, DummySeq2SeqPastKeyValuesGenerator, DummySpeechT5OpenVINOInputGenerator, ) def __init__( self, config: "PretrainedConfig", task: str = "text-to-audio", int_dtype: str = "int64", float_dtype: str = "fp32", use_past: bool = True, use_past_in_inputs: bool = True, behavior: 
SpeechT5ConfigBehavior = SpeechT5ConfigBehavior.ENCODER, preprocessors: Optional[List[Any]] = None, legacy: bool = False, ): super().__init__( config=config, task=task, int_dtype=int_dtype, float_dtype=float_dtype, use_past=use_past, use_past_in_inputs=use_past_in_inputs, behavior=behavior, preprocessors=preprocessors, is_postnet_and_vocoder=False, legacy=legacy, ) def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ) -> "ModelPatcher": return OVSpeechT5ModelPatcher(self, model, model_kwargs=model_kwargs) def add_past_key_values(self, inputs_or_outputs: Dict[str, Dict[int, str]], direction: str): if direction not in ["inputs", "outputs"]: raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') if direction == "inputs": decoder_sequence_name = "past_decoder_sequence_length" name = "past_key_values" else: decoder_sequence_name = "past_decoder_sequence_length + 1" name = "present" for i in range(self._normalized_config.decoder_num_layers): inputs_or_outputs[f"{name}.{i}.decoder.key"] = {0: "batch_size", 2: decoder_sequence_name} inputs_or_outputs[f"{name}.{i}.decoder.value"] = {0: "batch_size", 2: decoder_sequence_name} inputs_or_outputs[f"{name}.{i}.encoder.key"] = {0: "batch_size", 2: "encoder_sequence_length_out"} inputs_or_outputs[f"{name}.{i}.encoder.value"] = {0: "batch_size", 2: "encoder_sequence_length_out"} @property def inputs(self) -> Dict[str, Dict[int, str]]: common_inputs = {} if self._behavior is SpeechT5ConfigBehavior.ENCODER: common_inputs["input_ids"] = {1: "encoder_sequence_length"} elif self._behavior is SpeechT5ConfigBehavior.DECODER: common_inputs["inputs_embeds"] = {0: "batch_size", 1: "decoder_sequence_length"} common_inputs["speaker_embeddings"] = {} # No dynamic shape here. common_inputs["encoder_hidden_states"] = {0: "batch_size", 1: "encoder_sequence_length"} common_inputs["encoder_attention_mask"] = {0: "batch_size", 1: "encoder_sequence_length"} if self.variant == "with-past" and self.use_past_in_inputs: self.add_past_key_values(common_inputs, direction="inputs") elif self._behavior is SpeechT5ConfigBehavior.POSTNET: common_inputs["raw_spectrogram"] = { 0: "n_spectrums", 1: "batch_size", } elif self._behavior is SpeechT5ConfigBehavior.VOCODER: common_inputs["spectrogram"] = {0: "batch_size", 1: "n_spectrums"} else: raise ValueError( "self._behavior is neither encoder, decoder, postnet, or vocoder. This should not happen." ) return common_inputs @property def outputs(self) -> Dict[str, Dict[int, str]]: common_outputs = {} if self._behavior == SpeechT5ConfigBehavior.ENCODER: common_outputs = { "last_hidden_state": {1: "encoder_sequence_length"}, "encoder_attention_mask": {1: "encoder_sequence_length"}, } elif self._behavior is SpeechT5ConfigBehavior.DECODER: common_outputs["output_sequence_out"] = {1: "decoder_sequence_length + 1"} common_outputs["spectrum"] = {} # No dynamic shape here. common_outputs["prob"] = {} # No dynamic shape here. if self.variant == "with-past" and self.use_past: self.add_past_key_values(common_outputs, direction="outputs") elif self._behavior is SpeechT5ConfigBehavior.POSTNET: common_outputs["postnet_spectrogram"] = {} elif self._behavior is SpeechT5ConfigBehavior.VOCODER: common_outputs["waveform"] = {} return common_outputs def with_behavior( self, behavior: Union[str, SpeechT5ConfigBehavior], ): """ Creates a config for different behaviour. 
""" if isinstance(behavior, str) and not isinstance(behavior, SpeechT5ConfigBehavior): behavior = SpeechT5ConfigBehavior(behavior) if behavior == SpeechT5ConfigBehavior.ENCODER: return self.__class__( self._config, use_past=False, use_past_in_inputs=False, behavior=behavior, ) elif behavior == SpeechT5ConfigBehavior.DECODER: return self.__class__( self._config, use_past=True, use_past_in_inputs=True, behavior=behavior, ) elif behavior == SpeechT5ConfigBehavior.POSTNET: return self.__class__( self._config, use_past=False, use_past_in_inputs=False, behavior=behavior, ) elif behavior == SpeechT5ConfigBehavior.VOCODER: return self.__class__( self._config, use_past=False, use_past_in_inputs=False, behavior=behavior, ) else: raise ValueError( "self._behavior is neither encoder, decoder, postnet, or vocoder. This should not happen." ) @register_in_tasks_manager( "llama4-text", *["text-generation", "text-generation-with-past"], library_name="transformers" ) class Llama4TextOpenVINOConfig(LlamaOpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.51.0" DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): return Llama4TextModelPatcher(self, model, model_kwargs) @register_in_tasks_manager( "llama4", *["image-text-to-text", "text-generation", "text-generation-with-past"], library_name="transformers" ) class Llama4OpenVINOConfig(GotOCR2OpenVINOConfig): MIN_TRANSFORMERS_VERSION = "4.51.0" def patch_model_for_export( self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None ): model_kwargs = model_kwargs or {} if self._behavior != VLMConfigBehavior.VISION_EMBEDDINGS: return super().patch_model_for_export(model, model_kwargs) return Llama4ImageEmbeddingsModelPatcher(self, model, model_kwargs)