# Copyright 2022 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from pathlib import Path
from typing import Optional, Union
import numpy as np
import openvino
import torch
import transformers
from huggingface_hub import model_info
from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE
from transformers import (
AutoConfig,
AutoModel,
AutoModelForAudioClassification,
AutoModelForAudioFrameClassification,
AutoModelForAudioXVector,
AutoModelForCTC,
AutoModelForImageClassification,
AutoModelForMaskedLM,
AutoModelForQuestionAnswering,
AutoModelForSequenceClassification,
AutoModelForTokenClassification,
AutoModelForZeroShotImageClassification,
PretrainedConfig,
)
from transformers.file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
from transformers.modeling_outputs import (
BaseModelOutput,
CausalLMOutput,
ImageClassifierOutput,
MaskedLMOutput,
ModelOutput,
QuestionAnsweringModelOutput,
SequenceClassifierOutput,
TokenClassifierOutput,
XVectorOutput,
)
from transformers.models.clip.modeling_clip import CLIPOutput
from ..utils.import_utils import is_timm_available, is_timm_version
from .modeling_base import OVBaseModel
from .modeling_sam import OVSamModel
from .utils import _is_timm_ov_dir
logger = logging.getLogger(__name__)
_TOKENIZER_FOR_DOC = "AutoTokenizer"
_FEATURE_EXTRACTOR_FOR_DOC = "AutoFeatureExtractor"
MODEL_START_DOCSTRING = r"""
This model inherits from [`optimum.intel.openvino.modeling.OVBaseModel`]. Check the superclass documentation for the generic methods the
library implements for all its models (such as downloading or saving).
Parameters:
model (`openvino.Model`): The OpenVINO model that will be used to run inference with the OpenVINO Runtime.
config (`transformers.PretrainedConfig`): [PretrainedConfig](https://huggingface.co/docs/transformers/main_classes/configuration#transformers.PretrainedConfig)
is the Model configuration class with all the parameters of the model.
Initializing with a config file does not load the weights associated with the model, only the configuration.
Check out the [`~intel.openvino.modeling.OVBaseModel.from_pretrained`] method to load the model weights.
device (`str`, defaults to `"CPU"`):
The device type for which the model will be optimized. The resulting compiled model will contain nodes specific to this device.
dynamic_shapes (`bool`, defaults to `True`):
All the model's dimensions will be set to dynamic when set to `True`. Should be set to `False` for the model not to be dynamically reshaped by default.
ov_config (`Optional[Dict]`, defaults to `None`):
The dictionary containing the information related to the model compilation.
compile (`bool`, defaults to `True`):
Disable the model compilation during the loading step when set to `False`.
Can be useful to avoid unnecessary compilation when the model needs to be statically reshaped, moved to another device, or when FP16 conversion is enabled.
"""
INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.Tensor`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`](https://huggingface.co/docs/transformers/autoclass_tutorial#autotokenizer).
[What are input IDs?](https://huggingface.co/docs/transformers/glossary#input-ids)
attention_mask (`torch.Tensor`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](https://huggingface.co/docs/transformers/glossary#attention-mask)
token_type_ids (`torch.Tensor`, *optional*):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, 1]`:
- 1 for tokens that are **sentence A**,
- 0 for tokens that are **sentence B**.
[What are token type IDs?](https://huggingface.co/docs/transformers/glossary#token-type-ids)
"""
IMAGE_INPUTS_DOCSTRING = r"""
Args:
pixel_values (`torch.Tensor`):
Pixel values corresponding to the images in the current batch.
Pixel values can be obtained from encoded images using [`AutoFeatureExtractor`](https://huggingface.co/docs/transformers/autoclass_tutorial#autofeatureextractor).
"""
AUDIO_INPUTS_DOCSTRING = r"""
Args:
input_values (`torch.Tensor` of shape `({0})`):
Float values of the input raw speech waveform.
Input values can be obtained from audio file loaded into an array using [`AutoFeatureExtractor`](https://huggingface.co/docs/transformers/autoclass_tutorial#autofeatureextractor).
"""
class OVModel(OVBaseModel):
base_model_prefix = "openvino_model"
auto_model_class = AutoModel
def __init__(self, model: openvino.Model, config: transformers.PretrainedConfig = None, **kwargs):
super().__init__(model, config, **kwargs)
# Avoid warnings when creating a transformers pipeline
AutoConfig.register(self.base_model_prefix, AutoConfig)
self.auto_model_class.register(AutoConfig, self.__class__)
def forward(self, *args, **kwargs):
raise NotImplementedError
SEQUENCE_CLASSIFICATION_EXAMPLE = r"""
Example of sequence classification using `transformers.pipeline`:
```python
>>> from transformers import {processor_class}, pipeline
>>> from optimum.intel import {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}", export=True)
>>> pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
>>> outputs = pipe("Hello, my dog is cute")
```
"""
@add_start_docstrings(
"""
OpenVINO Model with a SequenceClassifierOutput for sequence classification tasks.
""",
MODEL_START_DOCSTRING,
)
class OVModelForSequenceClassification(OVModel):
export_feature = "text-classification"
auto_model_class = AutoModelForSequenceClassification
def __init__(self, model=None, config=None, **kwargs):
super().__init__(model, config, **kwargs)
@add_start_docstrings_to_model_forward(
INPUTS_DOCSTRING.format("batch_size, sequence_length")
+ SEQUENCE_CLASSIFICATION_EXAMPLE.format(
processor_class=_TOKENIZER_FOR_DOC,
model_class="OVModelForSequenceClassification",
checkpoint="distilbert-base-uncased-finetuned-sst-2-english",
)
)
def forward(
self,
input_ids: Union[torch.Tensor, np.ndarray],
attention_mask: Union[torch.Tensor, np.ndarray],
token_type_ids: Optional[Union[torch.Tensor, np.ndarray]] = None,
**kwargs,
):
self.compile()
np_inputs = isinstance(input_ids, np.ndarray)
if not np_inputs:
input_ids = input_ids.cpu().numpy()
attention_mask = attention_mask.cpu().numpy()
token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids
inputs = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
# Add the token_type_ids when needed
if "token_type_ids" in self.input_names:
inputs["token_type_ids"] = token_type_ids if token_type_ids is not None else np.zeros_like(input_ids)
outputs = self._inference(inputs)
logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"]
return SequenceClassifierOutput(logits=logits)
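# The forward pass above accepts either torch tensors or numpy arrays and
# mirrors the input type in its outputs; a minimal sketch of calling the model
# directly with numpy inputs (the checkpoint name is illustrative):
#
#     >>> from transformers import AutoTokenizer
#     >>> from optimum.intel import OVModelForSequenceClassification
#     >>> checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
#     >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#     >>> model = OVModelForSequenceClassification.from_pretrained(checkpoint, export=True)
#     >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
#     >>> logits = model(**inputs).logits  # numpy in -> numpy logits out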
QUESTION_ANSWERING_EXAMPLE = r"""
Example of question answering using `transformers.pipeline`:
```python
>>> from transformers import {processor_class}, pipeline
>>> from optimum.intel import {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}", export=True)
>>> pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)
>>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
>>> outputs = pipe(question, text)
```
"""
@add_start_docstrings(
"""
OpenVINO Model with a QuestionAnsweringModelOutput for extractive question-answering tasks.
""",
MODEL_START_DOCSTRING,
)
class OVModelForQuestionAnswering(OVModel):
export_feature = "question-answering"
auto_model_class = AutoModelForQuestionAnswering
def __init__(self, model=None, config=None, **kwargs):
super().__init__(model, config, **kwargs)
@add_start_docstrings_to_model_forward(
INPUTS_DOCSTRING.format("batch_size, sequence_length")
+ QUESTION_ANSWERING_EXAMPLE.format(
processor_class=_TOKENIZER_FOR_DOC,
model_class="OVModelForQuestionAnswering",
checkpoint="distilbert-base-cased-distilled-squad",
)
)
def forward(
self,
input_ids: Union[torch.Tensor, np.ndarray],
attention_mask: Union[torch.Tensor, np.ndarray],
token_type_ids: Optional[Union[torch.Tensor, np.ndarray]] = None,
**kwargs,
):
self.compile()
np_inputs = isinstance(input_ids, np.ndarray)
if not np_inputs:
input_ids = input_ids.cpu().numpy()
attention_mask = attention_mask.cpu().numpy()
token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids
inputs = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
# Add the token_type_ids when needed
if "token_type_ids" in self.input_names:
inputs["token_type_ids"] = token_type_ids if token_type_ids is not None else np.zeros_like(input_ids)
outputs = self._inference(inputs)
start_logits = (
torch.from_numpy(outputs["start_logits"]).to(self.device) if not np_inputs else outputs["start_logits"]
)
end_logits = (
torch.from_numpy(outputs["end_logits"]).to(self.device) if not np_inputs else outputs["end_logits"]
)
return QuestionAnsweringModelOutput(start_logits=start_logits, end_logits=end_logits)
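# A sketch of decoding an answer span from the raw start/end logits returned
# above (the `pipeline` helper does this automatically; checkpoint is illustrative):
#
#     >>> import torch
#     >>> from transformers import AutoTokenizer
#     >>> from optimum.intel import OVModelForQuestionAnswering
#     >>> checkpoint = "distilbert-base-cased-distilled-squad"
#     >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#     >>> model = OVModelForQuestionAnswering.from_pretrained(checkpoint, export=True)
#     >>> inputs = tokenizer("Who was Jim Henson?", "Jim Henson was a nice puppet", return_tensors="pt")
#     >>> outputs = model(**inputs)
#     >>> start = torch.argmax(outputs.start_logits)  # most likely start token index
#     >>> end = torch.argmax(outputs.end_logits) + 1  # exclusive end token index
#     >>> answer = tokenizer.decode(inputs["input_ids"][0, start:end])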
TOKEN_CLASSIFICATION_EXAMPLE = r"""
Example of token classification using `transformers.pipeline`:
```python
>>> from transformers import {processor_class}, pipeline
>>> from optimum.intel import {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}", export=True)
>>> pipe = pipeline("token-classification", model=model, tokenizer=tokenizer)
>>> outputs = pipe("My Name is Peter and I live in New York.")
```
"""
@add_start_docstrings(
"""
OpenVINO Model with a TokenClassifierOutput for token classification tasks.
""",
MODEL_START_DOCSTRING,
)
class OVModelForTokenClassification(OVModel):
export_feature = "token-classification"
auto_model_class = AutoModelForTokenClassification
def __init__(self, model=None, config=None, **kwargs):
super().__init__(model, config, **kwargs)
@add_start_docstrings_to_model_forward(
INPUTS_DOCSTRING.format("batch_size, sequence_length")
+ TOKEN_CLASSIFICATION_EXAMPLE.format(
processor_class=_TOKENIZER_FOR_DOC,
model_class="OVModelForTokenClassification",
checkpoint="dslim/bert-base-NER",
)
)
def forward(
self,
input_ids: Union[torch.Tensor, np.ndarray],
attention_mask: Union[torch.Tensor, np.ndarray],
token_type_ids: Optional[Union[torch.Tensor, np.ndarray]] = None,
**kwargs,
):
self.compile()
np_inputs = isinstance(input_ids, np.ndarray)
if not np_inputs:
input_ids = input_ids.cpu().numpy()
attention_mask = attention_mask.cpu().numpy()
token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids
inputs = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
# Add the token_type_ids when needed
if "token_type_ids" in self.input_names:
inputs["token_type_ids"] = token_type_ids if token_type_ids is not None else np.zeros_like(input_ids)
outputs = self._inference(inputs)
logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"]
return TokenClassifierOutput(logits=logits)
FEATURE_EXTRACTION_EXAMPLE = r"""
Example of feature extraction using `transformers.pipeline`:
```python
>>> from transformers import {processor_class}, pipeline
>>> from optimum.intel import {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}", export=True)
>>> pipe = pipeline("feature-extraction", model=model, tokenizer=tokenizer)
>>> outputs = pipe("My Name is Peter and I live in New York.")
```
"""
@add_start_docstrings(
"""
OpenVINO Model with a BaseModelOutput for feature extraction tasks.
""",
MODEL_START_DOCSTRING,
)
class OVModelForFeatureExtraction(OVModel):
export_feature = "feature-extraction"
auto_model_class = AutoModel
def __init__(self, model=None, config=None, **kwargs):
if {"token_embeddings", "sentence_embedding"}.issubset(
{name for output in model.outputs for name in output.names}
):  # Sentence Transformers outputs
raise ValueError(
"This model is a Sentence Transformers model. Please use `OVSentenceTransformer` to load this model."
)
super().__init__(model, config, **kwargs)
@add_start_docstrings_to_model_forward(
INPUTS_DOCSTRING.format("batch_size, sequence_length")
+ FEATURE_EXTRACTION_EXAMPLE.format(
processor_class=_TOKENIZER_FOR_DOC,
model_class="OVModelForFeatureExtraction",
checkpoint="sentence-transformers/all-MiniLM-L6-v2",
)
)
def forward(
self,
input_ids: Union[torch.Tensor, np.ndarray],
attention_mask: Union[torch.Tensor, np.ndarray],
token_type_ids: Optional[Union[torch.Tensor, np.ndarray]] = None,
**kwargs,
):
self.compile()
np_inputs = isinstance(input_ids, np.ndarray)
if not np_inputs:
input_ids = input_ids.cpu().numpy()
attention_mask = attention_mask.cpu().numpy()
token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids
inputs = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
# Add the token_type_ids when needed
if "token_type_ids" in self.input_names:
inputs["token_type_ids"] = token_type_ids if token_type_ids is not None else np.zeros_like(input_ids)
if "decoder_input_ids" in self.input_names:
inputs["decoder_input_ids"] = input_ids
outputs = self._inference(inputs)
last_hidden_state = (
torch.from_numpy(outputs["last_hidden_state"]).to(self.device)
if not np_inputs
else outputs["last_hidden_state"]
)
return BaseModelOutput(last_hidden_state=last_hidden_state)
@classmethod
def _from_pretrained(cls, model_id: Union[str, Path], config: PretrainedConfig, *args, **kwargs):
if config.model_type == "sam":
return OVSamModel._from_pretrained(model_id, config, *args, **kwargs)
else:
return super()._from_pretrained(model_id, config, *args, **kwargs)
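# Sentence embeddings are commonly derived from `last_hidden_state` with
# attention-mask-aware mean pooling; a minimal sketch (checkpoint is illustrative):
#
#     >>> import torch
#     >>> from transformers import AutoTokenizer
#     >>> from optimum.intel import OVModelForFeatureExtraction
#     >>> checkpoint = "sentence-transformers/all-MiniLM-L6-v2"
#     >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#     >>> model = OVModelForFeatureExtraction.from_pretrained(checkpoint, export=True)
#     >>> inputs = tokenizer(["I love burritos!"], padding=True, return_tensors="pt")
#     >>> hidden = model(**inputs).last_hidden_state
#     >>> mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
#     >>> embeddings = (hidden * mask).sum(dim=1) / mask.sum(dim=1)  # mean over valid tokens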
MASKED_LM_EXAMPLE = r"""
Example of masked language modeling using `transformers.pipeline`:
```python
>>> from transformers import {processor_class}, pipeline
>>> from optimum.intel import {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}", export=True)
>>> mask_token = tokenizer.mask_token
>>> pipe = pipeline("fill-mask", model=model, tokenizer=tokenizer)
>>> outputs = pipe("The goal of life is" + mask_token)
```
"""
@add_start_docstrings(
"""
OpenVINO Model with a MaskedLMOutput for masked language modeling tasks.
""",
MODEL_START_DOCSTRING,
)
class OVModelForMaskedLM(OVModel):
export_feature = "fill-mask"
auto_model_class = AutoModelForMaskedLM
def __init__(self, model=None, config=None, **kwargs):
super().__init__(model, config, **kwargs)
@add_start_docstrings_to_model_forward(
INPUTS_DOCSTRING.format("batch_size, sequence_length")
+ MASKED_LM_EXAMPLE.format(
processor_class=_TOKENIZER_FOR_DOC,
model_class="OVModelForMaskedLM",
checkpoint="roberta-base",
)
)
def forward(
self,
input_ids: Union[torch.Tensor, np.ndarray],
attention_mask: Union[torch.Tensor, np.ndarray],
token_type_ids: Optional[Union[torch.Tensor, np.ndarray]] = None,
**kwargs,
):
self.compile()
np_inputs = isinstance(input_ids, np.ndarray)
if not np_inputs:
input_ids = input_ids.cpu().numpy()
attention_mask = attention_mask.cpu().numpy()
token_type_ids = token_type_ids.cpu().numpy() if token_type_ids is not None else token_type_ids
inputs = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
# Add the token_type_ids when needed
if "token_type_ids" in self.input_names:
inputs["token_type_ids"] = token_type_ids if token_type_ids is not None else np.zeros_like(input_ids)
outputs = self._inference(inputs)
logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"]
return MaskedLMOutput(logits=logits)
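# A sketch of reading top predictions for the mask position directly from the
# logits above (checkpoint is illustrative):
#
#     >>> import torch
#     >>> from transformers import AutoTokenizer
#     >>> from optimum.intel import OVModelForMaskedLM
#     >>> checkpoint = "roberta-base"
#     >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
#     >>> model = OVModelForMaskedLM.from_pretrained(checkpoint, export=True)
#     >>> inputs = tokenizer(f"The goal of life is {tokenizer.mask_token}.", return_tensors="pt")
#     >>> logits = model(**inputs).logits
#     >>> mask_index = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
#     >>> top_ids = logits[0, mask_index].topk(5).indices[0]
#     >>> tokenizer.decode(top_ids)  # five most likely fillers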
IMAGE_CLASSIFICATION_EXAMPLE = r"""
Example of image classification using `transformers.pipeline`:
```python
>>> from transformers import {processor_class}, pipeline
>>> from optimum.intel import {model_class}
>>> preprocessor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}", export=True)
>>> model.reshape(batch_size=1, sequence_length=3, height=224, width=224)
>>> pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor)
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> outputs = pipe(url)
```
This class can also be used with [timm](https://github.com/huggingface/pytorch-image-models)
models hosted on [HuggingFaceHub](https://huggingface.co/timm). Example:
```python
>>> from transformers import pipeline
>>> from optimum.intel.openvino.modeling_timm import TimmImageProcessor
>>> from optimum.intel import OVModelForImageClassification
>>> model_id = "timm/vit_tiny_patch16_224.augreg_in21k"
>>> preprocessor = TimmImageProcessor.from_pretrained(model_id)
>>> model = OVModelForImageClassification.from_pretrained(model_id, export=True)
>>> pipe = pipeline("image-classification", model=model, feature_extractor=preprocessor)
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> outputs = pipe(url)
```
"""
@add_start_docstrings(
"""
OpenVINO Model with an ImageClassifierOutput for image classification tasks.
""",
MODEL_START_DOCSTRING,
)
class OVModelForImageClassification(OVModel):
export_feature = "image-classification"
auto_model_class = AutoModelForImageClassification
def __init__(self, model=None, config=None, **kwargs):
super().__init__(model, config, **kwargs)
@classmethod
def from_pretrained(
cls,
model_id: Union[str, Path],
export: bool = False,
config: Optional["PretrainedConfig"] = None,
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
force_download: bool = False,
cache_dir: str = HUGGINGFACE_HUB_CACHE,
subfolder: str = "",
local_files_only: bool = False,
task: Optional[str] = None,
trust_remote_code: bool = False,
**kwargs,
):
# Fix the mismatch between timm_config and huggingface_config
local_timm_model = _is_timm_ov_dir(model_id)
if local_timm_model or (not os.path.isdir(model_id) and model_info(model_id).library_name == "timm"):
if not is_timm_available():
raise ImportError(
"To load a timm model, timm needs to be installed. Please install it with `pip install timm`."
)
if is_timm_version("<", "0.9.0"):
raise ImportError(
"To load a timm model, please make sure to upgrade your `timm` version to at least 0.9.0, you can upgrade it by running `pip install --upgrade timm`"
)
from .modeling_timm import TimmConfig, TimmForImageClassification, TimmOnnxConfig
config = TimmConfig.from_pretrained(model_id, **kwargs)
# If locally saved timm model, directly load
if local_timm_model:
return super()._from_pretrained(model_id=model_id, config=config)
model = TimmForImageClassification.from_pretrained(model_id, **kwargs)
onnx_config = TimmOnnxConfig(model.config)
return cls._to_load(model=model, config=config, onnx_config=onnx_config, stateful=False, **kwargs)
else:
return super().from_pretrained(
model_id=model_id,
config=config,
export=export,
token=token,
revision=revision,
force_download=force_download,
cache_dir=cache_dir,
subfolder=subfolder,
local_files_only=local_files_only,
task=task,
trust_remote_code=trust_remote_code,
**kwargs,
)
@add_start_docstrings_to_model_forward(
IMAGE_INPUTS_DOCSTRING.format("batch_size, num_channels, height, width")
+ IMAGE_CLASSIFICATION_EXAMPLE.format(
processor_class=_FEATURE_EXTRACTOR_FOR_DOC,
model_class="OVModelForImageClassification",
checkpoint="google/vit-base-patch16-224",
)
)
def forward(
self,
pixel_values: Union[torch.Tensor, np.ndarray],
**kwargs,
):
self.compile()
np_inputs = isinstance(pixel_values, np.ndarray)
if not np_inputs:
pixel_values = pixel_values.cpu().numpy()
inputs = {
"pixel_values": pixel_values,
}
outputs = self._inference(inputs)
logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"]
return ImageClassifierOutput(logits=logits)
AUDIO_CLASSIFICATION_EXAMPLE = r"""
Example of audio classification using `transformers.pipeline`:
```python
>>> from datasets import load_dataset
>>> from transformers import {processor_class}, pipeline
>>> from optimum.intel import {model_class}
>>> preprocessor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}", export=True)
>>> pipe = pipeline("audio-classification", model=model, feature_extractor=preprocessor)
>>> dataset = load_dataset("superb", "ks", split="test")
>>> audio_file = dataset[3]["audio"]["array"]
>>> outputs = pipe(audio_file)
```
"""
@add_start_docstrings(
"""
OpenVINO Model with a SequenceClassifierOutput for audio classification tasks.
""",
MODEL_START_DOCSTRING,
)
class OVModelForAudioClassification(OVModel):
export_feature = "audio-classification"
auto_model_class = AutoModelForAudioClassification
def __init__(self, model=None, config=None, **kwargs):
super().__init__(model, config, **kwargs)
@add_start_docstrings_to_model_forward(
INPUTS_DOCSTRING.format("batch_size, sequence_length")
+ AUDIO_CLASSIFICATION_EXAMPLE.format(
processor_class=_FEATURE_EXTRACTOR_FOR_DOC,
model_class="OVModelForAudioClassification",
checkpoint="superb/hubert-base-superb-er",
)
)
def forward(
self,
input_values: Union[torch.Tensor, np.ndarray],
attention_mask: Optional[Union[torch.Tensor, np.ndarray]] = None,
**kwargs,
):
self.compile()
np_inputs = isinstance(input_values, np.ndarray)
if not np_inputs:
input_values = input_values.cpu().numpy()
attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask
inputs = {
"input_values": input_values,
}
# Add the attention_mask when needed
if "attention_mask" in self.input_names:
inputs["attention_mask"] = attention_mask
outputs = self._inference(inputs)
logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"]
return SequenceClassifierOutput(logits=logits)
CTC_EXAMPLE = r"""
Example of CTC:
```python
>>> from transformers import {processor_class}
>>> from optimum.intel import {model_class}
>>> from datasets import load_dataset
>>> import numpy as np
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
>>> processor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}", export=True)
>>> # audio file is decoded on the fly
>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="np")
>>> logits = model(**inputs).logits
>>> predicted_ids = np.argmax(logits, axis=-1)
>>> transcription = processor.batch_decode(predicted_ids)
```
"""
@add_start_docstrings(
"""
OpenVINO Model with a language modeling head on top for Connectionist Temporal Classification (CTC).
""",
MODEL_START_DOCSTRING,
)
class OVModelForCTC(OVModel):
"""
CTC model for OpenVINO.
"""
auto_model_class = AutoModelForCTC
export_feature = "automatic-speech-recognition"
@add_start_docstrings_to_model_forward(
AUDIO_INPUTS_DOCSTRING.format("batch_size, sequence_length")
+ CTC_EXAMPLE.format(
processor_class=_FEATURE_EXTRACTOR_FOR_DOC,
model_class="OVModelForCTC",
checkpoint="facebook/hubert-large-ls960-ft",
)
)
def forward(
self,
input_values: Optional[torch.Tensor] = None,
attention_mask: Optional[Union[torch.Tensor, np.ndarray]] = None,
**kwargs,
):
self.compile()
np_inputs = isinstance(input_values, np.ndarray)
if not np_inputs:
input_values = input_values.cpu().numpy()
attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask
inputs = {
"input_values": input_values,
}
# Add the attention_mask when needed
if "attention_mask" in self.input_names:
inputs["attention_mask"] = attention_mask
outputs = self._inference(inputs)
logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"]
return CausalLMOutput(logits=logits)
AUDIO_XVECTOR_EXAMPLE = r"""
Example of Audio XVector:
```python
>>> from transformers import {processor_class}
>>> from optimum.intel import {model_class}
>>> from datasets import load_dataset
>>> import torch
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
>>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}", export=True)
>>> # audio file is decoded on the fly
>>> inputs = feature_extractor(
... [d["array"] for d in dataset[:2]["audio"]], sampling_rate=sampling_rate, return_tensors="pt", padding=True
... )
>>> embeddings = model(**inputs).embeddings
>>> embeddings = torch.nn.functional.normalize(embeddings, dim=-1).cpu()
>>> cosine_sim = torch.nn.CosineSimilarity(dim=-1)
>>> similarity = cosine_sim(embeddings[0], embeddings[1])
>>> threshold = 0.7
>>> if similarity < threshold:
... print("Speakers are not the same!")
>>> round(similarity.item(), 2)
```
"""
@add_start_docstrings(
"""
OpenVINO Model with an XVector feature extraction head on top for tasks like Speaker Verification.
""",
MODEL_START_DOCSTRING,
)
class OVModelForAudioXVector(OVModel):
"""
Audio XVector model for OpenVINO.
"""
auto_model_class = AutoModelForAudioXVector
export_feature = "audio-xvector"
@add_start_docstrings_to_model_forward(
AUDIO_INPUTS_DOCSTRING.format("batch_size, sequence_length")
+ AUDIO_XVECTOR_EXAMPLE.format(
processor_class=_FEATURE_EXTRACTOR_FOR_DOC,
model_class="OVModelForAudioXVector",
checkpoint="anton-l/wav2vec2-base-superb-sv",
)
)
def forward(
self,
input_values: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
**kwargs,
):
self.compile()
np_inputs = isinstance(input_values, np.ndarray)
if not np_inputs:
input_values = input_values.cpu().numpy()
attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask
inputs = {
"input_values": input_values,
}
# Add the attention_mask when needed
if "attention_mask" in self.input_names:
inputs["attention_mask"] = attention_mask
outputs = self._inference(inputs)
logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"]
embeddings = (
torch.from_numpy(outputs["embeddings"]).to(self.device) if not np_inputs else outputs["embeddings"]
)
return XVectorOutput(logits=logits, embeddings=embeddings)
AUDIO_FRAME_CLASSIFICATION_EXAMPLE = r"""
Example of audio frame classification:
```python
>>> from transformers import {processor_class}
>>> from optimum.intel import {model_class}
>>> from datasets import load_dataset
>>> import torch
>>> dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate
>>> feature_extractor = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}", export=True)
>>> inputs = feature_extractor(dataset[0]["audio"]["array"], return_tensors="pt", sampling_rate=sampling_rate)
>>> logits = model(**inputs).logits
>>> probabilities = torch.sigmoid(torch.as_tensor(logits)[0])
>>> labels = (probabilities > 0.5).long()
>>> labels[0].tolist()
```
"""
@add_start_docstrings(
"""
OpenVINO Model with a frame classification head on top for tasks like Speaker Diarization.
""",
MODEL_START_DOCSTRING,
)
class OVModelForAudioFrameClassification(OVModel):
"""
Audio Frame Classification model for OpenVINO.
"""
auto_model_class = AutoModelForAudioFrameClassification
export_feature = "audio-frame-classification"
@add_start_docstrings_to_model_forward(
AUDIO_INPUTS_DOCSTRING.format("batch_size, sequence_length")
+ AUDIO_FRAME_CLASSIFICATION_EXAMPLE.format(
processor_class=_FEATURE_EXTRACTOR_FOR_DOC,
model_class="OVModelForAudioFrameClassification",
checkpoint="anton-l/wav2vec2-base-superb-sd",
)
)
def forward(
self,
input_values: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
**kwargs,
):
self.compile()
np_inputs = isinstance(input_values, np.ndarray)
if not np_inputs:
input_values = input_values.cpu().numpy()
attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask
inputs = {
"input_values": input_values,
}
# Add the attention_mask when needed
if "attention_mask" in self.input_names:
inputs["attention_mask"] = attention_mask
outputs = self._inference(inputs)
logits = torch.from_numpy(outputs["logits"]).to(self.device) if not np_inputs else outputs["logits"]
return TokenClassifierOutput(logits=logits)
CUSTOM_TASKS_EXAMPLE = """
Example of custom tasks (e.g. a Sentence Transformers model with a pooler head):
```python
>>> from transformers import {processor_class}
>>> from optimum.intel import {model_class}
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}")
>>> inputs = tokenizer("I love burritos!", return_tensors="np")
>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooler_output = outputs.pooler_output
```
"""
@add_start_docstrings(
"""
OpenVINO Model for custom tasks. It can be used to leverage inference acceleration for any single-file OpenVINO model that may use custom inputs and outputs.
""",
MODEL_START_DOCSTRING,
)
class OVModelForCustomTasks(OVModel):
@add_start_docstrings_to_model_forward(
CUSTOM_TASKS_EXAMPLE.format(
processor_class=_TOKENIZER_FOR_DOC,
model_class="OVModelForCustomTasks",
checkpoint="IlyasMoutawwakil/sbert-all-MiniLM-L6-v2-with-pooler",
)
)
def forward(self, **kwargs):
expected_inputs_names = set(self.input_names)
inputs_names = set(kwargs)
if not expected_inputs_names.issubset(inputs_names):
raise ValueError(
f"Got unexpected inputs: expecting the following inputs : {', '.join(expected_inputs_names)} but got : {', '.join(inputs_names)}."
)
np_inputs = isinstance(next(iter(kwargs.values())), np.ndarray)
inputs = {}
for input_name in self.input_names:
inputs[input_name] = kwargs.pop(input_name).cpu().numpy() if not np_inputs else kwargs.pop(input_name)
outputs = self._inference(inputs)
model_outputs = {}
for key, value in outputs.items():
key_name = next(iter(key.names))
if "." in key_name:
key_name = key_name.split(".")[0]
if key_name not in model_outputs:
model_outputs[key_name] = []
model_outputs[key_name].append(torch.from_numpy(value).to(self.device) if not np_inputs else value)
else:
model_outputs[key_name] = torch.from_numpy(value).to(self.device) if not np_inputs else value
return ModelOutput(**model_outputs)
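# Note on the output grouping above: OpenVINO output tensors whose names share
# a dotted prefix (e.g. "hidden_states.0", "hidden_states.1", ...) are gathered
# into a single list under the common prefix, so such outputs read like the
# tuples returned by the equivalent transformers model. These names are
# illustrative; the actual output names depend on the exported model.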
class OVModelForZeroShotImageClassification(OVModel):
auto_model_class = AutoModelForZeroShotImageClassification
export_feature = "zero-shot-image-classification"
def forward(self, input_ids, pixel_values, attention_mask: Optional[torch.Tensor] = None, **kwargs):
self.compile()
np_inputs = isinstance(input_ids, np.ndarray)
if not np_inputs:
input_ids = input_ids.cpu().numpy()
pixel_values = pixel_values.cpu().numpy()
attention_mask = attention_mask.cpu().numpy() if attention_mask is not None else attention_mask
inputs = {"input_ids": input_ids, "pixel_values": pixel_values}
# Add the attention_mask when needed
if "attention_mask" in self.input_names:
inputs["attention_mask"] = attention_mask if attention_mask is not None else np.ones_like(input_ids)
outputs = self._inference(inputs)
logits_per_image = (
torch.from_numpy(outputs["logits_per_image"]).to(self.device)
if not np_inputs
else outputs["logits_per_image"]
)
logits_per_text = (
torch.from_numpy(outputs["logits_per_text"]).to(self.device)
if not np_inputs
else outputs["logits_per_text"]
)
text_embeds = (
torch.from_numpy(outputs["text_embeds"]).to(self.device) if not np_inputs else outputs["text_embeds"]
)
image_embeds = (
torch.from_numpy(outputs["image_embeds"]).to(self.device) if not np_inputs else outputs["image_embeds"]
)
return CLIPOutput(
logits_per_image=logits_per_image,
logits_per_text=logits_per_text,
text_embeds=text_embeds,
image_embeds=image_embeds,
)
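# A minimal usage sketch for the zero-shot image classification class above
# (the checkpoint and candidate labels are illustrative):
#
#     >>> from transformers import AutoProcessor, pipeline
#     >>> from optimum.intel import OVModelForZeroShotImageClassification
#     >>> checkpoint = "openai/clip-vit-base-patch32"
#     >>> processor = AutoProcessor.from_pretrained(checkpoint)
#     >>> model = OVModelForZeroShotImageClassification.from_pretrained(checkpoint, export=True)
#     >>> pipe = pipeline(
#     ...     "zero-shot-image-classification",
#     ...     model=model,
#     ...     tokenizer=processor.tokenizer,
#     ...     image_processor=processor.image_processor,
#     ... )
#     >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
#     >>> outputs = pipe(url, candidate_labels=["a photo of a cat", "a photo of a dog"])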