# optimum/neuron/modeling_decoder.py
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Base class for text-generation model architectures on neuron devices."""
import logging
import os
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Optional, Union

import torch
from huggingface_hub import HfApi
from transformers import GenerationConfig, PretrainedConfig
from transformers.file_utils import add_start_docstrings
from transformers.generation import StoppingCriteriaList

from .configuration_utils import NeuronConfig
from .modeling_base import NeuronModel
from .models.auto_model import get_neuron_model_class
from .utils.system import get_available_cores

logger = logging.getLogger(__name__)

NEURON_CAUSALLM_MODEL_START_DOCSTRING = r"""
This model inherits from [`~neuron.NeuronModel`]. Check the superclass documentation for the generic methods the
    library implements for all its models (such as downloading or saving).
"""
NEURON_CAUSALLM_MODEL_GENERATE_DOCSTRING = r"""
A streamlined generate() method overriding the transformers.GenerationMixin.generate() method.
    This method uses the same logits processors/warpers and stopping criteria as the transformers library
    `generate()` method but restricts the generation to greedy search and sampling.
    It does not support the advanced options of the transformers `generate()` method.
Please refer to https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate
for details on generation configuration.
Parameters:
input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
The sequence used as a prompt for the generation.
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices.
generation_config (`~transformers.generation.GenerationConfig`, *optional*):
The generation configuration to be used as base parametrization for the generation call. `**kwargs`
passed to generate matching the attributes of `generation_config` will override them. If
            `generation_config` is not provided, a default will be used, which has the following loading
priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
configuration. Please note that unspecified parameters will inherit [`~transformers.generation.GenerationConfig`]'s
default values, whose documentation should be checked to parameterize generation.
        stopping_criteria (`Optional[transformers.generation.StoppingCriteriaList]`, defaults to `None`):
Custom stopping criteria that complement the default stopping criteria built from arguments and a
generation config.
Returns:
        `torch.LongTensor`: The generated token ids.
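
    Example of generation with a custom stopping criterion (a hedged sketch; `MaxTimeCriteria` is a standard
    transformers criterion, and the checkpoint is only illustrative):

    ```python
    >>> from transformers import AutoTokenizer
    >>> from transformers.generation import MaxTimeCriteria, StoppingCriteriaList
    >>> from optimum.neuron import NeuronModelForCausalLM
    >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
    >>> model = NeuronModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", export=True)
    >>> inputs = tokenizer("My favorite moment of the day is", return_tensors="pt")
    >>> # Stop generating after at most 10 seconds
    >>> stopping_criteria = StoppingCriteriaList([MaxTimeCriteria(max_time=10.0)])
    >>> gen_tokens = model.generate(**inputs, stopping_criteria=stopping_criteria)
    ```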
"""
TEXT_GENERATION_EXAMPLE = r"""
Example of text generation:
```python
>>> from transformers import {processor_class}
>>> from optimum.neuron import {model_class}
>>> import torch
>>> tokenizer = {processor_class}.from_pretrained("{checkpoint}")
>>> model = {model_class}.from_pretrained("{checkpoint}", export=True)
>>> inputs = tokenizer("My favorite moment of the day is", return_tensors="pt")
>>> gen_tokens = model.generate(**inputs, do_sample=True, temperature=0.9, min_length=20, max_length=20)
>>> tokenizer.batch_decode(gen_tokens) # doctest: +IGNORE_RESULT
```
"""
def get_neuron_causal_lm_model_class(config: PretrainedConfig):
cls = get_neuron_model_class(config.model_type, task="text-generation", mode="inference")
if not issubclass(cls, NeuronModelForCausalLM):
raise ValueError(f"Model {config.model_type} is not a causal language model. Please use another base model.")
    return cls


@add_start_docstrings(
r"""
Neuron model with a causal language modeling head for inference on Neuron devices.
""",
NEURON_CAUSALLM_MODEL_START_DOCSTRING,
)
class NeuronModelForCausalLM(NeuronModel, ABC):
    preprocessors = []  # Required by optimum OptimizedModel

    @classmethod
def get_neuron_config(
cls,
model_name_or_path: Union[str, Path],
config: "PretrainedConfig",
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
batch_size: Optional[int] = None,
sequence_length: Optional[int] = None,
tensor_parallel_size: Optional[int] = None,
auto_cast_type: Optional[str] = None,
):
"""
        Get the Neuron configuration for the target model class.

        Can be called either from the `NeuronModelForCausalLM` class or from a specific model class.
        In the first case, the actual model class is deduced from the model configuration.

        Args:
model_name_or_path (`str` or `Path`):
The model name or path to the model directory.
config (`PretrainedConfig`):
The model configuration.
token (`str`, *optional*):
The token to use for authentication with the Hugging Face Hub.
revision (`str`, *optional*):
The revision of the model to use. If not specified, the latest revision will be used.
batch_size (`int`, *optional*):
The batch size to use for inference. If not specified, defaults to 1.
sequence_length (`int`, *optional*):
                The sequence length to use for inference. If not specified, it is deduced from the model
                configuration, falling back to 1024.
tensor_parallel_size (`int`, *optional*):
The number of cores to use for tensor parallelism. If not specified, all available cores will be used.
auto_cast_type (`str`, *optional*):
The data type to use for automatic casting. If not specified, defaults to the model's data type.
Returns:
`NeuronConfig`: The Neuron configuration for the model.
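
        Example (a hedged sketch; the checkpoint and parameter values are only illustrative and assume a
        supported model on a Neuron host):

        ```python
        >>> from transformers import AutoConfig
        >>> from optimum.neuron import NeuronModelForCausalLM
        >>> config = AutoConfig.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
        >>> neuron_config = NeuronModelForCausalLM.get_neuron_config(
        ...     "Qwen/Qwen2.5-0.5B-Instruct",
        ...     config,
        ...     batch_size=1,
        ...     sequence_length=2048,
        ...     tensor_parallel_size=2,
        ...     auto_cast_type="bf16",
        ... )
        ```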
"""
if os.path.isdir(model_name_or_path):
checkpoint_id = None
checkpoint_revision = None
else:
checkpoint_id = model_name_or_path
# Get the exact checkpoint revision (SHA1)
api = HfApi(token=token)
model_info = api.repo_info(model_name_or_path, revision=revision)
checkpoint_revision = model_info.sha
if batch_size is None:
batch_size = 1
# If the sequence_length was not specified, deduce it from the model configuration
if sequence_length is None:
if hasattr(config, "n_positions"):
sequence_length = config.n_positions
elif hasattr(config, "max_position_embeddings"):
sequence_length = config.max_position_embeddings
else:
sequence_length = 1024
if tensor_parallel_size is None:
# Use all available cores
tensor_parallel_size = get_available_cores()
if auto_cast_type is None:
auto_cast_type = "fp32"
if config.torch_dtype == "float16":
auto_cast_type = "fp16"
elif config.torch_dtype == "bfloat16":
auto_cast_type = "bf16"
        if cls is NeuronModelForCausalLM:
# Instantiation through the abstract class: find the correct model class
cls = get_neuron_causal_lm_model_class(config)
# Call the _get_neuron_config method of the specific model class
return cls._get_neuron_config(
checkpoint_id=checkpoint_id,
checkpoint_revision=checkpoint_revision,
batch_size=batch_size,
sequence_length=sequence_length,
tensor_parallel_size=tensor_parallel_size,
auto_cast_type=auto_cast_type,
        )

    @classmethod
def _from_transformers(cls, *args, **kwargs):
        # Deprecated: remove this alias once a stable optimum release uses `_export` as `from_pretrained_method`.
        return cls._export(*args, **kwargs)

    @classmethod
def _export(
cls,
model_id: str,
config: "PretrainedConfig",
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
batch_size: Optional[int] = None,
sequence_length: Optional[int] = None,
num_cores: Optional[int] = None,
auto_cast_type: Optional[str] = "bf16",
task: Optional[str] = "text-generation",
**kwargs,
) -> "NeuronModelForCausalLM":
"""Implementation of the `optimum.OptimizedModel._export` method.
It accepts simplified parameters and converts them to a NeuronConfig object.
This NeuronConfig object is then passed to the `export` method that is in charge
of exporting the model to Neuron format.
Args:
model_id (`str`):
The model ID or path to the model directory.
config (`PretrainedConfig`):
The model configuration.
token (`str`, *optional*):
The token to use for authentication with the Hugging Face Hub.
revision (`str`, *optional*):
The revision of the model to use. If not specified, the latest revision will be used.
batch_size (`int`, *optional*):
The batch size to use for inference. If not specified, defaults to 1.
sequence_length (`int`, *optional*):
                The sequence length to use for inference. If not specified, it is deduced from the model
                configuration, falling back to 1024.
num_cores (`int`, *optional*):
The number of cores to use for tensor parallelism. If not specified, all available cores will be used.
auto_cast_type (`str`, *optional*):
The data type to use for automatic casting. If not specified, defaults to the model's data type.
task (`str`, *optional*):
The task for which the model is being exported. Defaults to "text-generation".
Returns:
`NeuronModelForCausalLM`: The exported Neuron model.
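
        Example (a hedged sketch; `_export` is normally reached through `from_pretrained` with `export=True`,
        and the checkpoint and parameter values are only illustrative):

        ```python
        >>> from optimum.neuron import NeuronModelForCausalLM
        >>> model = NeuronModelForCausalLM.from_pretrained(
        ...     "Qwen/Qwen2.5-0.5B-Instruct",
        ...     export=True,
        ...     batch_size=1,
        ...     sequence_length=2048,
        ...     num_cores=2,
        ...     auto_cast_type="bf16",
        ... )
        ```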
"""
if task != "text-generation":
raise ValueError(
f"Task {task} is not supported for causal language models. Please use another base model."
)
if cls is NeuronModelForCausalLM:
# Instantiation through the abstract class: find the correct model class
cls = get_neuron_causal_lm_model_class(config)
# Create the neuron config for the specified parameters
neuron_config = cls.get_neuron_config(
model_id,
config,
token=token,
revision=revision,
batch_size=batch_size,
sequence_length=sequence_length,
tensor_parallel_size=num_cores,
auto_cast_type=auto_cast_type,
)
return cls.export(
model_id,
config,
neuron_config,
token=token,
revision=revision,
**kwargs,
        )

    @classmethod
def _from_pretrained(
cls,
model_id: Union[str, "Path"],
config: "PretrainedConfig",
**kwargs,
) -> "NeuronModelForCausalLM":
# Find the correct model class
cls = get_neuron_causal_lm_model_class(config)
        return cls._from_pretrained(model_id, config, **kwargs)

    @add_start_docstrings(
NEURON_CAUSALLM_MODEL_GENERATE_DOCSTRING
+ TEXT_GENERATION_EXAMPLE.format(
processor_class="AutoTokenizer",
model_class="NeuronModelForCausalLM",
checkpoint="Qwen/Qwen2.5-0.5B-Instruct",
)
)
def generate(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
generation_config: Optional["GenerationConfig"] = None,
stopping_criteria: Optional["StoppingCriteriaList"] = None,
**kwargs,
) -> torch.LongTensor:
        raise NotImplementedError

    @classmethod
@abstractmethod
def _get_neuron_config(
cls,
checkpoint_id: str,
checkpoint_revision: str,
batch_size: int,
sequence_length: int,
tensor_parallel_size: int,
auto_cast_type: str,
):
raise NotImplementedError("The `get_neuron_config` method must be implemented in the subclass.")
@classmethod
def export(
cls,
model_id: str,
config: "PretrainedConfig",
neuron_config: "NeuronConfig",
token: Optional[Union[bool, str]] = None,
revision: Optional[str] = None,
load_weights: Optional[bool] = True,
**kwargs,
) -> "NeuronModelForCausalLM":
"""Export the model to Neuron format.
This method must be implemented by the subclass. It should handle the export of the model to Neuron format.
Args:
model_id (`str`):
The model ID or path to the model directory.
config (`PretrainedConfig`):
The model configuration.
neuron_config (`NeuronConfig`):
The Neuron configuration for the model.
token (`str`, *optional*):
The token to use for authentication with the Hugging Face Hub.
revision (`str`, *optional*):
The revision of the model to use. If not specified, the latest revision will be used.
load_weights (`bool`, *optional*, defaults to `True`):
Whether to load the model weights after exporting. If `False`, the model will be exported without weights.
Returns:
`NeuronModelForCausalLM`: The exported Neuron model.
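
        Example (a hedged sketch of a direct call that mirrors the dispatch performed by `_export`; the
        checkpoint is only illustrative):

        ```python
        >>> from transformers import AutoConfig
        >>> from optimum.neuron.modeling_decoder import get_neuron_causal_lm_model_class
        >>> model_id = "Qwen/Qwen2.5-0.5B-Instruct"
        >>> config = AutoConfig.from_pretrained(model_id)
        >>> neuron_cls = get_neuron_causal_lm_model_class(config)  # resolve the concrete model class
        >>> neuron_config = neuron_cls.get_neuron_config(model_id, config)
        >>> model = neuron_cls.export(model_id, config, neuron_config)
        ```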
"""
raise NotImplementedError(
"The `export` method must be implemented in the subclass. It should handle the export of the model to Neuron format."
)