optimum/neuron/modeling_decoder.py (170 lines of code) (raw):

# coding=utf-8 # Copyright 2023 The HuggingFace Inc. team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Base class for text-generation model architectures on neuron devices.""" import logging import os from abc import ABC, abstractmethod from pathlib import Path from typing import Optional, Union import torch from huggingface_hub import HfApi from transformers import GenerationConfig, PretrainedConfig from transformers.file_utils import add_start_docstrings from transformers.generation import StoppingCriteriaList from .configuration_utils import NeuronConfig from .modeling_base import NeuronModel from .models.auto_model import get_neuron_model_class from .utils.system import get_available_cores logger = logging.getLogger(__name__) NEURON_CAUSALLM_MODEL_START_DOCSTRING = r""" This model inherits from [`~neuron.NeuronModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving) """ NEURON_CAUSALLM_MODEL_GENERATE_DOCSTRING = r""" A streamlined generate() method overriding the transformers.GenerationMixin.generate() method. This method uses the same logits processors/warpers and stopping criterias as the transformers library `generate()` method but restricts the generation to greedy search and sampling. It does not support transformers `generate()` advanced options. Please refer to https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.GenerationMixin.generate for details on generation configuration. Parameters: input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`): The sequence used as a prompt for the generation. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. generation_config (`~transformers.generation.GenerationConfig`, *optional*): The generation configuration to be used as base parametrization for the generation call. `**kwargs` passed to generate matching the attributes of `generation_config` will override them. If `generation_config` is not provided, default will be used, which had the following loading priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model configuration. Please note that unspecified parameters will inherit [`~transformers.generation.GenerationConfig`]'s default values, whose documentation should be checked to parameterize generation. stopping_criteria (`Optional[transformers.generation.StoppingCriteriaList], defaults to `None`): Custom stopping criteria that complement the default stopping criteria built from arguments and a generation config. Returns: `torch.Tensor`: A `torch.FloatTensor`. """ TEXT_GENERATION_EXAMPLE = r""" Example of text generation: ```python >>> from transformers import {processor_class} >>> from optimum.neuron import {model_class} >>> import torch >>> tokenizer = {processor_class}.from_pretrained("{checkpoint}") >>> model = {model_class}.from_pretrained("{checkpoint}", export=True) >>> inputs = tokenizer("My favorite moment of the day is", return_tensors="pt") >>> gen_tokens = model.generate(**inputs, do_sample=True, temperature=0.9, min_length=20, max_length=20) >>> tokenizer.batch_decode(gen_tokens) # doctest: +IGNORE_RESULT ``` """ def get_neuron_causal_lm_model_class(config: PretrainedConfig): cls = get_neuron_model_class(config.model_type, task="text-generation", mode="inference") if not issubclass(cls, NeuronModelForCausalLM): raise ValueError(f"Model {config.model_type} is not a causal language model. Please use another base model.") return cls @add_start_docstrings( r""" Neuron model with a causal language modeling head for inference on Neuron devices. """, NEURON_CAUSALLM_MODEL_START_DOCSTRING, ) class NeuronModelForCausalLM(NeuronModel, ABC): preprocessors = [] # Required by optimum OptimizedModel @classmethod def get_neuron_config( cls, model_name_or_path: Union[str, Path], config: "PretrainedConfig", token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, batch_size: Optional[int] = None, sequence_length: Optional[int] = None, tensor_parallel_size: Optional[int] = None, auto_cast_type: Optional[str] = None, ): """ Get the Neuron configuration for the target model class. Can be called either from the NeuronModelForCausalLM class or from a specific model class. In the first case, the actual model class will be deduced from the model configuration. Args: neuron_model_class (`type`): The class of the target neuron model. model_name_or_path (`str` or `Path`): The model name or path to the model directory. config (`PretrainedConfig`): The model configuration. token (`str`, *optional*): The token to use for authentication with the Hugging Face Hub. revision (`str`, *optional*): The revision of the model to use. If not specified, the latest revision will be used. batch_size (`int`, *optional*): The batch size to use for inference. If not specified, defaults to 1. sequence_length (`int`, *optional*): The sequence length to use for inference. If not specified, defaults to the model's maximum sequence length. tensor_parallel_size (`int`, *optional*): The number of cores to use for tensor parallelism. If not specified, all available cores will be used. auto_cast_type (`str`, *optional*): The data type to use for automatic casting. If not specified, defaults to the model's data type. Returns: `NeuronConfig`: The Neuron configuration for the model. """ if os.path.isdir(model_name_or_path): checkpoint_id = None checkpoint_revision = None else: checkpoint_id = model_name_or_path # Get the exact checkpoint revision (SHA1) api = HfApi(token=token) model_info = api.repo_info(model_name_or_path, revision=revision) checkpoint_revision = model_info.sha if batch_size is None: batch_size = 1 # If the sequence_length was not specified, deduce it from the model configuration if sequence_length is None: if hasattr(config, "n_positions"): sequence_length = config.n_positions elif hasattr(config, "max_position_embeddings"): sequence_length = config.max_position_embeddings else: sequence_length = 1024 if tensor_parallel_size is None: # Use all available cores tensor_parallel_size = get_available_cores() if auto_cast_type is None: auto_cast_type = "fp32" if config.torch_dtype == "float16": auto_cast_type = "fp16" elif config.torch_dtype == "bfloat16": auto_cast_type = "bf16" if type(cls) is NeuronModelForCausalLM: # Instantiation through the abstract class: find the correct model class cls = get_neuron_causal_lm_model_class(config) # Call the _get_neuron_config method of the specific model class return cls._get_neuron_config( checkpoint_id=checkpoint_id, checkpoint_revision=checkpoint_revision, batch_size=batch_size, sequence_length=sequence_length, tensor_parallel_size=tensor_parallel_size, auto_cast_type=auto_cast_type, ) @classmethod def _from_transformers(cls, *args, **kwargs): # Deprecate it when optimum uses `_export` as from_pretrained_method in a stable release. return cls._export(*args, **kwargs) @classmethod def _export( cls, model_id: str, config: "PretrainedConfig", token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, batch_size: Optional[int] = None, sequence_length: Optional[int] = None, num_cores: Optional[int] = None, auto_cast_type: Optional[str] = "bf16", task: Optional[str] = "text-generation", **kwargs, ) -> "NeuronModelForCausalLM": """Implementation of the `optimum.OptimizedModel._export` method. It accepts simplified parameters and converts them to a NeuronConfig object. This NeuronConfig object is then passed to the `export` method that is in charge of exporting the model to Neuron format. Args: model_id (`str`): The model ID or path to the model directory. config (`PretrainedConfig`): The model configuration. token (`str`, *optional*): The token to use for authentication with the Hugging Face Hub. revision (`str`, *optional*): The revision of the model to use. If not specified, the latest revision will be used. batch_size (`int`, *optional*): The batch size to use for inference. If not specified, defaults to 1. sequence_length (`int`, *optional*): The sequence length to use for inference. If not specified, defaults to the model's maximum sequence length. num_cores (`int`, *optional*): The number of cores to use for tensor parallelism. If not specified, all available cores will be used. auto_cast_type (`str`, *optional*): The data type to use for automatic casting. If not specified, defaults to the model's data type. task (`str`, *optional*): The task for which the model is being exported. Defaults to "text-generation". Returns: `NeuronModelForCausalLM`: The exported Neuron model. """ if task != "text-generation": raise ValueError( f"Task {task} is not supported for causal language models. Please use another base model." ) if cls is NeuronModelForCausalLM: # Instantiation through the abstract class: find the correct model class cls = get_neuron_causal_lm_model_class(config) # Create the neuron config for the specified parameters neuron_config = cls.get_neuron_config( model_id, config, token=token, revision=revision, batch_size=batch_size, sequence_length=sequence_length, tensor_parallel_size=num_cores, auto_cast_type=auto_cast_type, ) return cls.export( model_id, config, neuron_config, token=token, revision=revision, **kwargs, ) @classmethod def _from_pretrained( cls, model_id: Union[str, "Path"], config: "PretrainedConfig", **kwargs, ) -> "NeuronModelForCausalLM": # Find the correct model class cls = get_neuron_causal_lm_model_class(config) return cls._from_pretrained(model_id, config, **kwargs) @add_start_docstrings( NEURON_CAUSALLM_MODEL_GENERATE_DOCSTRING + TEXT_GENERATION_EXAMPLE.format( processor_class="AutoTokenizer", model_class="NeuronModelForCausalLM", checkpoint="Qwen/Qwen2.5-0.5B-Instruct", ) ) def generate( self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, generation_config: Optional["GenerationConfig"] = None, stopping_criteria: Optional["StoppingCriteriaList"] = None, **kwargs, ) -> torch.LongTensor: raise NotImplementedError @classmethod @abstractmethod def _get_neuron_config( cls, checkpoint_id: str, checkpoint_revision: str, batch_size: int, sequence_length: int, tensor_parallel_size: int, auto_cast_type: str, ): raise NotImplementedError("The `get_neuron_config` method must be implemented in the subclass.") @classmethod def export( cls, model_id: str, config: "PretrainedConfig", neuron_config: "NeuronConfig", token: Optional[Union[bool, str]] = None, revision: Optional[str] = None, load_weights: Optional[bool] = True, **kwargs, ) -> "NeuronModelForCausalLM": """Export the model to Neuron format. This method must be implemented by the subclass. It should handle the export of the model to Neuron format. Args: model_id (`str`): The model ID or path to the model directory. config (`PretrainedConfig`): The model configuration. neuron_config (`NeuronConfig`): The Neuron configuration for the model. token (`str`, *optional*): The token to use for authentication with the Hugging Face Hub. revision (`str`, *optional*): The revision of the model to use. If not specified, the latest revision will be used. load_weights (`bool`, *optional*, defaults to `True`): Whether to load the model weights after exporting. If `False`, the model will be exported without weights. Returns: `NeuronModelForCausalLM`: The exported Neuron model. """ raise NotImplementedError( "The `export` method must be implemented in the subclass. It should handle the export of the model to Neuron format." )