optimum/executorch/modeling.py [437:475]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        echo: bool = False,
        pos_base: int = 0,
        max_seq_len: Optional[int] = None,
    ) -> List[int]:
        """
        Generate tokens from a prompt using the ExecuTorch model.

        Args:
            prompt_tokens (`List[int]`):
                List of token IDs representing the prompt.
            echo (`bool`, *optional*):
                Whether to include prompt tokens in the generated output. Defaults to `False`.
            pos_base (`int`, *optional*):
                Base position for the prompt tokens. Defaults to 0.
            max_seq_len (`int`, *optional*):
                Maximum sequence length for the generated output.
                Defaults to `None`, in which case the model's `max_cache_size` attribute is used.
                Values larger than `max_cache_size` are truncated to `max_cache_size`.

        Returns:
            List[int]: List of generated token IDs.

        Note:
            This method is temporarily implemented in Python due to limited access to ExecuTorch's C++ LLM runner via pybind.
            Improvements to the pybind interface are expected in ExecuTorch version 0.4.1.
        """
        self.device = torch.device("cpu")
        if max_seq_len is None:
            # Default to max_cache_size if max_seq_len is not specified
            max_seq_len = self.max_cache_size
        elif max_seq_len > self.max_cache_size:
            logging.warning(
                f"max_seq_len={max_seq_len} is larger than max_cache_size={self.max_cache_size}. Generating tokens will be truncated to max_cache_size."
            )
            max_seq_len = self.max_cache_size

        if not hasattr(self, "decoder_start_token_id"):
            raise AttributeError("'decoder_start_token_id' is missing from the metadata of the PTE.")
        decoder_input_ids = torch.tensor([[self.decoder_start_token_id]], dtype=torch.long)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
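
Usage sketch (not from the source): the snippet below shows how the `generate` method documented above might be called. The class name `ExecuTorchModelForSeq2SeqLM`, the `recipe` argument to `from_pretrained`, and the checkpoint are illustrative assumptions; only the parameters `prompt_tokens`, `echo`, `pos_base`, and `max_seq_len` come from the excerpt.

    # Illustrative sketch; the class name, checkpoint, and `recipe` kwarg are assumptions.
    from transformers import AutoTokenizer

    from optimum.executorch import ExecuTorchModelForSeq2SeqLM  # assumed class name

    model_id = "google-t5/t5-small"  # hypothetical checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = ExecuTorchModelForSeq2SeqLM.from_pretrained(model_id, recipe="xnnpack")

    # `generate` expects a plain list of token IDs, per the docstring above.
    prompt_tokens = tokenizer("translate English to German: Hello, world!")["input_ids"]

    # echo=False returns only the newly generated tokens; max_seq_len is clamped
    # to the PTE's max_cache_size by the logic shown in the excerpt.
    generated_ids = model.generate(prompt_tokens=prompt_tokens, echo=False, max_seq_len=128)
    print(tokenizer.decode(generated_ids, skip_special_tokens=True))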



optimum/executorch/modeling.py [985:1020]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        echo: bool = False,
        pos_base: int = 0,
        max_seq_len: Optional[int] = None,
    ) -> List[int]:
        """
        Generate tokens from a prompt using the ExecuTorch model.

        Args:
            input_features (`torch.Tensor`):
                Log-mel spectrogram for a 30-second audio chunk, as produced by the `WhisperProcessor`. Expected shape is [1, 80, 3000] or
                [1, 128, 3000], depending on the checkpoint; see the processor config for details.
            echo (`bool`, *optional*):
                Whether to include prompt tokens in the generated output. Defaults to `False`.
            pos_base (`int`, *optional*):
                Base position for the prompt tokens. Defaults to 0.
            max_seq_len (`int`, *optional*):
                Maximum sequence length for the generated output.
                Defaults to `None`, in which case the model's `max_cache_size` attribute is used.
                Values larger than `max_cache_size` are truncated to `max_cache_size`.

        Returns:
            List[int]: List of generated token IDs.
        """
        self.device = torch.device("cpu")
        if max_seq_len is None:
            # Default to max_cache_size if max_seq_len is not specified
            max_seq_len = self.max_cache_size
        elif max_seq_len > self.max_cache_size:
            logging.warning(
                f"max_seq_len={max_seq_len} is larger than max_cache_size={self.max_cache_size}. Generating tokens will be truncated to max_cache_size."
            )
            max_seq_len = self.max_cache_size

        if not hasattr(self, "decoder_start_token_id"):
            raise AttributeError("'decoder_start_token_id' is missing from the metadata of the PTE.")
        decoder_input_ids = torch.tensor([[self.decoder_start_token_id]], dtype=torch.long)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
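
Usage sketch (not from the source): the snippet below shows how the Whisper-style `generate` documented above might be called. The class name `ExecuTorchModelForSpeechSeq2Seq`, the `recipe` argument, the checkpoint, and the dummy dataset are illustrative assumptions; only the `input_features` shape and the `max_seq_len` behavior come from the excerpt.

    # Illustrative sketch; class name, checkpoint, `recipe` kwarg, and dataset are assumptions.
    from datasets import load_dataset
    from transformers import WhisperProcessor

    from optimum.executorch import ExecuTorchModelForSpeechSeq2Seq  # assumed class name

    model_id = "openai/whisper-tiny"  # hypothetical checkpoint
    processor = WhisperProcessor.from_pretrained(model_id)
    model = ExecuTorchModelForSpeechSeq2Seq.from_pretrained(model_id, recipe="xnnpack")

    ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
    sample = ds[0]["audio"]

    # WhisperProcessor pads/truncates to a 30-second chunk, producing shape [1, 80, 3000]
    # for whisper-tiny (128 mel bins for large-v3 checkpoints).
    input_features = processor(
        sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt"
    ).input_features

    # max_seq_len is clamped to the PTE's max_cache_size by the logic shown in the excerpt.
    token_ids = model.generate(input_features, max_seq_len=128)
    print(processor.decode(token_ids, skip_special_tokens=True))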



