text-generation-inference/server/text_generation_server/generator.py [362:395]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        text = self.tokenizer.decode(input_tokens[0], skip_special_tokens=True)
        # These are just dummy params to allow Request creation
        parameters = NextTokenChooserParameters(
            temperature=1.0,
            top_k=None,
            top_p=None,
            do_sample=False,
            seed=None,
            repetition_penalty=1.0,
            typical_p=1.0,
        )
        # Ignore the EOS token so generation always runs for the full max_new_tokens,
        # making the dummy request's length deterministic.
        stopping_parameters = StoppingCriteriaParameters(max_new_tokens=20, ignore_eos_token=True)
        dummy_request = Request(
            id=0,
            inputs=text,
            truncate=max_tokens,
            parameters=parameters,
            stopping_parameters=stopping_parameters,
        )
        return dummy_request


    def warmup(self, batch: Batch) -> int:
        """Verify if the hardware can support the target load.

        Args:
            batch (`Batch`):
                A batch corresponding to the maximum number of concurrent requests.

        Return:
            The maximum number of tokens the model supports.
        """
        logger.debug("Warming up the model")
        start = time.time()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
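The excerpt above ends where the dummy Request is returned. For orientation, here is a minimal sketch of how such a request could be assembled into a warmup batch and handed to `warmup`. The method name `_create_dummy_request`, the `generator` object, the chosen `max_tokens` value, and the `Batch` fields (id, requests, size, max_tokens) are assumptions for illustration based on a typical text-generation-inference generate.proto; they may not match the exact proto version or call site in this repository.

# Hedged sketch, not the repository's actual startup path.
max_tokens = 1024                                      # assumed token budget for warmup
dummy = generator._create_dummy_request(max_tokens)    # method name assumed
batch = Batch(id=0, requests=[dummy], size=1, max_tokens=max_tokens)  # proto fields assumed
max_supported_tokens = generator.warmup(batch)         # returns the supported token count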



text-generation-inference/server/text_generation_server/jetstream_pt_support/generator.py [298:330]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        text = self.tokenizer.decode(input_tokens[0], skip_special_tokens=True)
        # These are just dummy params to allow Request creation
        parameters = NextTokenChooserParameters(
            temperature=1.0,
            top_k=None,
            top_p=None,
            do_sample=False,
            seed=None,
            repetition_penalty=1.0,
            typical_p=1.0,
        )
        # Ignore the EOS token so generation always runs for the full max_new_tokens,
        # making the dummy request's length deterministic.
        stopping_parameters = StoppingCriteriaParameters(max_new_tokens=20, ignore_eos_token=True)
        dummy_request = Request(
            id=0,
            inputs=text,
            truncate=max_tokens,
            parameters=parameters,
            stopping_parameters=stopping_parameters,
        )
        return dummy_request

    def warmup(self, batch: Batch) -> int:
        """Verify if the hardware can support the target load.

        Args:
            batch (`Batch`):
                A batch corresponding to the maximum number of concurrent requests.

        Return:
            The maximum number of tokens the model supports.
        """
        logger.debug("Warming up the model")
        start = time.time()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
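Both excerpts rely on the same tokenizer round-trip on their first line: token ids are decoded back into plain text before being placed in the dummy Request. Below is a self-contained sketch of that call; the "gpt2" checkpoint and the prompt string are placeholders only, and any Hugging Face tokenizer exposing `decode` behaves the same way.

# Hedged sketch of the decode call used by both generators.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder checkpoint
input_tokens = tokenizer("warmup prompt", return_tensors="np")["input_ids"]
# input_tokens has shape (1, sequence_length); decode the first row back to text.
text = tokenizer.decode(input_tokens[0], skip_special_tokens=True)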



