in src/optimum/nvidia/runtime.py [0:0]
def convert_generation_config(config: "GenerationConfig") -> "SamplingParams":
    """Translate a ``GenerationConfig`` into the equivalent ``SamplingParams``.

    Field mapping performed here:

    * ``eos_token_id`` / ``pad_token_id``: when the value is a list, only its
      last entry is forwarded as ``end_id`` / ``pad_id``; any other value is
      passed through unchanged.
    * ``top_k`` and ``num_beams`` are only honoured when ``do_sample`` is
      truthy; otherwise ``top_k`` and ``beam_width`` are both forced to 1.
    * ``no_repeat_ngram_size`` and ``min_length`` fall back to 1 when they are
      ``None`` or not strictly positive.
    * ``max_new_tokens`` falls back to 32 when it is falsy (``None`` or 0).
    * ``return_log_probs`` is the negation of ``renormalize_logits``.

    Args:
        config: Generation settings to convert.

    Returns:
        A ``SamplingParams`` instance populated from ``config``.

    Raises:
        IndexError: If ``eos_token_id`` or ``pad_token_id`` is an empty list.
    """

    def last_if_list(token_id):
        # A list of ids is collapsed to its final entry; scalars (and None)
        # are forwarded untouched.
        return token_id[-1] if isinstance(token_id, list) else token_id

    def positive_or_one(value):
        # Treat None like a non-positive value so that an unset field does
        # not raise TypeError on the ``> 0`` comparison, mirroring how
        # ``max_new_tokens`` already tolerates None below.
        return value if value is not None and value > 0 else 1

    sampling_enabled = config.do_sample

    return SamplingParams(
        end_id=last_if_list(config.eos_token_id),
        pad_id=last_if_list(config.pad_token_id),
        top_k=config.top_k if sampling_enabled else 1,
        top_p=config.top_p,
        temperature=config.temperature,
        # NOTE(review): beam_width is gated on do_sample, so num_beams is
        # ignored whenever sampling is off -- confirm this is intended.
        beam_width=config.num_beams if sampling_enabled else 1,
        bad_token_ids=config.bad_words_ids,
        length_penalty=config.length_penalty,
        repetition_penalty=config.repetition_penalty,
        no_repeat_ngram_size=positive_or_one(config.no_repeat_ngram_size),
        min_tokens=positive_or_one(config.min_length),
        max_tokens=config.max_new_tokens or 32,  # SamplingParams::max_tokens' default
        return_generation_logits=config.output_logits,
        return_log_probs=not config.renormalize_logits,
    )