in src/optimum/nvidia/export/config.py [0:0]
def validate(self) -> "ExportConfig":
    """Check this configuration and fill in ``max_num_tokens`` when it is unset.

    ``max_num_tokens`` is treated as unset when it equals ``-1``. In that case
    it is assigned in place:

    * ``8192`` when ``enabled_chunked_context`` is truthy (a warning is
      emitted because this fixed value might not be optimal);
    * ``max_batch_size * max_input_len // 2`` otherwise.

    Any other value of ``max_num_tokens`` is left untouched.

    Returns:
        This same instance, so calls can be chained.

    Raises:
        ValueError: If ``optimization_level`` is negative.
    """
    if self.optimization_level < 0:
        raise ValueError(
            f"optimization_level should be >= 0, got {self.optimization_level}"
        )

    # Guard clause: nothing to infer when a value was provided explicitly.
    if self.max_num_tokens != -1:
        return self

    if not self.enabled_chunked_context:
        self.max_num_tokens = self.max_batch_size * self.max_input_len // 2
    else:
        # Should be N * tokens_per_block (8192 is the default); hardcoded for now.
        self.max_num_tokens = 8192
        warn(
            f"max_num_tokens set to {self.max_num_tokens} with chunked context enabled might not be optimal."
        )

    LOGGER.debug(f"Inferred max_num_tokens={self.max_num_tokens}")
    return self