in src/optimum/nvidia/subpackage/commands/export.py
def run(self):
    # Retrieve args from CLI
    args = self.args

    # Do we have quantization?
    if args.quantization:
        tokenizer = AutoTokenizer.from_pretrained(args.model)
        import_source_file(args.quantization, "recipe")
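        # import_source_file loads the user-provided recipe file and registers it as a
        # module named "recipe", so its symbols can be imported just below.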
        try:
            from recipe import TARGET_QUANTIZATION_RECIPE

            qconfig = TARGET_QUANTIZATION_RECIPE(tokenizer)
        except ImportError:
            raise ModuleNotFoundError(
                f"Global variable 'TARGET_QUANTIZATION_RECIPE' was not found in {args.quantization}. "
                "This is required to automatically detect and allocate the right recipe for quantization."
            )
    else:
        qconfig = None
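    # The recipe file is expected to expose a callable named TARGET_QUANTIZATION_RECIPE
    # that takes the tokenizer and returns a quantization config. A minimal sketch,
    # illustrative only (the concrete config class depends on the installed
    # optimum-nvidia version):
    #
    #     def TARGET_QUANTIZATION_RECIPE(tokenizer):
    #         # e.g. use the tokenizer to prepare a calibration dataset, then build
    #         # and return the quantization configuration object
    #         return my_quantization_config  # hypothetical, user-defined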
    # Allocate model and derivatives needed to export
    config = AutoConfig.from_pretrained(args.model)
    export = ExportConfig.from_config(config, args.max_batch_size)
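    # Positive CLI values below override the defaults derived from the model config.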
    if args.max_input_length > 0:
        export.max_input_len = args.max_input_length
    if args.max_output_length > 0:
        export.max_output_len = args.max_output_length
    if args.max_new_tokens > 0:
        export.max_num_tokens = args.max_new_tokens
    # Apply the sharding configuration (tp = tensor parallelism, pp = pipeline parallelism)
    export = export.with_sharding(args.tp, args.pp)
    # Export
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        export_config=export,
        quantization_config=qconfig,
        export_only=True,
        force_export=True,
    )
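    # Note: export_only / force_export ask for the TensorRT-LLM engine(s) to be (re)built
    # without loading them for generation (inferred from the flag names; exact semantics
    # depend on the optimum-nvidia version).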
    if args.destination:
        model.save_pretrained(args.destination)

    if args.push_to_hub:
        print(f"Exporting model to the Hugging Face Hub: {args.push_to_hub}")
        model.push_to_hub(
            args.push_to_hub,
            commit_message=f"Optimum-CLI TensorRT-LLM {args.model} export",
        )
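
For reference, the same export can be reproduced programmatically, mirroring the command above. A minimal sketch, assuming AutoConfig comes from transformers and that AutoModelForCausalLM and ExportConfig are importable from optimum.nvidia; the model id, batch size, and sequence limits are placeholder values:

from transformers import AutoConfig

from optimum.nvidia import AutoModelForCausalLM, ExportConfig

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder checkpoint

# Derive the export configuration from the model config, as the command does above
config = AutoConfig.from_pretrained(model_id)
export = ExportConfig.from_config(config, 1)  # 1 = max batch size
export.max_input_len = 1024
export.max_output_len = 1024

# Build the TensorRT-LLM engine(s) and save them locally
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    export_config=export,
    export_only=True,
    force_export=True,
)
model.save_pretrained("./llama2-7b-trtllm")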