in src/optimum/nvidia/subpackage/commands/export.py
def run(self):
    # Retrieve args from CLI
    args = self.args

    # Do we have quantization?
    if args.quantization:
        tokenizer = AutoTokenizer.from_pretrained(args.model)
        import_source_file(args.quantization, "recipe")
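        # import_source_file loads the user-provided recipe file and registers it as a
        # module named "recipe", so its symbols can be imported just below.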
        try:
            from recipe import TARGET_QUANTIZATION_RECIPE

            qconfig = TARGET_QUANTIZATION_RECIPE(tokenizer)
        except ImportError:
            raise ModuleNotFoundError(
                f"Global variable 'TARGET_QUANTIZATION_RECIPE' was not found in {args.quantization}. "
                "This is required to automatically detect and allocate the right recipe for quantization."
            )
    else:
        qconfig = None
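    # The recipe file is expected to expose a callable named TARGET_QUANTIZATION_RECIPE
    # that takes the tokenizer and returns a quantization config. A minimal sketch,
    # illustrative only (the concrete config class depends on the installed
    # optimum-nvidia version):
    #
    #     def TARGET_QUANTIZATION_RECIPE(tokenizer):
    #         # e.g. use the tokenizer to prepare a calibration dataset, then build
    #         # and return the quantization configuration object
    #         return my_quantization_config  # hypothetical, user-defined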
    # Allocate model and derivatives needed to export
    config = AutoConfig.from_pretrained(args.model)
    export = ExportConfig.from_config(config, args.max_batch_size)
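    # Positive CLI values below override the defaults derived from the model config.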
    if args.max_input_length > 0:
        export.max_input_len = args.max_input_length
    if args.max_output_length > 0:
        export.max_output_len = args.max_output_length
    if args.max_new_tokens > 0:
        export.max_num_tokens = args.max_new_tokens
    # Apply the sharding configuration (tp = tensor parallelism, pp = pipeline parallelism)
    export = export.with_sharding(args.tp, args.pp)
    # Export
    model = AutoModelForCausalLM.from_pretrained(
        args.model,
        export_config=export,
        quantization_config=qconfig,
        export_only=True,
        force_export=True,
    )
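    # Note: export_only / force_export ask for the TensorRT-LLM engine(s) to be (re)built
    # without loading them for generation (inferred from the flag names; exact semantics
    # depend on the optimum-nvidia version).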
    if args.destination:
        model.save_pretrained(args.destination)

    if args.push_to_hub:
        print(f"Exporting model to the Hugging Face Hub: {args.push_to_hub}")
        model.push_to_hub(
            args.push_to_hub,
            commit_message=f"Optimum-CLI TensorRT-LLM {args.model} export",
        )
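
For reference, the same export can be reproduced programmatically, mirroring the command above. A minimal sketch, assuming AutoConfig comes from transformers and that AutoModelForCausalLM and ExportConfig are importable from optimum.nvidia; the model id, batch size, and sequence limits are placeholder values:

from transformers import AutoConfig

from optimum.nvidia import AutoModelForCausalLM, ExportConfig

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder checkpoint

# Derive the export configuration from the model config, as the command does above
config = AutoConfig.from_pretrained(model_id)
export = ExportConfig.from_config(config, 1)  # 1 = max batch size
export.max_input_len = 1024
export.max_output_len = 1024

# Build the TensorRT-LLM engine(s) and save them locally
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    export_config=export,
    export_only=True,
    force_export=True,
)
model.save_pretrained("./llama2-7b-trtllm")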