# optimum/commands/export/neuron.py

# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Defines the command line for the export with Neuron compiler."""

import subprocess
import sys
from pathlib import Path
from typing import TYPE_CHECKING, Optional

from ...exporters import TasksManager
from ..base import BaseOptimumCLICommand, CommandInfo


if TYPE_CHECKING:
    from argparse import ArgumentParser, Namespace, _SubParsersAction


def parse_args_neuron(parser: "ArgumentParser"):
    required_group = parser.add_argument_group("Required arguments")
    required_group.add_argument(
        "-m", "--model", type=str, required=True, help="Model ID on huggingface.co or path on disk to load model from."
    )
    required_group.add_argument(
        "output",
        type=Path,
        help="Path to the directory where the generated Neuron-compiled TorchScript model will be stored.",
    )

    optional_group = parser.add_argument_group("Optional arguments")
    optional_group.add_argument(
        "--task",
        default="auto",
        help=(
            "The task to export the model for. If not specified, the task will be auto-inferred based on the model."
            " Available tasks depend on the model, but are among:"
            f" {str(list(TasksManager._TRANSFORMERS_TASKS_TO_MODEL_LOADERS.keys()) + list(TasksManager._DIFFUSERS_TASKS_TO_MODEL_LOADERS.keys()))}."
        ),
    )
    optional_group.add_argument(
        "--subfolder",
        type=str,
        default="",
        help=(
            "In case the relevant files are located inside a subfolder of the model repo, either locally or on"
            " huggingface.co, specify the folder name here."
        ),
    )
    optional_group.add_argument(
        "--atol",
        type=float,
        default=None,
        help=(
            "If specified, the absolute difference tolerance when validating the model. Otherwise, the default atol"
            " for the model will be used."
        ),
    )
    optional_group.add_argument("--cache_dir", type=str, default=None, help="Path indicating where to store cache.")
    optional_group.add_argument(
        "--disable_neuron_cache",
        action="store_true",
        help="Whether to disable automatic caching of compiled models (not applicable for JIT compilation).",
    )
    optional_group.add_argument(
        "--trust-remote-code",
        action="store_true",
        help=(
            "Allow using custom code for the modeling hosted in the model repository. This option should only be set"
            " for repositories you trust and in which you have read the code, as it will execute arbitrary code"
            " present in the model repository on your local machine."
        ),
    )
    optional_group.add_argument(
        "--compiler_workdir",
        type=Path,
        help="Path to the directory where intermediary files generated by the Neuron compiler will be stored.",
    )
    optional_group.add_argument(
        "--inline-weights-neff",
        action="store_true",
        help=(
            "Whether to disable inlining of the weights into the NEFF graph. The weights of a Neuron-compiled model"
            " can only be replaced when weights/NEFF inlining was disabled during compilation."
        ),
    )
    optional_group.add_argument(
        "--disable-validation",
        action="store_true",
        help=(
            "Whether to disable the validation of the exported model on a Neuron device against the outputs of the"
            " original PyTorch model on CPU."
        ),
    )
    optional_group.add_argument(
        "--auto_cast",
        type=str,
        default=None,
        choices=["none", "matmul", "all"],
        help='Whether to cast operations from FP32 to lower precision to speed up inference. Can be `"none"`, `"matmul"` or `"all"`.',
    )
    optional_group.add_argument(
        "--auto_cast_type",
        type=str,
        default="bf16",
        choices=["bf16", "fp16", "mixed", "tf32"],
        help='The data type to cast FP32 operations to when auto-cast mode is enabled. Can be `"bf16"`, `"fp16"`, `"mixed"` or `"tf32"`.',
    )
    optional_group.add_argument(
        "--disable-fast-relayout",
        action="store_true",
        help=(
            "Whether to disable the fast relayout optimization, which improves performance by using the matrix"
            " multiplier for tensor transposition."
        ),
    )
    optional_group.add_argument(
        "--disable-fallback",
        action="store_true",
        help=(
            "Whether to disable CPU partitioning and force all operations onto Neuron. Defaults to `False`, since"
            " without fallback some compilations may fail or perform poorly."
        ),
    )
    optional_group.add_argument(
        "--dynamic-batch-size",
        action="store_true",
        help=(
            "Enable dynamic batch size for the Neuron-compiled model. With this option the input batch size can vary"
            " at inference time, at a potential cost in latency."
        ),
    )

    input_group = parser.add_argument_group("Input shapes")
    doc_input = "that the model exported with the Neuron compiler will be able to take as input."
    input_group.add_argument(
        "--batch_size",
        type=int,
        help=f"Batch size {doc_input}",
    )
    input_group.add_argument(
        "--sequence_length",
        type=int,
        help=f"Sequence length {doc_input}",
    )
    input_group.add_argument(
        "--num_choices",
        type=int,
        help=f"Only for the multiple-choice task. Num choices {doc_input}",
    )


class NeuronExportCommand(BaseOptimumCLICommand):
    COMMAND = CommandInfo(name="neuron", help="Export PyTorch models to Neuron compiled TorchScript models.")

    def __init__(
        self,
        subparsers: "_SubParsersAction",
        args: Optional["Namespace"] = None,
        command: Optional["CommandInfo"] = None,
        from_defaults_factory: bool = False,
        parser: Optional["ArgumentParser"] = None,
    ):
        super().__init__(
            subparsers, args=args, command=command, from_defaults_factory=from_defaults_factory, parser=parser
        )
        # `sys.argv[3:]` drops the `optimum-cli export neuron` prefix, keeping only the export arguments.
        self.args_string = " ".join(sys.argv[3:])

    @staticmethod
    def parse_args(parser: "ArgumentParser"):
        return parse_args_neuron(parser)

    def run(self):
        # Delegate the actual export to the `optimum.exporters.neuron` module,
        # forwarding the raw CLI arguments as they were typed.
        full_command = f"python3 -m optimum.exporters.neuron {self.args_string}"
        subprocess.run(full_command, shell=True, check=True)
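

# A minimal usage sketch of the command defined above, assuming the standard
# `optimum-cli` entry point dispatches to `NeuronExportCommand`; the model ID
# and input shapes below are illustrative, not prescriptive:
#
#   optimum-cli export neuron \
#     --model distilbert-base-uncased-finetuned-sst-2-english \
#     --batch_size 1 --sequence_length 128 \
#     distilbert_neuron/
#
# `run()` then re-invokes `python3 -m optimum.exporters.neuron` with everything
# after the `optimum-cli export neuron` prefix.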