tinynn/converter/base.py:

import collections
import io
import os
import typing

import torch
import numpy as np

from .operators import CommonGraph, ExtendedOperator, GraphOptimizer, HybridQuantizer, HalfQuantizer
from .operators.op_version import OPVersioner
from .operators.tflite import Tensor
from .operators.torch import OPERATOR_CONVERTER_DICT
from .operators.torch.base import NoTrackOperator, TrackRevQParamsOperator, TrackQParamsOperator
from .operators.torch.aten import ATenDequantizeOperator, ATenQuantizePerTensorOperator

from ..util.converter_util import generate_converter_config
from ..util.util import get_logger

log = get_logger(__name__, 'INFO')


class TFLiteConverter(object):
    def __init__(
        self,
        model: typing.Union[torch.jit.ScriptFunction, torch.jit.ScriptModule, torch.nn.Module],
        dummy_input: typing.Union[torch.Tensor, typing.Iterable[torch.Tensor]],
        tflite_path: str,
        input_transpose: typing.Optional[typing.Union[bool, typing.Iterable[bool]]] = None,
        output_transpose: typing.Optional[typing.Union[bool, typing.Iterable[bool]]] = None,
        nchw_transpose: bool = True,
        dump_jit_model_path: typing.Optional[str] = None,
        dump_dummy_input_path: typing.Optional[str] = None,
        dump_config_path: typing.Optional[str] = None,
        strict_symmetric_check: bool = False,
        preserve_tensors: bool = False,
        optimize: int = GraphOptimizer.ALL_OPTIMIZE,
        quantize_target_type: str = 'uint8',
        quantize_input_output_type: typing.Optional[str] = None,
        hybrid_quantization_from_float: bool = False,
        hybrid_per_channel: bool = False,
        hybrid_asymmetric_inputs: bool = True,
        hybrid_quantize_weight_type: typing.Optional[str] = None,
        fuse_quant_dequant: bool = False,
        fuse_input_indices: typing.Optional[typing.List[int]] = None,
        fuse_output_indices: typing.Optional[typing.List[int]] = None,
        gc_when_reload: bool = False,
        group_conv_rewrite: bool = False,
        rewrite_quantizable: bool = False,
        tflite_micro_rewrite: bool = False,
        map_bilstm_to_lstm: bool = False,
        float16_quantization: bool = False,
        enable_mtk_ops: bool = False,
        conv_transpose_with_bias: bool = True,
        max_transpose_dims: int = -1,
        hybrid_conv: bool = True,
        hybrid_int16_lstm: bool = False,
        unroll_rnn: bool = False,
        separated_rnn_gate_calc: bool = False,
        bypass_elementwise_passthrough_constraint: bool = False,
        hybrid_gen_single_op_models: bool = False,
        hybrid_config: typing.Optional[typing.Dict[str, bool]] = None,
        group_tensors: bool = False,
        missing_outputs_as_constants: bool = False,
        legacy_gelu: bool = False,
    ) -> None:
        """The TFLiteConverter class

        Args:
            model (typing.Union[torch.jit.ScriptFunction, torch.jit.ScriptModule, torch.nn.Module]): The input model \
                (either traced or non-traced)
            dummy_input (typing.Union[torch.Tensor, typing.Iterable[torch.Tensor]]): A viable input to the model
            tflite_path (str): Path to use for exporting
            input_transpose (typing.Optional[typing.Union[bool, typing.Iterable[bool]]], optional): Whether to \
                transpose the input(s). Defaults to None (True for 4d-input, False otherwise).
            output_transpose (typing.Optional[typing.Union[bool, typing.Iterable[bool]]], optional): Whether to \
                transpose the output(s). Defaults to None (True for 4d-input, False otherwise).
            nchw_transpose (bool): Whether to perform nchw->nhwc transposes on input and output tensors. If \
                `False` is specified, the arguments `input_transpose` and `output_transpose` will be ignored.
            dump_jit_model_path (typing.Optional[str]): The path for dumping the jit model. Defaults to None
            dump_dummy_input_path (typing.Optional[str]): The path for dumping the dummy input. Defaults to None
            dump_config_path (typing.Optional[str]): The path for dumping the json config. Defaults to None
            strict_symmetric_check (bool): Strict symmetric quantization checks. Defaults to False
            preserve_tensors (bool): Preserve the copies of the intermediate tensors. Defaults to False
            optimize (int): The level of graph optimization. Defaults to `GraphOptimizer.ALL_OPTIMIZE`
            quantize_target_type (str): Target type for quantization. Defaults to 'uint8'
            quantize_input_output_type (str): Input and output type for quantization. Defaults to None (inferred)
            hybrid_quantization_from_float (bool): Direct hybrid quantization from a float model. Defaults to False
            hybrid_per_channel (bool): Prefer per-channel kernels in hybrid quantization. Defaults to False
            hybrid_asymmetric_inputs (bool): Prefer asymmetric inputs while performing hybrid quantization
            hybrid_quantize_weight_type (typing.Optional[str]): Quantized weight type for hybrid quantization. \
                If it is unset, then the value of `quantize_target_type` will be used. Defaults to None
            fuse_quant_dequant (bool): Remove quant and dequant nodes directly connected to i/o nodes. \
                Defaults to False
            fuse_input_indices (typing.Optional[typing.List[int]]): Used together with `fuse_quant_dequant`. Indices \
                of input nodes to fuse with `Quantize`. Defaults to None (which fuses all inputs available)
            fuse_output_indices (typing.Optional[typing.List[int]]): Used together with `fuse_quant_dequant`. Indices \
                of output nodes to fuse with `Dequantize`. Defaults to None (which fuses all outputs available)
            gc_when_reload (bool): Apply GC when reloading the torchscript into memory
            group_conv_rewrite (bool): Rewriting for group [de]convolution. Defaults to False
            rewrite_quantizable (bool): Rewriting quantizable ops (e.g. BATCH_MATMUL, SOFTMAX, LOG_SOFTMAX) \
                to use quantized kernels. Defaults to False
            tflite_micro_rewrite (bool): Rewriting for running on TFLite-micro. Defaults to False
            map_bilstm_to_lstm (bool): Translating bidirectional LSTM to TFLite ops with `UnidirectionalLSTM`. \
                Defaults to False
            float16_quantization (bool): Quantize constants with float32 dtype to float16 dtype. Defaults to False
            enable_mtk_ops (bool): Translating with custom MTK operators. Defaults to False
            conv_transpose_with_bias (bool): ConvTranspose ops with bias. Defaults to True
            max_transpose_dims (int): Max dimensions for the `Transpose` op. Defaults to -1, which means unlimited
            hybrid_conv (bool): Enable hybrid quantization for Conv2d and DepthwiseConv2d. Defaults to True
            hybrid_int16_lstm (bool): Enable hybrid int16 quantization for LSTM. Defaults to False
            unroll_rnn (bool): Unrolling LSTM (translate LSTM to separate ops). Defaults to False
            separated_rnn_gate_calc (bool): Separated calculation for every gate in RNN. Effective only when \
                `unroll_rnn=True`. Defaults to False
            bypass_elementwise_passthrough_constraint (bool): Bypass constraints in elementwise passthrough passes. \
                Defaults to False
            hybrid_gen_single_op_models (bool): Generate both floating-point and quantized versions of the model \
                for hybrid quantizable ops. Defaults to False
            group_tensors (bool): Group tensors to save space. Defaults to False
            missing_outputs_as_constants (bool): View missing outputs as constants. Defaults to False
            legacy_gelu (bool): Fallback to the legacy behaviour for translating gelu. Defaults to False
        """
        self.model = model
        self.lower_model = None
        self.graph = None
        self.tensor_map = {}
        self.tensor_map_copies = {}
        self.common_graph = CommonGraph()

        if type(dummy_input) in (tuple, list):
            self.dummy_input = dummy_input
        else:
            self.dummy_input = [dummy_input]

        self.flatten_inputs = []

        self.tflite_path = tflite_path
        self.nchw_transpose = nchw_transpose
        if self.nchw_transpose:
            self.input_transpose = input_transpose
            self.output_transpose = output_transpose
        else:
            self.input_transpose = False
            self.output_transpose = False
        self.strict_symmetric_check = strict_symmetric_check
        self.dump_jit_model_path = dump_jit_model_path
        self.dump_dummy_input_path = dump_dummy_input_path
        self.dump_config_path = dump_config_path
        self.preserve_tensors = preserve_tensors
        self.optimize = optimize
        self.hybrid = hybrid_quantization_from_float
        self.hybrid_per_channel = hybrid_per_channel
        self.hybrid_asymmetric_inputs = hybrid_asymmetric_inputs
        self.fuse_quant_dequant = fuse_quant_dequant
        self.fuse_input_indices = fuse_input_indices
        self.fuse_output_indices = fuse_output_indices
        self.gc_when_reload = gc_when_reload
        self.group_conv_rewrite = group_conv_rewrite
        self.rewrite_quantizable = rewrite_quantizable
        self.tflite_micro_rewrite = tflite_micro_rewrite
        self.map_bilstm_to_lstm = map_bilstm_to_lstm
        self.float16_quantization = float16_quantization
        self.enable_mtk_ops = enable_mtk_ops
        self.conv_transpose_with_bias = conv_transpose_with_bias
        self.max_transpose_dims = max_transpose_dims
        self.hybrid_conv = hybrid_conv
        self.hybrid_int16_lstm = hybrid_int16_lstm
        self.unroll_rnn = unroll_rnn
        self.separated_rnn_gate_calc = separated_rnn_gate_calc
        self.bypass_elementwise_passthrough_constraint = bypass_elementwise_passthrough_constraint
        self.hybrid_gen_single_op_models = hybrid_gen_single_op_models
        self.hybrid_config = hybrid_config
        self.group_tensors = group_tensors
        self.missing_outputs_as_constants = missing_outputs_as_constants
        self.legacy_gelu = legacy_gelu

        if quantize_target_type == 'uint8':
            self.q_type = np.uint8
            if self.strict_symmetric_check:
                log.warning('Symmetric quantized model with uint8 is unsupported in most backends of TFLite')
        elif quantize_target_type == 'int8':
            self.q_type = np.int8
        elif quantize_target_type == 'int16':
            if not self.strict_symmetric_check:
                raise AttributeError('Int16 quantization requires strict_symmetric_check=True')
            self.q_type = np.int16
        else:
            raise AttributeError(f'unknown quantize_target_type: {quantize_target_type}, expected: uint8, int8, int16')

        if quantize_input_output_type is not None:
            assert fuse_quant_dequant, 'Please set fuse_quant_dequant=True, otherwise quantize_input_type is ignored'
            assert quantize_input_output_type in (
                'int8',
                'uint8',
                'int16',
            ), f'unknown quantize_input_output_type: {quantize_input_output_type}, expected: uint8, int8, int16'
            if quantize_input_output_type == 'int16' and quantize_target_type != 'int16':
                raise AttributeError(
                    'quantize_input_output_type == \'int16\' and quantize_target_type != \'int16\' is not supported'
                )
        self.quantize_input_output_type = quantize_input_output_type

        if hybrid_quantize_weight_type is None:
            hybrid_quantize_weight_type = quantize_target_type

        if hybrid_quantize_weight_type == 'uint8':
            if self.hybrid:
                if self.hybrid_per_channel:
                    raise AttributeError('Per-channel kernels support int8 only')
                log.warning(
                    'Unless you are using legacy TFLite (<1.14), please set quantize_target_type to int8 instead'
                )
            self.hybrid_q_type = np.uint8
        elif hybrid_quantize_weight_type == 'int8':
            self.hybrid_q_type = np.int8
        elif hybrid_quantize_weight_type == 'int16':
            self.hybrid_q_type = np.int16
            if self.hybrid:
                raise AttributeError('Hybrid kernels support int8 and uint8 only')

        if dump_config_path and not dump_jit_model_path:
            raise AssertionError("when dump_config_path is set, dump_jit_model_path is required to be set")

        self.input_offset = 1
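
    # Illustrative constructor configurations (a sketch based on the documented arguments
    # above; `model`, `dummy_input` and the output paths are placeholders):
    #
    #   # Plain float conversion:
    #   TFLiteConverter(model, dummy_input, 'model_float.tflite')
    #
    #   # Hybrid quantization directly from a float model, with int8 weights:
    #   TFLiteConverter(model, dummy_input, 'model_hybrid.tflite',
    #                   hybrid_quantization_from_float=True, quantize_target_type='int8')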

    def init_jit_graph(self):
        # Multi-GPU modules don't support JIT tracing
        if isinstance(self.model, (torch.nn.DataParallel, torch.nn.parallel.DistributedDataParallel)):
            self.model = self.model.module

        if not isinstance(self.model, (torch.jit.ScriptFunction, torch.jit.ScriptModule)):
            if hasattr(self.model, 'cpu'):
                self.model.cpu()

            if hasattr(self.model, 'eval'):
                self.model.eval()

            with torch.no_grad():
                script = torch.jit.trace(self.model, self.dummy_input)

                # Remove reference to original model to save memory
                self.model = None

                # Have to save it once, otherwise something weird happens
                if self.dump_jit_model_path is None:
                    with io.BytesIO() as f:
                        torch.jit.save(script, f)
                        f.seek(0)
                        script = torch.jit.load(f)
                else:
                    jit_model_dir = os.path.abspath(os.path.dirname(self.dump_jit_model_path))
                    os.makedirs(jit_model_dir, exist_ok=True)
                    torch.jit.save(script, self.dump_jit_model_path)
                    if self.gc_when_reload:
                        import gc

                        script = None
                        gc.collect()
                    script = torch.jit.load(self.dump_jit_model_path)

            self.model = script

        if isinstance(self.model, torch.jit.ScriptFunction):
            self.input_offset = 0

        if self.dump_dummy_input_path is not None:
            dummy_arrs = list(map(lambda x: x.detach().cpu().numpy(), self.dummy_input))
            np.savez(self.dump_dummy_input_path, *dummy_arrs)

        if self.dump_config_path is not None:
            generate_converter_config(
                self.dummy_input,
                [],
                self.input_transpose,
                [],
                self.dump_jit_model_path,
                self.tflite_path,
                self.dump_config_path,
            )
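
    # Illustrative note: for a plain `nn.Module` input, setting the dump paths, e.g.
    #
    #   TFLiteConverter(model, dummy_input, 'model.tflite',
    #                   dump_jit_model_path='model.pt',
    #                   dump_dummy_input_path='dummy.npz',
    #                   dump_config_path='config.json')
    #
    # writes the traced TorchScript model, the dummy inputs and a JSON config alongside
    # the TFLite model during conversion. (The file names here are hypothetical.)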

    def init_lowered_module(self):
        assert (
            isinstance(self.model, torch.jit.ScriptFunction)
            or self.model.training is False
            or str(next(self.model.graph.inputs()).type()) == '__torch__.PlaceholderModule'
        ), (
            'Model is in training mode. Please run `model.eval()` before model conversion. If you are passing in a'
            ' TorchScript model, make sure you use `torch.jit.save` to dump the model to disk and then load it using'
            ' `torch.jit.load`.'
        )

        graph = self.model.graph

        # Inline everything
        torch._C._jit_pass_inline(graph)

        # Remove fork/wait nodes
        torch._C._jit_pass_inline_fork_wait(graph)
        torch._C._jit_pass_lint(graph)
        torch._C._jit_pass_lower_all_tuples(graph)

        # we now record some ops like ones/zeros
        # into a trace where we previously recorded constants
        # use constant prop to maintain our current level of onnx support
        # without implementing symbolics for all of them
        torch._C._jit_pass_constant_propagation(graph)
        # _split_tensor_list_constants(graph, graph)
        # run dce to eliminate dead parts of the graph that might have been
        # left behind by things like symbolic_override
        torch._C._jit_pass_dce(graph)
        torch._C._jit_pass_lint(graph)

        torch._C._jit_pass_canonicalize_graph_fuser_ops(graph)
        torch._C._jit_pass_lint(graph)

        torch._C._jit_pass_peephole(graph, True)
        torch._C._jit_pass_fuse_addmm(graph)
        torch._C._jit_pass_lint(graph)

        torch._C._jit_pass_peephole(graph, True)
        torch._C._jit_pass_lower_all_tuples(graph)

        self.graph = graph

        log.debug('Lowered graph:')
        log.debug(self.graph)

    def init_flatten_inputs(self):
        self.flatten_inputs.clear()
        for t in self.dummy_input:
            if isinstance(t, (list, tuple)):
                for rt in t:
                    self.flatten_inputs.append(rt)
            else:
                self.flatten_inputs.append(t)

    def init_input_transpose(self):
        input_transpose = self.input_transpose
        if type(input_transpose) not in (tuple, list):
            input_transpose = [input_transpose] * len(self.flatten_inputs)
        for i, t in enumerate(self.flatten_inputs):
            if input_transpose[i] is None:
                if isinstance(t, torch.Tensor):
                    input_transpose[i] = t.dim() == 4
                else:
                    input_transpose[i] = False
        self.input_transpose = input_transpose
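
    # Example (illustrative): with dummy inputs of shapes (1, 3, 224, 224) and (1, 10) and the
    # default `input_transpose=None`, the list above resolves to [True, False], i.e. only the
    # 4-d input is laid out as NHWC in the generated TFLite model (given `nchw_transpose=True`).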

    def init_common_graph(self):
        graph_inputs = [x.debugName() for x in list(self.graph.inputs())][self.input_offset :]
        graph_outputs = [x.debugName() for x in list(self.graph.outputs())]
        self.common_graph.inputs.extend(graph_inputs)
        self.common_graph.outputs.extend(graph_outputs)
        self.common_graph.input_transpose.extend(self.input_transpose)
        self.common_graph.output_transpose = self.output_transpose
        tensors = []
        for i, node in enumerate(graph_inputs):
            tensors.append(
                Tensor(
                    self.flatten_inputs[i],
                    node,
                    has_buffer=False,
                    asymmetric=not self.strict_symmetric_check,
                    q_type=self.q_type,
                )
            )
        self.common_graph.add_nodes(tensors, ExtendedOperator.INPUT_NODE)

    def init_inputs(self):
        graph_inputs = [x.debugName() for x in list(self.graph.inputs())]
        for i, node in enumerate(graph_inputs):
            if self.input_offset > 0 and i == 0:
                self.tensor_map[graph_inputs[i]] = self.model
            else:
                self.tensor_map[graph_inputs[i]] = self.flatten_inputs[i - self.input_offset]

    def unsupported_operations(self, unique=True) -> typing.List[str]:
        """Returns unsupported operations in the graph"""
        if self.graph is None:
            self.init_lowered_module()

        all_nodes = list(self.graph.nodes())
        ops = []
        for node in all_nodes:
            k = node.kind()
            converter_type = OPERATOR_CONVERTER_DICT.get(k, None)
            if converter_type is None:
                ops.append(k)

        if unique:
            return list(set(ops))
        else:
            return ops
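
    # Usage note (illustrative): for a model that is already a `torch.jit.ScriptModule`
    # (e.g. loaded via `torch.jit.load`), operator coverage can be checked before conversion:
    #
    #   missing = TFLiteConverter(scripted_model, dummy_input, 'model.tflite').unsupported_operations()
    #   if missing:
    #       print('Ops without a converter:', missing)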

    def init_operations(self):
        log.debug('Initialize operators...')
        node_queue = collections.deque(self.graph.nodes())
        scope_map = {}
        current_scope = None
        while node_queue:
            node = node_queue.popleft()
            k = node.kind()
            output_tensors = []
            converter_type = OPERATOR_CONVERTER_DICT.get(k, NoTrackOperator)
            converter = converter_type(
                node,
                self.tensor_map,
                current_scope,
                not self.strict_symmetric_check,
                self.q_type,
                self.hybrid_q_type,
                self.map_bilstm_to_lstm,
                self.enable_mtk_ops,
                self.hybrid_asymmetric_inputs,
                self.unroll_rnn,
                self.separated_rnn_gate_calc,
                self.conv_transpose_with_bias,
                self.legacy_gelu,
            )
            # Don't track the operator if all the input nodes are not tracked unless it has custom implementation
            # (e.g. prim::* ops)
            if converter_type.run == NoTrackOperator.run and converter_type != NoTrackOperator:
                no_track_flag = True
                for n in converter.input_names:
                    if self.common_graph.has_nested_names(n):
                        nested_names = self.common_graph.get_list_expanded_names(n)
                        for x in nested_names:
                            if x in self.common_graph.tensor_map and self.common_graph.tensor_map[x].buffer is None:
                                no_track_flag = False
                                break
                    elif n in self.common_graph.tensor_map and self.common_graph.tensor_map[n].buffer is None:
                        no_track_flag = False
                        break
                if no_track_flag:
                    if converter_type == ATenDequantizeOperator:
                        converter_type = TrackQParamsOperator
                    elif converter_type == ATenQuantizePerTensorOperator:
                        converter_type = TrackRevQParamsOperator
                    else:
                        converter_type = NoTrackOperator
                    converter = converter_type(
                        node,
                        self.tensor_map,
                        current_scope,
                        not self.strict_symmetric_check,
                        self.q_type,
                        self.hybrid_q_type,
                        self.map_bilstm_to_lstm,
                        self.enable_mtk_ops,
                        self.hybrid_asymmetric_inputs,
                        self.unroll_rnn,
                        self.separated_rnn_gate_calc,
                        self.conv_transpose_with_bias,
                        self.legacy_gelu,
                    )

            if k != 'prim::Constant':
                log.debug(f'{k} {converter.input_names} -> {converter.output_names} {converter_type.__name__}')

            # Don't fetch attrs and schemas for non-tracking nodes
            if converter_type not in (NoTrackOperator, TrackRevQParamsOperator, TrackQParamsOperator):
                try:
                    attrs = converter.fetch_all_attrs(node)
                except StopIteration:
                    attrs = None
                args = converter.fetch_annotated_args(node)
            else:
                attrs = None
                args = None
            converter.parse(node, attrs, args, self.common_graph)
            outputs = converter.output_names
            new_nodes = converter.output_nodes
            if output_tensors is not None:
                output_tensors.extend(converter.get_output_tensors())
            if len(new_nodes) > 0:
                node_queue.extendleft(reversed(new_nodes))

            if k == 'prim::PythonOp':
                s = node.scopeName()
                scope_map.setdefault(s, 0)
                scope_map[s] += 1
                current_scope = f'{s}_{scope_map[s]}'
                converter.prepare_scope_tensors(node, attrs, args, self.common_graph, current_scope)
            elif k == 'prim::Return':
                current_scope = None

            assert len(output_tensors) == len(outputs)
            for t, name in zip(output_tensors, outputs):
                self.tensor_map[name] = t
                if self.preserve_tensors and isinstance(t, torch.Tensor):
                    self.tensor_map_copies[name] = t.detach().clone()

    def __try_infer_type(self, params):
        try:
            inferred = torch._C._jit_try_infer_type(params)
            if hasattr(inferred, 'type'):
                return inferred.type().annotation_str
        finally:
            return str(inferred)

    def __unpack_params(self, params):
        return NoTrackOperator.unpack_params(None, params)

    def convert(self):
        """Converts the model to the TFLite format

        Raises:
            Exception: If unsupported ops are found, an Exception will be raised
        """
        self.init_flatten_inputs()
        self.init_input_transpose()
        self.init_jit_graph()
        self.init_lowered_module()
        self.init_common_graph()
        self.init_inputs()
        self.init_operations()

        unsupported_ops = self.unsupported_operations()
        if len(unsupported_ops) > 0:
            log.error(f'Unsupported ops: {", ".join(unsupported_ops)}')
            raise Exception("Cannot continue due to fatal error")
        else:
            optimizer = GraphOptimizer(
                self.common_graph,
                self.optimize,
                self.fuse_quant_dequant,
                self.group_conv_rewrite,
                self.rewrite_quantizable,
                self.tflite_micro_rewrite,
                self.quantize_input_output_type,
                self.fuse_input_indices,
                self.fuse_output_indices,
                self.max_transpose_dims,
                self.bypass_elementwise_passthrough_constraint,
                self.group_tensors,
                self.conv_transpose_with_bias,
                self.hybrid_int16_lstm,
            )
            optimizer.optimize()
            self.output_transpose = self.common_graph.output_transpose

            if self.hybrid:
                quantizer = HybridQuantizer(
                    self.common_graph,
                    self.hybrid_asymmetric_inputs,
                    self.hybrid_q_type,
                    self.hybrid_per_channel,
                    self.hybrid_conv,
                    self.hybrid_int16_lstm,
                    self.hybrid_gen_single_op_models,
                    self.hybrid_config,
                )
                quantizer.quantize()
                optimizer.cleanup_dead_nodes()

            if self.float16_quantization:
                quantizer = HalfQuantizer(self.common_graph)
                quantizer.quantize()
                optimizer.cleanup_dead_nodes()

            versioner = OPVersioner(self.common_graph)
            versioner.process()

            if self.missing_outputs_as_constants:
                tensors = []
                for output_name in self.common_graph.outputs:
                    if output_name not in self.common_graph.tensor_map:
                        tensors.append(
                            Tensor(
                                self.tensor_map[output_name],
                                output_name,
                                has_buffer=True,
                                asymmetric=not self.strict_symmetric_check,
                                q_type=self.q_type,
                            )
                        )
                self.common_graph.add_nodes(tensors, ExtendedOperator.CONSTANT_NODE)
                self.common_graph.add_outputs([t.name for t in tensors])

            self.common_graph.convert(self.tflite_path)

        log.info(f'Generated model saved to {self.tflite_path}')

    def visualize(self, hide_constants=True):
        """Visualize the TinyNeuralNetwork Graph

        Args:
            hide_constants (bool, optional): Hide the constant nodes in the graph. Defaults to True.
        """
        self.common_graph.visualize(hide_constants)

    def get_outputs(self):
        """Returns the output of the model, which is evaluated via tracing nodes one by one"""
        outputs = []
        for name in self.common_graph.outputs:
            outputs.append(self.tensor_map[name])
        return outputs

    def get_value(self, name, default_val=None):
        """Returns the output according to the name of the node. If the name doesn't exist, `default_val` is returned"""
        if self.preserve_tensors:
            val = self.tensor_map_copies.get(name, default_val)
        else:
            val = self.tensor_map.get(name, default_val)

        type_ = self.__try_infer_type(val)
        if type_.endswith('PackedParamsBase'):
            return self.__unpack_params(val)

        return val

    def tensor_names(self) -> typing.List[str]:
        """Returns all the names of the intermediate tensors

        Returns:
            typing.List[str]: The names of the intermediate tensors
        """
        if self.preserve_tensors:
            return list(self.tensor_map_copies.keys())
        else:
            return list(self.tensor_map.keys())

    def inputs_for_tflite(self) -> typing.List[np.ndarray]:
        """Prepare inputs for the TFLite backend

        Returns:
            typing.List[np.ndarray]: The input tensors
        """
        arrs = []
        for t, trans in zip(self.dummy_input, self.input_transpose):
            arr = t.detach().clone().numpy()
            if trans:
                arr = np.transpose(arr, (0, 2, 3, 1))
            arrs.append(arr)
        return arrs
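

# Hypothetical usage sketch (not part of the original module): converts a toy float model to
# TFLite and collects reference data for validation. The toy model, input shape and output
# path are illustrative only.
if __name__ == '__main__':
    toy_model = torch.nn.Sequential(
        torch.nn.Conv2d(3, 8, kernel_size=3, padding=1),
        torch.nn.ReLU(),
        torch.nn.AdaptiveAvgPool2d(1),
        torch.nn.Flatten(),
        torch.nn.Linear(8, 10),
    ).eval()
    toy_input = torch.randn(1, 3, 32, 32)

    converter = TFLiteConverter(toy_model, toy_input, 'toy_model.tflite')
    converter.convert()

    # The traced PyTorch outputs and the NHWC-transposed inputs can then be used to
    # validate the generated model against a TFLite interpreter.
    torch_outputs = converter.get_outputs()
    tflite_inputs = converter.inputs_for_tflite()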