tinynn/converter/operators/hybrid_quantizer.py:

"""Hybrid (dynamic-range) quantization passes for the TFLite converter graph.

Weights of supported ops are rewritten as quantized int8 tensors while the
activations stay in float; an optional extra pass upgrades already-quantized
LSTMs to the int8x8_16 kernel variant.
"""

import copy
import functools

import igraph as ig
import numpy as np
import torch

from tinynn.util.util import class_conditional, get_logger

from . import tflite as tfl
from .base import ExtendedOperator
from .graph import CommonGraph

log = get_logger(__name__)

# Indices of the quantizable weight tensors in `op.inputs`, keyed by op type.
# Ops not listed here have their weight at index 1.
WEIGHT_MAPPING = {
    ExtendedOperator.UNIDIRECTIONAL_SEQUENCE_LSTM: [1, 2, 3, 4, 5, 6, 7, 8],
    ExtendedOperator.BIDIRECTIONAL_SEQUENCE_LSTM: [1, 2, 3, 4, 5, 6, 7, 8, 18, 19, 20, 21, 22, 23, 24, 25],
}

# Maps a weight input index to the input index of the corresponding bias.
BIAS_MAPPING = {
    ExtendedOperator.UNIDIRECTIONAL_SEQUENCE_LSTM: {1: 12, 2: 13, 3: 14, 4: 15},
}

# Input indices of the hidden state and of the cell state, keyed by op type.
STATE_MAPPING = {
    ExtendedOperator.UNIDIRECTIONAL_SEQUENCE_LSTM: [18],
}

CELL_STATE_MAPPING = {
    ExtendedOperator.UNIDIRECTIONAL_SEQUENCE_LSTM: [19],
}


class HybridQuantizer(object):
    graph: CommonGraph

    def __init__(
        self, graph, asymmetric, q_type, per_channel, enable_conv, enable_int16_lstm, gen_single_op_models, config
    ) -> None:
        super().__init__()

        self.graph = graph
        self.asymmetric = asymmetric
        self.q_type = q_type
        self.per_channel = per_channel
        self.enable_conv = enable_conv
        self.enable_int16_lstm = enable_int16_lstm
        self.gen_single_op_models = gen_single_op_models
        if config is None:
            config = {}
        self.config = config

    def quantize(self):
        self.quantize_pass()
        self.int16_lstm_pass()

    @class_conditional(lambda self: self.enable_int16_lstm)
    def int16_lstm_pass(self):
        filtered_nodes = self.graph.graph.vs.select(functools.partial(is_int16_quantizable_lstm_node))

        actions = []
        replaced_tensors = {}
        for node in filtered_nodes:
            # Per-output opt-out via the user-supplied config
            if self.config.get(node['outputs'][0], True) is False:
                continue

            if node['node_type'] == ExtendedOperator.UNIDIRECTIONAL_SEQUENCE_LSTM:
                lstm_input = node['op'].inputs[0]
                if lstm_input.dtype == np.int8:
                    bias_indices = BIAS_MAPPING.get(node['node_type'])
                    for weight_idx, bias_idx in bias_indices.items():
                        bias_t = node['op'].inputs[bias_idx]
                        weight_t = node['op'].inputs[weight_idx]
                        name = bias_t.name
                        new_name = f'{name}_hybrid_q'
                        bias_a = np.frombuffer(bias_t.buffer.data, dtype='float32').reshape(bias_t.shape)
                        bias = torch.from_numpy(bias_a.copy())
                        # Requantize the float bias to int32 with scale = weight_scale * input_scale
                        bias_scale = weight_t.quantization.scale * lstm_input.quantization.scale
                        new_bias = torch.round(bias.detach() / bias_scale).to(dtype=torch.int32)
                        new_bias_t = tfl.Tensor(tfl.FakeQuantTensor(new_bias, bias_scale, 0), new_name)

                        replaced_tensors.setdefault(new_bias_t.name, new_bias_t)
                        new_bias_t = replaced_tensors[new_bias_t.name]
                        actions.append((self.graph.replace_operator_input, (node, bias_idx, new_bias_t)))

                    # The hidden state shares the quantization parameters of the output
                    state_indices = STATE_MAPPING.get(node['node_type'])
                    for state_idx in state_indices:
                        node['op'].inputs[state_idx].quantization = copy.deepcopy(node['op'].outputs[0].quantization)
                        node['op'].inputs[state_idx].tensor = node['op'].inputs[state_idx].tensor.astype(np.int8)
                        node['op'].inputs[state_idx].dtype = node['op'].inputs[state_idx].tensor.dtype

                    # The cell state becomes int16 with a power-of-two scale that
                    # covers the observed range of the cell output
                    cell_state_indices = CELL_STATE_MAPPING.get(node['node_type'])
                    for cell_state_idx in cell_state_indices:
                        q_cell_output = self.graph.rev_q_mapping[node['op'].extra_hints['cell_output']].quantization
                        q_cell_max = q_cell_output.scale * (127 - q_cell_output.zero_point)
                        q_cell_min = q_cell_output.scale * (-128 - q_cell_output.zero_point)
                        q_cell_abs_max = np.maximum(np.abs(q_cell_max), np.abs(q_cell_min))
                        cell_pot = np.power(2, np.maximum(np.ceil(np.log2(q_cell_abs_max)), 0)).item()

                        node['op'].inputs[cell_state_idx].quantization = tfl.QuantizationParameters(cell_pot / 32768, 0)
                        node['op'].inputs[cell_state_idx].tensor = (
                            node['op'].inputs[cell_state_idx].tensor.astype(np.int16)
                        )
                        node['op'].inputs[cell_state_idx].dtype = node['op'].inputs[cell_state_idx].tensor.dtype
                    # Add intermediates for int8x8_16 lstm
                    name = node['op'].outputs[0].name
                    input_to_input_intermediate = tfl.Tensor(np.zeros(0, dtype='float32'), f'{name}_intermediate_1')
                    input_to_forget_intermediate = tfl.Tensor(np.zeros(0, dtype='float32'), f'{name}_intermediate_2')
                    input_to_cell_intermediate = tfl.Tensor(np.zeros(0, dtype='float32'), f'{name}_intermediate_3')
                    input_to_output_intermediate = tfl.Tensor(np.zeros(0, dtype='float32'), f'{name}_intermediate_4')
                    effective_hidden_scale_intermediate = tfl.Tensor(
                        tfl.FakeQuantTensor(np.zeros(0, dtype='int8'), node['op'].outputs[0].quantization.scale, 0),
                        f'{name}_intermediate_5',
                    )

                    actions.append((self.graph.append_operator_input, (node, input_to_input_intermediate, True)))
                    actions.append((self.graph.append_operator_input, (node, input_to_forget_intermediate, True)))
                    actions.append((self.graph.append_operator_input, (node, input_to_cell_intermediate, True)))
                    actions.append((self.graph.append_operator_input, (node, input_to_output_intermediate, True)))
                    actions.append(
                        (self.graph.append_operator_input, (node, effective_hidden_scale_intermediate, True))
                    )

        for func, args in actions:
            func(*args)

    def quantize_pass(self):
        filtered_nodes = self.graph.graph.vs.select(functools.partial(is_quantizable_node, with_conv=self.enable_conv))

        actions = []
        replaced_tensors = {}
        for node in filtered_nodes:
            # Per-output opt-out via the user-supplied config
            if self.config.get(node['outputs'][0], True) is False:
                continue

            weight_indices = WEIGHT_MAPPING.get(node['node_type'], [1])

            # Skip ops whose weights are not constant float32 buffers
            skip = False
            for weight_idx in weight_indices:
                weight_t = node['op'].inputs[weight_idx]
                if weight_t.buffer is None or str(weight_t.dtype) != 'float32':
                    skip = True
                    break

            if skip:
                continue

            for weight_idx in weight_indices:
                new_weight = None
                weight_t = node['op'].inputs[weight_idx]
                name = weight_t.name
                weight_a = np.frombuffer(weight_t.buffer.data, dtype='float32').reshape(weight_t.shape)
                weight = torch.from_numpy(weight_a.copy())
                if (
                    node['node_type']
                    in (
                        ExtendedOperator.FULLY_CONNECTED,
                        ExtendedOperator.UNIDIRECTIONAL_SEQUENCE_LSTM,
                        ExtendedOperator.BIDIRECTIONAL_SEQUENCE_LSTM,
                    )
                    or not self.per_channel
                ):
                    if node['node_type'] == ExtendedOperator.DEPTHWISE_CONV_2D:
                        log.warning('DEPTHWISE_CONV_2D doesn\'t support hybrid per-tensor quantization')
                        continue
                    if self.asymmetric and hasattr(node['op'], 'asymmetricQuantizeInputs'):
                        node['op'].asymmetricQuantizeInputs = True
                    if self.q_type == np.uint8:
                        # Quantize symmetrically in int8, then reinterpret the storage as uint8
                        new_weight = quantize(name, weight, torch.qint8, torch.per_tensor_symmetric, q_type=np.int8)
                        new_weight.reinterpret_as(self.q_type)
                    else:
                        new_weight = quantize(name, weight, torch.qint8, torch.per_tensor_symmetric, q_type=self.q_type)
                elif node['node_type'] == ExtendedOperator.CONV_2D:
                    new_weight = quantize(name, weight, torch.qint8, torch.per_channel_symmetric, 0, q_type=self.q_type)
                elif node['node_type'] == ExtendedOperator.DEPTHWISE_CONV_2D:
                    new_weight = quantize(
                        name, weight, torch.qint8, torch.per_channel_symmetric, -1, q_type=self.q_type
                    )

                if self.gen_single_op_models:
                    node['op'].extra_hints['orig_float'] = copy.deepcopy(node['op'])

                replaced_tensors.setdefault(new_weight.name, new_weight)
                new_weight = replaced_tensors[new_weight.name]
                actions.append((self.graph.replace_operator_input, (node, weight_idx, new_weight)))

        for func, args in actions:
            func(*args)
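# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original module): the converter is
# expected to drive these passes on an already-built TFLite graph roughly as
# below. All argument values here are illustrative assumptions only.
#
#     quantizer = HybridQuantizer(
#         graph,                       # a populated CommonGraph
#         asymmetric=True,             # let kernels quantize inputs asymmetrically
#         q_type=np.int8,              # storage type for the quantized weights
#         per_channel=True,            # per-channel scales for conv weights
#         enable_conv=False,           # also cover CONV_2D / DEPTHWISE_CONV_2D
#         enable_int16_lstm=False,     # upgrade quantized LSTMs to int8x8_16
#         gen_single_op_models=False,  # keep float copies for per-op debugging
#         config=None,                 # optional {output_name: bool} opt-out map
#     )
#     quantizer.quantize()
# ---------------------------------------------------------------------------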
def is_quantizable_node(vertex: ig.Vertex, with_conv: bool):
    return vertex['node_type'] in (
        ExtendedOperator.FULLY_CONNECTED,
        ExtendedOperator.UNIDIRECTIONAL_SEQUENCE_LSTM,
        ExtendedOperator.BIDIRECTIONAL_SEQUENCE_LSTM,
    ) or (
        with_conv
        and vertex['node_type']
        in (
            ExtendedOperator.CONV_2D,
            ExtendedOperator.DEPTHWISE_CONV_2D,
        )
    )


def is_int16_quantizable_lstm_node(vertex: ig.Vertex):
    return vertex['node_type'] in (ExtendedOperator.UNIDIRECTIONAL_SEQUENCE_LSTM,)


def quantize(name, tensor, dtype, qscheme, axis=None, q_type=np.uint8):
    assert qscheme in (torch.per_tensor_symmetric, torch.per_channel_symmetric)

    new_name = f'{name}_hybrid_q'

    if dtype == torch.quint8:
        quant_min, quant_max = 0, 255
    else:
        quant_min, quant_max = -127, 127

    # For per-channel schemes, reduce over every dimension except `axis`
    if axis is not None:
        if axis < 0:
            axis += tensor.ndim
        dim = [i for i in range(tensor.ndim) if i != axis]
    else:
        dim = None

    if hasattr(torch, 'amin') and hasattr(torch, 'amax'):
        min_val = torch.amin(tensor, dim)
        max_val = torch.amax(tensor, dim)
    else:
        # Fallback for older PyTorch without torch.amin/torch.amax:
        # collapse the reduced dimensions into one and reduce over it
        if dim is None:
            min_val = torch.min(tensor)
            max_val = torch.max(tensor)
        else:
            orig_dim = tensor.size(axis)
            if axis != 0:
                perm = [axis] + dim
                tensor_perm = tensor.permute(perm)
            else:
                tensor_perm = tensor
            tensor_2d = tensor_perm.reshape(orig_dim, -1)
            min_val, _ = torch.min(tensor_2d, 1)
            max_val, _ = torch.max(tensor_2d, 1)

    min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
    max_val_pos = torch.max(max_val, torch.zeros_like(max_val))

    scale = torch.ones(min_val_neg.size(), dtype=torch.float32)
    zero_point = torch.zeros(min_val_neg.size(), dtype=torch.int64)

    eps = torch.tensor(torch.finfo(torch.float32).eps)

    # Symmetric quantization: scale = max(|min|, |max|) / ((quant_max - quant_min) / 2)
    max_val_pos = torch.max(-min_val_neg, max_val_pos)
    scale = max_val_pos / (float(quant_max - quant_min) / 2)
    scale = torch.max(scale, eps)

    if dtype == torch.quint8:
        zero_point = zero_point.new_full(zero_point.size(), 128)

    if qscheme == torch.per_channel_symmetric:
        q_tensor = torch.quantize_per_channel(tensor, scale, zero_point, axis, dtype)
    else:
        q_tensor = torch.quantize_per_tensor(tensor, scale, zero_point, dtype)

    return tfl.Tensor(q_tensor, new_name, q_type=q_type)
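# ---------------------------------------------------------------------------
# Hedged demo (not part of the original module): exercises the standalone
# `quantize` helper above on toy weights; names and shapes are illustrative.
# Because this module uses relative imports, run it as a module from the
# package root, e.g. `python -m tinynn.converter.operators.hybrid_quantizer`.
if __name__ == '__main__':
    torch.manual_seed(0)

    # Per-tensor symmetric int8, as used for FULLY_CONNECTED / LSTM weights:
    # a single scale of max(|w|) / 127 for the whole tensor
    fc_q = quantize('demo_fc_weight', torch.randn(8, 4), torch.qint8, torch.per_tensor_symmetric, q_type=np.int8)
    log.info('per-tensor scale: %s', fc_q.quantization.scale)

    # Per-channel symmetric int8 along axis 0, as used for CONV_2D weights:
    # one scale per output channel
    conv_q = quantize(
        'demo_conv_weight', torch.randn(16, 3, 3, 3), torch.qint8, torch.per_channel_symmetric, 0, q_type=np.int8
    )
    log.info('per-channel scales: %s', conv_q.quantization.scale)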