include/fbgemm/QuantUtilsAvx2.h (111 lines of code) (raw):

#pragma once #include <cstdint> #include "./FbgemmBuild.h" #include "./UtilsAvx2.h" namespace fbgemm { // Structs from gemmlowp // // A structure to hold quantization parameters 'scale' and 'zero_point'. // The meaning of these values is as the constants in the quantization equation // // real_value = scale * (quantized_value - zero_point) // // In other words, 'zero_point' is the quantized value that corresponds // to the real value 0, and 'scale' is the difference of real values // corresponding to consecutive quantized values. struct FBGEMM_API TensorQuantizationParams { float scale; std::int32_t zero_point; int precision; float Min() const; float Max() const; }; // Parameters when we scale from int32 intermediate matrix multiplication // results to 8-bit integers struct FBGEMM_API RequantizationParams { // For floating-point requantization float real_multiplier; // For fixed-point requantization std::int32_t multiplier; int right_shift; TensorQuantizationParams target_qparams; }; //////////////////////////////////////////////////////////////////////////////// // Utility functions template <typename T = std::uint8_t, bool LEGACY = true> void QuantizeAvx2( const float* src, T* dst, int len, const TensorQuantizationParams& qparams); template <typename T = std::uint8_t> void FusedQuantizeDequantizeAvx2( const float* src, float* dst, int len, const TensorQuantizationParams& qparams, float noise_ratio = 0.0f); /* * Random number generator in [0, 9]: https://www.jstatsoft.org/v08/i14/paper */ uint32_t FBGEMM_API Xor128(void); /** * @brief Find the min and max value in a float matrix. */ void FBGEMM_API FindMinMax(const float* m, float* min, float* max, int len); void RequantizeFixedPointAvx2( const std::int32_t* src, std::uint8_t* dst, int len, const RequantizationParams& params); void RequantizeAvx2( const std::int32_t* src, std::uint8_t* dst, int len, const RequantizationParams& params); /** * @brief Requantize with avx2 and bias is fused. */ template < bool A_SYMMETRIC, bool B_SYMMETRIC, QuantizationGranularity Q_GRAN, bool HAS_BIAS, bool FUSE_RELU, typename BIAS_TYPE = std::int32_t, bool DIRECT = false> FBGEMM_API void requantizeOutputProcessingAvx2( std::uint8_t* out, const std::int32_t* inp, const block_type_t& block, int ld_out, int ld_in, const requantizationParams_t<BIAS_TYPE>& r); template < bool A_SYMMETRIC, bool B_SYMMETRIC, QuantizationGranularity Q_GRAN, bool HAS_BIAS, bool FUSE_RELU, int C_PER_G, typename BIAS_TYPE = std::int32_t> FBGEMM_API void requantizeOutputProcessingGConvAvx2( std::uint8_t* out, const std::int32_t* inp, const block_type_t& block, int ld_out, int ld_in, const requantizationParams_t<BIAS_TYPE>& r); template < bool A_SYMMETRIC, bool B_SYMMETRIC, QuantizationGranularity Q_GRAN, bool HAS_BIAS, bool FUSE_RELU> FBGEMM_API void requantizeForFloatAvx2( float* out, const std::int32_t* inp, const block_type_t& block, int ld_out, int ld_in, const requantizationForFloatParams_t& r); template <typename InputType, int BIT_RATE> void FloatOrHalfToFusedNBitRowwiseQuantizedSBHalfAvx2( const InputType* input, size_t input_rows, int input_columns, std::uint8_t* output); template <typename InputType> void FloatOrHalfToFused8BitRowwiseQuantizedSBFloatAvx2( const InputType* input, size_t input_rows, int input_columns, std::uint8_t* output); template <typename OutputType, int BIT_RATE> void FusedNBitRowwiseQuantizedSBHalfToFloatOrHalfAvx2( const std::uint8_t* input, size_t input_rows, int input_columns, OutputType* output); template <typename OutputType> void Fused8BitRowwiseQuantizedSBFloatToFloatOrHalfAvx2( const std::uint8_t* input, size_t input_rows, int input_columns, OutputType* output); } // namespace fbgemm