void LLVMIRGen::generateLLVMIRForInstr()

in lib/LLVMIRCodeGen/LLVMIRGen.cpp [1913:3901]


void LLVMIRGen::generateLLVMIRForInstr(llvm::IRBuilder<> &builder,
                                       const glow::Instruction *I) {
  setCurrentDebugLocation(builder, I);
  assert((!canBePartOfDataParallelKernel(I)) &&
         "data parallel instructions are not handled here");
  switch (I->getKind()) {
  case Kinded::Kind::MatMulInstKind: {
    // Lower MatMulInst to a call to the "matmul" function selected by the
    // destination element type.
    auto *matMul = cast<MatMulInst>(I);
    auto *out = matMul->getDest();
    auto *left = matMul->getLHS();
    auto *right = matMul->getRHS();

    // Addresses of the three operands.
    auto *outAddr = emitValueAddress(builder, out);
    auto *leftAddr = emitValueAddress(builder, left);
    auto *rightAddr = emitValueAddress(builder, right);

    // Dimension arrays of the three operands.
    auto *outDims = emitValueDims(builder, out);
    auto *leftDims = emitValueDims(builder, left);
    auto *rightDims = emitValueDims(builder, right);

    auto *callee = getFunction("matmul", out->getElementType());

    // Non-quantized LHS: only the addresses and the shapes are passed.
    if (!left->getType()->isQuantizedType()) {
      createCall(builder, callee,
                 {outAddr, leftAddr, rightAddr, outDims, leftDims, rightDims});
      break;
    }

    // Quantized LHS: additionally pass the three offsets and the
    // pre/post/scale triple computed for lhsScale * rhsScale / destScale.
    auto *outTy = out->getType();
    auto *leftTy = left->getType();
    auto *rightTy = right->getType();

    auto *outOffset = emitConstI32(builder, outTy->getOffset());
    auto *leftOffset = emitConstI32(builder, leftTy->getOffset());
    auto *rightOffset = emitConstI32(builder, rightTy->getOffset());

    auto requant = quantization::quantizeScaleOffset32To8(
        leftTy->getScale() * rightTy->getScale() / outTy->getScale(), 0);

    auto *requantPre = emitConstI32(builder, requant.pre);
    auto *requantPost = emitConstI32(builder, requant.post);
    auto *requantScale = emitConstI32(builder, requant.scale);

    createCall(builder, callee,
               {outAddr, leftAddr, rightAddr, outDims, leftDims, rightDims,
                outOffset, leftOffset, rightOffset, requantPre, requantPost,
                requantScale});
    break;
  }

  case Kinded::Kind::QuantizationProfileInstKind: {
    // Lower QuantizationProfileInst to a call to the "quantization_profile"
    // function.
    auto *profileInst = cast<QuantizationProfileInst>(I);
    auto *histogram = profileInst->getHistogram();
    auto *computationInfo = profileInst->getComputationInfo();
    auto *input = profileInst->getInputTensor();

    auto *histogramAddr = emitValueAddress(builder, histogram);
    auto *computationInfoAddr = emitValueAddress(builder, computationInfo);
    auto *inputAddr = emitValueAddress(builder, input);
    auto *histogramDims = emitValueDims(builder, histogram);

    // Only float inputs are accepted by this lowering.
    assert(input->getElementType() == ElemKind::FloatTy &&
           "None float Tensor type for Quantization Profile Instruction.");
    auto *inputSize = emitConstDimT(builder, input->getType()->size());

    // Argument order: input, input element count, computation info,
    // histogram, histogram dims.
    auto *callee = getFunction("quantization_profile");
    createCall(builder, callee,
               {inputAddr, inputSize, computationInfoAddr, histogramAddr,
                histogramDims});
    break;
  }

  case Kinded::Kind::FullyConnectedInstKind: {
    // Lowers FullyConnectedInst to a call to the "fc" function. When the
    // source is quantized the callee is looked up by (dest, bias) element
    // types and receives extra quantization arguments; otherwise it is looked
    // up by the dest element type only.
    auto *FCI = cast<FullyConnectedInst>(I);
    auto *dest = FCI->getDest();
    auto *src = FCI->getSrc();
    auto *weights = FCI->getWeights();
    auto *bias = FCI->getBias();
    // Addresses and dimension arrays for all four operands; these are passed
    // on both the quantized and the non-quantized path.
    auto *destPtr = emitValueAddress(builder, dest);
    auto *srcPtr = emitValueAddress(builder, src);
    auto *weightsPtr = emitValueAddress(builder, weights);
    auto *biasPtr = emitValueAddress(builder, bias);
    auto *destDims = emitValueDims(builder, dest);
    auto *srcDims = emitValueDims(builder, src);
    auto *weightsDims = emitValueDims(builder, weights);
    auto *biasDims = emitValueDims(builder, bias);

    // The quantized/non-quantized decision is made on the source type alone.
    if (src->getType()->isQuantizedType()) {
      auto *destTy = dest->getType();
      auto *srcTy = src->getType();
      auto *weightsTy = weights->getType();
      auto *biasTy = bias->getType();

      // Offsets of all four operands, emitted as i32 constants.
      auto *destOffset = emitConstI32(builder, destTy->getOffset());
      auto *srcOffset = emitConstI32(builder, srcTy->getOffset());
      auto *weightsOffset = emitConstI32(builder, weightsTy->getOffset());
      auto *biasOffset = emitConstI32(builder, biasTy->getOffset());

      // Calculate the scale of the values that come out of the matrix
      // multiplication part of the calculation.
      float matMulScale = srcTy->getScale() * weightsTy->getScale();

      // Calculate the scaling parameters for the bias and output. Offset 0 is
      // passed in both cases; only the pre/post/scale fields of the results
      // are read below.
      auto biasScaleParam = quantization::quantizeScaleOffset32To8(
          biasTy->getScale() / matMulScale, 0);
      auto outScaleParam = quantization::quantizeScaleOffset32To8(
          matMulScale / destTy->getScale(), 0);

      // Pass the pre-shift, post-shift and integer scale parameters for the
      // bias and output calculation.
      auto *biasPre = emitConstI32(builder, biasScaleParam.pre);
      auto *biasPost = emitConstI32(builder, biasScaleParam.post);
      auto *biasScale = emitConstI32(builder, biasScaleParam.scale);
      auto *outPre = emitConstI32(builder, outScaleParam.pre);
      auto *outPost = emitConstI32(builder, outScaleParam.post);
      auto *outScale = emitConstI32(builder, outScaleParam.scale);

      // Quantized callee: selected by both the dest and the bias element
      // types.
      auto *F =
          getFunction("fc", {dest->getElementType(), bias->getElementType()});
      createCall(builder, F,
                 {destPtr, srcPtr, weightsPtr, biasPtr, destDims, srcDims,
                  weightsDims, biasDims, destOffset, srcOffset, weightsOffset,
                  biasOffset, biasPre, biasPost, biasScale, outPre, outPost,
                  outScale});
    } else {
      // Non-quantized callee: selected by the dest element type only and
      // given just the addresses and shapes.
      auto *F = getFunction("fc", dest->getElementType());
      createCall(builder, F,
                 {destPtr, srcPtr, weightsPtr, biasPtr, destDims, srcDims,
                  weightsDims, biasDims});
    }
    break;
  }

  case Kinded::Kind::RowwiseQuantizedFullyConnectedInstKind: {
    // Lowers RowwiseQuantizedFullyConnectedInst to a call to one of the
    // "rowwise_quantized_fc_i8_i8" / "rowwise_quantized_fc_i8_i32" functions.
    // The per-row bias and output rescaling parameters are computed here, at
    // code-generation time, and emitted as constant i32 arrays.
    auto *RWQFC = cast<RowwiseQuantizedFullyConnectedInst>(I);

    // The scales operand is read as a constant tensor; its first dimension
    // gives the number of rows to generate parameters for.
    // NOTE(review): getHandle() is called without an element type, so it uses
    // the handle's default type -- presumably float; confirm.
    auto scalesT = getTensorForConstantValue(RWQFC->getScales());
    auto scalesH = scalesT.getHandle();
    size_t rowNum = scalesH.dims()[0];
    float inputScale = RWQFC->getSrc()->getType()->getScale();

    float bScale = RWQFC->getBias()->getType()->getScale();
    int32_t bOffset = RWQFC->getBias()->getType()->getOffset();

    float outputScale = RWQFC->getDest()->getType()->getScale();

    // One constant per row for each of the six parameter arrays.
    std::vector<llvm::Constant *> biasPreV(rowNum);
    std::vector<llvm::Constant *> biasPostV(rowNum);
    std::vector<llvm::Constant *> biasScaleV(rowNum);
    std::vector<llvm::Constant *> outputPreV(rowNum);
    std::vector<llvm::Constant *> outputPostV(rowNum);
    std::vector<llvm::Constant *> outputScaleV(rowNum);

    for (size_t i = 0; i < rowNum; i++) {
      // Calculate the scale of the values that come out of the matrix
      // multiplication part of the calculation.
      float matMulScale = inputScale * scalesH.raw(i);

      // Calculate the scaling parameters for the bias and output.
      auto biasScaleParam =
          quantization::quantizeScaleOffset32To8(bScale / matMulScale, bOffset);
      auto outScaleParam =
          quantization::quantizeScaleOffset32To8(matMulScale / outputScale, 0);

      // Pass the pre-shift, post-shift and integer scale parameters for the
      // bias and output calculation. The trailing `true` marks each value as
      // a signed integer constant.
      biasPreV[i] = llvm::ConstantInt::get(builder.getInt32Ty(),
                                           biasScaleParam.pre, true);
      biasPostV[i] = llvm::ConstantInt::get(builder.getInt32Ty(),
                                            biasScaleParam.post, true);
      biasScaleV[i] = llvm::ConstantInt::get(builder.getInt32Ty(),
                                             biasScaleParam.scale, true);
      outputPreV[i] =
          llvm::ConstantInt::get(builder.getInt32Ty(), outScaleParam.pre, true);
      outputPostV[i] = llvm::ConstantInt::get(builder.getInt32Ty(),
                                              outScaleParam.post, true);
      outputScaleV[i] = llvm::ConstantInt::get(builder.getInt32Ty(),
                                               outScaleParam.scale, true);
    }

    auto *dest = RWQFC->getDest();
    auto *src = RWQFC->getSrc();
    auto *weights = RWQFC->getWeights();
    auto *bias = RWQFC->getBias();
    auto *weightsOffsets = RWQFC->getOffsets();

    // Operand addresses, followed by the six per-row parameter arrays emitted
    // as constant i32 arrays.
    auto *destPtr = emitValueAddress(builder, dest);
    auto *srcPtr = emitValueAddress(builder, src);
    auto *weightsPtr = emitValueAddress(builder, weights);
    auto *biasPtr = emitValueAddress(builder, bias);
    auto *weightsOffsetsPtr = emitValueAddress(builder, weightsOffsets);
    auto *biasPrePtr = emitConstArray(builder, biasPreV, builder.getInt32Ty());
    auto *biasPostPtr =
        emitConstArray(builder, biasPostV, builder.getInt32Ty());
    auto *biasScalePtr =
        emitConstArray(builder, biasScaleV, builder.getInt32Ty());
    auto *outputPrePtr =
        emitConstArray(builder, outputPreV, builder.getInt32Ty());
    auto *outputPostPtr =
        emitConstArray(builder, outputPostV, builder.getInt32Ty());
    auto *outputScalePtr =
        emitConstArray(builder, outputScaleV, builder.getInt32Ty());

    auto *srcDims = emitValueDims(builder, src);
    auto *weightsDims = emitValueDims(builder, weights);
    auto *destDims = emitValueDims(builder, dest);
    auto *biasDims = emitValueDims(builder, bias);
    // The row count passed to the callee is taken from the offsets operand,
    // while the parameter arrays above were sized from the scales operand.
    // NOTE(review): presumably both have the same first dimension; confirm.
    auto *row = emitConstDimT(builder, weightsOffsets->dims()[0]);

    auto *destOffset = emitConstI32(builder, dest->getType()->getOffset());
    auto *srcOffset = emitConstI32(builder, src->getType()->getOffset());
    auto *biasOffset = emitConstI32(builder, bOffset);

    // Select the callee from the (dest, bias) element types; any other
    // combination is reported through LOG(FATAL).
    llvm::Function *F = nullptr;
    if ((dest->getElementType() == ElemKind::Int8QTy) &&
        (bias->getElementType() == ElemKind::Int8QTy)) {
      F = getFunction("rowwise_quantized_fc_i8_i8");
    } else if ((dest->getElementType() == ElemKind::Int8QTy) &&
               (bias->getElementType() == ElemKind::Int32QTy)) {
      F = getFunction("rowwise_quantized_fc_i8_i32");
    } else {
      LOG(FATAL) << "Unsupported element/bias type for "
                    "RowwiseQuantizedFullyConnectedInst";
    }

    createCall(builder, F,
               {destPtr, srcPtr, weightsPtr, biasPtr, weightsOffsetsPtr,
                biasPrePtr, biasPostPtr, biasScalePtr, outputPrePtr,
                outputPostPtr, outputScalePtr, destDims, srcDims, weightsDims,
                biasDims, row, destOffset, srcOffset, biasOffset});
    break;
  }

  case Kinded::Kind::BatchedAddInstKind: {
    // Lowers BatchedAddInst to a call to "batchedadd" (or "batchedadd_i32"
    // when the batch is quantized and the slice element type is Int32QTy).
    auto *BA = cast<BatchedAddInst>(I);
    auto *dest = BA->getDest();
    auto *batch = BA->getBatch();
    auto *slice = BA->getSlice();
    auto *destPtr = emitValueAddress(builder, dest);
    auto *batchPtr = emitValueAddress(builder, batch);
    auto *slicePtr = emitValueAddress(builder, slice);

    // flattenCdr() returns a pair computed from the batch dims: `first` is
    // passed as the slice count and `second` as the size of one slice.
    auto bdim = flattenCdr(batch->dims());
    auto *numSlice = emitConstDimT(builder, bdim.first);
    auto *sliceSize = emitConstDimT(builder, bdim.second);

    // The quantized/non-quantized decision is made on the batch type alone.
    if (batch->getType()->isQuantizedType()) {
      auto *destTy = dest->getType();
      auto *batchTy = batch->getType();
      auto *sliceTy = slice->getType();

      auto *destOffset = emitConstI32(builder, destTy->getOffset());
      auto *batchOffset = emitConstI32(builder, batchTy->getOffset());
      auto *sliceOffset = emitConstI32(builder, sliceTy->getOffset());

      float destScale = destTy->getScale();

      // Here, we select parameters for scaling both summands to the
      // destination scale.
      auto batchScaleParams = quantization::quantizeScaleOffset32To8(
          batchTy->getScale() / destScale, batchTy->getOffset());
      auto sliceScaleParams = quantization::quantizeScaleOffset32To8(
          sliceTy->getScale() / destScale, sliceTy->getOffset());

      // Only the pre/post/scale fields of the computed parameters are passed
      // to the callee; the offsets are passed separately above.
      auto *batchPre = emitConstI32(builder, batchScaleParams.pre);
      auto *batchPost = emitConstI32(builder, batchScaleParams.post);
      auto *batchScale = emitConstI32(builder, batchScaleParams.scale);
      auto *slicePre = emitConstI32(builder, sliceScaleParams.pre);
      auto *slicePost = emitConstI32(builder, sliceScaleParams.post);
      auto *sliceScale = emitConstI32(builder, sliceScaleParams.scale);

      // The callee base name depends on the slice element type; in both cases
      // it is further selected by the dest element type. Any other slice type
      // is reported through LOG(FATAL).
      llvm::Function *F = nullptr;
      if (sliceTy->getElementType() == ElemKind::Int8QTy) {
        F = getFunction("batchedadd", dest->getElementType());
      } else if (sliceTy->getElementType() == ElemKind::Int32QTy) {
        F = getFunction("batchedadd_i32", dest->getElementType());
      } else {
        LOG(FATAL) << "Type is not supported: "
                   << Type::getElementName(sliceTy->getElementType()).str();
      }
      createCall(builder, F,
                 {destPtr, batchPtr, slicePtr, numSlice, sliceSize, destOffset,
                  batchOffset, sliceOffset, batchPre, batchPost, batchScale,
                  slicePre, slicePost, sliceScale});
    } else {
      // Non-quantized path: addresses plus the slice count and slice size.
      auto *F = getFunction("batchedadd", dest->getElementType());
      createCall(builder, F,
                 {destPtr, batchPtr, slicePtr, numSlice, sliceSize});
    }
    break;
  }

  case Kinded::Kind::BatchedReduceAddInstKind: {
    // Lower BatchedReduceAddInst to a call to the "batchedreduceadd" function
    // selected by the destination element type.
    auto *reduceInst = cast<BatchedReduceAddInst>(I);
    auto *out = reduceInst->getDest();
    auto *input = reduceInst->getBatch();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);
    auto *axisArg = emitConstDimT(builder, reduceInst->getAxis());

    // The output shape passed to the callee is the expanded input shape with
    // the reduced axis set to 1.
    ShapeVector inputShape = expandDimsToMax(input->dims());
    ShapeVector outShape = inputShape;
    outShape[reduceInst->getAxis()] = 1;

    auto *inputDims =
        emitConstDimTArray(builder, llvm::makeArrayRef(inputShape));
    auto *outDims = emitConstDimTArray(builder, llvm::makeArrayRef(outShape));

    auto *callee = getFunction("batchedreduceadd", out->getElementType());

    // Non-quantized input: pass the output element count instead of any
    // quantization parameters.
    if (!input->getType()->isQuantizedType()) {
      auto *outSize = emitConstDimT(builder, out->size());

      createCall(builder, callee,
                 {outAddr, inputAddr, outSize, outDims, inputDims, axisArg});
      break;
    }

    auto *outTy = out->getType();
    auto *inputTy = input->getType();

    auto *outOffset = emitConstI32(builder, outTy->getOffset());
    auto *inputOffset = emitConstI32(builder, inputTy->getOffset());

    // BatchedReduceAdd accumulates, so with scales s and offsets o:
    //    s_d * (i_d - o_d) = \sum s_b * (i_b - o_b)
    // => i_d - o_d = \sum (s_b / s_d) * (i_b - o_b)
    // => i_d = (s_b / s_d ) * [\sum (i_b - o_b)] + o_d
    // Hence a single rescale by s_b / s_d is all the callee needs.
    auto inputRescale = quantization::quantizeScaleOffset32To8(
        inputTy->getScale() / outTy->getScale(), inputTy->getOffset());

    auto *rescalePre = emitConstI32(builder, inputRescale.pre);
    auto *rescalePost = emitConstI32(builder, inputRescale.post);
    auto *rescaleScale = emitConstI32(builder, inputRescale.scale);

    createCall(builder, callee,
               {outAddr, inputAddr, outDims, inputDims, outOffset, inputOffset,
                rescalePre, rescalePost, rescaleScale, axisArg});
    break;
  }

  case Kinded::Kind::BatchedReduceProdInstKind: {
    // Lower BatchedReduceProdInst to a call to the "batchedreduceprod"
    // function selected by the destination element type. Quantized inputs are
    // rejected by the assertion below.
    auto *reduceInst = cast<BatchedReduceProdInst>(I);
    auto *out = reduceInst->getDest();
    auto *input = reduceInst->getBatch();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);
    auto *axisArg = emitConstDimT(builder, reduceInst->getAxis());

    // The output shape passed to the callee is the expanded input shape with
    // the reduced axis set to 1.
    ShapeVector inputShape = expandDimsToMax(input->dims());
    ShapeVector outShape = inputShape;
    outShape[reduceInst->getAxis()] = 1;

    auto *inputDims =
        emitConstDimTArray(builder, llvm::makeArrayRef(inputShape));
    auto *outDims = emitConstDimTArray(builder, llvm::makeArrayRef(outShape));

    auto *callee = getFunction("batchedreduceprod", out->getElementType());

    assert(!input->getType()->isQuantizedType() &&
           "Quantized implementation for ReduceProd not supported yet.");

    auto *outSize = emitConstDimT(builder, out->size());

    createCall(builder, callee,
               {outAddr, inputAddr, outSize, outDims, inputDims, axisArg});
    break;
  }

// Expands to the switch case that lowers a Batched<INST_NAME_>Inst to a call
// to the function named FUN_NAME_, selected by the batch element type.
//   INST_NAME_  instruction name without the "Batched" prefix and the "Inst"
//               suffix (e.g. ReduceMin).
//   FUN_NAME_   base name handed to getFunction() (e.g. "reducemin").
// Only FloatTy, Int32ITy and Int64ITy batches whose element type equals the
// destination element type are accepted; anything else hits
// llvm_unreachable with a message naming the instruction.
// Comments inside the macro body must use /* */: a // comment would absorb
// the line continuation and the following line with it.
#define BATCHED_REDUCE_MINMAX_CASE(INST_NAME_, FUN_NAME_)                      \
  case Kinded::Kind::Batched##INST_NAME_##InstKind: {                          \
    auto *BR = cast<Batched##INST_NAME_##Inst>(I);                             \
    auto *dest = BR->getDest();                                                \
    auto *batch = BR->getBatch();                                              \
    auto axes = BR->getAxes();                                                 \
    auto *destPtr = emitValueAddress(builder, dest);                           \
    auto *batchPtr = emitValueAddress(builder, batch);                         \
                                                                               \
    /* Destination shape: expanded batch shape with every reduced axis 1. */   \
    ShapeVector eBatchDims = expandDimsToMax(batch->dims());                   \
    ShapeVector eDestDims = eBatchDims;                                        \
    for (dim_t i = 0; i < axes.size(); i++) {                                  \
      eDestDims[axes[i]] = 1;                                                  \
    }                                                                          \
                                                                               \
    auto *batchDims =                                                          \
        emitConstDimTArray(builder, llvm::makeArrayRef(eBatchDims));           \
    auto *destDims =                                                           \
        emitConstDimTArray(builder, llvm::makeArrayRef(eDestDims));            \
                                                                               \
    if (((batch->getElementType() != ElemKind::FloatTy) &&                     \
         (batch->getElementType() != ElemKind::Int32ITy) &&                    \
         (batch->getElementType() != ElemKind::Int64ITy)) ||                   \
        (batch->getElementType() != dest->getElementType())) {                 \
      std::string errStr = "Cannot get function for ";                         \
      /* Macro parameters are not replaced inside string literals, so */       \
      /* stringize the argument to report the real instruction name. */        \
      std::string name = #INST_NAME_;                                          \
      errStr += name;                                                          \
      llvm_unreachable(errStr.c_str());                                        \
    }                                                                          \
                                                                               \
    llvm::Function *F = getFunction(FUN_NAME_, batch->getElementType());       \
    if (!batch->getType()->isQuantizedType()) {                                \
      auto *destSize = emitConstSizeT(builder, dest->size());                  \
                                                                               \
      createCall(builder, F,                                                   \
                 {destPtr, batchPtr, destSize, destDims, batchDims});          \
    }                                                                          \
    break;                                                                     \
  }
    BATCHED_REDUCE_MINMAX_CASE(ReduceMin, "reducemin")
    BATCHED_REDUCE_MINMAX_CASE(ReduceMax, "reducemax")
#undef BATCHED_REDUCE_MINMAX_CASE

  case Kinded::Kind::ConvolutionInstKind: {
    // Lowers ConvolutionInst to a call to the "conv2d" function. When the
    // source is quantized the callee is looked up by (dest, bias) element
    // types and receives extra quantization arguments; otherwise it is looked
    // up by the dest element type only.
    auto *CI = cast<ConvolutionInst>(I);
    assert(CI->getLayout() == NHWC &&
           "Glow CPU Backend supports only NHWC Convolutions");
    auto *dest = CI->getDest();
    auto *src = CI->getSrc();
    auto *filter = CI->getFilter();
    auto *bias = CI->getBias();
    auto *destPtr = emitValueAddress(builder, dest);
    auto *srcPtr = emitValueAddress(builder, src);
    auto *filterPtr = emitValueAddress(builder, filter);
    auto *biasPtr = emitValueAddress(builder, bias);

    auto *destDims = emitValueDims(builder, dest);
    auto *srcDims = emitValueDims(builder, src);
    auto *filterDims = emitValueDims(builder, filter);
    auto *biasDims = emitValueDims(builder, bias);

    // Convolution attributes, emitted as constants.
    auto *kernels = emitConstDimTArray(builder, CI->getKernels());
    auto *strides = emitConstDimTArray(builder, CI->getStrides());
    auto *pads = emitConstDimTArray(builder, CI->getPads());
    auto *group = emitConstDimT(builder, CI->getGroup());
    auto *dilation = emitConstDimTArray(builder, CI->getDilation());

    // With the NHWC layout asserted above, index 3 is the channel dimension
    // of the destination.
    auto destDepth = dest->dims()[3];

    // Try to 'block' the convolution on the 'depth' dimension. We will process
    // this number of output slices each iteration.
    unsigned unrollDFactor = 1;

    // In libjit_convolution_f function, 'unrollDFactor' output
    // layers will be processed together. Therefore, the number of
    // output layers in each group should be divisible by 'unrollDFactor'
    bool groupDividedBy8 = ((destDepth / CI->getGroup()) % 8) == 0;
    if (groupDividedBy8) {
      unrollDFactor = 8;
    }

    auto *unrollD = emitConstI32(builder, unrollDFactor);

    // Fused activation kind, passed to the callee as an i32 constant.
    auto *actType = emitConstI32(builder, CI->getFusedActivation());

    // The quantized/non-quantized decision is made on the source type alone.
    if (src->getType()->isQuantizedType()) {
      auto *destTy = dest->getType();
      auto *srcTy = src->getType();
      auto *filterTy = filter->getType();
      auto *biasTy = bias->getType();

      auto *destOffset = emitConstI32(builder, destTy->getOffset());
      auto *srcOffset = emitConstI32(builder, srcTy->getOffset());
      auto *filterOffset = emitConstI32(builder, filterTy->getOffset());
      auto *biasOffset = emitConstI32(builder, biasTy->getOffset());

      // Calculate the scale of the values that come out of the matrix
      // multiplication part of the calculation.
      float matMulScale = srcTy->getScale() * filterTy->getScale();

      // Calculate the scaling parameters for the bias and output.
      auto biasScaleParam = quantization::quantizeScaleOffset32To8(
          biasTy->getScale() / matMulScale, biasTy->getOffset());
      auto outScaleParam = quantization::quantizeScaleOffset32To8(
          matMulScale / destTy->getScale(), 0);

      // Pass the pre-shift, post-shift and integer scale parameters for the
      // bias and output calculation.
      auto *biasPre = emitConstI32(builder, biasScaleParam.pre);
      auto *biasPost = emitConstI32(builder, biasScaleParam.post);
      auto *biasScale = emitConstI32(builder, biasScaleParam.scale);
      auto *outPre = emitConstI32(builder, outScaleParam.pre);
      auto *outPost = emitConstI32(builder, outScaleParam.post);
      auto *outScale = emitConstI32(builder, outScaleParam.scale);

      // Emit parameters for fused activation.
      auto *actArgsQuant = emitConstQuantActivationArgs(builder, CI);

      // Quantized callee: selected by both the dest and the bias element
      // types.
      auto *F = getFunction("conv2d",
                            {dest->getElementType(), bias->getElementType()});

      createCall(builder, F,
                 {destPtr,     srcPtr,     filterPtr,  biasPtr,   destDims,
                  srcDims,     filterDims, biasDims,   kernels,   strides,
                  pads,        group,      destOffset, srcOffset, filterOffset,
                  biasOffset,  biasPre,    biasPost,   biasScale, outPre,
                  outPost,     outScale,   unrollD,    dilation,  actType,
                  actArgsQuant});
    } else {

      // Emit parameters for fused activation.
      auto *actArgsFloat = emitConstFloatActivationArgs(builder, CI);

      // Non-quantized callee: selected by the dest element type only.
      auto *F = getFunction("conv2d", dest->getElementType());

      createCall(builder, F,
                 {destPtr, srcPtr, filterPtr, biasPtr, destDims, srcDims,
                  filterDims, biasDims, kernels, strides, pads, group, unrollD,
                  dilation, actType, actArgsFloat});
    }
    break;
  }

  case Kinded::Kind::ConvolutionGradInstKind: {
    // Lower ConvolutionGradInst to a call to the "convolution_grad" function
    // selected by the element type of the source gradient.
    auto *gradInst = cast<ConvolutionGradInst>(I);
    auto *inputGrad = gradInst->getSrcGrad();
    auto *outputGrad = gradInst->getDestGrad();
    auto *input = gradInst->getSrc();
    auto *weightsGrad = gradInst->getFilterGrad();

    // Addresses of the six operands, emitted in the order the callee takes
    // them.
    auto *inputGradAddr = emitValueAddress(builder, inputGrad);
    auto *outputGradAddr = emitValueAddress(builder, outputGrad);
    auto *inputAddr = emitValueAddress(builder, input);
    auto *weightsGradAddr = emitValueAddress(builder, weightsGrad);
    auto *biasGradAddr = emitValueAddress(builder, gradInst->getBiasGrad());
    auto *weightsAddr = emitValueAddress(builder, gradInst->getFilter());

    // Shapes are passed only for the destination gradient, the source and
    // the filter gradient.
    auto *outputGradDims = emitValueDims(builder, outputGrad);
    auto *inputDims = emitValueDims(builder, input);
    auto *weightsGradDims = emitValueDims(builder, weightsGrad);

    // Convolution attributes, emitted as constants.
    auto *kernelSizes = emitConstDimTArray(builder, gradInst->getKernels());
    auto *strideSizes = emitConstDimTArray(builder, gradInst->getStrides());
    auto *padSizes = emitConstDimTArray(builder, gradInst->getPads());
    auto *groupCount = emitConstDimT(builder, gradInst->getGroup());
    auto *dilationSizes = emitConstDimTArray(builder, gradInst->getDilation());

    auto *callee =
        getFunction("convolution_grad", inputGrad->getElementType());
    createCall(builder, callee,
               {inputGradAddr, outputGradAddr, inputAddr, weightsGradAddr,
                biasGradAddr, weightsAddr, outputGradDims, inputDims,
                weightsGradDims, kernelSizes, strideSizes, padSizes,
                groupCount, dilationSizes});
    break;
  }

  case Kinded::Kind::ConvTransposeInstKind: {
    // Lowers ConvTransposeInst to a call to the "conv_transpose" function
    // selected by the dest element type. The same callee lookup is used for
    // both paths; the quantized path passes extra offset and output-rescaling
    // arguments.
    auto *CI = cast<ConvTransposeInst>(I);
    auto *dest = CI->getDest();
    auto *src = CI->getSrc();
    auto *filter = CI->getFilter();
    auto *bias = CI->getBias();
    auto *destPtr = emitValueAddress(builder, dest);
    auto *srcPtr = emitValueAddress(builder, src);
    auto *filterPtr = emitValueAddress(builder, filter);
    auto *biasPtr = emitValueAddress(builder, bias);

    auto *destDims = emitValueDims(builder, dest);
    auto *srcDims = emitValueDims(builder, src);
    auto *filterDims = emitValueDims(builder, filter);
    auto *biasDims = emitValueDims(builder, bias);

    // Convolution attributes, emitted as constants.
    auto *kernels = emitConstDimTArray(builder, CI->getKernels());
    auto *strides = emitConstDimTArray(builder, CI->getStrides());
    auto *pads = emitConstDimTArray(builder, CI->getPads());
    auto *group = emitConstDimT(builder, CI->getGroup());
    auto *dilation = emitConstDimTArray(builder, CI->getDilation());

    const char *kernelName = "conv_transpose";

    auto *F = getFunction(kernelName, dest->getElementType());

    // The quantized/non-quantized decision is made on the source type alone.
    if (src->getType()->isQuantizedType()) {
      auto *destTy = dest->getType();
      auto *srcTy = src->getType();
      auto *filterTy = filter->getType();

      // Unlike the other quantized convolution cases in this function, no
      // bias offset is passed here.
      auto *destOffset = emitConstI32(builder, destTy->getOffset());
      auto *srcOffset = emitConstI32(builder, srcTy->getOffset());
      auto *filterOffset = emitConstI32(builder, filterTy->getOffset());

      // Calculate the scale of the values that come out of the matrix
      // multiplication part of the calculation.
      float matMulScale = srcTy->getScale() * filterTy->getScale();

      // Calculate the scaling parameters for the output. No bias scaling
      // parameters are computed in this case.
      auto outScaleParam = quantization::quantizeScaleOffset32To8(
          matMulScale / destTy->getScale(), 0);

      // Pass the pre-shift, post-shift and integer scale parameters for the
      // output calculation.
      auto *outPre = emitConstI32(builder, outScaleParam.pre);
      auto *outPost = emitConstI32(builder, outScaleParam.post);
      auto *outScale = emitConstI32(builder, outScaleParam.scale);

      createCall(builder, F,
                 {destPtr, srcPtr, filterPtr, biasPtr, destDims, srcDims,
                  filterDims, biasDims, kernels, strides, pads, group,
                  destOffset, srcOffset, filterOffset, outPre, outPost,
                  outScale, dilation});
    } else {
      // Non-quantized path: addresses, shapes and convolution attributes only.
      createCall(builder, F,
                 {destPtr, srcPtr, filterPtr, biasPtr, destDims, srcDims,
                  filterDims, biasDims, kernels, strides, pads, group,
                  dilation});
    }
    break;
  }

  case Kinded::Kind::ChannelwiseQuantizedConvolutionInstKind: {
    // Lowers ChannelwiseQuantizedConvolutionInst to a call to
    // "channelwise_quantized_conv2d" or "channelwise_quantized_conv3d",
    // selected by (dest, bias) element types. The per-channel bias and output
    // rescaling parameters are computed here, at code-generation time, and
    // emitted as constant i32 arrays.
    auto *CQCI = cast<ChannelwiseQuantizedConvolutionInst>(I);
    auto *dest = CQCI->getDest();
    auto *src = CQCI->getSrc();
    auto *filter = CQCI->getFilter();
    auto *bias = CQCI->getBias();
    auto *filterScales = CQCI->getFilterScales();
    auto *filterOffsets = CQCI->getFilterOffsets();
    auto *biasScales = CQCI->getBiasScales();
    auto *biasOffsets = CQCI->getBiasOffsets();

    auto *destTy = dest->getType();
    auto *srcTy = src->getType();

    // The filter and bias scales are read as constant float tensors.
    auto filterScalesT = getTensorForConstantValue(filterScales);
    auto filterScalesH = filterScalesT.getHandle<float>();

    auto biasScalesT = getTensorForConstantValue(biasScales);
    auto biasScalesH = biasScalesT.getHandle<float>();

    // Compute quantization parameters for each channel. The channel count is
    // the last dimension of the destination.
    auto channelNum = dest->dims().back();
    std::vector<llvm::Constant *> biasPreV(channelNum);
    std::vector<llvm::Constant *> biasPostV(channelNum);
    std::vector<llvm::Constant *> biasScaleV(channelNum);
    std::vector<llvm::Constant *> outputPreV(channelNum);
    std::vector<llvm::Constant *> outputPostV(channelNum);
    std::vector<llvm::Constant *> outputScaleV(channelNum);
    for (size_t i = 0; i < channelNum; i++) {

      // Compute the scaling parameters for bias and output. Offset 0 is
      // passed in both cases; only the pre/post/scale fields are read below.
      float matMulScale = srcTy->getScale() * filterScalesH.raw(i);
      auto biasScaleParam = quantization::quantizeScaleOffset32To8(
          biasScalesH.raw(i) / matMulScale, 0);
      auto outScaleParam = quantization::quantizeScaleOffset32To8(
          matMulScale / destTy->getScale(), 0);

      // Pass the pre-shift, post-shift and integer scale parameters for the
      // bias and output calculation. The trailing `true` marks each value as
      // a signed integer constant.
      biasPreV[i] = llvm::ConstantInt::get(builder.getInt32Ty(),
                                           biasScaleParam.pre, true);
      biasPostV[i] = llvm::ConstantInt::get(builder.getInt32Ty(),
                                            biasScaleParam.post, true);
      biasScaleV[i] = llvm::ConstantInt::get(builder.getInt32Ty(),
                                             biasScaleParam.scale, true);
      outputPreV[i] =
          llvm::ConstantInt::get(builder.getInt32Ty(), outScaleParam.pre, true);
      outputPostV[i] = llvm::ConstantInt::get(builder.getInt32Ty(),
                                              outScaleParam.post, true);
      outputScaleV[i] = llvm::ConstantInt::get(builder.getInt32Ty(),
                                               outScaleParam.scale, true);
    }

    auto *destPtr = emitValueAddress(builder, dest);
    auto *srcPtr = emitValueAddress(builder, src);
    auto *filterPtr = emitValueAddress(builder, filter);
    auto *biasPtr = emitValueAddress(builder, bias);

    auto *destDims = emitValueDims(builder, dest);
    auto *srcDims = emitValueDims(builder, src);
    auto *filterDims = emitValueDims(builder, filter);
    auto *biasDims = emitValueDims(builder, bias);

    // Convolution attributes, emitted as constants.
    auto *kernels = emitConstDimTArray(builder, CQCI->getKernels());
    auto *strides = emitConstDimTArray(builder, CQCI->getStrides());
    auto *pads = emitConstDimTArray(builder, CQCI->getPads());
    auto *group = emitConstDimT(builder, CQCI->getGroup());
    auto *dilation = emitConstDimTArray(builder, CQCI->getDilation());

    // Dest and source offsets are scalar constants; the filter and bias
    // offsets are passed by address as operands.
    auto *destOffset = emitConstI32(builder, destTy->getOffset());
    auto *srcOffset = emitConstI32(builder, srcTy->getOffset());
    auto *filterOffsetsPtr = emitValueAddress(builder, filterOffsets);
    auto *biasOffsetsPtr = emitValueAddress(builder, biasOffsets);

    // The six per-channel parameter arrays, emitted as constant i32 arrays.
    auto *biasPrePtr = emitConstArray(builder, biasPreV, builder.getInt32Ty());
    auto *biasPostPtr =
        emitConstArray(builder, biasPostV, builder.getInt32Ty());
    auto *biasScalePtr =
        emitConstArray(builder, biasScaleV, builder.getInt32Ty());
    auto *outputPrePtr =
        emitConstArray(builder, outputPreV, builder.getInt32Ty());
    auto *outputPostPtr =
        emitConstArray(builder, outputPostV, builder.getInt32Ty());
    auto *outputScalePtr =
        emitConstArray(builder, outputScaleV, builder.getInt32Ty());

    // A 5-dimensional source selects the 3D variant; anything else selects
    // the 2D variant.
    bool isConv3D = (srcTy->dims().size() == 5);
    auto *F = getFunction(isConv3D ? "channelwise_quantized_conv3d"
                                   : "channelwise_quantized_conv2d",
                          {dest->getElementType(), bias->getElementType()});

    // Fused activation kind and its quantized arguments.
    auto *actType = emitConstI32(builder, CQCI->getFusedActivation());
    auto *actArgsQuant = emitConstQuantActivationArgs(builder, CQCI);

    createCall(builder, F,
               {destPtr,        srcPtr,        filterPtr,      biasPtr,
                destDims,       srcDims,       filterDims,     biasDims,
                kernels,        strides,       pads,           group,
                dilation,       destOffset,    srcOffset,      filterOffsetsPtr,
                biasOffsetsPtr, biasPrePtr,    biasPostPtr,    biasScalePtr,
                outputPrePtr,   outputPostPtr, outputScalePtr, actType,
                actArgsQuant});
    break;
  }

  case Kinded::Kind::CrossEntropyLossInstKind: {
    // Lower the cross-entropy loss instruction to a call of the
    // "cross_entropy_loss" kernel, selected by the element types of the CE
    // and labels operands.
    auto *lossInst = cast<CrossEntropyLossInst>(I);
    auto *pVal = lossInst->getP();
    auto *labelsVal = lossInst->getLabels();
    auto *ceVal = lossInst->getCE();

    auto *ceAddr = emitValueAddress(builder, ceVal);
    auto *pAddr = emitValueAddress(builder, pVal);
    auto *labelsAddr = emitValueAddress(builder, labelsVal);
    // Only the dimensions of the P operand are handed to the kernel.
    auto *pDims = emitValueDims(builder, pVal);

    auto *kernel =
        getFunction("cross_entropy_loss",
                    {ceVal->getElementType(), labelsVal->getElementType()});
    createCall(builder, kernel, {ceAddr, pAddr, labelsAddr, pDims});
    break;
  }

  case Kinded::Kind::LengthsToRangesInstKind: {
    // Emit a call to the "lengths_to_ranges" kernel (selected by the
    // destination element type) with the destination buffer, the lengths
    // buffer and the first dimension of the lengths operand.
    auto *rangesInst = cast<LengthsToRangesInst>(I);
    auto *out = rangesInst->getDest();
    auto *lens = rangesInst->getLengths();
    auto *outAddr = emitValueAddress(builder, out);
    auto *lensAddr = emitValueAddress(builder, lens);
    auto *numLens = emitConstDimT(builder, lens->dims()[0]);
    auto *kernel = getFunction("lengths_to_ranges", out->getElementType());
    createCall(builder, kernel, {outAddr, lensAddr, numLens});
    break;
  }

  case Kinded::Kind::LengthsSumInstKind: {
    // Emit a call to the "lengths_sum" kernel, selected by the element type
    // of the data operand.
    auto *sumInst = cast<LengthsSumInst>(I);
    auto *out = sumInst->getDest();
    auto *input = sumInst->getData();
    auto *lens = sumInst->getLengths();

    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);
    auto *lensAddr = emitValueAddress(builder, lens);

    auto *numLens = emitConstDimT(builder, lens->size());
    auto *inputTy = input->getType();
    auto *numOutElems = emitConstDimT(builder, out->size());
    // Number of data elements per entry of the first data dimension.
    auto *elemsPerSlice =
        emitConstDimT(builder, inputTy->size() / inputTy->dims()[0]);

    auto *kernel = getFunction("lengths_sum", input->getElementType());
    createCall(
        builder, kernel,
        {outAddr, inputAddr, lensAddr, numOutElems, numLens, elemsPerSlice});
    break;
  }

  case Kinded::Kind::LocalResponseNormalizationInstKind: {
    // Emit a call to the "local_response_normalization" kernel, selected by
    // the destination element type. The kernel receives the dest, src and
    // scale buffers, both shapes, and the instruction's scalar parameters.
    auto *normInst = cast<LocalResponseNormalizationInst>(I);
    auto *out = normInst->getDest();
    auto *input = normInst->getSrc();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);
    auto *scaleAddr = emitValueAddress(builder, normInst->getScale());

    auto *outDims = emitValueDims(builder, out);
    auto *inputDims = emitValueDims(builder, input);
    auto *halfWindowSize =
        emitConstDimT(builder, normInst->getHalfWindowSize());
    auto *alphaVal = emitConstF32(builder, normInst->getAlpha());
    auto *betaVal = emitConstF32(builder, normInst->getBeta());
    auto *kVal = emitConstF32(builder, normInst->getK());

    auto *kernel =
        getFunction("local_response_normalization", out->getElementType());
    createCall(builder, kernel,
               {outAddr, inputAddr, scaleAddr, outDims, inputDims,
                halfWindowSize, alphaVal, betaVal, kVal});
    break;
  }

  case Kinded::Kind::LocalResponseNormalizationGradInstKind: {
    // Emit a call to the "local_response_normalization_grad" kernel, selected
    // by the element type of the source gradient.
    auto *gradInst = llvm::cast<LocalResponseNormalizationGradInst>(I);
    auto *inGrad = gradInst->getSrcGrad();
    auto *out = gradInst->getDest();
    auto *inGradAddr = emitValueAddress(builder, inGrad);
    auto *outGradAddr = emitValueAddress(builder, gradInst->getDestGrad());
    auto *inAddr = emitValueAddress(builder, gradInst->getSrc());
    auto *outAddr = emitValueAddress(builder, out);
    auto *scaleAddr = emitValueAddress(builder, gradInst->getScale());

    // Only the destination shape is passed to the kernel.
    auto *outDims = emitValueDims(builder, out);

    auto *halfWindowSize =
        emitConstDimT(builder, gradInst->getHalfWindowSize());
    auto *alphaVal = emitConstF32(builder, gradInst->getAlpha());
    auto *betaVal = emitConstF32(builder, gradInst->getBeta());

    auto *kernel = getFunction("local_response_normalization_grad",
                               inGrad->getElementType());
    createCall(builder, kernel,
               {inGradAddr, outGradAddr, inAddr, outAddr, scaleAddr, outDims,
                halfWindowSize, alphaVal, betaVal});
    break;
  }

  case Kinded::Kind::MaxPoolInstKind: {
    // Emit a call to the "max_pool" kernel, selected by the destination
    // element type. For a quantized source the destination offset is passed
    // as one extra trailing argument.
    auto *poolInst = cast<MaxPoolInst>(I);
    assert(poolInst->getLayout() == NHWC &&
           "Glow CPU Backend supports only NHWC Pools");
    auto *out = poolInst->getDest();
    auto *input = poolInst->getSrc();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);

    auto *outDims = emitValueDims(builder, out);
    auto *inputDims = emitValueDims(builder, input);

    auto *kernelSizes = emitConstDimTArray(builder, poolInst->getKernels());
    auto *strideSizes = emitConstDimTArray(builder, poolInst->getStrides());
    auto *padSizes = emitConstDimTArray(builder, poolInst->getPads());

    auto *kernel = getFunction("max_pool", out->getElementType());

    if (input->getType()->isQuantizedType()) {
      auto *outOffset = emitConstI32(builder, out->getType()->getOffset());
      createCall(builder, kernel,
                 {inputAddr, outAddr, inputDims, outDims, kernelSizes,
                  strideSizes, padSizes, outOffset});
    } else {
      createCall(builder, kernel,
                 {inputAddr, outAddr, inputDims, outDims, kernelSizes,
                  strideSizes, padSizes});
    }
    break;
  }

  case Kinded::Kind::MaxPoolWithArgmaxInstKind: {
    // Emit a call to the "max_pool_argmax" kernel, selected by the element
    // types of the destination and of the argmax output.
    auto *poolInst = cast<MaxPoolWithArgmaxInst>(I);
    assert(poolInst->getLayout() == NHWC &&
           "Glow CPU Backend supports only NHWC Pools");
    auto *out = poolInst->getDest();
    auto *input = poolInst->getSrc();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);
    auto *argmaxVal = poolInst->getArgmax();
    auto *argmaxAddr = emitValueAddress(builder, argmaxVal);

    auto *outDims = emitValueDims(builder, out);
    auto *inputDims = emitValueDims(builder, input);

    auto *kernelSizes = emitConstDimTArray(builder, poolInst->getKernels());
    auto *strideSizes = emitConstDimTArray(builder, poolInst->getStrides());
    auto *padSizes = emitConstDimTArray(builder, poolInst->getPads());

    auto *kernel =
        getFunction("max_pool_argmax",
                    {out->getElementType(), argmaxVal->getElementType()});
    createCall(builder, kernel,
               {inputAddr, outAddr, argmaxAddr, inputDims, outDims,
                kernelSizes, strideSizes, padSizes});
    break;
  }

  case Kinded::Kind::MaxPoolWithArgmaxGradInstKind: {
    // Emit a call to the "max_pool_argmax_grad" kernel, selected by the
    // element types of the source gradient and of the argmax operand.
    auto *gradInst = cast<MaxPoolWithArgmaxGradInst>(I);
    auto *inGrad = gradInst->getSrcGrad();
    auto *inGradAddr = emitValueAddress(builder, inGrad);
    auto *outGradAddr = emitValueAddress(builder, gradInst->getDestGrad());
    auto *argmaxVal = gradInst->getArgmax();
    auto *argmaxAddr = emitValueAddress(builder, argmaxVal);

    auto *inGradDims = emitValueDims(builder, inGrad);
    // The shape of the forward destination is used for the output side.
    auto *outDims = emitValueDims(builder, gradInst->getDest());

    auto *kernel =
        getFunction("max_pool_argmax_grad",
                    {inGrad->getElementType(), argmaxVal->getElementType()});
    createCall(builder, kernel,
               {inGradAddr, outGradAddr, argmaxAddr, inGradDims, outDims});
    break;
  }

  case Kinded::Kind::ArgMaxInstKind: {
    // Emit a call to the "arg_max" kernel, selected by the source and
    // destination element types (in that order).
    auto *argInst = cast<ArgMaxInst>(I);
    auto *out = argInst->getDest();
    auto *input = argInst->getSrc();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);
    auto *inputDims = emitValueDims(builder, input);
    auto *inputRank = emitConstSizeT(builder, input->dims().size());
    auto *axisVal = emitConstSizeT(builder, argInst->getAxis());
    auto *kernel = getFunction(
        "arg_max", {input->getElementType(), out->getElementType()});
    createCall(builder, kernel,
               {inputAddr, outAddr, inputDims, inputRank, axisVal});
    break;
  }

  case Kinded::Kind::ArgMinInstKind: {
    // Emit a call to the "arg_min" kernel, selected by the source and
    // destination element types (in that order).
    auto *argInst = cast<ArgMinInst>(I);
    auto *out = argInst->getDest();
    auto *input = argInst->getSrc();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);
    auto *inputDims = emitValueDims(builder, input);
    auto *inputRank = emitConstSizeT(builder, input->dims().size());
    auto *axisVal = emitConstSizeT(builder, argInst->getAxis());
    auto *kernel = getFunction(
        "arg_min", {input->getElementType(), out->getElementType()});
    createCall(builder, kernel,
               {inputAddr, outAddr, inputDims, inputRank, axisVal});
    break;
  }

  case Kinded::Kind::AvgPoolInstKind: {
    // Emit a call to the "avg_pool" kernel, selected by the destination
    // element type. A quantized source additionally gets the dest/src
    // offsets and the pre/post/scale rescaling parameters.
    auto *poolInst = cast<AvgPoolInst>(I);
    assert(poolInst->getLayout() == NHWC &&
           "Glow CPU Backend supports only NHWC Pools");
    auto *out = poolInst->getDest();
    auto *input = poolInst->getSrc();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);

    auto *outDims = emitValueDims(builder, out);
    auto *inputDims = emitValueDims(builder, input);

    auto *kernelSizes = emitConstDimTArray(builder, poolInst->getKernels());
    auto *strideSizes = emitConstDimTArray(builder, poolInst->getStrides());
    auto *padSizes = emitConstDimTArray(builder, poolInst->getPads());
    auto *includePads =
        emitConstI1(builder, poolInst->getCountIncludePads());

    auto *kernel = getFunction("avg_pool", out->getElementType());

    if (input->getType()->isQuantizedType()) {
      auto *outTy = out->getType();
      auto *inputTy = input->getType();
      auto *outOffset = emitConstI32(builder, outTy->getOffset());
      auto *inputOffset = emitConstI32(builder, inputTy->getOffset());
      // If padding pixels count towards the normalizing factor, the divisor
      // is the constant filter area, so it is folded into the rescaling
      // factor here.
      float rescale = inputTy->getScale() / outTy->getScale();
      if (poolInst->getCountIncludePads()) {
        rescale /= (poolInst->getKernels()[0] * poolInst->getKernels()[1]);
      }
      auto rescaleParams = quantization::quantizeScaleOffset32To8(rescale, 0);
      auto *rescalePre = emitConstI32(builder, rescaleParams.pre);
      auto *rescalePost = emitConstI32(builder, rescaleParams.post);
      auto *rescaleScale = emitConstI32(builder, rescaleParams.scale);
      createCall(builder, kernel,
                 {inputAddr, outAddr, inputDims, outDims, kernelSizes,
                  strideSizes, padSizes, includePads, outOffset, inputOffset,
                  rescalePre, rescalePost, rescaleScale});
    } else {
      createCall(builder, kernel,
                 {inputAddr, outAddr, inputDims, outDims, kernelSizes,
                  strideSizes, padSizes, includePads});
    }
    break;
  }

  case Kinded::Kind::AdaptiveAvgPoolInstKind: {
    // Emit a call to the "adaptive_avg_pool" kernel, selected by the
    // destination element type; only buffers and shapes are passed.
    auto *poolInst = cast<AdaptiveAvgPoolInst>(I);

    auto *out = poolInst->getDest();
    auto *input = poolInst->getSrc();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);

    auto *outDims = emitValueDims(builder, out);
    auto *inputDims = emitValueDims(builder, input);

    auto *kernel = getFunction("adaptive_avg_pool", out->getElementType());
    createCall(builder, kernel, {inputAddr, outAddr, inputDims, outDims});
    break;
  }

  case Kinded::Kind::AvgPoolGradInstKind: {
    // Emit a call to the "avg_pool_grad" kernel, selected by the element
    // type of the source gradient.
    auto *gradInst = cast<AvgPoolGradInst>(I);
    auto *inGrad = gradInst->getSrcGrad();
    auto *inGradAddr = emitValueAddress(builder, inGrad);
    auto *outGradAddr = emitValueAddress(builder, gradInst->getDestGrad());

    auto *inGradDims = emitValueDims(builder, inGrad);
    // The shape of the forward destination is used for the output side.
    auto *outDims = emitValueDims(builder, gradInst->getDest());

    auto *kernelSizes = emitConstDimTArray(builder, gradInst->getKernels());
    auto *strideSizes = emitConstDimTArray(builder, gradInst->getStrides());
    auto *padSizes = emitConstDimTArray(builder, gradInst->getPads());
    auto *includePads =
        emitConstI1(builder, gradInst->getCountIncludePads());

    auto *kernel = getFunction("avg_pool_grad", inGrad->getElementType());
    createCall(builder, kernel,
               {inGradAddr, outGradAddr, inGradDims, outDims, kernelSizes,
                strideSizes, padSizes, includePads});
    break;
  }

  case Kinded::Kind::SoftMaxInstKind: {
    // Emit a call to the "softmax" kernel, selected by the destination
    // element type. The floating-point path passes only buffers and shapes;
    // the quantized path additionally passes an exponent lookup table and
    // fixed-point parameters computed here at compile time.
    auto *SM = cast<SoftMaxInst>(I);
    auto *dest = SM->getDest();
    auto *src = SM->getSrc();
    auto *destPtr = emitValueAddress(builder, dest);
    auto *srcPtr = emitValueAddress(builder, src);
    auto *destDims = emitValueDims(builder, dest);
    auto *srcDims = emitValueDims(builder, src);
    auto *F = getFunction("softmax", dest->getElementType());

    if (src->getType()->isQuantizedType()) {
      // Number of entries in the exponent lookup table.
      constexpr int32_t lutSize = 256;
      // The input scale is loop-invariant, so read it once.
      const auto srcScale = src->getType()->getScale();

      // Compute lookup table containing all the exponentials based on the
      // formula e^(scale * value), where scale is the input scale of
      // the quantized input data and value is a value from [-255, 0].
      std::vector<int32_t> lut;
      lut.reserve(lutSize);
      for (int32_t i = 0; i < lutSize; i++) {
        auto exponent =
            FixedPointUInt32(exp(srcScale * (i - (lutSize - 1))), 1)
                .getFixedVal();
        lut.push_back(exponent);
      }

      auto *lutPtr = emitConstI32Array(builder, lut);
      auto *outOffset = emitConstI32(builder, dest->getType()->getOffset());

      // The integer part passed for the sum is ceil(log2(size)), where size
      // is the second source dimension; when log2(size) is a whole number
      // (size is an exact power of two) it is one larger.
      float size = static_cast<float>(src->getType()->dims()[1]);
      const auto log2Size = log2(size);
      const auto log2SizeCeil = ceil(log2Size);
      const bool isExactPowerOfTwo = (log2SizeCeil == floor(log2Size));
      auto *sumIntegerPart = emitConstI32(
          builder, isExactPowerOfTwo ? log2SizeCeil + 1 : log2SizeCeil);

      // Inverse of the output scale in fixed-point form, together with the
      // number of integer bits of that representation.
      FixedPointUInt32 invScaleFixedPoint =
          FixedPointUInt32(1.f / dest->getType()->getScale());
      auto *invScale = emitConstI32(builder, invScaleFixedPoint.getFixedVal());
      auto *invScalePoint =
          emitConstI32(builder, invScaleFixedPoint.getIntBits());
      createCall(builder, F,
                 {srcPtr, destPtr, srcDims, lutPtr, outOffset, invScale,
                  sumIntegerPart, invScalePoint});
    } else {
      createCall(builder, F, {srcPtr, destPtr, srcDims, destDims});
    }

    break;
  }

  case Kinded::Kind::SoftMaxGradInstKind: {
    // Emit a call to the "softmax_grad" kernel, selected by the element
    // types of the source gradient and of the selected operand.
    auto *gradInst = cast<SoftMaxGradInst>(I);
    auto *inGrad = gradInst->getSrcGrad();
    auto *selectedVal = gradInst->getSelected();
    auto *inGradAddr = emitValueAddress(builder, inGrad);
    // The original (forward) destination is passed as the second buffer.
    auto *origOutAddr = emitValueAddress(builder, gradInst->getOrigDest());
    auto *selectedAddr = emitValueAddress(builder, selectedVal);

    auto *inGradDims = emitValueDims(builder, inGrad);
    auto *selectedShape = emitValueDims(builder, selectedVal);

    auto *kernel =
        getFunction("softmax_grad",
                    {inGrad->getElementType(), selectedVal->getElementType()});
    createCall(
        builder, kernel,
        {inGradAddr, origOutAddr, selectedAddr, inGradDims, selectedShape});
    break;
  }

  case Kinded::Kind::TopKInstKind: {
    // Emit a call to the "topk" kernel, selected by the element types of the
    // input and of the indices output.
    auto *topkInst = cast<TopKInst>(I);
    auto *in = topkInst->getInput();
    auto *valuesAddr = emitValueAddress(builder, topkInst->getValues());
    auto *indicesAddr = emitValueAddress(builder, topkInst->getIndices());
    auto *inAddr = emitValueAddress(builder, in);
    auto *scratchAddr = emitValueAddress(builder, topkInst->getScratch());

    auto *kVal = emitConstDimT(builder, topkInst->getK());
    // Size of the last input dimension and the total input element count.
    auto *lastDimVal = emitConstDimT(builder, in->dims().back());
    auto *numElemsVal = emitConstDimT(builder, in->size());

    auto indicesElemTy = topkInst->getIndices()->getElementType();
    auto *kernel = getFunction("topk", {in->getElementType(), indicesElemTy});

    createCall(builder, kernel,
               {valuesAddr, indicesAddr, inAddr, scratchAddr, kVal, lastDimVal,
                numElemsVal});
    break;
  }

  case Kinded::Kind::SpaceToDepthInstKind: {
    // Emit a call to the "space_to_depth" kernel, selected by the source
    // element type.
    auto *s2dInst = cast<SpaceToDepthInst>(I);
    auto *out = s2dInst->getDest();
    auto *input = s2dInst->getSrc();

    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);

    auto *outDims = emitValueDims(builder, out);
    auto *inputDims = emitValueDims(builder, input);

    unsigned block = s2dInst->getBlockSize();

    auto *kernel = getFunction("space_to_depth", input->getElementType());
    auto *blockVal = emitConstDimT(builder, block);
    createCall(builder, kernel,
               {inputAddr, outAddr, blockVal, inputDims, outDims});
    break;
  }

  case Kinded::Kind::TransposeInstKind: {
    // Emit a call to the "transpose" kernel, selected by the destination
    // element type. The shuffle mask is passed as a constant array together
    // with its length.
    auto *transposeInst = cast<TransposeInst>(I);
    auto *out = transposeInst->getDest();
    auto *input = transposeInst->getSrc();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);

    auto *outDims = emitValueDims(builder, out);
    auto *inputDims = emitValueDims(builder, input);

    // Copy the shuffle mask into a ShapeVector so that it can be emitted
    // through emitConstDimTArray.
    ShapeVector maskVec;
    for (auto maskEntry : transposeInst->getShuffle()) {
      maskVec.push_back(static_cast<size_t>(maskEntry));
    }

    auto *maskArr = emitConstDimTArray(builder, llvm::makeArrayRef(maskVec));
    auto *maskLen = emitConstDimT(builder, transposeInst->getShuffle().size());

    auto *kernel = getFunction("transpose", out->getElementType());
    createCall(builder, kernel,
               {inputAddr, outAddr, inputDims, outDims, maskArr, maskLen});
    break;
  }

  case Kinded::Kind::FlipInstKind: {
    // Emit a call to the "flip" kernel, selected by the source element type.
    // Only the source shape is passed, along with the axis and the rank.
    auto *flipInst = cast<FlipInst>(I);
    auto *out = flipInst->getDest();
    auto *input = flipInst->getSrc();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);
    auto *inputDims = emitValueDims(builder, input);
    auto *axisVal = emitConstDimT(builder, flipInst->getAxis());
    auto *rankVal = emitConstDimT(builder, input->getType()->dims().size());
    auto *kernel = getFunction("flip", input->getElementType());
    createCall(builder, kernel,
               {inputAddr, outAddr, inputDims, axisVal, rankVal});
    break;
  }

    // Alloc and Dealloc instructions are handled by the memory allocator.
    // TensorView shares this no-op path: no code is emitted in this switch
    // for any of the three kinds.
  case Kinded::Kind::AllocActivationInstKind:
  case Kinded::Kind::DeallocActivationInstKind:
  case Kinded::Kind::TensorViewInstKind:
    break;

  case Kinded::Kind::InsertTensorInstKind: {
    // Emit a call to the "insert_tensor" kernel, selected by the destination
    // element type.
    auto *insertInst = llvm::cast<InsertTensorInst>(I);
    auto *out = insertInst->getDest();
    auto *input = insertInst->getSrc();
    auto offsetList = insertInst->getOffsets();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);

    auto *outDims = emitValueDims(builder, out);
    auto *inputDims = emitValueDims(builder, input);

    auto *outRank = emitConstDimT(builder, out->getType()->dims().size());
    auto *inputRank = emitConstDimT(builder, input->getType()->dims().size());
    auto *offsetsArr = emitConstDimTArray(builder, offsetList);
    auto *numOffsets = emitConstDimT(builder, offsetList.size());
    auto *countVal = emitConstDimT(builder, insertInst->getCount());
    auto *axisVal = emitConstDimT(builder, insertInst->getAxis());

    // Keep the offsets argument out of specialization: many inserts with
    // different offsets are typically generated, and specializing on this
    // argument does not speed things up.
    markArgAsUnspecialized(offsetsArr);

    auto *kernel = getFunction("insert_tensor", out->getElementType());
    createCall(builder, kernel,
               {outAddr, inputAddr, offsetsArr, outDims, inputDims, outRank,
                inputRank, numOffsets, countVal, axisVal});
    break;
  }

  case Kinded::Kind::ExtractTensorInstKind: {
    // Emit a call to the "extract_tensor" kernel, selected by the destination
    // element type.
    auto *extractInst = llvm::cast<ExtractTensorInst>(I);
    auto *out = extractInst->getDest();
    auto *input = extractInst->getSrc();
    auto offsetList = extractInst->getOffsets();
    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);

    auto *outDims = emitValueDims(builder, out);
    auto *inputDims = emitValueDims(builder, input);

    auto *outRank = emitConstDimT(builder, out->getType()->dims().size());
    auto *inputRank = emitConstDimT(builder, input->getType()->dims().size());
    auto *offsetsArr = emitConstDimTArray(builder, offsetList);
    auto *numOffsets = emitConstDimT(builder, offsetList.size());

    // Keep the offsets argument out of specialization: many extracts with
    // different offsets are typically generated, and specializing on this
    // argument does not speed things up.
    markArgAsUnspecialized(offsetsArr);

    auto *kernel = getFunction("extract_tensor", out->getElementType());
    createCall(builder, kernel,
               {inputAddr, outAddr, offsetsArr, inputDims, outDims, inputRank,
                outRank, numOffsets});
    break;
  }

  case Kinded::Kind::GatherInstKind: {
    // Emit a call to the "gather64" or "gather32" kernel, chosen by the
    // element type of the indices operand and selected by the destination
    // element type.
    auto *gatherInst = llvm::cast<GatherInst>(I);
    auto *out = gatherInst->getDest();
    auto *input = gatherInst->getData();
    auto *idx = gatherInst->getIndices();
    // The batch dimension count serves as the gather axis.
    unsigned gatherAxis = gatherInst->getBatchDims();

    auto *outAddr = emitValueAddress(builder, out);
    auto *inputAddr = emitValueAddress(builder, input);
    auto *idxAddr = emitValueAddress(builder, idx);

    auto *numIndices = emitConstDimT(builder, idx->size());

    auto *inputTy = input->getType();

    // Slice size of the data starting at the gather axis (one sample).
    size_t elemsPerSample = inputTy->getSliceSize(gatherAxis);
    // Slice size starting one dimension further in: the unit being gathered.
    size_t elemsPerSlice = inputTy->getSliceSize(gatherAxis + 1);
    // Number of samples: total element count divided by the sample size.
    size_t sampleCount = inputTy->size() / elemsPerSample;

    auto *elemsPerSliceVal = emitConstDimT(builder, elemsPerSlice);
    auto *sampleCountVal = emitConstDimT(builder, sampleCount);
    auto *elemsPerSampleVal = emitConstDimT(builder, elemsPerSample);

    // Pick the kernel variant that matches the width of the indices.
    llvm::Function *kernel = nullptr;
    if (idx->getElementType() == ElemKind::Int64ITy) {
      kernel = getFunction("gather64", out->getElementType());
    } else if (idx->getElementType() == ElemKind::Int32ITy) {
      kernel = getFunction("gather32", out->getElementType());
    }
    if (!kernel) {
      llvm_unreachable("Cannot get function for Gather. "
                       "Indices input of Gather has to be int32 or int64");
    }
    createCall(builder, kernel,
               {outAddr, inputAddr, idxAddr, numIndices, elemsPerSliceVal,
                sampleCountVal, elemsPerSampleVal});
    break;
  }

  case Kinded::Kind::GatherNDInstKind: {
    // Emit a call to the "gather_nd" kernel, selected by the element types of
    // the data and indices operands. All slice counts and sizes are computed
    // here from the static shapes and passed as constants.
    auto *GI = llvm::cast<GatherNDInst>(I);
    auto *dest = GI->getDest();
    auto *data = GI->getData();
    auto *indices = GI->getIndices();
    unsigned batchDims = GI->getBatchDims();

    auto dataDims = data->dims();
    auto indicesDims = indices->dims();

    // Guard the shape preconditions the index arithmetic below relies on;
    // without them `back()` and `indicesDimLast - 1` would access out of
    // bounds.
    assert(indicesDims.size() > 0 &&
           "GatherND indices must have at least one dimension");
    dim_t indicesDimLast = indicesDims.back();
    assert(indicesDimLast > 0 &&
           "GatherND last indices dimension must be non-zero");
    assert(batchDims + indicesDimLast <= dataDims.size() &&
           "GatherND batch dims plus index tuple size exceeds data rank");

    // Compute batch count.
    dim_t batchCount = 1;
    for (size_t idx = 0; idx < batchDims; ++idx) {
      batchCount *= dataDims[idx];
    }

    // Compute input slice count.
    dim_t inpSliceCount = 1;
    for (size_t idx = batchDims; idx < batchDims + indicesDimLast; ++idx) {
      inpSliceCount *= dataDims[idx];
    }

    // Compute output slice count.
    dim_t outSliceCount = 1;
    for (size_t idx = batchDims; idx < indicesDims.size() - 1; ++idx) {
      outSliceCount *= indicesDims[idx];
    }

    // Compute slice size (in bytes).
    dim_t sliceSize = data->getType()->getElementSize();
    for (size_t idx = batchDims + indicesDimLast; idx < dataDims.size();
         idx++) {
      sliceSize *= dataDims[idx];
    }

    // Get indices dimension products: entry i is the product of the data
    // dimensions that follow indexed dimension i, so the last entry is 1.
    std::vector<dim_t> indicesDimProd(indicesDimLast);
    indicesDimProd[indicesDimLast - 1] = 1;
    for (ssize_t idx = static_cast<ssize_t>(indicesDimLast) - 2; idx >= 0;
         idx--) {
      indicesDimProd[idx] =
          indicesDimProd[idx + 1] * dataDims[batchDims + idx + 1];
    }

    // Emit pointers.
    auto *destPtr = emitValueAddress(builder, dest);
    auto *dataPtr = emitValueAddress(builder, data);
    auto *indicesPtr = emitValueAddress(builder, indices);

    // Emit parameters.
    auto *batchCountArg = emitConstDimT(builder, batchCount);
    auto *inpSliceCountArg = emitConstDimT(builder, inpSliceCount);
    auto *outSliceCountArg = emitConstDimT(builder, outSliceCount);
    auto *sliceSizeArg = emitConstDimT(builder, sliceSize);
    auto *indicesDimLastArg = emitConstDimT(builder, indicesDimLast);
    auto *indicesDimProdArg =
        emitConstDimTArray(builder, llvm::makeArrayRef(indicesDimProd));

    llvm::Function *F = getFunction(
        "gather_nd", {data->getElementType(), indices->getElementType()});
    createCall(builder, F,
               {destPtr, dataPtr, indicesPtr, batchCountArg, inpSliceCountArg,
                outSliceCountArg, sliceSizeArg, indicesDimLastArg,
                indicesDimProdArg});
    break;
  }

  case Kinded::Kind::GatherRangesInstKind: {
    // Emit a call to the "gatherranges64" or "gatherranges32" kernel, chosen
    // by the element type of the ranges operand and selected by the output
    // element type.
    auto *rangesInst = llvm::cast<GatherRangesInst>(I);
    auto *out = rangesInst->getOutput();
    auto *lens = rangesInst->getLengths();
    auto *input = rangesInst->getData();
    auto *rangesVal = rangesInst->getRanges();

    auto *outAddr = emitValueAddress(builder, out);
    auto *lensAddr = emitValueAddress(builder, lens);
    auto *inputAddr = emitValueAddress(builder, input);
    auto *rangesAddr = emitValueAddress(builder, rangesVal);

    auto rangesTy = rangesVal->getType();

    // First ranges dimension: the number of examples.
    size_t exampleCount = rangesTy->dims()[0];
    // Second ranges dimension: the number of range pairs per example.
    size_t pairsPerExample = rangesTy->dims()[1];

    auto *exampleCountVal = emitConstDimT(builder, exampleCount);
    auto *pairsPerExampleVal = emitConstDimT(builder, pairsPerExample);

    // Pick the kernel variant that matches the width of the ranges.
    llvm::Function *kernel = nullptr;
    if (rangesVal->getElementType() == ElemKind::Int64ITy) {
      kernel = getFunction("gatherranges64", out->getElementType());
    } else if (rangesVal->getElementType() == ElemKind::Int32ITy) {
      kernel = getFunction("gatherranges32", out->getElementType());
    }
    if (!kernel) {
      llvm_unreachable("Cannot get function for GatherRanges. "
                       "Ranges input of GatherRanges has to be int32 or int64");
    }
    createCall(builder, kernel,
               {outAddr, lensAddr, inputAddr, rangesAddr, exampleCountVal,
                pairsPerExampleVal});
    break;
  }

  case Kinded::Kind::LengthsRangeFillInstKind: {
    // Emit a call to the "lengths_range_fill" kernel, selected by the
    // destination element type. Note that the lengths buffer is passed
    // before the destination buffer.
    auto *fillInst = llvm::cast<LengthsRangeFillInst>(I);
    auto *out = fillInst->getDest();
    auto *lens = fillInst->getLengths();

    auto *outAddr = emitValueAddress(builder, out);
    auto *lensAddr = emitValueAddress(builder, lens);

    auto *numLens = emitConstDimT(builder, lens->size());

    auto *kernel = getFunction("lengths_range_fill", out->getElementType());
    createCall(builder, kernel, {lensAddr, outAddr, numLens});
    break;
  }

  case Kinded::Kind::ScatterDataInstKind: {
    // Emit a call to the "scatterdata" kernel, selected by the element types
    // of the data and indices operands. Quantized data additionally passes
    // the scale and offset of both the data and the slices.
    auto *scatterInst = llvm::cast<ScatterDataInst>(I);
    auto *target = scatterInst->getData();
    auto *idx = scatterInst->getIndices();
    auto *updates = scatterInst->getSlices();

    auto *targetAddr = emitValueAddress(builder, target);
    auto *idxAddr = emitValueAddress(builder, idx);
    auto *updatesAddr = emitValueAddress(builder, updates);
    auto *targetDims = emitValueDims(builder, target);

    // First and second dimension of the indices operand.
    auto *idxCount = emitConstDimT(builder, idx->getType()->dims()[0]);
    auto *idxWidth = emitConstDimT(builder, idx->getType()->dims()[1]);
    auto *updatesTy = updates->getType();
    // Number of slice elements per entry of the first slices dimension.
    auto *elemsPerSlice =
        emitConstDimT(builder, updatesTy->size() / updatesTy->dims()[0]);
    auto *cumulativeFlag = emitConstI1(builder, scatterInst->getCumulative());
    auto *kernel = getFunction(
        "scatterdata", {target->getElementType(), idx->getElementType()});
    if (target->getType()->isQuantizedType()) {
      auto *targetScale = emitConstF32(builder, target->getType()->getScale());
      auto *targetOffset =
          emitConstI32(builder, target->getType()->getOffset());
      auto *updatesScale =
          emitConstF32(builder, updates->getType()->getScale());
      auto *updatesOffset =
          emitConstI32(builder, updates->getType()->getOffset());
      createCall(builder, kernel,
                 {targetAddr, targetDims, idxAddr, updatesAddr, idxCount,
                  idxWidth, elemsPerSlice, cumulativeFlag, targetScale,
                  targetOffset, updatesScale, updatesOffset});
    } else {
      createCall(builder, kernel,
                 {targetAddr, targetDims, idxAddr, updatesAddr, idxCount,
                  idxWidth, elemsPerSlice, cumulativeFlag});
    }
    break;
  }

  case Kinded::Kind::SparseLengthsSumInstKind: {
    // Emit a call to the "sparse_lengths_sum" kernel, selected by the element
    // types of the destination and of the indices operand.
    auto *slsInst = cast<SparseLengthsSumInst>(I);
    auto *out = slsInst->getDest();
    auto *table = slsInst->getData();
    auto *idx = slsInst->getIndices();
    auto *lens = slsInst->getLengths();
    auto *outAddr = emitValueAddress(builder, out);
    auto *tableAddr = emitValueAddress(builder, table);
    auto *idxAddr = emitValueAddress(builder, idx);
    auto *lensAddr = emitValueAddress(builder, lens);
    // One segment per entry of the first lengths dimension.
    auto *numSegments = emitConstDimT(builder, lens->dims()[0]);
    // Number of data elements per entry of the first data dimension.
    auto *elemsPerLine =
        emitConstDimT(builder, table->size() / table->dims()[0]);
    auto *kernel = getFunction(
        "sparse_lengths_sum", {out->getElementType(), idx->getElementType()});
    createCall(
        builder, kernel,
        {outAddr, tableAddr, idxAddr, lensAddr, numSegments, elemsPerLine});
    break;
  }

  case Kinded::Kind::SparseLengthsWeightedSumInstKind: {
    // Emit a call to the "sparse_lengths_weighted_sum" kernel, selected by
    // the element types of the destination and of the indices operand.
    auto *slwsInst = cast<SparseLengthsWeightedSumInst>(I);
    auto *out = slwsInst->getDest();
    auto *table = slwsInst->getData();
    auto *weightVals = slwsInst->getWeights();
    auto *idx = slwsInst->getIndices();
    auto *lens = slwsInst->getLengths();
    auto *outAddr = emitValueAddress(builder, out);
    auto *tableAddr = emitValueAddress(builder, table);
    auto *weightAddr = emitValueAddress(builder, weightVals);
    auto *idxAddr = emitValueAddress(builder, idx);
    auto *lensAddr = emitValueAddress(builder, lens);
    // One segment per entry of the first lengths dimension.
    auto *numSegments = emitConstDimT(builder, lens->dims()[0]);
    // Number of data elements per entry of the first data dimension.
    auto *elemsPerLine =
        emitConstDimT(builder, table->size() / table->dims()[0]);
    auto *kernel =
        getFunction("sparse_lengths_weighted_sum",
                    {out->getElementType(), idx->getElementType()});
    createCall(builder, kernel,
               {outAddr, tableAddr, weightAddr, idxAddr, lensAddr, numSegments,
                elemsPerLine});
    break;
  }

  case Kinded::Kind::EmbeddingInstKind: {
    // Emit a call to the "embedding" kernel, selected by the destination
    // element type. The weights operand is required to be two-dimensional.
    auto *embInst = cast<EmbeddingInst>(I);
    auto *out = embInst->getDest();
    auto *table = embInst->getWeights();
    auto *idx = embInst->getIndices();
    auto *padIdxVal = emitConstSizeT(builder, embInst->getPadIdx());
    auto *scaleFlag = emitConstI1(builder, embInst->getScale());
    auto *sparseFlag = emitConstI1(builder, embInst->getSparse());
    auto *outAddr = emitValueAddress(builder, out);
    auto *tableAddr = emitValueAddress(builder, table);
    auto *idxAddr = emitValueAddress(builder, idx);
    auto *idxDims = emitValueDims(builder, idx);
    auto *idxRank = emitConstDimT(builder, idx->dims().size());
    assert(table->dims().size() == 2 && "weights must be 2-D");
    // First and second dimension of the weights operand.
    auto *numRows = emitConstDimT(builder, table->dims()[0]);
    auto *rowWidth = emitConstDimT(builder, table->dims()[1]);
    auto *kernel = getFunction("embedding", out->getElementType());
    createCall(builder, kernel,
               {outAddr, tableAddr, idxAddr, idxDims, idxRank, numRows,
                rowWidth, padIdxVal, scaleFlag, sparseFlag});
    break;
  }

  case Kinded::Kind::EmbeddingBagInstKind: {
    // Emit a call to the "embedding_bag" kernel, selected by the destination
    // element type.
    auto *bagInst = cast<EmbeddingBagInst>(I);
    auto *out = bagInst->getDest();
    auto *table = bagInst->getData();
    auto *weightVals = bagInst->getWeights();
    auto *idx = bagInst->getIndices();
    auto *offs = bagInst->getOffsets();
    auto *endOffsetFlag = emitConstI1(builder, bagInst->getHasEndOffset());
    auto *outAddr = emitValueAddress(builder, out);
    auto *tableAddr = emitValueAddress(builder, table);
    auto *weightAddr = emitValueAddress(builder, weightVals);
    auto *idxAddr = emitValueAddress(builder, idx);
    auto *offsAddr = emitValueAddress(builder, offs);
    // First dimension of the offsets and of the indices, respectively.
    auto *numSegments = emitConstDimT(builder, offs->dims()[0]);
    auto *numIndices = emitConstDimT(builder, idx->dims()[0]);
    // Number of data elements per entry of the first data dimension.
    auto *elemsPerLine =
        emitConstDimT(builder, table->size() / table->dims()[0]);
    auto *kernel = getFunction("embedding_bag", out->getElementType());
    createCall(builder, kernel,
               {outAddr, tableAddr, weightAddr, idxAddr, offsAddr, numSegments,
                elemsPerLine, numIndices, endOffsetFlag});
    break;
  }

  case Kinded::Kind::SparseLengthsWeightedSumGradInstKind: {
    // Emit a call to the "sparse_lengths_weighted_sum_grad" kernel, selected
    // by the element types of the destination gradient and of the indices.
    auto *SI = cast<SparseLengthsWeightedSumGradInst>(I);
    auto *destGrad = SI->getDestGrad();
    auto *dataGrad = SI->getDataGrad();
    auto *weightsGrad = SI->getWeightsGrad();
    auto *data = SI->getData();
    auto *weights = SI->getWeights();
    auto *indices = SI->getIndices();
    auto *lengths = SI->getLengths();
    auto *destGradPtr = emitValueAddress(builder, destGrad);
    auto *dataGradPtr = emitValueAddress(builder, dataGrad);
    auto *weightsGradPtr = emitValueAddress(builder, weightsGrad);
    auto *dataPtr = emitValueAddress(builder, data);
    auto *weightsPtr = emitValueAddress(builder, weights);
    auto *indicesPtr = emitValueAddress(builder, indices);
    auto *lengthsPtr = emitValueAddress(builder, lengths);
    auto *segments = emitConstDimT(builder, lengths->dims()[0]);
    // Size of the data gradient buffer in bytes. Derived from the tensor's
    // own element size rather than a hard-coded sizeof(float), so that it
    // stays correct for any element type; for float it is unchanged.
    auto *dataGradRawSize = emitConstDimT(
        builder, dataGrad->size() * dataGrad->getType()->getElementSize());
    // Number of gradient elements per entry of the first dataGrad dimension.
    auto *lineSize =
        emitConstDimT(builder, dataGrad->size() / dataGrad->dims()[0]);
    auto *F =
        getFunction("sparse_lengths_weighted_sum_grad",
                    {destGrad->getElementType(), indices->getElementType()});
    createCall(builder, F,
               {destGradPtr, dataGradPtr, weightsGradPtr, dataPtr, weightsPtr,
                indicesPtr, lengthsPtr, segments, lineSize, dataGradRawSize});
    break;
  }

  // RowwiseQuantizedSparseLengthsWeightedSum: call the
  // "rowwise_quantized_sparse_lengths_weighted_sum" function selected by the
  // element types of the destination and of the indices.
  case Kinded::Kind::RowwiseQuantizedSparseLengthsWeightedSumInstKind: {
    auto *RQI = cast<RowwiseQuantizedSparseLengthsWeightedSumInst>(I);
    auto *outV = RQI->getDest();
    auto *dataV = RQI->getData();
    auto *scalesV = RQI->getScales();
    auto *offsetsV = RQI->getOffsets();
    auto *weightsV = RQI->getWeights();
    auto *indicesV = RQI->getIndices();
    auto *lengthsV = RQI->getLengths();

    // Addresses of the buffers backing every operand.
    auto *outAddr = emitValueAddress(builder, outV);
    auto *dataAddr = emitValueAddress(builder, dataV);
    auto *scalesAddr = emitValueAddress(builder, scalesV);
    auto *offsetsAddr = emitValueAddress(builder, offsetsV);
    auto *weightsAddr = emitValueAddress(builder, weightsV);
    auto *indicesAddr = emitValueAddress(builder, indicesV);
    auto *lengthsAddr = emitValueAddress(builder, lengthsV);

    // One segment per entry of lengths; rowLength is the number of data
    // elements per entry of data's first dimension.
    auto *numSegments = emitConstDimT(builder, lengthsV->dims()[0]);
    auto *rowLength = emitConstDimT(builder, dataV->size() / dataV->dims()[0]);

    auto *callee =
        getFunction("rowwise_quantized_sparse_lengths_weighted_sum",
                    {outV->getElementType(), indicesV->getElementType()});
    createCall(builder, callee,
               {outAddr, dataAddr, scalesAddr, offsetsAddr, weightsAddr,
                indicesAddr, lengthsAddr, numSegments, rowLength});
    break;
  }

  // FusedRowwiseQuantizedSparseLengthsWeightedSum: call the
  // "fused_rowwise_quantized_sparse_lengths_weighted_sum" function selected by
  // the element types of the destination and of the indices.
  case Kinded::Kind::FusedRowwiseQuantizedSparseLengthsWeightedSumInstKind: {
    auto *FRQI = cast<FusedRowwiseQuantizedSparseLengthsWeightedSumInst>(I);
    auto *outV = FRQI->getDest();
    auto *dataV = FRQI->getData();
    auto *weightsV = FRQI->getWeights();
    auto *indicesV = FRQI->getIndices();
    auto *lengthsV = FRQI->getLengths();

    // Addresses of the buffers backing every operand.
    auto *outAddr = emitValueAddress(builder, outV);
    auto *dataAddr = emitValueAddress(builder, dataV);
    auto *weightsAddr = emitValueAddress(builder, weightsV);
    auto *indicesAddr = emitValueAddress(builder, indicesV);
    auto *lengthsAddr = emitValueAddress(builder, lengthsV);

    // One segment per entry of lengths. The input and output row lengths are
    // computed separately, each as total size over the first dimension.
    auto *numSegments = emitConstDimT(builder, lengthsV->dims()[0]);
    auto *inRowLength =
        emitConstDimT(builder, dataV->size() / dataV->dims()[0]);
    auto *outRowLength = emitConstDimT(builder, outV->size() / outV->dims()[0]);

    auto *callee =
        getFunction("fused_rowwise_quantized_sparse_lengths_weighted_sum",
                    {outV->getElementType(), indicesV->getElementType()});
    createCall(builder, callee,
               {outAddr, dataAddr, weightsAddr, indicesAddr, lengthsAddr,
                numSegments, inRowLength, outRowLength});
    break;
  }

  // EmbeddingBagByteRowwiseOffsets: call the
  // "embedding_bag_byte_rowwise_offsets" function selected by the destination
  // element type.
  case Kinded::Kind::EmbeddingBagByteRowwiseOffsetsInstKind: {
    auto *EBI = cast<EmbeddingBagByteRowwiseOffsetsInst>(I);
    auto *outV = EBI->getDest();
    auto *dataV = EBI->getData();
    auto *weightsV = EBI->getWeights();
    auto *indicesV = EBI->getIndices();
    auto *offsetsV = EBI->getOffsets();

    // Constant flag taken from the instruction's HasEndOffset attribute.
    auto *endOffsetFlag = emitConstI1(builder, EBI->getHasEndOffset());

    // Addresses of the buffers backing every operand.
    auto *outAddr = emitValueAddress(builder, outV);
    auto *dataAddr = emitValueAddress(builder, dataV);
    auto *weightsAddr = emitValueAddress(builder, weightsV);
    auto *indicesAddr = emitValueAddress(builder, indicesV);
    auto *offsetsAddr = emitValueAddress(builder, offsetsV);

    // Leading dimensions of offsets and indices, then the per-row element
    // counts of the input data and of the output.
    auto *numSegments = emitConstDimT(builder, offsetsV->dims()[0]);
    auto *indexCount = emitConstDimT(builder, indicesV->dims()[0]);
    auto *inRowLength =
        emitConstDimT(builder, dataV->size() / dataV->dims()[0]);
    auto *outRowLength = emitConstDimT(builder, outV->size() / outV->dims()[0]);

    auto *callee = getFunction("embedding_bag_byte_rowwise_offsets",
                               outV->getElementType());
    createCall(builder, callee,
               {outAddr, dataAddr, weightsAddr, indicesAddr, offsetsAddr,
                numSegments, indexCount, inRowLength, outRowLength,
                endOffsetFlag});
    break;
  }

  // DebugPrint: dump the source tensor, either to the console or to a file in
  // binary/text form, depending on the instruction's format attribute. The
  // "raw" file formats pass an empty header string instead of the textual
  // description of the tensor type.
  case Kinded::Kind::DebugPrintInstKind: {
    auto *printInst = llvm::cast<DebugPrintInst>(I);
    auto *tensor = printInst->getSrc();

    // The dump functions receive the tensor payload as an i8 pointer.
    auto *payload = builder.CreateBitCast(emitValueAddress(builder, tensor),
                                          builder.getInt8PtrTy());
    auto *dimsPtr = emitValueDims(builder, tensor);
    auto *rank = emitConstDimT(builder, tensor->getType()->dims().size());
    auto *elemCount = emitConstSizeT(builder, tensor->getType()->size());
    auto *byteCount =
        emitConstSizeT(builder, tensor->getType()->getSizeInBytes());
    auto *elemKind =
        emitConstDimT(builder, static_cast<size_t>(tensor->getElementType()));
    auto *instrName = emitStringConst(builder, I->getName());
    auto *outFile = emitStringConst(builder, printInst->getFileName());
    auto typeDesc = tensor->getType()->toString();

    const std::string mode = printInst->getFormat();
    const bool isBin = mode == "bin" || mode == "rawbin";
    const bool isTxt = mode == "txt" || mode == "rawtxt";
    const bool isRaw = mode == "rawbin" || mode == "rawtxt";

    if (mode == "console") {
      // Dump tensor in console.
      auto *dumpFn = getFunction("dump_tensor_console");
      createCall(builder, dumpFn,
                 {payload, dimsPtr, rank, elemKind, instrName});
    } else if (isBin) {
      // Dump tensor in file in (raw) binary format: size is given in bytes.
      auto *dumpFn = getFunction("dump_tensor_bin");
      auto *header = isRaw ? emitStringConst(builder, "")
                           : emitStringConst(builder, typeDesc);
      createCall(builder, dumpFn, {payload, byteCount, outFile, header});
    } else if (isTxt) {
      // Dump tensor in file in (raw) text format: size is given in elements
      // and the function is selected by the tensor element type.
      auto *dumpFn = getFunction("dump_tensor_txt", tensor->getElementType());
      auto *header = isRaw ? emitStringConst(builder, "")
                           : emitStringConst(builder, typeDesc);
      createCall(builder, dumpFn, {payload, elemCount, outFile, header});
    } else {
      LOG(FATAL) << "Invalid 'Format' attribute for DebugPrint instruction!";
    }
    break;
  }

  // Instrument: emit a call to the "instrument_before" or "instrument_after"
  // callback for the instruction referenced by this InstrumentInst. The
  // callback receives the instrumentation ID, the kind of the instrumented
  // instruction, the number of its inputs and outputs, and two pointers into
  // the opInfo buffer: one to an array of operand addresses and one to an
  // array of operand sizes in bytes.
  case Kinded::Kind::InstrumentInstKind: {
    auto *instrumentI = llvm::cast<InstrumentInst>(I);
    auto *opInfo = instrumentI->getOperandsInfo();

    // Instruction being instrumented.
    Instruction *instrRef = instrumentI->getInstrRef();

    // Emit instruction ID and instruction kind.
    // All integer arguments use the integer width reported by
    // getLibjitIntWidth().
    llvm::Type *intTy =
        llvm::Type::getIntNTy(getLLVMContext(), getLibjitIntWidth());
    auto *ID = llvm::ConstantInt::get(intTy, instrumentI->getID());
    auto *kind = llvm::ConstantInt::get(intTy, (int)(instrRef->getKind()));

    // Emit number of input and output operands.
    auto inpNum = instrRef->getNumInputs();
    auto outNum = instrRef->getNumOutputs();
    auto opNum = inpNum + outNum;
    auto *opInp = llvm::ConstantInt::get(intTy, inpNum);
    auto *opOut = llvm::ConstantInt::get(intTy, outNum);

    // Emit opInfo address as uint8_t*.
    // NOTE(review): this assert only requires room for two int64_t values and
    // does not scale with opNum, although the layout below places opNum
    // addresses followed by opNum sizes in the buffer — confirm that the
    // buffer is sized accordingly where it is allocated.
    assert(opInfo->getType()->getSizeInBytes() >= 2 * sizeof(int64_t) &&
           "Not enough memory allocated for instrumentation!");
    auto *opInfoPtr = emitValueAddress(builder, opInfo);
    opInfoPtr = builder.CreateBitCast(opInfoPtr, builder.getInt8PtrTy());

    // Emit opAddr address as uint8_t** starting from offset 0.
    auto *opAddrPtr =
        builder.CreateGEP(opInfoPtr, llvm::ConstantInt::get(intTy, 0));
    opAddrPtr = builder.CreateBitCast(opAddrPtr,
                                      builder.getInt8PtrTy()->getPointerTo());

    // Emit opSize address as int* starting from offset opNum * sizeof(int64_t).
    // The GEP is performed on the i8 pointer, so the offset is in bytes.
    auto *opSizePtr = builder.CreateGEP(
        opInfoPtr, llvm::ConstantInt::get(intTy, opNum * sizeof(int64_t)));
    opSizePtr = builder.CreateBitCast(opSizePtr, intTy->getPointerTo());

    // Generate instrumentation.
    auto instrumentKind = instrumentI->getInstrumentKind();
    if (instrumentKind == InstrumentKind::Before) {

      // Operands addresses and sizes.
      // Both arrays are filled in the same order: first the operands gathered
      // by the "input" loop, then those gathered by the "output" loop.
      std::vector<llvm::Value *> opAddrArray;
      std::vector<llvm::Value *> opSizeArray;

      // Get addresses and sizes for the input operands.
      // Every operand whose kind is not Out is collected here.
      for (const auto &op : instrRef->getOperands()) {
        if (op.second == OperandKind::Out) {
          continue;
        }
        // Emit operand address as uint8_t* variable.
        auto *opAddr = emitValueAddress(builder, op.first);
        opAddr = builder.CreateBitCast(opAddr, builder.getInt8PtrTy());
        opAddrArray.push_back(opAddr);
        // Emit operand size in bytes as int constant.
        auto *opSize = llvm::ConstantInt::get(
            intTy, op.first->getType()->getSizeInBytes());
        opSizeArray.push_back(opSize);
      }
      assert(opAddrArray.size() == inpNum && "Inconsistent size!");

      // Get addresses and sizes for the output operands.
      // Every operand whose kind is not In is collected here, so an operand
      // whose kind is neither In nor Out is recorded by both loops.
      for (const auto &op : instrRef->getOperands()) {
        if (op.second == OperandKind::In) {
          continue;
        }
        // Emit operand address as uint8_t* variable.
        auto *opAddr = emitValueAddress(builder, op.first);
        opAddr = builder.CreateBitCast(opAddr, builder.getInt8PtrTy());
        opAddrArray.push_back(opAddr);
        // Emit operand size in bytes as int constant.
        auto *opSize = llvm::ConstantInt::get(
            intTy, op.first->getType()->getSizeInBytes());
        opSizeArray.push_back(opSize);
      }
      assert(opAddrArray.size() == opNum && "Inconsistent size!");

      // Write the addresses of the operands in the opAddr.
      emitArrayStore(builder, opAddrArray, opAddrPtr);

      // Write the sizes of the operands in opSize.
      emitArrayStore(builder, opSizeArray, opSizePtr);

      // Create callback call.
      auto *F = getFunction("instrument_before");
      createCall(builder, F, {ID, kind, opInp, opOut, opAddrPtr, opSizePtr});

    } else if (instrumentKind == InstrumentKind::After) {

      // Create callback call.
      // No stores are emitted on this path: the callback is handed the same
      // opAddr/opSize pointers. Presumably it relies on the values written by
      // the matching "Before" instrumentation — confirm.
      auto *F = getFunction("instrument_after");
      createCall(builder, F, {ID, kind, opInp, opOut, opAddrPtr, opSizePtr});

    } else {
      llvm_unreachable("Instrumentation kind not supported!");
    }
    // Print the IR instrumentation callback API.
    printInstrumentIR_ = true;
    break;
  }

  // TraceEvent: call "write_timestamp" with the address of the event data
  // buffer and the index stored in the instruction.
  case Kinded::Kind::TraceEventInstKind: {
    auto *traceInst = llvm::cast<TraceEventInst>(I);
    auto *eventBuf = traceInst->getData();
    auto *slotIdx = emitConstDimT(builder, traceInst->getIndex());
    auto *eventBufAddr = emitValueAddress(builder, eventBuf);
    auto *callee = getFunction("write_timestamp");
    createCall(builder, callee, {eventBufAddr, slotIdx});
    break;
  }

  // ResizeNearest: call the "resizenearest" function selected by the input
  // element type, passing the scale factors as a constant float array.
  case Kinded::Kind::ResizeNearestInstKind: {
    auto *resizeInst = llvm::cast<ResizeNearestInst>(I);
    auto *outV = resizeInst->getDest();
    auto *inV = resizeInst->getSrc();
    auto *outAddr = emitValueAddress(builder, outV);
    auto *inAddr = emitValueAddress(builder, inV);

    auto *scales = emitConstFloatArray(builder, resizeInst->getScale());
    auto *outDims = emitValueDims(builder, outV);
    auto *inDims = emitValueDims(builder, inV);
    auto *callee = getFunction("resizenearest", inV->getElementType());
    // The source dimensions are passed before the destination dimensions.
    createCall(builder, callee, {outAddr, inAddr, scales, inDims, outDims});
    break;
  }

  // ResizeBilinear: call the "resizebilinear" function selected by the input
  // element type. Scale entries 0 and 3 must both be 1.0.
  case Kinded::Kind::ResizeBilinearInstKind: {
    auto *resizeInst = llvm::cast<ResizeBilinearInst>(I);
    auto *outV = resizeInst->getDest();
    auto *inV = resizeInst->getSrc();
    auto *outAddr = emitValueAddress(builder, outV);
    auto *inAddr = emitValueAddress(builder, inV);

    // Reject scaling along the dimensions reported as unsupported.
    CHECK_EQ(resizeInst->getScale()[0], 1.0) << "Scaling batch not supported.";
    CHECK_EQ(resizeInst->getScale()[3], 1.0)
        << "Scaling channel not supported.";

    auto *scales = emitConstFloatArray(builder, resizeInst->getScale());
    auto *outDims = emitValueDims(builder, outV);
    auto *inDims = emitValueDims(builder, inV);
    auto *callee = getFunction("resizebilinear", inV->getElementType());
    // The source dimensions are passed before the destination dimensions.
    createCall(builder, callee, {outAddr, inAddr, scales, inDims, outDims});
    break;
  }

  // NonMaxSuppression: call the "nms" function selected by the element type
  // of the indices output, passing the operand addresses, their dimensions and
  // the instruction attributes as constants.
  case Kinded::Kind::NonMaxSuppressionInstKind: {
    auto *nmsInst = llvm::cast<NonMaxSuppressionInst>(I);
    auto boxesV = nmsInst->getBoxes();
    auto scoresV = nmsInst->getScores();
    auto indicesV = nmsInst->getIndices();
    auto numSelectedV = nmsInst->getNumberOfSelectedIndices();

    // Attributes, read into locals of the types used for the constants below.
    const float iouLimit = nmsInst->getIouThreshold();
    const int64_t maxPerClass = nmsInst->getMaxOutputBoxesPerClass();
    const float scoreLimit = nmsInst->getScoreThreshold();
    const int centerPointMode = nmsInst->getCenterPointBox();
    const bool tfV4 = nmsInst->getIsTFVersion4();

    // Operand addresses.
    auto *boxesAddr = emitValueAddress(builder, boxesV);
    auto *scoresAddr = emitValueAddress(builder, scoresV);
    auto *indicesAddr = emitValueAddress(builder, indicesV);
    auto *numSelectedAddr = emitValueAddress(builder, numSelectedV);

    // Attribute constants.
    auto *maxPerClassC = emitConstI32(builder, maxPerClass);
    auto *centerPointModeC = emitConstI32(builder, centerPointMode);
    auto *iouLimitC = emitConstF32(builder, iouLimit);
    auto *scoreLimitC = emitConstF32(builder, scoreLimit);

    // Dimension arrays and their lengths.
    auto *boxesDims = emitValueDims(builder, boxesV);
    auto *scoresDims = emitValueDims(builder, scoresV);
    auto *indicesDims = emitValueDims(builder, indicesV);
    auto *boxesRank = emitConstDimT(builder, boxesV->dims().size());
    auto *scoresRank = emitConstDimT(builder, scoresV->dims().size());
    auto *indicesRank = emitConstDimT(builder, indicesV->dims().size());
    auto *tfV4C = emitConstI1(builder, tfV4);

    auto *callee = getFunction("nms", indicesV->getElementType());
    createCall(builder, callee,
               {indicesAddr, numSelectedAddr, boxesAddr, boxesDims, boxesRank,
                scoresAddr, scoresDims, scoresRank, indicesDims, indicesRank,
                centerPointModeC, maxPerClassC, iouLimitC, scoreLimitC,
                tfV4C});
    break;
  }

  // TFLiteDetectionPostProcess: call "tflite_detection_post_process_f" with
  // the operand addresses and the instruction attributes as constants. The
  // four box scales are passed as reciprocals. Only a first boxes dimension
  // of 1 is accepted.
  case Kinded::Kind::TFLiteDetectionPostProcessInstKind: {
    auto *postProc = llvm::cast<TFLiteDetectionPostProcessInst>(I);
    auto boxesV = postProc->getBoxes();
    auto scoresV = postProc->getScores();
    auto anchorsV = postProc->getAnchors();
    auto outBoxesV = postProc->getDetectionBoxes();
    auto outClassesV = postProc->getDetectionClasses();
    auto outScoresV = postProc->getDetectionScores();
    auto outCountV = postProc->getNumDetections();
    auto scratchV = postProc->getScratch();

    // Operand addresses.
    auto *boxesAddr = emitValueAddress(builder, boxesV);
    auto *scoresAddr = emitValueAddress(builder, scoresV);
    auto *anchorsAddr = emitValueAddress(builder, anchorsV);
    auto *outBoxesAddr = emitValueAddress(builder, outBoxesV);
    auto *outClassesAddr = emitValueAddress(builder, outClassesV);
    auto *outScoresAddr = emitValueAddress(builder, outScoresV);
    auto *outCountAddr = emitValueAddress(builder, outCountV);
    auto *scratchAddr = emitValueAddress(builder, scratchV);

    // Sizes read from the operand shapes.
    auto *boxCount = emitConstI32(builder, boxesV->dims()[1]);
    auto *totalClassCount = emitConstI32(builder, scoresV->dims()[2]);

    // Attributes of the instruction.
    auto *classCount = emitConstI32(builder, postProc->getNumClasses());
    auto *detectionLimit = emitConstI32(builder, postProc->getMaxDetections());
    auto *classesPerDetectionLimit =
        emitConstI32(builder, postProc->getMaxClassesPerDetection());
    auto *detectionsPerClassLimit =
        emitConstI32(builder, postProc->getMaxDetectionsPerClass());
    auto *iouLimit = emitConstF32(builder, postProc->getIouThreshold());
    auto *scoreLimit = emitConstF32(builder, postProc->getScoreThreshold());
    auto *invXScale = emitConstF32(builder, 1.0f / postProc->getXScale());
    auto *invYScale = emitConstF32(builder, 1.0f / postProc->getYScale());
    auto *invHScale = emitConstF32(builder, 1.0f / postProc->getHScale());
    auto *invWScale = emitConstF32(builder, 1.0f / postProc->getWScale());
    auto *useRegularNMS = emitConstI1(builder, postProc->getRegularNMS());

    // Current implementation only supports batch size 1.
    assert(boxesV->dims()[0] == 1 &&
           "TFLiteDetectionPostProcess batch not supported!");

    // Emit the call.
    auto *callee = getFunction("tflite_detection_post_process_f");
    createCall(builder, callee,
               {boxesAddr,
                scoresAddr,
                anchorsAddr,
                outBoxesAddr,
                outClassesAddr,
                outScoresAddr,
                outCountAddr,
                scratchAddr,
                boxCount,
                totalClassCount,
                classCount,
                detectionLimit,
                classesPerDetectionLimit,
                detectionsPerClassLimit,
                iouLimit,
                scoreLimit,
                invXScale,
                invYScale,
                invHScale,
                invWScale,
                useRegularNMS});
    break;
  }

  // AudioSpectrogram: call the "audio_spectrogram" function selected by the
  // spectrogram element type, passing the scratch buffers, the precomputed
  // tables and the window attributes.
  case Kinded::Kind::AudioSpectrogramInstKind: {
    auto *specInst = llvm::cast<AudioSpectrogramInst>(I);
    auto winScratchV = specInst->getWinOutScratch();
    auto fftScratchV = specInst->getFftOutScratch();
    auto spectrogramV = specInst->getSpectrogram();
    auto inputV = specInst->getInput();
    auto windowV = specInst->getWindow();
    auto twiddleV = specInst->getTwiddleFactors();
    auto bitReverseV = specInst->getBitReverseIndices();
    auto complexToRealV = specInst->getComplexToRealWeights();

    // Attributes, read into locals of the types used for the constants below.
    const int64_t winSize = specInst->getWindowSize();
    const int64_t winStride = specInst->getWindowStride();
    const bool squaredMagnitude = specInst->getMagnitudeSquared();

    // Operand addresses.
    auto *winScratchAddr = emitValueAddress(builder, winScratchV);
    auto *fftScratchAddr = emitValueAddress(builder, fftScratchV);
    auto *spectrogramAddr = emitValueAddress(builder, spectrogramV);
    auto *inputAddr = emitValueAddress(builder, inputV);
    auto *windowAddr = emitValueAddress(builder, windowV);
    auto *twiddleAddr = emitValueAddress(builder, twiddleV);
    auto *bitReverseAddr = emitValueAddress(builder, bitReverseV);
    auto *complexToRealAddr = emitValueAddress(builder, complexToRealV);

    // Output dimensions, total input length and attribute constants.
    auto *spectrogramDims = emitValueDims(builder, spectrogramV);
    auto *inputLen = emitConstDimT(builder, inputV->size());
    auto *winSizeC = emitConstDimT(builder, winSize);
    auto *winStrideC = emitConstDimT(builder, winStride);
    auto *squaredMagnitudeC = emitConstI1(builder, squaredMagnitude);

    auto *callee =
        getFunction("audio_spectrogram", spectrogramV->getElementType());
    createCall(builder, callee,
               {winScratchAddr, fftScratchAddr, spectrogramAddr, inputAddr,
                windowAddr, twiddleAddr, bitReverseAddr, complexToRealAddr,
                spectrogramDims, inputLen, winSizeC, winStrideC,
                squaredMagnitudeC});
    break;
  }

  // MFCC: call the "mfcc" function selected by the coefficients element type,
  // passing the scratch buffer, the lookup tables, the dimensions of the
  // coefficients and spectrogram, and the filter bank count.
  case Kinded::Kind::MFCCInstKind: {
    auto *mfccInst = llvm::cast<MFCCInst>(I);
    auto scratchV = mfccInst->getScratch();
    auto coeffsV = mfccInst->getCoefficients();
    auto spectrogramV = mfccInst->getSpectrogram();
    auto melWeightsV = mfccInst->getMelWeights();
    auto melRangesV = mfccInst->getMelRanges();
    auto dctMatV = mfccInst->getDctMat();
    const int64_t numFilterBanks = mfccInst->getFilterBankCount();

    // Operand addresses.
    auto *scratchAddr = emitValueAddress(builder, scratchV);
    auto *coeffsAddr = emitValueAddress(builder, coeffsV);
    auto *spectrogramAddr = emitValueAddress(builder, spectrogramV);
    auto *melWeightsAddr = emitValueAddress(builder, melWeightsV);
    auto *melRangesAddr = emitValueAddress(builder, melRangesV);
    auto *dctMatAddr = emitValueAddress(builder, dctMatV);

    // Dimension arrays and the filter bank count constant.
    auto *coeffsDims = emitValueDims(builder, coeffsV);
    auto *spectrogramDims = emitValueDims(builder, spectrogramV);
    auto *numFilterBanksC = emitConstDimT(builder, numFilterBanks);

    auto *callee = getFunction("mfcc", coeffsV->getElementType());
    createCall(builder, callee,
               {scratchAddr, coeffsAddr, spectrogramAddr, melWeightsAddr,
                melRangesAddr, dctMatAddr, coeffsDims, spectrogramDims,
                numFilterBanksC});
    break;
  }

  // ConvertTo: call the "convertTo" function selected by the (output, input)
  // element type pair, passing the output dimensions and their count.
  case Kinded::Kind::ConvertToInstKind: {
    auto *convInst = llvm::cast<ConvertToInst>(I);
    auto *srcV = convInst->getInput();
    auto *dstV = convInst->getResult();

    auto *srcAddr = emitValueAddress(builder, srcV);
    auto *dstAddr = emitValueAddress(builder, dstV);
    auto *dstDims = emitValueDims(builder, dstV);
    auto *dstRank = emitConstDimT(builder, dstV->dims().size());

    auto *callee = getFunction(
        "convertTo", {dstV->getElementType(), srcV->getElementType()});

    // The destination address comes first in the argument list.
    createCall(builder, callee, {dstAddr, srcAddr, dstDims, dstRank});
    break;
  }

  // No case above matched this instruction kind: render the instruction as
  // text and report it through LOG(FATAL).
  default:
    std::string dumpText;
    llvm::raw_string_ostream dumpStream(dumpText);
    I->dump(dumpStream);
    LOG(FATAL) << "Cannot select the instruction: " << dumpStream.str();
  }
}