void X86DAGToDAGISel::Select(SDNode *Node)

in llvm/lib/Target/X86/X86ISelDAGToDAG.cpp [4705:6102]


void X86DAGToDAGISel::Select(SDNode *Node) {
  MVT NVT = Node->getSimpleValueType(0);
  unsigned Opcode = Node->getOpcode();
  SDLoc dl(Node);

  if (Node->isMachineOpcode()) {
    LLVM_DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << '\n');
    Node->setNodeId(-1);
    return;   // Already selected.
  }

  switch (Opcode) {
  default: break;
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntNo = Node->getConstantOperandVal(1);
    switch (IntNo) {
    default: break;
    case Intrinsic::x86_encodekey128:
    case Intrinsic::x86_encodekey256: {
      if (!Subtarget->hasKL())
        break;

      unsigned Opcode;
      switch (IntNo) {
      default: llvm_unreachable("Impossible intrinsic");
      case Intrinsic::x86_encodekey128: Opcode = X86::ENCODEKEY128; break;
      case Intrinsic::x86_encodekey256: Opcode = X86::ENCODEKEY256; break;
      }
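      // ENCODEKEY128/256 read the input key implicitly from XMM0 (and XMM1
      // for the 256-bit form), so copy the key operands into those physical
      // registers and glue the copies to the instruction.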

      SDValue Chain = Node->getOperand(0);
      Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(3),
                                   SDValue());
      if (Opcode == X86::ENCODEKEY256)
        Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(4),
                                     Chain.getValue(1));

      MachineSDNode *Res = CurDAG->getMachineNode(
          Opcode, dl, Node->getVTList(),
          {Node->getOperand(2), Chain, Chain.getValue(1)});
      ReplaceNode(Node, Res);
      return;
    }
    case Intrinsic::x86_tileloadd64_internal:
    case Intrinsic::x86_tileloaddt164_internal: {
      if (!Subtarget->hasAMXTILE())
        break;
      unsigned Opc = IntNo == Intrinsic::x86_tileloadd64_internal
                         ? X86::PTILELOADDV
                         : X86::PTILELOADDT1V;
      // _tile_loadd_internal(row, col, buf, STRIDE)
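      // The buffer pointer becomes the base and the stride the index of the
      // standard five-operand x86 memory reference (scale 1, zero
      // displacement, no segment).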
      SDValue Base = Node->getOperand(4);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(5);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      SDValue Ops[] = {Node->getOperand(2),
                       Node->getOperand(3),
                       Base,
                       Scale,
                       Index,
                       Disp,
                       Segment,
                       Chain};
      CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
      ReplaceNode(Node, CNode);
      return;
    }
    }
    break;
  }
  case ISD::INTRINSIC_VOID: {
    unsigned IntNo = Node->getConstantOperandVal(1);
    switch (IntNo) {
    default: break;
    case Intrinsic::x86_sse3_monitor:
    case Intrinsic::x86_monitorx:
    case Intrinsic::x86_clzero: {
      bool Use64BitPtr = Node->getOperand(2).getValueType() == MVT::i64;

      unsigned Opc = 0;
      switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_sse3_monitor:
        if (!Subtarget->hasSSE3())
          break;
        Opc = Use64BitPtr ? X86::MONITOR64rrr : X86::MONITOR32rrr;
        break;
      case Intrinsic::x86_monitorx:
        if (!Subtarget->hasMWAITX())
          break;
        Opc = Use64BitPtr ? X86::MONITORX64rrr : X86::MONITORX32rrr;
        break;
      case Intrinsic::x86_clzero:
        if (!Subtarget->hasCLZERO())
          break;
        Opc = Use64BitPtr ? X86::CLZERO64r : X86::CLZERO32r;
        break;
      }
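      // Opc stays zero when the required subtarget feature is missing; in
      // that case fall through to the normal selection path below.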

      if (Opc) {
        unsigned PtrReg = Use64BitPtr ? X86::RAX : X86::EAX;
        SDValue Chain = CurDAG->getCopyToReg(Node->getOperand(0), dl, PtrReg,
                                             Node->getOperand(2), SDValue());
        SDValue InFlag = Chain.getValue(1);

        if (IntNo == Intrinsic::x86_sse3_monitor ||
            IntNo == Intrinsic::x86_monitorx) {
          // Copy the other two operands to ECX and EDX.
          Chain = CurDAG->getCopyToReg(Chain, dl, X86::ECX, Node->getOperand(3),
                                       InFlag);
          InFlag = Chain.getValue(1);
          Chain = CurDAG->getCopyToReg(Chain, dl, X86::EDX, Node->getOperand(4),
                                       InFlag);
          InFlag = Chain.getValue(1);
        }

        MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other,
                                                      { Chain, InFlag});
        ReplaceNode(Node, CNode);
        return;
      }

      break;
    }
    case Intrinsic::x86_tilestored64_internal: {
      unsigned Opc = X86::PTILESTOREDV;
      // _tile_stored_internal(row, col, buf, STRIDE, c)
      SDValue Base = Node->getOperand(4);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(5);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      SDValue Ops[] = {Node->getOperand(2),
                       Node->getOperand(3),
                       Base,
                       Scale,
                       Index,
                       Disp,
                       Segment,
                       Node->getOperand(6),
                       Chain};
      CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      ReplaceNode(Node, CNode);
      return;
    }
    case Intrinsic::x86_tileloadd64:
    case Intrinsic::x86_tileloaddt164:
    case Intrinsic::x86_tilestored64: {
      if (!Subtarget->hasAMXTILE())
        break;
      unsigned Opc;
      switch (IntNo) {
      default: llvm_unreachable("Unexpected intrinsic!");
      case Intrinsic::x86_tileloadd64:   Opc = X86::PTILELOADD; break;
      case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
      case Intrinsic::x86_tilestored64:  Opc = X86::PTILESTORED; break;
      }
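      // The tile register number is passed to the pseudo as an i8 immediate.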
      // FIXME: Match displacement and scale.
      unsigned TIndex = Node->getConstantOperandVal(2);
      SDValue TReg = getI8Imm(TIndex, dl);
      SDValue Base = Node->getOperand(3);
      SDValue Scale = getI8Imm(1, dl);
      SDValue Index = Node->getOperand(4);
      SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
      SDValue Segment = CurDAG->getRegister(0, MVT::i16);
      SDValue Chain = Node->getOperand(0);
      MachineSDNode *CNode;
      if (Opc == X86::PTILESTORED) {
        SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
        CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      } else {
        SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
        CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
      }
      ReplaceNode(Node, CNode);
      return;
    }
    }
    break;
  }
  case ISD::BRIND:
  case X86ISD::NT_BRIND: {
    if (Subtarget->isTargetNaCl())
      // NaCl has its own pass where jmp %r32 is converted to jmp %r64. We
      // leave the instruction alone.
      break;
    if (Subtarget->isTarget64BitILP32()) {
      // Converts a 32-bit register to a 64-bit, zero-extended version of
      // it. This is needed because x86-64 can do many things, but jmp %r32
      // ain't one of them.
      SDValue Target = Node->getOperand(1);
      assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
      SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
      SDValue Brind = CurDAG->getNode(Opcode, dl, MVT::Other,
                                      Node->getOperand(0), ZextTarget);
      ReplaceNode(Node, Brind.getNode());
      SelectCode(ZextTarget.getNode());
      SelectCode(Brind.getNode());
      return;
    }
    break;
  }
  case X86ISD::GlobalBaseReg:
    ReplaceNode(Node, getGlobalBaseReg());
    return;

  case ISD::BITCAST:
    // Just drop all 128/256/512-bit bitcasts.
    if (NVT.is512BitVector() || NVT.is256BitVector() || NVT.is128BitVector() ||
        NVT == MVT::f128) {
      ReplaceUses(SDValue(Node, 0), Node->getOperand(0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;

  case ISD::SRL:
    if (matchBitExtract(Node))
      return;
    LLVM_FALLTHROUGH;
  case ISD::SRA:
  case ISD::SHL:
    if (tryShiftAmountMod(Node))
      return;
    break;

  case X86ISD::VPTERNLOG: {
    uint8_t Imm = cast<ConstantSDNode>(Node->getOperand(3))->getZExtValue();
    if (matchVPTERNLOG(Node, Node, Node, Node, Node->getOperand(0),
                       Node->getOperand(1), Node->getOperand(2), Imm))
      return;
    break;
  }

  case X86ISD::ANDNP:
    if (tryVPTERNLOG(Node))
      return;
    break;

  case ISD::AND:
    if (NVT.isVector() && NVT.getVectorElementType() == MVT::i1) {
      // Try to form a masked VPTESTM. Operands can be in either order.
      SDValue N0 = Node->getOperand(0);
      SDValue N1 = Node->getOperand(1);
      if (N0.getOpcode() == ISD::SETCC && N0.hasOneUse() &&
          tryVPTESTM(Node, N0, N1))
        return;
      if (N1.getOpcode() == ISD::SETCC && N1.hasOneUse() &&
          tryVPTESTM(Node, N1, N0))
        return;
    }

    if (MachineSDNode *NewNode = matchBEXTRFromAndImm(Node)) {
      ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
      CurDAG->RemoveDeadNode(Node);
      return;
    }
    if (matchBitExtract(Node))
      return;
    if (AndImmShrink && shrinkAndImmediate(Node))
      return;

    LLVM_FALLTHROUGH;
  case ISD::OR:
  case ISD::XOR:
    if (tryShrinkShlLogicImm(Node))
      return;
    if (Opcode == ISD::OR && tryMatchBitSelect(Node))
      return;
    if (tryVPTERNLOG(Node))
      return;

    LLVM_FALLTHROUGH;
  case ISD::ADD:
  case ISD::SUB: {
    // Try to avoid folding immediates with multiple uses for optsize.
    // This code tries to select to register form directly to avoid going
    // through the isel table which might fold the immediate. We can't change
    // the add/sub/and/or/xor-with-immediate patterns in the tablegen files to
    // check the immediate use count without making the patterns unavailable
    // to the fast-isel table.
    if (!CurDAG->shouldOptForSize())
      break;

    // Only handle i8/i16/i32/i64.
    if (NVT != MVT::i8 && NVT != MVT::i16 && NVT != MVT::i32 && NVT != MVT::i64)
      break;

    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N1);
    if (!Cst)
      break;

    int64_t Val = Cst->getSExtValue();

    // Make sure it's an immediate that is considered foldable.
    // FIXME: Handle unsigned 32 bit immediates for 64-bit AND.
    if (!isInt<8>(Val) && !isInt<32>(Val))
      break;

    // If this can match to INC/DEC, let it go.
    if (Opcode == ISD::ADD && (Val == 1 || Val == -1))
      break;

    // Check if we should avoid folding this immediate.
    if (!shouldAvoidImmediateInstFormsForSize(N1.getNode()))
      break;

    // We should not fold the immediate. So we need a register form instead.
    unsigned ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unexpected VT!");
    case MVT::i8:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD: ROpc = X86::ADD8rr; MOpc = X86::ADD8rm; break;
      case ISD::SUB: ROpc = X86::SUB8rr; MOpc = X86::SUB8rm; break;
      case ISD::AND: ROpc = X86::AND8rr; MOpc = X86::AND8rm; break;
      case ISD::OR:  ROpc = X86::OR8rr;  MOpc = X86::OR8rm;  break;
      case ISD::XOR: ROpc = X86::XOR8rr; MOpc = X86::XOR8rm; break;
      }
      break;
    case MVT::i16:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD: ROpc = X86::ADD16rr; MOpc = X86::ADD16rm; break;
      case ISD::SUB: ROpc = X86::SUB16rr; MOpc = X86::SUB16rm; break;
      case ISD::AND: ROpc = X86::AND16rr; MOpc = X86::AND16rm; break;
      case ISD::OR:  ROpc = X86::OR16rr;  MOpc = X86::OR16rm;  break;
      case ISD::XOR: ROpc = X86::XOR16rr; MOpc = X86::XOR16rm; break;
      }
      break;
    case MVT::i32:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD: ROpc = X86::ADD32rr; MOpc = X86::ADD32rm; break;
      case ISD::SUB: ROpc = X86::SUB32rr; MOpc = X86::SUB32rm; break;
      case ISD::AND: ROpc = X86::AND32rr; MOpc = X86::AND32rm; break;
      case ISD::OR:  ROpc = X86::OR32rr;  MOpc = X86::OR32rm;  break;
      case ISD::XOR: ROpc = X86::XOR32rr; MOpc = X86::XOR32rm; break;
      }
      break;
    case MVT::i64:
      switch (Opcode) {
      default: llvm_unreachable("Unexpected opcode!");
      case ISD::ADD: ROpc = X86::ADD64rr; MOpc = X86::ADD64rm; break;
      case ISD::SUB: ROpc = X86::SUB64rr; MOpc = X86::SUB64rm; break;
      case ISD::AND: ROpc = X86::AND64rr; MOpc = X86::AND64rm; break;
      case ISD::OR:  ROpc = X86::OR64rr;  MOpc = X86::OR64rm;  break;
      case ISD::XOR: ROpc = X86::XOR64rr; MOpc = X86::XOR64rm; break;
      }
      break;
    }

    // OK, this is an AND/OR/XOR/ADD/SUB with a constant.

    // If this is not a subtract, we can still try to fold a load.
    if (Opcode != ISD::SUB) {
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
        MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        // Update the chain.
        ReplaceUses(N0.getValue(1), SDValue(CNode, 2));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N0)->getMemOperand()});
        ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    CurDAG->SelectNodeTo(Node, ROpc, NVT, MVT::i32, N0, N1);
    return;
  }

  case X86ISD::SMUL:
    // i16/i32/i64 are handled with isel patterns.
    if (NVT != MVT::i8)
      break;
    LLVM_FALLTHROUGH;
  case X86ISD::UMUL: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned LoReg, ROpc, MOpc;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;
      ROpc = Opcode == X86ISD::SMUL ? X86::IMUL8r : X86::MUL8r;
      MOpc = Opcode == X86ISD::SMUL ? X86::IMUL8m : X86::MUL8m;
      break;
    case MVT::i16:
      LoReg = X86::AX;
      ROpc = X86::MUL16r;
      MOpc = X86::MUL16m;
      break;
    case MVT::i32:
      LoReg = X86::EAX;
      ROpc = X86::MUL32r;
      MOpc = X86::MUL32m;
      break;
    case MVT::i64:
      LoReg = X86::RAX;
      ROpc = X86::MUL64r;
      MOpc = X86::MUL64m;
      break;
    }
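    // The one-operand MUL/IMUL forms read one factor implicitly from
    // AL/AX/EAX/RAX, so the non-memory operand is copied into LoReg below.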

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!FoldedLoad) {
      FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (FoldedLoad)
        std::swap(N0, N1);
    }

    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);

    MachineSDNode *CNode;
    if (FoldedLoad) {
      // i16/i32/i64 use an instruction that produces a low and high result even
      // though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32, MVT::Other);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32, MVT::Other);

      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);

      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, NVT == MVT::i8 ? 2 : 3));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      // i16/i32/i64 use an instruction that produces a low and high result even
      // though only the low result is used.
      SDVTList VTs;
      if (NVT == MVT::i8)
        VTs = CurDAG->getVTList(NVT, MVT::i32);
      else
        VTs = CurDAG->getVTList(NVT, NVT, MVT::i32);

      CNode = CurDAG->getMachineNode(ROpc, dl, VTs, {N1, InFlag});
    }

    ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(CNode, NVT == MVT::i8 ? 1 : 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned Opc, MOpc;
    unsigned LoReg, HiReg;
    bool IsSigned = Opcode == ISD::SMUL_LOHI;
    bool UseMULX = !IsSigned && Subtarget->hasBMI2();
    bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
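    // MULX (BMI2) reads its implicit source from EDX/RDX and leaves EFLAGS
    // untouched; the Hrr/Hrm variants are used when only the high half of
    // the product is needed.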
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i32:
      Opc  = UseMULXHi ? X86::MULX32Hrr :
             UseMULX ? X86::MULX32rr :
             IsSigned ? X86::IMUL32r : X86::MUL32r;
      MOpc = UseMULXHi ? X86::MULX32Hrm :
             UseMULX ? X86::MULX32rm :
             IsSigned ? X86::IMUL32m : X86::MUL32m;
      LoReg = UseMULX ? X86::EDX : X86::EAX;
      HiReg = X86::EDX;
      break;
    case MVT::i64:
      Opc  = UseMULXHi ? X86::MULX64Hrr :
             UseMULX ? X86::MULX64rr :
             IsSigned ? X86::IMUL64r : X86::MUL64r;
      MOpc = UseMULXHi ? X86::MULX64Hrm :
             UseMULX ? X86::MULX64rm :
             IsSigned ? X86::IMUL64m : X86::MUL64m;
      LoReg = UseMULX ? X86::RDX : X86::RAX;
      HiReg = X86::RDX;
      break;
    }

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    // Multiply is commutative.
    if (!foldedLoad) {
      foldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
      if (foldedLoad)
        std::swap(N0, N1);
    }

    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
                                          N0, SDValue()).getValue(1);
    SDValue ResHi, ResLo;
    if (foldedLoad) {
      SDValue Chain;
      MachineSDNode *CNode = nullptr;
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        Chain = SDValue(CNode, 1);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
        Chain = SDValue(CNode, 2);
      } else {
        SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
        CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
        Chain = SDValue(CNode, 0);
        InFlag = SDValue(CNode, 1);
      }

      // Update the chain.
      ReplaceUses(N1.getValue(1), Chain);
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      SDValue Ops[] = { N1, InFlag };
      if (UseMULXHi) {
        SDVTList VTs = CurDAG->getVTList(NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
      } else if (UseMULX) {
        SDVTList VTs = CurDAG->getVTList(NVT, NVT);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        ResHi = SDValue(CNode, 0);
        ResLo = SDValue(CNode, 1);
      } else {
        SDVTList VTs = CurDAG->getVTList(MVT::Glue);
        SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
        InFlag = SDValue(CNode, 0);
      }
    }

    // Copy the low half of the result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      if (!ResLo) {
        assert(LoReg && "Register for low half is not defined!");
        ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
                                       NVT, InFlag);
        InFlag = ResLo.getValue(2);
      }
      ReplaceUses(SDValue(Node, 0), ResLo);
      LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the high half of the result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      if (!ResHi) {
        assert(HiReg && "Register for high half is not defined!");
        ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
                                       NVT, InFlag);
        InFlag = ResHi.getValue(2);
      }
      ReplaceUses(SDValue(Node, 1), ResHi);
      LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }

    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    unsigned ROpc, MOpc;
    bool isSigned = Opcode == ISD::SDIVREM;
    if (!isSigned) {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::DIV8r;  MOpc = X86::DIV8m;  break;
      case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
      case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
      case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
      }
    } else {
      switch (NVT.SimpleTy) {
      default: llvm_unreachable("Unsupported VT!");
      case MVT::i8:  ROpc = X86::IDIV8r;  MOpc = X86::IDIV8m;  break;
      case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
      case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
      case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
      }
    }

    unsigned LoReg, HiReg, ClrReg;
    unsigned SExtOpcode;
    switch (NVT.SimpleTy) {
    default: llvm_unreachable("Unsupported VT!");
    case MVT::i8:
      LoReg = X86::AL;  ClrReg = HiReg = X86::AH;
      SExtOpcode = 0; // Not used.
      break;
    case MVT::i16:
      LoReg = X86::AX;  HiReg = X86::DX;
      ClrReg = X86::DX;
      SExtOpcode = X86::CWD;
      break;
    case MVT::i32:
      LoReg = X86::EAX; ClrReg = HiReg = X86::EDX;
      SExtOpcode = X86::CDQ;
      break;
    case MVT::i64:
      LoReg = X86::RAX; ClrReg = HiReg = X86::RDX;
      SExtOpcode = X86::CQO;
      break;
    }
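    // DIV/IDIV take the dividend in AX (8-bit) or DX:AX/EDX:EAX/RDX:RAX and
    // leave the quotient in the low register and the remainder in the high
    // register.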

    SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
    bool foldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
    bool signBitIsZero = CurDAG->SignBitIsZero(N0);

    SDValue InFlag;
    if (NVT == MVT::i8) {
      // Special case for div8, just use a move with zero extension to AX to
      // clear the upper 8 bits (AH).
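      // The MOVSX/MOVZX below fills all of AX (including AH), so no separate
      // clear of the high half is needed before the divide.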
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain;
      MachineSDNode *Move;
      if (tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N0.getOperand(0) };
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rm8
                                                    : X86::MOVZX16rm8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, MVT::Other, Ops);
        Chain = SDValue(Move, 1);
        ReplaceUses(N0.getValue(1), Chain);
        // Record the mem-refs
        CurDAG->setNodeMemRefs(Move, {cast<LoadSDNode>(N0)->getMemOperand()});
      } else {
        unsigned Opc = (isSigned && !signBitIsZero) ? X86::MOVSX16rr8
                                                    : X86::MOVZX16rr8;
        Move = CurDAG->getMachineNode(Opc, dl, MVT::i16, N0);
        Chain = CurDAG->getEntryNode();
      }
      Chain  = CurDAG->getCopyToReg(Chain, dl, X86::AX, SDValue(Move, 0),
                                    SDValue());
      InFlag = Chain.getValue(1);
    } else {
      InFlag =
        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
                             LoReg, N0, SDValue()).getValue(1);
      if (isSigned && !signBitIsZero) {
        // Sign extend the low part into the high part.
        InFlag =
          SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
      } else {
        // Zero out the high part, effectively zero extending the input.
        SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
        SDValue ClrNode =
            SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0);
        switch (NVT.SimpleTy) {
        case MVT::i16:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::EXTRACT_SUBREG, dl, MVT::i16, ClrNode,
                          CurDAG->getTargetConstant(X86::sub_16bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        case MVT::i32:
          break;
        case MVT::i64:
          ClrNode =
              SDValue(CurDAG->getMachineNode(
                          TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                          CurDAG->getTargetConstant(0, dl, MVT::i64), ClrNode,
                          CurDAG->getTargetConstant(X86::sub_32bit, dl,
                                                    MVT::i32)),
                      0);
          break;
        default:
          llvm_unreachable("Unexpected division source");
        }

        InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, ClrReg,
                                      ClrNode, InFlag).getValue(1);
      }
    }

    if (foldedLoad) {
      SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
                        InFlag };
      MachineSDNode *CNode =
        CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
      InFlag = SDValue(CNode, 1);
      // Update the chain.
      ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
      // Record the mem-refs
      CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
    } else {
      InFlag =
        SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0);
    }

    // Prevent use of AH in a REX instruction by explicitly copying it to
    // an ABCD_L register.
    //
    // The current assumption of the register allocator is that isel
    // won't generate explicit references to the GR8_ABCD_H registers. If
    // the allocator and/or the backend get enhanced to be more robust in
    // that regard, this can be, and should be, removed.
    if (HiReg == X86::AH && !SDValue(Node, 1).use_empty()) {
      SDValue AHCopy = CurDAG->getRegister(X86::AH, MVT::i8);
      unsigned AHExtOpcode =
          isSigned ? X86::MOVSX32rr8_NOREX : X86::MOVZX32rr8_NOREX;

      SDNode *RNode = CurDAG->getMachineNode(AHExtOpcode, dl, MVT::i32,
                                             MVT::Glue, AHCopy, InFlag);
      SDValue Result(RNode, 0);
      InFlag = SDValue(RNode, 1);

      Result =
          CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);

      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the division (low) result, if it is needed.
    if (!SDValue(Node, 0).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                                LoReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 0), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    // Copy the remainder (high) result, if it is needed.
    if (!SDValue(Node, 1).use_empty()) {
      SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
                                              HiReg, NVT, InFlag);
      InFlag = Result.getValue(2);
      ReplaceUses(SDValue(Node, 1), Result);
      LLVM_DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG);
                 dbgs() << '\n');
    }
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case X86ISD::FCMP:
  case X86ISD::STRICT_FCMP:
  case X86ISD::STRICT_FCMPS: {
    bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
                       Node->getOpcode() == X86ISD::STRICT_FCMPS;
    SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
    SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // Floating point needs special handling if we don't have FCOMI.
    if (Subtarget->hasCMov())
      break;
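    // Without FCOMI the compare result has to be routed through the FPU
    // status word: FUCOM/FCOM, then FNSTSW to AX, extract AH and SAHF it
    // into EFLAGS.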

    bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;

    unsigned Opc;
    switch (CmpVT.SimpleTy) {
    default: llvm_unreachable("Unexpected type!");
    case MVT::f32:
      Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
      break;
    case MVT::f64:
      Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
      break;
    case MVT::f80:
      Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
      break;
    }

    SDValue Chain =
        IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
    SDValue Glue;
    if (IsStrictCmp) {
      SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
      Chain = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
      Glue = Chain.getValue(1);
    } else {
      Glue = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N0, N1), 0);
    }

    // Move FPSW to AX.
    SDValue FNSTSW =
        SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, Glue), 0);

    // Extract upper 8-bits of AX.
    SDValue Extract =
        CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);

    // Move AH into flags.
    // Some 64-bit targets lack SAHF support, but they do support FCOMI.
    assert(Subtarget->hasLAHFSAHF() &&
           "Target doesn't support SAHF or FCOMI?");
    SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
    Chain = AH;
    SDValue SAHF = SDValue(
        CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);

    if (IsStrictCmp)
      ReplaceUses(SDValue(Node, 1), Chain);

    ReplaceUses(SDValue(Node, 0), SAHF);
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case X86ISD::CMP: {
    SDValue N0 = Node->getOperand(0);
    SDValue N1 = Node->getOperand(1);

    // Optimizations for TEST compares.
    if (!isNullConstant(N1))
      break;

    // Save the original VT of the compare.
    MVT CmpVT = N0.getSimpleValueType();

    // If we are comparing (and (shr X, C), Mask) with 0, emit a BEXTR followed
    // by a test instruction. The test should be removed later by
    // analyzeCompare if we are using only the zero flag.
    // TODO: Should we check the users and use the BEXTR flags directly?
    if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
      if (MachineSDNode *NewNode = matchBEXTRFromAndImm(N0.getNode())) {
        unsigned TestOpc = CmpVT == MVT::i64 ? X86::TEST64rr
                                             : X86::TEST32rr;
        SDValue BEXTR = SDValue(NewNode, 0);
        NewNode = CurDAG->getMachineNode(TestOpc, dl, MVT::i32, BEXTR, BEXTR);
        ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
        CurDAG->RemoveDeadNode(Node);
        return;
      }
    }

    // We can peek through truncates, but we need to be careful below.
    if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
      N0 = N0.getOperand(0);

    // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
    // use a smaller encoding.
    // Look past the truncate if CMP is the only use of it.
    if (N0.getOpcode() == ISD::AND &&
        N0.getNode()->hasOneUse() &&
        N0.getValueType() != MVT::i8) {
      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
      if (!C) break;
      uint64_t Mask = C->getZExtValue();
      // We may have looked through a truncate so mask off any bits that
      // shouldn't be part of the compare.
      Mask &= maskTrailingOnes<uint64_t>(CmpVT.getScalarSizeInBits());

      // Check if we can replace AND+IMM64 with a shift. This is possible for
      // masks like 0xFF000000 or 0x00FFFFFF, and only if we care about the zero
      // flag.
      if (CmpVT == MVT::i64 && !isInt<32>(Mask) &&
          onlyUsesZeroFlag(SDValue(Node, 0))) {
        if (isMask_64(~Mask)) {
          unsigned TrailingZeros = countTrailingZeros(Mask);
          SDValue Imm = CurDAG->getTargetConstant(TrailingZeros, dl, MVT::i64);
          SDValue Shift =
            SDValue(CurDAG->getMachineNode(X86::SHR64ri, dl, MVT::i64, MVT::i32,
                                           N0.getOperand(0), Imm), 0);
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                       MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
        if (isMask_64(Mask)) {
          unsigned LeadingZeros = countLeadingZeros(Mask);
          SDValue Imm = CurDAG->getTargetConstant(LeadingZeros, dl, MVT::i64);
          SDValue Shift =
            SDValue(CurDAG->getMachineNode(X86::SHL64ri, dl, MVT::i64, MVT::i32,
                                           N0.getOperand(0), Imm), 0);
          MachineSDNode *Test = CurDAG->getMachineNode(X86::TEST64rr, dl,
                                                       MVT::i32, Shift, Shift);
          ReplaceNode(Node, Test);
          return;
        }
      }

      MVT VT;
      int SubRegOp;
      unsigned ROpc, MOpc;

      // For each of these checks we need to be careful if the sign flag is
      // being used. It is only safe to use the sign flag in two cases: either
      // the sign bit of the shrunken mask is zero, or the final test size
      // equals the original compare size.

      if (isUInt<8>(Mask) &&
          (!(Mask & 0x80) || CmpVT == MVT::i8 ||
           hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $8" to "testb %al, $8"
        VT = MVT::i8;
        SubRegOp = X86::sub_8bit;
        ROpc = X86::TEST8ri;
        MOpc = X86::TEST8mi;
      } else if (OptForMinSize && isUInt<16>(Mask) &&
                 (!(Mask & 0x8000) || CmpVT == MVT::i16 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, convert "testl %eax, $32776" to "testw %ax, $32776".
        // NOTE: We only want to form TESTW instructions if optimizing for
        // min size. Otherwise we only save one byte and possibly get a length
        // changing prefix penalty in the decoders.
        VT = MVT::i16;
        SubRegOp = X86::sub_16bit;
        ROpc = X86::TEST16ri;
        MOpc = X86::TEST16mi;
      } else if (isUInt<32>(Mask) && N0.getValueType() != MVT::i16 &&
                 ((!(Mask & 0x80000000) &&
                   // Without minsize 16-bit Cmps can get here so we need to
                   // be sure we calculate the correct sign flag if needed.
                   (CmpVT != MVT::i16 || !(Mask & 0x8000))) ||
                  CmpVT == MVT::i32 ||
                  hasNoSignFlagUses(SDValue(Node, 0)))) {
        // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
        // NOTE: We only want to run that transform if N0 is 32 or 64 bits.
        // Otherwise, we find ourselves in a position where we have to do
        // promotion. If previous passes did not promote the and, we assume
        // they had a good reason not to and do not promote here.
        VT = MVT::i32;
        SubRegOp = X86::sub_32bit;
        ROpc = X86::TEST32ri;
        MOpc = X86::TEST32mi;
      } else {
        // No eligible transformation was found.
        break;
      }

      SDValue Imm = CurDAG->getTargetConstant(Mask, dl, VT);
      SDValue Reg = N0.getOperand(0);

      // Emit a testl or testw.
      MachineSDNode *NewNode;
      SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
      if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
        if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
          if (!LoadN->isSimple()) {
            unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
            if ((MOpc == X86::TEST8mi && NumVolBits != 8) ||
                (MOpc == X86::TEST16mi && NumVolBits != 16) ||
                (MOpc == X86::TEST32mi && NumVolBits != 32))
              break;
          }
        }
        SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
                          Reg.getOperand(0) };
        NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
        // Update the chain.
        ReplaceUses(Reg.getValue(1), SDValue(NewNode, 1));
        // Record the mem-refs
        CurDAG->setNodeMemRefs(NewNode,
                               {cast<LoadSDNode>(Reg)->getMemOperand()});
      } else {
        // Extract the subregister if necessary.
        if (N0.getValueType() != VT)
          Reg = CurDAG->getTargetExtractSubreg(SubRegOp, dl, VT, Reg);

        NewNode = CurDAG->getMachineNode(ROpc, dl, MVT::i32, Reg, Imm);
      }
      // Replace CMP with TEST.
      ReplaceNode(Node, NewNode);
      return;
    }
    break;
  }
  case X86ISD::PCMPISTR: {
    if (!Subtarget->hasSSE42())
      break;

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrr : X86::PCMPISTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRMrm : X86::PCMPISTRMrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr : X86::PCMPISTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPISTRIrm : X86::PCMPISTRIrm;
      CNode = emitPCMPISTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }

    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::PCMPESTR: {
    if (!Subtarget->hasSSE42())
      break;

    // Copy the two implicit register inputs.
    SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EAX,
                                          Node->getOperand(1),
                                          SDValue()).getValue(1);
    InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
                                  Node->getOperand(3), InFlag).getValue(1);

    bool NeedIndex = !SDValue(Node, 0).use_empty();
    bool NeedMask = !SDValue(Node, 1).use_empty();
    // We can't fold a load if we are going to make two instructions.
    bool MayFoldLoad = !NeedIndex || !NeedMask;

    MachineSDNode *CNode;
    if (NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrr : X86::PCMPESTRMrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRMrm : X86::PCMPESTRMrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::v16i8, Node,
                           InFlag);
      ReplaceUses(SDValue(Node, 1), SDValue(CNode, 0));
    }
    if (NeedIndex || !NeedMask) {
      unsigned ROpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr : X86::PCMPESTRIrr;
      unsigned MOpc = Subtarget->hasAVX() ? X86::VPCMPESTRIrm : X86::PCMPESTRIrm;
      CNode = emitPCMPESTR(ROpc, MOpc, MayFoldLoad, dl, MVT::i32, Node, InFlag);
      ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
    }
    // Connect the flag usage to the last instruction created.
    ReplaceUses(SDValue(Node, 2), SDValue(CNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }

  case ISD::SETCC: {
    if (NVT.isVector() && tryVPTESTM(Node, SDValue(Node, 0), SDValue()))
      return;

    break;
  }

  case ISD::STORE:
    if (foldLoadStoreIntoMemOperand(Node))
      return;
    break;

  case X86ISD::SETCC_CARRY: {
    // We have to do this manually because tblgen will put the eflags copy in
    // the wrong place if we use an extract_subreg in the pattern.
    MVT VT = Node->getSimpleValueType(0);

    // Copy flags to the EFLAGS register and glue it to next node.
    SDValue EFLAGS =
        CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                             Node->getOperand(1), SDValue());

    // Create a 64-bit instruction if the result is 64 bits; otherwise use the
    // 32-bit version.
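    // SETB_C32r/SETB_C64r are pseudos that expand to an SBB of a register
    // with itself, producing 0 or all-ones depending on the carry flag.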
    unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
    MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
    SDValue Result = SDValue(
        CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0);

    // For less than 32-bits we need to extract from the 32-bit node.
    if (VT == MVT::i8 || VT == MVT::i16) {
      int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
      Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
    }

    ReplaceUses(SDValue(Node, 0), Result);
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::SBB: {
    if (isNullConstant(Node->getOperand(0)) &&
        isNullConstant(Node->getOperand(1))) {
      MVT VT = Node->getSimpleValueType(0);

      // Create zero.
      SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
      SDValue Zero =
          SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0);
      if (VT == MVT::i64) {
        Zero = SDValue(
            CurDAG->getMachineNode(
                TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
                CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
                CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
            0);
      }

      // Copy flags to the EFLAGS register and glue it to next node.
      SDValue EFLAGS =
          CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
                               Node->getOperand(2), SDValue());

      // Create a 64-bit instruction if the result is 64 bits; otherwise use the
      // 32-bit version.
      unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
      MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
      VTs = CurDAG->getVTList(SBBVT, MVT::i32);
      SDValue Result =
          SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS,
                                         EFLAGS.getValue(1)}),
                  0);

      // Replace the flag use.
      ReplaceUses(SDValue(Node, 1), Result.getValue(1));

      // Replace the result use.
      if (!SDValue(Node, 0).use_empty()) {
        // For less than 32-bits we need to extract from the 32-bit node.
        if (VT == MVT::i8 || VT == MVT::i16) {
          int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
          Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
        }
        ReplaceUses(SDValue(Node, 0), Result);
      }

      CurDAG->RemoveDeadNode(Node);
      return;
    }
    break;
  }
  case X86ISD::MGATHER: {
    auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
    SDValue IndexOp = Mgt->getIndex();
    SDValue Mask = Mgt->getMask();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Node->getSimpleValueType(0);
    MVT MaskVT = Mask.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow. We're
    // otherwise only doing loose type checking in here, based on what a type
    // constraint would say, just like table-based isel.
    if (!ValueVT.isVector() || !MaskVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc = 0;
    bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
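    // AVX-512 gathers take the mask as a vXi1 value in a k-register, while
    // AVX2 gathers take it in an ordinary vector register whose element
    // width matches the data.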
    if (AVX512Gather) {
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
      else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
      else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
    } else {
      assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
             "Unexpected mask VT!");
      if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
      else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
      else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
        Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
      else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
      else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
        Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
    }

    if (!Opc)
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue PassThru = Mgt->getPassThru();
    SDValue Chain = Mgt->getChain();
    // Gather instructions have a mask output that is not present on the ISD node.
    SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);

    MachineSDNode *NewNode;
    if (AVX512Gather) {
      SDValue Ops[] = {PassThru, Mask, Base,    Scale,
                       Index,    Disp, Segment, Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    } else {
      SDValue Ops[] = {PassThru, Base,    Scale, Index,
                       Disp,     Segment, Mask,  Chain};
      NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    }
    CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
    ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::MSCATTER: {
    auto *Sc = cast<X86MaskedScatterSDNode>(Node);
    SDValue Value = Sc->getValue();
    SDValue IndexOp = Sc->getIndex();
    MVT IndexVT = IndexOp.getSimpleValueType();
    MVT ValueVT = Value.getSimpleValueType();

    // This is just to prevent crashes if the nodes are malformed somehow. We're
    // otherwise only doing loose type checking in here, based on what a type
    // constraint would say, just like table-based isel.
    if (!ValueVT.isVector())
      break;

    unsigned NumElts = ValueVT.getVectorNumElements();
    MVT ValueSVT = ValueVT.getVectorElementType();

    bool IsFP = ValueSVT.isFloatingPoint();
    unsigned EltSize = ValueSVT.getSizeInBits();

    unsigned Opc;
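    // Scatters exist only in AVX-512, so the mask is always a vXi1 operand
    // and all of the opcodes below are EVEX (Z-suffixed) forms.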
    if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
    else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
    else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
    else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
    else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
      Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
    else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
    else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
    else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
      Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
    else
      break;

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
                          Base, Scale, Index, Disp, Segment))
      break;

    SDValue Mask = Sc->getMask();
    SDValue Chain = Sc->getChain();
    // Scatter instructions have a mask output that is not present on the ISD node.
    SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
    SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};

    MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
    CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
    ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_SETUP: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case ISD::PREALLOCATED_ARG: {
    auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
    auto CallId = MFI->getPreallocatedIdForCallSite(
        cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
    SDValue Chain = Node->getOperand(0);
    SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
    SDValue ArgIndex = Node->getOperand(2);
    SDValue Ops[3];
    Ops[0] = CallIdValue;
    Ops[1] = ArgIndex;
    Ops[2] = Chain;
    MachineSDNode *New = CurDAG->getMachineNode(
        TargetOpcode::PREALLOCATED_ARG, dl,
        CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
                          MVT::Other),
        Ops);
    ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
    ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
    CurDAG->RemoveDeadNode(Node);
    return;
  }
  case X86ISD::AESENCWIDE128KL:
  case X86ISD::AESDECWIDE128KL:
  case X86ISD::AESENCWIDE256KL:
  case X86ISD::AESDECWIDE256KL: {
    if (!Subtarget->hasWIDEKL())
      break;

    unsigned Opcode;
    switch (Node->getOpcode()) {
    default:
      llvm_unreachable("Unexpected opcode!");
    case X86ISD::AESENCWIDE128KL:
      Opcode = X86::AESENCWIDE128KL;
      break;
    case X86ISD::AESDECWIDE128KL:
      Opcode = X86::AESDECWIDE128KL;
      break;
    case X86ISD::AESENCWIDE256KL:
      Opcode = X86::AESENCWIDE256KL;
      break;
    case X86ISD::AESDECWIDE256KL:
      Opcode = X86::AESDECWIDE256KL;
      break;
    }

    SDValue Chain = Node->getOperand(0);
    SDValue Addr = Node->getOperand(1);

    SDValue Base, Scale, Index, Disp, Segment;
    if (!selectAddr(Node, Addr, Base, Scale, Index, Disp, Segment))
      break;
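    // The WIDEKL instructions process eight 128-bit blocks passed implicitly
    // in XMM0..XMM7, so copy each source operand into its physical register
    // and chain the copies together through glue.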

    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM0, Node->getOperand(2),
                                 SDValue());
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM1, Node->getOperand(3),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM2, Node->getOperand(4),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM3, Node->getOperand(5),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM4, Node->getOperand(6),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM5, Node->getOperand(7),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM6, Node->getOperand(8),
                                 Chain.getValue(1));
    Chain = CurDAG->getCopyToReg(Chain, dl, X86::XMM7, Node->getOperand(9),
                                 Chain.getValue(1));

    MachineSDNode *Res = CurDAG->getMachineNode(
        Opcode, dl, Node->getVTList(),
        {Base, Scale, Index, Disp, Segment, Chain, Chain.getValue(1)});
    CurDAG->setNodeMemRefs(Res, cast<MemSDNode>(Node)->getMemOperand());
    ReplaceNode(Node, Res);
    return;
  }
  }

  SelectCode(Node);
}