sub GHASH_16_ENCRYPT_N_GHASH_N()

in crypto/fipsmodule/modes/asm/aesni-gcm-avx512.pl [2155:2692]


sub GHASH_16_ENCRYPT_N_GHASH_N {
  my $AES_KEYS           = $_[0];     # [in] key pointer
  my $GCM128_CTX         = $_[1];     # [in] context pointer
  my $CIPH_PLAIN_OUT     = $_[2];     # [in] pointer to output buffer
  my $PLAIN_CIPH_IN      = $_[3];     # [in] pointer to input buffer
  my $DATA_OFFSET        = $_[4];     # [in] data offset
  my $LENGTH             = $_[5];     # [in] data length
  my $CTR_BE             = $_[6];     # [in/out] ZMM counter blocks (last 4) in big-endian
  my $CTR_CHECK          = $_[7];     # [in] GP with 8-bit counter for overflow check (only read in this subroutine)
  my $HASHKEY_OFFSET     = $_[8];     # [in] numerical offset for the highest hash key
                                      # (can be in form of register or numerical value)
  my $GHASHIN_BLK_OFFSET = $_[9];     # [in] numerical offset for GHASH blocks in the stack frame (%rsp-relative)
  my $SHFMSK             = $_[10];    # [in] ZMM with byte swap mask for pshufb
  my $B00_03             = $_[11];    # [clobbered] temporary ZMM
  my $B04_07             = $_[12];    # [clobbered] temporary ZMM
  my $B08_11             = $_[13];    # [clobbered] temporary ZMM
  my $B12_15             = $_[14];    # [clobbered] temporary ZMM
  my $GH1H_UNUSED        = $_[15];    # [clobbered] temporary ZMM
  my $GH1L               = $_[16];    # [clobbered] temporary ZMM
  my $GH1M               = $_[17];    # [clobbered] temporary ZMM
  my $GH1T               = $_[18];    # [clobbered] temporary ZMM
  my $GH2H               = $_[19];    # [clobbered] temporary ZMM
  my $GH2L               = $_[20];    # [clobbered] temporary ZMM
  my $GH2M               = $_[21];    # [clobbered] temporary ZMM
  my $GH2T               = $_[22];    # [clobbered] temporary ZMM
  my $GH3H               = $_[23];    # [clobbered] temporary ZMM
  my $GH3L               = $_[24];    # [clobbered] temporary ZMM
  my $GH3M               = $_[25];    # [clobbered] temporary ZMM
  my $GH3T               = $_[26];    # [clobbered] temporary ZMM
  my $AESKEY1            = $_[27];    # [clobbered] temporary ZMM
  my $AESKEY2            = $_[28];    # [clobbered] temporary ZMM
  my $GHKEY1             = $_[29];    # [clobbered] temporary ZMM
  my $GHKEY2             = $_[30];    # [clobbered] temporary ZMM
  my $GHDAT1             = $_[31];    # [clobbered] temporary ZMM
  my $GHDAT2             = $_[32];    # [clobbered] temporary ZMM
  my $ZT01               = $_[33];    # [clobbered] temporary ZMM
  my $ADDBE_4x4          = $_[34];    # [in] ZMM with the value 4 in each of its four 128-bit lanes (big-endian)
  my $ADDBE_1234         = $_[35];    # [in] ZMM with the values 1, 2, 3 and 4 in its four 128-bit lanes (big-endian)
  my $GHASH_TYPE         = $_[36];    # [in] "start", "start_reduce", "mid", "end_reduce"
  my $TO_REDUCE_L        = $_[37];    # [in/out] ZMM with the low 4x128-bit GHASH sum
  my $TO_REDUCE_H        = $_[38];    # [in/out] ZMM with the high 4x128-bit GHASH sum
  my $TO_REDUCE_M        = $_[39];    # [in/out] ZMM with the middle 4x128-bit GHASH sum
  my $ENC_DEC            = $_[40];    # [in] cipher direction
  my $HASH_IN_OUT        = $_[41];    # [in/out] ZMM whose low 128 bits carry the GHASH in/out value
  my $IA0                = $_[42];    # [clobbered] GP temporary
  my $IA1                = $_[43];    # [clobbered] GP temporary
  my $MASKREG            = $_[44];    # [clobbered] mask register
  my $NUM_BLOCKS         = $_[45];    # [in] numerical value with number of blocks to be encrypted/ghashed (1 to 16)
  my $PBLOCK_LEN         = $_[46];    # [in] partial block length

  die "GHASH_16_ENCRYPT_N_GHASH_N: num_blocks is out of bounds = $NUM_BLOCKS\n"
    if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);

  my $rndsuffix = &random_string();
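  # ;; random suffix keeps the .L_* labels below unique across multiple expansions of this subroutine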

  # ;; GH1H aliases HASH_IN_OUT to avoid an additional move in the do_reduction case
  my $GH1H = $HASH_IN_OUT;

  my $LAST_GHASH_BLK  = $GH1L;
  my $LAST_CIPHER_BLK = $GH1T;

  my $RED_POLY = $GH2T;
  my $RED_P1   = $GH2L;
  my $RED_T1   = $GH2H;
  my $RED_T2   = $GH2M;

  my $DATA1 = $GH3H;
  my $DATA2 = $GH3L;
  my $DATA3 = $GH3M;
  my $DATA4 = $GH3T;
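  # ;; the aliases above reuse the GH1*/GH2*/GH3* temporaries; this is safe because each
  # ;; aliased value only becomes live after the corresponding GHASH partial products
  # ;; have been consumed (folded into GH1H/GH1L/GH1M below)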

  # ;; do reduction after the 16 blocks?
  my $do_reduction = 0;

  # ;; is this 16-block chunk the start?
  my $is_start = 0;

  if ($GHASH_TYPE eq "start_reduce") {
    $is_start     = 1;
    $do_reduction = 1;
  }

  if ($GHASH_TYPE eq "start") {
    $is_start = 1;
  }

  if ($GHASH_TYPE eq "end_reduce") {
    $do_reduction = 1;
  }
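
  # ;; the remaining type, "mid", leaves both flags at zero: accumulate the partial
  # ;; products into TO_REDUCE_H/M/L and leave the reduction for a later call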

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; - get load/store mask
  # ;; - load plain/cipher text
  $code .= <<___;
        lea               byte64_len_to_mask_table(%rip),$IA0
        mov               $LENGTH,$IA1
___
  if ($NUM_BLOCKS > 12) {
    $code .= "sub               \$`3*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 8) {
    $code .= "sub               \$`2*64`,$IA1\n";
  } elsif ($NUM_BLOCKS > 4) {
    $code .= "sub               \$`1*64`,$IA1\n";
  }
  $code .= "kmovq             ($IA0,$IA1,8),$MASKREG\n";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; prepare counter blocks

  $code .= <<___;
        cmp               \$`(256 - $NUM_BLOCKS)`,@{[DWORD($CTR_CHECK)]}
        jae               .L_16_blocks_overflow_${rndsuffix}
___
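  # ;; fast path: the low counter byte cannot wrap here, so the big-endian increments
  # ;; (ADDBE_1234, ADDBE_4x4) are added directly to the big-endian counters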

  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpaddd", $B00_03, $B04_07,     $B08_11,    $B12_15,    $CTR_BE,
    $B00_03,     $B04_07,  $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4);
  $code .= <<___;
        jmp               .L_16_blocks_ok_${rndsuffix}

.L_16_blocks_overflow_${rndsuffix}:
        vpshufb           $SHFMSK,$CTR_BE,$CTR_BE
        vpaddd            ddq_add_1234(%rip),$CTR_BE,$B00_03
___
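  # ;; overflow path (above): counters were byte-swapped to little-endian and incremented;
  # ;; extend the +4 increments to the remaining lanes, then swap back to big-endian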
  if ($NUM_BLOCKS > 4) {
    $code .= <<___;
        vmovdqa64         ddq_add_4444(%rip),$B12_15
        vpaddd            $B12_15,$B00_03,$B04_07
___
  }
  if ($NUM_BLOCKS > 8) {
    $code .= "vpaddd            $B12_15,$B04_07,$B08_11\n";
  }
  if ($NUM_BLOCKS > 12) {
    $code .= "vpaddd            $B12_15,$B08_11,$B12_15\n";
  }
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpshufb", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07,     $B08_11,   $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  $code .= <<___;
.L_16_blocks_ok_${rndsuffix}:

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; - pre-load constants
        # ;; - add current hash into the 1st block
        vbroadcastf64x2    `(16 * 0)`($AES_KEYS),$AESKEY1
___
  if ($is_start != 0) {
    $code .= "vpxorq            `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$HASH_IN_OUT,$GHDAT1\n";
  } else {
    $code .= "vmovdqa64         `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
  }

  $code .= "vmovdqu64         @{[EffectiveAddress(\"%rsp\",$HASHKEY_OFFSET,0*64)]},$GHKEY1\n";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; save the last counter block for the next round
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($CTR_BE)]}\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($CTR_BE)]}\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($CTR_BE)]}\n";
  } else {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($CTR_BE)]}\n";
  }
  $code .= "vshufi64x2        \$0b00000000,$CTR_BE,$CTR_BE,$CTR_BE\n";

  $code .= <<___;
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; pre-load constants
        vbroadcastf64x2    `(16 * 1)`($AES_KEYS),$AESKEY2
        vmovdqu64         @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,1*64)]},$GHKEY2
        vmovdqa64         `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; stitch AES rounds with GHASH
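  # ;; (each vaesenc round below is interleaved with vpclmulqdq work on a different
  # ;; 4-block chunk so the AES and carry-less multiply pipelines run in parallel)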

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 0 - ARK

  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,  $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2    `(16 * 2)`($AES_KEYS),$AESKEY1\n";

  $code .= <<___;
        # ;; =================================================
        # ;; GHASH 4 blocks (15 to 12)
        vpclmulqdq        \$0x11,$GHKEY1,$GHDAT1,$GH1H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY1,$GHDAT1,$GH1L      # ; a0*b0
        vpclmulqdq        \$0x01,$GHKEY1,$GHDAT1,$GH1M      # ; a1*b0
        vpclmulqdq        \$0x10,$GHKEY1,$GHDAT1,$GH1T      # ; a0*b1
        vmovdqu64         @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,2*64)]},$GHKEY1
        vmovdqa64         `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 1
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  $code .= "vbroadcastf64x2    `(16 * 3)`($AES_KEYS),$AESKEY2\n";

  $code .= <<___;
        # ;; =================================================
        # ;; GHASH 4 blocks (11 to 8)
        vpclmulqdq        \$0x10,$GHKEY2,$GHDAT2,$GH2M      # ; a0*b1
        vpclmulqdq        \$0x01,$GHKEY2,$GHDAT2,$GH2T      # ; a1*b0
        vpclmulqdq        \$0x11,$GHKEY2,$GHDAT2,$GH2H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY2,$GHDAT2,$GH2L      # ; a0*b0
        vmovdqu64         @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,3*64)]},$GHKEY2
        vmovdqa64         `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 2
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2    `(16 * 4)`($AES_KEYS),$AESKEY1\n";

  $code .= <<___;
        # ;; =================================================
        # ;; GHASH 4 blocks (7 to 4)
        vpclmulqdq        \$0x10,$GHKEY1,$GHDAT1,$GH3M      # ; a0*b1
        vpclmulqdq        \$0x01,$GHKEY1,$GHDAT1,$GH3T      # ; a1*b0
        vpclmulqdq        \$0x11,$GHKEY1,$GHDAT1,$GH3H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY1,$GHDAT1,$GH3L      # ; a0*b0
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 3
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  $code .= "vbroadcastf64x2    `(16 * 5)`($AES_KEYS),$AESKEY2\n";

  $code .= <<___;
        # ;; =================================================
        # ;; Gather (XOR) GHASH for 12 blocks
        vpternlogq        \$0x96,$GH3H,$GH2H,$GH1H
        vpternlogq        \$0x96,$GH3L,$GH2L,$GH1L
        vpternlogq        \$0x96,$GH3T,$GH2T,$GH1T
        vpternlogq        \$0x96,$GH3M,$GH2M,$GH1M
___
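  # ;; (vpternlogq with imm8 0x96 computes a three-way XOR: dst = dst ^ src1 ^ src2)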

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 4
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2    `(16 * 6)`($AES_KEYS),$AESKEY1\n";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; load plain/cipher text
  &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DATA1, $DATA2, $DATA3, $DATA4, $MASKREG);

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 5
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  $code .= "vbroadcastf64x2    `(16 * 7)`($AES_KEYS),$AESKEY2\n";

  $code .= <<___;
        # ;; =================================================
        # ;; GHASH 4 blocks (3 to 0)
        vpclmulqdq        \$0x10,$GHKEY2,$GHDAT2,$GH2M      # ; a0*b1
        vpclmulqdq        \$0x01,$GHKEY2,$GHDAT2,$GH2T      # ; a1*b0
        vpclmulqdq        \$0x11,$GHKEY2,$GHDAT2,$GH2H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY2,$GHDAT2,$GH2L      # ; a0*b0
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 6
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2    `(16 * 8)`($AES_KEYS),$AESKEY1\n";

  # ;; =================================================
  # ;; gather GHASH in GH1L (low), GH1H (high), GH1M (mid)
  # ;; - add GH2[MTLH] to GH1[MTLH]
  $code .= "vpternlogq        \$0x96,$GH2T,$GH1T,$GH1M\n";
  if ($do_reduction != 0) {

    if ($is_start != 0) {
      $code .= "vpxorq            $GH2M,$GH1M,$GH1M\n";
    } else {
      $code .= <<___;
        vpternlogq        \$0x96,$GH2H,$TO_REDUCE_H,$GH1H
        vpternlogq        \$0x96,$GH2L,$TO_REDUCE_L,$GH1L
        vpternlogq        \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
___
    }

  } else {

    # ;; update the H/M/L hash sums when the reduction is not performed here
    if ($is_start != 0) {
      $code .= <<___;
        vpxorq            $GH2H,$GH1H,$TO_REDUCE_H
        vpxorq            $GH2L,$GH1L,$TO_REDUCE_L
        vpxorq            $GH2M,$GH1M,$TO_REDUCE_M
___
    } else {
      $code .= <<___;
        vpternlogq        \$0x96,$GH2H,$GH1H,$TO_REDUCE_H
        vpternlogq        \$0x96,$GH2L,$GH1L,$TO_REDUCE_L
        vpternlogq        \$0x96,$GH2M,$GH1M,$TO_REDUCE_M
___
    }

  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 7
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
  $code .= "vbroadcastf64x2    `(16 * 9)`($AES_KEYS),$AESKEY2\n";

  # ;; =================================================
  # ;; prepare mid sum for adding to high & low
  # ;; load polynomial constant for reduction
  if ($do_reduction != 0) {
    $code .= <<___;
        vpsrldq           \$8,$GH1M,$GH2M
        vpslldq           \$8,$GH1M,$GH1M

        vmovdqa64         POLY2(%rip),@{[XWORD($RED_POLY)]}
___
  }
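  # ;; (the split above places TM>>64 in $GH2M and TM<<64 in $GH1M, per 128-bit lane,
  # ;; so the middle product can be folded into the high and low sums respectively)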

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 8
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
  $code .= "vbroadcastf64x2    `(16 * 10)`($AES_KEYS),$AESKEY1\n";

  # ;; =================================================
  # ;; Add mid product to high and low
  if ($do_reduction != 0) {
    if ($is_start != 0) {
      $code .= <<___;
        vpternlogq        \$0x96,$GH2M,$GH2H,$GH1H      # ; TH = TH1 + TH2 + TM>>64
        vpternlogq        \$0x96,$GH1M,$GH2L,$GH1L      # ; TL = TL1 + TL2 + TM<<64
___
    } else {
      $code .= <<___;
        vpxorq            $GH2M,$GH1H,$GH1H      # ; TH = TH1 + TM>>64
        vpxorq            $GH1M,$GH1L,$GH1L      # ; TL = TL1 + TM<<64
___
    }
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 9
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);

  # ;; =================================================
  # ;; horizontal xor of low and high 4x128
  if ($do_reduction != 0) {
    &VHPXORI4x128($GH1H, $GH2H);
    &VHPXORI4x128($GH1L, $GH2L);
  }
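  # ;; (VHPXORI4x128 XORs the four 128-bit lanes of each sum down into lane 0,
  # ;; producing the single 128-bit high/low values consumed by the reduction)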

  if ($NROUNDS >= 11) {
    $code .= "vbroadcastf64x2    `(16 * 11)`($AES_KEYS),$AESKEY2\n";
  }

  # ;; =================================================
  # ;; first phase of reduction
  if ($do_reduction != 0) {
    $code .= <<___;
        vpclmulqdq        \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
        vpslldq           \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]}                    # ; shift-L 2 DWs
        vpxorq            @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]}      # ; first phase of the reduction
___
  }
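  # ;; (first of the two reduction phases; POLY2 holds the bit-reflected GHASH
  # ;; reduction constant used by both vpclmulqdq phases)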

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES rounds up to 11 (AES-192) or 13 (AES-256); AES-128 is already done
  # ;; (the final round for every key size is handled by vaesenclast below)
  if ($NROUNDS >= 11) {
    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
      $B04_07,     $B08_11,   $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
    $code .= "vbroadcastf64x2    `(16 * 12)`($AES_KEYS),$AESKEY1\n";

    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
      $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
    if ($NROUNDS == 13) {
      $code .= "vbroadcastf64x2    `(16 * 13)`($AES_KEYS),$AESKEY2\n";

      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
        $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
        $B04_07,     $B08_11,   $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
      $code .= "vbroadcastf64x2    `(16 * 14)`($AES_KEYS),$AESKEY1\n";

      &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
        $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
        $B04_07,     $B08_11,   $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
    }
  }

  # ;; =================================================
  # ;; second phase of the reduction
  if ($do_reduction != 0) {
    $code .= <<___;
        vpclmulqdq        \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
        vpsrldq           \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]}      # ; shift-R 1-DW to obtain 2-DWs shift-R
        vpclmulqdq        \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
        vpslldq           \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]}      # ; shift-L 1-DW for result without shifts
        # ;; GH1H = GH1H + RED_T1 + RED_T2
        vpternlogq        \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; the last AES round
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vaesenclast", $B00_03, $B04_07,  $B08_11,  $B12_15,  $B00_03,
    $B04_07,     $B08_11,       $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; XOR against plain/cipher text
  &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
    $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
    $B04_07,     $B08_11,  $B12_15, $DATA1,  $DATA2,  $DATA3,  $DATA4);

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; retrieve the last cipher counter block (partially XOR'ed with text)
  # ;; - this is needed for partial block cases
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  } else {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($LAST_CIPHER_BLK)]}\n";
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; store cipher/plain text
  $code .= "mov       $CIPH_PLAIN_OUT,$IA0\n";
  &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $B00_03, $B04_07, $B08_11, $B12_15, $MASKREG);

  # ;; =================================================
  # ;; shuffle cipher text blocks for GHASH computation
  if ($ENC_DEC eq "ENC") {

    # ;; zero bytes outside the mask before hashing
    if ($NUM_BLOCKS <= 4) {
      $code .= "vmovdqu8           $B00_03,${B00_03}{$MASKREG}{z}\n";
    } elsif ($NUM_BLOCKS <= 8) {
      $code .= "vmovdqu8          $B04_07,${B04_07}{$MASKREG}{z}\n";
    } elsif ($NUM_BLOCKS <= 12) {
      $code .= "vmovdqu8          $B08_11,${B08_11}{$MASKREG}{z}\n";
    } else {
      $code .= "vmovdqu8          $B12_15,${B12_15}{$MASKREG}{z}\n";
    }

    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vpshufb", $DATA1,  $DATA2,  $DATA3,  $DATA4,  $B00_03,
      $B04_07,     $B08_11,   $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  } else {

    # ;; zero bytes outside the mask before hashing
    if ($NUM_BLOCKS <= 4) {
      $code .= "vmovdqu8          $DATA1,${DATA1}{$MASKREG}{z}\n";
    } elsif ($NUM_BLOCKS <= 8) {
      $code .= "vmovdqu8          $DATA2,${DATA2}{$MASKREG}{z}\n";
    } elsif ($NUM_BLOCKS <= 12) {
      $code .= "vmovdqu8          $DATA3,${DATA3}{$MASKREG}{z}\n";
    } else {
      $code .= "vmovdqu8          $DATA4,${DATA4}{$MASKREG}{z}\n";
    }

    &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
      $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2,  $DATA3,  $DATA4,  $DATA1,
      $DATA2,      $DATA3,    $DATA4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
  }
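  # ;; note: for "ENC" the freshly produced ciphertext (B00_03..B12_15) is byte-swapped
  # ;; into DATA1-4 for hashing; for decryption the original ciphertext input (DATA1-4)
  # ;; is hashed in place; in both cases bytes beyond the message length were zeroed
  # ;; via $MASKREG so they do not contribute to GHASH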

  # ;; =================================================
  # ;; Extract the last block for partial / multi_call cases
  if ($NUM_BLOCKS <= 4) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS-1)`,$DATA1,@{[XWORD($LAST_GHASH_BLK)]}\n";
  } elsif ($NUM_BLOCKS <= 8) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS-5)`,$DATA2,@{[XWORD($LAST_GHASH_BLK)]}\n";
  } elsif ($NUM_BLOCKS <= 12) {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS-9)`,$DATA3,@{[XWORD($LAST_GHASH_BLK)]}\n";
  } else {
    $code .= "vextracti32x4     \$`($NUM_BLOCKS-13)`,$DATA4,@{[XWORD($LAST_GHASH_BLK)]}\n";
  }

  # ;; =================================================
  # ;; in the do_reduction case GH1H already holds the reduced hash value
  # ;; - normally this would require "vmovdqa64 &XWORD($GH1H), &XWORD($HASH_IN_OUT)"
  # ;; - the register rename trick ($GH1H = $HASH_IN_OUT above) makes that move unnecessary

  # ;; =================================================
  # ;; GHASH last N blocks
  # ;; - current hash value in HASH_IN_OUT or
  # ;;   product parts in TO_REDUCE_H/M/L
  # ;; - DATA1-DATA4 include blocks for GHASH

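  # ;; when no reduction was done above, pass the outstanding TO_REDUCE_H/M/L sums to
  # ;; INITIAL_BLOCKS_PARTIAL_GHASH so it can fold them in; otherwise they are omitted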
  if ($do_reduction == 0) {
    &INITIAL_BLOCKS_PARTIAL_GHASH(
      $AES_KEYS,            $GCM128_CTX, $LENGTH,                  $NUM_BLOCKS,
      &XWORD($HASH_IN_OUT), $ENC_DEC,    $DATA1,                   $DATA2,
      $DATA3,               $DATA4,      &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
      $B00_03,              $B04_07,     $B08_11,                  $B12_15,
      $GHDAT1,              $GHDAT2,     $AESKEY1,                 $AESKEY2,
      $GHKEY1,              $IA0,        $PBLOCK_LEN,              $TO_REDUCE_H,
      $TO_REDUCE_M,         $TO_REDUCE_L);
  } else {
    &INITIAL_BLOCKS_PARTIAL_GHASH(
      $AES_KEYS,            $GCM128_CTX, $LENGTH,                  $NUM_BLOCKS,
      &XWORD($HASH_IN_OUT), $ENC_DEC,    $DATA1,                   $DATA2,
      $DATA3,               $DATA4,      &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
      $B00_03,              $B04_07,     $B08_11,                  $B12_15,
      $GHDAT1,              $GHDAT2,     $AESKEY1,                 $AESKEY2,
      $GHKEY1,              $IA0,        $PBLOCK_LEN);
  }
}