# GHASH_16_ENCRYPT_16_PARALLEL()
#
# Excerpt from crypto/fipsmodule/modes/asm/aesni-gcm-avx512.pl [2842:3271]

# ;; =================================================================
# ;; GHASH_16_ENCRYPT_16_PARALLEL - emit assembly that AES-CTR
# ;; encrypts/decrypts 16 blocks while GHASH-ing 16 previously stored
# ;; blocks, interleaving ("stitching") the AES rounds with the GHASH
# ;; carry-less multiplications to hide instruction latencies.
# ;;
# ;; This is a code generator: it appends AT&T-syntax assembly to the
# ;; file-level $code string.  All arguments are register-name strings
# ;; (ZMM/GPR) or numeric offsets chosen by the caller.
# ;;
# ;; Behavior visible in the emitted code below:
# ;;  - builds 16 big-endian counter blocks from $CTR_BE, with a
# ;;    byte-counter overflow fallback path (ddq_add_* tables),
# ;;  - runs the AES rounds on all 16 blocks; the number of rounds is
# ;;    taken from the file-level $NROUNDS (9, 11 or 13),
# ;;  - in parallel computes partial GHASH products over the 16 blocks
# ;;    saved at $GHASHIN_BLK_OFFSET(%rsp), either accumulating into
# ;;    $TO_REDUCE_{L,H,M} ("first_time"/"no_reduction") or completing
# ;;    the polynomial reduction into $GH1H/$ZT5 ("final_reduction"),
# ;;  - XORs the keystream against the input text, stores the result,
# ;;    and saves byte-swapped ciphertext at $AESOUT_BLK_OFFSET(%rsp)
# ;;    for the next GHASH pass.
# ;;
# ;; Depends on file-level helpers (defined elsewhere in this file):
# ;; random_string(), HashKeyByIdx(), VHPXORI4x128(), BYTE(), XWORD().
sub GHASH_16_ENCRYPT_16_PARALLEL {
  my $AES_KEYS           = $_[0];     # [in] key pointer
  my $CIPH_PLAIN_OUT     = $_[1];     # [in] pointer to output buffer
  my $PLAIN_CIPH_IN      = $_[2];     # [in] pointer to input buffer
  my $DATA_OFFSET        = $_[3];     # [in] data offset
  my $CTR_BE             = $_[4];     # [in/out] ZMM counter blocks (last 4) in big-endian
  my $CTR_CHECK          = $_[5];     # [in/out] GP with 8-bit counter for overflow check
  my $HASHKEY_OFFSET     = $_[6];     # [in] numerical offset for the highest hash key (hash key index value)
  my $AESOUT_BLK_OFFSET  = $_[7];     # [in] numerical offset for AES-CTR out
  my $GHASHIN_BLK_OFFSET = $_[8];     # [in] numerical offset for GHASH blocks in
  my $SHFMSK             = $_[9];     # [in] ZMM with byte swap mask for pshufb
  my $ZT1                = $_[10];    # [clobbered] temporary ZMM (cipher)
  my $ZT2                = $_[11];    # [clobbered] temporary ZMM (cipher)
  my $ZT3                = $_[12];    # [clobbered] temporary ZMM (cipher)
  my $ZT4                = $_[13];    # [clobbered] temporary ZMM (cipher)
  my $ZT5                = $_[14];    # [clobbered/out] temporary ZMM or GHASH OUT (final_reduction)
  my $ZT6                = $_[15];    # [clobbered] temporary ZMM (cipher)
  my $ZT7                = $_[16];    # [clobbered] temporary ZMM (cipher)
  my $ZT8                = $_[17];    # [clobbered] temporary ZMM (cipher)
  my $ZT9                = $_[18];    # [clobbered] temporary ZMM (cipher)
  my $ZT10               = $_[19];    # [clobbered] temporary ZMM (ghash)
  my $ZT11               = $_[20];    # [clobbered] temporary ZMM (ghash)
  my $ZT12               = $_[21];    # [clobbered] temporary ZMM (ghash)
  my $ZT13               = $_[22];    # [clobbered] temporary ZMM (ghash)
  my $ZT14               = $_[23];    # [clobbered] temporary ZMM (ghash)
  my $ZT15               = $_[24];    # [clobbered] temporary ZMM (ghash)
  my $ZT16               = $_[25];    # [clobbered] temporary ZMM (ghash)
  my $ZT17               = $_[26];    # [clobbered] temporary ZMM (ghash)
  my $ZT18               = $_[27];    # [clobbered] temporary ZMM (ghash)
  my $ZT19               = $_[28];    # [clobbered] temporary ZMM
  my $ZT20               = $_[29];    # [clobbered] temporary ZMM
  my $ZT21               = $_[30];    # [clobbered] temporary ZMM
  my $ZT22               = $_[31];    # [clobbered] temporary ZMM
  my $ZT23               = $_[32];    # [clobbered] temporary ZMM
  my $ADDBE_4x4          = $_[33];    # [in] ZMM with 4x128bits 4 in big-endian
  my $ADDBE_1234         = $_[34];    # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
  my $TO_REDUCE_L        = $_[35];    # [in/out] ZMM for low 4x128-bit GHASH sum
  my $TO_REDUCE_H        = $_[36];    # [in/out] ZMM for hi 4x128-bit GHASH sum
  my $TO_REDUCE_M        = $_[37];    # [in/out] ZMM for medium 4x128-bit GHASH sum
  my $DO_REDUCTION       = $_[38];    # [in] "no_reduction", "final_reduction", "first_time"
  my $ENC_DEC            = $_[39];    # [in] cipher direction
  my $DATA_DISPL         = $_[40];    # [in] fixed numerical data displacement/offset
  my $GHASH_IN           = $_[41];    # [in] current GHASH value or "no_ghash_in"
  my $IA0                = $_[42];    # [clobbered] temporary GPR

  # ; Register aliases below name the ZMM temporaries by role so the
  # ; emitted assembly reads as data flow rather than as ZTnn soup.

  # ; AES state: 16 counter/cipher blocks, 4 per ZMM
  my $B00_03 = $ZT1;
  my $B04_07 = $ZT2;
  my $B08_11 = $ZT3;
  my $B12_15 = $ZT4;

  my $GH1H = $ZT5;

  # ; @note: do not change this mapping
  my $GH1L = $ZT6;
  my $GH1M = $ZT7;
  my $GH1T = $ZT8;

  my $GH2H = $ZT9;
  my $GH2L = $ZT10;
  my $GH2M = $ZT11;
  my $GH2T = $ZT12;

  # ; reduction temporaries reuse the GH2x registers (dead by then)
  my $RED_POLY = $GH2T;
  my $RED_P1   = $GH2L;
  my $RED_T1   = $GH2H;
  my $RED_T2   = $GH2M;

  my $GH3H = $ZT13;
  my $GH3L = $ZT14;
  my $GH3M = $ZT15;
  my $GH3T = $ZT16;

  # ; input text blocks recycle the GH3x registers (dead after gather)
  my $DATA1 = $ZT13;
  my $DATA2 = $ZT14;
  my $DATA3 = $ZT15;
  my $DATA4 = $ZT16;

  my $AESKEY1 = $ZT17;
  my $AESKEY2 = $ZT18;

  my $GHKEY1 = $ZT19;
  my $GHKEY2 = $ZT20;
  my $GHDAT1 = $ZT21;
  my $GHDAT2 = $ZT22;

  # ; unique suffix keeps the emitted local labels distinct per expansion
  my $rndsuffix = &random_string();

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; prepare counter blocks
  # ;; Fast path: the low counter byte will not wrap across these 16
  # ;; blocks, so add big-endian increments directly.  Slow path:
  # ;; byte-swap to little-endian, add, and swap back.

  $code .= <<___;
        cmpb              \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
        jae               .L_16_blocks_overflow_${rndsuffix}
        vpaddd            $ADDBE_1234,$CTR_BE,$B00_03
        vpaddd            $ADDBE_4x4,$B00_03,$B04_07
        vpaddd            $ADDBE_4x4,$B04_07,$B08_11
        vpaddd            $ADDBE_4x4,$B08_11,$B12_15
        jmp               .L_16_blocks_ok_${rndsuffix}
.L_16_blocks_overflow_${rndsuffix}:
        vpshufb           $SHFMSK,$CTR_BE,$CTR_BE
        vmovdqa64         ddq_add_4444(%rip),$B12_15
        vpaddd            ddq_add_1234(%rip),$CTR_BE,$B00_03
        vpaddd            $B12_15,$B00_03,$B04_07
        vpaddd            $B12_15,$B04_07,$B08_11
        vpaddd            $B12_15,$B08_11,$B12_15
        vpshufb           $SHFMSK,$B00_03,$B00_03
        vpshufb           $SHFMSK,$B04_07,$B04_07
        vpshufb           $SHFMSK,$B08_11,$B08_11
        vpshufb           $SHFMSK,$B12_15,$B12_15
.L_16_blocks_ok_${rndsuffix}:
___

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; pre-load constants
  # ;; First GHASH input: the previous GHASH value (if any) is folded
  # ;; into the highest-numbered stored block before multiplication.
  $code .= "vbroadcastf64x2    `(16 * 0)`($AES_KEYS),$AESKEY1\n";
  if ($GHASH_IN ne "no_ghash_in") {
    $code .= "vpxorq            `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n";
  } else {
    $code .= "vmovdqa64         `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
  }

  $code .= <<___;
        vmovdqu64         @{[HashKeyByIdx(($HASHKEY_OFFSET - (0*4)),"%rsp")]},$GHKEY1

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; save counter for the next round
        # ;; increment counter overflow check register
        vshufi64x2        \$0b11111111,$B12_15,$B12_15,$CTR_BE
        addb              \$16,@{[BYTE($CTR_CHECK)]}
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; pre-load constants
        vbroadcastf64x2    `(16 * 1)`($AES_KEYS),$AESKEY2
        vmovdqu64         @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2
        vmovdqa64         `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; stitch AES rounds with GHASH

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES round 0 - ARK

        vpxorq            $AESKEY1,$B00_03,$B00_03
        vpxorq            $AESKEY1,$B04_07,$B04_07
        vpxorq            $AESKEY1,$B08_11,$B08_11
        vpxorq            $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 2)`($AES_KEYS),$AESKEY1

        # ;;==================================================
        # ;; GHASH 4 blocks (15 to 12)
        vpclmulqdq        \$0x11,$GHKEY1,$GHDAT1,$GH1H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY1,$GHDAT1,$GH1L      # ; a0*b0
        vpclmulqdq        \$0x01,$GHKEY1,$GHDAT1,$GH1M      # ; a1*b0
        vpclmulqdq        \$0x10,$GHKEY1,$GHDAT1,$GH1T      # ; a0*b1
        vmovdqu64         @{[HashKeyByIdx(($HASHKEY_OFFSET - (2*4)),"%rsp")]},$GHKEY1
        vmovdqa64         `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES round 1
        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 3)`($AES_KEYS),$AESKEY2

        # ;; =================================================
        # ;; GHASH 4 blocks (11 to 8)
        vpclmulqdq        \$0x10,$GHKEY2,$GHDAT2,$GH2M      # ; a0*b1
        vpclmulqdq        \$0x01,$GHKEY2,$GHDAT2,$GH2T      # ; a1*b0
        vpclmulqdq        \$0x11,$GHKEY2,$GHDAT2,$GH2H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY2,$GHDAT2,$GH2L      # ; a0*b0
        vmovdqu64         @{[HashKeyByIdx(($HASHKEY_OFFSET - (3*4)),"%rsp")]},$GHKEY2
        vmovdqa64         `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES round 2
        vaesenc           $AESKEY1,$B00_03,$B00_03
        vaesenc           $AESKEY1,$B04_07,$B04_07
        vaesenc           $AESKEY1,$B08_11,$B08_11
        vaesenc           $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 4)`($AES_KEYS),$AESKEY1

        # ;; =================================================
        # ;; GHASH 4 blocks (7 to 4)
        vpclmulqdq        \$0x10,$GHKEY1,$GHDAT1,$GH3M      # ; a0*b1
        vpclmulqdq        \$0x01,$GHKEY1,$GHDAT1,$GH3T      # ; a1*b0
        vpclmulqdq        \$0x11,$GHKEY1,$GHDAT1,$GH3H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY1,$GHDAT1,$GH3L      # ; a0*b0
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES rounds 3
        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 5)`($AES_KEYS),$AESKEY2

        # ;; =================================================
        # ;; Gather (XOR) GHASH for 12 blocks
        vpternlogq        \$0x96,$GH3H,$GH2H,$GH1H
        vpternlogq        \$0x96,$GH3L,$GH2L,$GH1L
        vpternlogq        \$0x96,$GH3T,$GH2T,$GH1T
        vpternlogq        \$0x96,$GH3M,$GH2M,$GH1M

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES rounds 4
        vaesenc           $AESKEY1,$B00_03,$B00_03
        vaesenc           $AESKEY1,$B04_07,$B04_07
        vaesenc           $AESKEY1,$B08_11,$B08_11
        vaesenc           $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 6)`($AES_KEYS),$AESKEY1

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; load plain/cipher text (recycle GH3xx registers)
        vmovdqu8          `$DATA_DISPL + (0 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA1
        vmovdqu8          `$DATA_DISPL + (1 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA2
        vmovdqu8          `$DATA_DISPL + (2 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA3
        vmovdqu8          `$DATA_DISPL + (3 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA4

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES rounds 5
        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 7)`($AES_KEYS),$AESKEY2

        # ;; =================================================
        # ;; GHASH 4 blocks (3 to 0)
        vpclmulqdq        \$0x10,$GHKEY2,$GHDAT2,$GH2M      # ; a0*b1
        vpclmulqdq        \$0x01,$GHKEY2,$GHDAT2,$GH2T      # ; a1*b0
        vpclmulqdq        \$0x11,$GHKEY2,$GHDAT2,$GH2H      # ; a1*b1
        vpclmulqdq        \$0x00,$GHKEY2,$GHDAT2,$GH2L      # ; a0*b0
        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; AES round 6
        vaesenc           $AESKEY1,$B00_03,$B00_03
        vaesenc           $AESKEY1,$B04_07,$B04_07
        vaesenc           $AESKEY1,$B08_11,$B08_11
        vaesenc           $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 8)`($AES_KEYS),$AESKEY1
___

  # ;; =================================================
  # ;; gather GHASH in GH1L (low) and GH1H (high)
  # ;; Three accumulation modes:
  # ;;  - "first_time": initialize TO_REDUCE_{L,H,M} from this batch,
  # ;;  - "no_reduction": XOR this batch into TO_REDUCE_{L,H,M},
  # ;;  - "final_reduction": fold everything and reduce mod the GHASH
  # ;;    polynomial; the result lands in GH1H (= $ZT5, the [out] arg).
  if ($DO_REDUCTION eq "first_time") {
    $code .= <<___;
        vpternlogq        \$0x96,$GH2T,$GH1T,$GH1M      # ; TM
        vpxorq            $GH2M,$GH1M,$TO_REDUCE_M      # ; TM
        vpxorq            $GH2H,$GH1H,$TO_REDUCE_H      # ; TH
        vpxorq            $GH2L,$GH1L,$TO_REDUCE_L      # ; TL
___
  }
  if ($DO_REDUCTION eq "no_reduction") {
    $code .= <<___;
        vpternlogq        \$0x96,$GH2T,$GH1T,$GH1M             # ; TM
        vpternlogq        \$0x96,$GH2M,$GH1M,$TO_REDUCE_M      # ; TM
        vpternlogq        \$0x96,$GH2H,$GH1H,$TO_REDUCE_H      # ; TH
        vpternlogq        \$0x96,$GH2L,$GH1L,$TO_REDUCE_L      # ; TL
___
  }
  if ($DO_REDUCTION eq "final_reduction") {
    $code .= <<___;
        # ;; phase 1: add mid products together
        # ;; also load polynomial constant for reduction
        vpternlogq        \$0x96,$GH2T,$GH1T,$GH1M      # ; TM
        vpternlogq        \$0x96,$GH2M,$TO_REDUCE_M,$GH1M

        vpsrldq           \$8,$GH1M,$GH2M
        vpslldq           \$8,$GH1M,$GH1M

        vmovdqa64         POLY2(%rip),@{[XWORD($RED_POLY)]}
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 7
  $code .= <<___;
        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 9)`($AES_KEYS),$AESKEY2
___

  # ;; =================================================
  # ;; Add mid product to high and low
  if ($DO_REDUCTION eq "final_reduction") {
    $code .= <<___;
        vpternlogq        \$0x96,$GH2M,$GH2H,$GH1H      # ; TH = TH1 + TH2 + TM>>64
        vpxorq            $TO_REDUCE_H,$GH1H,$GH1H
        vpternlogq        \$0x96,$GH1M,$GH2L,$GH1L      # ; TL = TL1 + TL2 + TM<<64
        vpxorq            $TO_REDUCE_L,$GH1L,$GH1L
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 8
  $code .= <<___;
        vaesenc           $AESKEY1,$B00_03,$B00_03
        vaesenc           $AESKEY1,$B04_07,$B04_07
        vaesenc           $AESKEY1,$B08_11,$B08_11
        vaesenc           $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 10)`($AES_KEYS),$AESKEY1
___

  # ;; =================================================
  # ;; horizontal xor of low and high 4x128
  # ;; (folds the four 128-bit lanes of each ZMM into one 128-bit value)
  if ($DO_REDUCTION eq "final_reduction") {
    &VHPXORI4x128($GH1H, $GH2H);
    &VHPXORI4x128($GH1L, $GH2L);
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES round 9
  $code .= <<___;
        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
___
  # ;; AES128 ($NROUNDS == 9) loads no further round keys
  if (($NROUNDS >= 11)) {
    $code .= "vbroadcastf64x2    `(16 * 11)`($AES_KEYS),$AESKEY2\n";
  }

  # ;; =================================================
  # ;; first phase of reduction
  if ($DO_REDUCTION eq "final_reduction") {
    $code .= <<___;
        vpclmulqdq        \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
        vpslldq           \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]}                    # ; shift-L 2 DWs
        vpxorq            @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]}      # ; first phase of the reduct
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; AES rounds up to 11 (AES192) or 13 (AES256)
  # ;; AES128 is done
  if (($NROUNDS >= 11)) {
    $code .= <<___;
        vaesenc           $AESKEY1,$B00_03,$B00_03
        vaesenc           $AESKEY1,$B04_07,$B04_07
        vaesenc           $AESKEY1,$B08_11,$B08_11
        vaesenc           $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 12)`($AES_KEYS),$AESKEY1

        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
___
    if (($NROUNDS == 13)) {
      $code .= <<___;
        vbroadcastf64x2    `(16 * 13)`($AES_KEYS),$AESKEY2

        vaesenc           $AESKEY1,$B00_03,$B00_03
        vaesenc           $AESKEY1,$B04_07,$B04_07
        vaesenc           $AESKEY1,$B08_11,$B08_11
        vaesenc           $AESKEY1,$B12_15,$B12_15
        vbroadcastf64x2    `(16 * 14)`($AES_KEYS),$AESKEY1

        vaesenc           $AESKEY2,$B00_03,$B00_03
        vaesenc           $AESKEY2,$B04_07,$B04_07
        vaesenc           $AESKEY2,$B08_11,$B08_11
        vaesenc           $AESKEY2,$B12_15,$B12_15
___
    }
  }

  # ;; =================================================
  # ;; second phase of the reduction
  if ($DO_REDUCTION eq "final_reduction") {
    $code .= <<___;
        vpclmulqdq        \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
        vpsrldq           \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]}      # ; shift-R 1-DW to obtain 2-DWs shift-R
        vpclmulqdq        \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
        vpslldq           \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]}      # ; shift-L 1-DW for result without shifts
        # ;; GH1H = GH1H x RED_T1 x RED_T2
        vpternlogq        \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;; the last AES round
  $code .= <<___;
        vaesenclast       $AESKEY1,$B00_03,$B00_03
        vaesenclast       $AESKEY1,$B04_07,$B04_07
        vaesenclast       $AESKEY1,$B08_11,$B08_11
        vaesenclast       $AESKEY1,$B12_15,$B12_15

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; XOR against plain/cipher text
        vpxorq            $DATA1,$B00_03,$B00_03
        vpxorq            $DATA2,$B04_07,$B04_07
        vpxorq            $DATA3,$B08_11,$B08_11
        vpxorq            $DATA4,$B12_15,$B12_15

        # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        # ;; store cipher/plain text
        mov               $CIPH_PLAIN_OUT,$IA0
        vmovdqu8          $B00_03,`$DATA_DISPL + (0 * 64)`($IA0,$DATA_OFFSET,1)
        vmovdqu8          $B04_07,`$DATA_DISPL + (1 * 64)`($IA0,$DATA_OFFSET,1)
        vmovdqu8          $B08_11,`$DATA_DISPL + (2 * 64)`($IA0,$DATA_OFFSET,1)
        vmovdqu8          $B12_15,`$DATA_DISPL + (3 * 64)`($IA0,$DATA_OFFSET,1)
___

  # ;; =================================================
  # ;; shuffle cipher text blocks for GHASH computation
  # ;; GHASH always runs over the CIPHERTEXT: when encrypting that is
  # ;; the just-produced output (B* registers); when decrypting it is
  # ;; the original input text still held in the DATA* registers.
  if ($ENC_DEC eq "ENC") {
    $code .= <<___;
        vpshufb           $SHFMSK,$B00_03,$B00_03
        vpshufb           $SHFMSK,$B04_07,$B04_07
        vpshufb           $SHFMSK,$B08_11,$B08_11
        vpshufb           $SHFMSK,$B12_15,$B12_15
___
  } else {
    $code .= <<___;
        vpshufb           $SHFMSK,$DATA1,$B00_03
        vpshufb           $SHFMSK,$DATA2,$B04_07
        vpshufb           $SHFMSK,$DATA3,$B08_11
        vpshufb           $SHFMSK,$DATA4,$B12_15
___
  }

  # ;; =================================================
  # ;; store shuffled cipher text for ghashing
  $code .= <<___;
        vmovdqa64         $B00_03,`$AESOUT_BLK_OFFSET + (0*64)`(%rsp)
        vmovdqa64         $B04_07,`$AESOUT_BLK_OFFSET + (1*64)`(%rsp)
        vmovdqa64         $B08_11,`$AESOUT_BLK_OFFSET + (2*64)`(%rsp)
        vmovdqa64         $B12_15,`$AESOUT_BLK_OFFSET + (3*64)`(%rsp)
___
}