sub GCM_ENC_DEC()

in crypto/fipsmodule/modes/asm/aesni-gcm-avx512.pl [3553:4039]


# ;; =====================================================
# ;; GCM_ENC_DEC
# ;; Appends to $code the assembly for one AES-GCM encrypt/decrypt update:
# ;; - finish a partial block left by a previous update (PARTIAL_BLOCK),
# ;; - AES-CTR cipher the input while folding the data into GHASH, picking a
# ;;   code path by the amount of data left (see "Macro flow" below),
# ;; - store the last counter block and the hash back into the context.
# ;;
# ;; Arguments are positional ($_[0] .. $_[6]) and are operand strings that
# ;; get interpolated into the emitted instructions.
# ;;
# ;; Nothing is returned; the only effect is text appended to $code.
# ;; Every local label carries the $rndsuffix obtained from random_string().
# ;;
# ;; Exit labels emitted at the end, in fall-through order:
# ;; - .L_ghash_done_*    : store the counter block, then continue below
# ;; - .L_enc_dec_done_*  : byte-shuffle AAD_HASH and store it to the context
# ;;                        (also reached directly when PARTIAL_BLOCK left
# ;;                        no data to process)
# ;; - .L_enc_dec_abort_* : reached directly when the buffer length is zero;
# ;;                        nothing is written to the context
sub GCM_ENC_DEC {
  my $AES_KEYS       = $_[0];    # [in] AES Key schedule
  my $GCM128_CTX     = $_[1];    # [in] context pointer
  my $PBLOCK_LEN     = $_[2];    # [in/out] length of partial block at the moment of previous update
  my $PLAIN_CIPH_IN  = $_[3];    # [in] input buffer pointer
  my $PLAIN_CIPH_LEN = $_[4];    # [in] buffer length
  my $CIPH_PLAIN_OUT = $_[5];    # [in] output buffer pointer
  my $ENC_DEC        = $_[6];    # [in] cipher direction

  # ;; General purpose scratch registers used by this macro and handed to the
  # ;; helper macros it expands.
  my $IA0 = "%r10";
  my $IA1 = "%r12";
  my $IA2 = "%r13";
  my $IA3 = "%r15";
  my $IA4 = "%rax";
  my $IA5 = "%r11";
  my $IA6 = "%rbx";
  my $IA7 = "%r14";

  # ;; Amount of data left to process. On win64 it lives in $IA2 (loaded from
  # ;; PLAIN_CIPH_LEN only after PARTIAL_BLOCK, which also gets $IA2 as an
  # ;; argument); otherwise it is PLAIN_CIPH_LEN itself, which is then
  # ;; modified in place.
  my $LENGTH = $win64 ? $IA2 : $PLAIN_CIPH_LEN;

  # ;; Named roles for some of the scratch registers above.
  # ;; NOTE: $IA3 (CTR_CHECK) is also passed to GCM_ENC_DEC_SMALL; that path is
  # ;; branched to before CTR_CHECK is set up.
  my $CTR_CHECK   = $IA3;
  my $DATA_OFFSET = $IA4;
  my $HASHK_PTR   = $IA6;

  # ;; Zeroed on entry; set to 1 after the "last32" / "mid16" hash key
  # ;; precomputation on the stack.
  my $HKEYS_READY = $IA7;

  # ;; Counter block: zmm and xmm views of the same register.
  my $CTR_BLOCKz = "%zmm2";
  my $CTR_BLOCKx = "%xmm2";

  # ; hardcoded in GCM_INIT
  # ; NOTE(review): presumably this refers to the AAD_HASH register choice
  # ; below - confirm against GCM_INIT.

  my $AAD_HASHz = "%zmm14";
  my $AAD_HASHx = "%xmm14";

  # ;; Vector temporaries handed to the helper macros.
  my $ZTMP0  = "%zmm0";
  my $ZTMP1  = "%zmm3";
  my $ZTMP2  = "%zmm4";
  my $ZTMP3  = "%zmm5";
  my $ZTMP4  = "%zmm6";
  my $ZTMP5  = "%zmm7";
  my $ZTMP6  = "%zmm10";
  my $ZTMP7  = "%zmm11";
  my $ZTMP8  = "%zmm12";
  my $ZTMP9  = "%zmm13";
  my $ZTMP10 = "%zmm15";
  my $ZTMP11 = "%zmm16";
  my $ZTMP12 = "%zmm17";

  my $ZTMP13 = "%zmm19";
  my $ZTMP14 = "%zmm20";
  my $ZTMP15 = "%zmm21";
  my $ZTMP16 = "%zmm30";
  my $ZTMP17 = "%zmm31";
  my $ZTMP18 = "%zmm1";
  my $ZTMP19 = "%zmm18";
  my $ZTMP20 = "%zmm8";
  my $ZTMP21 = "%zmm22";
  my $ZTMP22 = "%zmm23";

  # ;; GHASH accumulators shared between the GHASH helper macros
  # ;; (presumably high / low / middle partial products - confirm against
  # ;; GHASH_16), and the byte shuffle mask register.
  my $GH        = "%zmm24";
  my $GL        = "%zmm25";
  my $GM        = "%zmm26";
  my $SHUF_MASK = "%zmm29";

  # ;; Unused in the small packet path
  my $ADDBE_4x4  = "%zmm27";
  my $ADDBE_1234 = "%zmm28";

  my $MASKREG = "%k1";

  # ;; Suffix appended to every label emitted by this expansion.
  my $rndsuffix = &random_string();

  # ;; reduction every 48 blocks, depth 32 blocks
  # ;; @note 48 blocks is the maximum capacity of the stack frame
  my $big_loop_nblocks = 48;
  my $big_loop_depth   = 32;

  # ;;; Macro flow depending on packet size
  # ;;; (LENGTH is the data left after PARTIAL_BLOCK; data_to_cipher is what
  # ;;;  is left after the first 2 x 16 blocks have been ciphered)
  # ;;; - LENGTH <= 16 blocks
  # ;;;   - cipher followed by hashing (reduction)
  # ;;; - 16 blocks < LENGTH < 32 blocks
  # ;;;   - cipher 16 blocks
  # ;;;   - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  # ;;; - 32 blocks <= LENGTH < 48 blocks
  # ;;;   - cipher 2 x 16 blocks
  # ;;;   - hash 16 blocks
  # ;;;   - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  # ;;; - LENGTH >= 48 blocks
  # ;;;   - cipher 2 x 16 blocks
  # ;;;   - while (data_to_cipher >= 48 blocks):
  # ;;;     - cipher 16 blocks & hash 16 blocks
  # ;;;     - cipher 16 blocks & hash 16 blocks
  # ;;;     - cipher 16 blocks & hash 16 blocks (reduction)
  # ;;;   - if (data_to_cipher >= 32 blocks):
  # ;;;     - cipher 16 blocks & hash 16 blocks
  # ;;;     - cipher 16 blocks & hash 16 blocks
  # ;;;     - hash 16 blocks (reduction)
  # ;;;     - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  # ;;;   - elif (data_to_cipher >= 16 blocks):
  # ;;;     - cipher 16 blocks & hash 16 blocks
  # ;;;     - hash 16 blocks
  # ;;;     - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
  # ;;;   - else:
  # ;;;     - hash 16 blocks
  # ;;;     - cipher N blocks & hash 16 blocks, hash N blocks (reduction)

  # ;; Zero-length input: leave through the abort label, skipping every
  # ;; store to the context. Both forms only set the flags for the "je".
  if ($win64) {
    $code .= "cmpq              \$0,$PLAIN_CIPH_LEN\n";
  } else {
    $code .= "or                $PLAIN_CIPH_LEN,$PLAIN_CIPH_LEN\n";
  }
  $code .= "je                 .L_enc_dec_abort_${rndsuffix}\n";

  # ;; No hash keys have been precomputed on the stack yet.
  $code .= "xor                $HKEYS_READY, $HKEYS_READY\n";

  # ;; Load the running hash from the context.
  $code .= "vmovdqu64         `$CTX_OFFSET_AadHash`($GCM128_CTX),$AAD_HASHx\n";

  # BE -> LE conversion
  $code .= "vpshufb           SHUF_MASK(%rip),$AAD_HASHx,$AAD_HASHx\n";

  # ;; Used for the update flow - if there was a previous partial
  # ;; block fill the remaining bytes here.
  # ;; NOTE(review): presumably PARTIAL_BLOCK leaves the number of bytes it
  # ;; consumed in DATA_OFFSET (it is subtracted from LENGTH below) - confirm.
  &PARTIAL_BLOCK(
    $GCM128_CTX,  $PBLOCK_LEN, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN,
    $DATA_OFFSET, $AAD_HASHx,  $ENC_DEC,        $IA0,           $IA1,
    $IA2,         $ZTMP0,      $ZTMP1,          $ZTMP2,         $ZTMP3,
    $ZTMP4,       $ZTMP5,      $ZTMP6,          $ZTMP7,         $MASKREG);

  # ;; Load the current counter block from the context.
  $code .= "vmovdqu64         `$CTX_OFFSET_CurCount`($GCM128_CTX),$CTR_BLOCKx\n";

  # ;; Save the amount of data left to process in $LENGTH
  # ;; NOTE: PLAIN_CIPH_LEN is a register on linux;
  if ($win64) {
    $code .= "mov               $PLAIN_CIPH_LEN,$LENGTH\n";
  }

  # ;; There may be no more data if it was consumed in the partial block.
  # ;; In that case only the hash is written back (the counter store at
  # ;; .L_ghash_done is skipped).
  $code .= <<___;
        sub               $DATA_OFFSET,$LENGTH
        je                .L_enc_dec_done_${rndsuffix}
___

  # ;; Up to 16 blocks go to the small packet path; anything larger loads the
  # ;; shuffle / counter-increment constants and prepares the counter block.
  $code .= <<___;
        cmp               \$`(16 * 16)`,$LENGTH
        jbe              .L_message_below_equal_16_blocks_${rndsuffix}

        vmovdqa64         SHUF_MASK(%rip),$SHUF_MASK
        vmovdqa64         ddq_addbe_4444(%rip),$ADDBE_4x4
        vmovdqa64         ddq_addbe_1234(%rip),$ADDBE_1234

        # ;; start the pipeline
        # ;; - 32 blocks aes-ctr
        # ;; - 16 blocks ghash + aes-ctr

        # ;; set up CTR_CHECK
        vmovd             $CTR_BLOCKx,@{[DWORD($CTR_CHECK)]}
        and               \$255,@{[DWORD($CTR_CHECK)]}
        # ;; in LE format after init, convert to BE
        vshufi64x2        \$0,$CTR_BLOCKz,$CTR_BLOCKz,$CTR_BLOCKz
        vpshufb           $SHUF_MASK,$CTR_BLOCKz,$CTR_BLOCKz
___

  # ;; ==== AES-CTR - first 16 blocks
  # ;; AES output goes to the first 16-block slot of the local stack area.
  my $aesout_offset      = ($STACK_LOCAL_OFFSET + (0 * 16));
  my $data_in_out_offset = 0;
  &INITIAL_BLOCKS_16(
    $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS,      $DATA_OFFSET,        "no_ghash", $CTR_BLOCKz,
    $CTR_CHECK,     $ADDBE_4x4,      $ADDBE_1234,    $ZTMP0,              $ZTMP1,     $ZTMP2,
    $ZTMP3,         $ZTMP4,          $ZTMP5,         $ZTMP6,              $ZTMP7,     $ZTMP8,
    $SHUF_MASK,     $ENC_DEC,        $aesout_offset, $data_in_out_offset, $IA0);

  # ;; Get Htable pointer
  # ;; ($IA1 keeps this pointer for the "last32" precompute call further down)
  $code .= "lea               `$CTX_OFFSET_HTable`($GCM128_CTX),$IA1\n";
  &precompute_hkeys_on_stack($IA1, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, "first16");

  # ;; Fewer than 32 blocks in total: only the 16 blocks above were ciphered.
  $code .= <<___;
        cmp               \$`(32 * 16)`,$LENGTH
        jb                .L_message_below_32_blocks_${rndsuffix}
___

  # ;; ==== AES-CTR - next 16 blocks
  # ;; AES output goes to the second 16-block slot of the local stack area.
  $aesout_offset      = ($STACK_LOCAL_OFFSET + (16 * 16));
  $data_in_out_offset = (16 * 16);
  &INITIAL_BLOCKS_16(
    $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS,      $DATA_OFFSET,        "no_ghash", $CTR_BLOCKz,
    $CTR_CHECK,     $ADDBE_4x4,      $ADDBE_1234,    $ZTMP0,              $ZTMP1,     $ZTMP2,
    $ZTMP3,         $ZTMP4,          $ZTMP5,         $ZTMP6,              $ZTMP7,     $ZTMP8,
    $SHUF_MASK,     $ENC_DEC,        $aesout_offset, $data_in_out_offset, $IA0);

  &precompute_hkeys_on_stack($IA1, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, "last32");
  $code .= "mov     \$1,$HKEYS_READY\n";

  # ;; Account for the 32 blocks ciphered so far; from here on LENGTH is the
  # ;; data still to be ciphered. Skip the big loop if that is < 48 blocks.
  $code .= <<___;
        add               \$`(32 * 16)`,$DATA_OFFSET
        sub               \$`(32 * 16)`,$LENGTH

        cmp               \$`($big_loop_nblocks * 16)`,$LENGTH
        jb                .L_no_more_big_nblocks_${rndsuffix}
___

  # ;; ====
  # ;; ==== AES-CTR + GHASH - 48 blocks loop
  # ;; ====
  # ;; The AES output offset and the GHASH input offset rotate through the
  # ;; three 16-block slots of the local stack area across the three calls.
  # ;; NOTE(review): the literal 48 / 32 / 16 argument is presumably the
  # ;; hash key index for the 16 blocks being hashed - confirm against
  # ;; GHASH_16_ENCRYPT_16_PARALLEL.
  $code .= ".L_encrypt_big_nblocks_${rndsuffix}:\n";

  # ;; ==== AES-CTR + GHASH - 16 blocks, start
  $aesout_offset      = ($STACK_LOCAL_OFFSET + (32 * 16));
  $data_in_out_offset = (0 * 16);
  my $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $CTR_BLOCKz,         $CTR_CHECK,
    48,        $aesout_offset,  $ghashin_offset, $SHUF_MASK,   $ZTMP0,              $ZTMP1,
    $ZTMP2,    $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,              $ZTMP7,
    $ZTMP8,    $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,             $ZTMP13,
    $ZTMP14,   $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,             $ZTMP19,
    $ZTMP20,   $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,         $GL,
    $GH,       $GM,             "first_time",    $ENC_DEC,     $data_in_out_offset, $AAD_HASHz,
    $IA0);

  # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
  $aesout_offset      = ($STACK_LOCAL_OFFSET + (0 * 16));
  $data_in_out_offset = (16 * 16);
  $ghashin_offset     = ($STACK_LOCAL_OFFSET + (16 * 16));
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $CTR_BLOCKz,         $CTR_CHECK,
    32,        $aesout_offset,  $ghashin_offset, $SHUF_MASK,   $ZTMP0,              $ZTMP1,
    $ZTMP2,    $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,              $ZTMP7,
    $ZTMP8,    $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,             $ZTMP13,
    $ZTMP14,   $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,             $ZTMP19,
    $ZTMP20,   $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,         $GL,
    $GH,       $GM,             "no_reduction",  $ENC_DEC,     $data_in_out_offset, "no_ghash_in",
    $IA0);

  # ;; ==== AES-CTR + GHASH - 16 blocks, reduction
  $aesout_offset      = ($STACK_LOCAL_OFFSET + (16 * 16));
  $data_in_out_offset = (32 * 16);
  $ghashin_offset     = ($STACK_LOCAL_OFFSET + (32 * 16));
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,    $DATA_OFFSET, $CTR_BLOCKz,         $CTR_CHECK,
    16,        $aesout_offset,  $ghashin_offset,   $SHUF_MASK,   $ZTMP0,              $ZTMP1,
    $ZTMP2,    $ZTMP3,          $ZTMP4,            $ZTMP5,       $ZTMP6,              $ZTMP7,
    $ZTMP8,    $ZTMP9,          $ZTMP10,           $ZTMP11,      $ZTMP12,             $ZTMP13,
    $ZTMP14,   $ZTMP15,         $ZTMP16,           $ZTMP17,      $ZTMP18,             $ZTMP19,
    $ZTMP20,   $ZTMP21,         $ZTMP22,           $ADDBE_4x4,   $ADDBE_1234,         $GL,
    $GH,       $GM,             "final_reduction", $ENC_DEC,     $data_in_out_offset, "no_ghash_in",
    $IA0);

  # ;; === copy ZTMP4 into AAD_HASH so it feeds the next GHASH pass
  # ;; NOTE(review): presumably the "final_reduction" call above leaves the
  # ;; reduced hash in ZTMP4 - confirm against GHASH_16_ENCRYPT_16_PARALLEL.
  # ;; Then advance by 48 blocks and loop while at least 48 blocks remain;
  # ;; after the loop dispatch on the remaining 32+ / 16+ / <16 blocks.
  $code .= <<___;
        vmovdqa64         $ZTMP4,$AAD_HASHz

        add               \$`($big_loop_nblocks * 16)`,$DATA_OFFSET
        sub               \$`($big_loop_nblocks * 16)`,$LENGTH
        cmp               \$`($big_loop_nblocks * 16)`,$LENGTH
        jae               .L_encrypt_big_nblocks_${rndsuffix}

.L_no_more_big_nblocks_${rndsuffix}:

        cmp               \$`(32 * 16)`,$LENGTH
        jae               .L_encrypt_32_blocks_${rndsuffix}

        cmp               \$`(16 * 16)`,$LENGTH
        jae               .L_encrypt_16_blocks_${rndsuffix}
___

  # ;; =====================================================
  # ;; =====================================================
  # ;; ==== GHASH 1 x 16 blocks
  # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  # ;; ====      then GHASH N blocks
  # ;; (fall-through case: fewer than 16 blocks left to cipher)
  $code .= ".L_encrypt_0_blocks_ghash_32_${rndsuffix}:\n";

  # ;; calculate offset to the right hash key
  # ;; HASHK_PTR = frame offset of hash key index 32, minus LENGTH rounded
  # ;; down to a multiple of 16 bytes
  $code .= <<___;
mov               @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
and               \$~15,@{[DWORD($IA0)]}
mov               \$`@{[HashKeyOffsetByIdx(32,"frame")]}`,@{[DWORD($HASHK_PTR)]}
sub               @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
___

  # ;; ==== GHASH the first 16 of the 32 pending blocks ("start" pass);
  # ;; ==== the other 16 are handled by GCM_ENC_DEC_LAST ("mid") below
  &GHASH_16("start", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (0 * 16),
    "%rsp", $HASHK_PTR, 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);

  # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  # ;; HASHK_PTR is moved on by 16 blocks' worth of bytes for the second
  # ;; group of 16 pending blocks.
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  $code .= "add               \$`(16 * 16)`,@{[DWORD($HASHK_PTR)]}\n";
  &GCM_ENC_DEC_LAST(
    $AES_KEYS,   $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $LENGTH,
    $CTR_BLOCKz, $CTR_CHECK,  $HASHK_PTR,      $ghashin_offset, $SHUF_MASK,   $ZTMP0,
    $ZTMP1,      $ZTMP2,      $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,
    $ZTMP7,      $ZTMP8,      $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,
    $ZTMP13,     $ZTMP14,     $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,
    $ZTMP19,     $ZTMP20,     $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,
    "mid",       $GL,         $GH,             $GM,             $ENC_DEC,     $AAD_HASHz,
    $IA0,        $IA5,        $MASKREG,        $PBLOCK_LEN);

  # ;; Byte-shuffle the counter block again before it is stored at
  # ;; .L_ghash_done.
  $code .= "vpshufb           @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  $code .= "jmp           .L_ghash_done_${rndsuffix}\n";

  # ;; =====================================================
  # ;; =====================================================
  # ;; ==== GHASH & encrypt 1 x 16 blocks
  # ;; ==== GHASH & encrypt 1 x 16 blocks
  # ;; ==== GHASH 1 x 16 blocks (reduction)
  # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  # ;; ====      then GHASH N blocks
  # ;; (at least 32 but fewer than 48 blocks left to cipher)
  $code .= ".L_encrypt_32_blocks_${rndsuffix}:\n";

  # ;; ==== AES-CTR + GHASH - 16 blocks, start
  $aesout_offset  = ($STACK_LOCAL_OFFSET + (32 * 16));
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  $data_in_out_offset = (0 * 16);
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $CTR_BLOCKz,         $CTR_CHECK,
    48,        $aesout_offset,  $ghashin_offset, $SHUF_MASK,   $ZTMP0,              $ZTMP1,
    $ZTMP2,    $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,              $ZTMP7,
    $ZTMP8,    $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,             $ZTMP13,
    $ZTMP14,   $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,             $ZTMP19,
    $ZTMP20,   $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,         $GL,
    $GH,       $GM,             "first_time",    $ENC_DEC,     $data_in_out_offset, $AAD_HASHz,
    $IA0);

  # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
  $aesout_offset  = ($STACK_LOCAL_OFFSET + (0 * 16));
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
  $data_in_out_offset = (16 * 16);
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $CTR_BLOCKz,         $CTR_CHECK,
    32,        $aesout_offset,  $ghashin_offset, $SHUF_MASK,   $ZTMP0,              $ZTMP1,
    $ZTMP2,    $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,              $ZTMP7,
    $ZTMP8,    $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,             $ZTMP13,
    $ZTMP14,   $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,             $ZTMP19,
    $ZTMP20,   $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,         $GL,
    $GH,       $GM,             "no_reduction",  $ENC_DEC,     $data_in_out_offset, "no_ghash_in",
    $IA0);

  # ;; ==== GHASH 16 blocks with reduction
  # ;; (input is the third stack slot; hash keys start at index 16)
  &GHASH_16(
    "end_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (32 * 16),
    "%rsp", &HashKeyOffsetByIdx(16, "frame"),
    0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);

  # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  # ;; Account for the 32 blocks ciphered in this branch first.
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  $code .= <<___;
        sub               \$`(32 * 16)`,$LENGTH
        add               \$`(32 * 16)`,$DATA_OFFSET
___

  # ;; calculate offset to the right hash key
  # ;; HASHK_PTR = frame offset of hash key index 16, minus LENGTH rounded
  # ;; down to a multiple of 16 bytes
  $code .= "mov               @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
  $code .= <<___;
        and               \$~15,@{[DWORD($IA0)]}
        mov               \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
        sub               @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
___
  &GCM_ENC_DEC_LAST(
    $AES_KEYS,   $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $LENGTH,
    $CTR_BLOCKz, $CTR_CHECK,  $HASHK_PTR,      $ghashin_offset, $SHUF_MASK,   $ZTMP0,
    $ZTMP1,      $ZTMP2,      $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,
    $ZTMP7,      $ZTMP8,      $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,
    $ZTMP13,     $ZTMP14,     $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,
    $ZTMP19,     $ZTMP20,     $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,
    "start",     $GL,         $GH,             $GM,             $ENC_DEC,     $AAD_HASHz,
    $IA0,        $IA5,        $MASKREG,        $PBLOCK_LEN);

  # ;; Byte-shuffle the counter block again before it is stored at
  # ;; .L_ghash_done.
  $code .= "vpshufb           @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  $code .= "jmp           .L_ghash_done_${rndsuffix}\n";

  # ;; =====================================================
  # ;; =====================================================
  # ;; ==== GHASH & encrypt 16 blocks (done before)
  # ;; ==== GHASH 1 x 16 blocks
  # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
  # ;; ====      then GHASH N blocks
  # ;; (at least 16 but fewer than 32 blocks left to cipher)
  $code .= ".L_encrypt_16_blocks_${rndsuffix}:\n";

  # ;; ==== AES-CTR + GHASH - 16 blocks, start
  $aesout_offset  = ($STACK_LOCAL_OFFSET + (32 * 16));
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
  $data_in_out_offset = (0 * 16);
  &GHASH_16_ENCRYPT_16_PARALLEL(
    $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $CTR_BLOCKz,         $CTR_CHECK,
    48,        $aesout_offset,  $ghashin_offset, $SHUF_MASK,   $ZTMP0,              $ZTMP1,
    $ZTMP2,    $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,              $ZTMP7,
    $ZTMP8,    $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,             $ZTMP13,
    $ZTMP14,   $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,             $ZTMP19,
    $ZTMP20,   $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,         $GL,
    $GH,       $GM,             "first_time",    $ENC_DEC,     $data_in_out_offset, $AAD_HASHz,
    $IA0);

  # ;; ==== GHASH 1 x 16 blocks
  # ;; (input is the second stack slot; hash keys start at index 32)
  &GHASH_16(
    "mid", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (16 * 16),
    "%rsp", &HashKeyOffsetByIdx(32, "frame"),
    0, "no_hash_input", $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);

  # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
  # ;; Account for the 16 blocks ciphered in this branch first. Here the hash
  # ;; key argument is a fixed frame offset (index 16) rather than HASHK_PTR.
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
  $code .= <<___;
        sub               \$`(16 * 16)`,$LENGTH
        add               \$`(16 * 16)`,$DATA_OFFSET
___
  &GCM_ENC_DEC_LAST(
    $AES_KEYS,    $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
    $DATA_OFFSET, $LENGTH,     $CTR_BLOCKz,     $CTR_CHECK,
    &HashKeyOffsetByIdx(16, "frame"), $ghashin_offset, $SHUF_MASK, $ZTMP0,
    $ZTMP1,       $ZTMP2,     $ZTMP3,     $ZTMP4,
    $ZTMP5,       $ZTMP6,     $ZTMP7,     $ZTMP8,
    $ZTMP9,       $ZTMP10,    $ZTMP11,    $ZTMP12,
    $ZTMP13,      $ZTMP14,    $ZTMP15,    $ZTMP16,
    $ZTMP17,      $ZTMP18,    $ZTMP19,    $ZTMP20,
    $ZTMP21,      $ZTMP22,    $ADDBE_4x4, $ADDBE_1234,
    "end_reduce", $GL,        $GH,        $GM,
    $ENC_DEC,     $AAD_HASHz, $IA0,       $IA5,
    $MASKREG,     $PBLOCK_LEN);

  # ;; Byte-shuffle the counter block again before it is stored at
  # ;; .L_ghash_done. The same heredoc opens the "below 32 blocks" path, which
  # ;; first accounts for the 16 blocks ciphered by the first
  # ;; INITIAL_BLOCKS_16 call.
  $code .= "vpshufb           @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  $code .= <<___;
        jmp               .L_ghash_done_${rndsuffix}

.L_message_below_32_blocks_${rndsuffix}:
        # ;; 32 > number of blocks > 16

        sub               \$`(16 * 16)`,$LENGTH
        add               \$`(16 * 16)`,$DATA_OFFSET
___
  $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));

  # ;; calculate offset to the right hash key
  # ;; (LENGTH is copied to $IA0 here; the rest of the calculation follows
  # ;; the hash key precomputation below)
  $code .= "mov               @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";

  # ;; Get Htable pointer
  $code .= "lea               `$CTX_OFFSET_HTable`($GCM128_CTX),$IA1\n";
  &precompute_hkeys_on_stack($IA1, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, "mid16");
  $code .= "mov     \$1,$HKEYS_READY\n";

  # ;; HASHK_PTR = frame offset of hash key index 16, minus LENGTH rounded
  # ;; down to a multiple of 16 bytes
  $code .= <<___;
and               \$~15,@{[DWORD($IA0)]}
mov               \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
sub               @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
___

  &GCM_ENC_DEC_LAST(
    $AES_KEYS,   $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,  $DATA_OFFSET, $LENGTH,
    $CTR_BLOCKz, $CTR_CHECK,  $HASHK_PTR,      $ghashin_offset, $SHUF_MASK,   $ZTMP0,
    $ZTMP1,      $ZTMP2,      $ZTMP3,          $ZTMP4,          $ZTMP5,       $ZTMP6,
    $ZTMP7,      $ZTMP8,      $ZTMP9,          $ZTMP10,         $ZTMP11,      $ZTMP12,
    $ZTMP13,     $ZTMP14,     $ZTMP15,         $ZTMP16,         $ZTMP17,      $ZTMP18,
    $ZTMP19,     $ZTMP20,     $ZTMP21,         $ZTMP22,         $ADDBE_4x4,   $ADDBE_1234,
    "start",     $GL,         $GH,             $GM,             $ENC_DEC,     $AAD_HASHz,
    $IA0,        $IA5,        $MASKREG,        $PBLOCK_LEN);

  # ;; Byte-shuffle the counter block again before it is stored at
  # ;; .L_ghash_done. The same heredoc opens the small packet path
  # ;; (<= 16 blocks), where $IA1 gets the block count: (LENGTH + 15) / 16.
  $code .= "vpshufb           @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
  $code .= <<___;
        jmp           .L_ghash_done_${rndsuffix}

.L_message_below_equal_16_blocks_${rndsuffix}:
        # ;; Determine how many blocks to process
        # ;; - process one additional block if there is a partial block
        mov               @{[DWORD($LENGTH)]},@{[DWORD($IA1)]}
        add               \$15,@{[DWORD($IA1)]}
        shr               \$4, @{[DWORD($IA1)]}     # ; $IA1 can be in the range from 0 to 16
___
  &GCM_ENC_DEC_SMALL(
    $AES_KEYS,    $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $ENC_DEC,
    $DATA_OFFSET, $LENGTH,     $IA1,            $CTR_BLOCKx,    $AAD_HASHx,      $ZTMP0,
    $ZTMP1,       $ZTMP2,      $ZTMP3,          $ZTMP4,         $ZTMP5,          $ZTMP6,
    $ZTMP7,       $ZTMP8,      $ZTMP9,          $ZTMP10,        $ZTMP11,         $ZTMP12,
    $ZTMP13,      $ZTMP14,     $IA0,            $IA3,           $MASKREG,        $SHUF_MASK,
    $PBLOCK_LEN);

  # ;; fall through to exit

  $code .= ".L_ghash_done_${rndsuffix}:\n";

  # ;; save the last counter block
  # ;; then (.L_enc_dec_done) shuffle AAD_HASH back and store it to the
  # ;; context; .L_enc_dec_abort sits after both stores.
  $code .= <<___;
        vmovdqu64         $CTR_BLOCKx,`$CTX_OFFSET_CurCount`($GCM128_CTX)
.L_enc_dec_done_${rndsuffix}:
        # LE->BE conversion
        vpshufb           SHUF_MASK(%rip),$AAD_HASHx,$AAD_HASHx
        vmovdqu64         $AAD_HASHx,`$CTX_OFFSET_AadHash`($GCM128_CTX)
.L_enc_dec_abort_${rndsuffix}:
___
}