in crypto/fipsmodule/modes/asm/aesni-gcm-avx512.pl [3553:4039]
sub GCM_ENC_DEC {
my $AES_KEYS = $_[0]; # [in] AES Key schedule
my $GCM128_CTX = $_[1]; # [in] context pointer
my $PBLOCK_LEN = $_[2]; # [in/out] length of partial block at the moment of previous update
my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer pointer
my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
my $CIPH_PLAIN_OUT = $_[5]; # [in] output buffer pointer
my $ENC_DEC = $_[6]; # [in] cipher direction
my $IA0 = "%r10";
my $IA1 = "%r12";
my $IA2 = "%r13";
my $IA3 = "%r15";
my $IA4 = "%rax";
my $IA5 = "%r11";
my $IA6 = "%rbx";
my $IA7 = "%r14";
my $LENGTH = $win64 ? $IA2 : $PLAIN_CIPH_LEN;
my $CTR_CHECK = $IA3;
my $DATA_OFFSET = $IA4;
my $HASHK_PTR = $IA6;
my $HKEYS_READY = $IA7;
my $CTR_BLOCKz = "%zmm2";
my $CTR_BLOCKx = "%xmm2";
# ; hardcoded in GCM_INIT
my $AAD_HASHz = "%zmm14";
my $AAD_HASHx = "%xmm14";
my $ZTMP0 = "%zmm0";
my $ZTMP1 = "%zmm3";
my $ZTMP2 = "%zmm4";
my $ZTMP3 = "%zmm5";
my $ZTMP4 = "%zmm6";
my $ZTMP5 = "%zmm7";
my $ZTMP6 = "%zmm10";
my $ZTMP7 = "%zmm11";
my $ZTMP8 = "%zmm12";
my $ZTMP9 = "%zmm13";
my $ZTMP10 = "%zmm15";
my $ZTMP11 = "%zmm16";
my $ZTMP12 = "%zmm17";
my $ZTMP13 = "%zmm19";
my $ZTMP14 = "%zmm20";
my $ZTMP15 = "%zmm21";
my $ZTMP16 = "%zmm30";
my $ZTMP17 = "%zmm31";
my $ZTMP18 = "%zmm1";
my $ZTMP19 = "%zmm18";
my $ZTMP20 = "%zmm8";
my $ZTMP21 = "%zmm22";
my $ZTMP22 = "%zmm23";
my $GH = "%zmm24";
my $GL = "%zmm25";
my $GM = "%zmm26";
my $SHUF_MASK = "%zmm29";
# ; Unused in the small packet path
my $ADDBE_4x4 = "%zmm27";
my $ADDBE_1234 = "%zmm28";
my $MASKREG = "%k1";
my $rndsuffix = &random_string();
# ;; reduction every 48 blocks, depth 32 blocks
# ;; @note 48 blocks is the maximum capacity of the stack frame
my $big_loop_nblocks = 48;
my $big_loop_depth = 32;
# ;;; Macro flow depending on packet size
# ;;; - LENGTH <= 16 blocks
# ;;;   - cipher followed by hashing (reduction)
# ;;; - 16 blocks < LENGTH < 32 blocks
# ;;;   - cipher 16 blocks
# ;;;   - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
# ;;; - 32 blocks <= LENGTH < 48 blocks
# ;;;   - cipher 2 x 16 blocks
# ;;;   - hash 16 blocks
# ;;;   - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
# ;;; - LENGTH >= 48 blocks
# ;;;   - cipher 2 x 16 blocks
# ;;;   - while (data_to_cipher >= 48 blocks):
# ;;;     - cipher 16 blocks & hash 16 blocks
# ;;;     - cipher 16 blocks & hash 16 blocks
# ;;;     - cipher 16 blocks & hash 16 blocks (reduction)
# ;;;   - if (data_to_cipher >= 32 blocks):
# ;;;     - cipher 16 blocks & hash 16 blocks
# ;;;     - cipher 16 blocks & hash 16 blocks
# ;;;     - hash 16 blocks (reduction)
# ;;;     - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
# ;;;   - elif (data_to_cipher >= 16 blocks):
# ;;;     - cipher 16 blocks & hash 16 blocks
# ;;;     - hash 16 blocks
# ;;;     - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
# ;;;   - else:
# ;;;     - hash 16 blocks
# ;;;     - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
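# ;;; For reference, the block thresholds above in bytes:
# ;;; 16 blocks = 256 bytes, 32 blocks = 512 bytes, 48 blocks = 768 bytes.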
if ($win64) {
$code .= "cmpq \$0,$PLAIN_CIPH_LEN\n";
} else {
$code .= "or $PLAIN_CIPH_LEN,$PLAIN_CIPH_LEN\n";
}
$code .= "je .L_enc_dec_abort_${rndsuffix}\n";
$code .= "xor $HKEYS_READY, $HKEYS_READY\n";
$code .= "vmovdqu64 `$CTX_OFFSET_AadHash`($GCM128_CTX),$AAD_HASHx\n";
# BE -> LE conversion
$code .= "vpshufb SHUF_MASK(%rip),$AAD_HASHx,$AAD_HASHx\n";
# ;; Used for the update flow: if there was a previous partial
# ;; block, fill in the remaining bytes here.
&PARTIAL_BLOCK(
$GCM128_CTX, $PBLOCK_LEN, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN,
$DATA_OFFSET, $AAD_HASHx, $ENC_DEC, $IA0, $IA1,
$IA2, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3,
$ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $MASKREG);
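# ;; $DATA_OFFSET now holds the number of input bytes consumed while
# ;; completing the previous partial block (possibly zero).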
$code .= "vmovdqu64 `$CTX_OFFSET_CurCount`($GCM128_CTX),$CTR_BLOCKx\n";
# ;; Save the amount of data left to process in $LENGTH
# ;; NOTE: on Linux, PLAIN_CIPH_LEN is already a register and $LENGTH aliases
# ;; it directly, so the copy below is only needed on Windows.
if ($win64) {
$code .= "mov $PLAIN_CIPH_LEN,$LENGTH\n";
}
# ;; There may be no more data if it was consumed in the partial block.
$code .= <<___;
sub $DATA_OFFSET,$LENGTH
je .L_enc_dec_done_${rndsuffix}
___
$code .= <<___;
cmp \$`(16 * 16)`,$LENGTH
jbe .L_message_below_equal_16_blocks_${rndsuffix}
vmovdqa64 SHUF_MASK(%rip),$SHUF_MASK
vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4
vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234
# ;; start the pipeline
# ;; - 32 blocks aes-ctr
# ;; - 16 blocks ghash + aes-ctr
# ;; set up CTR_CHECK
vmovd $CTR_BLOCKx,@{[DWORD($CTR_CHECK)]}
and \$255,@{[DWORD($CTR_CHECK)]}
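# ;; CTR_CHECK holds the low byte of the counter; the increment code uses it
# ;; to detect when the low counter byte wraps and carry handling is needed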
# ;; in LE format after init, convert to BE
vshufi64x2 \$0,$CTR_BLOCKz,$CTR_BLOCKz,$CTR_BLOCKz
vpshufb $SHUF_MASK,$CTR_BLOCKz,$CTR_BLOCKz
___
# ;; ==== AES-CTR - first 16 blocks
my $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
my $data_in_out_offset = 0;
&INITIAL_BLOCKS_16(
$PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
$CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
$ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
$SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);
# ;; Get Htable pointer
$code .= "lea `$CTX_OFFSET_HTable`($GCM128_CTX),$IA1\n";
&precompute_hkeys_on_stack($IA1, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, "first16");
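# ;; Hash key powers are precomputed on the stack in stages: "first16" here,
# ;; then either "last32" (when 32 or more blocks remain) or "mid16" (in the
# ;; below-32-blocks path), so only the powers actually needed are generated.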
$code .= <<___;
cmp \$`(32 * 16)`,$LENGTH
jb .L_message_below_32_blocks_${rndsuffix}
___
# ;; ==== AES-CTR - next 16 blocks
$aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
$data_in_out_offset = (16 * 16);
&INITIAL_BLOCKS_16(
$PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
$CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
$ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
$SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);
&precompute_hkeys_on_stack($IA1, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, "last32");
$code .= "mov \$1,$HKEYS_READY\n";
$code .= <<___;
add \$`(32 * 16)`,$DATA_OFFSET
sub \$`(32 * 16)`,$LENGTH
cmp \$`($big_loop_nblocks * 16)`,$LENGTH
jb .L_no_more_big_nblocks_${rndsuffix}
___
# ;; ====
# ;; ==== AES-CTR + GHASH - 48 blocks loop
# ;; ====
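# ;; Each iteration ciphers 48 new blocks and hashes 48 blocks from the stack
# ;; frame (the 32 staged earlier plus the 16 produced by the first pass of
# ;; this iteration), with a single GHASH reduction per iteration.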
$code .= ".L_encrypt_big_nblocks_${rndsuffix}:\n";
# ;; ==== AES-CTR + GHASH - 16 blocks, start
$aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
$data_in_out_offset = (0 * 16);
my $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
&GHASH_16_ENCRYPT_16_PARALLEL(
$AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
$ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
$ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
$ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
$ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
$GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
$IA0);
# ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
$aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
$data_in_out_offset = (16 * 16);
$ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
&GHASH_16_ENCRYPT_16_PARALLEL(
$AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
$ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
$ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
$ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
$ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
$GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
$IA0);
# ;; ==== AES-CTR + GHASH - 16 blocks, reduction
$aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
$data_in_out_offset = (32 * 16);
$ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
&GHASH_16_ENCRYPT_16_PARALLEL(
$AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
16, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
$ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
$ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
$ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
$ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
$GH, $GM, "final_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
$IA0);
# ;; === xor cipher block 0 with GHASH (ZT4)
$code .= <<___;
vmovdqa64 $ZTMP4,$AAD_HASHz
add \$`($big_loop_nblocks * 16)`,$DATA_OFFSET
sub \$`($big_loop_nblocks * 16)`,$LENGTH
cmp \$`($big_loop_nblocks * 16)`,$LENGTH
jae .L_encrypt_big_nblocks_${rndsuffix}
.L_no_more_big_nblocks_${rndsuffix}:
cmp \$`(32 * 16)`,$LENGTH
jae .L_encrypt_32_blocks_${rndsuffix}
cmp \$`(16 * 16)`,$LENGTH
jae .L_encrypt_16_blocks_${rndsuffix}
___
# ;; =====================================================
# ;; =====================================================
# ;; ==== GHASH 1 x 16 blocks
# ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
# ;; ==== then GHASH N blocks
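# ;; This path is reached when fewer than 16 blocks remain to be ciphered;
# ;; the 32 blocks already ciphered into the stack frame still need hashing.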
$code .= ".L_encrypt_0_blocks_ghash_32_${rndsuffix}:\n";
# ;; calculate offset to the right hash key
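# ;; (GHASH over k blocks uses key powers H^k down to H^1, so the starting
# ;; key power is selected based on how many blocks are left to hash)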
$code .= <<___;
mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
and \$~15,@{[DWORD($IA0)]}
mov \$`@{[HashKeyOffsetByIdx(32,"frame")]}`,@{[DWORD($HASHK_PTR)]}
sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
___
# ;; ==== GHASH 32 blocks and follow with reduction
&GHASH_16("start", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (0 * 16),
"%rsp", $HASHK_PTR, 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
# ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
$ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
$code .= "add \$`(16 * 16)`,@{[DWORD($HASHK_PTR)]}\n";
&GCM_ENC_DEC_LAST(
$AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
$CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
$ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
$ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
$ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
$ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
"mid", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
$IA0, $IA5, $MASKREG, $PBLOCK_LEN);
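# ;; convert the counter back to LE format before it is stored to the
# ;; context at .L_ghash_done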
$code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
$code .= "jmp .L_ghash_done_${rndsuffix}\n";
# ;; =====================================================
# ;; =====================================================
# ;; ==== GHASH & encrypt 1 x 16 blocks
# ;; ==== GHASH & encrypt 1 x 16 blocks
# ;; ==== GHASH 1 x 16 blocks (reduction)
# ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
# ;; ==== then GHASH N blocks
$code .= ".L_encrypt_32_blocks_${rndsuffix}:\n";
# ;; ==== AES-CTR + GHASH - 16 blocks, start
$aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
$ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
$data_in_out_offset = (0 * 16);
&GHASH_16_ENCRYPT_16_PARALLEL(
$AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
$ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
$ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
$ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
$ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
$GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
$IA0);
# ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
$aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
$ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
$data_in_out_offset = (16 * 16);
&GHASH_16_ENCRYPT_16_PARALLEL(
$AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
$ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
$ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
$ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
$ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
$GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
$IA0);
# ;; ==== GHASH 16 blocks with reduction
&GHASH_16(
"end_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (32 * 16),
"%rsp", &HashKeyOffsetByIdx(16, "frame"),
0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
# ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
$ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
$code .= <<___;
sub \$`(32 * 16)`,$LENGTH
add \$`(32 * 16)`,$DATA_OFFSET
___
# ;; calculate offset to the right hash key
$code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
$code .= <<___;
and \$~15,@{[DWORD($IA0)]}
mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
___
&GCM_ENC_DEC_LAST(
$AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
$CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
$ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
$ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
$ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
$ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
"start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
$IA0, $IA5, $MASKREG, $PBLOCK_LEN);
$code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
$code .= "jmp .L_ghash_done_${rndsuffix}\n";
# ;; =====================================================
# ;; =====================================================
# ;; ==== GHASH & encrypt 16 blocks (done before)
# ;; ==== GHASH 1 x 16 blocks
# ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
# ;; ==== then GHASH N blocks
$code .= ".L_encrypt_16_blocks_${rndsuffix}:\n";
# ;; ==== AES-CTR + GHASH - 16 blocks, start
$aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
$ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
$data_in_out_offset = (0 * 16);
&GHASH_16_ENCRYPT_16_PARALLEL(
$AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
$ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
$ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
$ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
$ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
$GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
$IA0);
# ;; ==== GHASH 1 x 16 blocks
&GHASH_16(
"mid", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (16 * 16),
"%rsp", &HashKeyOffsetByIdx(32, "frame"),
0, "no_hash_input", $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
# ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the remainder
$ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
$code .= <<___;
sub \$`(16 * 16)`,$LENGTH
add \$`(16 * 16)`,$DATA_OFFSET
___
&GCM_ENC_DEC_LAST(
$AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
$DATA_OFFSET, $LENGTH, $CTR_BLOCKz, $CTR_CHECK,
&HashKeyOffsetByIdx(16, "frame"), $ghashin_offset, $SHUF_MASK, $ZTMP0,
$ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4,
$ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
$ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
$ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16,
$ZTMP17, $ZTMP18, $ZTMP19, $ZTMP20,
$ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
"end_reduce", $GL, $GH, $GM,
$ENC_DEC, $AAD_HASHz, $IA0, $IA5,
$MASKREG, $PBLOCK_LEN);
$code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
$code .= <<___;
jmp .L_ghash_done_${rndsuffix}
.L_message_below_32_blocks_${rndsuffix}:
# ;; 32 > number of blocks > 16
sub \$`(16 * 16)`,$LENGTH
add \$`(16 * 16)`,$DATA_OFFSET
___
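# ;; At this point 16 blocks have been ciphered into the stack frame but not
# ;; hashed yet; GCM_ENC_DEC_LAST below hashes them together with the
# ;; remaining N blocks.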
$ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
# ;; calculate offset to the right hash key
$code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
# ;; Get Htable pointer
$code .= "lea `$CTX_OFFSET_HTable`($GCM128_CTX),$IA1\n";
&precompute_hkeys_on_stack($IA1, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, "mid16");
$code .= "mov \$1,$HKEYS_READY\n";
$code .= <<___;
and \$~15,@{[DWORD($IA0)]}
mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
___
&GCM_ENC_DEC_LAST(
$AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
$CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
$ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
$ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
$ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
$ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
"start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
$IA0, $IA5, $MASKREG, $PBLOCK_LEN);
$code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
$code .= <<___;
jmp .L_ghash_done_${rndsuffix}
.L_message_below_equal_16_blocks_${rndsuffix}:
# ;; Determine how many blocks to process
# ;; - process one additional block if there is a partial block
mov @{[DWORD($LENGTH)]},@{[DWORD($IA1)]}
add \$15,@{[DWORD($IA1)]}
shr \$4, @{[DWORD($IA1)]} # ; $IA1 can be in the range from 1 to 16
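# ; e.g. LENGTH = 20 bytes gives (20 + 15) >> 4 = 2 blocks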
___
&GCM_ENC_DEC_SMALL(
$AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $ENC_DEC,
$DATA_OFFSET, $LENGTH, $IA1, $CTR_BLOCKx, $AAD_HASHx, $ZTMP0,
$ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
$ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
$ZTMP13, $ZTMP14, $IA0, $IA3, $MASKREG, $SHUF_MASK,
$PBLOCK_LEN);
# ;; fall through to exit
$code .= ".L_ghash_done_${rndsuffix}:\n";
# ;; save the last counter block
$code .= <<___;
vmovdqu64 $CTR_BLOCKx,`$CTX_OFFSET_CurCount`($GCM128_CTX)
.L_enc_dec_done_${rndsuffix}:
# LE->BE conversion
vpshufb SHUF_MASK(%rip),$AAD_HASHx,$AAD_HASHx
vmovdqu64 $AAD_HASHx,`$CTX_OFFSET_AadHash`($GCM128_CTX)
.L_enc_dec_abort_${rndsuffix}:
___
}