in crypto/fipsmodule/modes/asm/aesni-gcm-avx512.pl [2155:2692]
sub GHASH_16_ENCRYPT_N_GHASH_N {
my $AES_KEYS = $_[0]; # [in] key pointer
my $GCM128_CTX = $_[1]; # [in] context pointer
my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer
my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer
my $DATA_OFFSET = $_[4]; # [in] data offset
my $LENGTH = $_[5]; # [in] data length
my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian
my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check
my $HASHKEY_OFFSET = $_[8]; # [in] numerical offset for the highest hash key
# (can be a register or a numerical value)
my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH blocks on the stack (%rsp-relative)
my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb
my $B00_03 = $_[11]; # [clobbered] temporary ZMM
my $B04_07 = $_[12]; # [clobbered] temporary ZMM
my $B08_11 = $_[13]; # [clobbered] temporary ZMM
my $B12_15 = $_[14]; # [clobbered] temporary ZMM
my $GH1H_UNUSED = $_[15]; # [clobbered] temporary ZMM
my $GH1L = $_[16]; # [clobbered] temporary ZMM
my $GH1M = $_[17]; # [clobbered] temporary ZMM
my $GH1T = $_[18]; # [clobbered] temporary ZMM
my $GH2H = $_[19]; # [clobbered] temporary ZMM
my $GH2L = $_[20]; # [clobbered] temporary ZMM
my $GH2M = $_[21]; # [clobbered] temporary ZMM
my $GH2T = $_[22]; # [clobbered] temporary ZMM
my $GH3H = $_[23]; # [clobbered] temporary ZMM
my $GH3L = $_[24]; # [clobbered] temporary ZMM
my $GH3M = $_[25]; # [clobbered] temporary ZMM
my $GH3T = $_[26]; # [clobbered] temporary ZMM
my $AESKEY1 = $_[27]; # [clobbered] temporary ZMM
my $AESKEY2 = $_[28]; # [clobbered] temporary ZMM
my $GHKEY1 = $_[29]; # [clobbered] temporary ZMM
my $GHKEY2 = $_[30]; # [clobbered] temporary ZMM
my $GHDAT1 = $_[31]; # [clobbered] temporary ZMM
my $GHDAT2 = $_[32]; # [clobbered] temporary ZMM
my $ZT01 = $_[33]; # [clobbered] temporary ZMM
my $ADDBE_4x4 = $_[34]; # [in] ZMM with the value 4 in each of its 4x128-bit lanes, big-endian
my $ADDBE_1234 = $_[35]; # [in] ZMM with 1, 2, 3 and 4 in its 4x128-bit lanes, big-endian
my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce"
my $TO_REDUCE_L = $_[37]; # [in] ZMM with the low 4x128-bit GHASH sum
my $TO_REDUCE_H = $_[38]; # [in] ZMM with the high 4x128-bit GHASH sum
my $TO_REDUCE_M = $_[39]; # [in] ZMM with the middle 4x128-bit GHASH sum
my $ENC_DEC = $_[40]; # [in] cipher direction
my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value
my $IA0 = $_[42]; # [clobbered] GP temporary
my $IA1 = $_[43]; # [clobbered] GP temporary
my $MASKREG = $_[44]; # [clobbered] mask register
my $NUM_BLOCKS = $_[45]; # [in] numerical value with number of blocks to be encrypted/ghashed (1 to 16)
my $PBLOCK_LEN = $_[46]; # [in] partial block length
die "GHASH_16_ENCRYPT_N_GHASH_N: num_blocks is out of bounds = $NUM_BLOCKS\n"
if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
my $rndsuffix = &random_string();
my $GH1H = $HASH_IN_OUT;
# ; the alias avoids an additional move in the do_reduction case
my $LAST_GHASH_BLK = $GH1L;
my $LAST_CIPHER_BLK = $GH1T;
my $RED_POLY = $GH2T;
my $RED_P1 = $GH2L;
my $RED_T1 = $GH2H;
my $RED_T2 = $GH2M;
my $DATA1 = $GH3H;
my $DATA2 = $GH3L;
my $DATA3 = $GH3M;
my $DATA4 = $GH3T;
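# ;; Aliasing notes: GH2T/GH2L/GH2H/GH2M double as the reduction
# ;; temporaries and GH3H..GH3T as the loaded text blocks, while
# ;; GH1L/GH1T are reused for the last GHASH/cipher blocks; each alias
# ;; is written only after the products held under the original name
# ;; have been accumulated.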
# ;; do the reduction after the 16 blocks?
my $do_reduction = 0;
# ;; is this 16-block chunk the start of the message?
my $is_start = 0;
if ($GHASH_TYPE eq "start_reduce") {
$is_start = 1;
$do_reduction = 1;
}
if ($GHASH_TYPE eq "start") {
$is_start = 1;
}
if ($GHASH_TYPE eq "end_reduce") {
$do_reduction = 1;
}
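# ;; GHASH_TYPE selects how the hash sums are carried across 16-block chunks:
# ;;   "start"        - seed the sums from HASH_IN_OUT, defer reduction
# ;;   "start_reduce" - seed the sums and reduce within this chunk
# ;;   "mid"          - accumulate into TO_REDUCE_H/M/L, defer reduction
# ;;   "end_reduce"   - fold TO_REDUCE_H/M/L in and reduce here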
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; get load/store mask (the plain/cipher text itself is loaded
# ;; further below, stitched between the AES rounds)
$code .= <<___;
lea byte64_len_to_mask_table(%rip),$IA0
mov $LENGTH,$IA1
___
if ($NUM_BLOCKS > 12) {
$code .= "sub \$`3*64`,$IA1\n";
} elsif ($NUM_BLOCKS > 8) {
$code .= "sub \$`2*64`,$IA1\n";
} elsif ($NUM_BLOCKS > 4) {
$code .= "sub \$`1*64`,$IA1\n";
}
$code .= "kmovq ($IA0,$IA1,8),$MASKREG\n";
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; prepare counter blocks
$code .= <<___;
cmp \$`(256 - $NUM_BLOCKS)`,@{[DWORD($CTR_CHECK)]}
jae .L_16_blocks_overflow_${rndsuffix}
___
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vpaddd", $B00_03, $B04_07, $B08_11, $B12_15, $CTR_BE,
$B00_03, $B04_07, $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4);
$code .= <<___;
jmp .L_16_blocks_ok_${rndsuffix}
.L_16_blocks_overflow_${rndsuffix}:
vpshufb $SHFMSK,$CTR_BE,$CTR_BE
vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03
___
if ($NUM_BLOCKS > 4) {
$code .= <<___;
vmovdqa64 ddq_add_4444(%rip),$B12_15
vpaddd $B12_15,$B00_03,$B04_07
___
}
if ($NUM_BLOCKS > 8) {
$code .= "vpaddd $B12_15,$B04_07,$B08_11\n";
}
if ($NUM_BLOCKS > 12) {
$code .= "vpaddd $B12_15,$B08_11,$B12_15\n";
}
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vpshufb", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
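# ;; Counter wrap handling: CTR_CHECK mirrors the low byte of the
# ;; counter.  Adding byte-reflected increments (ADDBE_1234/ADDBE_4x4)
# ;; directly to the big-endian counters is valid only while that byte
# ;; does not wrap; the overflow path above byte-swaps to little-endian
# ;; (SHFMSK), adds ddq_add_1234/ddq_add_4444, and swaps back.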
$code .= <<___;
.L_16_blocks_ok_${rndsuffix}:
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; - pre-load constants
# ;; - add current hash into the 1st block
vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1
___
if ($is_start != 0) {
$code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$HASH_IN_OUT,$GHDAT1\n";
} else {
$code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
}
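# ;; Folding the running hash Y into the first block turns this chunk
# ;; into Y' = (X1 + Y)*H^n + X2*H^(n-1) + ... + Xn*H over GF(2^128),
# ;; i.e. n iterations of Y = (Y + X)*H evaluated with a single
# ;; reduction (n = number of stacked GHASH blocks, 16 for a full chunk).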
$code .= "vmovdqu64 @{[EffectiveAddress(\"%rsp\",$HASHKEY_OFFSET,0*64)]},$GHKEY1\n";
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; save counter for the next round
# ;; increment counter overflow check register
if ($NUM_BLOCKS <= 4) {
$code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($CTR_BE)]}\n";
} elsif ($NUM_BLOCKS <= 8) {
$code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($CTR_BE)]}\n";
} elsif ($NUM_BLOCKS <= 12) {
$code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($CTR_BE)]}\n";
} else {
$code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($CTR_BE)]}\n";
}
$code .= "vshufi64x2 \$0b00000000,$CTR_BE,$CTR_BE,$CTR_BE\n";
$code .= <<___;
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; pre-load constants
vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2
vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,1*64)]},$GHKEY2
vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
___
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; stitch AES rounds with GHASH
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 0 - ARK
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
$code .= "vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n";
$code .= <<___;
# ;; =================================================
# ;; GHASH 4 blocks (15 to 12)
vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1
vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0
vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0
vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1
vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,2*64)]},$GHKEY1
vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
___
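# ;; Each vpclmulqdq immediate picks one 64x64 carry-less product per
# ;; 128-bit lane: 0x11 = hi*hi, 0x00 = lo*lo, 0x01 and 0x10 are the
# ;; two cross products that form the middle term.  The high, low and
# ;; middle sums are accumulated separately and reduced once per chunk.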
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 1
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
$code .= "vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n";
$code .= <<___;
# ;; =================================================
# ;; GHASH 4 blocks (11 to 8)
vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,3*64)]},$GHKEY2
vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
___
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 2
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
$code .= "vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1\n";
$code .= <<___;
# ;; =================================================
# ;; GHASH 4 blocks (7 to 4)
vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1
vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0
vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1
vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0
___
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 3
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
$code .= "vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2\n";
$code .= <<___;
# ;; =================================================
# ;; Gather (XOR) GHASH for 12 blocks
vpternlogq \$0x96,$GH3H,$GH2H,$GH1H
vpternlogq \$0x96,$GH3L,$GH2L,$GH1L
vpternlogq \$0x96,$GH3T,$GH2T,$GH1T
vpternlogq \$0x96,$GH3M,$GH2M,$GH1M
___
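# ;; vpternlogq with immediate 0x96 is the three-input parity function,
# ;; i.e. a three-way XOR: dst = dst ^ src1 ^ src2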
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 4
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
$code .= "vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1\n";
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; load plain/cipher text
&ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DATA1, $DATA2, $DATA3, $DATA4, $MASKREG);
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 5
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
$code .= "vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2\n";
$code .= <<___;
# ;; =================================================
# ;; GHASH 4 blocks (3 to 0)
vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
___
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 6
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
$code .= "vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1\n";
# ;; =================================================
# ;; gather GHASH in GH1L (low), GH1H (high), GH1M (mid)
# ;; - add GH2[MTLH] to GH1[MTLH]
$code .= "vpternlogq \$0x96,$GH2T,$GH1T,$GH1M\n";
if ($do_reduction != 0) {
if ($is_start != 0) {
$code .= "vpxorq $GH2M,$GH1M,$GH1M\n";
} else {
$code .= <<___;
vpternlogq \$0x96,$GH2H,$TO_REDUCE_H,$GH1H
vpternlogq \$0x96,$GH2L,$TO_REDUCE_L,$GH1L
vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
___
}
} else {
# ;; update the H/M/L hash sums when the reduction is deferred
if ($is_start != 0) {
$code .= <<___;
vpxorq $GH2H,$GH1H,$TO_REDUCE_H
vpxorq $GH2L,$GH1L,$TO_REDUCE_L
vpxorq $GH2M,$GH1M,$TO_REDUCE_M
___
} else {
$code .= <<___;
vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H
vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L
vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M
___
}
}
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 7
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
$code .= "vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2\n";
# ;; =================================================
# ;; prepare mid sum for adding to high & low
# ;; load polynomial constant for reduction
if ($do_reduction != 0) {
$code .= <<___;
vpsrldq \$8,$GH1M,$GH2M
vpslldq \$8,$GH1M,$GH1M
vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]}
___
}
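# ;; The middle sum occupies bits 64..191 of each 256-bit product:
# ;; GH2M = mid >> 64 joins the high sum and GH1M = mid << 64 joins
# ;; the low sum.  POLY2 holds the constant used for the polynomial
# ;; fold in the reduction below.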
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 8
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
$code .= "vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1\n";
# ;; =================================================
# ;; Add mid product to high and low
if ($do_reduction != 0) {
if ($is_start != 0) {
$code .= <<___;
vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64
vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64
___
} else {
$code .= <<___;
vpxorq $GH2M,$GH1H,$GH1H # ; TH = TH1 + TM>>64
vpxorq $GH1M,$GH1L,$GH1L # ; TL = TL1 + TM<<64
___
}
}
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 9
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
# ;; =================================================
# ;; horizontal xor of low and high 4x128
if ($do_reduction != 0) {
&VHPXORI4x128($GH1H, $GH2H);
&VHPXORI4x128($GH1L, $GH2L);
}
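# ;; VHPXORI4x128 XORs the four 128-bit lanes of a ZMM together,
# ;; leaving the single 128-bit high and low sums in the low lane.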
if ($NROUNDS >= 11) {
$code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n";
}
# ;; =================================================
# ;; first phase of reduction
if ($do_reduction != 0) {
$code .= <<___;
vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs
vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduction
___
}
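# ;; The 256-bit product is reduced modulo the bit-reflected GHASH
# ;; polynomial g(x) = x^128 + x^127 + x^126 + x^121 + 1 in two phases:
# ;; this first phase folds the low 128 bits; the second phase (further
# ;; below) folds the intermediate result into the high 128 bits.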
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES rounds up to 11 (AES192) or 13 (AES256)
# ;; AES128 is done
if ($NROUNDS >= 11) {
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
$code .= "vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1\n";
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
if ($NROUNDS == 13) {
$code .= "vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2\n";
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
$code .= "vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1\n";
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
}
}
# ;; =================================================
# ;; second phase of the reduction
if ($do_reduction != 0) {
$code .= <<___;
vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1 DW (to get a 2-DW shift-R overall)
vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1 DW (to align the result with no shifts)
# ;; GH1H = GH1H + RED_T1 + RED_T2
vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
___
}
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; the last AES round
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vaesenclast", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; XOR against plain/cipher text
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
$B04_07, $B08_11, $B12_15, $DATA1, $DATA2, $DATA3, $DATA4);
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; retrieve the last cipher counter block (partially XOR'ed with text)
# ;; - this is needed for partial block cases
if ($NUM_BLOCKS <= 4) {
$code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($LAST_CIPHER_BLK)]}\n";
} elsif ($NUM_BLOCKS <= 8) {
$code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($LAST_CIPHER_BLK)]}\n";
} elsif ($NUM_BLOCKS <= 12) {
$code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($LAST_CIPHER_BLK)]}\n";
} else {
$code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($LAST_CIPHER_BLK)]}\n";
}
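# ;; the last cipher counter block (already XORed with text) is kept
# ;; so that INITIAL_BLOCKS_PARTIAL_GHASH can patch up partial-block
# ;; and multi-call cases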
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; store cipher/plain text
$code .= "mov $CIPH_PLAIN_OUT,$IA0\n";
&ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $B00_03, $B04_07, $B08_11, $B12_15, $MASKREG);
# ;; =================================================
# ;; shuffle cipher text blocks for GHASH computation
if ($ENC_DEC eq "ENC") {
# ;; zero bytes outside the mask before hashing
if ($NUM_BLOCKS <= 4) {
$code .= "vmovdqu8 $B00_03,${B00_03}{$MASKREG}{z}\n";
} elsif ($NUM_BLOCKS <= 8) {
$code .= "vmovdqu8 $B04_07,${B04_07}{$MASKREG}{z}\n";
} elsif ($NUM_BLOCKS <= 12) {
$code .= "vmovdqu8 $B08_11,${B08_11}{$MASKREG}{z}\n";
} else {
$code .= "vmovdqu8 $B12_15,${B12_15}{$MASKREG}{z}\n";
}
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $B00_03,
$B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
} else {
# ;; zero bytes outside the mask before hashing
if ($NUM_BLOCKS <= 4) {
$code .= "vmovdqu8 $DATA1,${DATA1}{$MASKREG}{z}\n";
} elsif ($NUM_BLOCKS <= 8) {
$code .= "vmovdqu8 $DATA2,${DATA2}{$MASKREG}{z}\n";
} elsif ($NUM_BLOCKS <= 12) {
$code .= "vmovdqu8 $DATA3,${DATA3}{$MASKREG}{z}\n";
} else {
$code .= "vmovdqu8 $DATA4,${DATA4}{$MASKREG}{z}\n";
}
&ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
$NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $DATA1,
$DATA2, $DATA3, $DATA4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
}
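# ;; GHASH is always computed over the ciphertext: for encryption that
# ;; is the freshly produced output (B00_03..B12_15), for decryption
# ;; the loaded input (DATA1..DATA4).  Zeroing bytes outside MASKREG
# ;; keeps data past the message end out of the hash.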
# ;; =================================================
# ;; Extract the last block for partial / multi_call cases
if ($NUM_BLOCKS <= 4) {
$code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DATA1,@{[XWORD($LAST_GHASH_BLK)]}\n";
} elsif ($NUM_BLOCKS <= 8) {
$code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DATA2,@{[XWORD($LAST_GHASH_BLK)]}\n";
} elsif ($NUM_BLOCKS <= 12) {
$code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DATA3,@{[XWORD($LAST_GHASH_BLK)]}\n";
} else {
$code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DATA4,@{[XWORD($LAST_GHASH_BLK)]}\n";
}
if ($do_reduction != 0) {
# ;; GH1H holds reduced hash value
# ;; - normally do "vmovdqa64 &XWORD($GH1H), &XWORD($HASH_IN_OUT)"
# ;; - register rename trick obsoletes the above move
}
# ;; =================================================
# ;; GHASH last N blocks
# ;; - current hash value in HASH_IN_OUT or
# ;; product parts in TO_REDUCE_H/M/L
# ;; - DATA1-DATA4 include blocks for GHASH
if ($do_reduction == 0) {
&INITIAL_BLOCKS_PARTIAL_GHASH(
$AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
&XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2,
$DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
$B00_03, $B04_07, $B08_11, $B12_15,
$GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2,
$GHKEY1, $IA0, $PBLOCK_LEN, $TO_REDUCE_H,
$TO_REDUCE_M, $TO_REDUCE_L);
} else {
&INITIAL_BLOCKS_PARTIAL_GHASH(
$AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
&XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2,
$DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
$B00_03, $B04_07, $B08_11, $B12_15,
$GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2,
$GHKEY1, $IA0, $PBLOCK_LEN);
}
}