in crypto/fipsmodule/modes/asm/aesni-gcm-avx512.pl [2842:3271]
sub GHASH_16_ENCRYPT_16_PARALLEL {
my $AES_KEYS = $_[0]; # [in] key pointer
my $CIPH_PLAIN_OUT = $_[1]; # [in] pointer to output buffer
my $PLAIN_CIPH_IN = $_[2]; # [in] pointer to input buffer
my $DATA_OFFSET = $_[3]; # [in] data offset
my $CTR_BE = $_[4]; # [in/out] ZMM counter blocks (last 4) in big-endian
my $CTR_CHECK = $_[5]; # [in/out] GP with 8-bit counter for overflow check
my $HASHKEY_OFFSET = $_[6]; # [in] numerical offset for the highest hash key (hash key index value)
my $AESOUT_BLK_OFFSET = $_[7]; # [in] numerical offset for AES-CTR out
my $GHASHIN_BLK_OFFSET = $_[8]; # [in] numerical offset for GHASH blocks in
my $SHFMSK = $_[9]; # [in] ZMM with byte swap mask for pshufb
my $ZT1 = $_[10]; # [clobbered] temporary ZMM (cipher)
my $ZT2 = $_[11]; # [clobbered] temporary ZMM (cipher)
my $ZT3 = $_[12]; # [clobbered] temporary ZMM (cipher)
my $ZT4 = $_[13]; # [clobbered] temporary ZMM (cipher)
my $ZT5 = $_[14]; # [clobbered/out] temporary ZMM; holds GHASH output when $DO_REDUCTION is "final_reduction"
my $ZT6 = $_[15]; # [clobbered] temporary ZMM (cipher)
my $ZT7 = $_[16]; # [clobbered] temporary ZMM (cipher)
my $ZT8 = $_[17]; # [clobbered] temporary ZMM (cipher)
my $ZT9 = $_[18]; # [clobbered] temporary ZMM (cipher)
my $ZT10 = $_[19]; # [clobbered] temporary ZMM (ghash)
my $ZT11 = $_[20]; # [clobbered] temporary ZMM (ghash)
my $ZT12 = $_[21]; # [clobbered] temporary ZMM (ghash)
my $ZT13 = $_[22]; # [clobbered] temporary ZMM (ghash)
my $ZT14 = $_[23]; # [clobbered] temporary ZMM (ghash)
my $ZT15 = $_[24]; # [clobbered] temporary ZMM (ghash)
my $ZT16 = $_[25]; # [clobbered] temporary ZMM (ghash)
my $ZT17 = $_[26]; # [clobbered] temporary ZMM (ghash)
my $ZT18 = $_[27]; # [clobbered] temporary ZMM (ghash)
my $ZT19 = $_[28]; # [clobbered] temporary ZMM
my $ZT20 = $_[29]; # [clobbered] temporary ZMM
my $ZT21 = $_[30]; # [clobbered] temporary ZMM
my $ZT22 = $_[31]; # [clobbered] temporary ZMM
my $ZT23 = $_[32]; # [clobbered] temporary ZMM
my $ADDBE_4x4 = $_[33]; # [in] ZMM with the value 4 in each of its 4x128-bit lanes (big-endian)
my $ADDBE_1234 = $_[34]; # [in] ZMM with the values 1, 2, 3 and 4 across its 4x128-bit lanes (big-endian)
my $TO_REDUCE_L = $_[35]; # [in/out] ZMM for low 4x128-bit GHASH sum
my $TO_REDUCE_H = $_[36]; # [in/out] ZMM for high 4x128-bit GHASH sum
my $TO_REDUCE_M = $_[37]; # [in/out] ZMM for medium 4x128-bit GHASH sum
my $DO_REDUCTION = $_[38]; # [in] "no_reduction", "final_reduction", "first_time"
my $ENC_DEC = $_[39]; # [in] cipher direction
my $DATA_DISPL = $_[40]; # [in] fixed numerical data displacement/offset
my $GHASH_IN = $_[41]; # [in] current GHASH value or "no_ghash_in"
my $IA0 = $_[42]; # [clobbered] temporary GPR
my $B00_03 = $ZT1;
my $B04_07 = $ZT2;
my $B08_11 = $ZT3;
my $B12_15 = $ZT4;
my $GH1H = $ZT5;
# ; @note: do not change this mapping
my $GH1L = $ZT6;
my $GH1M = $ZT7;
my $GH1T = $ZT8;
my $GH2H = $ZT9;
my $GH2L = $ZT10;
my $GH2M = $ZT11;
my $GH2T = $ZT12;
my $RED_POLY = $GH2T;
my $RED_P1 = $GH2L;
my $RED_T1 = $GH2H;
my $RED_T2 = $GH2M;
my $GH3H = $ZT13;
my $GH3L = $ZT14;
my $GH3M = $ZT15;
my $GH3T = $ZT16;
my $DATA1 = $ZT13;
my $DATA2 = $ZT14;
my $DATA3 = $ZT15;
my $DATA4 = $ZT16;
my $AESKEY1 = $ZT17;
my $AESKEY2 = $ZT18;
my $GHKEY1 = $ZT19;
my $GHKEY2 = $ZT20;
my $GHDAT1 = $ZT21;
my $GHDAT2 = $ZT22;
my $rndsuffix = &random_string();
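# ;; This routine encrypts 16 counter blocks while ghashing the 16
# ;; blocks produced by a previous pass (read back from the stack at
# ;; $GHASHIN_BLK_OFFSET). The AES rounds and the VPCLMULQDQ work are
# ;; stitched together so that the latencies of the two instruction
# ;; streams overlap.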
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; prepare counter blocks
$code .= <<___;
cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
jae .L_16_blocks_overflow_${rndsuffix}
vpaddd $ADDBE_1234,$CTR_BE,$B00_03
vpaddd $ADDBE_4x4,$B00_03,$B04_07
vpaddd $ADDBE_4x4,$B04_07,$B08_11
vpaddd $ADDBE_4x4,$B08_11,$B12_15
jmp .L_16_blocks_ok_${rndsuffix}
.L_16_blocks_overflow_${rndsuffix}:
vpshufb $SHFMSK,$CTR_BE,$CTR_BE
vmovdqa64 ddq_add_4444(%rip),$B12_15
vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03
vpaddd $B12_15,$B00_03,$B04_07
vpaddd $B12_15,$B04_07,$B08_11
vpaddd $B12_15,$B08_11,$B12_15
vpshufb $SHFMSK,$B00_03,$B00_03
vpshufb $SHFMSK,$B04_07,$B04_07
vpshufb $SHFMSK,$B08_11,$B08_11
vpshufb $SHFMSK,$B12_15,$B12_15
.L_16_blocks_ok_${rndsuffix}:
___
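# ;; The fast path above adds the big-endian increment vectors
# ;; directly, which is valid only while the carry stays within the
# ;; lowest counter byte. Once the 8-bit check register is about to
# ;; wrap, the counters are byte-swapped to little-endian,
# ;; incremented with the ddq_add_* constants, and swapped back.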
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; pre-load constants
$code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n";
if ($GHASH_IN ne "no_ghash_in") {
$code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n";
} else {
$code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
}
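# ;; Folding the running hash into the first input block implements
# ;; the GHASH recurrence Y(i) = (Y(i-1) xor C(i)) * H; with
# ;; "no_ghash_in" the running sum is instead being carried in the
# ;; TO_REDUCE_L/M/H accumulators between calls.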
$code .= <<___;
vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (0*4)),"%rsp")]},$GHKEY1
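# ;; hash keys are consumed highest power first: the oldest block is
# ;; multiplied by the highest power of H, so the 16 per-block
# ;; products can simply be xored together before one reduction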
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; save counter for the next round
# ;; increment counter overflow check register
vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR_BE
addb \$16,@{[BYTE($CTR_CHECK)]}
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; pre-load constants
vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2
vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2
vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; stitch AES rounds with GHASH
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 0 - ARK
vpxorq $AESKEY1,$B00_03,$B00_03
vpxorq $AESKEY1,$B04_07,$B04_07
vpxorq $AESKEY1,$B08_11,$B08_11
vpxorq $AESKEY1,$B12_15,$B12_15
vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1
# ;; =================================================
# ;; GHASH 4 blocks (15 to 12)
vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1
vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0
vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0
vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1
vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (2*4)),"%rsp")]},$GHKEY1
vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
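# ;; Each 128-bit lane is multiplied schoolbook-style as four 64x64
# ;; carry-less products: high (a1*b1), low (a0*b0) and two cross
# ;; terms that are summed and folded into high/low before reduction.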
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 1
vaesenc $AESKEY2,$B00_03,$B00_03
vaesenc $AESKEY2,$B04_07,$B04_07
vaesenc $AESKEY2,$B08_11,$B08_11
vaesenc $AESKEY2,$B12_15,$B12_15
vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2
# ;; =================================================
# ;; GHASH 4 blocks (11 to 8)
vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (3*4)),"%rsp")]},$GHKEY2
vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 2
vaesenc $AESKEY1,$B00_03,$B00_03
vaesenc $AESKEY1,$B04_07,$B04_07
vaesenc $AESKEY1,$B08_11,$B08_11
vaesenc $AESKEY1,$B12_15,$B12_15
vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1
# ;; =================================================
# ;; GHASH 4 blocks (7 to 4)
vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1
vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0
vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1
vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 3
vaesenc $AESKEY2,$B00_03,$B00_03
vaesenc $AESKEY2,$B04_07,$B04_07
vaesenc $AESKEY2,$B08_11,$B08_11
vaesenc $AESKEY2,$B12_15,$B12_15
vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2
# ;; =================================================
# ;; Gather (XOR) GHASH for 12 blocks
vpternlogq \$0x96,$GH3H,$GH2H,$GH1H
vpternlogq \$0x96,$GH3L,$GH2L,$GH1L
vpternlogq \$0x96,$GH3T,$GH2T,$GH1T
vpternlogq \$0x96,$GH3M,$GH2M,$GH1M
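# ;; vpternlogq with imm8 0x96 is a three-way xor
# ;; (dst = dst xor src2 xor src3), so each of the three 4-block
# ;; partial-product sets collapses into one in a single instruction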
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 4
vaesenc $AESKEY1,$B00_03,$B00_03
vaesenc $AESKEY1,$B04_07,$B04_07
vaesenc $AESKEY1,$B08_11,$B08_11
vaesenc $AESKEY1,$B12_15,$B12_15
vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; load plain/cipher text (recycle GH3xx registers)
vmovdqu8 `$DATA_DISPL + (0 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA1
vmovdqu8 `$DATA_DISPL + (1 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA2
vmovdqu8 `$DATA_DISPL + (2 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA3
vmovdqu8 `$DATA_DISPL + (3 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA4
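# ;; loading the input text in the middle of the round sequence
# ;; overlaps the memory accesses with the in-flight AES rounds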
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 5
vaesenc $AESKEY2,$B00_03,$B00_03
vaesenc $AESKEY2,$B04_07,$B04_07
vaesenc $AESKEY2,$B08_11,$B08_11
vaesenc $AESKEY2,$B12_15,$B12_15
vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2
# ;; =================================================
# ;; GHASH 4 blocks (3 to 0)
vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 6
vaesenc $AESKEY1,$B00_03,$B00_03
vaesenc $AESKEY1,$B04_07,$B04_07
vaesenc $AESKEY1,$B08_11,$B08_11
vaesenc $AESKEY1,$B12_15,$B12_15
vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1
___
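# ;; $DO_REDUCTION selects what happens to the partial products:
# ;; "first_time" seeds the TO_REDUCE_L/M/H accumulators,
# ;; "no_reduction" xors into them, and "final_reduction" folds them
# ;; in and reduces to a single 128-bit result (left in $GH1H/$ZT5).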
# ;; =================================================
# ;; gather GHASH in GH1L (low) and GH1H (high)
if ($DO_REDUCTION eq "first_time") {
$code .= <<___;
vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
vpxorq $GH2M,$GH1M,$TO_REDUCE_M # ; TM
vpxorq $GH2H,$GH1H,$TO_REDUCE_H # ; TH
vpxorq $GH2L,$GH1L,$TO_REDUCE_L # ; TL
___
}
if ($DO_REDUCTION eq "no_reduction") {
$code .= <<___;
vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M # ; TM
vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H # ; TH
vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L # ; TL
___
}
if ($DO_REDUCTION eq "final_reduction") {
$code .= <<___;
# ;; phase 1: add mid products together
# ;; also load polynomial constant for reduction
vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
vpsrldq \$8,$GH1M,$GH2M
vpslldq \$8,$GH1M,$GH1M
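# ;; the summed middle product is split so that its high half can be
# ;; added to the high product and its low half to the low product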
vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]}
___
}
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 7
$code .= <<___;
vaesenc $AESKEY2,$B00_03,$B00_03
vaesenc $AESKEY2,$B04_07,$B04_07
vaesenc $AESKEY2,$B08_11,$B08_11
vaesenc $AESKEY2,$B12_15,$B12_15
vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2
___
# ;; =================================================
# ;; Add mid product to high and low
if ($DO_REDUCTION eq "final_reduction") {
$code .= <<___;
vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64
vpxorq $TO_REDUCE_H,$GH1H,$GH1H
vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64
vpxorq $TO_REDUCE_L,$GH1L,$GH1L
___
}
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 8
$code .= <<___;
vaesenc $AESKEY1,$B00_03,$B00_03
vaesenc $AESKEY1,$B04_07,$B04_07
vaesenc $AESKEY1,$B08_11,$B08_11
vaesenc $AESKEY1,$B12_15,$B12_15
vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1
___
# ;; =================================================
# ;; horizontal xor of low and high 4x128
if ($DO_REDUCTION eq "final_reduction") {
&VHPXORI4x128($GH1H, $GH2H);
&VHPXORI4x128($GH1L, $GH2L);
}
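# ;; In the "final_reduction" case, VHPXORI4x128 xors the four 128-bit
# ;; lanes of each accumulator together, leaving the sums in the
# ;; lowest lane ready for the single-block reduction.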
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES round 9
$code .= <<___;
vaesenc $AESKEY2,$B00_03,$B00_03
vaesenc $AESKEY2,$B04_07,$B04_07
vaesenc $AESKEY2,$B08_11,$B08_11
vaesenc $AESKEY2,$B12_15,$B12_15
___
if (($NROUNDS >= 11)) {
$code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n";
}
# ;; =================================================
# ;; first phase of reduction
if ($DO_REDUCTION eq "final_reduction") {
$code .= <<___;
vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs
vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduction
___
}
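# ;; The fold above is the first half of the standard two-phase
# ;; reduction modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1;
# ;; POLY2 supplies the reduction constant for both phases.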
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; AES rounds up to 11 (AES192) or 13 (AES256)
# ;; AES128 is done
if (($NROUNDS >= 11)) {
$code .= <<___;
vaesenc $AESKEY1,$B00_03,$B00_03
vaesenc $AESKEY1,$B04_07,$B04_07
vaesenc $AESKEY1,$B08_11,$B08_11
vaesenc $AESKEY1,$B12_15,$B12_15
vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1
vaesenc $AESKEY2,$B00_03,$B00_03
vaesenc $AESKEY2,$B04_07,$B04_07
vaesenc $AESKEY2,$B08_11,$B08_11
vaesenc $AESKEY2,$B12_15,$B12_15
___
if (($NROUNDS == 13)) {
$code .= <<___;
vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2
vaesenc $AESKEY1,$B00_03,$B00_03
vaesenc $AESKEY1,$B04_07,$B04_07
vaesenc $AESKEY1,$B08_11,$B08_11
vaesenc $AESKEY1,$B12_15,$B12_15
vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1
vaesenc $AESKEY2,$B00_03,$B00_03
vaesenc $AESKEY2,$B04_07,$B04_07
vaesenc $AESKEY2,$B08_11,$B08_11
vaesenc $AESKEY2,$B12_15,$B12_15
___
}
}
# ;; =================================================
# ;; second phase of the reduction
if ($DO_REDUCTION eq "final_reduction") {
$code .= <<___;
vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R
vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts
# ;; GH1H = GH1H xor RED_T1 xor RED_T2
vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
___
}
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; the last AES round
$code .= <<___;
vaesenclast $AESKEY1,$B00_03,$B00_03
vaesenclast $AESKEY1,$B04_07,$B04_07
vaesenclast $AESKEY1,$B08_11,$B08_11
vaesenclast $AESKEY1,$B12_15,$B12_15
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; XOR against plain/cipher text
vpxorq $DATA1,$B00_03,$B00_03
vpxorq $DATA2,$B04_07,$B04_07
vpxorq $DATA3,$B08_11,$B08_11
vpxorq $DATA4,$B12_15,$B12_15
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; store cipher/plain text
mov $CIPH_PLAIN_OUT,$IA0
vmovdqu8 $B00_03,`$DATA_DISPL + (0 * 64)`($IA0,$DATA_OFFSET,1)
vmovdqu8 $B04_07,`$DATA_DISPL + (1 * 64)`($IA0,$DATA_OFFSET,1)
vmovdqu8 $B08_11,`$DATA_DISPL + (2 * 64)`($IA0,$DATA_OFFSET,1)
vmovdqu8 $B12_15,`$DATA_DISPL + (3 * 64)`($IA0,$DATA_OFFSET,1)
___
# ;; =================================================
# ;; shuffle cipher text blocks for GHASH computation
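# ;; GHASH always runs over the ciphertext: when encrypting that is
# ;; the AES output just produced, when decrypting it is the input
# ;; text loaded earlier.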
if ($ENC_DEC eq "ENC") {
$code .= <<___;
vpshufb $SHFMSK,$B00_03,$B00_03
vpshufb $SHFMSK,$B04_07,$B04_07
vpshufb $SHFMSK,$B08_11,$B08_11
vpshufb $SHFMSK,$B12_15,$B12_15
___
} else {
$code .= <<___;
vpshufb $SHFMSK,$DATA1,$B00_03
vpshufb $SHFMSK,$DATA2,$B04_07
vpshufb $SHFMSK,$DATA3,$B08_11
vpshufb $SHFMSK,$DATA4,$B12_15
___
}
# ;; =================================================
# ;; store shuffled cipher text for ghashing
$code .= <<___;
vmovdqa64 $B00_03,`$AESOUT_BLK_OFFSET + (0*64)`(%rsp)
vmovdqa64 $B04_07,`$AESOUT_BLK_OFFSET + (1*64)`(%rsp)
vmovdqa64 $B08_11,`$AESOUT_BLK_OFFSET + (2*64)`(%rsp)
vmovdqa64 $B12_15,`$AESOUT_BLK_OFFSET + (3*64)`(%rsp)
___
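# ;; these blocks are read back via $GHASHIN_BLK_OFFSET and ghashed
# ;; on a subsequent pass through this routine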
}