in crypto/fipsmodule/modes/asm/ghash-x86_64.pl [915:1296]
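# Bulk of gcm_ghash_avx: hash $len bytes at $inp into the 128-bit
# accumulator $Xi.  Eight blocks are multiplied per round, the first
# block in memory by the highest power of H ($Htbl caches the powers
# of H plus packed Karatsuba constants, i.e. H.hi^H.lo per power),
# and the 256-bit result is reduced once per round ("aggregated
# reduction").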
sub \$0x80,$len
vmovdqu 0x70($inp),$Ii # I[7]
vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
vpshufb $bswap,$Ii,$Ii
vmovdqu 0x20-0x40($Htbl),$HK
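# Karatsuba preparation: the vpunpckhqdq/vpxor pair below leaves
# Ii.hi^Ii.lo in the low qword of $T2, to be multiplied by the
# precomputed $HK = H.hi^H.lo for the middle product.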
vpunpckhqdq $Ii,$Ii,$T2
vmovdqu 0x60($inp),$Ij # I[6]
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpxor $Ii,$T2,$T2
vpshufb $bswap,$Ij,$Ij
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
vpunpckhqdq $Ij,$Ij,$T1
vmovdqu 0x50($inp),$Ii # I[5]
vpclmulqdq \$0x00,$HK,$T2,$Xmi
vpxor $Ij,$T1,$T1
vpshufb $bswap,$Ii,$Ii
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
vpunpckhqdq $Ii,$Ii,$T2
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
vpxor $Ii,$T2,$T2
vmovdqu 0x40($inp),$Ij # I[4]
vpclmulqdq \$0x10,$HK,$T1,$Zmi
vmovdqu 0x50-0x40($Htbl),$HK
vpshufb $bswap,$Ij,$Ij
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpxor $Xhi,$Zhi,$Zhi
vpunpckhqdq $Ij,$Ij,$T1
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T2,$Xmi
vpxor $Ij,$T1,$T1
vmovdqu 0x30($inp),$Ii # I[3]
vpxor $Zlo,$Xlo,$Xlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
vpxor $Zhi,$Xhi,$Xhi
vpshufb $bswap,$Ii,$Ii
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
vpxor $Zmi,$Xmi,$Xmi
vpunpckhqdq $Ii,$Ii,$T2
vpclmulqdq \$0x10,$HK,$T1,$Zmi
vmovdqu 0x80-0x40($Htbl),$HK
vpxor $Ii,$T2,$T2
vmovdqu 0x20($inp),$Ij # I[2]
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpxor $Xhi,$Zhi,$Zhi
vpshufb $bswap,$Ij,$Ij
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
vpxor $Xmi,$Zmi,$Zmi
vpunpckhqdq $Ij,$Ij,$T1
vpclmulqdq \$0x00,$HK,$T2,$Xmi
vpxor $Ij,$T1,$T1
vmovdqu 0x10($inp),$Ii # I[1]
vpxor $Zlo,$Xlo,$Xlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
vpxor $Zhi,$Xhi,$Xhi
vpshufb $bswap,$Ii,$Ii
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
vpxor $Zmi,$Xmi,$Xmi
vpunpckhqdq $Ii,$Ii,$T2
vpclmulqdq \$0x10,$HK,$T1,$Zmi
vmovdqu 0xb0-0x40($Htbl),$HK
vpxor $Ii,$T2,$T2
vmovdqu ($inp),$Ij # I[0]
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpxor $Xhi,$Zhi,$Zhi
vpshufb $bswap,$Ij,$Ij
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x10,$HK,$T2,$Xmi
lea 0x80($inp),$inp
cmp \$0x80,$len
jb .Ltail_avx
vpxor $Xi,$Ij,$Ij # accumulate $Xi
sub \$0x80,$len
jmp .Loop8x_avx
.align 32
.Loop8x_avx:
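# Main loop: multiply 8 fresh blocks by H^1..H^8 while the previous
# iteration's 256-bit product ($Xi:$Xo, middle term $Tred) is being
# reduced; interleaving the two hides the vpclmulqdq latency.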
vpunpckhqdq $Ij,$Ij,$T1
vmovdqu 0x70($inp),$Ii # I[7]
vpxor $Xlo,$Zlo,$Zlo
vpxor $Ij,$T1,$T1
vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
vpshufb $bswap,$Ii,$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
vpunpckhqdq $Ii,$Ii,$T2
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Tred
vmovdqu 0x20-0x40($Htbl),$HK
vpxor $Ii,$T2,$T2
vmovdqu 0x60($inp),$Ij # I[6]
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpxor $Zlo,$Xi,$Xi # collect result
vpshufb $bswap,$Ij,$Ij
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vxorps $Zhi,$Xo,$Xo
vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
vpunpckhqdq $Ij,$Ij,$T1
vpclmulqdq \$0x00,$HK, $T2,$Xmi
vpxor $Zmi,$Tred,$Tred
vxorps $Ij,$T1,$T1
vmovdqu 0x50($inp),$Ii # I[5]
vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
vpxor $Xo,$Tred,$Tred
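# Split the middle term across the two 128-bit halves so it can be
# folded into $Xi (low half) and $Xo (high half).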
vpslldq \$8,$Tred,$T2
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
vpsrldq \$8,$Tred,$Tred
vpxor $T2, $Xi, $Xi
vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
vpshufb $bswap,$Ii,$Ii
vxorps $Tred,$Xo, $Xo
vpxor $Xhi,$Zhi,$Zhi
vpunpckhqdq $Ii,$Ii,$T2
vpclmulqdq \$0x10,$HK, $T1,$Zmi
vmovdqu 0x50-0x40($Htbl),$HK
vpxor $Ii,$T2,$T2
vpxor $Xmi,$Zmi,$Zmi
vmovdqu 0x40($inp),$Ij # I[4]
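# 1st phase of the deferred reduction: rotate $Xi and multiply its
# low qword by the 0x1c2 polynomial at (%r10) (set up in the
# prologue, outside this excerpt).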
vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpshufb $bswap,$Ij,$Ij
vpxor $Zlo,$Xlo,$Xlo
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Zhi,$Xhi,$Xhi
vpclmulqdq \$0x00,$HK, $T2,$Xmi
vxorps $Ij,$T1,$T1
vpxor $Zmi,$Xmi,$Xmi
vmovdqu 0x30($inp),$Ii # I[3]
vpclmulqdq \$0x10,(%r10),$Xi,$Xi
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
vpshufb $bswap,$Ii,$Ii
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
vpunpckhqdq $Ii,$Ii,$T2
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x10,$HK, $T1,$Zmi
vmovdqu 0x80-0x40($Htbl),$HK
vpxor $Ii,$T2,$T2
vpxor $Xmi,$Zmi,$Zmi
vmovdqu 0x20($inp),$Ij # I[2]
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpshufb $bswap,$Ij,$Ij
vpxor $Zlo,$Xlo,$Xlo
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Zhi,$Xhi,$Xhi
vpclmulqdq \$0x00,$HK, $T2,$Xmi
vpxor $Ij,$T1,$T1
vpxor $Zmi,$Xmi,$Xmi
vxorps $Tred,$Xi,$Xi
vmovdqu 0x10($inp),$Ii # I[1]
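# 2nd reduction phase mirrors the 1st; once done, the fully reduced
# $Xi is folded into block I[0] for the next iteration.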
vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
vpshufb $bswap,$Ii,$Ii
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
vpclmulqdq \$0x10,(%r10),$Xi,$Xi
vxorps $Xo,$Tred,$Tred
vpunpckhqdq $Ii,$Ii,$T2
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x10,$HK, $T1,$Zmi
vmovdqu 0xb0-0x40($Htbl),$HK
vpxor $Ii,$T2,$T2
vpxor $Xmi,$Zmi,$Zmi
vmovdqu ($inp),$Ij # I[0]
vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
vpshufb $bswap,$Ij,$Ij
vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
vpxor $Tred,$Ij,$Ij
vpclmulqdq \$0x10,$HK, $T2,$Xmi
vpxor $Xi,$Ij,$Ij # accumulate $Xi
lea 0x80($inp),$inp
sub \$0x80,$len
jnc .Loop8x_avx
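# A borrow from the subtraction means fewer than 8 blocks remain:
# restore $len and finish combining the products already computed.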
add \$0x80,$len
jmp .Ltail_no_xor_avx
.align 32
.Lshort_avx:
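# Tail path for 1..7 blocks: walk the input backwards so that the
# i-th block from the end is paired with $Hkey^i; the running $Xi is
# folded into the earliest block, which carries the highest power.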
vmovdqu -0x10($inp,$len),$Ii # very last word
lea ($inp,$len),$inp
vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
vmovdqu 0x20-0x40($Htbl),$HK
vpshufb $bswap,$Ii,$Ij
vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, $Zhi and $Zmi:
vmovdqa $Xhi,$Zhi # since each Z now equals its X, the first
vmovdqa $Xmi,$Zmi # "vpxor X,Z,Z" below produces zero
sub \$0x10,$len
jz .Ltail_avx
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vmovdqu -0x20($inp),$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
vpshufb $bswap,$Ii,$Ij
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vpsrldq \$8,$HK,$HK
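# $HK packs the Karatsuba constants for two consecutive powers of H;
# the shift exposes the next power's constant in the low qword (the
# same trick recurs after every odd power below).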
sub \$0x10,$len
jz .Ltail_avx
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vmovdqu -0x30($inp),$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
vpshufb $bswap,$Ii,$Ij
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vmovdqu 0x50-0x40($Htbl),$HK
sub \$0x10,$len
jz .Ltail_avx
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vmovdqu -0x40($inp),$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
vpshufb $bswap,$Ii,$Ij
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vpsrldq \$8,$HK,$HK
sub \$0x10,$len
jz .Ltail_avx
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vmovdqu -0x50($inp),$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
vpshufb $bswap,$Ii,$Ij
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vmovdqu 0x80-0x40($Htbl),$HK
sub \$0x10,$len
jz .Ltail_avx
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vmovdqu -0x60($inp),$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
vpshufb $bswap,$Ii,$Ij
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vpsrldq \$8,$HK,$HK
sub \$0x10,$len
jz .Ltail_avx
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vmovdqu -0x70($inp),$Ii
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
vpshufb $bswap,$Ii,$Ij
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vmovq 0xb8-0x40($Htbl),$HK # lone Karatsuba constant for $Hkey^7, zero-extended
sub \$0x10,$len
jmp .Ltail_avx
.align 32
.Ltail_avx:
vpxor $Xi,$Ij,$Ij # accumulate $Xi
.Ltail_no_xor_avx:
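# Last block's Karatsuba multiply, then combine all accumulated
# products and reduce.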
vpunpckhqdq $Ij,$Ij,$T1
vpxor $Xlo,$Zlo,$Zlo
vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
vpxor $Ij,$T1,$T1
vpxor $Xhi,$Zhi,$Zhi
vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
vpxor $Xmi,$Zmi,$Zmi
vpclmulqdq \$0x00,$HK,$T1,$Xmi
vmovdqu (%r10),$Tred # the 0x1c2 reduction polynomial (address set in the prologue)
vpxor $Xlo,$Zlo,$Xi
vpxor $Xhi,$Zhi,$Xo
vpxor $Xmi,$Zmi,$Zmi
vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
vpxor $Xo, $Zmi,$Zmi
vpslldq \$8, $Zmi,$T2
vpsrldq \$8, $Zmi,$Zmi
vpxor $T2, $Xi, $Xi
vpxor $Zmi,$Xo, $Xo
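# Reduce 256 bits to 128 modulo x^128+x^7+x^2+x+1, as two rounds of
# "multiply the low half by the 0x1c2 constant, rotate, xor".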
vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
vpalignr \$8,$Xi,$Xi,$Xi
vpxor $T2,$Xi,$Xi
vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
vpalignr \$8,$Xi,$Xi,$Xi
vpxor $Xo,$Xi,$Xi
vpxor $T2,$Xi,$Xi
cmp \$0,$len
jne .Lshort_avx # 1..7 blocks still pending after the 8x loop
vpshufb $bswap,$Xi,$Xi
vmovdqu $Xi,($Xip)
vzeroupper
___
$code.=<<___ if ($win64);
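# Win64 ABI: restore the non-volatile %xmm6..%xmm15 spilled in the
# prologue and release the 0xa8-byte spill area.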
movaps (%rsp),%xmm6
movaps 0x10(%rsp),%xmm7
movaps 0x20(%rsp),%xmm8
movaps 0x30(%rsp),%xmm9
movaps 0x40(%rsp),%xmm10
movaps 0x50(%rsp),%xmm11
movaps 0x60(%rsp),%xmm12
movaps 0x70(%rsp),%xmm13
movaps 0x80(%rsp),%xmm14
movaps 0x90(%rsp),%xmm15
lea 0xa8(%rsp),%rsp
___
$code.=<<___;
ret
.cfi_endproc
.seh_endproc
.size gcm_ghash_avx,.-gcm_ghash_avx
___
} else {