in ring/crypto/fipsmodule/aes/asm/aesni-x86_64.pl [627:847]
sub \$$frame_size,%rsp
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
___
$code.=<<___ if ($win64);
movaps %xmm6,-0xa8($key_) # offload everything
movaps %xmm7,-0x98($key_)
movaps %xmm8,-0x88($key_)
movaps %xmm9,-0x78($key_)
movaps %xmm10,-0x68($key_)
movaps %xmm11,-0x58($key_)
movaps %xmm12,-0x48($key_)
movaps %xmm13,-0x38($key_)
movaps %xmm14,-0x28($key_)
movaps %xmm15,-0x18($key_)
.Lctr32_body:
___
$code.=<<___;
# 8 16-byte words on top of stack are counter values
# xor-ed with zero-round key
movdqu ($ivp),$inout0
movdqu ($key),$rndkey0
mov 12($ivp),$ctr # counter LSB
pxor $rndkey0,$inout0
mov 12($key),$key0 # 0-round key LSB
movdqa $inout0,0x00(%rsp) # populate counter block
bswap $ctr
movdqa $inout0,$inout1
movdqa $inout0,$inout2
movdqa $inout0,$inout3
movdqa $inout0,0x40(%rsp)
movdqa $inout0,0x50(%rsp)
movdqa $inout0,0x60(%rsp)
mov %rdx,%r10 # about to borrow %rdx
movdqa $inout0,0x70(%rsp)
lea 1($ctr),%rax
lea 2($ctr),%rdx
bswap %eax
bswap %edx
xor $key0,%eax
xor $key0,%edx
pinsrd \$3,%eax,$inout1
lea 3($ctr),%rax
movdqa $inout1,0x10(%rsp)
pinsrd \$3,%edx,$inout2
bswap %eax
mov %r10,%rdx # restore %rdx
lea 4($ctr),%r10
movdqa $inout2,0x20(%rsp)
xor $key0,%eax
bswap %r10d
pinsrd \$3,%eax,$inout3
xor $key0,%r10d
movdqa $inout3,0x30(%rsp)
lea 5($ctr),%r9
mov %r10d,0x40+12(%rsp)
bswap %r9d
lea 6($ctr),%r10
mov 240($key),$rounds # key->rounds
xor $key0,%r9d
bswap %r10d
mov %r9d,0x50+12(%rsp)
xor $key0,%r10d
lea 7($ctr),%r9
mov %r10d,0x60+12(%rsp)
bswap %r9d
leaq GFp_ia32cap_P(%rip),%r10
mov 4(%r10),%r10d
xor $key0,%r9d
and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
mov %r9d,0x70+12(%rsp)
$movkey 0x10($key),$rndkey1
movdqa 0x40(%rsp),$inout4
movdqa 0x50(%rsp),$inout5
cmp \$8,$len # $len is in blocks
jb .Lctr32_tail # short input if ($len<8)
sub \$6,$len # $len is biased by -6
cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
je .Lctr32_6x # [which denotes Atom Silvermont]
lea 0x80($key),$key # size optimization
sub \$2,$len # $len is biased by -8
jmp .Lctr32_loop8
.align 16
.Lctr32_6x:
shl \$4,$rounds
mov \$48,$rnds_
bswap $key0
lea 32($key,$rounds),$key # end of key schedule
sub %rax,%r10 # twisted $rounds
jmp .Lctr32_loop6
.align 16
.Lctr32_loop6:
add \$6,$ctr # next counter value
$movkey -48($key,$rnds_),$rndkey0
aesenc $rndkey1,$inout0
mov $ctr,%eax
xor $key0,%eax
aesenc $rndkey1,$inout1
movbe %eax,`0x00+12`(%rsp) # store next counter value
lea 1($ctr),%eax
aesenc $rndkey1,$inout2
xor $key0,%eax
movbe %eax,`0x10+12`(%rsp)
aesenc $rndkey1,$inout3
lea 2($ctr),%eax
xor $key0,%eax
aesenc $rndkey1,$inout4
movbe %eax,`0x20+12`(%rsp)
lea 3($ctr),%eax
aesenc $rndkey1,$inout5
$movkey -32($key,$rnds_),$rndkey1
xor $key0,%eax
aesenc $rndkey0,$inout0
movbe %eax,`0x30+12`(%rsp)
lea 4($ctr),%eax
aesenc $rndkey0,$inout1
xor $key0,%eax
movbe %eax,`0x40+12`(%rsp)
aesenc $rndkey0,$inout2
lea 5($ctr),%eax
xor $key0,%eax
aesenc $rndkey0,$inout3
movbe %eax,`0x50+12`(%rsp)
mov %r10,%rax # mov $rnds_,$rounds
aesenc $rndkey0,$inout4
aesenc $rndkey0,$inout5
$movkey -16($key,$rnds_),$rndkey0
call .Lenc_loop6
movdqu ($inp),$inout6 # load 6 input blocks
movdqu 0x10($inp),$inout7
movdqu 0x20($inp),$in0
movdqu 0x30($inp),$in1
movdqu 0x40($inp),$in2
movdqu 0x50($inp),$in3
lea 0x60($inp),$inp # $inp+=6*16
$movkey -64($key,$rnds_),$rndkey1
pxor $inout0,$inout6 # inp^=E(ctr)
movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
pxor $inout1,$inout7
movaps 0x10(%rsp),$inout1
pxor $inout2,$in0
movaps 0x20(%rsp),$inout2
pxor $inout3,$in1
movaps 0x30(%rsp),$inout3
pxor $inout4,$in2
movaps 0x40(%rsp),$inout4
pxor $inout5,$in3
movaps 0x50(%rsp),$inout5
movdqu $inout6,($out) # store 6 output blocks
movdqu $inout7,0x10($out)
movdqu $in0,0x20($out)
movdqu $in1,0x30($out)
movdqu $in2,0x40($out)
movdqu $in3,0x50($out)
lea 0x60($out),$out # $out+=6*16
sub \$6,$len
jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
add \$6,$len # restore real remaining $len
jz .Lctr32_done # done if ($len==0)
lea -48($rnds_),$rounds
lea -80($key,$rnds_),$key # restore $key
neg $rounds
shr \$4,$rounds # restore $rounds
jmp .Lctr32_tail
.align 32
.Lctr32_loop8:
add \$8,$ctr # next counter value
movdqa 0x60(%rsp),$inout6
aesenc $rndkey1,$inout0
mov $ctr,%r9d
movdqa 0x70(%rsp),$inout7
aesenc $rndkey1,$inout1
bswap %r9d
$movkey 0x20-0x80($key),$rndkey0
aesenc $rndkey1,$inout2
xor $key0,%r9d
nop
aesenc $rndkey1,$inout3
mov %r9d,0x00+12(%rsp) # store next counter value
lea 1($ctr),%r9
aesenc $rndkey1,$inout4
aesenc $rndkey1,$inout5
aesenc $rndkey1,$inout6
aesenc $rndkey1,$inout7
$movkey 0x30-0x80($key),$rndkey1
___
for($i=2;$i<8;$i++) {
my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
$code.=<<___;
bswap %r9d
aesenc $rndkeyx,$inout0
aesenc $rndkeyx,$inout1
xor $key0,%r9d
.byte 0x66,0x90
aesenc $rndkeyx,$inout2
aesenc $rndkeyx,$inout3
mov %r9d,`0x10*($i-1)`+12(%rsp)
lea $i($ctr),%r9
aesenc $rndkeyx,$inout4
aesenc $rndkeyx,$inout5
aesenc $rndkeyx,$inout6
aesenc $rndkeyx,$inout7
$movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
___
}