in crypto/fipsmodule/sha/asm/sha512-x86_64.pl [1139:1377]
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
and \$-64,%rsp # align stack frame
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arh
mov %rdx,$_end # save end pointer, "3rd" arg
mov %rax,$_rsp # save copy of %rsp
.cfi_cfa_expression $_rsp,deref,+8
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
movaps %xmm7,16*$SZ+48(%rsp)
movaps %xmm8,16*$SZ+64(%rsp)
movaps %xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
movaps %xmm10,16*$SZ+96(%rsp)
movaps %xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx:
vzeroupper
mov $SZ*0($ctx),$A
mov $SZ*1($ctx),$B
mov $SZ*2($ctx),$C
mov $SZ*3($ctx),$D
mov $SZ*4($ctx),$E
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
___
if ($SZ==4) { # SHA256
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
$code.=<<___;
vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
jmp .Lloop_avx
.align 16
.Lloop_avx:
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0]
vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2]
vmovdqu 0x30($inp),@X[3]
vpshufb $t3,@X[0],@X[0]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[1],@X[1]
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
mov $A,$a1
vmovdqa $t1,0x10(%rsp)
mov $B,$a3
vmovdqa $t2,0x20(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x30(%rsp)
mov $E,$a0
jmp .Lavx_00_47
.align 16
.Lavx_00_47:
sub \$`-16*2*$SZ`,$Tbl # size optimization
___
sub Xupdate_256_AVX () {
(
'&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
'&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
'&vpsrld ($t2,$t0,$sigma0[0]);',
'&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
'&vpsrld ($t3,$t0,$sigma0[2])',
'&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
'&vpxor ($t0,$t3,$t2)',
'&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
'&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t1)',
'&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t2)',
'&vpsrld ($t2,$t3,$sigma1[2]);',
'&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
'&vpsrlq ($t3,$t3,$sigma1[0]);',
'&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
'&vpxor ($t2,$t2,$t3);',
'&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
'&vpxor ($t2,$t2,$t3)',
'&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
'&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
'&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
'&vpsrld ($t2,$t3,$sigma1[2])',
'&vpsrlq ($t3,$t3,$sigma1[0])',
'&vpxor ($t2,$t2,$t3);',
'&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
'&vpxor ($t2,$t2,$t3)',
'&vpshufb ($t2,$t2,$t5)',
'&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
);
}
sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
foreach (Xupdate_256_AVX()) { # 29 instructions
eval;
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
}
&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<4; $j++) {
&AVX_256_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
&jne (".Lavx_00_47");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
}
} else { # SHA512
my @X = map("%xmm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
$code.=<<___;
jmp .Lloop_avx
.align 16
.Lloop_avx:
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0]
lea $TABLE+0x80(%rip),$Tbl # size optimization
vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2]
vpshufb $t3,@X[0],@X[0]
vmovdqu 0x30($inp),@X[3]
vpshufb $t3,@X[1],@X[1]
vmovdqu 0x40($inp),@X[4]
vpshufb $t3,@X[2],@X[2]
vmovdqu 0x50($inp),@X[5]
vpshufb $t3,@X[3],@X[3]
vmovdqu 0x60($inp),@X[6]
vpshufb $t3,@X[4],@X[4]
vmovdqu 0x70($inp),@X[7]
vpshufb $t3,@X[5],@X[5]
vpaddq -0x80($Tbl),@X[0],$t0
vpshufb $t3,@X[6],@X[6]
vpaddq -0x60($Tbl),@X[1],$t1
vpshufb $t3,@X[7],@X[7]
vpaddq -0x40($Tbl),@X[2],$t2
vpaddq -0x20($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vpaddq 0x00($Tbl),@X[4],$t0
vmovdqa $t1,0x10(%rsp)
vpaddq 0x20($Tbl),@X[5],$t1
vmovdqa $t2,0x20(%rsp)
vpaddq 0x40($Tbl),@X[6],$t2
vmovdqa $t3,0x30(%rsp)
vpaddq 0x60($Tbl),@X[7],$t3
vmovdqa $t0,0x40(%rsp)
mov $A,$a1
vmovdqa $t1,0x50(%rsp)
mov $B,$a3
vmovdqa $t2,0x60(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x70(%rsp)
mov $E,$a0
jmp .Lavx_00_47
.align 16
.Lavx_00_47:
add \$`16*2*$SZ`,$Tbl
___
sub Xupdate_512_AVX () {
(
'&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
'&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
'&vpsrlq ($t2,$t0,$sigma0[0])',
'&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
'&vpsrlq ($t3,$t0,$sigma0[2])',
'&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
'&vpxor ($t0,$t3,$t2)',
'&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t1)',
'&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t2)',
'&vpsrlq ($t3,@X[7],$sigma1[2]);',
'&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
'&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
'&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
'&vpsrlq ($t1,@X[7],$sigma1[0]);',
'&vpxor ($t3,$t3,$t2)',
'&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
'&vpxor ($t3,$t3,$t1)',
'&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
'&vpxor ($t3,$t3,$t2)',
'&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
'&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
);
}
sub AVX_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body); # 52 instructions
foreach (Xupdate_512_AVX()) { # 23 instructions
eval;
eval(shift(@insns));
eval(shift(@insns));
}
&vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<8; $j++) {
&AVX_512_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
&jne (".Lavx_00_47");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
}
}