in ring/crypto/chacha/asm/chacha-x86_64.pl [361:570]
sub \$64,%rbp
jnz .Loop_outer
jmp .Ldone
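# Fewer than 64 bytes remain: spill the whole keystream block to the stack
# (words 8..11 of the block are in %xmm1) and XOR it into the input one byte
# at a time in .Loop_tail below.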
.align 16
.Ltail:
mov @x[0],4*0(%rsp)
mov @x[1],4*1(%rsp)
xor %rbx,%rbx
mov @x[2],4*2(%rsp)
mov @x[3],4*3(%rsp)
mov @x[4],4*4(%rsp)
mov @x[5],4*5(%rsp)
mov @x[6],4*6(%rsp)
mov @x[7],4*7(%rsp)
movdqa %xmm1,4*8(%rsp)
mov @x[12],4*12(%rsp)
mov @x[13],4*13(%rsp)
mov @x[14],4*14(%rsp)
mov @x[15],4*15(%rsp)
.Loop_tail:
movzb ($inp,%rbx),%eax
movzb (%rsp,%rbx),%edx
lea 1(%rbx),%rbx
xor %edx,%eax
mov %al,-1($out,%rbx)
dec %rbp
jnz .Loop_tail
.Ldone:
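# Epilogue: restore the callee-saved registers spilled by the prologue and
# release the stack frame; %rsi temporarily points just above the save area.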
lea 64+24+48(%rsp),%rsi
mov -48(%rsi),%r15
.cfi_restore r15
mov -40(%rsi),%r14
.cfi_restore r14
mov -32(%rsi),%r13
.cfi_restore r13
mov -24(%rsi),%r12
.cfi_restore r12
mov -16(%rsi),%rbp
.cfi_restore rbp
mov -8(%rsi),%rbx
.cfi_restore rbx
lea (%rsi),%rsp
.cfi_adjust_cfa_offset `-64-24-48`
.Lno_data:
ret
.cfi_endproc
.size GFp_ChaCha20_ctr32,.-GFp_ChaCha20_ctr32
___
########################################################################
# SSSE3 code path that handles shorter lengths (at most 128 bytes per call;
# longer inputs are redirected to the 4-way code path)
{
my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7));
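# One ChaCha quarter-round applied to four columns (or, after the pshufd
# shuffles below, four diagonals) at once: $a,$b,$c,$d each hold one 4x32-bit
# row of the state, so every xmm lane carries an independent column.
# Per lane this computes:
#   a += b; d ^= a; d <<<= 16;
#   c += d; b ^= c; b <<<= 12;
#   a += b; d ^= a; d <<<=  8;
#   c += d; b ^= c; b <<<=  7;
# The 16- and 8-bit rotations are byte shuffles (pshufb with .Lrot16/.Lrot24);
# the 12- and 7-bit ones use a pair of shifts and an or.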
sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round
&paddd ($a,$b);
&pxor ($d,$a);
&pshufb ($d,$rot16);
&paddd ($c,$d);
&pxor ($b,$c);
&movdqa ($t,$b);
&psrld ($b,20);
&pslld ($t,12);
&por ($b,$t);
&paddd ($a,$b);
&pxor ($d,$a);
&pshufb ($d,$rot24);
&paddd ($c,$d);
&pxor ($b,$c);
&movdqa ($t,$b);
&psrld ($b,25);
&pslld ($t,7);
&por ($b,$t);
}
my $xframe = $win64 ? 32+8 : 8;
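# $xframe reserves room to spill the non-volatile xmm6/xmm7 on Win64 (32 bytes
# plus 8 for alignment).  ChaCha20_ssse3 takes the same five arguments as
# GFp_ChaCha20_ctr32 and is reached via that function's SSSE3 dispatch.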
$code.=<<___;
.type ChaCha20_ssse3,\@function,5
.align 32
ChaCha20_ssse3:
.LChaCha20_ssse3:
.cfi_startproc
mov %rsp,%r9 # frame pointer
.cfi_def_cfa_register r9
___
$code.=<<___;
cmp \$128,$len # we might throw away some data,
ja .LChaCha20_4x # but overall it won't be slower
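# (inputs longer than 128 bytes take the 4-way path; it may compute more
#  keystream than it needs, hence the note above, but it amortizes better)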
.Ldo_sse3_after_all:
sub \$64+$xframe,%rsp
___
$code.=<<___ if ($win64);
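# xmm6 and xmm7 are non-volatile in the Win64 calling convention, so they are
# saved below the frame pointer and restored in the epilogue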
movaps %xmm6,-0x28(%r9)
movaps %xmm7,-0x18(%r9)
.Lssse3_body:
___
$code.=<<___;
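# Build the initial state: the .Lsigma constants, the 256-bit key and the
# counter/nonce block, keeping a copy of each row on the stack for the
# feed-forward addition and for advancing the counter between blocks.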
movdqa .Lsigma(%rip),$a
movdqu ($key),$b
movdqu 16($key),$c
movdqu ($counter),$d
movdqa .Lrot16(%rip),$rot16
movdqa .Lrot24(%rip),$rot24
movdqa $a,0x00(%rsp)
movdqa $b,0x10(%rsp)
movdqa $c,0x20(%rsp)
movdqa $d,0x30(%rsp)
mov \$10,$counter # reuse $counter
jmp .Loop_ssse3
.align 32
.Loop_outer_ssse3:
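# next 64-byte block: reload the saved state and advance the 32-bit block
# counter by adding .Lone to the fourth row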
movdqa .Lone(%rip),$d
movdqa 0x00(%rsp),$a
movdqa 0x10(%rsp),$b
movdqa 0x20(%rsp),$c
paddd 0x30(%rsp),$d
mov \$10,$counter
movdqa $d,0x30(%rsp)
jmp .Loop_ssse3
.align 32
.Loop_ssse3:
___
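# Each pass through .Loop_ssse3 is one ChaCha double round: a column round,
# a pshufd shuffle of $b/$c/$d into diagonal position, a diagonal round and
# the inverse shuffle; ten passes give the full 20 rounds.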
&SSSE3ROUND();
&pshufd ($c,$c,0b01001110);
&pshufd ($b,$b,0b00111001);
&pshufd ($d,$d,0b10010011);
&nop ();
&SSSE3ROUND();
&pshufd ($c,$c,0b01001110);
&pshufd ($b,$b,0b10010011);
&pshufd ($d,$d,0b00111001);
&dec ($counter);
&jnz (".Loop_ssse3");
$code.=<<___;
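# feed-forward: add the original input state (saved on the stack) back into
# the permuted state to produce the keystream block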
paddd 0x00(%rsp),$a
paddd 0x10(%rsp),$b
paddd 0x20(%rsp),$c
paddd 0x30(%rsp),$d
cmp \$64,$len
jb .Ltail_ssse3
movdqu 0x00($inp),$t
movdqu 0x10($inp),$t1
pxor $t,$a # xor with input
movdqu 0x20($inp),$t
pxor $t1,$b
movdqu 0x30($inp),$t1
lea 0x40($inp),$inp # inp+=64
pxor $t,$c
pxor $t1,$d
movdqu $a,0x00($out) # write output
movdqu $b,0x10($out)
movdqu $c,0x20($out)
movdqu $d,0x30($out)
lea 0x40($out),$out # out+=64
sub \$64,$len
jnz .Loop_outer_ssse3
jmp .Ldone_ssse3
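# fewer than 64 bytes remain: spill the keystream block to the stack and
# XOR it into the input one byte at a time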
.align 16
.Ltail_ssse3:
movdqa $a,0x00(%rsp)
movdqa $b,0x10(%rsp)
movdqa $c,0x20(%rsp)
movdqa $d,0x30(%rsp)
xor $counter,$counter
.Loop_tail_ssse3:
movzb ($inp,$counter),%eax
movzb (%rsp,$counter),%ecx
lea 1($counter),$counter
xor %ecx,%eax
mov %al,-1($out,$counter)
dec $len
jnz .Loop_tail_ssse3
.Ldone_ssse3:
___
$code.=<<___ if ($win64);
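# restore the Win64 non-volatile xmm registers saved in the prologue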
movaps -0x28(%r9),%xmm6
movaps -0x18(%r9),%xmm7
___
$code.=<<___;
lea (%r9),%rsp
.cfi_def_cfa_register rsp
.Lssse3_epilogue:
ret
.cfi_endproc
.size ChaCha20_ssse3,.-ChaCha20_ssse3
___
}