sub $len,$len,()

in crypto/poly1305-armv8.pl [153:435]


	sub	$len,$len,#16
#ifdef	__AARCH64EB__
	rev	$t0,$t0
	rev	$t1,$t1
#endif
	adds	$h0,$h0,$t0		// accumulate input
	adcs	$h1,$h1,$t1

	mul	$d0,$h0,$r0		// h0*r0
	adc	$h2,$h2,$padbit
	umulh	$d1,$h0,$r0

	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	cbnz	$len,.Loop

	stp	$h0,$h1,[$ctx]		// store hash value
	stp	$h2,xzr,[$ctx,#16]	// [and clear is_base2_26]

.Lno_data:
	ret
.size	poly1305_blocks,.-poly1305_blocks

// poly1305_emit: finalize the accumulator and write the 16-byte result.
//
// Reads the hash state and the is_base2_26 flag from ctx, converts the
// hash from base 2^26 to base 2^64 when the flag is non-zero, performs the
// final conditional reduction modulo 2^130-5, adds the 128-bit nonce
// (modulo 2^128) and stores the two resulting 64-bit words at mac.
// Only registers are modified besides the 16 bytes written at mac; the
// context itself is not written back.
.type	poly1305_emit,%function
.align	5
poly1305_emit:
.Lpoly1305_emit:
	ldp	$h0,$h1,[$ctx]		// load hash base 2^64
	// The second word of this pair (ctx+24) is the radix flag; it is
	// parked in the register otherwise named r0, which is free here.
	ldp	$h2,$r0,[$ctx,#16]	// [along with is_base2_26]
	ldp	$t0,$t1,[$nonce]	// load nonce

	// Speculatively treat the three 64-bit words just loaded as five
	// consecutive 32-bit limbs and split them out. The result is only
	// used if the flag says the state really is in base 2^26. On
	// big-endian the first 32-bit word of each pair lands in the upper
	// half of the 64-bit load, so the extraction is mirrored.
#ifdef	__AARCH64EB__
	lsr	$d0,$h0,#32
	mov	w#$d1,w#$h0
	lsr	$d2,$h1,#32
	mov	w15,w#$h1
	lsr	x16,$h2,#32
#else
	mov	w#$d0,w#$h0
	lsr	$d1,$h0,#32
	mov	w#$d2,w#$h1
	lsr	x15,$h1,#32
	mov	w16,w#$h2
#endif

	// Recombine: limb k contributes at bit position 26*k. Limbs 2 and 4
	// straddle a 64-bit boundary, so each is split with a paired
	// lsr / lsl. The plain add instructions do not touch the flags, so
	// each adc below consumes the carry of the preceding adds.
	add	$d0,$d0,$d1,lsl#26	// base 2^26 -> base 2^64
	lsr	$d1,$d2,#12
	adds	$d0,$d0,$d2,lsl#52
	add	$d1,$d1,x15,lsl#14
	adc	$d1,$d1,xzr
	lsr	$d2,x16,#24
	adds	$d1,$d1,x16,lsl#40
	adc	$d2,$d2,xzr

	// Flag zero: keep the words exactly as loaded (already base 2^64).
	// Flag non-zero: take the converted value computed above.
	cmp	$r0,#0			// is_base2_26?
	csel	$h0,$h0,$d0,eq		// choose between radixes
	csel	$h1,$h1,$d1,eq
	csel	$h2,$h2,$d2,eq

	// Compute h+5 across all three words. If that reaches bit 130
	// (bit 2 or above of the top word) then h >= 2^130-5 and the
	// reduced value is h+5 modulo 2^130.
	adds	$d0,$h0,#5		// compare to modulus
	adcs	$d1,$h1,xzr
	adc	$d2,$h2,xzr

	tst	$d2,#-4			// see if it's carried/borrowed

	// Branch-free select. Only the low 128 bits are kept because the
	// output is 128 bits wide; the top word is not needed past here.
	csel	$h0,$h0,$d0,eq
	csel	$h1,$h1,$d1,eq

#ifdef	__AARCH64EB__
	ror	$t0,$t0,#32		// flip nonce words
	ror	$t1,$t1,#32
#endif
	// 128-bit addition; the carry out of the upper word is discarded.
	adds	$h0,$h0,$t0		// accumulate nonce
	adc	$h1,$h1,$t1
#ifdef	__AARCH64EB__
	rev	$h0,$h0			// flip output bytes
	rev	$h1,$h1
#endif
	stp	$h0,$h1,[$mac]		// write result

	ret
.size	poly1305_emit,.-poly1305_emit
___
# Symbolic names for the vector registers that get interpolated into the
# assembly text that follows.  Each group is a run of consecutive v-registers
# carrying a fixed arrangement suffix.

# v0-v8, four 32-bit lanes each.
my ($R0, $R1, $S1, $R2, $S2, $R3, $S3, $R4, $S4)
	= map { "v$_.4s" } 0 .. 8;

# v9-v13 and v14-v18, two 32-bit lanes each.
my ($IN01_0, $IN01_1, $IN01_2, $IN01_3, $IN01_4)
	= map { "v$_.2s" } 9 .. 13;
my ($IN23_0, $IN23_1, $IN23_2, $IN23_3, $IN23_4)
	= map { "v$_.2s" } 14 .. 18;

# v19-v23, two 64-bit lanes each.
my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4)
	= map { "v$_.2d" } 19 .. 23;

# v24-v28, two 32-bit lanes each.
my ($H0, $H1, $H2, $H3, $H4)
	= map { "v$_.2s" } 24 .. 28;

# v29-v31 without an arrangement suffix; it is appended at the point of use.
my ($T0, $T1, $MASK)
	= map { "v$_" } 29 .. 31;

# Two general-purpose registers.
my ($in2, $zeros) = ("x16", "x17");

# The radix flag shares x17 with $zeros rather than taking its own register.
my $is_base2_26 = $zeros;

$code.=<<___;
// poly1305_mult: one multiply-and-reduce step on the base 2^64 state.
//
// In:   h0,h1,h2 hold the accumulator (h2 is the top word); r0,r1,s1 hold
//       the multiplier. The callers visible in this file compute
//       s1 = r1 + (r1 >> 2) immediately before the call.
// Out:  h0,h1,h2 = product with everything from bit 130 upwards folded
//       back into the low words. The result is only partially reduced:
//       it may still be at or above the modulus.
// Uses: d0,d1,d2,t0,t1 as scratch and overwrites the condition flags.
// Leaf routine: it touches no memory and returns with a plain ret.
//
// Multiplies and additions are interleaved; every adc consumes the carry
// of the adds directly before it (mul and umulh leave the flags alone),
// so the relative order of each adds/adc pair must be kept.
.type	poly1305_mult,%function
.align	5
poly1305_mult:
	mul	$d0,$h0,$r0		// h0*r0
	umulh	$d1,$h0,$r0

	// Stands in for the h1*r1 term, whose weight is 2^128. Using s1 in
	// place of r1 is exact only if the low two bits of r1 are zero -
	// presumably guaranteed by key clamping; confirm against the init
	// code, which is not part of this section.
	mul	$t0,$h1,$s1		// h1*5*r1
	umulh	$t1,$h1,$s1

	adds	$d0,$d0,$t0
	mul	$t0,$h0,$r1		// h0*r1
	adc	$d1,$d1,$t1
	umulh	$d2,$h0,$r1

	adds	$d1,$d1,$t0
	mul	$t0,$h1,$r0		// h1*r0
	adc	$d2,$d2,xzr
	umulh	$t1,$h1,$r0

	// For the two products involving h2 only the low 64 bits are
	// computed (no umulh), which relies on h2 being just a few bits wide.
	adds	$d1,$d1,$t0
	mul	$t0,$h2,$s1		// h2*5*r1
	adc	$d2,$d2,$t1
	mul	$t1,$h2,$r0		// h2*r0

	adds	$d1,$d1,$t0
	adc	$d2,$d2,$t1

	// Fold d2 at bit 130: with c = d2 >> 2, the next two instructions
	// build t0 = 4*c + c = 5*c, and h2 keeps only the low two bits of d2.
	// Adding 5*c to the low word applies 2^130 = 5 (mod 2^130-5).
	and	$t0,$d2,#-4		// final reduction
	and	$h2,$d2,#3
	add	$t0,$t0,$d2,lsr#2
	adds	$h0,$d0,$t0
	adcs	$h1,$d1,xzr
	adc	$h2,$h2,xzr

	ret
.size	poly1305_mult,.-poly1305_mult

// poly1305_splat: split the value in h0,h1,h2 into 26-bit limbs and store
// them, together with 5x multiples of limbs 1-4, into a table at ctx.
//
// In:   h0,h1,h2 = value in base 2^64 (h2 is the top word); ctx = base
//       address of the table as passed in by the caller.
// Out:  nine 32-bit words written at ctx + 16*k for k = 0..8, in the order
//       r0, r1, s1, r2, s2, r3, s3, r4, s4 where sN = 5*rN (32-bit
//       arithmetic). Each store writes 4 bytes of its 16-byte slot; what
//       the remaining bytes of a slot hold is decided by the caller and is
//       not visible here.
// Uses: x12-x16 as scratch. No instruction here sets the condition flags.
// Leaf routine: returns with a plain ret.
.type	poly1305_splat,%function
.align	4
poly1305_splat:
	// Limb k is bits 26*k .. 26*k+25 of the value. Limbs 2 and 4 straddle
	// a 64-bit word boundary and are pulled out with extr. Limb 4 is
	// not masked, so it keeps every bit from 104 upwards that fits in
	// the 32-bit store below.
	and	x12,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x13,$h0,#26,#26
	extr	x14,$h1,$h0,#52
	and	x14,x14,#0x03ffffff
	ubfx	x15,$h1,#14,#26
	extr	x16,$h2,$h1,#40

	// Each x12-x15 register is recycled to hold a 5x multiple as soon as
	// its limb has been stored (or, for x12, stored one line earlier),
	// so the store/compute order below is significant. The 5x multiple
	// is formed as limb + (limb << 2).
	str	w12,[$ctx,#16*0]	// r0
	add	w12,w13,w13,lsl#2	// r1*5
	str	w13,[$ctx,#16*1]	// r1
	add	w13,w14,w14,lsl#2	// r2*5
	str	w12,[$ctx,#16*2]	// s1
	str	w14,[$ctx,#16*3]	// r2
	add	w14,w15,w15,lsl#2	// r3*5
	str	w13,[$ctx,#16*4]	// s2
	str	w15,[$ctx,#16*5]	// r3
	add	w15,w16,w16,lsl#2	// r4*5
	str	w14,[$ctx,#16*6]	// s3
	str	w16,[$ctx,#16*7]	// r4
	str	w15,[$ctx,#16*8]	// s4

	ret
.size	poly1305_splat,.-poly1305_splat

#ifdef	__KERNEL__
.globl	poly1305_blocks_neon
#endif
.type	poly1305_blocks_neon,%function
.align	5
poly1305_blocks_neon:
.Lpoly1305_blocks_neon:
	ldr	$is_base2_26,[$ctx,#24]
	cmp	$len,#128
	b.lo	.Lpoly1305_blocks

	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0

	stp	d8,d9,[sp,#16]		// meet ABI requirements
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]

	cbz	$is_base2_26,.Lbase2_64_neon

	ldp	w10,w11,[$ctx]		// load hash value base 2^26
	ldp	w12,w13,[$ctx,#8]
	ldr	w14,[$ctx,#16]

	tst	$len,#31
	b.eq	.Leven_neon

	ldp	$r0,$r1,[$ctx,#32]	// load key value

	add	$h0,x10,x11,lsl#26	// base 2^26 -> base 2^64
	lsr	$h1,x12,#12
	adds	$h0,$h0,x12,lsl#52
	add	$h1,$h1,x13,lsl#14
	adc	$h1,$h1,xzr
	lsr	$h2,x14,#24
	adds	$h1,$h1,x14,lsl#40
	adc	$d2,$h2,xzr		// can be partially reduced...

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)

#ifdef	__AARCH64EB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult

	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	b	.Leven_neon

.align	4
.Lbase2_64_neon:
	ldp	$r0,$r1,[$ctx,#32]	// load key value

	ldp	$h0,$h1,[$ctx]		// load hash value base 2^64
	ldr	$h2,[$ctx,#16]

	tst	$len,#31
	b.eq	.Linit_neon

	ldp	$d0,$d1,[$inp],#16	// load input
	sub	$len,$len,#16
	add	$s1,$r1,$r1,lsr#2	// s1 = r1 + (r1 >> 2)
#ifdef	__AARCH64EB__
	rev	$d0,$d0
	rev	$d1,$d1
#endif
	adds	$h0,$h0,$d0		// accumulate input
	adcs	$h1,$h1,$d1
	adc	$h2,$h2,$padbit

	bl	poly1305_mult

.Linit_neon:
	ldr	w17,[$ctx,#48]		// first table element
	and	x10,$h0,#0x03ffffff	// base 2^64 -> base 2^26
	ubfx	x11,$h0,#26,#26
	extr	x12,$h1,$h0,#52
	and	x12,x12,#0x03ffffff
	ubfx	x13,$h1,#14,#26
	extr	x14,$h2,$h1,#40

	cmp	w17,#-1			// is value impossible?
	b.ne	.Leven_neon

	fmov	${H0},x10