arrow/math/_lib/uint64_neon.s (59 lines of code) (raw):
.text
.file "uint64.c"
//-----------------------------------------------------------------------
// sum_uint64_neon -- add up an array of 64-bit elements (wrapping mod 2^64)
//
// C-equivalent, inferred from register use (confirm against uint64.c):
//   void sum_uint64_neon(const uint64_t *buf, size_t len, uint64_t *res)
//
// ABI:   AAPCS64
// In:    x0 = base address of the elements (each read as 8 bytes)
//        x1 = element count (unsigned; 0 is allowed)
//        x2 = address that receives the 64-bit total
// Out:   [x2] = sum of the x1 elements, 0 when x1 == 0
//        x0-x2 are not written by this routine
// Clobb: x8-x11, v0-v3, NZCV
// Stack: one 16-byte frame record (x29/x30), pushed on entry, popped on exit
//-----------------------------------------------------------------------
        .globl  sum_uint64_neon                 // -- Begin function sum_uint64_neon
        .p2align 2
        .type   sum_uint64_neon,@function
sum_uint64_neon:                                // @sum_uint64_neon
        stp     x29, x30, [sp, #-16]!           // push frame record
        mov     x29, sp

        mov     x9, xzr                         // x9 = running total
        cbz     x1, .Lsum_store                 // no elements: store 0

        mov     x8, xzr                         // x8 = elements consumed so far
        cmp     x1, #4
        b.lo    .Lsum_tail                      // 1..3 elements: scalar loop only

        // Vector part: two 128-bit accumulators, 4 elements per iteration.
        and     x8, x1, #0xfffffffffffffffc     // x8 = count rounded down to a multiple of 4
        movi    v0.2d, #0000000000000000        // accumulator for 1st pair of lanes
        movi    v1.2d, #0000000000000000        // accumulator for 2nd pair of lanes
        mov     x10, x0                         // x10 = read cursor
        mov     x11, x8                         // x11 = elements left for this loop (>= 4)
.Lsum_vec_loop:
        ldp     q2, q3, [x10], #32              // load 4 elements, advance cursor 32 bytes
        subs    x11, x11, #4
        add     v0.2d, v0.2d, v2.2d
        add     v1.2d, v1.2d, v3.2d
        b.ne    .Lsum_vec_loop

        // Collapse the four 64-bit lanes into one scalar total.
        add     v0.2d, v0.2d, v1.2d
        addp    d0, v0.2d
        fmov    x9, d0                          // x9 = total of the first x8 elements
        cmp     x8, x1
        b.eq    .Lsum_store                     // count was a multiple of 4: no tail

        // Scalar tail: invariant on entry is x8 < x1, so at least one element remains.
.Lsum_tail:
        add     x10, x0, x8, lsl #3             // x10 = &element[x8]
        sub     x8, x1, x8                      // x8 = elements remaining (> 0)
.Lsum_tail_loop:
        ldr     x11, [x10], #8                  // next element, advance cursor
        subs    x8, x8, #1
        add     x9, x9, x11
        b.ne    .Lsum_tail_loop

.Lsum_store:
        str     x9, [x2]                        // publish the total
        ldp     x29, x30, [sp], #16             // pop frame record
        ret
.Lfunc_end0:
        .size   sum_uint64_neon, .Lfunc_end0-sum_uint64_neon
                                                // -- End function
.ident "clang version 9.0.1-12 "
.section ".note.GNU-stack","",@progbits
.addrsig