arrow/math/_lib/float64_sse4.s (101 lines of code) (raw):
# ===========================================================================
# File conventions: GNU as, Intel operand order without register prefixes
# (see .intel_syntax below), ELF directives, x86-64.
# NOTE(review): the .file and .ident directives indicate this is compiler
# output for _lib/float64.c; presumably it should be regenerated from that
# source rather than edited by hand -- confirm with the build setup.
# ===========================================================================
.text
.intel_syntax noprefix
.file "_lib/float64.c"
# ---------------------------------------------------------------------------
# sum_float64_sse4
#
# Adds up rsi double-precision values read from consecutive 8-byte slots
# starting at rdi and stores the total as one double at [rdx].
# A count of zero stores +0.0.
#
# Register use as seen in this body. The three inputs sit in the first
# three System V AMD64 integer-argument registers; the C prototype is in
# _lib/float64.c and is not visible here -- TODO confirm.
#   In:    rdi = base address of the doubles (only unaligned loads are used)
#          rsi = element count, compared as unsigned
#          rdx = address that receives the 8-byte result
#   Out:   result written to [rdx]; no return register is set
#   Clobb: rax, rcx, rsi, r8, r9, xmm0-xmm7, flags
#   Stack: rbp is pushed and restored; no locals are stored
#
# Shape of the code:
#   * count of three or fewer: scalar loop only.
#   * otherwise r9 = count rounded down to a multiple of four. Those r9
#     elements are summed four at a time into two packed accumulators
#     (xmm0, xmm1): first a remainder loop that runs (r9/4) mod 4 times,
#     then a loop unrolled to sixteen elements per iteration. The
#     accumulators are folded into one scalar, and the final count - r9
#     elements (at most three) are added by the scalar loop.
#
# NOTE(review): the packed path adds elements in a different order than a
# plain front-to-back loop, so the rounding of the result can differ from
# a strictly sequential sum.
# ---------------------------------------------------------------------------
.globl sum_float64_sse4
.p2align 4, 0x90
.type sum_float64_sse4,@function
sum_float64_sse4: # @sum_float64_sse4
# BB#0:
# Frame setup. The and rounds rsp down to a multiple of 8; nothing is stored
# below it in this function, and rsp is restored from rbp before returning.
push rbp
mov rbp, rsp
and rsp, -8
# xmm0 = +0.0 is the running total; it is also what gets stored when the
# count is zero.
xorpd xmm0, xmm0
test rsi, rsi
je .LBB0_14
# BB#1:
# Counts of three or fewer (unsigned) go straight to the scalar loop.
cmp rsi, 3
jbe .LBB0_2
# BB#5:
# r9 = count with the low two bits cleared = elements handled by packed code.
# Since rsi is above three here, r9 cannot be zero; the branch below is a
# guard that is never taken on this path.
mov r9, rsi
and r9, -4
je .LBB0_2
# BB#6:
# r8 = r9 - 4 is kept for the "fewer than four groups" test at .LBB0_10.
# rax = (number of four-element groups) mod 4, computed as
# ((r8 >> 2) + 1) & 3. Only the low bits survive the mask, so doing the
# shift and increment on the 32-bit half is sufficient.
lea r8, [r9 - 4]
mov eax, r8d
shr eax, 2
inc eax
and rax, 3
je .LBB0_7
# BB#8:
# Remainder loop setup: rax becomes a negative counter that is incremented
# up to zero, rcx is the element index, xmm0/xmm1 are the two packed
# accumulators (each holds two partial sums).
neg rax
xorpd xmm0, xmm0
xor ecx, ecx
xorpd xmm1, xmm1
.p2align 4, 0x90
.LBB0_9: # =>This Inner Loop Header: Depth=1
# One group per iteration: elements rcx..rcx+3, two into each accumulator.
movupd xmm2, xmmword ptr [rdi + 8*rcx]
movupd xmm3, xmmword ptr [rdi + 8*rcx + 16]
addpd xmm0, xmm2
addpd xmm1, xmm3
add rcx, 4
inc rax
jne .LBB0_9
jmp .LBB0_10
.LBB0_2:
# Scalar-only entry: start the scalar loop at element index zero.
xor r9d, r9d
.LBB0_3:
# Scalar loop setup: rax = address of element r9, rsi = elements remaining.
# Every path reaching here has rsi - r9 above zero, which the
# decrement-then-test loop below relies on.
lea rax, [rdi + 8*r9]
sub rsi, r9
.p2align 4, 0x90
.LBB0_4: # =>This Inner Loop Header: Depth=1
# Add one double to the low lane of xmm0 and advance by one element.
addsd xmm0, qword ptr [rax]
add rax, 8
dec rsi
jne .LBB0_4
.LBB0_14:
# Common exit: store the low lane of xmm0 through rdx, then undo the frame.
movsd qword ptr [rdx], xmm0
mov rsp, rbp
pop rbp
ret
.LBB0_7:
# Group count is a multiple of four: skip the remainder loop and start the
# unrolled loop from index zero with cleared accumulators.
xor ecx, ecx
xorpd xmm0, xmm0
xorpd xmm1, xmm1
.LBB0_10:
# r8 below 12 means r9 below 16, i.e. fewer than four groups in total, all
# of which the remainder loop has already consumed.
cmp r8, 12
jb .LBB0_13
# BB#11:
# Unrolled loop setup: rax = packed elements still to add (a multiple of
# sixteen), rcx = address of element rcx plus a 112-byte bias, so the eight
# loads below use displacements from -112 up to zero.
mov rax, r9
sub rax, rcx
lea rcx, [rdi + 8*rcx + 112]
.p2align 4, 0x90
.LBB0_12: # =>This Inner Loop Header: Depth=1
# Sixteen doubles (eight 16-byte vectors) per iteration. Vectors at
# displacements -112, -80, -48, -16 end up in xmm0; those at -96, -64, -32
# and zero end up in xmm1. xmm2-xmm7 are temporaries.
movupd xmm2, xmmword ptr [rcx - 112]
movupd xmm3, xmmword ptr [rcx - 96]
movupd xmm4, xmmword ptr [rcx - 80]
movupd xmm5, xmmword ptr [rcx - 64]
addpd xmm2, xmm0
addpd xmm3, xmm1
movupd xmm6, xmmword ptr [rcx - 48]
movupd xmm7, xmmword ptr [rcx - 32]
addpd xmm6, xmm4
addpd xmm6, xmm2
addpd xmm7, xmm5
addpd xmm7, xmm3
movupd xmm0, xmmword ptr [rcx - 16]
movupd xmm1, xmmword ptr [rcx]
addpd xmm0, xmm6
addpd xmm1, xmm7
# Advance the pointer by 128 bytes (written as subtracting -128) and count
# down the remaining elements by sixteen.
sub rcx, -128
add rax, -16
jne .LBB0_12
.LBB0_13:
# Fold the packed accumulators: merge xmm1 into xmm0, then add the two lanes
# of xmm0 together so the low lane holds the sum of the first r9 elements.
addpd xmm0, xmm1
haddpd xmm0, xmm0
# When the count was not a multiple of four, the leftover elements starting
# at index r9 are added by the scalar loop; otherwise store and return.
cmp r9, rsi
jne .LBB0_3
jmp .LBB0_14
.Lfunc_end0:
.size sum_float64_sse4, .Lfunc_end0-sum_float64_sse4
.ident "Apple LLVM version 9.0.0 (clang-900.0.39.2)"
.section ".note.GNU-stack","",@progbits