arrow/math/_lib/float64_avx2.s (174 lines of code) (raw):
# -----------------------------------------------------------------------------
# sum_float64_avx2 -- sum an array of 64-bit floats with 256-bit vector adds.
#
# Syntax : GNU as, Intel operand order (.intel_syntax noprefix), x86-64.
# Origin : the .file/.ident directives indicate this is clang output for
#          _lib/float64.c.  NOTE(review): presumably meant to be regenerated
#          from the C source rather than hand-edited -- confirm.
# ABI    : the code reads rdi, rsi and rdx as inputs, which matches the first
#          three integer arguments of the System V AMD64 convention.
#          NOTE(review): presumably
#              void sum_float64_avx2(const double *buf, size_t len, double *res)
#          -- confirm against _lib/float64.c.
#
# In     : rdi = address of the first double
#          rsi = number of doubles (compared as an unsigned value)
#          rdx = address that receives the 64-bit result
# Out    : the sum is stored to [rdx]; for rsi == 0 the stored value is 0.0.
# Writes : rax, rcx, rsi, r8, r9, ymm0-ymm7, flags.  rbp/rsp are restored.
#
# Algorithm
#   len <  32 : plain scalar loop (vaddsd) over all elements.
#   len >= 32 : the first r9 = len & -32 elements are consumed in chunks of
#               32 doubles (256 bytes), one 4-lane vector per accumulator
#               ymm0..ymm7.  A one-chunk-per-iteration loop handles
#               (chunks mod 8) chunks, then an 8x unrolled loop handles the
#               rest.  The accumulators are reduced to one scalar and the
#               remaining len - r9 elements are added by the scalar loop.
#   Because the vector path keeps 32 independent partial sums that are only
#   combined at the end, the order of additions differs from a left-to-right
#   scalar sum, so the rounding of the result can differ as well.
# -----------------------------------------------------------------------------
.text
.intel_syntax noprefix
.file "_lib/float64.c"
.globl sum_float64_avx2
# Align the entry point to 16 bytes, padding with 0x90 (nop) bytes.
.p2align 4, 0x90
.type sum_float64_avx2,@function
sum_float64_avx2: # @sum_float64_avx2
# BB#0:
# Frame setup.  Nothing below addresses memory through rsp or rbp; the frame
# only exists so the epilogue can undo the rsp masking.
push rbp
mov rbp, rsp
and rsp, -8
# xmm0 = 0.0: the running scalar sum, and the result when len == 0.
vxorpd xmm0, xmm0, xmm0
test rsi, rsi
je .LBB0_14
# BB#1:
# Fewer than 32 elements: skip the vector path entirely (unsigned compare).
cmp rsi, 31
jbe .LBB0_2
# BB#5:
# r9 = len rounded down to a multiple of 32 = elements handled by vectors.
mov r9, rsi
and r9, -32
# rsi >= 32 here, so r9 is never zero and this branch is not taken.
je .LBB0_2
# BB#6:
# r8 = r9 - 32, i.e. 32 * (chunks - 1).
# rax = ((r8 >> 5) + 1) & 7 = chunks mod 8.  The shift/increment are done on
# the 32-bit halves; only the low three bits of the result are kept, and the
# write to eax zero-extends into rax.
lea r8, [r9 - 32]
mov eax, r8d
shr eax, 5
inc eax
and rax, 7
# chunks is a multiple of 8: nothing for the one-chunk loop to do.
je .LBB0_7
# BB#8:
# rax = -(chunks mod 8), counted up to zero.  rcx = element index.
# All eight accumulators start at zero.
neg rax
vxorpd ymm0, ymm0, ymm0
xor ecx, ecx
vxorpd ymm1, ymm1, ymm1
vxorpd ymm2, ymm2, ymm2
vxorpd ymm3, ymm3, ymm3
vxorpd ymm4, ymm4, ymm4
vxorpd ymm5, ymm5, ymm5
vxorpd ymm6, ymm6, ymm6
vxorpd ymm7, ymm7, ymm7
.p2align 4, 0x90
.LBB0_9: # =>This Inner Loop Header: Depth=1
# One chunk per iteration: ymmK += the four doubles at byte offset 32*K of
# the chunk starting at element rcx.
vaddpd ymm0, ymm0, ymmword ptr [rdi + 8*rcx]
vaddpd ymm1, ymm1, ymmword ptr [rdi + 8*rcx + 32]
vaddpd ymm2, ymm2, ymmword ptr [rdi + 8*rcx + 64]
vaddpd ymm3, ymm3, ymmword ptr [rdi + 8*rcx + 96]
vaddpd ymm4, ymm4, ymmword ptr [rdi + 8*rcx + 128]
vaddpd ymm5, ymm5, ymmword ptr [rdi + 8*rcx + 160]
vaddpd ymm6, ymm6, ymmword ptr [rdi + 8*rcx + 192]
vaddpd ymm7, ymm7, ymmword ptr [rdi + 8*rcx + 224]
add rcx, 32
inc rax
jne .LBB0_9
jmp .LBB0_10
.LBB0_2:
# Scalar-only entry: start the tail loop at element 0.
xor r9d, r9d
.LBB0_3:
# Scalar tail.  rax = address of element r9, rsi = elements still to add
# (non-zero on every path that reaches this label).
lea rax, [rdi + 8*r9]
sub rsi, r9
.p2align 4, 0x90
.LBB0_4: # =>This Inner Loop Header: Depth=1
vaddsd xmm0, xmm0, qword ptr [rax]
add rax, 8
dec rsi
jne .LBB0_4
.LBB0_14:
# Common exit: store the sum, tear down the frame, and clear the upper
# halves of the ymm registers before returning.
vmovsd qword ptr [rdx], xmm0
mov rsp, rbp
pop rbp
vzeroupper
ret
.LBB0_7:
# Entry used when the one-chunk loop is skipped: element index 0 and zeroed
# accumulators, same state the loop above would have started from.
xor ecx, ecx
vxorpd ymm0, ymm0, ymm0
vxorpd ymm1, ymm1, ymm1
vxorpd ymm2, ymm2, ymm2
vxorpd ymm3, ymm3, ymm3
vxorpd ymm4, ymm4, ymm4
vxorpd ymm5, ymm5, ymm5
vxorpd ymm6, ymm6, ymm6
vxorpd ymm7, ymm7, ymm7
.LBB0_10:
# r8 = 32 * (chunks - 1); r8 < 224 means fewer than 8 chunks in total, so the
# one-chunk loop already consumed every chunk.
cmp r8, 224
jb .LBB0_13
# BB#11:
# rax = r9 - rcx = elements left for the unrolled loop (a non-zero multiple
# of 256).  rcx becomes a byte pointer biased by +1792 so that the eight
# chunks of one iteration are reached with displacements -1792 .. +224.
mov rax, r9
sub rax, rcx
lea rcx, [rdi + 8*rcx + 1792]
.p2align 4, 0x90
.LBB0_12: # =>This Inner Loop Header: Depth=1
# Eight chunks (256 doubles, 2048 bytes) per iteration.  The instruction
# order alternates between descending and ascending register numbers from
# chunk to chunk, but ymmK always receives byte offset 32*K of its chunk.
vaddpd ymm7, ymm7, ymmword ptr [rcx - 1568]
vaddpd ymm6, ymm6, ymmword ptr [rcx - 1600]
vaddpd ymm5, ymm5, ymmword ptr [rcx - 1632]
vaddpd ymm4, ymm4, ymmword ptr [rcx - 1664]
vaddpd ymm3, ymm3, ymmword ptr [rcx - 1696]
vaddpd ymm2, ymm2, ymmword ptr [rcx - 1728]
vaddpd ymm1, ymm1, ymmword ptr [rcx - 1760]
vaddpd ymm0, ymm0, ymmword ptr [rcx - 1792]
vaddpd ymm0, ymm0, ymmword ptr [rcx - 1536]
vaddpd ymm1, ymm1, ymmword ptr [rcx - 1504]
vaddpd ymm2, ymm2, ymmword ptr [rcx - 1472]
vaddpd ymm3, ymm3, ymmword ptr [rcx - 1440]
vaddpd ymm4, ymm4, ymmword ptr [rcx - 1408]
vaddpd ymm5, ymm5, ymmword ptr [rcx - 1376]
vaddpd ymm6, ymm6, ymmword ptr [rcx - 1344]
vaddpd ymm7, ymm7, ymmword ptr [rcx - 1312]
vaddpd ymm7, ymm7, ymmword ptr [rcx - 1056]
vaddpd ymm6, ymm6, ymmword ptr [rcx - 1088]
vaddpd ymm5, ymm5, ymmword ptr [rcx - 1120]
vaddpd ymm4, ymm4, ymmword ptr [rcx - 1152]
vaddpd ymm3, ymm3, ymmword ptr [rcx - 1184]
vaddpd ymm2, ymm2, ymmword ptr [rcx - 1216]
vaddpd ymm1, ymm1, ymmword ptr [rcx - 1248]
vaddpd ymm0, ymm0, ymmword ptr [rcx - 1280]
vaddpd ymm0, ymm0, ymmword ptr [rcx - 1024]
vaddpd ymm1, ymm1, ymmword ptr [rcx - 992]
vaddpd ymm2, ymm2, ymmword ptr [rcx - 960]
vaddpd ymm3, ymm3, ymmword ptr [rcx - 928]
vaddpd ymm4, ymm4, ymmword ptr [rcx - 896]
vaddpd ymm5, ymm5, ymmword ptr [rcx - 864]
vaddpd ymm6, ymm6, ymmword ptr [rcx - 832]
vaddpd ymm7, ymm7, ymmword ptr [rcx - 800]
vaddpd ymm7, ymm7, ymmword ptr [rcx - 544]
vaddpd ymm6, ymm6, ymmword ptr [rcx - 576]
vaddpd ymm5, ymm5, ymmword ptr [rcx - 608]
vaddpd ymm4, ymm4, ymmword ptr [rcx - 640]
vaddpd ymm3, ymm3, ymmword ptr [rcx - 672]
vaddpd ymm2, ymm2, ymmword ptr [rcx - 704]
vaddpd ymm1, ymm1, ymmword ptr [rcx - 736]
vaddpd ymm0, ymm0, ymmword ptr [rcx - 768]
vaddpd ymm0, ymm0, ymmword ptr [rcx - 512]
vaddpd ymm1, ymm1, ymmword ptr [rcx - 480]
vaddpd ymm2, ymm2, ymmword ptr [rcx - 448]
vaddpd ymm3, ymm3, ymmword ptr [rcx - 416]
vaddpd ymm4, ymm4, ymmword ptr [rcx - 384]
vaddpd ymm5, ymm5, ymmword ptr [rcx - 352]
vaddpd ymm6, ymm6, ymmword ptr [rcx - 320]
vaddpd ymm7, ymm7, ymmword ptr [rcx - 288]
vaddpd ymm7, ymm7, ymmword ptr [rcx - 32]
vaddpd ymm6, ymm6, ymmword ptr [rcx - 64]
vaddpd ymm5, ymm5, ymmword ptr [rcx - 96]
vaddpd ymm4, ymm4, ymmword ptr [rcx - 128]
vaddpd ymm3, ymm3, ymmword ptr [rcx - 160]
vaddpd ymm2, ymm2, ymmword ptr [rcx - 192]
vaddpd ymm1, ymm1, ymmword ptr [rcx - 224]
vaddpd ymm0, ymm0, ymmword ptr [rcx - 256]
vaddpd ymm0, ymm0, ymmword ptr [rcx]
vaddpd ymm1, ymm1, ymmword ptr [rcx + 32]
vaddpd ymm2, ymm2, ymmword ptr [rcx + 64]
vaddpd ymm3, ymm3, ymmword ptr [rcx + 96]
vaddpd ymm4, ymm4, ymmword ptr [rcx + 128]
vaddpd ymm5, ymm5, ymmword ptr [rcx + 160]
vaddpd ymm6, ymm6, ymmword ptr [rcx + 192]
vaddpd ymm7, ymm7, ymmword ptr [rcx + 224]
# Advance by 2048 bytes / 256 elements; loop until no elements remain.
add rcx, 2048
add rax, -256
jne .LBB0_12
.LBB0_13:
# Reduce the eight accumulators pairwise into ymm0 ...
vaddpd ymm1, ymm1, ymm5
vaddpd ymm3, ymm3, ymm7
vaddpd ymm0, ymm0, ymm4
vaddpd ymm2, ymm2, ymm6
vaddpd ymm0, ymm0, ymm2
vaddpd ymm1, ymm1, ymm3
vaddpd ymm0, ymm0, ymm1
# ... then fold the upper 128 bits onto the lower and add the two remaining
# lanes, leaving the vector total in the low double of xmm0.
vextractf128 xmm1, ymm0, 1
vaddpd ymm0, ymm0, ymm1
vhaddpd ymm0, ymm0, ymm0
# If len was not a multiple of 32, add the leftover elements starting at
# index r9 with the scalar loop; otherwise go straight to the store.
cmp r9, rsi
jne .LBB0_3
jmp .LBB0_14
.Lfunc_end0:
.size sum_float64_avx2, .Lfunc_end0-sum_float64_avx2
.ident "Apple LLVM version 9.0.0 (clang-900.0.39.2)"
# Empty .note.GNU-stack section: the conventional marker that this object
# does not require an executable stack.
.section ".note.GNU-stack","",@progbits