arrow/math/_lib/float64_sse4.s (101 lines of code) (raw):

#-----------------------------------------------------------------------
# sum_float64_sse4 -- sum an array of 64-bit floats using packed SSE adds.
#
# Syntax: GNU as, Intel operand order (.intel_syntax noprefix), ELF.
# Presumed C prototype (inferred from register usage, confirm against
# the C source named in .file):
#       void sum_float64_sse4(double *buf, size_t len, double *res);
#
# In:    rdi = address of the first double (loaded with movupd, so no
#              alignment beyond what addsd's 8-byte load needs is assumed)
#        rsi = number of elements
#        rdx = address that receives the 8-byte result
# Out:   [rdx] = sum of the rsi elements; 0.0 when rsi == 0
# Clobb: rax, rcx, rsi, r8, r9, xmm0-xmm7, flags
# Stack: a frame is set up and torn down, but no stack memory is touched
#        and nothing is called.
#
# Strategy
#   1. len < 4            -> scalar loop only.
#   2. otherwise          -> r9 = len & ~3 elements go through packed adds,
#                            kept as four interleaved partial sums
#                            (xmm0 = lanes 0,1; xmm1 = lanes 2,3 of every
#                            group of 4 elements):
#                              a) peel (groups mod 4) groups of 4 elements,
#                              b) then 16 elements per iteration,
#                              c) fold the four partial sums into xmm0.
#   3. the last len - r9 elements (0..3) are added with a scalar loop.
#
# Because of the interleaved partial sums the additions are not performed
# strictly left to right.
#
# Local labels keep the compiler-style .LBB0_ prefix.
#-----------------------------------------------------------------------

        .text
        .intel_syntax noprefix
        .file   "_lib/float64.c"
        .globl  sum_float64_sse4
        .p2align 4, 0x90
        .type   sum_float64_sse4,@function
sum_float64_sse4:
        push    rbp
        mov     rbp, rsp
        and     rsp, -8

        xorpd   xmm0, xmm0                      # partial sums 0,1 = 0.0
        test    rsi, rsi
        je      .LBB0_store                     # empty input: store 0.0

        cmp     rsi, 3
        jbe     .LBB0_scalar_only               # 1..3 elements: no packed work
        mov     r9, rsi
        and     r9, -4                          # r9 = len rounded down to a multiple of 4
        je      .LBB0_scalar_only               # cannot trigger here (len >= 4); kept as emitted

        # ---- packed section setup ------------------------------------
        # xmm0 is still zero from the entry; only xmm1 and the element
        # index need initialising.
        lea     r8, [r9 - 4]                    # r8 = index of the last 4-element group
        xorpd   xmm1, xmm1                      # partial sums 2,3 = 0.0
        xor     ecx, ecx                        # rcx = element index
        mov     eax, r8d
        shr     eax, 2
        inc     eax                             # low bits of (number of 4-element groups)
        and     rax, 3                          # groups mod 4 = groups to peel
        je      .LBB0_wide_check                # nothing to peel
        neg     rax                             # count upward to zero

        # ---- peel loop: 4 elements per iteration ----------------------
        .p2align 4, 0x90
.LBB0_peel_loop:
        movupd  xmm2, xmmword ptr [rdi + 8*rcx]
        movupd  xmm3, xmmword ptr [rdi + 8*rcx + 16]
        addpd   xmm0, xmm2
        addpd   xmm1, xmm3
        add     rcx, 4
        inc     rax
        jne     .LBB0_peel_loop

        # ---- wide loop: 16 elements per iteration ---------------------
.LBB0_wide_check:
        cmp     r8, 12
        jb      .LBB0_reduce                    # fewer than 4 groups in total: peel did it all
        mov     rax, r9
        sub     rax, rcx                        # rax = packed elements left (multiple of 16, > 0)
        lea     rcx, [rdi + 8*rcx + 112]        # rcx = address of the last 16-byte pair of the 128-byte window

        .p2align 4, 0x90
.LBB0_wide_loop:
        movupd  xmm2, xmmword ptr [rcx - 112]   # elements  0..1
        movupd  xmm3, xmmword ptr [rcx - 96]    # elements  2..3
        movupd  xmm4, xmmword ptr [rcx - 80]    # elements  4..5
        movupd  xmm5, xmmword ptr [rcx - 64]    # elements  6..7
        addpd   xmm2, xmm0
        addpd   xmm3, xmm1
        movupd  xmm6, xmmword ptr [rcx - 48]    # elements  8..9
        movupd  xmm7, xmmword ptr [rcx - 32]    # elements 10..11
        addpd   xmm6, xmm4
        addpd   xmm6, xmm2
        addpd   xmm7, xmm5
        addpd   xmm7, xmm3
        movupd  xmm0, xmmword ptr [rcx - 16]    # elements 12..13
        movupd  xmm1, xmmword ptr [rcx]         # elements 14..15
        addpd   xmm0, xmm6
        addpd   xmm1, xmm7
        sub     rcx, -128                       # advance 128 bytes (-128 fits in an 8-bit immediate)
        add     rax, -16
        jne     .LBB0_wide_loop

        # ---- fold four partial sums into one --------------------------
.LBB0_reduce:
        addpd   xmm0, xmm1
        haddpd  xmm0, xmm0                      # low lane = lane0 + lane1
        cmp     r9, rsi
        je      .LBB0_store                     # len was a multiple of 4: no tail

        # ---- scalar tail: elements r9 .. len-1 ------------------------
.LBB0_tail_setup:
        lea     rax, [rdi + 8*r9]               # rax = address of the first tail element
        sub     rsi, r9                         # rsi = tail element count (> 0 here)

        .p2align 4, 0x90
.LBB0_tail_loop:
        addsd   xmm0, qword ptr [rax]
        add     rax, 8
        dec     rsi
        jne     .LBB0_tail_loop

        # ---- write result and return ----------------------------------
.LBB0_store:
        movsd   qword ptr [rdx], xmm0
        mov     rsp, rbp
        pop     rbp
        ret

        # ---- out-of-line: input too short for packed adds -------------
.LBB0_scalar_only:
        xor     r9d, r9d                        # tail starts at element 0
        jmp     .LBB0_tail_setup
.Lfunc_end0:
        .size   sum_float64_sse4, .Lfunc_end0-sum_float64_sse4

        .ident  "Apple LLVM version 9.0.0 (clang-900.0.39.2)"
        .section        ".note.GNU-stack","",@progbits