arrow/math/_lib/uint64_sse4.s (106 lines of code) (raw):

# -----------------------------------------------------------------------
# sum_uint64_sse4 -- wrapping sum of an array of 64-bit integers.
#
# Compiler output (see .file / .ident), GNU as Intel syntax, ELF directives.
#
# In:    rdi = address of the first 64-bit element
#        rsi = number of elements (0 is handled)
#        rdx = address that receives the 64-bit result
# Out:   qword [rdx] = sum of the rsi elements, modulo 2^64
# Clobb: rax, rcx, rsi, r8, r9, xmm0-xmm7, flags (which ones depends on path)
# Stack: rbp is pushed/popped; no stack memory is otherwise touched.
# Align: every vector load is movdqu, so no input alignment is required.
#
# NOTE(review): the register usage matches the System V AMD64 argument order
# for something like
#     void sum_uint64_sse4(const uint64_t *buf, size_t len, uint64_t *res)
# -- confirm against _lib/uint64.c, which is not visible here.
#
# Structure:
#   rsi == 0 ...... store 0.
#   rsi <  4 ...... scalar loop over everything.
#   otherwise ..... r9 = rsi rounded down to a multiple of 4 is summed in
#                   xmm0/xmm1: first a peel loop (4 elements per pass,
#                   (r9/4) mod 4 passes), then an unrolled loop (16 elements
#                   per pass); the two accumulators are reduced into rax and
#                   the remaining rsi - r9 elements go through the scalar loop.
# -----------------------------------------------------------------------
        .text
        .intel_syntax noprefix
        .file "_lib/uint64.c"
        .globl sum_uint64_sse4
        .p2align 4, 0x90                        # 16-byte align entry, pad with 0x90 (nop)
        .type sum_uint64_sse4,@function
sum_uint64_sse4:                                # @sum_uint64_sse4
# BB#0: prologue and empty-input check
        push    rbp
        mov     rbp, rsp
        and     rsp, -8                         # round rsp down to a multiple of 8; undone by "mov rsp, rbp" below
        test    rsi, rsi
        je      .LBB0_1                         # no elements -> result is 0
# BB#2: fewer than 4 elements cannot fill one vector pass
        cmp     rsi, 3
        jbe     .LBB0_3
# BB#6: r9 = number of elements handled by the vector code
        mov     r9, rsi
        and     r9, -4                          # r9 = rsi with the low two bits cleared
        je      .LBB0_3
# BB#7: compute the peel-loop trip count
        lea     r8, [r9 - 4]                    # r8 = r9 - 4 (also tested at .LBB0_11)
        mov     eax, r8d
        shr     eax, 2
        inc     eax                             # low bits of eax = r9 / 4 (count of 4-element groups)
        and     rax, 3                          # rax = (r9 / 4) mod 4 = peel passes
        je      .LBB0_8                         # nothing to peel
# BB#9: set up the peel loop
        neg     rax                             # count upward to zero with inc
        pxor    xmm0, xmm0                      # accumulator A = {0, 0}
        xor     ecx, ecx                        # rcx = element index
        pxor    xmm1, xmm1                      # accumulator B = {0, 0}
        .p2align 4, 0x90
.LBB0_10:                                       # =>This Inner Loop Header: Depth=1
        # Peel loop: 4 elements (32 bytes) per pass.
        movdqu  xmm2, xmmword ptr [rdi + 8*rcx]
        movdqu  xmm3, xmmword ptr [rdi + 8*rcx + 16]
        paddq   xmm0, xmm2                      # A += elements rcx, rcx+1
        paddq   xmm1, xmm3                      # B += elements rcx+2, rcx+3
        add     rcx, 4
        inc     rax
        jne     .LBB0_10
        jmp     .LBB0_11
.LBB0_3:
        # Scalar-only entry: no element has been summed yet.
        xor     r9d, r9d                        # r9 = 0 elements already consumed
        xor     eax, eax                        # running sum = 0
.LBB0_4:
        # Scalar tail: rax = sum so far, r9 = elements already consumed.
        lea     rcx, [rdi + 8*r9]               # rcx = address of the first unconsumed element
        sub     rsi, r9                         # rsi = elements left; non-zero on both entry paths
        .p2align 4, 0x90
.LBB0_5:                                        # =>This Inner Loop Header: Depth=1
        add     rax, qword ptr [rcx]
        add     rcx, 8
        dec     rsi
        jne     .LBB0_5
        jmp     .LBB0_15
.LBB0_1:
        xor     eax, eax                        # empty input: sum = 0
.LBB0_15:
        # Common exit: publish the sum and undo the prologue.
        mov     qword ptr [rdx], rax
        mov     rsp, rbp
        pop     rbp
        ret
.LBB0_8:
        # Peel loop skipped: start the vector phase at index 0 with zeroed accumulators.
        xor     ecx, ecx
        pxor    xmm0, xmm0
        pxor    xmm1, xmm1
.LBB0_11:
        # r8 = r9 - 4; below 12 means r9 < 16, i.e. the peel loop covered all r9 elements.
        cmp     r8, 12
        jb      .LBB0_14
# BB#12: set up the unrolled loop
        mov     rax, r9
        sub     rax, rcx                        # rax = vector elements still to sum (non-zero multiple of 16)
        lea     rcx, [rdi + 8*rcx + 112]        # pointer biased by +112 so the 8 loads use offsets -112..0
        .p2align 4, 0x90
.LBB0_13:                                       # =>This Inner Loop Header: Depth=1
        # Unrolled loop: 16 elements (128 bytes) per pass.
        # xmm0 collects the 1st/3rd/5th/7th 16-byte chunk, xmm1 the 2nd/4th/6th/8th.
        movdqu  xmm2, xmmword ptr [rcx - 112]
        movdqu  xmm3, xmmword ptr [rcx - 96]
        movdqu  xmm4, xmmword ptr [rcx - 80]
        movdqu  xmm5, xmmword ptr [rcx - 64]
        paddq   xmm2, xmm0                      # xmm2 = chunk1 + A
        paddq   xmm3, xmm1                      # xmm3 = chunk2 + B
        movdqu  xmm6, xmmword ptr [rcx - 48]
        movdqu  xmm7, xmmword ptr [rcx - 32]
        paddq   xmm6, xmm4                      # xmm6 = chunk5 + chunk3
        paddq   xmm6, xmm2                      #      + chunk1 + A
        paddq   xmm7, xmm5                      # xmm7 = chunk6 + chunk4
        paddq   xmm7, xmm3                      #      + chunk2 + B
        movdqu  xmm0, xmmword ptr [rcx - 16]
        movdqu  xmm1, xmmword ptr [rcx]
        paddq   xmm0, xmm6                      # new A = chunk7 + xmm6
        paddq   xmm1, xmm7                      # new B = chunk8 + xmm7
        sub     rcx, -128                       # rcx += 128; -128 fits a signed-byte immediate, +128 does not
        add     rax, -16                        # 16 fewer elements remain
        jne     .LBB0_13
.LBB0_14:
        # Horizontal reduction: fold A and B, then add the two 64-bit lanes.
        paddq   xmm0, xmm1
        pshufd  xmm1, xmm0, 78                  # xmm1 = xmm0[2,3,0,1]  (swap the two 64-bit halves)
        paddq   xmm1, xmm0                      # low lane = lane0 + lane1
        movq    rax, xmm1                       # rax = sum of the first r9 elements
        cmp     r9, rsi
        jne     .LBB0_4                         # rsi - r9 elements left -> scalar tail
        jmp     .LBB0_15
.Lfunc_end0:
        .size sum_uint64_sse4, .Lfunc_end0-sum_uint64_sse4
        .ident "Apple LLVM version 9.0.0 (clang-900.0.39.2)"
        .section ".note.GNU-stack","",@progbits