arrow/memory/_lib/memory_sse4.s (94 lines of code) (raw):

.text .intel_syntax noprefix .file "_lib/memory.c" .globl memset_sse4 .p2align 4, 0x90 .type memset_sse4,@function memset_sse4: # @memset_sse4 # BB#0: push rbp mov rbp, rsp and rsp, -8 lea r11, [rdi + rsi] cmp r11, rdi jbe .LBB0_13 # BB#1: cmp rsi, 32 jb .LBB0_12 # BB#2: mov r8, rsi and r8, -32 mov r10, rsi and r10, -32 je .LBB0_12 # BB#3: movzx eax, dl movd xmm0, eax pxor xmm1, xmm1 pshufb xmm0, xmm1 lea r9, [r10 - 32] mov ecx, r9d shr ecx, 5 inc ecx and rcx, 7 je .LBB0_4 # BB#5: neg rcx xor eax, eax .p2align 4, 0x90 .LBB0_6: # =>This Inner Loop Header: Depth=1 movdqu xmmword ptr [rdi + rax], xmm0 movdqu xmmword ptr [rdi + rax + 16], xmm0 add rax, 32 inc rcx jne .LBB0_6 jmp .LBB0_7 .LBB0_4: xor eax, eax .LBB0_7: cmp r9, 224 jb .LBB0_10 # BB#8: mov rcx, r10 sub rcx, rax lea rax, [rdi + rax + 240] .p2align 4, 0x90 .LBB0_9: # =>This Inner Loop Header: Depth=1 movdqu xmmword ptr [rax - 240], xmm0 movdqu xmmword ptr [rax - 224], xmm0 movdqu xmmword ptr [rax - 208], xmm0 movdqu xmmword ptr [rax - 192], xmm0 movdqu xmmword ptr [rax - 176], xmm0 movdqu xmmword ptr [rax - 160], xmm0 movdqu xmmword ptr [rax - 144], xmm0 movdqu xmmword ptr [rax - 128], xmm0 movdqu xmmword ptr [rax - 112], xmm0 movdqu xmmword ptr [rax - 96], xmm0 movdqu xmmword ptr [rax - 80], xmm0 movdqu xmmword ptr [rax - 64], xmm0 movdqu xmmword ptr [rax - 48], xmm0 movdqu xmmword ptr [rax - 32], xmm0 movdqu xmmword ptr [rax - 16], xmm0 movdqu xmmword ptr [rax], xmm0 add rax, 256 add rcx, -256 jne .LBB0_9 .LBB0_10: cmp r10, rsi je .LBB0_13 # BB#11: add rdi, r8 .p2align 4, 0x90 .LBB0_12: # =>This Inner Loop Header: Depth=1 mov byte ptr [rdi], dl inc rdi cmp r11, rdi jne .LBB0_12 .LBB0_13: mov rsp, rbp pop rbp ret .Lfunc_end0: .size memset_sse4, .Lfunc_end0-memset_sse4 .ident "Apple LLVM version 9.0.0 (clang-900.0.39.2)" .section ".note.GNU-stack","",@progbits