arrow/memory/_lib/memory_avx2.s (95 lines of code) (raw):

#-----------------------------------------------------------------------
# memset_avx2 -- store one byte value into every byte of a buffer.
#
# Compiled from C (see the .file directive below). Presumably declared as
#     void memset_avx2(void *buf, size_t len, int c)
# TODO confirm against the C source, which is not visible from this file.
#
# Syntax: GAS, .intel_syntax noprefix
# In:     rdi = address of the first byte to write
#         rsi = byte count (all comparisons on it are unsigned)
#         edx = fill value; only its low byte (dl) reaches memory
#         (this is the System V AMD64 integer-argument order)
# Out:    nothing; rax is scratch and carries no result
# Clobb:  rax, rcx, rdi, r8, r9, r10, r11, ymm0, flags
# Stack:  rbp is saved and restored; no stack slots are read or written
#
# Phases:
#   a) Return at once when rdi + rsi <= rdi (unsigned), i.e. the count
#      is zero or the end address wraps around.
#   b) When the count is at least 128, fill floor(count / 128) blocks of
#      128 bytes with unaligned 32-byte stores: first (blocks mod 4)
#      single blocks, then four blocks (512 bytes) per iteration.
#   c) Write whatever is left (count mod 128, or the whole buffer when
#      the count is below 128) one byte at a time.
#-----------------------------------------------------------------------
        .text
        .intel_syntax noprefix
        .file   "_lib/memory.c"
        .globl  memset_avx2
        .p2align 4, 0x90
        .type   memset_avx2,@function
memset_avx2:
        # Frame setup; rsp is rounded down to a multiple of 8 and is
        # restored from rbp on exit.
        push    rbp
        mov     rbp, rsp
        and     rsp, -8

        # r11 = one-past-the-end address. Leave if end <= start.
        lea     r11, [rdi + rsi]
        cmp     r11, rdi
        jbe     .LBB0_done

        # Short buffers are handled entirely by the byte loop.
        cmp     rsi, 128
        jb      .LBB0_tail_loop

        # r10 = count rounded down to a multiple of 128 = bytes that the
        # vector stores will cover. It cannot be zero here because
        # rsi >= 128, so the branch below is only a safety net.
        mov     r10, rsi
        and     r10, -128
        je      .LBB0_tail_loop

        # r8 = same rounded count; added to rdi later to find the tail.
        mov     r8, r10

        # ymm0 = fill byte replicated into all 32 byte lanes.
        vmovd   xmm0, edx
        vpbroadcastb ymm0, xmm0

        # rcx = byte offset already written by the vector path (zeroed
        #       first because xor would destroy the flags tested below).
        # r9  = r10 - 128, hence (r9 >> 7) + 1 is the block count.
        # eax = block count mod 4 = number of single blocks to peel so
        #       that the remainder is a whole number of 512-byte groups.
        #       Only bits 7..8 of r9 influence this, so working on the
        #       low 32 bits is sufficient.
        xor     ecx, ecx
        lea     r9, [r10 - 128]
        mov     eax, r9d
        shr     eax, 7
        inc     eax
        and     eax, 3
        je      .LBB0_bulk_check

        # Negate the peel count so the loop can count up towards zero.
        neg     rax
        .p2align 4, 0x90
.LBB0_peel_loop:
        # One 128-byte block per iteration.
        vmovdqu ymmword ptr [rdi + rcx], ymm0
        vmovdqu ymmword ptr [rdi + rcx + 32], ymm0
        vmovdqu ymmword ptr [rdi + rcx + 64], ymm0
        vmovdqu ymmword ptr [rdi + rcx + 96], ymm0
        # rcx += 128
        sub     rcx, -128
        inc     rax
        jne     .LBB0_peel_loop

.LBB0_bulk_check:
        # r9 < 384 means fewer than four blocks in total, so the peel
        # loop above has already written all of them.
        cmp     r9, 384
        jb      .LBB0_tail_check

        # rax = vector bytes still to write; a non-zero multiple of 512.
        # rcx = write cursor biased by +480 so that the sixteen stores
        #       use displacements from -480 up to 0.
        mov     rax, r10
        sub     rax, rcx
        lea     rcx, [rdi + rcx + 480]
        .p2align 4, 0x90
.LBB0_bulk_loop:
        # Four 128-byte blocks (512 bytes) per iteration.
        vmovdqu ymmword ptr [rcx - 480], ymm0
        vmovdqu ymmword ptr [rcx - 448], ymm0
        vmovdqu ymmword ptr [rcx - 416], ymm0
        vmovdqu ymmword ptr [rcx - 384], ymm0
        vmovdqu ymmword ptr [rcx - 352], ymm0
        vmovdqu ymmword ptr [rcx - 320], ymm0
        vmovdqu ymmword ptr [rcx - 288], ymm0
        vmovdqu ymmword ptr [rcx - 256], ymm0
        vmovdqu ymmword ptr [rcx - 224], ymm0
        vmovdqu ymmword ptr [rcx - 192], ymm0
        vmovdqu ymmword ptr [rcx - 160], ymm0
        vmovdqu ymmword ptr [rcx - 128], ymm0
        vmovdqu ymmword ptr [rcx - 96], ymm0
        vmovdqu ymmword ptr [rcx - 64], ymm0
        vmovdqu ymmword ptr [rcx - 32], ymm0
        vmovdqu ymmword ptr [rcx], ymm0
        add     rcx, 512
        sub     rax, 512
        jne     .LBB0_bulk_loop

.LBB0_tail_check:
        # No tail when the count was an exact multiple of 128.
        cmp     r10, rsi
        je      .LBB0_done

        # rdi = first byte not covered by the vector stores.
        add     rdi, r8
        .p2align 4, 0x90
.LBB0_tail_loop:
        # Byte-at-a-time fill until rdi reaches the end address in r11.
        mov     byte ptr [rdi], dl
        inc     rdi
        cmp     r11, rdi
        jne     .LBB0_tail_loop

.LBB0_done:
        # Restore the caller's frame and clear the upper ymm halves
        # before returning.
        mov     rsp, rbp
        pop     rbp
        vzeroupper
        ret
.Lfunc_end0:
        .size   memset_avx2, .Lfunc_end0-memset_avx2

        .ident  "Apple LLVM version 9.0.0 (clang-900.0.39.2)"
        .section        ".note.GNU-stack","",@progbits