arrow/memory/memory_sse4_amd64.s (74 lines of code) (raw):

//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

// _memset_sse4 stores the low byte of c into each of the len bytes that
// start at buf.
//
// Frame is $0-24: no locals, 24 bytes of arguments, no results.
//   buf+0(FP)  -> DI   destination pointer, later the tail write cursor
//   len+8(FP)  -> SI   number of bytes to fill
//   c+16(FP)   -> DX   fill value; only DL is used
//
// Working registers:
//   R11      end = buf + len
//   R8, R10  len & -32, the byte count covered by whole 32-byte chunks
//   R9       R10 - 32
//   X0       fill byte replicated into all 16 lanes (X1 is the zero mask)
//   AX       byte offset in the head loop, biased pointer in the bulk loop
//   CX       negative chunk counter in the head loop, bytes left in the bulk loop
//
// Strategy:
//   1. Return at once when buf+len <= buf (len is 0 or the sum wraps).
//   2. len < 32: fill byte by byte.
//   3. Otherwise the head loop writes (chunks mod 8) chunks of 32 bytes, the
//      bulk loop writes the remaining chunks 256 bytes per iteration, and the
//      byte loop finishes the last len mod 32 bytes.
//
// Most instructions are emitted as raw machine code through BYTE/WORD/LONG/
// QUAD; the Intel-syntax disassembly is kept next to each one.
//
// NOTE(review): space-separated terms on the build line above are OR-ed, so
// this file is left out only when both noasm and appengine are set. The AND
// form would be "!noasm,!appengine". Left untouched here because it has to
// agree with the Go-side declaration, which is not visible in this file.
// NOTE(review): PSHUFB is an SSSE3 instruction; presumably the caller gates
// this routine on a CPU feature check -- confirm.
TEXT ·_memset_sse4(SB), $0-24

	MOVQ buf+0(FP), DI                           // DI = buf
	MOVQ len+8(FP), SI                           // SI = len
	MOVQ c+16(FP), DX                            // DX = c

	LONG $0x371c8d4c                             // lea r11, [rdi+rsi]        ; end = buf + len
	WORD $0x3949; BYTE $0xfb                     // cmp r11, rdi
	JBE  done                                    // end <= buf (unsigned): nothing to write
	LONG $0x20fe8348                             // cmp rsi, 32
	JB   tail_loop                               // fewer than 32 bytes: bytewise only

	WORD $0x8949; BYTE $0xf0                     // mov r8, rsi
	LONG $0xe0e08349                             // and r8, -32               ; r8  = len rounded down to 32
	WORD $0x8949; BYTE $0xf2                     // mov r10, rsi
	LONG $0xe0e28349                             // and r10, -32              ; r10 = same value, sets ZF
	JE   tail_loop                               // not taken once len >= 32; kept as generated

	// Replicate the fill byte across X0: with an all-zero shuffle mask,
	// pshufb copies source byte 0 into every destination lane.
	WORD $0xb60f; BYTE $0xc2                     // movzx eax, dl
	LONG $0xc06e0f66                             // movd xmm0, eax
	LONG $0xc9ef0f66                             // pxor xmm1, xmm1
	LONG $0x00380f66; BYTE $0xc1                 // pshufb xmm0, xmm1

	// CX = ((r10 - 32) >> 5) + 1, masked to 3 bits = chunk count mod 8.
	LONG $0xe04a8d4d                             // lea r9, [r10-32]
	WORD $0x8944; BYTE $0xc9                     // mov ecx, r9d
	WORD $0xe9c1; BYTE $0x05                     // shr ecx, 5
	WORD $0xc1ff                                 // inc ecx
	LONG $0x07e18348                             // and rcx, 7
	JE   head_skip                               // chunk count is a multiple of 8: no head chunks
	WORD $0xf748; BYTE $0xd9                     // neg rcx                   ; count upward to zero
	WORD $0xc031                                 // xor eax, eax              ; offset = 0

head_loop:
	// One 32-byte chunk per pass, written as two unaligned 16-byte stores.
	LONG $0x047f0ff3; BYTE $0x07                 // movdqu [rdi+rax], xmm0
	LONG $0x447f0ff3; WORD $0x1007               // movdqu [rdi+rax+16], xmm0
	LONG $0x20c08348                             // add rax, 32
	WORD $0xff48; BYTE $0xc1                     // inc rcx
	JNE  head_loop
	JMP  bulk_check

head_skip:
	WORD $0xc031                                 // xor eax, eax              ; offset = 0

bulk_check:
	LONG $0xe0f98149; WORD $0x0000; BYTE $0x00   // cmp r9, 224
	JB   bulk_done                               // fewer than 8 chunks in total: head loop did them all
	WORD $0x894c; BYTE $0xd1                     // mov rcx, r10
	WORD $0x2948; BYTE $0xc1                     // sub rcx, rax              ; chunk bytes left, a multiple of 256
	QUAD $0x000000f007848d48                     // lea rax, [rdi+rax+240]    ; cursor biased by +240

bulk_loop:
	// Sixteen 16-byte stores at displacements -240..0 cover 256 bytes.
	QUAD $0xffffff10807f0ff3                     // movdqu [rax-240], xmm0
	QUAD $0xffffff20807f0ff3                     // movdqu [rax-224], xmm0
	QUAD $0xffffff30807f0ff3                     // movdqu [rax-208], xmm0
	QUAD $0xffffff40807f0ff3                     // movdqu [rax-192], xmm0
	QUAD $0xffffff50807f0ff3                     // movdqu [rax-176], xmm0
	QUAD $0xffffff60807f0ff3                     // movdqu [rax-160], xmm0
	QUAD $0xffffff70807f0ff3                     // movdqu [rax-144], xmm0
	LONG $0x407f0ff3; BYTE $0x80                 // movdqu [rax-128], xmm0
	LONG $0x407f0ff3; BYTE $0x90                 // movdqu [rax-112], xmm0
	LONG $0x407f0ff3; BYTE $0xa0                 // movdqu [rax-96], xmm0
	LONG $0x407f0ff3; BYTE $0xb0                 // movdqu [rax-80], xmm0
	LONG $0x407f0ff3; BYTE $0xc0                 // movdqu [rax-64], xmm0
	LONG $0x407f0ff3; BYTE $0xd0                 // movdqu [rax-48], xmm0
	LONG $0x407f0ff3; BYTE $0xe0                 // movdqu [rax-32], xmm0
	LONG $0x407f0ff3; BYTE $0xf0                 // movdqu [rax-16], xmm0
	LONG $0x007f0ff3                             // movdqu [rax], xmm0
	LONG $0x01000548; WORD $0x0000               // add rax, 256
	LONG $0x00c18148; WORD $0xffff; BYTE $0xff   // add rcx, -256
	JNE  bulk_loop

bulk_done:
	WORD $0x3949; BYTE $0xf2                     // cmp r10, rsi
	JE   done                                    // len is a multiple of 32: no leftover bytes
	WORD $0x014c; BYTE $0xc7                     // add rdi, r8               ; cursor = first byte not yet written

tail_loop:
	// Bytewise fill until the cursor reaches end; always runs at least once.
	WORD $0x1788                                 // mov [rdi], dl
	WORD $0xff48; BYTE $0xc7                     // inc rdi
	WORD $0x3949; BYTE $0xfb                     // cmp r11, rdi
	JNE  tail_loop

done:
	RET