arrow/bitutil/bitmap_ops_sse4_amd64.s (450 lines of code) (raw):
//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
// _bitmap_aligned_and_sse4 stores left[i] & right[i] into out[i] for every
// byte index i in [0, length).
//
// Arguments, loaded from the 32-byte argument area:
//   left+0(FP)    -> DI  base address of the first input
//   right+8(FP)   -> SI  base address of the second input
//   out+16(FP)    -> DX  base address of the destination
//   length+24(FP) -> CX  number of bytes; a signed value <= 0 returns at once
// Nothing is written back to the argument area.
//
// Registers written: AX, BL (only when length >= 32), R8-R11, X0-X2, flags.
// Every vector load/store is MOVUPS, so the code itself does not require
// 16-byte aligned pointers.
// The body is emitted as raw opcode bytes (WORD/LONG/BYTE); each trailing
// comment is the Intel-syntax disassembly of that line.
TEXT ·_bitmap_aligned_and_sse4(SB), $0-32
MOVQ left+0(FP), DI
MOVQ right+8(FP), SI
MOVQ out+16(FP), DX
MOVQ length+24(FP), CX
// Signed check: length <= 0 means there is nothing to do.
WORD $0x8548; BYTE $0xc9 // test rcx, rcx
JLE LBB0_16
// length >= 32 goes to the overlap test / vector path; shorter inputs fall
// through to the byte loops with index r11 = 0.
LONG $0x1ff98348 // cmp rcx, 31
JA LBB0_7
WORD $0x3145; BYTE $0xdb // xor r11d, r11d
// Scalar path. r11 = index of the first unprocessed byte (0, or length
// rounded down to a multiple of 32 when arriving from LBB0_15).
//   r8 = length - r11 - 1
//   r9 = length & 3  -> bytes handled singly so the remainder is a multiple of 4
LBB0_3:
WORD $0x894d; BYTE $0xd8 // mov r8, r11
WORD $0xf749; BYTE $0xd0 // not r8
WORD $0x0149; BYTE $0xc8 // add r8, rcx
WORD $0x8949; BYTE $0xc9 // mov r9, rcx
LONG $0x03e18349 // and r9, 3
JE LBB0_5
// One byte per iteration, r9 iterations.
LBB0_4:
LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11]
LONG $0x1f042242 // and al, byte [rdi + r11]
LONG $0x1a048842 // mov byte [rdx + r11], al
LONG $0x01c38349 // add r11, 1
LONG $0xffc18349 // add r9, -1
JNE LBB0_4
// r8 < 3 means at most 3 bytes were pending at LBB0_3; LBB0_4 has already
// processed all of them.
LBB0_5:
LONG $0x03f88349 // cmp r8, 3
JB LBB0_16
// Four bytes per iteration until the index reaches length.
LBB0_6:
LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11]
LONG $0x1f042242 // and al, byte [rdi + r11]
LONG $0x1a048842 // mov byte [rdx + r11], al
LONG $0x44b60f42; WORD $0x011e // movzx eax, byte [rsi + r11 + 1]
LONG $0x1f442242; BYTE $0x01 // and al, byte [rdi + r11 + 1]
LONG $0x1a448842; BYTE $0x01 // mov byte [rdx + r11 + 1], al
LONG $0x44b60f42; WORD $0x021e // movzx eax, byte [rsi + r11 + 2]
LONG $0x1f442242; BYTE $0x02 // and al, byte [rdi + r11 + 2]
LONG $0x1a448842; BYTE $0x02 // mov byte [rdx + r11 + 2], al
LONG $0x44b60f42; WORD $0x031e // movzx eax, byte [rsi + r11 + 3]
LONG $0x1f442242; BYTE $0x03 // and al, byte [rdi + r11 + 3]
LONG $0x1a448842; BYTE $0x03 // mov byte [rdx + r11 + 3], al
LONG $0x04c38349 // add r11, 4
WORD $0x394c; BYTE $0xd9 // cmp rcx, r11
JNE LBB0_6
JMP LBB0_16
// length >= 32: test whether the destination range overlaps either input.
//   r10b = left+length  > out      bl  = out+length > left
//   r8b  = right+length > out      r9b = out+length > right
// If out overlaps left (r10b & bl) or right (r8b & r9b), fall back to the
// byte loops starting at index 0 (r11 is zeroed before the branches).
LBB0_7:
LONG $0x0a0c8d4c // lea r9, [rdx + rcx]
LONG $0x0f048d48 // lea rax, [rdi + rcx]
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd2970f41 // seta r10b
LONG $0x0e048d48 // lea rax, [rsi + rcx]
WORD $0x3949; BYTE $0xf9 // cmp r9, rdi
WORD $0x970f; BYTE $0xd3 // seta bl
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd0970f41 // seta r8b
WORD $0x3949; BYTE $0xf1 // cmp r9, rsi
LONG $0xd1970f41 // seta r9b
WORD $0x3145; BYTE $0xdb // xor r11d, r11d
WORD $0x8441; BYTE $0xda // test r10b, bl
JNE LBB0_3
WORD $0x2045; BYTE $0xc8 // and r8b, r9b
JNE LBB0_3
// Vector path set-up.
//   r11 = length & -32         bytes covered by 32-byte chunks
//   rax = r11 - 32
//   r9  = rax/32 + 1           number of 32-byte chunks
// rax == 0 (exactly one chunk) skips the paired loop via LBB0_10.
WORD $0x8949; BYTE $0xcb // mov r11, rcx
LONG $0xe0e38349 // and r11, -32
LONG $0xe0438d49 // lea rax, [r11 - 32]
WORD $0x8949; BYTE $0xc1 // mov r9, rax
LONG $0x05e9c149 // shr r9, 5
LONG $0x01c18349 // add r9, 1
WORD $0x8548; BYTE $0xc0 // test rax, rax
JE LBB0_10
// r10 = -(r9 & -2): negated count of chunks consumed in pairs, stepped by 2
// towards zero. r8 = running byte offset.
WORD $0x894d; BYTE $0xca // mov r10, r9
LONG $0xfee28349 // and r10, -2
WORD $0xf749; BYTE $0xda // neg r10
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
// Two 32-byte chunks (64 bytes) per iteration.
LBB0_12:
LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8]
LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16]
LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8]
WORD $0x540f; BYTE $0xd0 // andps xmm2, xmm0
LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16]
WORD $0x540f; BYTE $0xc1 // andps xmm0, xmm1
LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2
LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0
LONG $0x44100f42; WORD $0x2007 // movups xmm0, oword [rdi + r8 + 32]
LONG $0x4c100f42; WORD $0x3007 // movups xmm1, oword [rdi + r8 + 48]
LONG $0x54100f42; WORD $0x2006 // movups xmm2, oword [rsi + r8 + 32]
WORD $0x540f; BYTE $0xd0 // andps xmm2, xmm0
LONG $0x44100f42; WORD $0x3006 // movups xmm0, oword [rsi + r8 + 48]
WORD $0x540f; BYTE $0xc1 // andps xmm0, xmm1
LONG $0x54110f42; WORD $0x2002 // movups oword [rdx + r8 + 32], xmm2
LONG $0x44110f42; WORD $0x3002 // movups oword [rdx + r8 + 48], xmm0
LONG $0x40c08349 // add r8, 64
LONG $0x02c28349 // add r10, 2
JNE LBB0_12
// An odd chunk count leaves one 32-byte chunk for LBB0_14.
LONG $0x01c1f641 // test r9b, 1
JE LBB0_15
// One 32-byte chunk at offset r8.
LBB0_14:
LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8]
LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16]
LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8]
WORD $0x540f; BYTE $0xd0 // andps xmm2, xmm0
LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16]
WORD $0x540f; BYTE $0xc1 // andps xmm0, xmm1
LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2
LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0
// If length is not a multiple of 32, finish the last (length & 31) bytes in
// the scalar path, which resumes at index r11.
LBB0_15:
WORD $0x3949; BYTE $0xcb // cmp r11, rcx
JNE LBB0_3
LBB0_16:
RET
// Exactly one 32-byte chunk: start at offset 0. r9 is 1 here, so the JNE
// below is the branch that is taken.
LBB0_10:
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LONG $0x01c1f641 // test r9b, 1
JNE LBB0_14
JMP LBB0_15
// _bitmap_aligned_or_sse4 stores left[i] | right[i] into out[i] for every
// byte index i in [0, length).
//
// Arguments, loaded from the 32-byte argument area:
//   left+0(FP)    -> DI  base address of the first input
//   right+8(FP)   -> SI  base address of the second input
//   out+16(FP)    -> DX  base address of the destination
//   length+24(FP) -> CX  number of bytes; a signed value <= 0 returns at once
// Nothing is written back to the argument area.
//
// Registers written: AX, BL (only when length >= 32), R8-R11, X0-X2, flags.
// Every vector load/store is MOVUPS, so the code itself does not require
// 16-byte aligned pointers.
// The body is emitted as raw opcode bytes (WORD/LONG/BYTE); each trailing
// comment is the Intel-syntax disassembly of that line.
TEXT ·_bitmap_aligned_or_sse4(SB), $0-32
MOVQ left+0(FP), DI
MOVQ right+8(FP), SI
MOVQ out+16(FP), DX
MOVQ length+24(FP), CX
// Signed check: length <= 0 means there is nothing to do.
WORD $0x8548; BYTE $0xc9 // test rcx, rcx
JLE LBB1_16
// length >= 32 goes to the overlap test / vector path; shorter inputs fall
// through to the byte loops with index r11 = 0.
LONG $0x1ff98348 // cmp rcx, 31
JA LBB1_7
WORD $0x3145; BYTE $0xdb // xor r11d, r11d
// Scalar path. r11 = index of the first unprocessed byte (0, or length
// rounded down to a multiple of 32 when arriving from LBB1_15).
//   r8 = length - r11 - 1
//   r9 = length & 3  -> bytes handled singly so the remainder is a multiple of 4
LBB1_3:
WORD $0x894d; BYTE $0xd8 // mov r8, r11
WORD $0xf749; BYTE $0xd0 // not r8
WORD $0x0149; BYTE $0xc8 // add r8, rcx
WORD $0x8949; BYTE $0xc9 // mov r9, rcx
LONG $0x03e18349 // and r9, 3
JE LBB1_5
// One byte per iteration, r9 iterations.
LBB1_4:
LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11]
LONG $0x1f040a42 // or al, byte [rdi + r11]
LONG $0x1a048842 // mov byte [rdx + r11], al
LONG $0x01c38349 // add r11, 1
LONG $0xffc18349 // add r9, -1
JNE LBB1_4
// r8 < 3 means at most 3 bytes were pending at LBB1_3; LBB1_4 has already
// processed all of them.
LBB1_5:
LONG $0x03f88349 // cmp r8, 3
JB LBB1_16
// Four bytes per iteration until the index reaches length.
LBB1_6:
LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11]
LONG $0x1f040a42 // or al, byte [rdi + r11]
LONG $0x1a048842 // mov byte [rdx + r11], al
LONG $0x44b60f42; WORD $0x011e // movzx eax, byte [rsi + r11 + 1]
LONG $0x1f440a42; BYTE $0x01 // or al, byte [rdi + r11 + 1]
LONG $0x1a448842; BYTE $0x01 // mov byte [rdx + r11 + 1], al
LONG $0x44b60f42; WORD $0x021e // movzx eax, byte [rsi + r11 + 2]
LONG $0x1f440a42; BYTE $0x02 // or al, byte [rdi + r11 + 2]
LONG $0x1a448842; BYTE $0x02 // mov byte [rdx + r11 + 2], al
LONG $0x44b60f42; WORD $0x031e // movzx eax, byte [rsi + r11 + 3]
LONG $0x1f440a42; BYTE $0x03 // or al, byte [rdi + r11 + 3]
LONG $0x1a448842; BYTE $0x03 // mov byte [rdx + r11 + 3], al
LONG $0x04c38349 // add r11, 4
WORD $0x394c; BYTE $0xd9 // cmp rcx, r11
JNE LBB1_6
JMP LBB1_16
// length >= 32: test whether the destination range overlaps either input.
//   r10b = left+length  > out      bl  = out+length > left
//   r8b  = right+length > out      r9b = out+length > right
// If out overlaps left (r10b & bl) or right (r8b & r9b), fall back to the
// byte loops starting at index 0 (r11 is zeroed before the branches).
LBB1_7:
LONG $0x0a0c8d4c // lea r9, [rdx + rcx]
LONG $0x0f048d48 // lea rax, [rdi + rcx]
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd2970f41 // seta r10b
LONG $0x0e048d48 // lea rax, [rsi + rcx]
WORD $0x3949; BYTE $0xf9 // cmp r9, rdi
WORD $0x970f; BYTE $0xd3 // seta bl
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd0970f41 // seta r8b
WORD $0x3949; BYTE $0xf1 // cmp r9, rsi
LONG $0xd1970f41 // seta r9b
WORD $0x3145; BYTE $0xdb // xor r11d, r11d
WORD $0x8441; BYTE $0xda // test r10b, bl
JNE LBB1_3
WORD $0x2045; BYTE $0xc8 // and r8b, r9b
JNE LBB1_3
// Vector path set-up.
//   r11 = length & -32         bytes covered by 32-byte chunks
//   rax = r11 - 32
//   r9  = rax/32 + 1           number of 32-byte chunks
// rax == 0 (exactly one chunk) skips the paired loop via LBB1_10.
WORD $0x8949; BYTE $0xcb // mov r11, rcx
LONG $0xe0e38349 // and r11, -32
LONG $0xe0438d49 // lea rax, [r11 - 32]
WORD $0x8949; BYTE $0xc1 // mov r9, rax
LONG $0x05e9c149 // shr r9, 5
LONG $0x01c18349 // add r9, 1
WORD $0x8548; BYTE $0xc0 // test rax, rax
JE LBB1_10
// r10 = -(r9 & -2): negated count of chunks consumed in pairs, stepped by 2
// towards zero. r8 = running byte offset.
WORD $0x894d; BYTE $0xca // mov r10, r9
LONG $0xfee28349 // and r10, -2
WORD $0xf749; BYTE $0xda // neg r10
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
// Two 32-byte chunks (64 bytes) per iteration.
LBB1_12:
LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8]
LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16]
LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8]
WORD $0x560f; BYTE $0xd0 // orps xmm2, xmm0
LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16]
WORD $0x560f; BYTE $0xc1 // orps xmm0, xmm1
LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2
LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0
LONG $0x44100f42; WORD $0x2007 // movups xmm0, oword [rdi + r8 + 32]
LONG $0x4c100f42; WORD $0x3007 // movups xmm1, oword [rdi + r8 + 48]
LONG $0x54100f42; WORD $0x2006 // movups xmm2, oword [rsi + r8 + 32]
WORD $0x560f; BYTE $0xd0 // orps xmm2, xmm0
LONG $0x44100f42; WORD $0x3006 // movups xmm0, oword [rsi + r8 + 48]
WORD $0x560f; BYTE $0xc1 // orps xmm0, xmm1
LONG $0x54110f42; WORD $0x2002 // movups oword [rdx + r8 + 32], xmm2
LONG $0x44110f42; WORD $0x3002 // movups oword [rdx + r8 + 48], xmm0
LONG $0x40c08349 // add r8, 64
LONG $0x02c28349 // add r10, 2
JNE LBB1_12
// An odd chunk count leaves one 32-byte chunk for LBB1_14.
LONG $0x01c1f641 // test r9b, 1
JE LBB1_15
// One 32-byte chunk at offset r8.
LBB1_14:
LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8]
LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16]
LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8]
WORD $0x560f; BYTE $0xd0 // orps xmm2, xmm0
LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16]
WORD $0x560f; BYTE $0xc1 // orps xmm0, xmm1
LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2
LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0
// If length is not a multiple of 32, finish the last (length & 31) bytes in
// the scalar path, which resumes at index r11.
LBB1_15:
WORD $0x3949; BYTE $0xcb // cmp r11, rcx
JNE LBB1_3
LBB1_16:
RET
// Exactly one 32-byte chunk: start at offset 0. r9 is 1 here, so the JNE
// below is the branch that is taken.
LBB1_10:
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LONG $0x01c1f641 // test r9b, 1
JNE LBB1_14
JMP LBB1_15
// _bitmap_aligned_and_not_sse4 stores left[i] & ^right[i] into out[i] for
// every byte index i in [0, length). Only the right operand is complemented.
//
// Arguments, loaded from the 32-byte argument area:
//   left+0(FP)    -> DI  base address of the first input
//   right+8(FP)   -> SI  base address of the input that is complemented
//   out+16(FP)    -> DX  base address of the destination
//   length+24(FP) -> CX  number of bytes; a signed value <= 0 returns at once
// Nothing is written back to the argument area.
//
// Registers written: AX, BL (only when length >= 32), R8-R11, X0-X2, flags.
// Every vector load/store is MOVUPS, so the code itself does not require
// 16-byte aligned pointers.
// The body is emitted as raw opcode bytes (WORD/LONG/BYTE); each trailing
// comment is the Intel-syntax disassembly of that line.
TEXT ·_bitmap_aligned_and_not_sse4(SB), $0-32
MOVQ left+0(FP), DI
MOVQ right+8(FP), SI
MOVQ out+16(FP), DX
MOVQ length+24(FP), CX
// Signed check: length <= 0 means there is nothing to do.
WORD $0x8548; BYTE $0xc9 // test rcx, rcx
JLE LBB2_16
// length >= 32 goes to the overlap test / vector path; shorter inputs fall
// through to the byte loops with index r11 = 0.
LONG $0x1ff98348 // cmp rcx, 31
JA LBB2_7
WORD $0x3145; BYTE $0xdb // xor r11d, r11d
// Scalar path. r11 = index of the first unprocessed byte (0, or length
// rounded down to a multiple of 32 when arriving from LBB2_15), so r11 is
// even here. r8 = ~r11 is completed to length - r11 - 1 at LBB2_5.
// When length is odd, one byte is processed first so that the remaining
// count is even.
LBB2_3:
WORD $0x894d; BYTE $0xd8 // mov r8, r11
WORD $0xf749; BYTE $0xd0 // not r8
WORD $0xc1f6; BYTE $0x01 // test cl, 1
JE LBB2_5
LONG $0x1e048a42 // mov al, byte [rsi + r11]
WORD $0xd0f6 // not al
LONG $0x1f042242 // and al, byte [rdi + r11]
LONG $0x1a048842 // mov byte [rdx + r11], al
// r11 is even, so OR with 1 advances the index by one byte.
LONG $0x01cb8349 // or r11, 1
// r8 = length - (index on entry to LBB2_3) - 1. Zero means the single byte
// handled above was the only one pending.
LBB2_5:
WORD $0x0149; BYTE $0xc8 // add r8, rcx
JE LBB2_16
// Two bytes per iteration until the index reaches length.
LBB2_6:
LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11]
WORD $0xd0f6 // not al
LONG $0x1f042242 // and al, byte [rdi + r11]
LONG $0x1a048842 // mov byte [rdx + r11], al
LONG $0x44b60f42; WORD $0x011e // movzx eax, byte [rsi + r11 + 1]
WORD $0xd0f6 // not al
LONG $0x1f442242; BYTE $0x01 // and al, byte [rdi + r11 + 1]
LONG $0x1a448842; BYTE $0x01 // mov byte [rdx + r11 + 1], al
LONG $0x02c38349 // add r11, 2
WORD $0x394c; BYTE $0xd9 // cmp rcx, r11
JNE LBB2_6
JMP LBB2_16
// length >= 32: test whether the destination range overlaps either input.
//   r10b = left+length  > out      bl  = out+length > left
//   r8b  = right+length > out      r9b = out+length > right
// If out overlaps left (r10b & bl) or right (r8b & r9b), fall back to the
// byte loops starting at index 0 (r11 is zeroed before the branches).
LBB2_7:
LONG $0x0a0c8d4c // lea r9, [rdx + rcx]
LONG $0x0f048d48 // lea rax, [rdi + rcx]
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd2970f41 // seta r10b
LONG $0x0e048d48 // lea rax, [rsi + rcx]
WORD $0x3949; BYTE $0xf9 // cmp r9, rdi
WORD $0x970f; BYTE $0xd3 // seta bl
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd0970f41 // seta r8b
WORD $0x3949; BYTE $0xf1 // cmp r9, rsi
LONG $0xd1970f41 // seta r9b
WORD $0x3145; BYTE $0xdb // xor r11d, r11d
WORD $0x8441; BYTE $0xda // test r10b, bl
JNE LBB2_3
WORD $0x2045; BYTE $0xc8 // and r8b, r9b
JNE LBB2_3
// Vector path set-up.
//   r11 = length & -32         bytes covered by 32-byte chunks
//   rax = r11 - 32
//   r9  = rax/32 + 1           number of 32-byte chunks
// rax == 0 (exactly one chunk) skips the paired loop via LBB2_10.
WORD $0x8949; BYTE $0xcb // mov r11, rcx
LONG $0xe0e38349 // and r11, -32
LONG $0xe0438d49 // lea rax, [r11 - 32]
WORD $0x8949; BYTE $0xc1 // mov r9, rax
LONG $0x05e9c149 // shr r9, 5
LONG $0x01c18349 // add r9, 1
WORD $0x8548; BYTE $0xc0 // test rax, rax
JE LBB2_10
// r10 = -(r9 & -2): negated count of chunks consumed in pairs, stepped by 2
// towards zero. r8 = running byte offset.
WORD $0x894d; BYTE $0xca // mov r10, r9
LONG $0xfee28349 // and r10, -2
WORD $0xf749; BYTE $0xda // neg r10
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
// Two 32-byte chunks (64 bytes) per iteration. ANDNPS complements its
// destination operand, which holds the bytes loaded from right (rsi), and
// ANDs it with the source operand loaded from left (rdi).
LBB2_12:
LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8]
LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16]
LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8]
WORD $0x550f; BYTE $0xd0 // andnps xmm2, xmm0
LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16]
WORD $0x550f; BYTE $0xc1 // andnps xmm0, xmm1
LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2
LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0
LONG $0x44100f42; WORD $0x2007 // movups xmm0, oword [rdi + r8 + 32]
LONG $0x4c100f42; WORD $0x3007 // movups xmm1, oword [rdi + r8 + 48]
LONG $0x54100f42; WORD $0x2006 // movups xmm2, oword [rsi + r8 + 32]
WORD $0x550f; BYTE $0xd0 // andnps xmm2, xmm0
LONG $0x44100f42; WORD $0x3006 // movups xmm0, oword [rsi + r8 + 48]
WORD $0x550f; BYTE $0xc1 // andnps xmm0, xmm1
LONG $0x54110f42; WORD $0x2002 // movups oword [rdx + r8 + 32], xmm2
LONG $0x44110f42; WORD $0x3002 // movups oword [rdx + r8 + 48], xmm0
LONG $0x40c08349 // add r8, 64
LONG $0x02c28349 // add r10, 2
JNE LBB2_12
// An odd chunk count leaves one 32-byte chunk for LBB2_14.
LONG $0x01c1f641 // test r9b, 1
JE LBB2_15
// One 32-byte chunk at offset r8.
LBB2_14:
LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8]
LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16]
LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8]
WORD $0x550f; BYTE $0xd0 // andnps xmm2, xmm0
LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16]
WORD $0x550f; BYTE $0xc1 // andnps xmm0, xmm1
LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2
LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0
// If length is not a multiple of 32, finish the last (length & 31) bytes in
// the scalar path, which resumes at index r11.
LBB2_15:
WORD $0x3949; BYTE $0xcb // cmp r11, rcx
JNE LBB2_3
LBB2_16:
RET
// Exactly one 32-byte chunk: start at offset 0. r9 is 1 here, so the JNE
// below is the branch that is taken.
LBB2_10:
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LONG $0x01c1f641 // test r9b, 1
JNE LBB2_14
JMP LBB2_15
// _bitmap_aligned_xor_sse4 stores left[i] ^ right[i] into out[i] for every
// byte index i in [0, length).
//
// Arguments, loaded from the 32-byte argument area:
//   left+0(FP)    -> DI  base address of the first input
//   right+8(FP)   -> SI  base address of the second input
//   out+16(FP)    -> DX  base address of the destination
//   length+24(FP) -> CX  number of bytes; a signed value <= 0 returns at once
// Nothing is written back to the argument area.
//
// Registers written: AX, BL (only when length >= 32), R8-R11, X0-X2, flags.
// Every vector load/store is MOVUPS, so the code itself does not require
// 16-byte aligned pointers.
// The body is emitted as raw opcode bytes (WORD/LONG/BYTE); each trailing
// comment is the Intel-syntax disassembly of that line.
TEXT ·_bitmap_aligned_xor_sse4(SB), $0-32
MOVQ left+0(FP), DI
MOVQ right+8(FP), SI
MOVQ out+16(FP), DX
MOVQ length+24(FP), CX
// Signed check: length <= 0 means there is nothing to do.
WORD $0x8548; BYTE $0xc9 // test rcx, rcx
JLE LBB3_16
// length >= 32 goes to the overlap test / vector path; shorter inputs fall
// through to the byte loops with index r11 = 0.
LONG $0x1ff98348 // cmp rcx, 31
JA LBB3_7
WORD $0x3145; BYTE $0xdb // xor r11d, r11d
// Scalar path. r11 = index of the first unprocessed byte (0, or length
// rounded down to a multiple of 32 when arriving from LBB3_15).
//   r8 = length - r11 - 1
//   r9 = length & 3  -> bytes handled singly so the remainder is a multiple of 4
LBB3_3:
WORD $0x894d; BYTE $0xd8 // mov r8, r11
WORD $0xf749; BYTE $0xd0 // not r8
WORD $0x0149; BYTE $0xc8 // add r8, rcx
WORD $0x8949; BYTE $0xc9 // mov r9, rcx
LONG $0x03e18349 // and r9, 3
JE LBB3_5
// One byte per iteration, r9 iterations.
LBB3_4:
LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11]
LONG $0x1f043242 // xor al, byte [rdi + r11]
LONG $0x1a048842 // mov byte [rdx + r11], al
LONG $0x01c38349 // add r11, 1
LONG $0xffc18349 // add r9, -1
JNE LBB3_4
// r8 < 3 means at most 3 bytes were pending at LBB3_3; LBB3_4 has already
// processed all of them.
LBB3_5:
LONG $0x03f88349 // cmp r8, 3
JB LBB3_16
// Four bytes per iteration until the index reaches length.
LBB3_6:
LONG $0x04b60f42; BYTE $0x1e // movzx eax, byte [rsi + r11]
LONG $0x1f043242 // xor al, byte [rdi + r11]
LONG $0x1a048842 // mov byte [rdx + r11], al
LONG $0x44b60f42; WORD $0x011e // movzx eax, byte [rsi + r11 + 1]
LONG $0x1f443242; BYTE $0x01 // xor al, byte [rdi + r11 + 1]
LONG $0x1a448842; BYTE $0x01 // mov byte [rdx + r11 + 1], al
LONG $0x44b60f42; WORD $0x021e // movzx eax, byte [rsi + r11 + 2]
LONG $0x1f443242; BYTE $0x02 // xor al, byte [rdi + r11 + 2]
LONG $0x1a448842; BYTE $0x02 // mov byte [rdx + r11 + 2], al
LONG $0x44b60f42; WORD $0x031e // movzx eax, byte [rsi + r11 + 3]
LONG $0x1f443242; BYTE $0x03 // xor al, byte [rdi + r11 + 3]
LONG $0x1a448842; BYTE $0x03 // mov byte [rdx + r11 + 3], al
LONG $0x04c38349 // add r11, 4
WORD $0x394c; BYTE $0xd9 // cmp rcx, r11
JNE LBB3_6
JMP LBB3_16
// length >= 32: test whether the destination range overlaps either input.
//   r10b = left+length  > out      bl  = out+length > left
//   r8b  = right+length > out      r9b = out+length > right
// If out overlaps left (r10b & bl) or right (r8b & r9b), fall back to the
// byte loops starting at index 0 (r11 is zeroed before the branches).
LBB3_7:
LONG $0x0a0c8d4c // lea r9, [rdx + rcx]
LONG $0x0f048d48 // lea rax, [rdi + rcx]
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd2970f41 // seta r10b
LONG $0x0e048d48 // lea rax, [rsi + rcx]
WORD $0x3949; BYTE $0xf9 // cmp r9, rdi
WORD $0x970f; BYTE $0xd3 // seta bl
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd0970f41 // seta r8b
WORD $0x3949; BYTE $0xf1 // cmp r9, rsi
LONG $0xd1970f41 // seta r9b
WORD $0x3145; BYTE $0xdb // xor r11d, r11d
WORD $0x8441; BYTE $0xda // test r10b, bl
JNE LBB3_3
WORD $0x2045; BYTE $0xc8 // and r8b, r9b
JNE LBB3_3
// Vector path set-up.
//   r11 = length & -32         bytes covered by 32-byte chunks
//   rax = r11 - 32
//   r9  = rax/32 + 1           number of 32-byte chunks
// rax == 0 (exactly one chunk) skips the paired loop via LBB3_10.
WORD $0x8949; BYTE $0xcb // mov r11, rcx
LONG $0xe0e38349 // and r11, -32
LONG $0xe0438d49 // lea rax, [r11 - 32]
WORD $0x8949; BYTE $0xc1 // mov r9, rax
LONG $0x05e9c149 // shr r9, 5
LONG $0x01c18349 // add r9, 1
WORD $0x8548; BYTE $0xc0 // test rax, rax
JE LBB3_10
// r10 = -(r9 & -2): negated count of chunks consumed in pairs, stepped by 2
// towards zero. r8 = running byte offset.
WORD $0x894d; BYTE $0xca // mov r10, r9
LONG $0xfee28349 // and r10, -2
WORD $0xf749; BYTE $0xda // neg r10
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
// Two 32-byte chunks (64 bytes) per iteration.
LBB3_12:
LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8]
LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16]
LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8]
WORD $0x570f; BYTE $0xd0 // xorps xmm2, xmm0
LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16]
WORD $0x570f; BYTE $0xc1 // xorps xmm0, xmm1
LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2
LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0
LONG $0x44100f42; WORD $0x2007 // movups xmm0, oword [rdi + r8 + 32]
LONG $0x4c100f42; WORD $0x3007 // movups xmm1, oword [rdi + r8 + 48]
LONG $0x54100f42; WORD $0x2006 // movups xmm2, oword [rsi + r8 + 32]
WORD $0x570f; BYTE $0xd0 // xorps xmm2, xmm0
LONG $0x44100f42; WORD $0x3006 // movups xmm0, oword [rsi + r8 + 48]
WORD $0x570f; BYTE $0xc1 // xorps xmm0, xmm1
LONG $0x54110f42; WORD $0x2002 // movups oword [rdx + r8 + 32], xmm2
LONG $0x44110f42; WORD $0x3002 // movups oword [rdx + r8 + 48], xmm0
LONG $0x40c08349 // add r8, 64
LONG $0x02c28349 // add r10, 2
JNE LBB3_12
// An odd chunk count leaves one 32-byte chunk for LBB3_14.
LONG $0x01c1f641 // test r9b, 1
JE LBB3_15
// One 32-byte chunk at offset r8.
LBB3_14:
LONG $0x04100f42; BYTE $0x07 // movups xmm0, oword [rdi + r8]
LONG $0x4c100f42; WORD $0x1007 // movups xmm1, oword [rdi + r8 + 16]
LONG $0x14100f42; BYTE $0x06 // movups xmm2, oword [rsi + r8]
WORD $0x570f; BYTE $0xd0 // xorps xmm2, xmm0
LONG $0x44100f42; WORD $0x1006 // movups xmm0, oword [rsi + r8 + 16]
WORD $0x570f; BYTE $0xc1 // xorps xmm0, xmm1
LONG $0x14110f42; BYTE $0x02 // movups oword [rdx + r8], xmm2
LONG $0x44110f42; WORD $0x1002 // movups oword [rdx + r8 + 16], xmm0
// If length is not a multiple of 32, finish the last (length & 31) bytes in
// the scalar path, which resumes at index r11.
LBB3_15:
WORD $0x3949; BYTE $0xcb // cmp r11, rcx
JNE LBB3_3
LBB3_16:
RET
// Exactly one 32-byte chunk: start at offset 0. r9 is 1 here, so the JNE
// below is the branch that is taken.
LBB3_10:
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LONG $0x01c1f641 // test r9b, 1
JNE LBB3_14
JMP LBB3_15