arrow/bitutil/bitmap_ops_avx2_amd64.s (334 lines of code) (raw):
//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
// _bitmap_aligned_and_avx2 stores left[i] & right[i] into out[i] for every
// byte index i in [0, length).
//
// Arguments (frame size 0, 32 bytes of arguments, no results):
//   left+0(FP)    -> DI  address of the first input bytes
//   right+8(FP)   -> SI  address of the second input bytes
//   out+16(FP)    -> DX  address of the destination bytes
//   length+24(FP) -> CX  number of bytes; tested as a signed value
//
// A length <= 0 returns without touching memory. The AVX loop (128 bytes per
// iteration) runs only when length > 127 and the out range overlaps neither
// input range; in every other case all bytes go through the scalar loops.
// Registers written: AX, BL, R8-R11, Y0-Y3 and the flags.
TEXT ·_bitmap_aligned_and_avx2(SB), $0-32
MOVQ left+0(FP), DI
MOVQ right+8(FP), SI
MOVQ out+16(FP), DX
MOVQ length+24(FP), CX
// Signed check: nothing to do when length <= 0.
WORD $0x8548; BYTE $0xc9 // test rcx, rcx
JLE LBB0_12
// 127 bytes or fewer: skip the vector path and start the scalar path at
// index R10 = 0.
LONG $0x7ff98348 // cmp rcx, 127
JA LBB0_7
WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
JMP LBB0_3
LBB0_7:
// Overlap checks between the out range and each input range.
//   R9   = out + length
//   R11B = (left + length  > out),  BL  = (out + length > left)
//   R8B  = (right + length > out),  R9B = (out + length > right)
// R10 is cleared before branching so the scalar path starts at index 0.
LONG $0x0a0c8d4c // lea r9, [rdx + rcx]
LONG $0x0f048d48 // lea rax, [rdi + rcx]
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd3970f41 // seta r11b
LONG $0x0e048d48 // lea rax, [rsi + rcx]
WORD $0x3949; BYTE $0xf9 // cmp r9, rdi
WORD $0x970f; BYTE $0xd3 // seta bl
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd0970f41 // seta r8b
WORD $0x3949; BYTE $0xf1 // cmp r9, rsi
LONG $0xd1970f41 // seta r9b
WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
// Both flags set => out overlaps left: take the scalar path.
WORD $0x8441; BYTE $0xdb // test r11b, bl
JNE LBB0_3
// Both flags set => out overlaps right: take the scalar path.
WORD $0x2045; BYTE $0xc8 // and r8b, r9b
JNE LBB0_3
// R10 = length rounded down to a multiple of 128 (bytes covered by the
// vector loop); R8 = running byte offset, starting at 0.
WORD $0x8949; BYTE $0xca // mov r10, rcx
LONG $0x80e28349 // and r10, -128
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB0_10:
// One iteration: load 4 x 32 bytes from right, AND them with the matching
// bytes of left, and store the 128 result bytes to out (unaligned accesses).
LONG $0x107ca1c4; WORD $0x0604 // vmovups ymm0, yword [rsi + r8]
LONG $0x107ca1c4; WORD $0x064c; BYTE $0x20 // vmovups ymm1, yword [rsi + r8 + 32]
LONG $0x107ca1c4; WORD $0x0654; BYTE $0x40 // vmovups ymm2, yword [rsi + r8 + 64]
LONG $0x107ca1c4; WORD $0x065c; BYTE $0x60 // vmovups ymm3, yword [rsi + r8 + 96]
LONG $0x547ca1c4; WORD $0x0704 // vandps ymm0, ymm0, yword [rdi + r8]
LONG $0x5474a1c4; WORD $0x074c; BYTE $0x20 // vandps ymm1, ymm1, yword [rdi + r8 + 32]
LONG $0x546ca1c4; WORD $0x0754; BYTE $0x40 // vandps ymm2, ymm2, yword [rdi + r8 + 64]
LONG $0x5464a1c4; WORD $0x075c; BYTE $0x60 // vandps ymm3, ymm3, yword [rdi + r8 + 96]
LONG $0x117ca1c4; WORD $0x0204 // vmovups yword [rdx + r8], ymm0
LONG $0x117ca1c4; WORD $0x024c; BYTE $0x20 // vmovups yword [rdx + r8 + 32], ymm1
LONG $0x117ca1c4; WORD $0x0254; BYTE $0x40 // vmovups yword [rdx + r8 + 64], ymm2
LONG $0x117ca1c4; WORD $0x025c; BYTE $0x60 // vmovups yword [rdx + r8 + 96], ymm3
// R8 += 128, written as a subtraction of -128 so the constant fits the
// sign-extended 8-bit immediate used by this encoding.
LONG $0x80e88349 // sub r8, -128
WORD $0x394d; BYTE $0xc2 // cmp r10, r8
JNE LBB0_10
// Finished if the vector loop covered the whole length; otherwise R10 is the
// index at which the scalar tail starts.
WORD $0x3949; BYTE $0xca // cmp r10, rcx
JE LBB0_12
LBB0_3:
// Scalar path, starting at index R10 (0 or a multiple of 128).
//   R8 = length - R10 - 1   (NOT followed by ADD)
//   R9 = length & 3, which equals (length - R10) & 3 because R10 % 4 == 0
WORD $0x894d; BYTE $0xd0 // mov r8, r10
WORD $0xf749; BYTE $0xd0 // not r8
WORD $0x0149; BYTE $0xc8 // add r8, rcx
WORD $0x8949; BYTE $0xc9 // mov r9, rcx
LONG $0x03e18349 // and r9, 3
JE LBB0_5
LBB0_4:
// Peel loop: handle R9 single bytes so the remaining count is a multiple of 4.
LONG $0x04b60f42; BYTE $0x16 // movzx eax, byte [rsi + r10]
LONG $0x17042242 // and al, byte [rdi + r10]
LONG $0x12048842 // mov byte [rdx + r10], al
LONG $0x01c28349 // add r10, 1
LONG $0xffc18349 // add r9, -1
JNE LBB0_4
LBB0_5:
// R8 < 3 means at most 3 bytes were left on entry to the scalar path, and the
// peel loop has already processed all of them.
LONG $0x03f88349 // cmp r8, 3
JB LBB0_12
LBB0_6:
// Unrolled loop: four bytes per iteration until R10 reaches length.
LONG $0x04b60f42; BYTE $0x16 // movzx eax, byte [rsi + r10]
LONG $0x17042242 // and al, byte [rdi + r10]
LONG $0x12048842 // mov byte [rdx + r10], al
LONG $0x44b60f42; WORD $0x0116 // movzx eax, byte [rsi + r10 + 1]
LONG $0x17442242; BYTE $0x01 // and al, byte [rdi + r10 + 1]
LONG $0x12448842; BYTE $0x01 // mov byte [rdx + r10 + 1], al
LONG $0x44b60f42; WORD $0x0216 // movzx eax, byte [rsi + r10 + 2]
LONG $0x17442242; BYTE $0x02 // and al, byte [rdi + r10 + 2]
LONG $0x12448842; BYTE $0x02 // mov byte [rdx + r10 + 2], al
LONG $0x44b60f42; WORD $0x0316 // movzx eax, byte [rsi + r10 + 3]
LONG $0x17442242; BYTE $0x03 // and al, byte [rdi + r10 + 3]
LONG $0x12448842; BYTE $0x03 // mov byte [rdx + r10 + 3], al
LONG $0x04c28349 // add r10, 4
WORD $0x394c; BYTE $0xd1 // cmp rcx, r10
JNE LBB0_6
LBB0_12:
// Common exit for every path, including the ones that never used YMM registers.
VZEROUPPER
RET
// _bitmap_aligned_or_avx2 stores left[i] | right[i] into out[i] for every
// byte index i in [0, length).
//
// Arguments (frame size 0, 32 bytes of arguments, no results):
//   left+0(FP)    -> DI  address of the first input bytes
//   right+8(FP)   -> SI  address of the second input bytes
//   out+16(FP)    -> DX  address of the destination bytes
//   length+24(FP) -> CX  number of bytes; tested as a signed value
//
// A length <= 0 returns without touching memory. The AVX loop (128 bytes per
// iteration) runs only when length > 127 and the out range overlaps neither
// input range; in every other case all bytes go through the scalar loops.
// Registers written: AX, BL, R8-R11, Y0-Y3 and the flags.
TEXT ·_bitmap_aligned_or_avx2(SB), $0-32
MOVQ left+0(FP), DI
MOVQ right+8(FP), SI
MOVQ out+16(FP), DX
MOVQ length+24(FP), CX
// Signed check: nothing to do when length <= 0.
WORD $0x8548; BYTE $0xc9 // test rcx, rcx
JLE LBB1_12
// 127 bytes or fewer: skip the vector path and start the scalar path at
// index R10 = 0.
LONG $0x7ff98348 // cmp rcx, 127
JA LBB1_7
WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
JMP LBB1_3
LBB1_7:
// Overlap checks between the out range and each input range.
//   R9   = out + length
//   R11B = (left + length  > out),  BL  = (out + length > left)
//   R8B  = (right + length > out),  R9B = (out + length > right)
// R10 is cleared before branching so the scalar path starts at index 0.
LONG $0x0a0c8d4c // lea r9, [rdx + rcx]
LONG $0x0f048d48 // lea rax, [rdi + rcx]
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd3970f41 // seta r11b
LONG $0x0e048d48 // lea rax, [rsi + rcx]
WORD $0x3949; BYTE $0xf9 // cmp r9, rdi
WORD $0x970f; BYTE $0xd3 // seta bl
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd0970f41 // seta r8b
WORD $0x3949; BYTE $0xf1 // cmp r9, rsi
LONG $0xd1970f41 // seta r9b
WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
// Both flags set => out overlaps left: take the scalar path.
WORD $0x8441; BYTE $0xdb // test r11b, bl
JNE LBB1_3
// Both flags set => out overlaps right: take the scalar path.
WORD $0x2045; BYTE $0xc8 // and r8b, r9b
JNE LBB1_3
// R10 = length rounded down to a multiple of 128 (bytes covered by the
// vector loop); R8 = running byte offset, starting at 0.
WORD $0x8949; BYTE $0xca // mov r10, rcx
LONG $0x80e28349 // and r10, -128
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB1_10:
// One iteration: load 4 x 32 bytes from right, OR them with the matching
// bytes of left, and store the 128 result bytes to out (unaligned accesses).
LONG $0x107ca1c4; WORD $0x0604 // vmovups ymm0, yword [rsi + r8]
LONG $0x107ca1c4; WORD $0x064c; BYTE $0x20 // vmovups ymm1, yword [rsi + r8 + 32]
LONG $0x107ca1c4; WORD $0x0654; BYTE $0x40 // vmovups ymm2, yword [rsi + r8 + 64]
LONG $0x107ca1c4; WORD $0x065c; BYTE $0x60 // vmovups ymm3, yword [rsi + r8 + 96]
LONG $0x567ca1c4; WORD $0x0704 // vorps ymm0, ymm0, yword [rdi + r8]
LONG $0x5674a1c4; WORD $0x074c; BYTE $0x20 // vorps ymm1, ymm1, yword [rdi + r8 + 32]
LONG $0x566ca1c4; WORD $0x0754; BYTE $0x40 // vorps ymm2, ymm2, yword [rdi + r8 + 64]
LONG $0x5664a1c4; WORD $0x075c; BYTE $0x60 // vorps ymm3, ymm3, yword [rdi + r8 + 96]
LONG $0x117ca1c4; WORD $0x0204 // vmovups yword [rdx + r8], ymm0
LONG $0x117ca1c4; WORD $0x024c; BYTE $0x20 // vmovups yword [rdx + r8 + 32], ymm1
LONG $0x117ca1c4; WORD $0x0254; BYTE $0x40 // vmovups yword [rdx + r8 + 64], ymm2
LONG $0x117ca1c4; WORD $0x025c; BYTE $0x60 // vmovups yword [rdx + r8 + 96], ymm3
// R8 += 128, written as a subtraction of -128 so the constant fits the
// sign-extended 8-bit immediate used by this encoding.
LONG $0x80e88349 // sub r8, -128
WORD $0x394d; BYTE $0xc2 // cmp r10, r8
JNE LBB1_10
// Finished if the vector loop covered the whole length; otherwise R10 is the
// index at which the scalar tail starts.
WORD $0x3949; BYTE $0xca // cmp r10, rcx
JE LBB1_12
LBB1_3:
// Scalar path, starting at index R10 (0 or a multiple of 128).
//   R8 = length - R10 - 1   (NOT followed by ADD)
//   R9 = length & 3, which equals (length - R10) & 3 because R10 % 4 == 0
WORD $0x894d; BYTE $0xd0 // mov r8, r10
WORD $0xf749; BYTE $0xd0 // not r8
WORD $0x0149; BYTE $0xc8 // add r8, rcx
WORD $0x8949; BYTE $0xc9 // mov r9, rcx
LONG $0x03e18349 // and r9, 3
JE LBB1_5
LBB1_4:
// Peel loop: handle R9 single bytes so the remaining count is a multiple of 4.
LONG $0x04b60f42; BYTE $0x16 // movzx eax, byte [rsi + r10]
LONG $0x17040a42 // or al, byte [rdi + r10]
LONG $0x12048842 // mov byte [rdx + r10], al
LONG $0x01c28349 // add r10, 1
LONG $0xffc18349 // add r9, -1
JNE LBB1_4
LBB1_5:
// R8 < 3 means at most 3 bytes were left on entry to the scalar path, and the
// peel loop has already processed all of them.
LONG $0x03f88349 // cmp r8, 3
JB LBB1_12
LBB1_6:
// Unrolled loop: four bytes per iteration until R10 reaches length.
LONG $0x04b60f42; BYTE $0x16 // movzx eax, byte [rsi + r10]
LONG $0x17040a42 // or al, byte [rdi + r10]
LONG $0x12048842 // mov byte [rdx + r10], al
LONG $0x44b60f42; WORD $0x0116 // movzx eax, byte [rsi + r10 + 1]
LONG $0x17440a42; BYTE $0x01 // or al, byte [rdi + r10 + 1]
LONG $0x12448842; BYTE $0x01 // mov byte [rdx + r10 + 1], al
LONG $0x44b60f42; WORD $0x0216 // movzx eax, byte [rsi + r10 + 2]
LONG $0x17440a42; BYTE $0x02 // or al, byte [rdi + r10 + 2]
LONG $0x12448842; BYTE $0x02 // mov byte [rdx + r10 + 2], al
LONG $0x44b60f42; WORD $0x0316 // movzx eax, byte [rsi + r10 + 3]
LONG $0x17440a42; BYTE $0x03 // or al, byte [rdi + r10 + 3]
LONG $0x12448842; BYTE $0x03 // mov byte [rdx + r10 + 3], al
LONG $0x04c28349 // add r10, 4
WORD $0x394c; BYTE $0xd1 // cmp rcx, r10
JNE LBB1_6
LBB1_12:
// Common exit for every path, including the ones that never used YMM registers.
VZEROUPPER
RET
// _bitmap_aligned_and_not_avx2 stores left[i] & ~right[i] into out[i] for
// every byte index i in [0, length).
//
// Arguments (frame size 0, 32 bytes of arguments, no results):
//   left+0(FP)    -> DI  address of the first input bytes
//   right+8(FP)   -> SI  address of the second input bytes (the complemented one)
//   out+16(FP)    -> DX  address of the destination bytes
//   length+24(FP) -> CX  number of bytes; tested as a signed value
//
// A length <= 0 returns without touching memory. The AVX loop (128 bytes per
// iteration) runs only when length > 127 and the out range overlaps neither
// input range; in every other case all bytes go through the scalar code.
// Unlike the and/or/xor routines in this file, the scalar index lives in R8,
// the vector offset in AX, and the scalar loop is unrolled by two.
// Registers written: AX, BL, R8-R11, Y0-Y3 and the flags.
TEXT ·_bitmap_aligned_and_not_avx2(SB), $0-32
MOVQ left+0(FP), DI
MOVQ right+8(FP), SI
MOVQ out+16(FP), DX
MOVQ length+24(FP), CX
// Signed check: nothing to do when length <= 0.
WORD $0x8548; BYTE $0xc9 // test rcx, rcx
JLE LBB2_12
// 127 bytes or fewer: skip the vector path and start the scalar path at
// index R8 = 0.
LONG $0x7ff98348 // cmp rcx, 127
JA LBB2_7
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
JMP LBB2_3
LBB2_7:
// Overlap checks between the out range and each input range.
//   R8   = out + length
//   R11B = (left + length  > out),  BL  = (out + length > left)
//   R10B = (right + length > out),  R9B = (out + length > right)
// R8 is cleared before branching so the scalar path starts at index 0.
LONG $0x0a048d4c // lea r8, [rdx + rcx]
LONG $0x0f048d48 // lea rax, [rdi + rcx]
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd3970f41 // seta r11b
LONG $0x0e048d48 // lea rax, [rsi + rcx]
WORD $0x3949; BYTE $0xf8 // cmp r8, rdi
WORD $0x970f; BYTE $0xd3 // seta bl
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd2970f41 // seta r10b
WORD $0x3949; BYTE $0xf0 // cmp r8, rsi
LONG $0xd1970f41 // seta r9b
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
// Both flags set => out overlaps left: take the scalar path.
WORD $0x8441; BYTE $0xdb // test r11b, bl
JNE LBB2_3
// Both flags set => out overlaps right: take the scalar path.
WORD $0x2045; BYTE $0xca // and r10b, r9b
JNE LBB2_3
// R8 = length rounded down to a multiple of 128 (bytes covered by the vector
// loop); AX = running byte offset, starting at 0.
WORD $0x8949; BYTE $0xc8 // mov r8, rcx
LONG $0x80e08349 // and r8, -128
WORD $0xc031 // xor eax, eax
LBB2_10:
// One iteration: load 4 x 32 bytes from right, then VANDNPS complements that
// register operand and ANDs it with the matching bytes of left, giving
// left & ~right; the 128 result bytes are stored to out.
LONG $0x0410fcc5; BYTE $0x06 // vmovups ymm0, yword [rsi + rax]
LONG $0x4c10fcc5; WORD $0x2006 // vmovups ymm1, yword [rsi + rax + 32]
LONG $0x5410fcc5; WORD $0x4006 // vmovups ymm2, yword [rsi + rax + 64]
LONG $0x5c10fcc5; WORD $0x6006 // vmovups ymm3, yword [rsi + rax + 96]
LONG $0x0455fcc5; BYTE $0x07 // vandnps ymm0, ymm0, yword [rdi + rax]
LONG $0x4c55f4c5; WORD $0x2007 // vandnps ymm1, ymm1, yword [rdi + rax + 32]
LONG $0x5455ecc5; WORD $0x4007 // vandnps ymm2, ymm2, yword [rdi + rax + 64]
LONG $0x5c55e4c5; WORD $0x6007 // vandnps ymm3, ymm3, yword [rdi + rax + 96]
LONG $0x0411fcc5; BYTE $0x02 // vmovups yword [rdx + rax], ymm0
LONG $0x4c11fcc5; WORD $0x2002 // vmovups yword [rdx + rax + 32], ymm1
LONG $0x5411fcc5; WORD $0x4002 // vmovups yword [rdx + rax + 64], ymm2
LONG $0x5c11fcc5; WORD $0x6002 // vmovups yword [rdx + rax + 96], ymm3
// AX += 128, written as a subtraction of -128 so the constant fits the
// sign-extended 8-bit immediate used by this encoding.
LONG $0x80e88348 // sub rax, -128
WORD $0x3949; BYTE $0xc0 // cmp r8, rax
JNE LBB2_10
// Finished if the vector loop covered the whole length; otherwise R8 is the
// index at which the scalar tail starts.
WORD $0x3949; BYTE $0xc8 // cmp r8, rcx
JE LBB2_12
LBB2_3:
// Scalar path, starting at index R8 (0 or a multiple of 128, hence even).
// R9 = ~R8 now; adding length at LBB2_5 turns it into length - start - 1.
WORD $0x894d; BYTE $0xc1 // mov r9, r8
WORD $0xf749; BYTE $0xd1 // not r9
// Odd length => odd number of bytes left: handle one byte first so the
// two-byte loop below sees an even count.
WORD $0xc1f6; BYTE $0x01 // test cl, 1
JE LBB2_5
LONG $0x06048a42 // mov al, byte [rsi + r8]
WORD $0xd0f6 // not al
LONG $0x07042242 // and al, byte [rdi + r8]
LONG $0x02048842 // mov byte [rdx + r8], al
// R8 is even here, so OR with 1 advances the index by exactly one.
LONG $0x01c88349 // or r8, 1
LBB2_5:
// R9 == 0 means exactly one byte was left on entry to the scalar path, and it
// has just been handled above.
WORD $0x0149; BYTE $0xc9 // add r9, rcx
JE LBB2_12
LBB2_6:
// Unrolled loop: two bytes per iteration until R8 reaches length.
LONG $0x04b60f42; BYTE $0x06 // movzx eax, byte [rsi + r8]
WORD $0xd0f6 // not al
LONG $0x07042242 // and al, byte [rdi + r8]
LONG $0x02048842 // mov byte [rdx + r8], al
LONG $0x44b60f42; WORD $0x0106 // movzx eax, byte [rsi + r8 + 1]
WORD $0xd0f6 // not al
LONG $0x07442242; BYTE $0x01 // and al, byte [rdi + r8 + 1]
LONG $0x02448842; BYTE $0x01 // mov byte [rdx + r8 + 1], al
LONG $0x02c08349 // add r8, 2
WORD $0x394c; BYTE $0xc1 // cmp rcx, r8
JNE LBB2_6
LBB2_12:
// Common exit for every path, including the ones that never used YMM registers.
VZEROUPPER
RET
// _bitmap_aligned_xor_avx2 stores left[i] ^ right[i] into out[i] for every
// byte index i in [0, length).
//
// Arguments (frame size 0, 32 bytes of arguments, no results):
//   left+0(FP)    -> DI  address of the first input bytes
//   right+8(FP)   -> SI  address of the second input bytes
//   out+16(FP)    -> DX  address of the destination bytes
//   length+24(FP) -> CX  number of bytes; tested as a signed value
//
// A length <= 0 returns without touching memory. The AVX loop (128 bytes per
// iteration) runs only when length > 127 and the out range overlaps neither
// input range; in every other case all bytes go through the scalar loops.
// Registers written: AX, BL, R8-R11, Y0-Y3 and the flags.
TEXT ·_bitmap_aligned_xor_avx2(SB), $0-32
MOVQ left+0(FP), DI
MOVQ right+8(FP), SI
MOVQ out+16(FP), DX
MOVQ length+24(FP), CX
// Signed check: nothing to do when length <= 0.
WORD $0x8548; BYTE $0xc9 // test rcx, rcx
JLE LBB3_12
// 127 bytes or fewer: skip the vector path and start the scalar path at
// index R10 = 0.
LONG $0x7ff98348 // cmp rcx, 127
JA LBB3_7
WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
JMP LBB3_3
LBB3_7:
// Overlap checks between the out range and each input range.
//   R9   = out + length
//   R11B = (left + length  > out),  BL  = (out + length > left)
//   R8B  = (right + length > out),  R9B = (out + length > right)
// R10 is cleared before branching so the scalar path starts at index 0.
LONG $0x0a0c8d4c // lea r9, [rdx + rcx]
LONG $0x0f048d48 // lea rax, [rdi + rcx]
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd3970f41 // seta r11b
LONG $0x0e048d48 // lea rax, [rsi + rcx]
WORD $0x3949; BYTE $0xf9 // cmp r9, rdi
WORD $0x970f; BYTE $0xd3 // seta bl
WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
LONG $0xd0970f41 // seta r8b
WORD $0x3949; BYTE $0xf1 // cmp r9, rsi
LONG $0xd1970f41 // seta r9b
WORD $0x3145; BYTE $0xd2 // xor r10d, r10d
// Both flags set => out overlaps left: take the scalar path.
WORD $0x8441; BYTE $0xdb // test r11b, bl
JNE LBB3_3
// Both flags set => out overlaps right: take the scalar path.
WORD $0x2045; BYTE $0xc8 // and r8b, r9b
JNE LBB3_3
// R10 = length rounded down to a multiple of 128 (bytes covered by the
// vector loop); R8 = running byte offset, starting at 0.
WORD $0x8949; BYTE $0xca // mov r10, rcx
LONG $0x80e28349 // and r10, -128
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB3_10:
// One iteration: load 4 x 32 bytes from right, XOR them with the matching
// bytes of left, and store the 128 result bytes to out (unaligned accesses).
LONG $0x107ca1c4; WORD $0x0604 // vmovups ymm0, yword [rsi + r8]
LONG $0x107ca1c4; WORD $0x064c; BYTE $0x20 // vmovups ymm1, yword [rsi + r8 + 32]
LONG $0x107ca1c4; WORD $0x0654; BYTE $0x40 // vmovups ymm2, yword [rsi + r8 + 64]
LONG $0x107ca1c4; WORD $0x065c; BYTE $0x60 // vmovups ymm3, yword [rsi + r8 + 96]
LONG $0x577ca1c4; WORD $0x0704 // vxorps ymm0, ymm0, yword [rdi + r8]
LONG $0x5774a1c4; WORD $0x074c; BYTE $0x20 // vxorps ymm1, ymm1, yword [rdi + r8 + 32]
LONG $0x576ca1c4; WORD $0x0754; BYTE $0x40 // vxorps ymm2, ymm2, yword [rdi + r8 + 64]
LONG $0x5764a1c4; WORD $0x075c; BYTE $0x60 // vxorps ymm3, ymm3, yword [rdi + r8 + 96]
LONG $0x117ca1c4; WORD $0x0204 // vmovups yword [rdx + r8], ymm0
LONG $0x117ca1c4; WORD $0x024c; BYTE $0x20 // vmovups yword [rdx + r8 + 32], ymm1
LONG $0x117ca1c4; WORD $0x0254; BYTE $0x40 // vmovups yword [rdx + r8 + 64], ymm2
LONG $0x117ca1c4; WORD $0x025c; BYTE $0x60 // vmovups yword [rdx + r8 + 96], ymm3
// R8 += 128, written as a subtraction of -128 so the constant fits the
// sign-extended 8-bit immediate used by this encoding.
LONG $0x80e88349 // sub r8, -128
WORD $0x394d; BYTE $0xc2 // cmp r10, r8
JNE LBB3_10
// Finished if the vector loop covered the whole length; otherwise R10 is the
// index at which the scalar tail starts.
WORD $0x3949; BYTE $0xca // cmp r10, rcx
JE LBB3_12
LBB3_3:
// Scalar path, starting at index R10 (0 or a multiple of 128).
//   R8 = length - R10 - 1   (NOT followed by ADD)
//   R9 = length & 3, which equals (length - R10) & 3 because R10 % 4 == 0
WORD $0x894d; BYTE $0xd0 // mov r8, r10
WORD $0xf749; BYTE $0xd0 // not r8
WORD $0x0149; BYTE $0xc8 // add r8, rcx
WORD $0x8949; BYTE $0xc9 // mov r9, rcx
LONG $0x03e18349 // and r9, 3
JE LBB3_5
LBB3_4:
// Peel loop: handle R9 single bytes so the remaining count is a multiple of 4.
LONG $0x04b60f42; BYTE $0x16 // movzx eax, byte [rsi + r10]
LONG $0x17043242 // xor al, byte [rdi + r10]
LONG $0x12048842 // mov byte [rdx + r10], al
LONG $0x01c28349 // add r10, 1
LONG $0xffc18349 // add r9, -1
JNE LBB3_4
LBB3_5:
// R8 < 3 means at most 3 bytes were left on entry to the scalar path, and the
// peel loop has already processed all of them.
LONG $0x03f88349 // cmp r8, 3
JB LBB3_12
LBB3_6:
// Unrolled loop: four bytes per iteration until R10 reaches length.
LONG $0x04b60f42; BYTE $0x16 // movzx eax, byte [rsi + r10]
LONG $0x17043242 // xor al, byte [rdi + r10]
LONG $0x12048842 // mov byte [rdx + r10], al
LONG $0x44b60f42; WORD $0x0116 // movzx eax, byte [rsi + r10 + 1]
LONG $0x17443242; BYTE $0x01 // xor al, byte [rdi + r10 + 1]
LONG $0x12448842; BYTE $0x01 // mov byte [rdx + r10 + 1], al
LONG $0x44b60f42; WORD $0x0216 // movzx eax, byte [rsi + r10 + 2]
LONG $0x17443242; BYTE $0x02 // xor al, byte [rdi + r10 + 2]
LONG $0x12448842; BYTE $0x02 // mov byte [rdx + r10 + 2], al
LONG $0x44b60f42; WORD $0x0316 // movzx eax, byte [rsi + r10 + 3]
LONG $0x17443242; BYTE $0x03 // xor al, byte [rdi + r10 + 3]
LONG $0x12448842; BYTE $0x03 // mov byte [rdx + r10 + 3], al
LONG $0x04c28349 // add r10, 4
WORD $0x394c; BYTE $0xd1 // cmp rcx, r10
JNE LBB3_6
LBB3_12:
// Common exit for every path, including the ones that never used YMM registers.
VZEROUPPER
RET