internal/utils/transpose_ints_avx2_amd64.s (2,626 lines of code) (raw):
//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
// _transpose_uint8_uint8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 1-byte elements, zero-extended; table entries sit 4 bytes
// apart; dest receives 1-byte elements. Only the low 32 bits of length are
// examined (as a signed value), so a non-positive count stores nothing.
// No vector instructions are used here despite the avx2 suffix.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_uint8_uint8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_u8_u8 // fewer than four elements: go straight to the scalar tail

quad_u8_u8:
	MOVL DX, AX // keep the count in AX; DX doubles as scratch below
	MOVBLZX (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVBLZX 1(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVBLZX 2(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVBLZX 3(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI
	ADDQ $4, DI
	CMPL AX, $7
	JGT quad_u8_u8 // count was >= 8, so at least four more remain

tail_u8_u8:
	TESTL DX, DX
	JLE done_u8_u8
	ADDL $1, DX // bias by one: the loop below exits when DX drops to 1
	XORL R8, R8

single_u8_u8:
	MOVBLZX (DI)(R8*1), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_u8_u8

done_u8_u8:
	RET
// _transpose_int8_uint8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 1-byte elements and sign-extended to 64 bits before being
// used as the table index; table entries sit 4 bytes apart; dest receives
// 1-byte elements. Only the low 32 bits of length are examined (signed), so a
// non-positive count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_int8_uint8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_i8_u8 // not enough for one unrolled pass

quad_i8_u8:
	MOVL DX, AX // save the count; DX is clobbered by the loads
	MOVBQSX (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVBQSX 1(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVBQSX 2(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVBQSX 3(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI
	ADDQ $4, DI
	CMPL AX, $7
	JGT quad_i8_u8 // loop while the pre-decrement count exceeded 7

tail_i8_u8:
	TESTL DX, DX
	JLE done_i8_u8
	ADDL $1, DX // counter runs from remaining+1 down to 1
	XORL R8, R8

single_i8_u8:
	MOVBQSX (DI)(R8*1), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_i8_u8

done_i8_u8:
	RET
// _transpose_uint16_uint8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 2-byte elements, zero-extended; table entries sit 4 bytes
// apart; dest receives 1-byte elements. Only the low 32 bits of length are
// examined (signed), so a non-positive count stores nothing. The routine is
// purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_uint16_uint8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_u16_u8 // fewer than four elements: scalar tail only

quad_u16_u8:
	MOVL DX, AX // keep the count in AX; DX doubles as scratch below
	MOVWLZX (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVWLZX 2(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVWLZX 4(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVWLZX 6(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI // four 1-byte outputs
	ADDQ $8, DI // four 2-byte inputs
	CMPL AX, $7
	JGT quad_u16_u8

tail_u16_u8:
	TESTL DX, DX
	JLE done_u16_u8
	ADDL $1, DX // bias by one: the loop below exits when DX drops to 1
	XORL R8, R8

single_u16_u8:
	MOVWLZX (DI)(R8*2), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_u16_u8

done_u16_u8:
	RET
// _transpose_int16_uint8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 2-byte elements and sign-extended to 64 bits before being
// used as the table index; table entries sit 4 bytes apart; dest receives
// 1-byte elements. Only the low 32 bits of length are examined (signed), so a
// non-positive count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_int16_uint8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_i16_u8 // not enough for one unrolled pass

quad_i16_u8:
	MOVL DX, AX // save the count; DX is clobbered by the loads
	MOVWQSX (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVWQSX 2(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVWQSX 4(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVWQSX 6(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI // four 1-byte outputs
	ADDQ $8, DI // four 2-byte inputs
	CMPL AX, $7
	JGT quad_i16_u8

tail_i16_u8:
	TESTL DX, DX
	JLE done_i16_u8
	ADDL $1, DX // counter runs from remaining+1 down to 1
	XORL R8, R8

single_i16_u8:
	MOVWQSX (DI)(R8*2), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_i16_u8

done_i16_u8:
	RET
// _transpose_uint32_uint8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 4-byte elements (the 32-bit load zero-extends into the index
// register); table entries sit 4 bytes apart; dest receives 1-byte elements.
// Only the low 32 bits of length are examined (signed), so a non-positive
// count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_uint32_uint8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_u32_u8 // fewer than four elements: scalar tail only

quad_u32_u8:
	MOVL DX, AX // keep the count in AX; DX doubles as scratch below
	MOVL (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVL 4(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVL 8(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVL 12(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI // four 1-byte outputs
	ADDQ $16, DI // four 4-byte inputs
	CMPL AX, $7
	JGT quad_u32_u8

tail_u32_u8:
	TESTL DX, DX
	JLE done_u32_u8
	ADDL $1, DX // bias by one: the loop below exits when DX drops to 1
	XORL R8, R8

single_u32_u8:
	MOVL (DI)(R8*4), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_u32_u8

done_u32_u8:
	RET
// _transpose_int32_uint8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 4-byte elements and sign-extended to 64 bits before being
// used as the table index; table entries sit 4 bytes apart; dest receives
// 1-byte elements. Only the low 32 bits of length are examined (signed), so a
// non-positive count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_int32_uint8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_i32_u8 // not enough for one unrolled pass

quad_i32_u8:
	MOVL DX, AX // save the count; DX is clobbered by the loads
	MOVLQSX (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVLQSX 4(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVLQSX 8(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVLQSX 12(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI // four 1-byte outputs
	ADDQ $16, DI // four 4-byte inputs
	CMPL AX, $7
	JGT quad_i32_u8

tail_i32_u8:
	TESTL DX, DX
	JLE done_i32_u8
	ADDL $1, DX // counter runs from remaining+1 down to 1
	XORL R8, R8

single_i32_u8:
	MOVLQSX (DI)(R8*4), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_i32_u8

done_i32_u8:
	RET
// _transpose_uint64_uint8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 8-byte elements and the full 64-bit value is the table
// index; table entries sit 4 bytes apart; dest receives 1-byte elements.
// Only the low 32 bits of length are examined (signed), so a non-positive
// count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_uint64_uint8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_u64_u8 // fewer than four elements: scalar tail only

quad_u64_u8:
	MOVL DX, AX // keep the count in AX; DX doubles as scratch below
	MOVQ (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVQ 8(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVQ 16(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVQ 24(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI // four 1-byte outputs
	ADDQ $32, DI // four 8-byte inputs
	CMPL AX, $7
	JGT quad_u64_u8

tail_u64_u8:
	TESTL DX, DX
	JLE done_u64_u8
	ADDL $1, DX // bias by one: the loop below exits when DX drops to 1
	XORL R8, R8

single_u64_u8:
	MOVQ (DI)(R8*8), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_u64_u8

done_u64_u8:
	RET
// _transpose_int64_uint8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 8-byte elements and the full 64-bit value is the table
// index (no extension is needed at this width); table entries sit 4 bytes
// apart; dest receives 1-byte elements. Only the low 32 bits of length are
// examined (signed), so a non-positive count stores nothing. The routine is
// purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_int64_uint8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_i64_u8 // not enough for one unrolled pass

quad_i64_u8:
	MOVL DX, AX // save the count; DX is clobbered by the loads
	MOVQ (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVQ 8(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVQ 16(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVQ 24(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI // four 1-byte outputs
	ADDQ $32, DI // four 8-byte inputs
	CMPL AX, $7
	JGT quad_i64_u8

tail_i64_u8:
	TESTL DX, DX
	JLE done_i64_u8
	ADDL $1, DX // counter runs from remaining+1 down to 1
	XORL R8, R8

single_i64_u8:
	MOVQ (DI)(R8*8), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_i64_u8

done_i64_u8:
	RET
// _transpose_uint8_int8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 1-byte elements, zero-extended; table entries sit 4 bytes
// apart; dest receives 1-byte elements. Only the low 32 bits of length are
// examined (signed), so a non-positive count stores nothing. The routine is
// purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_uint8_int8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_u8_i8 // fewer than four elements: scalar tail only

quad_u8_i8:
	MOVL DX, AX // keep the count in AX; DX doubles as scratch below
	MOVBLZX (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVBLZX 1(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVBLZX 2(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVBLZX 3(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI
	ADDQ $4, DI
	CMPL AX, $7
	JGT quad_u8_i8

tail_u8_i8:
	TESTL DX, DX
	JLE done_u8_i8
	ADDL $1, DX // bias by one: the loop below exits when DX drops to 1
	XORL R8, R8

single_u8_i8:
	MOVBLZX (DI)(R8*1), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_u8_i8

done_u8_i8:
	RET
// _transpose_int8_int8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 1-byte elements and sign-extended to 64 bits before being
// used as the table index; table entries sit 4 bytes apart; dest receives
// 1-byte elements. Only the low 32 bits of length are examined (signed), so a
// non-positive count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_int8_int8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_i8_i8 // not enough for one unrolled pass

quad_i8_i8:
	MOVL DX, AX // save the count; DX is clobbered by the loads
	MOVBQSX (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVBQSX 1(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVBQSX 2(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVBQSX 3(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI
	ADDQ $4, DI
	CMPL AX, $7
	JGT quad_i8_i8

tail_i8_i8:
	TESTL DX, DX
	JLE done_i8_i8
	ADDL $1, DX // counter runs from remaining+1 down to 1
	XORL R8, R8

single_i8_i8:
	MOVBQSX (DI)(R8*1), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_i8_i8

done_i8_i8:
	RET
// _transpose_uint16_int8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 2-byte elements, zero-extended; table entries sit 4 bytes
// apart; dest receives 1-byte elements. Only the low 32 bits of length are
// examined (signed), so a non-positive count stores nothing. The routine is
// purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_uint16_int8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_u16_i8 // fewer than four elements: scalar tail only

quad_u16_i8:
	MOVL DX, AX // keep the count in AX; DX doubles as scratch below
	MOVWLZX (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVWLZX 2(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVWLZX 4(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVWLZX 6(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI // four 1-byte outputs
	ADDQ $8, DI // four 2-byte inputs
	CMPL AX, $7
	JGT quad_u16_i8

tail_u16_i8:
	TESTL DX, DX
	JLE done_u16_i8
	ADDL $1, DX // bias by one: the loop below exits when DX drops to 1
	XORL R8, R8

single_u16_i8:
	MOVWLZX (DI)(R8*2), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_u16_i8

done_u16_i8:
	RET
// _transpose_int16_int8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 2-byte elements and sign-extended to 64 bits before being
// used as the table index; table entries sit 4 bytes apart; dest receives
// 1-byte elements. Only the low 32 bits of length are examined (signed), so a
// non-positive count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_int16_int8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_i16_i8 // not enough for one unrolled pass

quad_i16_i8:
	MOVL DX, AX // save the count; DX is clobbered by the loads
	MOVWQSX (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVWQSX 2(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVWQSX 4(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVWQSX 6(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI // four 1-byte outputs
	ADDQ $8, DI // four 2-byte inputs
	CMPL AX, $7
	JGT quad_i16_i8

tail_i16_i8:
	TESTL DX, DX
	JLE done_i16_i8
	ADDL $1, DX // counter runs from remaining+1 down to 1
	XORL R8, R8

single_i16_i8:
	MOVWQSX (DI)(R8*2), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_i16_i8

done_i16_i8:
	RET
// _transpose_uint32_int8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 4-byte elements (the 32-bit load zero-extends into the index
// register); table entries sit 4 bytes apart; dest receives 1-byte elements.
// Only the low 32 bits of length are examined (signed), so a non-positive
// count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_uint32_int8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_u32_i8 // fewer than four elements: scalar tail only

quad_u32_i8:
	MOVL DX, AX // keep the count in AX; DX doubles as scratch below
	MOVL (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVL 4(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVL 8(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVL 12(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI // four 1-byte outputs
	ADDQ $16, DI // four 4-byte inputs
	CMPL AX, $7
	JGT quad_u32_i8

tail_u32_i8:
	TESTL DX, DX
	JLE done_u32_i8
	ADDL $1, DX // bias by one: the loop below exits when DX drops to 1
	XORL R8, R8

single_u32_i8:
	MOVL (DI)(R8*4), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_u32_i8

done_u32_i8:
	RET
// _transpose_int32_int8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 4-byte elements and sign-extended to 64 bits before being
// used as the table index; table entries sit 4 bytes apart; dest receives
// 1-byte elements. Only the low 32 bits of length are examined (signed), so a
// non-positive count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_int32_int8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_i32_i8 // not enough for one unrolled pass

quad_i32_i8:
	MOVL DX, AX // save the count; DX is clobbered by the loads
	MOVLQSX (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVLQSX 4(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVLQSX 8(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVLQSX 12(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI // four 1-byte outputs
	ADDQ $16, DI // four 4-byte inputs
	CMPL AX, $7
	JGT quad_i32_i8

tail_i32_i8:
	TESTL DX, DX
	JLE done_i32_i8
	ADDL $1, DX // counter runs from remaining+1 down to 1
	XORL R8, R8

single_i32_i8:
	MOVLQSX (DI)(R8*4), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_i32_i8

done_i32_i8:
	RET
// _transpose_uint64_int8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 8-byte elements and the full 64-bit value is the table
// index; table entries sit 4 bytes apart; dest receives 1-byte elements.
// Only the low 32 bits of length are examined (signed), so a non-positive
// count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_uint64_int8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_u64_i8 // fewer than four elements: scalar tail only

quad_u64_i8:
	MOVL DX, AX // keep the count in AX; DX doubles as scratch below
	MOVQ (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVQ 8(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVQ 16(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVQ 24(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI // four 1-byte outputs
	ADDQ $32, DI // four 8-byte inputs
	CMPL AX, $7
	JGT quad_u64_i8

tail_u64_i8:
	TESTL DX, DX
	JLE done_u64_i8
	ADDL $1, DX // bias by one: the loop below exits when DX drops to 1
	XORL R8, R8

single_u64_i8:
	MOVQ (DI)(R8*8), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_u64_i8

done_u64_i8:
	RET
// _transpose_int64_int8_avx2 translates src through a lookup table:
//     dest[i] = low byte of transposeMap[src[i]]   for i in [0, length)
// src is read as 8-byte elements and the full 64-bit value is the table
// index (no extension is needed at this width); table entries sit 4 bytes
// apart; dest receives 1-byte elements. Only the low 32 bits of length are
// examined (signed), so a non-positive count stores nothing. The routine is
// purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch, R8 = tail index.
TEXT ·_transpose_int64_int8_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_i64_i8 // not enough for one unrolled pass

quad_i64_i8:
	MOVL DX, AX // save the count; DX is clobbered by the loads
	MOVQ (DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, (SI)
	MOVQ 8(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 1(SI)
	MOVQ 16(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 2(SI)
	MOVQ 24(DI), DX
	MOVBLZX (CX)(DX*4), DX
	MOVB DX, 3(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $4, SI // four 1-byte outputs
	ADDQ $32, DI // four 8-byte inputs
	CMPL AX, $7
	JGT quad_i64_i8

tail_i64_i8:
	TESTL DX, DX
	JLE done_i64_i8
	ADDL $1, DX // counter runs from remaining+1 down to 1
	XORL R8, R8

single_i64_i8:
	MOVQ (DI)(R8*8), AX
	MOVBLZX (CX)(AX*4), AX
	MOVB AX, (SI)(R8*1)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_i64_i8

done_i64_i8:
	RET
// _transpose_uint8_uint16_avx2 translates src through a lookup table:
//     dest[i] = low 16 bits of transposeMap[src[i]]   for i in [0, length)
// src is read as 1-byte elements, zero-extended; table entries sit 4 bytes
// apart; dest receives 2-byte elements. Only the low 32 bits of length are
// examined (signed), so a non-positive count stores nothing. The routine is
// purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch,
// R8 = element index in the tail loop.
TEXT ·_transpose_uint8_uint16_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_u8_u16 // fewer than four elements: scalar tail only

quad_u8_u16:
	MOVL DX, AX // keep the count in AX; DX doubles as scratch below
	MOVBLZX (DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, (SI)
	MOVBLZX 1(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 2(SI)
	MOVBLZX 2(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 4(SI)
	MOVBLZX 3(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 6(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $8, SI // four 2-byte outputs
	ADDQ $4, DI // four 1-byte inputs
	CMPL AX, $7
	JGT quad_u8_u16

tail_u8_u16:
	TESTL DX, DX
	JLE done_u8_u16
	ADDL $1, DX // bias by one: the loop below exits when DX drops to 1
	XORL R8, R8

single_u8_u16:
	MOVBLZX (DI)(R8*1), AX
	MOVWLZX (CX)(AX*4), AX
	MOVW AX, (SI)(R8*2)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_u8_u16

done_u8_u16:
	RET
// _transpose_int8_uint16_avx2 translates src through a lookup table:
//     dest[i] = low 16 bits of transposeMap[src[i]]   for i in [0, length)
// src is read as 1-byte elements and sign-extended to 64 bits before being
// used as the table index; table entries sit 4 bytes apart; dest receives
// 2-byte elements. Only the low 32 bits of length are examined (signed), so a
// non-positive count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch,
// R8 = element index in the tail loop.
TEXT ·_transpose_int8_uint16_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_i8_u16 // not enough for one unrolled pass

quad_i8_u16:
	MOVL DX, AX // save the count; DX is clobbered by the loads
	MOVBQSX (DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, (SI)
	MOVBQSX 1(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 2(SI)
	MOVBQSX 2(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 4(SI)
	MOVBQSX 3(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 6(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $8, SI // four 2-byte outputs
	ADDQ $4, DI // four 1-byte inputs
	CMPL AX, $7
	JGT quad_i8_u16

tail_i8_u16:
	TESTL DX, DX
	JLE done_i8_u16
	ADDL $1, DX // counter runs from remaining+1 down to 1
	XORL R8, R8

single_i8_u16:
	MOVBQSX (DI)(R8*1), AX
	MOVWLZX (CX)(AX*4), AX
	MOVW AX, (SI)(R8*2)
	ADDQ $1, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_i8_u16

done_i8_u16:
	RET
// _transpose_uint16_uint16_avx2 translates src through a lookup table:
//     dest[i] = low 16 bits of transposeMap[src[i]]   for i in [0, length)
// src is read as 2-byte elements, zero-extended; table entries sit 4 bytes
// apart; dest receives 2-byte elements. Only the low 32 bits of length are
// examined (signed), so a non-positive count stores nothing. The routine is
// purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch,
// R8 = byte offset shared by src and dest in the tail loop (steps by 2).
TEXT ·_transpose_uint16_uint16_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_u16_u16 // fewer than four elements: scalar tail only

quad_u16_u16:
	MOVL DX, AX // keep the count in AX; DX doubles as scratch below
	MOVWLZX (DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, (SI)
	MOVWLZX 2(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 2(SI)
	MOVWLZX 4(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 4(SI)
	MOVWLZX 6(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 6(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $8, SI // four 2-byte outputs
	ADDQ $8, DI // four 2-byte inputs
	CMPL AX, $7
	JGT quad_u16_u16

tail_u16_u16:
	TESTL DX, DX
	JLE done_u16_u16
	ADDL $1, DX // bias by one: the loop below exits when DX drops to 1
	XORL R8, R8

single_u16_u16:
	MOVWLZX (DI)(R8*1), AX
	MOVWLZX (CX)(AX*4), AX
	MOVW AX, (SI)(R8*1)
	ADDQ $2, R8 // advance one 2-byte element
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_u16_u16

done_u16_u16:
	RET
// _transpose_int16_uint16_avx2 translates src through a lookup table:
//     dest[i] = low 16 bits of transposeMap[src[i]]   for i in [0, length)
// src is read as 2-byte elements and sign-extended to 64 bits before being
// used as the table index; table entries sit 4 bytes apart; dest receives
// 2-byte elements. Only the low 32 bits of length are examined (signed), so a
// non-positive count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch,
// R8 = byte offset shared by src and dest in the tail loop (steps by 2).
TEXT ·_transpose_int16_uint16_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_i16_u16 // not enough for one unrolled pass

quad_i16_u16:
	MOVL DX, AX // save the count; DX is clobbered by the loads
	MOVWQSX (DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, (SI)
	MOVWQSX 2(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 2(SI)
	MOVWQSX 4(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 4(SI)
	MOVWQSX 6(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 6(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $8, SI // four 2-byte outputs
	ADDQ $8, DI // four 2-byte inputs
	CMPL AX, $7
	JGT quad_i16_u16

tail_i16_u16:
	TESTL DX, DX
	JLE done_i16_u16
	ADDL $1, DX // counter runs from remaining+1 down to 1
	XORL R8, R8

single_i16_u16:
	MOVWQSX (DI)(R8*1), AX
	MOVWLZX (CX)(AX*4), AX
	MOVW AX, (SI)(R8*1)
	ADDQ $2, R8 // advance one 2-byte element
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_i16_u16

done_i16_u16:
	RET
// _transpose_uint32_uint16_avx2 translates src through a lookup table:
//     dest[i] = low 16 bits of transposeMap[src[i]]   for i in [0, length)
// src is read as 4-byte elements (the 32-bit load zero-extends into the index
// register); table entries sit 4 bytes apart; dest receives 2-byte elements.
// Only the low 32 bits of length are examined (signed), so a non-positive
// count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch,
// R8 = byte offset into dest in the tail loop (steps by 2); src uses R8*2.
TEXT ·_transpose_uint32_uint16_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_u32_u16 // fewer than four elements: scalar tail only

quad_u32_u16:
	MOVL DX, AX // keep the count in AX; DX doubles as scratch below
	MOVL (DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, (SI)
	MOVL 4(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 2(SI)
	MOVL 8(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 4(SI)
	MOVL 12(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 6(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $8, SI // four 2-byte outputs
	ADDQ $16, DI // four 4-byte inputs
	CMPL AX, $7
	JGT quad_u32_u16

tail_u32_u16:
	TESTL DX, DX
	JLE done_u32_u16
	ADDL $1, DX // bias by one: the loop below exits when DX drops to 1
	XORL R8, R8

single_u32_u16:
	MOVL (DI)(R8*2), AX // R8 counts dest bytes; a src element is twice as wide
	MOVWLZX (CX)(AX*4), AX
	MOVW AX, (SI)(R8*1)
	ADDQ $2, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_u32_u16

done_u32_u16:
	RET
// _transpose_int32_uint16_avx2 translates src through a lookup table:
//     dest[i] = low 16 bits of transposeMap[src[i]]   for i in [0, length)
// src is read as 4-byte elements and sign-extended to 64 bits before being
// used as the table index; table entries sit 4 bytes apart; dest receives
// 2-byte elements. Only the low 32 bits of length are examined (signed), so a
// non-positive count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch,
// R8 = byte offset into dest in the tail loop (steps by 2); src uses R8*2.
TEXT ·_transpose_int32_uint16_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_i32_u16 // not enough for one unrolled pass

quad_i32_u16:
	MOVL DX, AX // save the count; DX is clobbered by the loads
	MOVLQSX (DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, (SI)
	MOVLQSX 4(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 2(SI)
	MOVLQSX 8(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 4(SI)
	MOVLQSX 12(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 6(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $8, SI // four 2-byte outputs
	ADDQ $16, DI // four 4-byte inputs
	CMPL AX, $7
	JGT quad_i32_u16

tail_i32_u16:
	TESTL DX, DX
	JLE done_i32_u16
	ADDL $1, DX // counter runs from remaining+1 down to 1
	XORL R8, R8

single_i32_u16:
	MOVLQSX (DI)(R8*2), AX // R8 counts dest bytes; a src element is twice as wide
	MOVWLZX (CX)(AX*4), AX
	MOVW AX, (SI)(R8*1)
	ADDQ $2, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_i32_u16

done_i32_u16:
	RET
// _transpose_uint64_uint16_avx2 translates src through a lookup table:
//     dest[i] = low 16 bits of transposeMap[src[i]]   for i in [0, length)
// src is read as 8-byte elements and the full 64-bit value is the table
// index; table entries sit 4 bytes apart; dest receives 2-byte elements.
// Only the low 32 bits of length are examined (signed), so a non-positive
// count stores nothing. The routine is purely scalar.
//
// DI = src cursor, SI = dest cursor, CX = table base,
// DX = remaining count / scratch, AX = count snapshot / scratch,
// R8 = byte offset into dest in the tail loop (steps by 2); src uses R8*4.
TEXT ·_transpose_uint64_uint16_avx2(SB), $0-32
	MOVQ transposeMap+24(FP), CX
	MOVQ length+16(FP), DX
	MOVQ dest+8(FP), SI
	MOVQ src+0(FP), DI
	CMPL DX, $4
	JLT tail_u64_u16 // fewer than four elements: scalar tail only

quad_u64_u16:
	MOVL DX, AX // keep the count in AX; DX doubles as scratch below
	MOVQ (DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, (SI)
	MOVQ 8(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 2(SI)
	MOVQ 16(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 4(SI)
	MOVQ 24(DI), DX
	MOVWLZX (CX)(DX*4), DX
	MOVW DX, 6(SI)
	LEAL -4(AX), DX // remaining = count - 4
	ADDQ $8, SI // four 2-byte outputs
	ADDQ $32, DI // four 8-byte inputs
	CMPL AX, $7
	JGT quad_u64_u16

tail_u64_u16:
	TESTL DX, DX
	JLE done_u64_u16
	ADDL $1, DX // bias by one: the loop below exits when DX drops to 1
	XORL R8, R8

single_u64_u16:
	MOVQ (DI)(R8*4), AX // R8 counts dest bytes; a src element is four times as wide
	MOVWLZX (CX)(AX*4), AX
	MOVW AX, (SI)(R8*1)
	ADDQ $2, R8
	ADDL $-1, DX
	CMPL DX, $1
	JGT single_u64_u16

done_u64_u16:
	RET
// ·_transpose_int64_uint16_avx2: table-driven remap of 8-byte source elements
// into 2-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 8 bytes per element; the whole 64-bit value is
//                       used as the index, so a negative element addresses
//                       memory below transposeMap
//   dest         -> SI  written 2 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the low 16 bits of the
//                       selected entry are stored
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_int64_uint16_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB23_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB23_5:
WORD $0xd089 // mov eax, edx
WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
LONG $0x08578b48 // mov rdx, qword [rdi + 8]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x02568966 // mov word [rsi + 2], dx
LONG $0x10578b48 // mov rdx, qword [rdi + 16]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x04568966 // mov word [rsi + 4], dx
LONG $0x18578b48 // mov rdx, qword [rdi + 24]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x06568966 // mov word [rsi + 6], dx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x20c78348 // add rdi, 32
LONG $0x08c68348 // add rsi, 8
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB23_5
// Tail: EDX = elements still to process; return if none.
LBB23_1:
WORD $0xd285 // test edx, edx
JLE LBB23_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 2 per element: it is the destination byte
// offset, and 4*R8 is the source byte offset (8 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB23_3:
LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB23_3
LBB23_4:
RET
// ·_transpose_uint8_int16_avx2: table-driven remap of 1-byte source elements
// into 2-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 1 byte per element, zero-extended to form the index
//   dest         -> SI  written 2 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the low 16 bits of the
//                       selected entry are stored
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_uint8_int16_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB24_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB24_5:
WORD $0xd089 // mov eax, edx
WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
LONG $0x0157b60f // movzx edx, byte [rdi + 1]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x02568966 // mov word [rsi + 2], dx
LONG $0x0257b60f // movzx edx, byte [rdi + 2]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x04568966 // mov word [rsi + 4], dx
LONG $0x0357b60f // movzx edx, byte [rdi + 3]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x06568966 // mov word [rsi + 6], dx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x04c78348 // add rdi, 4
LONG $0x08c68348 // add rsi, 8
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB24_5
// Tail: EDX = elements still to process; return if none.
LBB24_1:
WORD $0xd285 // test edx, edx
JLE LBB24_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 1 per element: it is the source byte offset,
// and 2*R8 is the destination byte offset (2 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB24_3:
LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax
LONG $0x01c08349 // add r8, 1
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB24_3
LBB24_4:
RET
// ·_transpose_int8_int16_avx2: table-driven remap of 1-byte source elements
// into 2-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 1 byte per element, sign-extended to 64 bits to
//                       form the index, so a negative element addresses memory
//                       below transposeMap
//   dest         -> SI  written 2 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the low 16 bits of the
//                       selected entry are stored
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_int8_int16_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB25_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB25_5:
WORD $0xd089 // mov eax, edx
LONG $0x17be0f48 // movsx rdx, byte [rdi]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x02568966 // mov word [rsi + 2], dx
LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x04568966 // mov word [rsi + 4], dx
LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x06568966 // mov word [rsi + 6], dx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x04c78348 // add rdi, 4
LONG $0x08c68348 // add rsi, 8
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB25_5
// Tail: EDX = elements still to process; return if none.
LBB25_1:
WORD $0xd285 // test edx, edx
JLE LBB25_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 1 per element: it is the source byte offset,
// and 2*R8 is the destination byte offset (2 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB25_3:
LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
LONG $0x04894266; BYTE $0x46 // mov word [rsi + 2*r8], ax
LONG $0x01c08349 // add r8, 1
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB25_3
LBB25_4:
RET
// ·_transpose_uint16_int16_avx2: table-driven remap of 2-byte source elements
// into 2-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 2 bytes per element, zero-extended to form the
//                       index
//   dest         -> SI  written 2 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the low 16 bits of the
//                       selected entry are stored
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_uint16_int16_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB26_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB26_5:
WORD $0xd089 // mov eax, edx
WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
LONG $0x0257b70f // movzx edx, word [rdi + 2]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x02568966 // mov word [rsi + 2], dx
LONG $0x0457b70f // movzx edx, word [rdi + 4]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x04568966 // mov word [rsi + 4], dx
LONG $0x0657b70f // movzx edx, word [rdi + 6]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x06568966 // mov word [rsi + 6], dx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x08c78348 // add rdi, 8
LONG $0x08c68348 // add rsi, 8
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB26_5
// Tail: EDX = elements still to process; return if none.
LBB26_1:
WORD $0xd285 // test edx, edx
JLE LBB26_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 2 per element: it is the byte offset into
// both src and dest (2 bytes per element on each side).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB26_3:
LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB26_3
LBB26_4:
RET
// ·_transpose_int16_int16_avx2: table-driven remap of 2-byte source elements
// into 2-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 2 bytes per element, sign-extended to 64 bits to
//                       form the index, so a negative element addresses memory
//                       below transposeMap
//   dest         -> SI  written 2 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the low 16 bits of the
//                       selected entry are stored
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_int16_int16_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB27_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB27_5:
WORD $0xd089 // mov eax, edx
LONG $0x17bf0f48 // movsx rdx, word [rdi]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x02568966 // mov word [rsi + 2], dx
LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x04568966 // mov word [rsi + 4], dx
LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x06568966 // mov word [rsi + 6], dx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x08c78348 // add rdi, 8
LONG $0x08c68348 // add rsi, 8
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB27_5
// Tail: EDX = elements still to process; return if none.
LBB27_1:
WORD $0xd285 // test edx, edx
JLE LBB27_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 2 per element: it is the byte offset into
// both src and dest (2 bytes per element on each side).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB27_3:
LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB27_3
LBB27_4:
RET
// ·_transpose_uint32_int16_avx2: table-driven remap of 4-byte source elements
// into 2-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 4 bytes per element with a 32-bit load, which
//                       zero-extends into the 64-bit index register
//   dest         -> SI  written 2 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the low 16 bits of the
//                       selected entry are stored
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_uint32_int16_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB28_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB28_5:
WORD $0xd089 // mov eax, edx
WORD $0x178b // mov edx, dword [rdi]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x02568966 // mov word [rsi + 2], dx
WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x04568966 // mov word [rsi + 4], dx
WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x06568966 // mov word [rsi + 6], dx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x10c78348 // add rdi, 16
LONG $0x08c68348 // add rsi, 8
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB28_5
// Tail: EDX = elements still to process; return if none.
LBB28_1:
WORD $0xd285 // test edx, edx
JLE LBB28_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 2 per element: it is the destination byte
// offset, and 2*R8 is the source byte offset (4 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB28_3:
LONG $0x47048b42 // mov eax, dword [rdi + 2*r8]
LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB28_3
LBB28_4:
RET
// ·_transpose_int32_int16_avx2: table-driven remap of 4-byte source elements
// into 2-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 4 bytes per element, sign-extended to 64 bits to
//                       form the index, so a negative element addresses memory
//                       below transposeMap
//   dest         -> SI  written 2 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the low 16 bits of the
//                       selected entry are stored
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_int32_int16_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB29_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB29_5:
WORD $0xd089 // mov eax, edx
WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x02568966 // mov word [rsi + 2], dx
LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x04568966 // mov word [rsi + 4], dx
LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x06568966 // mov word [rsi + 6], dx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x10c78348 // add rdi, 16
LONG $0x08c68348 // add rsi, 8
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB29_5
// Tail: EDX = elements still to process; return if none.
LBB29_1:
WORD $0xd285 // test edx, edx
JLE LBB29_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 2 per element: it is the destination byte
// offset, and 2*R8 is the source byte offset (4 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB29_3:
LONG $0x4704634a // movsxd rax, dword [rdi + 2*r8]
LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB29_3
LBB29_4:
RET
// ·_transpose_uint64_int16_avx2: table-driven remap of 8-byte source elements
// into 2-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 8 bytes per element; the whole 64-bit value is
//                       used as the index
//   dest         -> SI  written 2 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the low 16 bits of the
//                       selected entry are stored
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_uint64_int16_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB30_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB30_5:
WORD $0xd089 // mov eax, edx
WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
LONG $0x08578b48 // mov rdx, qword [rdi + 8]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x02568966 // mov word [rsi + 2], dx
LONG $0x10578b48 // mov rdx, qword [rdi + 16]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x04568966 // mov word [rsi + 4], dx
LONG $0x18578b48 // mov rdx, qword [rdi + 24]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x06568966 // mov word [rsi + 6], dx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x20c78348 // add rdi, 32
LONG $0x08c68348 // add rsi, 8
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB30_5
// Tail: EDX = elements still to process; return if none.
LBB30_1:
WORD $0xd285 // test edx, edx
JLE LBB30_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 2 per element: it is the destination byte
// offset, and 4*R8 is the source byte offset (8 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB30_3:
LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB30_3
LBB30_4:
RET
// ·_transpose_int64_int16_avx2: table-driven remap of 8-byte source elements
// into 2-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 8 bytes per element; the whole 64-bit value is
//                       used as the index, so a negative element addresses
//                       memory below transposeMap
//   dest         -> SI  written 2 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the low 16 bits of the
//                       selected entry are stored
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_int64_int16_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB31_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB31_5:
WORD $0xd089 // mov eax, edx
WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
WORD $0x8966; BYTE $0x16 // mov word [rsi], dx
LONG $0x08578b48 // mov rdx, qword [rdi + 8]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x02568966 // mov word [rsi + 2], dx
LONG $0x10578b48 // mov rdx, qword [rdi + 16]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x04568966 // mov word [rsi + 4], dx
LONG $0x18578b48 // mov rdx, qword [rdi + 24]
LONG $0x9114b70f // movzx edx, word [rcx + 4*rdx]
LONG $0x06568966 // mov word [rsi + 6], dx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x20c78348 // add rdi, 32
LONG $0x08c68348 // add rsi, 8
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB31_5
// Tail: EDX = elements still to process; return if none.
LBB31_1:
WORD $0xd285 // test edx, edx
JLE LBB31_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 2 per element: it is the destination byte
// offset, and 4*R8 is the source byte offset (8 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB31_3:
LONG $0x87048b4a // mov rax, qword [rdi + 4*r8]
LONG $0x8104b70f // movzx eax, word [rcx + 4*rax]
LONG $0x04894266; BYTE $0x06 // mov word [rsi + r8], ax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB31_3
LBB31_4:
RET
// ·_transpose_uint8_uint32_avx2: table-driven remap of 1-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 1 byte per element, zero-extended to form the index
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_uint8_uint32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB32_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB32_5:
WORD $0xd089 // mov eax, edx
WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x0157b60f // movzx edx, byte [rdi + 1]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x0257b60f // movzx edx, byte [rdi + 2]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x0357b60f // movzx edx, byte [rdi + 3]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x04c78348 // add rdi, 4
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB32_5
// Tail: EDX = elements still to process; return if none.
LBB32_1:
WORD $0xd285 // test edx, edx
JLE LBB32_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 1 per element: it is the source byte offset,
// and 4*R8 is the destination byte offset (4 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB32_3:
LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x86048942 // mov dword [rsi + 4*r8], eax
LONG $0x01c08349 // add r8, 1
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB32_3
LBB32_4:
RET
// ·_transpose_int8_uint32_avx2: table-driven remap of 1-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 1 byte per element, sign-extended to 64 bits to
//                       form the index, so a negative element addresses memory
//                       below transposeMap
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_int8_uint32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB33_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB33_5:
WORD $0xd089 // mov eax, edx
LONG $0x17be0f48 // movsx rdx, byte [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x04c78348 // add rdi, 4
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB33_5
// Tail: EDX = elements still to process; return if none.
LBB33_1:
WORD $0xd285 // test edx, edx
JLE LBB33_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 1 per element: it is the source byte offset,
// and 4*R8 is the destination byte offset (4 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB33_3:
LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x86048942 // mov dword [rsi + 4*r8], eax
LONG $0x01c08349 // add r8, 1
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB33_3
LBB33_4:
RET
// ·_transpose_uint16_uint32_avx2: table-driven remap of 2-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 2 bytes per element, zero-extended to form the
//                       index
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_uint16_uint32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB34_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB34_5:
WORD $0xd089 // mov eax, edx
WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x0257b70f // movzx edx, word [rdi + 2]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x0457b70f // movzx edx, word [rdi + 4]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x0657b70f // movzx edx, word [rdi + 6]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x08c78348 // add rdi, 8
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB34_5
// Tail: EDX = elements still to process; return if none.
LBB34_1:
WORD $0xd285 // test edx, edx
JLE LBB34_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 2 per element: it is the source byte offset,
// and 2*R8 is the destination byte offset (4 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB34_3:
LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x46048942 // mov dword [rsi + 2*r8], eax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB34_3
LBB34_4:
RET
// ·_transpose_int16_uint32_avx2: table-driven remap of 2-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 2 bytes per element, sign-extended to 64 bits to
//                       form the index, so a negative element addresses memory
//                       below transposeMap
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_int16_uint32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB35_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB35_5:
WORD $0xd089 // mov eax, edx
LONG $0x17bf0f48 // movsx rdx, word [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x08c78348 // add rdi, 8
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB35_5
// Tail: EDX = elements still to process; return if none.
LBB35_1:
WORD $0xd285 // test edx, edx
JLE LBB35_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 2 per element: it is the source byte offset,
// and 2*R8 is the destination byte offset (4 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB35_3:
LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x46048942 // mov dword [rsi + 2*r8], eax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB35_3
LBB35_4:
RET
// ·_transpose_uint32_uint32_avx2: table-driven remap of 4-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 4 bytes per element with a 32-bit load, which
//                       zero-extends into the 64-bit index register
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_uint32_uint32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB36_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB36_5:
WORD $0xd089 // mov eax, edx
WORD $0x178b // mov edx, dword [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x10c78348 // add rdi, 16
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB36_5
// Tail: EDX = elements still to process; return if none.
LBB36_1:
WORD $0xd285 // test edx, edx
JLE LBB36_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 4 per element: it is the byte offset into
// both src and dest (4 bytes per element on each side).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB36_3:
LONG $0x07048b42 // mov eax, dword [rdi + r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x06048942 // mov dword [rsi + r8], eax
LONG $0x04c08349 // add r8, 4
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB36_3
LBB36_4:
RET
// ·_transpose_int32_uint32_avx2: table-driven remap of 4-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 4 bytes per element, sign-extended to 64 bits to
//                       form the index, so a negative element addresses memory
//                       below transposeMap
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_int32_uint32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB37_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB37_5:
WORD $0xd089 // mov eax, edx
WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x10c78348 // add rdi, 16
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB37_5
// Tail: EDX = elements still to process; return if none.
LBB37_1:
WORD $0xd285 // test edx, edx
JLE LBB37_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 4 per element: it is the byte offset into
// both src and dest (4 bytes per element on each side).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB37_3:
LONG $0x0704634a // movsxd rax, dword [rdi + r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x06048942 // mov dword [rsi + r8], eax
LONG $0x04c08349 // add r8, 4
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB37_3
LBB37_4:
RET
// ·_transpose_uint64_uint32_avx2: table-driven remap of 8-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 8 bytes per element; the whole 64-bit value is
//                       used as the index
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_uint64_uint32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB38_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB38_5:
WORD $0xd089 // mov eax, edx
WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x08578b48 // mov rdx, qword [rdi + 8]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x10578b48 // mov rdx, qword [rdi + 16]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x18578b48 // mov rdx, qword [rdi + 24]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x20c78348 // add rdi, 32
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB38_5
// Tail: EDX = elements still to process; return if none.
LBB38_1:
WORD $0xd285 // test edx, edx
JLE LBB38_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 4 per element: it is the destination byte
// offset, and 2*R8 is the source byte offset (8 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB38_3:
LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x06048942 // mov dword [rsi + r8], eax
LONG $0x04c08349 // add r8, 4
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB38_3
LBB38_4:
RET
// ·_transpose_int64_uint32_avx2: table-driven remap of 8-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 8 bytes per element; the whole 64-bit value is
//                       used as the index, so a negative element addresses
//                       memory below transposeMap
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_int64_uint32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB39_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB39_5:
WORD $0xd089 // mov eax, edx
WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x08578b48 // mov rdx, qword [rdi + 8]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x10578b48 // mov rdx, qword [rdi + 16]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x18578b48 // mov rdx, qword [rdi + 24]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x20c78348 // add rdi, 32
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB39_5
// Tail: EDX = elements still to process; return if none.
LBB39_1:
WORD $0xd285 // test edx, edx
JLE LBB39_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 4 per element: it is the destination byte
// offset, and 2*R8 is the source byte offset (8 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB39_3:
LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x06048942 // mov dword [rsi + r8], eax
LONG $0x04c08349 // add r8, 4
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB39_3
LBB39_4:
RET
// ·_transpose_uint8_int32_avx2: table-driven remap of 1-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 1 byte per element, zero-extended to form the index
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_uint8_int32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB40_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB40_5:
WORD $0xd089 // mov eax, edx
WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x0157b60f // movzx edx, byte [rdi + 1]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x0257b60f // movzx edx, byte [rdi + 2]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x0357b60f // movzx edx, byte [rdi + 3]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x04c78348 // add rdi, 4
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB40_5
// Tail: EDX = elements still to process; return if none.
LBB40_1:
WORD $0xd285 // test edx, edx
JLE LBB40_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 1 per element: it is the source byte offset,
// and 4*R8 is the destination byte offset (4 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB40_3:
LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x86048942 // mov dword [rsi + 4*r8], eax
LONG $0x01c08349 // add r8, 1
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB40_3
LBB40_4:
RET
// ·_transpose_int8_int32_avx2: table-driven remap of 1-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 1 byte per element, sign-extended to 64 bits to
//                       form the index, so a negative element addresses memory
//                       below transposeMap
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_int8_int32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB41_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB41_5:
WORD $0xd089 // mov eax, edx
LONG $0x17be0f48 // movsx rdx, byte [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x04c78348 // add rdi, 4
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB41_5
// Tail: EDX = elements still to process; return if none.
LBB41_1:
WORD $0xd285 // test edx, edx
JLE LBB41_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 1 per element: it is the source byte offset,
// and 4*R8 is the destination byte offset (4 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB41_3:
LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x86048942 // mov dword [rsi + 4*r8], eax
LONG $0x01c08349 // add r8, 1
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB41_3
LBB41_4:
RET
// ·_transpose_uint16_int32_avx2: table-driven remap of 2-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 2 bytes per element, zero-extended to form the
//                       index
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_uint16_int32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB42_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB42_5:
WORD $0xd089 // mov eax, edx
WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x0257b70f // movzx edx, word [rdi + 2]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x0457b70f // movzx edx, word [rdi + 4]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x0657b70f // movzx edx, word [rdi + 6]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x08c78348 // add rdi, 8
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB42_5
// Tail: EDX = elements still to process; return if none.
LBB42_1:
WORD $0xd285 // test edx, edx
JLE LBB42_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 2 per element: it is the source byte offset,
// and 2*R8 is the destination byte offset (4 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB42_3:
LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x46048942 // mov dword [rsi + 2*r8], eax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB42_3
LBB42_4:
RET
// ·_transpose_int16_int32_avx2: table-driven remap of 2-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 2 bytes per element, sign-extended to 64 bits to
//                       form the index, so a negative element addresses memory
//                       below transposeMap
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_int16_int32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB43_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB43_5:
WORD $0xd089 // mov eax, edx
LONG $0x17bf0f48 // movsx rdx, word [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x08c78348 // add rdi, 8
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB43_5
// Tail: EDX = elements still to process; return if none.
LBB43_1:
WORD $0xd285 // test edx, edx
JLE LBB43_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 2 per element: it is the source byte offset,
// and 2*R8 is the destination byte offset (4 bytes per element).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB43_3:
LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x46048942 // mov dword [rsi + 2*r8], eax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB43_3
LBB43_4:
RET
// ·_transpose_uint32_int32_avx2: table-driven remap of 4-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 4 bytes per element with a 32-bit load, which
//                       zero-extends into the 64-bit index register
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_uint32_int32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB44_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB44_5:
WORD $0xd089 // mov eax, edx
WORD $0x178b // mov edx, dword [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x10c78348 // add rdi, 16
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB44_5
// Tail: EDX = elements still to process; return if none.
LBB44_1:
WORD $0xd285 // test edx, edx
JLE LBB44_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 4 per element: it is the byte offset into
// both src and dest (4 bytes per element on each side).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB44_3:
LONG $0x07048b42 // mov eax, dword [rdi + r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x06048942 // mov dword [rsi + r8], eax
LONG $0x04c08349 // add r8, 4
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB44_3
LBB44_4:
RET
// ·_transpose_int32_int32_avx2: table-driven remap of 4-byte source elements
// into 4-byte destination elements, i.e. dest[i] = transposeMap[src[i]] for
// each i in [0, length).
//   src          -> DI  read 4 bytes per element, sign-extended to 64 bits to
//                       form the index, so a negative element addresses memory
//                       below transposeMap
//   dest         -> SI  written 4 bytes per element
//   length       -> DX  element count; only the low 32 bits (EDX) are examined,
//                       as a signed value, so length <= 0 does nothing
//   transposeMap -> CX  table read with a 4-byte stride; the full 4-byte entry
//                       is copied to dest
// The index is not range-checked. Every instruction here is scalar (no vector
// registers are touched) even though the symbol carries an _avx2 suffix.
// Modifies AX, DX, DI, SI, R8 and the flags.
TEXT ·_transpose_int32_int32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: skip the unrolled loop and go straight to the tail.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB45_1
// Unrolled loop, 4 elements per pass. EAX holds the count on entry to the pass
// and EDX becomes count-4; the pass repeats while the entry count exceeded 7,
// i.e. while at least 4 elements remain.
LBB45_5:
WORD $0xd089 // mov eax, edx
WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x10c78348 // add rdi, 16
LONG $0x10c68348 // add rsi, 16
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB45_5
// Tail: EDX = elements still to process; return if none.
LBB45_1:
WORD $0xd285 // test edx, edx
JLE LBB45_4
// Counter is biased by +1 so the loop can test `edx > 1` after decrementing.
// R8 starts at 0 and advances by 4 per element: it is the byte offset into
// both src and dest (4 bytes per element on each side).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB45_3:
LONG $0x0704634a // movsxd rax, dword [rdi + r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x06048942 // mov dword [rsi + r8], eax
LONG $0x04c08349 // add r8, 4
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB45_3
LBB45_4:
RET
// _transpose_uint64_int32_avx2: table-lookup remap of `length` elements.
// Each 8-byte source element is used, at full 64-bit width, as an index into
// transposeMap (4-byte entries); the 4-byte entry is stored to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked. Despite the _avx2 suffix, every
// instruction here is a scalar general-purpose one.
TEXT ·_transpose_uint64_int32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB46_1
// Main loop, unrolled 4x.
LBB46_5:
// Park the count in eax; rdx/edx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x08578b48 // mov rdx, qword [rdi + 8]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x10578b48 // mov rdx, qword [rdi + 16]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x18578b48 // mov rdx, qword [rdi + 24]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
// edx = count left after these 4 elements; src advances 4*8 bytes, dest 4*4.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x20c78348 // add rdi, 32
LONG $0x10c68348 // add rsi, 16
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB46_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB46_1:
WORD $0xd285 // test edx, edx
JLE LBB46_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = dest byte offset (steps by 4); src is read at [rdi + 2*r8], i.e. 8 bytes per element.
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB46_3:
LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x06048942 // mov dword [rsi + r8], eax
LONG $0x04c08349 // add r8, 4
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB46_3
LBB46_4:
RET
// _transpose_int64_int32_avx2: table-lookup remap of `length` elements.
// Each 8-byte source element is used, at full 64-bit width, as an index into
// transposeMap (4-byte entries); the 4-byte entry is stored to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked; a negative source value addresses memory
// below transposeMap. Despite the _avx2 suffix, every instruction here is a
// scalar general-purpose one.
TEXT ·_transpose_int64_int32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB47_1
// Main loop, unrolled 4x.
LBB47_5:
// Park the count in eax; rdx/edx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x1689 // mov dword [rsi], edx
LONG $0x08578b48 // mov rdx, qword [rdi + 8]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x04 // mov dword [rsi + 4], edx
LONG $0x10578b48 // mov rdx, qword [rdi + 16]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x08 // mov dword [rsi + 8], edx
LONG $0x18578b48 // mov rdx, qword [rdi + 24]
WORD $0x148b; BYTE $0x91 // mov edx, dword [rcx + 4*rdx]
WORD $0x5689; BYTE $0x0c // mov dword [rsi + 12], edx
// edx = count left after these 4 elements; src advances 4*8 bytes, dest 4*4.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x20c78348 // add rdi, 32
LONG $0x10c68348 // add rsi, 16
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB47_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB47_1:
WORD $0xd285 // test edx, edx
JLE LBB47_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = dest byte offset (steps by 4); src is read at [rdi + 2*r8], i.e. 8 bytes per element.
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB47_3:
LONG $0x47048b4a // mov rax, qword [rdi + 2*r8]
WORD $0x048b; BYTE $0x81 // mov eax, dword [rcx + 4*rax]
LONG $0x06048942 // mov dword [rsi + r8], eax
LONG $0x04c08349 // add r8, 4
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB47_3
LBB47_4:
RET
// _transpose_uint8_uint64_avx2: table-lookup remap of `length` elements.
// Each 1-byte source element is zero-extended and used as an index into
// transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked. Despite the _avx2 suffix, every
// instruction here is a scalar general-purpose one.
TEXT ·_transpose_uint8_uint64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB48_1
// Main loop, unrolled 4x.
LBB48_5:
// Park the count in eax; edx/rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x0157b60f // movzx edx, byte [rdi + 1]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x0257b60f // movzx edx, byte [rdi + 2]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x0357b60f // movzx edx, byte [rdi + 3]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; src advances 4*1 bytes, dest 4*8.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x04c78348 // add rdi, 4
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB48_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB48_1:
WORD $0xd285 // test edx, edx
JLE LBB48_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = element index: src is read at [rdi + r8], dest written at [rsi + 8*r8].
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB48_3:
LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0xc604894a // mov qword [rsi + 8*r8], rax
LONG $0x01c08349 // add r8, 1
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB48_3
LBB48_4:
RET
// _transpose_int8_uint64_avx2: table-lookup remap of `length` elements.
// Each 1-byte source element is sign-extended to 64 bits and used as an index
// into transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked; a negative source value addresses memory
// below transposeMap. Despite the _avx2 suffix, every instruction here is a
// scalar general-purpose one.
TEXT ·_transpose_int8_uint64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB49_1
// Main loop, unrolled 4x.
LBB49_5:
// Park the count in eax; rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
LONG $0x17be0f48 // movsx rdx, byte [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; src advances 4*1 bytes, dest 4*8.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x04c78348 // add rdi, 4
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB49_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB49_1:
WORD $0xd285 // test edx, edx
JLE LBB49_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = element index: src is read at [rdi + r8], dest written at [rsi + 8*r8].
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB49_3:
LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0xc604894a // mov qword [rsi + 8*r8], rax
LONG $0x01c08349 // add r8, 1
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB49_3
LBB49_4:
RET
// _transpose_uint16_uint64_avx2: table-lookup remap of `length` elements.
// Each 2-byte source element is zero-extended and used as an index into
// transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked. Despite the _avx2 suffix, every
// instruction here is a scalar general-purpose one.
TEXT ·_transpose_uint16_uint64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB50_1
// Main loop, unrolled 4x.
LBB50_5:
// Park the count in eax; edx/rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x0257b70f // movzx edx, word [rdi + 2]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x0457b70f // movzx edx, word [rdi + 4]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x0657b70f // movzx edx, word [rdi + 6]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; src advances 4*2 bytes, dest 4*8.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x08c78348 // add rdi, 8
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB50_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB50_1:
WORD $0xd285 // test edx, edx
JLE LBB50_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = src byte offset (steps by 2); dest is written at [rsi + 4*r8], i.e. 8 bytes per element.
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB50_3:
LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0x8604894a // mov qword [rsi + 4*r8], rax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB50_3
LBB50_4:
RET
// _transpose_int16_uint64_avx2: table-lookup remap of `length` elements.
// Each 2-byte source element is sign-extended to 64 bits and used as an index
// into transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked; a negative source value addresses memory
// below transposeMap. Despite the _avx2 suffix, every instruction here is a
// scalar general-purpose one.
TEXT ·_transpose_int16_uint64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB51_1
// Main loop, unrolled 4x.
LBB51_5:
// Park the count in eax; rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
LONG $0x17bf0f48 // movsx rdx, word [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; src advances 4*2 bytes, dest 4*8.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x08c78348 // add rdi, 8
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB51_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB51_1:
WORD $0xd285 // test edx, edx
JLE LBB51_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = src byte offset (steps by 2); dest is written at [rsi + 4*r8], i.e. 8 bytes per element.
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB51_3:
LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0x8604894a // mov qword [rsi + 4*r8], rax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB51_3
LBB51_4:
RET
// _transpose_uint32_uint64_avx2: table-lookup remap of `length` elements.
// Each 4-byte source element is zero-extended (32-bit mov) and used as an
// index into transposeMap (4-byte entries); the entry is sign-extended to
// 64 bits (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked. Despite the _avx2 suffix, every
// instruction here is a scalar general-purpose one.
TEXT ·_transpose_uint32_uint64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB52_1
// Main loop, unrolled 4x.
LBB52_5:
// Park the count in eax; edx/rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0x178b // mov edx, dword [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; src advances 4*4 bytes, dest 4*8.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x10c78348 // add rdi, 16
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB52_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB52_1:
WORD $0xd285 // test edx, edx
JLE LBB52_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = src byte offset (steps by 4); dest is written at [rsi + 2*r8], i.e. 8 bytes per element.
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB52_3:
LONG $0x07048b42 // mov eax, dword [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0x4604894a // mov qword [rsi + 2*r8], rax
LONG $0x04c08349 // add r8, 4
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB52_3
LBB52_4:
RET
// _transpose_int32_uint64_avx2: table-lookup remap of `length` elements.
// Each 4-byte source element is sign-extended to 64 bits and used as an index
// into transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked; a negative source value addresses memory
// below transposeMap. Despite the _avx2 suffix, every instruction here is a
// scalar general-purpose one.
TEXT ·_transpose_int32_uint64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB53_1
// Main loop, unrolled 4x.
LBB53_5:
// Park the count in eax; rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; src advances 4*4 bytes, dest 4*8.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x10c78348 // add rdi, 16
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB53_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB53_1:
WORD $0xd285 // test edx, edx
JLE LBB53_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = src byte offset (steps by 4); dest is written at [rsi + 2*r8], i.e. 8 bytes per element.
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB53_3:
LONG $0x0704634a // movsxd rax, dword [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0x4604894a // mov qword [rsi + 2*r8], rax
LONG $0x04c08349 // add r8, 4
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB53_3
LBB53_4:
RET
// _transpose_uint64_uint64_avx2: table-lookup remap of `length` elements.
// Each 8-byte source element is used, at full 64-bit width, as an index into
// transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked. Despite the _avx2 suffix, every
// instruction here is a scalar general-purpose one.
TEXT ·_transpose_uint64_uint64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB54_1
// Main loop, unrolled 4x.
LBB54_5:
// Park the count in eax; rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x08578b48 // mov rdx, qword [rdi + 8]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x10578b48 // mov rdx, qword [rdi + 16]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x18578b48 // mov rdx, qword [rdi + 24]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; both cursors advance 4*8 bytes.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x20c78348 // add rdi, 32
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB54_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB54_1:
WORD $0xd285 // test edx, edx
JLE LBB54_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = byte offset shared by src and dest (8-byte elements on both sides).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB54_3:
LONG $0x07048b4a // mov rax, qword [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0x0604894a // mov qword [rsi + r8], rax
LONG $0x08c08349 // add r8, 8
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB54_3
LBB54_4:
RET
// _transpose_int64_uint64_avx2: table-lookup remap of `length` elements.
// Each 8-byte source element is used, at full 64-bit width, as an index into
// transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked; a negative source value addresses memory
// below transposeMap. Despite the _avx2 suffix, every instruction here is a
// scalar general-purpose one.
TEXT ·_transpose_int64_uint64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB55_1
// Main loop, unrolled 4x.
LBB55_5:
// Park the count in eax; rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x08578b48 // mov rdx, qword [rdi + 8]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x10578b48 // mov rdx, qword [rdi + 16]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x18578b48 // mov rdx, qword [rdi + 24]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; both cursors advance 4*8 bytes.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x20c78348 // add rdi, 32
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB55_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB55_1:
WORD $0xd285 // test edx, edx
JLE LBB55_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = byte offset shared by src and dest (8-byte elements on both sides).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB55_3:
LONG $0x07048b4a // mov rax, qword [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0x0604894a // mov qword [rsi + r8], rax
LONG $0x08c08349 // add r8, 8
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB55_3
LBB55_4:
RET
// _transpose_uint8_int64_avx2: table-lookup remap of `length` elements.
// Each 1-byte source element is zero-extended and used as an index into
// transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked. Despite the _avx2 suffix, every
// instruction here is a scalar general-purpose one.
TEXT ·_transpose_uint8_int64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB56_1
// Main loop, unrolled 4x.
LBB56_5:
// Park the count in eax; edx/rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0xb60f; BYTE $0x17 // movzx edx, byte [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x0157b60f // movzx edx, byte [rdi + 1]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x0257b60f // movzx edx, byte [rdi + 2]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x0357b60f // movzx edx, byte [rdi + 3]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; src advances 4*1 bytes, dest 4*8.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x04c78348 // add rdi, 4
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB56_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB56_1:
WORD $0xd285 // test edx, edx
JLE LBB56_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = element index: src is read at [rdi + r8], dest written at [rsi + 8*r8].
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB56_3:
LONG $0x04b60f42; BYTE $0x07 // movzx eax, byte [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0xc604894a // mov qword [rsi + 8*r8], rax
LONG $0x01c08349 // add r8, 1
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB56_3
LBB56_4:
RET
// _transpose_int8_int64_avx2: table-lookup remap of `length` elements.
// Each 1-byte source element is sign-extended to 64 bits and used as an index
// into transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked; a negative source value addresses memory
// below transposeMap. Despite the _avx2 suffix, every instruction here is a
// scalar general-purpose one.
TEXT ·_transpose_int8_int64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB57_1
// Main loop, unrolled 4x.
LBB57_5:
// Park the count in eax; rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
LONG $0x17be0f48 // movsx rdx, byte [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x57be0f48; BYTE $0x01 // movsx rdx, byte [rdi + 1]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x57be0f48; BYTE $0x02 // movsx rdx, byte [rdi + 2]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x57be0f48; BYTE $0x03 // movsx rdx, byte [rdi + 3]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; src advances 4*1 bytes, dest 4*8.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x04c78348 // add rdi, 4
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB57_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB57_1:
WORD $0xd285 // test edx, edx
JLE LBB57_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = element index: src is read at [rdi + r8], dest written at [rsi + 8*r8].
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB57_3:
LONG $0x04be0f4a; BYTE $0x07 // movsx rax, byte [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0xc604894a // mov qword [rsi + 8*r8], rax
LONG $0x01c08349 // add r8, 1
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB57_3
LBB57_4:
RET
// _transpose_uint16_int64_avx2: table-lookup remap of `length` elements.
// Each 2-byte source element is zero-extended and used as an index into
// transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked. Despite the _avx2 suffix, every
// instruction here is a scalar general-purpose one.
TEXT ·_transpose_uint16_int64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB58_1
// Main loop, unrolled 4x.
LBB58_5:
// Park the count in eax; edx/rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0xb70f; BYTE $0x17 // movzx edx, word [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x0257b70f // movzx edx, word [rdi + 2]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x0457b70f // movzx edx, word [rdi + 4]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x0657b70f // movzx edx, word [rdi + 6]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; src advances 4*2 bytes, dest 4*8.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x08c78348 // add rdi, 8
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB58_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB58_1:
WORD $0xd285 // test edx, edx
JLE LBB58_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = src byte offset (steps by 2); dest is written at [rsi + 4*r8], i.e. 8 bytes per element.
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB58_3:
LONG $0x04b70f42; BYTE $0x07 // movzx eax, word [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0x8604894a // mov qword [rsi + 4*r8], rax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB58_3
LBB58_4:
RET
// _transpose_int16_int64_avx2: table-lookup remap of `length` elements.
// Each 2-byte source element is sign-extended to 64 bits and used as an index
// into transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked; a negative source value addresses memory
// below transposeMap. Despite the _avx2 suffix, every instruction here is a
// scalar general-purpose one.
TEXT ·_transpose_int16_int64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB59_1
// Main loop, unrolled 4x.
LBB59_5:
// Park the count in eax; rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
LONG $0x17bf0f48 // movsx rdx, word [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x57bf0f48; BYTE $0x02 // movsx rdx, word [rdi + 2]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x57bf0f48; BYTE $0x04 // movsx rdx, word [rdi + 4]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x57bf0f48; BYTE $0x06 // movsx rdx, word [rdi + 6]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; src advances 4*2 bytes, dest 4*8.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x08c78348 // add rdi, 8
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB59_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB59_1:
WORD $0xd285 // test edx, edx
JLE LBB59_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = src byte offset (steps by 2); dest is written at [rsi + 4*r8], i.e. 8 bytes per element.
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB59_3:
LONG $0x04bf0f4a; BYTE $0x07 // movsx rax, word [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0x8604894a // mov qword [rsi + 4*r8], rax
LONG $0x02c08349 // add r8, 2
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB59_3
LBB59_4:
RET
// _transpose_uint32_int64_avx2: table-lookup remap of `length` elements.
// Each 4-byte source element is zero-extended (32-bit mov) and used as an
// index into transposeMap (4-byte entries); the entry is sign-extended to
// 64 bits (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked. Despite the _avx2 suffix, every
// instruction here is a scalar general-purpose one.
TEXT ·_transpose_uint32_int64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB60_1
// Main loop, unrolled 4x.
LBB60_5:
// Park the count in eax; edx/rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0x178b // mov edx, dword [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
WORD $0x578b; BYTE $0x04 // mov edx, dword [rdi + 4]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
WORD $0x578b; BYTE $0x08 // mov edx, dword [rdi + 8]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
WORD $0x578b; BYTE $0x0c // mov edx, dword [rdi + 12]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; src advances 4*4 bytes, dest 4*8.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x10c78348 // add rdi, 16
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB60_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB60_1:
WORD $0xd285 // test edx, edx
JLE LBB60_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = src byte offset (steps by 4); dest is written at [rsi + 2*r8], i.e. 8 bytes per element.
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB60_3:
LONG $0x07048b42 // mov eax, dword [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0x4604894a // mov qword [rsi + 2*r8], rax
LONG $0x04c08349 // add r8, 4
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB60_3
LBB60_4:
RET
// _transpose_int32_int64_avx2: table-lookup remap of `length` elements.
// Each 4-byte source element is sign-extended to 64 bits and used as an index
// into transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked; a negative source value addresses memory
// below transposeMap. Despite the _avx2 suffix, every instruction here is a
// scalar general-purpose one.
TEXT ·_transpose_int32_int64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB61_1
// Main loop, unrolled 4x.
LBB61_5:
// Park the count in eax; rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0x6348; BYTE $0x17 // movsxd rdx, dword [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x04576348 // movsxd rdx, dword [rdi + 4]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x08576348 // movsxd rdx, dword [rdi + 8]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x0c576348 // movsxd rdx, dword [rdi + 12]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; src advances 4*4 bytes, dest 4*8.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x10c78348 // add rdi, 16
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB61_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB61_1:
WORD $0xd285 // test edx, edx
JLE LBB61_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = src byte offset (steps by 4); dest is written at [rsi + 2*r8], i.e. 8 bytes per element.
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB61_3:
LONG $0x0704634a // movsxd rax, dword [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0x4604894a // mov qword [rsi + 2*r8], rax
LONG $0x04c08349 // add r8, 4
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB61_3
LBB61_4:
RET
// _transpose_uint64_int64_avx2: table-lookup remap of `length` elements.
// Each 8-byte source element is used, at full 64-bit width, as an index into
// transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked. Despite the _avx2 suffix, every
// instruction here is a scalar general-purpose one.
TEXT ·_transpose_uint64_int64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB62_1
// Main loop, unrolled 4x.
LBB62_5:
// Park the count in eax; rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x08578b48 // mov rdx, qword [rdi + 8]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x10578b48 // mov rdx, qword [rdi + 16]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x18578b48 // mov rdx, qword [rdi + 24]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; both cursors advance 4*8 bytes.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x20c78348 // add rdi, 32
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB62_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB62_1:
WORD $0xd285 // test edx, edx
JLE LBB62_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = byte offset shared by src and dest (8-byte elements on both sides).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB62_3:
LONG $0x07048b4a // mov rax, qword [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0x0604894a // mov qword [rsi + r8], rax
LONG $0x08c08349 // add r8, 8
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB62_3
LBB62_4:
RET
// _transpose_int64_int64_avx2: table-lookup remap of `length` elements.
// Each 8-byte source element is used, at full 64-bit width, as an index into
// transposeMap (4-byte entries); the entry is sign-extended to 64 bits
// (movsxd) and stored as 8 bytes to dest.
//
// Register roles: DI = src cursor, SI = dest cursor, CX = transposeMap base,
// DX = remaining count / per-element scratch, AX and R8 = scratch.
// Only the low 32 bits of length (edx) are examined, as a signed value.
// The index is not bounds-checked; a negative source value addresses memory
// below transposeMap. Despite the _avx2 suffix, every instruction here is a
// scalar general-purpose one.
TEXT ·_transpose_int64_int64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ length+16(FP), DX
MOVQ transposeMap+24(FP), CX
// Fewer than 4 elements: go straight to the one-at-a-time tail loop.
WORD $0xfa83; BYTE $0x04 // cmp edx, 4
JL LBB63_1
// Main loop, unrolled 4x.
LBB63_5:
// Park the count in eax; rdx is reused below for the index and looked-up value.
WORD $0xd089 // mov eax, edx
WORD $0x8b48; BYTE $0x17 // mov rdx, qword [rdi]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
WORD $0x8948; BYTE $0x16 // mov qword [rsi], rdx
LONG $0x08578b48 // mov rdx, qword [rdi + 8]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x08568948 // mov qword [rsi + 8], rdx
LONG $0x10578b48 // mov rdx, qword [rdi + 16]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x10568948 // mov qword [rsi + 16], rdx
LONG $0x18578b48 // mov rdx, qword [rdi + 24]
LONG $0x91146348 // movsxd rdx, dword [rcx + 4*rdx]
LONG $0x18568948 // mov qword [rsi + 24], rdx
// edx = count left after these 4 elements; both cursors advance 4*8 bytes.
WORD $0x508d; BYTE $0xfc // lea edx, [rax - 4]
LONG $0x20c78348 // add rdi, 32
LONG $0x20c68348 // add rsi, 32
// Repeat while the count before this iteration was >= 8 (at least 4 remain).
WORD $0xf883; BYTE $0x07 // cmp eax, 7
JG LBB63_5
// Tail: remaining elements (fewer than 4), one per iteration.
LBB63_1:
WORD $0xd285 // test edx, edx
JLE LBB63_4
// Counter is biased by +1 so the loop exits once it drops to 1.
// r8 = byte offset shared by src and dest (8-byte elements on both sides).
WORD $0xc283; BYTE $0x01 // add edx, 1
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB63_3:
LONG $0x07048b4a // mov rax, qword [rdi + r8]
LONG $0x81046348 // movsxd rax, dword [rcx + 4*rax]
LONG $0x0604894a // mov qword [rsi + r8], rax
LONG $0x08c08349 // add r8, 8
WORD $0xc283; BYTE $0xff // add edx, -1
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JG LBB63_3
LBB63_4:
RET