// arrow/compute/internal/kernels/constant_factor_avx2_amd64.s
//go:build go1.18 && !noasm && !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
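//
// Argument layout shared by every kernel in this file (32-byte frame):
//   src+0(FP)     pointer to the input values
//   dest+8(FP)    pointer to the output values
//   len+16(FP)    number of elements
//   factor+24(FP) constant multiplier/divisor
// The Go-side stubs presumably look like
//   func _multiply_constant_int32_int32_avx2(src, dest unsafe.Pointer, len, factor int64)
// (prototype inferred from the frame layout, not taken from this file).
//
// _multiply_constant_int32_int32_avx2 computes dest[i] = src[i] * int32(factor)
// for int32 inputs and outputs (wrap-around 32-bit multiply). When src and dest
// do not overlap and len > 31, the main loop broadcasts the factor and multiplies
// 32 lanes per unrolled block with VPMULLD; otherwise it falls back to a scalar
// IMUL loop unrolled by four.
// Reference semantics in Go (illustrative only, names assumed):
//   for i := 0; i < n; i++ { dest[i] = src[i] * factor }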
TEXT ·_multiply_constant_int32_int32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ len+16(FP), DX
MOVQ factor+24(FP), CX
WORD $0xd285 // test edx, edx
JLE LBB0_16
WORD $0x8941; BYTE $0xd1 // mov r9d, edx
WORD $0xfa83; BYTE $0x1f // cmp edx, 31
JBE LBB0_2
LONG $0x8f048d4a // lea rax, [rdi + 4*r9]
WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
JBE LBB0_9
LONG $0x8e048d4a // lea rax, [rsi + 4*r9]
WORD $0x3948; BYTE $0xf8 // cmp rax, rdi
JBE LBB0_9
LBB0_2:
WORD $0x3145; BYTE $0xdb // xor r11d, r11d
LBB0_3:
WORD $0x894d; BYTE $0xd8 // mov r8, r11
WORD $0xf749; BYTE $0xd0 // not r8
WORD $0x014d; BYTE $0xc8 // add r8, r9
WORD $0x894c; BYTE $0xc8 // mov rax, r9
LONG $0x03e08348 // and rax, 3
JE LBB0_5
LBB0_4:
LONG $0x9f148b42 // mov edx, dword [rdi + 4*r11]
WORD $0xaf0f; BYTE $0xd1 // imul edx, ecx
LONG $0x9e148942 // mov dword [rsi + 4*r11], edx
LONG $0x01c38349 // add r11, 1
LONG $0xffc08348 // add rax, -1
JNE LBB0_4
LBB0_5:
LONG $0x03f88349 // cmp r8, 3
JB LBB0_16
LBB0_6:
LONG $0x9f048b42 // mov eax, dword [rdi + 4*r11]
WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
LONG $0x9e048942 // mov dword [rsi + 4*r11], eax
LONG $0x9f448b42; BYTE $0x04 // mov eax, dword [rdi + 4*r11 + 4]
WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
LONG $0x9e448942; BYTE $0x04 // mov dword [rsi + 4*r11 + 4], eax
LONG $0x9f448b42; BYTE $0x08 // mov eax, dword [rdi + 4*r11 + 8]
WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
LONG $0x9e448942; BYTE $0x08 // mov dword [rsi + 4*r11 + 8], eax
LONG $0x9f448b42; BYTE $0x0c // mov eax, dword [rdi + 4*r11 + 12]
WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
LONG $0x9e448942; BYTE $0x0c // mov dword [rsi + 4*r11 + 12], eax
LONG $0x04c38349 // add r11, 4
WORD $0x394d; BYTE $0xd9 // cmp r9, r11
JNE LBB0_6
JMP LBB0_16
LBB0_9:
WORD $0x8945; BYTE $0xcb // mov r11d, r9d
LONG $0xe0e38341 // and r11d, -32
LONG $0xc16ef9c5 // vmovd xmm0, ecx
LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0
LONG $0xe0438d49 // lea rax, [r11 - 32]
WORD $0x8949; BYTE $0xc0 // mov r8, rax
LONG $0x05e8c149 // shr r8, 5
LONG $0x01c08349 // add r8, 1
WORD $0x8548; BYTE $0xc0 // test rax, rax
JE LBB0_10
WORD $0x894d; BYTE $0xc2 // mov r10, r8
LONG $0xfee28349 // and r10, -2
WORD $0xf749; BYTE $0xda // neg r10
WORD $0xc031 // xor eax, eax
LBB0_12:
LONG $0x407de2c4; WORD $0x870c // vpmulld ymm1, ymm0, yword [rdi + 4*rax]
LONG $0x407de2c4; WORD $0x8754; BYTE $0x20 // vpmulld ymm2, ymm0, yword [rdi + 4*rax + 32]
LONG $0x407de2c4; WORD $0x875c; BYTE $0x40 // vpmulld ymm3, ymm0, yword [rdi + 4*rax + 64]
LONG $0x407de2c4; WORD $0x8764; BYTE $0x60 // vpmulld ymm4, ymm0, yword [rdi + 4*rax + 96]
LONG $0x0c7ffec5; BYTE $0x86 // vmovdqu yword [rsi + 4*rax], ymm1
LONG $0x547ffec5; WORD $0x2086 // vmovdqu yword [rsi + 4*rax + 32], ymm2
LONG $0x5c7ffec5; WORD $0x4086 // vmovdqu yword [rsi + 4*rax + 64], ymm3
LONG $0x647ffec5; WORD $0x6086 // vmovdqu yword [rsi + 4*rax + 96], ymm4
QUAD $0x0080878c407de2c4; WORD $0x0000 // vpmulld ymm1, ymm0, yword [rdi + 4*rax + 128]
QUAD $0x00a08794407de2c4; WORD $0x0000 // vpmulld ymm2, ymm0, yword [rdi + 4*rax + 160]
QUAD $0x00c0879c407de2c4; WORD $0x0000 // vpmulld ymm3, ymm0, yword [rdi + 4*rax + 192]
QUAD $0x00e087a4407de2c4; WORD $0x0000 // vpmulld ymm4, ymm0, yword [rdi + 4*rax + 224]
QUAD $0x000080868c7ffec5; BYTE $0x00 // vmovdqu yword [rsi + 4*rax + 128], ymm1
QUAD $0x0000a086947ffec5; BYTE $0x00 // vmovdqu yword [rsi + 4*rax + 160], ymm2
QUAD $0x0000c0869c7ffec5; BYTE $0x00 // vmovdqu yword [rsi + 4*rax + 192], ymm3
QUAD $0x0000e086a47ffec5; BYTE $0x00 // vmovdqu yword [rsi + 4*rax + 224], ymm4
LONG $0x40c08348 // add rax, 64
LONG $0x02c28349 // add r10, 2
JNE LBB0_12
LONG $0x01c0f641 // test r8b, 1
JE LBB0_15
LBB0_14:
LONG $0x407de2c4; WORD $0x870c // vpmulld ymm1, ymm0, yword [rdi + 4*rax]
LONG $0x407de2c4; WORD $0x8754; BYTE $0x20 // vpmulld ymm2, ymm0, yword [rdi + 4*rax + 32]
LONG $0x407de2c4; WORD $0x875c; BYTE $0x40 // vpmulld ymm3, ymm0, yword [rdi + 4*rax + 64]
LONG $0x407de2c4; WORD $0x8744; BYTE $0x60 // vpmulld ymm0, ymm0, yword [rdi + 4*rax + 96]
LONG $0x0c7ffec5; BYTE $0x86 // vmovdqu yword [rsi + 4*rax], ymm1
LONG $0x547ffec5; WORD $0x2086 // vmovdqu yword [rsi + 4*rax + 32], ymm2
LONG $0x5c7ffec5; WORD $0x4086 // vmovdqu yword [rsi + 4*rax + 64], ymm3
LONG $0x447ffec5; WORD $0x6086 // vmovdqu yword [rsi + 4*rax + 96], ymm0
LBB0_15:
WORD $0x394d; BYTE $0xcb // cmp r11, r9
JNE LBB0_3
LBB0_16:
VZEROUPPER
RET
LBB0_10:
WORD $0xc031 // xor eax, eax
LONG $0x01c0f641 // test r8b, 1
JNE LBB0_14
JMP LBB0_15
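// _divide_constant_int32_int32_avx2 computes dest[i] = src[i] / int32(factor)
// for int32 inputs and outputs. Despite the _avx2 suffix there is no vector code
// here (AVX2 has no integer divide): each element is sign-extended and divided in
// a scalar loop unrolled by two. When both the element and the factor are
// non-negative values fitting in 32 bits (upper halves zero after sign extension),
// the cheaper 32-bit DIV is used; otherwise CQO + 64-bit IDIV.
// Reference semantics in Go (illustrative only, names assumed):
//   for i := 0; i < n; i++ { dest[i] = src[i] / factor }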
TEXT ·_divide_constant_int32_int32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ len+16(FP), DX
MOVQ factor+24(FP), CX
WORD $0xd285 // test edx, edx
JLE LBB1_8
WORD $0x8941; BYTE $0xd1 // mov r9d, edx
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JNE LBB1_9
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB1_3:
LONG $0x01c1f641 // test r9b, 1
JE LBB1_8
LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
WORD $0x8948; BYTE $0xc2 // mov rdx, rax
WORD $0x0948; BYTE $0xca // or rdx, rcx
LONG $0x20eac148 // shr rdx, 32
JE LBB1_5
WORD $0x9948 // cqo
WORD $0xf748; BYTE $0xf9 // idiv rcx
JMP LBB1_7
LBB1_9:
WORD $0x8945; BYTE $0xca // mov r10d, r9d
LONG $0xfee28341 // and r10d, -2
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
JMP LBB1_10
LBB1_15:
WORD $0x9948 // cqo
WORD $0xf748; BYTE $0xf9 // idiv rcx
LBB1_16:
LONG $0x86448942; BYTE $0x04 // mov dword [rsi + 4*r8 + 4], eax
LONG $0x02c08349 // add r8, 2
WORD $0x394d; BYTE $0xc2 // cmp r10, r8
JE LBB1_3
LBB1_10:
LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
WORD $0x8948; BYTE $0xc2 // mov rdx, rax
WORD $0x0948; BYTE $0xca // or rdx, rcx
LONG $0x20eac148 // shr rdx, 32
JE LBB1_11
WORD $0x9948 // cqo
WORD $0xf748; BYTE $0xf9 // idiv rcx
JMP LBB1_13
LBB1_11:
WORD $0xd231 // xor edx, edx
WORD $0xf1f7 // div ecx
LBB1_13:
LONG $0x86048942 // mov dword [rsi + 4*r8], eax
LONG $0x8744634a; BYTE $0x04 // movsxd rax, dword [rdi + 4*r8 + 4]
WORD $0x8948; BYTE $0xc2 // mov rdx, rax
WORD $0x0948; BYTE $0xca // or rdx, rcx
LONG $0x20eac148 // shr rdx, 32
JNE LBB1_15
WORD $0xd231 // xor edx, edx
WORD $0xf1f7 // div ecx
JMP LBB1_16
LBB1_5:
WORD $0xd231 // xor edx, edx
WORD $0xf1f7 // div ecx
LBB1_7:
LONG $0x86048942 // mov dword [rsi + 4*r8], eax
LBB1_8:
RET
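// _multiply_constant_int32_int64_avx2 computes dest[i] = int64(src[i]) * factor,
// reading int32 and writing int64. The vector path (len > 15) sign-extends four
// owords per iteration with VPMOVSXDQ and emulates a full 64x64->64 multiply per
// lane using VPMULUDQ on the low/high 32-bit halves plus a shifted add, covering
// 16 elements per iteration; the scalar tail uses MOVSXD + 64-bit IMUL.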
TEXT ·_multiply_constant_int32_int64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ len+16(FP), DX
MOVQ factor+24(FP), CX
WORD $0xd285 // test edx, edx
JLE LBB2_7
WORD $0x8941; BYTE $0xd0 // mov r8d, edx
WORD $0xfa83; BYTE $0x0f // cmp edx, 15
JA LBB2_3
WORD $0xd231 // xor edx, edx
JMP LBB2_6
LBB2_3:
WORD $0x8944; BYTE $0xc2 // mov edx, r8d
WORD $0xe283; BYTE $0xf0 // and edx, -16
LONG $0x6ef9e1c4; BYTE $0xc1 // vmovq xmm0, rcx
LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
WORD $0xc031 // xor eax, eax
LONG $0xd073f5c5; BYTE $0x20 // vpsrlq ymm1, ymm0, 32
LBB2_4:
LONG $0x257de2c4; WORD $0x8714 // vpmovsxdq ymm2, oword [rdi + 4*rax]
LONG $0x257de2c4; WORD $0x875c; BYTE $0x10 // vpmovsxdq ymm3, oword [rdi + 4*rax + 16]
LONG $0x257de2c4; WORD $0x8764; BYTE $0x20 // vpmovsxdq ymm4, oword [rdi + 4*rax + 32]
LONG $0x257de2c4; WORD $0x876c; BYTE $0x30 // vpmovsxdq ymm5, oword [rdi + 4*rax + 48]
LONG $0xf2f4f5c5 // vpmuludq ymm6, ymm1, ymm2
LONG $0xd273c5c5; BYTE $0x20 // vpsrlq ymm7, ymm2, 32
LONG $0xfff4fdc5 // vpmuludq ymm7, ymm0, ymm7
LONG $0xf6d4c5c5 // vpaddq ymm6, ymm7, ymm6
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xd2f4fdc5 // vpmuludq ymm2, ymm0, ymm2
LONG $0xd6d4edc5 // vpaddq ymm2, ymm2, ymm6
LONG $0xf3f4f5c5 // vpmuludq ymm6, ymm1, ymm3
LONG $0xd373c5c5; BYTE $0x20 // vpsrlq ymm7, ymm3, 32
LONG $0xfff4fdc5 // vpmuludq ymm7, ymm0, ymm7
LONG $0xf6d4c5c5 // vpaddq ymm6, ymm7, ymm6
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xdbf4fdc5 // vpmuludq ymm3, ymm0, ymm3
LONG $0xded4e5c5 // vpaddq ymm3, ymm3, ymm6
LONG $0xf4f4f5c5 // vpmuludq ymm6, ymm1, ymm4
LONG $0xd473c5c5; BYTE $0x20 // vpsrlq ymm7, ymm4, 32
LONG $0xfff4fdc5 // vpmuludq ymm7, ymm0, ymm7
LONG $0xf6d4c5c5 // vpaddq ymm6, ymm7, ymm6
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xe4f4fdc5 // vpmuludq ymm4, ymm0, ymm4
LONG $0xe6d4ddc5 // vpaddq ymm4, ymm4, ymm6
LONG $0xf5f4f5c5 // vpmuludq ymm6, ymm1, ymm5
LONG $0xd573c5c5; BYTE $0x20 // vpsrlq ymm7, ymm5, 32
LONG $0xfff4fdc5 // vpmuludq ymm7, ymm0, ymm7
LONG $0xf6d4c5c5 // vpaddq ymm6, ymm7, ymm6
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xedf4fdc5 // vpmuludq ymm5, ymm0, ymm5
LONG $0xeed4d5c5 // vpaddq ymm5, ymm5, ymm6
LONG $0x147ffec5; BYTE $0xc6 // vmovdqu yword [rsi + 8*rax], ymm2
LONG $0x5c7ffec5; WORD $0x20c6 // vmovdqu yword [rsi + 8*rax + 32], ymm3
LONG $0x647ffec5; WORD $0x40c6 // vmovdqu yword [rsi + 8*rax + 64], ymm4
LONG $0x6c7ffec5; WORD $0x60c6 // vmovdqu yword [rsi + 8*rax + 96], ymm5
LONG $0x10c08348 // add rax, 16
WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
JNE LBB2_4
WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
JE LBB2_7
LBB2_6:
LONG $0x97046348 // movsxd rax, dword [rdi + 4*rdx]
LONG $0xc1af0f48 // imul rax, rcx
LONG $0xd6048948 // mov qword [rsi + 8*rdx], rax
LONG $0x01c28348 // add rdx, 1
WORD $0x3949; BYTE $0xd0 // cmp r8, rdx
JNE LBB2_6
LBB2_7:
VZEROUPPER
RET
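// _divide_constant_int32_int64_avx2 computes dest[i] = int64(src[i]) / factor,
// reading int32 and writing int64. Scalar only, unrolled by two, with the same
// 32-bit DIV fast path / 64-bit IDIV fallback as the int32->int32 divide above.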
TEXT ·_divide_constant_int32_int64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ len+16(FP), DX
MOVQ factor+24(FP), CX
WORD $0xd285 // test edx, edx
JLE LBB3_8
WORD $0x8941; BYTE $0xd1 // mov r9d, edx
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JNE LBB3_9
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB3_3:
LONG $0x01c1f641 // test r9b, 1
JE LBB3_8
LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
WORD $0x8948; BYTE $0xc2 // mov rdx, rax
WORD $0x0948; BYTE $0xca // or rdx, rcx
LONG $0x20eac148 // shr rdx, 32
JE LBB3_5
WORD $0x9948 // cqo
WORD $0xf748; BYTE $0xf9 // idiv rcx
JMP LBB3_7
LBB3_9:
WORD $0x8945; BYTE $0xca // mov r10d, r9d
LONG $0xfee28341 // and r10d, -2
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
JMP LBB3_10
LBB3_15:
WORD $0x9948 // cqo
WORD $0xf748; BYTE $0xf9 // idiv rcx
LBB3_16:
LONG $0xc644894a; BYTE $0x08 // mov qword [rsi + 8*r8 + 8], rax
LONG $0x02c08349 // add r8, 2
WORD $0x394d; BYTE $0xc2 // cmp r10, r8
JE LBB3_3
LBB3_10:
LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
WORD $0x8948; BYTE $0xc2 // mov rdx, rax
WORD $0x0948; BYTE $0xca // or rdx, rcx
LONG $0x20eac148 // shr rdx, 32
JE LBB3_11
WORD $0x9948 // cqo
WORD $0xf748; BYTE $0xf9 // idiv rcx
JMP LBB3_13
LBB3_11:
WORD $0xd231 // xor edx, edx
WORD $0xf1f7 // div ecx
LBB3_13:
LONG $0xc604894a // mov qword [rsi + 8*r8], rax
LONG $0x8744634a; BYTE $0x04 // movsxd rax, dword [rdi + 4*r8 + 4]
WORD $0x8948; BYTE $0xc2 // mov rdx, rax
WORD $0x0948; BYTE $0xca // or rdx, rcx
LONG $0x20eac148 // shr rdx, 32
JNE LBB3_15
WORD $0xd231 // xor edx, edx
WORD $0xf1f7 // div ecx
JMP LBB3_16
LBB3_5:
WORD $0xd231 // xor edx, edx
WORD $0xf1f7 // div ecx
LBB3_7:
LONG $0xc604894a // mov qword [rsi + 8*r8], rax
LBB3_8:
RET
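// _multiply_constant_int64_int32_avx2 computes dest[i] = int32(src[i]) * int32(factor),
// reading int64 and writing int32 (only the low 32 bits of each input and of the
// factor take part). The vector path packs the low dwords of four int64 lanes with
// VSHUFPS (imm 0x88) and multiplies them with VPMULLD, 16 elements per iteration;
// the scalar tail loads the low dword directly and uses 32-bit IMUL.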
TEXT ·_multiply_constant_int64_int32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ len+16(FP), DX
MOVQ factor+24(FP), CX
WORD $0xd285 // test edx, edx
JLE LBB4_7
WORD $0x8941; BYTE $0xd0 // mov r8d, edx
WORD $0xfa83; BYTE $0x0f // cmp edx, 15
JA LBB4_3
WORD $0xd231 // xor edx, edx
JMP LBB4_6
LBB4_3:
WORD $0x8944; BYTE $0xc2 // mov edx, r8d
WORD $0xe283; BYTE $0xf0 // and edx, -16
LONG $0x6ef9e1c4; BYTE $0xc1 // vmovq xmm0, rcx
LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
WORD $0xc031 // xor eax, eax
LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1
LBB4_4:
LONG $0x1410f8c5; BYTE $0xc7 // vmovups xmm2, oword [rdi + 8*rax]
LONG $0x5c10f8c5; WORD $0x20c7 // vmovups xmm3, oword [rdi + 8*rax + 32]
LONG $0x6410f8c5; WORD $0x40c7 // vmovups xmm4, oword [rdi + 8*rax + 64]
LONG $0x6c10f8c5; WORD $0x60c7 // vmovups xmm5, oword [rdi + 8*rax + 96]
LONG $0x54c6e8c5; WORD $0x10c7; BYTE $0x88 // vshufps xmm2, xmm2, oword [rdi + 8*rax + 16], 136
LONG $0xf1c6f8c5; BYTE $0x88 // vshufps xmm6, xmm0, xmm1, 136
LONG $0x4069e2c4; BYTE $0xd6 // vpmulld xmm2, xmm2, xmm6
LONG $0x5cc6e0c5; WORD $0x30c7; BYTE $0x88 // vshufps xmm3, xmm3, oword [rdi + 8*rax + 48], 136
LONG $0xf1c6f8c5; BYTE $0x88 // vshufps xmm6, xmm0, xmm1, 136
LONG $0x4061e2c4; BYTE $0xde // vpmulld xmm3, xmm3, xmm6
LONG $0x64c6d8c5; WORD $0x50c7; BYTE $0x88 // vshufps xmm4, xmm4, oword [rdi + 8*rax + 80], 136
LONG $0xf1c6f8c5; BYTE $0x88 // vshufps xmm6, xmm0, xmm1, 136
LONG $0x4059e2c4; BYTE $0xe6 // vpmulld xmm4, xmm4, xmm6
LONG $0x6cc6d0c5; WORD $0x70c7; BYTE $0x88 // vshufps xmm5, xmm5, oword [rdi + 8*rax + 112], 136
LONG $0xf1c6f8c5; BYTE $0x88 // vshufps xmm6, xmm0, xmm1, 136
LONG $0x4051e2c4; BYTE $0xee // vpmulld xmm5, xmm5, xmm6
LONG $0x147ffac5; BYTE $0x86 // vmovdqu oword [rsi + 4*rax], xmm2
LONG $0x5c7ffac5; WORD $0x1086 // vmovdqu oword [rsi + 4*rax + 16], xmm3
LONG $0x647ffac5; WORD $0x2086 // vmovdqu oword [rsi + 4*rax + 32], xmm4
LONG $0x6c7ffac5; WORD $0x3086 // vmovdqu oword [rsi + 4*rax + 48], xmm5
LONG $0x10c08348 // add rax, 16
WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
JNE LBB4_4
WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
JE LBB4_7
LBB4_6:
WORD $0x048b; BYTE $0xd7 // mov eax, dword [rdi + 8*rdx]
WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
WORD $0x0489; BYTE $0x96 // mov dword [rsi + 4*rdx], eax
LONG $0x01c28348 // add rdx, 1
WORD $0x3949; BYTE $0xd0 // cmp r8, rdx
JNE LBB4_6
LBB4_7:
VZEROUPPER
RET
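// _divide_constant_int64_int32_avx2 computes dest[i] = int32(src[i] / factor),
// reading int64 and storing the truncated int32 quotient. Scalar only, unrolled
// by two, with the 32-bit DIV fast path when both operands fit in the non-negative
// 32-bit range and 64-bit IDIV otherwise.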
TEXT ·_divide_constant_int64_int32_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ len+16(FP), DX
MOVQ factor+24(FP), CX
WORD $0xd285 // test edx, edx
JLE LBB5_8
WORD $0x8941; BYTE $0xd1 // mov r9d, edx
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JNE LBB5_9
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB5_3:
LONG $0x01c1f641 // test r9b, 1
JE LBB5_8
LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
WORD $0x8948; BYTE $0xc2 // mov rdx, rax
WORD $0x0948; BYTE $0xca // or rdx, rcx
LONG $0x20eac148 // shr rdx, 32
JE LBB5_5
WORD $0x9948 // cqo
WORD $0xf748; BYTE $0xf9 // idiv rcx
JMP LBB5_7
LBB5_9:
WORD $0x8945; BYTE $0xca // mov r10d, r9d
LONG $0xfee28341 // and r10d, -2
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
JMP LBB5_10
LBB5_15:
WORD $0x9948 // cqo
WORD $0xf748; BYTE $0xf9 // idiv rcx
LBB5_16:
LONG $0x86448942; BYTE $0x04 // mov dword [rsi + 4*r8 + 4], eax
LONG $0x02c08349 // add r8, 2
WORD $0x394d; BYTE $0xc2 // cmp r10, r8
JE LBB5_3
LBB5_10:
LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
WORD $0x8948; BYTE $0xc2 // mov rdx, rax
WORD $0x0948; BYTE $0xca // or rdx, rcx
LONG $0x20eac148 // shr rdx, 32
JE LBB5_11
WORD $0x9948 // cqo
WORD $0xf748; BYTE $0xf9 // idiv rcx
JMP LBB5_13
LBB5_11:
WORD $0xd231 // xor edx, edx
WORD $0xf1f7 // div ecx
LBB5_13:
LONG $0x86048942 // mov dword [rsi + 4*r8], eax
LONG $0xc7448b4a; BYTE $0x08 // mov rax, qword [rdi + 8*r8 + 8]
WORD $0x8948; BYTE $0xc2 // mov rdx, rax
WORD $0x0948; BYTE $0xca // or rdx, rcx
LONG $0x20eac148 // shr rdx, 32
JNE LBB5_15
WORD $0xd231 // xor edx, edx
WORD $0xf1f7 // div ecx
JMP LBB5_16
LBB5_5:
WORD $0xd231 // xor edx, edx
WORD $0xf1f7 // div ecx
LBB5_7:
LONG $0x86048942 // mov dword [rsi + 4*r8], eax
LBB5_8:
RET
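// _multiply_constant_int64_int64_avx2 computes dest[i] = src[i] * factor for
// int64 inputs and outputs (wrap-around 64-bit multiply). When src and dest do
// not overlap and len > 15, each unrolled block emulates a 64x64->64 multiply per
// ymm lane with VPMULUDQ/VPSRLQ/VPSLLQ/VPADDQ, 16 elements per block; otherwise
// it runs a scalar IMUL loop unrolled by four.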
TEXT ·_multiply_constant_int64_int64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ len+16(FP), DX
MOVQ factor+24(FP), CX
WORD $0xd285 // test edx, edx
JLE LBB6_16
WORD $0x8941; BYTE $0xd0 // mov r8d, edx
WORD $0xfa83; BYTE $0x0f // cmp edx, 15
JBE LBB6_2
LONG $0xc7048d4a // lea rax, [rdi + 8*r8]
WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
JBE LBB6_9
LONG $0xc6048d4a // lea rax, [rsi + 8*r8]
WORD $0x3948; BYTE $0xf8 // cmp rax, rdi
JBE LBB6_9
LBB6_2:
WORD $0x3145; BYTE $0xdb // xor r11d, r11d
LBB6_3:
WORD $0x894d; BYTE $0xd9 // mov r9, r11
WORD $0xf749; BYTE $0xd1 // not r9
WORD $0x014d; BYTE $0xc1 // add r9, r8
WORD $0x894c; BYTE $0xc0 // mov rax, r8
LONG $0x03e08348 // and rax, 3
JE LBB6_5
LBB6_4:
LONG $0xdf148b4a // mov rdx, qword [rdi + 8*r11]
LONG $0xd1af0f48 // imul rdx, rcx
LONG $0xde14894a // mov qword [rsi + 8*r11], rdx
LONG $0x01c38349 // add r11, 1
LONG $0xffc08348 // add rax, -1
JNE LBB6_4
LBB6_5:
LONG $0x03f98349 // cmp r9, 3
JB LBB6_16
LBB6_6:
LONG $0xdf048b4a // mov rax, qword [rdi + 8*r11]
LONG $0xc1af0f48 // imul rax, rcx
LONG $0xde04894a // mov qword [rsi + 8*r11], rax
LONG $0xdf448b4a; BYTE $0x08 // mov rax, qword [rdi + 8*r11 + 8]
LONG $0xc1af0f48 // imul rax, rcx
LONG $0xde44894a; BYTE $0x08 // mov qword [rsi + 8*r11 + 8], rax
LONG $0xdf448b4a; BYTE $0x10 // mov rax, qword [rdi + 8*r11 + 16]
LONG $0xc1af0f48 // imul rax, rcx
LONG $0xde44894a; BYTE $0x10 // mov qword [rsi + 8*r11 + 16], rax
LONG $0xdf448b4a; BYTE $0x18 // mov rax, qword [rdi + 8*r11 + 24]
LONG $0xc1af0f48 // imul rax, rcx
LONG $0xde44894a; BYTE $0x18 // mov qword [rsi + 8*r11 + 24], rax
LONG $0x04c38349 // add r11, 4
WORD $0x394d; BYTE $0xd8 // cmp r8, r11
JNE LBB6_6
JMP LBB6_16
LBB6_9:
WORD $0x8945; BYTE $0xc3 // mov r11d, r8d
LONG $0xf0e38341 // and r11d, -16
LONG $0x6ef9e1c4; BYTE $0xc1 // vmovq xmm0, rcx
LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
LONG $0xf0438d49 // lea rax, [r11 - 16]
WORD $0x8949; BYTE $0xc1 // mov r9, rax
LONG $0x04e9c149 // shr r9, 4
LONG $0x01c18349 // add r9, 1
LONG $0xd073f5c5; BYTE $0x20 // vpsrlq ymm1, ymm0, 32
WORD $0x8548; BYTE $0xc0 // test rax, rax
JE LBB6_10
WORD $0x894d; BYTE $0xca // mov r10, r9
LONG $0xfee28349 // and r10, -2
WORD $0xf749; BYTE $0xda // neg r10
WORD $0xc031 // xor eax, eax
LBB6_12:
LONG $0x146ffec5; BYTE $0xc7 // vmovdqu ymm2, yword [rdi + 8*rax]
LONG $0x5c6ffec5; WORD $0x20c7 // vmovdqu ymm3, yword [rdi + 8*rax + 32]
LONG $0x646ffec5; WORD $0x40c7 // vmovdqu ymm4, yword [rdi + 8*rax + 64]
LONG $0x6c6ffec5; WORD $0x60c7 // vmovdqu ymm5, yword [rdi + 8*rax + 96]
LONG $0xf1f4edc5 // vpmuludq ymm6, ymm2, ymm1
LONG $0xd273c5c5; BYTE $0x20 // vpsrlq ymm7, ymm2, 32
LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xd0f4edc5 // vpmuludq ymm2, ymm2, ymm0
LONG $0xd6d4edc5 // vpaddq ymm2, ymm2, ymm6
LONG $0xf1f4e5c5 // vpmuludq ymm6, ymm3, ymm1
LONG $0xd373c5c5; BYTE $0x20 // vpsrlq ymm7, ymm3, 32
LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xd8f4e5c5 // vpmuludq ymm3, ymm3, ymm0
LONG $0xded4e5c5 // vpaddq ymm3, ymm3, ymm6
LONG $0xf1f4ddc5 // vpmuludq ymm6, ymm4, ymm1
LONG $0xd473c5c5; BYTE $0x20 // vpsrlq ymm7, ymm4, 32
LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xe0f4ddc5 // vpmuludq ymm4, ymm4, ymm0
LONG $0xe6d4ddc5 // vpaddq ymm4, ymm4, ymm6
LONG $0xf1f4d5c5 // vpmuludq ymm6, ymm5, ymm1
LONG $0xd573c5c5; BYTE $0x20 // vpsrlq ymm7, ymm5, 32
LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xe8f4d5c5 // vpmuludq ymm5, ymm5, ymm0
LONG $0xeed4d5c5 // vpaddq ymm5, ymm5, ymm6
LONG $0x147ffec5; BYTE $0xc6 // vmovdqu yword [rsi + 8*rax], ymm2
LONG $0x5c7ffec5; WORD $0x20c6 // vmovdqu yword [rsi + 8*rax + 32], ymm3
LONG $0x647ffec5; WORD $0x40c6 // vmovdqu yword [rsi + 8*rax + 64], ymm4
LONG $0x6c7ffec5; WORD $0x60c6 // vmovdqu yword [rsi + 8*rax + 96], ymm5
QUAD $0x000080c7946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdi + 8*rax + 128]
QUAD $0x0000a0c79c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdi + 8*rax + 160]
QUAD $0x0000c0c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 192]
QUAD $0x0000e0c7ac6ffec5; BYTE $0x00 // vmovdqu ymm5, yword [rdi + 8*rax + 224]
LONG $0xf1f4edc5 // vpmuludq ymm6, ymm2, ymm1
LONG $0xd273c5c5; BYTE $0x20 // vpsrlq ymm7, ymm2, 32
LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xd0f4edc5 // vpmuludq ymm2, ymm2, ymm0
LONG $0xd6d4edc5 // vpaddq ymm2, ymm2, ymm6
LONG $0xf1f4e5c5 // vpmuludq ymm6, ymm3, ymm1
LONG $0xd373c5c5; BYTE $0x20 // vpsrlq ymm7, ymm3, 32
LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xd8f4e5c5 // vpmuludq ymm3, ymm3, ymm0
LONG $0xded4e5c5 // vpaddq ymm3, ymm3, ymm6
LONG $0xf1f4ddc5 // vpmuludq ymm6, ymm4, ymm1
LONG $0xd473c5c5; BYTE $0x20 // vpsrlq ymm7, ymm4, 32
LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xe0f4ddc5 // vpmuludq ymm4, ymm4, ymm0
LONG $0xe6d4ddc5 // vpaddq ymm4, ymm4, ymm6
LONG $0xf1f4d5c5 // vpmuludq ymm6, ymm5, ymm1
LONG $0xd573c5c5; BYTE $0x20 // vpsrlq ymm7, ymm5, 32
LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xe8f4d5c5 // vpmuludq ymm5, ymm5, ymm0
LONG $0xeed4d5c5 // vpaddq ymm5, ymm5, ymm6
QUAD $0x000080c6947ffec5; BYTE $0x00 // vmovdqu yword [rsi + 8*rax + 128], ymm2
QUAD $0x0000a0c69c7ffec5; BYTE $0x00 // vmovdqu yword [rsi + 8*rax + 160], ymm3
QUAD $0x0000c0c6a47ffec5; BYTE $0x00 // vmovdqu yword [rsi + 8*rax + 192], ymm4
QUAD $0x0000e0c6ac7ffec5; BYTE $0x00 // vmovdqu yword [rsi + 8*rax + 224], ymm5
LONG $0x20c08348 // add rax, 32
LONG $0x02c28349 // add r10, 2
JNE LBB6_12
LONG $0x01c1f641 // test r9b, 1
JE LBB6_15
LBB6_14:
LONG $0x146ffec5; BYTE $0xc7 // vmovdqu ymm2, yword [rdi + 8*rax]
LONG $0x5c6ffec5; WORD $0x20c7 // vmovdqu ymm3, yword [rdi + 8*rax + 32]
LONG $0x646ffec5; WORD $0x40c7 // vmovdqu ymm4, yword [rdi + 8*rax + 64]
LONG $0x6c6ffec5; WORD $0x60c7 // vmovdqu ymm5, yword [rdi + 8*rax + 96]
LONG $0xf1f4edc5 // vpmuludq ymm6, ymm2, ymm1
LONG $0xd273c5c5; BYTE $0x20 // vpsrlq ymm7, ymm2, 32
LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xd0f4edc5 // vpmuludq ymm2, ymm2, ymm0
LONG $0xd6d4edc5 // vpaddq ymm2, ymm2, ymm6
LONG $0xf1f4e5c5 // vpmuludq ymm6, ymm3, ymm1
LONG $0xd373c5c5; BYTE $0x20 // vpsrlq ymm7, ymm3, 32
LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xd8f4e5c5 // vpmuludq ymm3, ymm3, ymm0
LONG $0xded4e5c5 // vpaddq ymm3, ymm3, ymm6
LONG $0xf1f4ddc5 // vpmuludq ymm6, ymm4, ymm1
LONG $0xd473c5c5; BYTE $0x20 // vpsrlq ymm7, ymm4, 32
LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
LONG $0xe0f4ddc5 // vpmuludq ymm4, ymm4, ymm0
LONG $0xe6d4ddc5 // vpaddq ymm4, ymm4, ymm6
LONG $0xc9f4d5c5 // vpmuludq ymm1, ymm5, ymm1
LONG $0xd573cdc5; BYTE $0x20 // vpsrlq ymm6, ymm5, 32
LONG $0xf0f4cdc5 // vpmuludq ymm6, ymm6, ymm0
LONG $0xced4f5c5 // vpaddq ymm1, ymm1, ymm6
LONG $0xf173f5c5; BYTE $0x20 // vpsllq ymm1, ymm1, 32
LONG $0xc0f4d5c5 // vpmuludq ymm0, ymm5, ymm0
LONG $0xc1d4fdc5 // vpaddq ymm0, ymm0, ymm1
LONG $0x147ffec5; BYTE $0xc6 // vmovdqu yword [rsi + 8*rax], ymm2
LONG $0x5c7ffec5; WORD $0x20c6 // vmovdqu yword [rsi + 8*rax + 32], ymm3
LONG $0x647ffec5; WORD $0x40c6 // vmovdqu yword [rsi + 8*rax + 64], ymm4
LONG $0x447ffec5; WORD $0x60c6 // vmovdqu yword [rsi + 8*rax + 96], ymm0
LBB6_15:
WORD $0x394d; BYTE $0xc3 // cmp r11, r8
JNE LBB6_3
LBB6_16:
VZEROUPPER
RET
LBB6_10:
WORD $0xc031 // xor eax, eax
LONG $0x01c1f641 // test r9b, 1
JNE LBB6_14
JMP LBB6_15
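// _divide_constant_int64_int64_avx2 computes dest[i] = src[i] / factor for
// int64 inputs and outputs. Scalar only, unrolled by two: 32-bit DIV when both
// the element and the factor have clear upper halves, CQO + 64-bit IDIV otherwise.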
TEXT ·_divide_constant_int64_int64_avx2(SB), $0-32
MOVQ src+0(FP), DI
MOVQ dest+8(FP), SI
MOVQ len+16(FP), DX
MOVQ factor+24(FP), CX
WORD $0xd285 // test edx, edx
JLE LBB7_8
WORD $0x8941; BYTE $0xd1 // mov r9d, edx
WORD $0xfa83; BYTE $0x01 // cmp edx, 1
JNE LBB7_9
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
LBB7_3:
LONG $0x01c1f641 // test r9b, 1
JE LBB7_8
LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
WORD $0x8948; BYTE $0xc2 // mov rdx, rax
WORD $0x0948; BYTE $0xca // or rdx, rcx
LONG $0x20eac148 // shr rdx, 32
JE LBB7_5
WORD $0x9948 // cqo
WORD $0xf748; BYTE $0xf9 // idiv rcx
JMP LBB7_7
LBB7_9:
WORD $0x8945; BYTE $0xca // mov r10d, r9d
LONG $0xfee28341 // and r10d, -2
WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
JMP LBB7_10
LBB7_15:
WORD $0x9948 // cqo
WORD $0xf748; BYTE $0xf9 // idiv rcx
LBB7_16:
LONG $0xc644894a; BYTE $0x08 // mov qword [rsi + 8*r8 + 8], rax
LONG $0x02c08349 // add r8, 2
WORD $0x394d; BYTE $0xc2 // cmp r10, r8
JE LBB7_3
LBB7_10:
LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
WORD $0x8948; BYTE $0xc2 // mov rdx, rax
WORD $0x0948; BYTE $0xca // or rdx, rcx
LONG $0x20eac148 // shr rdx, 32
JE LBB7_11
WORD $0x9948 // cqo
WORD $0xf748; BYTE $0xf9 // idiv rcx
JMP LBB7_13
LBB7_11:
WORD $0xd231 // xor edx, edx
WORD $0xf1f7 // div ecx
LBB7_13:
LONG $0xc604894a // mov qword [rsi + 8*r8], rax
LONG $0xc7448b4a; BYTE $0x08 // mov rax, qword [rdi + 8*r8 + 8]
WORD $0x8948; BYTE $0xc2 // mov rdx, rax
WORD $0x0948; BYTE $0xca // or rdx, rcx
LONG $0x20eac148 // shr rdx, 32
JNE LBB7_15
WORD $0xd231 // xor edx, edx
WORD $0xf1f7 // div ecx
JMP LBB7_16
LBB7_5:
WORD $0xd231 // xor edx, edx
WORD $0xf1f7 // div ecx
LBB7_7:
LONG $0xc604894a // mov qword [rsi + 8*r8], rax
LBB7_8:
RET