arrow/compute/internal/kernels/constant_factor_avx2_amd64.s

//go:build go1.18 && !noasm && !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT

TEXT ·_multiply_constant_int32_int32_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB0_16
    WORD $0x8941; BYTE $0xd1 // mov r9d, edx
    WORD $0xfa83; BYTE $0x1f // cmp edx, 31
    JBE LBB0_2
    LONG $0x8f048d4a // lea rax, [rdi + 4*r9]
    WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
    JBE LBB0_9
    LONG $0x8e048d4a // lea rax, [rsi + 4*r9]
    WORD $0x3948; BYTE $0xf8 // cmp rax, rdi
    JBE LBB0_9

LBB0_2:
    WORD $0x3145; BYTE $0xdb // xor r11d, r11d

LBB0_3:
    WORD $0x894d; BYTE $0xd8 // mov r8, r11
    WORD $0xf749; BYTE $0xd0 // not r8
    WORD $0x014d; BYTE $0xc8 // add r8, r9
    WORD $0x894c; BYTE $0xc8 // mov rax, r9
    LONG $0x03e08348 // and rax, 3
    JE LBB0_5

LBB0_4:
    LONG $0x9f148b42 // mov edx, dword [rdi + 4*r11]
    WORD $0xaf0f; BYTE $0xd1 // imul edx, ecx
    LONG $0x9e148942 // mov dword [rsi + 4*r11], edx
    LONG $0x01c38349 // add r11, 1
    LONG $0xffc08348 // add rax, -1
    JNE LBB0_4

LBB0_5:
    LONG $0x03f88349 // cmp r8, 3
    JB LBB0_16

LBB0_6:
    LONG $0x9f048b42 // mov eax, dword [rdi + 4*r11]
    WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
    LONG $0x9e048942 // mov dword [rsi + 4*r11], eax
    LONG $0x9f448b42; BYTE $0x04 // mov eax, dword [rdi + 4*r11 + 4]
    WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
    LONG $0x9e448942; BYTE $0x04 // mov dword [rsi + 4*r11 + 4], eax
    LONG $0x9f448b42; BYTE $0x08 // mov eax, dword [rdi + 4*r11 + 8]
    WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
    LONG $0x9e448942; BYTE $0x08 // mov dword [rsi + 4*r11 + 8], eax
    LONG $0x9f448b42; BYTE $0x0c // mov eax, dword [rdi + 4*r11 + 12]
    WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
    LONG $0x9e448942; BYTE $0x0c // mov dword [rsi + 4*r11 + 12], eax
    LONG $0x04c38349 // add r11, 4
    WORD $0x394d; BYTE $0xd9 // cmp r9, r11
    JNE LBB0_6
    JMP LBB0_16

LBB0_9:
    WORD $0x8945; BYTE $0xcb // mov r11d, r9d
    LONG $0xe0e38341 // and r11d, -32
    LONG $0xc16ef9c5 // vmovd xmm0, ecx
    LONG $0x587de2c4; BYTE $0xc0 // vpbroadcastd ymm0, xmm0
    LONG $0xe0438d49 // lea rax, [r11 - 32]
    WORD $0x8949; BYTE $0xc0 // mov r8, rax
    LONG $0x05e8c149 // shr r8, 5
    LONG $0x01c08349 // add r8, 1
    WORD $0x8548; BYTE $0xc0 // test rax, rax
    JE LBB0_10
    WORD $0x894d; BYTE $0xc2 // mov r10, r8
    LONG $0xfee28349 // and r10, -2
    WORD $0xf749; BYTE $0xda // neg r10
    WORD $0xc031 // xor eax, eax

LBB0_12:
    LONG $0x407de2c4; WORD $0x870c // vpmulld ymm1, ymm0, yword [rdi + 4*rax]
    LONG $0x407de2c4; WORD $0x8754; BYTE $0x20 // vpmulld ymm2, ymm0, yword [rdi + 4*rax + 32]
    LONG $0x407de2c4; WORD $0x875c; BYTE $0x40 // vpmulld ymm3, ymm0, yword [rdi + 4*rax + 64]
    LONG $0x407de2c4; WORD $0x8764; BYTE $0x60 // vpmulld ymm4, ymm0, yword [rdi + 4*rax + 96]
    LONG $0x0c7ffec5; BYTE $0x86 // vmovdqu yword [rsi + 4*rax], ymm1
    LONG $0x547ffec5; WORD $0x2086 // vmovdqu yword [rsi + 4*rax + 32], ymm2
    LONG $0x5c7ffec5; WORD $0x4086 // vmovdqu yword [rsi + 4*rax + 64], ymm3
    LONG $0x647ffec5; WORD $0x6086 // vmovdqu yword [rsi + 4*rax + 96], ymm4
    QUAD $0x0080878c407de2c4; WORD $0x0000 // vpmulld ymm1, ymm0, yword [rdi + 4*rax + 128]
    QUAD $0x00a08794407de2c4; WORD $0x0000 // vpmulld ymm2, ymm0, yword [rdi + 4*rax + 160]
    QUAD $0x00c0879c407de2c4; WORD $0x0000 // vpmulld ymm3, ymm0, yword [rdi + 4*rax + 192]
    QUAD $0x00e087a4407de2c4; WORD $0x0000 // vpmulld ymm4, ymm0, yword [rdi + 4*rax + 224]
    QUAD $0x000080868c7ffec5; BYTE $0x00 // vmovdqu yword [rsi + 4*rax + 128], ymm1
    QUAD $0x0000a086947ffec5; BYTE $0x00 // vmovdqu yword [rsi + 4*rax + 160], ymm2
    QUAD $0x0000c0869c7ffec5; BYTE $0x00 // vmovdqu yword [rsi + 4*rax + 192], ymm3
    QUAD $0x0000e086a47ffec5; BYTE $0x00 // vmovdqu yword [rsi + 4*rax + 224], ymm4
    LONG $0x40c08348 // add rax, 64
    LONG $0x02c28349 // add r10, 2
    JNE LBB0_12
    LONG $0x01c0f641 // test r8b, 1
    JE LBB0_15

LBB0_14:
    LONG $0x407de2c4; WORD $0x870c // vpmulld ymm1, ymm0, yword [rdi + 4*rax]
    LONG $0x407de2c4; WORD $0x8754; BYTE $0x20 // vpmulld ymm2, ymm0, yword [rdi + 4*rax + 32]
    LONG $0x407de2c4; WORD $0x875c; BYTE $0x40 // vpmulld ymm3, ymm0, yword [rdi + 4*rax + 64]
    LONG $0x407de2c4; WORD $0x8744; BYTE $0x60 // vpmulld ymm0, ymm0, yword [rdi + 4*rax + 96]
    LONG $0x0c7ffec5; BYTE $0x86 // vmovdqu yword [rsi + 4*rax], ymm1
    LONG $0x547ffec5; WORD $0x2086 // vmovdqu yword [rsi + 4*rax + 32], ymm2
    LONG $0x5c7ffec5; WORD $0x4086 // vmovdqu yword [rsi + 4*rax + 64], ymm3
    LONG $0x447ffec5; WORD $0x6086 // vmovdqu yword [rsi + 4*rax + 96], ymm0

LBB0_15:
    WORD $0x394d; BYTE $0xcb // cmp r11, r9
    JNE LBB0_3

LBB0_16:
    VZEROUPPER
    RET

LBB0_10:
    WORD $0xc031 // xor eax, eax
    LONG $0x01c0f641 // test r8b, 1
    JNE LBB0_14
    JMP LBB0_15

TEXT ·_divide_constant_int32_int32_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB1_8
    WORD $0x8941; BYTE $0xd1 // mov r9d, edx
    WORD $0xfa83; BYTE $0x01 // cmp edx, 1
    JNE LBB1_9
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d

LBB1_3:
    LONG $0x01c1f641 // test r9b, 1
    JE LBB1_8
    LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB1_5
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB1_7

LBB1_9:
    WORD $0x8945; BYTE $0xca // mov r10d, r9d
    LONG $0xfee28341 // and r10d, -2
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    JMP LBB1_10

LBB1_15:
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx

LBB1_16:
    LONG $0x86448942; BYTE $0x04 // mov dword [rsi + 4*r8 + 4], eax
    LONG $0x02c08349 // add r8, 2
    WORD $0x394d; BYTE $0xc2 // cmp r10, r8
    JE LBB1_3

LBB1_10:
    LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB1_11
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB1_13

LBB1_11:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

LBB1_13:
    LONG $0x86048942 // mov dword [rsi + 4*r8], eax
    LONG $0x8744634a; BYTE $0x04 // movsxd rax, dword [rdi + 4*r8 + 4]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JNE LBB1_15
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx
    JMP LBB1_16

LBB1_5:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

LBB1_7:
    LONG $0x86048942 // mov dword [rsi + 4*r8], eax

LBB1_8:
    RET

TEXT ·_multiply_constant_int32_int64_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB2_7
    WORD $0x8941; BYTE $0xd0 // mov r8d, edx
    WORD $0xfa83; BYTE $0x0f // cmp edx, 15
    JA LBB2_3
    WORD $0xd231 // xor edx, edx
    JMP LBB2_6

LBB2_3:
    WORD $0x8944; BYTE $0xc2 // mov edx, r8d
    WORD $0xe283; BYTE $0xf0 // and edx, -16
    LONG $0x6ef9e1c4; BYTE $0xc1 // vmovq xmm0, rcx
    LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
    WORD $0xc031 // xor eax, eax
    LONG $0xd073f5c5; BYTE $0x20 // vpsrlq ymm1, ymm0, 32

LBB2_4:
    LONG $0x257de2c4; WORD $0x8714 // vpmovsxdq ymm2, oword [rdi + 4*rax]
    LONG $0x257de2c4; WORD $0x875c; BYTE $0x10 // vpmovsxdq ymm3, oword [rdi + 4*rax + 16]
    LONG $0x257de2c4; WORD $0x8764; BYTE $0x20 // vpmovsxdq ymm4, oword [rdi + 4*rax + 32]
    LONG $0x257de2c4; WORD $0x876c; BYTE $0x30 // vpmovsxdq ymm5, oword [rdi + 4*rax + 48]
    LONG $0xf2f4f5c5 // vpmuludq ymm6, ymm1, ymm2
    LONG $0xd273c5c5; BYTE $0x20 // vpsrlq ymm7, ymm2, 32
    LONG $0xfff4fdc5 // vpmuludq ymm7, ymm0, ymm7
    LONG $0xf6d4c5c5 // vpaddq ymm6, ymm7, ymm6
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd2f4fdc5 // vpmuludq ymm2, ymm0, ymm2
    LONG $0xd6d4edc5 // vpaddq ymm2, ymm2, ymm6
    LONG $0xf3f4f5c5 // vpmuludq ymm6, ymm1, ymm3
    LONG $0xd373c5c5; BYTE $0x20 // vpsrlq ymm7, ymm3, 32
    LONG $0xfff4fdc5 // vpmuludq ymm7, ymm0, ymm7
    LONG $0xf6d4c5c5 // vpaddq ymm6, ymm7, ymm6
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xdbf4fdc5 // vpmuludq ymm3, ymm0, ymm3
    LONG $0xded4e5c5 // vpaddq ymm3, ymm3, ymm6
    LONG $0xf4f4f5c5 // vpmuludq ymm6, ymm1, ymm4
    LONG $0xd473c5c5; BYTE $0x20 // vpsrlq ymm7, ymm4, 32
    LONG $0xfff4fdc5 // vpmuludq ymm7, ymm0, ymm7
    LONG $0xf6d4c5c5 // vpaddq ymm6, ymm7, ymm6
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xe4f4fdc5 // vpmuludq ymm4, ymm0, ymm4
    LONG $0xe6d4ddc5 // vpaddq ymm4, ymm4, ymm6
    LONG $0xf5f4f5c5 // vpmuludq ymm6, ymm1, ymm5
    LONG $0xd573c5c5; BYTE $0x20 // vpsrlq ymm7, ymm5, 32
    LONG $0xfff4fdc5 // vpmuludq ymm7, ymm0, ymm7
    LONG $0xf6d4c5c5 // vpaddq ymm6, ymm7, ymm6
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xedf4fdc5 // vpmuludq ymm5, ymm0, ymm5
    LONG $0xeed4d5c5 // vpaddq ymm5, ymm5, ymm6
    LONG $0x147ffec5; BYTE $0xc6 // vmovdqu yword [rsi + 8*rax], ymm2
    LONG $0x5c7ffec5; WORD $0x20c6 // vmovdqu yword [rsi + 8*rax + 32], ymm3
    LONG $0x647ffec5; WORD $0x40c6 // vmovdqu yword [rsi + 8*rax + 64], ymm4
    LONG $0x6c7ffec5; WORD $0x60c6 // vmovdqu yword [rsi + 8*rax + 96], ymm5
    LONG $0x10c08348 // add rax, 16
    WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
    JNE LBB2_4
    WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
    JE LBB2_7

LBB2_6:
    LONG $0x97046348 // movsxd rax, dword [rdi + 4*rdx]
    LONG $0xc1af0f48 // imul rax, rcx
    LONG $0xd6048948 // mov qword [rsi + 8*rdx], rax
    LONG $0x01c28348 // add rdx, 1
    WORD $0x3949; BYTE $0xd0 // cmp r8, rdx
    JNE LBB2_6

LBB2_7:
    VZEROUPPER
    RET

TEXT ·_divide_constant_int32_int64_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB3_8
    WORD $0x8941; BYTE $0xd1 // mov r9d, edx
    WORD $0xfa83; BYTE $0x01 // cmp edx, 1
    JNE LBB3_9
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d

LBB3_3:
    LONG $0x01c1f641 // test r9b, 1
    JE LBB3_8
    LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB3_5
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB3_7

LBB3_9:
    WORD $0x8945; BYTE $0xca // mov r10d, r9d
    LONG $0xfee28341 // and r10d, -2
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    JMP LBB3_10

LBB3_15:
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx

LBB3_16:
    LONG $0xc644894a; BYTE $0x08 // mov qword [rsi + 8*r8 + 8], rax
    LONG $0x02c08349 // add r8, 2
    WORD $0x394d; BYTE $0xc2 // cmp r10, r8
    JE LBB3_3

LBB3_10:
    LONG $0x8704634a // movsxd rax, dword [rdi + 4*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB3_11
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB3_13

LBB3_11:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

LBB3_13:
    LONG $0xc604894a // mov qword [rsi + 8*r8], rax
    LONG $0x8744634a; BYTE $0x04 // movsxd rax, dword [rdi + 4*r8 + 4]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JNE LBB3_15
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx
    JMP LBB3_16

LBB3_5:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

LBB3_7:
    LONG $0xc604894a // mov qword [rsi + 8*r8], rax

LBB3_8:
    RET

TEXT ·_multiply_constant_int64_int32_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB4_7
    WORD $0x8941; BYTE $0xd0 // mov r8d, edx
    WORD $0xfa83; BYTE $0x0f // cmp edx, 15
    JA LBB4_3
    WORD $0xd231 // xor edx, edx
    JMP LBB4_6

LBB4_3:
    WORD $0x8944; BYTE $0xc2 // mov edx, r8d
    WORD $0xe283; BYTE $0xf0 // and edx, -16
    LONG $0x6ef9e1c4; BYTE $0xc1 // vmovq xmm0, rcx
    LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
    WORD $0xc031 // xor eax, eax
    LONG $0x397de3c4; WORD $0x01c1 // vextracti128 xmm1, ymm0, 1

LBB4_4:
    LONG $0x1410f8c5; BYTE $0xc7 // vmovups xmm2, oword [rdi + 8*rax]
    LONG $0x5c10f8c5; WORD $0x20c7 // vmovups xmm3, oword [rdi + 8*rax + 32]
    LONG $0x6410f8c5; WORD $0x40c7 // vmovups xmm4, oword [rdi + 8*rax + 64]
    LONG $0x6c10f8c5; WORD $0x60c7 // vmovups xmm5, oword [rdi + 8*rax + 96]
    LONG $0x54c6e8c5; WORD $0x10c7; BYTE $0x88 // vshufps xmm2, xmm2, oword [rdi + 8*rax + 16], 136
    LONG $0xf1c6f8c5; BYTE $0x88 // vshufps xmm6, xmm0, xmm1, 136
    LONG $0x4069e2c4; BYTE $0xd6 // vpmulld xmm2, xmm2, xmm6
    LONG $0x5cc6e0c5; WORD $0x30c7; BYTE $0x88 // vshufps xmm3, xmm3, oword [rdi + 8*rax + 48], 136
    LONG $0xf1c6f8c5; BYTE $0x88 // vshufps xmm6, xmm0, xmm1, 136
    LONG $0x4061e2c4; BYTE $0xde // vpmulld xmm3, xmm3, xmm6
    LONG $0x64c6d8c5; WORD $0x50c7; BYTE $0x88 // vshufps xmm4, xmm4, oword [rdi + 8*rax + 80], 136
    LONG $0xf1c6f8c5; BYTE $0x88 // vshufps xmm6, xmm0, xmm1, 136
    LONG $0x4059e2c4; BYTE $0xe6 // vpmulld xmm4, xmm4, xmm6
    LONG $0x6cc6d0c5; WORD $0x70c7; BYTE $0x88 // vshufps xmm5, xmm5, oword [rdi + 8*rax + 112], 136
    LONG $0xf1c6f8c5; BYTE $0x88 // vshufps xmm6, xmm0, xmm1, 136
    LONG $0x4051e2c4; BYTE $0xee // vpmulld xmm5, xmm5, xmm6
    LONG $0x147ffac5; BYTE $0x86 // vmovdqu oword [rsi + 4*rax], xmm2
    LONG $0x5c7ffac5; WORD $0x1086 // vmovdqu oword [rsi + 4*rax + 16], xmm3
    LONG $0x647ffac5; WORD $0x2086 // vmovdqu oword [rsi + 4*rax + 32], xmm4
    LONG $0x6c7ffac5; WORD $0x3086 // vmovdqu oword [rsi + 4*rax + 48], xmm5
    LONG $0x10c08348 // add rax, 16
    WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
    JNE LBB4_4
    WORD $0x394c; BYTE $0xc2 // cmp rdx, r8
    JE LBB4_7

LBB4_6:
    WORD $0x048b; BYTE $0xd7 // mov eax, dword [rdi + 8*rdx]
    WORD $0xaf0f; BYTE $0xc1 // imul eax, ecx
    WORD $0x0489; BYTE $0x96 // mov dword [rsi + 4*rdx], eax
    LONG $0x01c28348 // add rdx, 1
    WORD $0x3949; BYTE $0xd0 // cmp r8, rdx
    JNE LBB4_6

LBB4_7:
    VZEROUPPER
    RET

TEXT ·_divide_constant_int64_int32_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB5_8
    WORD $0x8941; BYTE $0xd1 // mov r9d, edx
    WORD $0xfa83; BYTE $0x01 // cmp edx, 1
    JNE LBB5_9
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d

LBB5_3:
    LONG $0x01c1f641 // test r9b, 1
    JE LBB5_8
    LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB5_5
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB5_7

LBB5_9:
    WORD $0x8945; BYTE $0xca // mov r10d, r9d
    LONG $0xfee28341 // and r10d, -2
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    JMP LBB5_10

LBB5_15:
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx

LBB5_16:
    LONG $0x86448942; BYTE $0x04 // mov dword [rsi + 4*r8 + 4], eax
    LONG $0x02c08349 // add r8, 2
    WORD $0x394d; BYTE $0xc2 // cmp r10, r8
    JE LBB5_3

LBB5_10:
    LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB5_11
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB5_13

LBB5_11:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

LBB5_13:
    LONG $0x86048942 // mov dword [rsi + 4*r8], eax
    LONG $0xc7448b4a; BYTE $0x08 // mov rax, qword [rdi + 8*r8 + 8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JNE LBB5_15
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx
    JMP LBB5_16

LBB5_5:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

LBB5_7:
    LONG $0x86048942 // mov dword [rsi + 4*r8], eax

LBB5_8:
    RET

TEXT ·_multiply_constant_int64_int64_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB6_16
    WORD $0x8941; BYTE $0xd0 // mov r8d, edx
    WORD $0xfa83; BYTE $0x0f // cmp edx, 15
    JBE LBB6_2
    LONG $0xc7048d4a // lea rax, [rdi + 8*r8]
    WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
    JBE LBB6_9
    LONG $0xc6048d4a // lea rax, [rsi + 8*r8]
    WORD $0x3948; BYTE $0xf8 // cmp rax, rdi
    JBE LBB6_9

LBB6_2:
    WORD $0x3145; BYTE $0xdb // xor r11d, r11d

LBB6_3:
    WORD $0x894d; BYTE $0xd9 // mov r9, r11
    WORD $0xf749; BYTE $0xd1 // not r9
    WORD $0x014d; BYTE $0xc1 // add r9, r8
    WORD $0x894c; BYTE $0xc0 // mov rax, r8
    LONG $0x03e08348 // and rax, 3
    JE LBB6_5

LBB6_4:
    LONG $0xdf148b4a // mov rdx, qword [rdi + 8*r11]
    LONG $0xd1af0f48 // imul rdx, rcx
    LONG $0xde14894a // mov qword [rsi + 8*r11], rdx
    LONG $0x01c38349 // add r11, 1
    LONG $0xffc08348 // add rax, -1
    JNE LBB6_4

LBB6_5:
    LONG $0x03f98349 // cmp r9, 3
    JB LBB6_16

LBB6_6:
    LONG $0xdf048b4a // mov rax, qword [rdi + 8*r11]
    LONG $0xc1af0f48 // imul rax, rcx
    LONG $0xde04894a // mov qword [rsi + 8*r11], rax
    LONG $0xdf448b4a; BYTE $0x08 // mov rax, qword [rdi + 8*r11 + 8]
    LONG $0xc1af0f48 // imul rax, rcx
    LONG $0xde44894a; BYTE $0x08 // mov qword [rsi + 8*r11 + 8], rax
    LONG $0xdf448b4a; BYTE $0x10 // mov rax, qword [rdi + 8*r11 + 16]
    LONG $0xc1af0f48 // imul rax, rcx
    LONG $0xde44894a; BYTE $0x10 // mov qword [rsi + 8*r11 + 16], rax
    LONG $0xdf448b4a; BYTE $0x18 // mov rax, qword [rdi + 8*r11 + 24]
    LONG $0xc1af0f48 // imul rax, rcx
    LONG $0xde44894a; BYTE $0x18 // mov qword [rsi + 8*r11 + 24], rax
    LONG $0x04c38349 // add r11, 4
    WORD $0x394d; BYTE $0xd8 // cmp r8, r11
    JNE LBB6_6
    JMP LBB6_16

LBB6_9:
    WORD $0x8945; BYTE $0xc3 // mov r11d, r8d
    LONG $0xf0e38341 // and r11d, -16
    LONG $0x6ef9e1c4; BYTE $0xc1 // vmovq xmm0, rcx
    LONG $0x597de2c4; BYTE $0xc0 // vpbroadcastq ymm0, xmm0
    LONG $0xf0438d49 // lea rax, [r11 - 16]
    WORD $0x8949; BYTE $0xc1 // mov r9, rax
    LONG $0x04e9c149 // shr r9, 4
    LONG $0x01c18349 // add r9, 1
    LONG $0xd073f5c5; BYTE $0x20 // vpsrlq ymm1, ymm0, 32
    WORD $0x8548; BYTE $0xc0 // test rax, rax
    JE LBB6_10
    WORD $0x894d; BYTE $0xca // mov r10, r9
    LONG $0xfee28349 // and r10, -2
    WORD $0xf749; BYTE $0xda // neg r10
    WORD $0xc031 // xor eax, eax

LBB6_12:
    LONG $0x146ffec5; BYTE $0xc7 // vmovdqu ymm2, yword [rdi + 8*rax]
    LONG $0x5c6ffec5; WORD $0x20c7 // vmovdqu ymm3, yword [rdi + 8*rax + 32]
    LONG $0x646ffec5; WORD $0x40c7 // vmovdqu ymm4, yword [rdi + 8*rax + 64]
    LONG $0x6c6ffec5; WORD $0x60c7 // vmovdqu ymm5, yword [rdi + 8*rax + 96]
    LONG $0xf1f4edc5 // vpmuludq ymm6, ymm2, ymm1
    LONG $0xd273c5c5; BYTE $0x20 // vpsrlq ymm7, ymm2, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd0f4edc5 // vpmuludq ymm2, ymm2, ymm0
    LONG $0xd6d4edc5 // vpaddq ymm2, ymm2, ymm6
    LONG $0xf1f4e5c5 // vpmuludq ymm6, ymm3, ymm1
    LONG $0xd373c5c5; BYTE $0x20 // vpsrlq ymm7, ymm3, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd8f4e5c5 // vpmuludq ymm3, ymm3, ymm0
    LONG $0xded4e5c5 // vpaddq ymm3, ymm3, ymm6
    LONG $0xf1f4ddc5 // vpmuludq ymm6, ymm4, ymm1
    LONG $0xd473c5c5; BYTE $0x20 // vpsrlq ymm7, ymm4, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xe0f4ddc5 // vpmuludq ymm4, ymm4, ymm0
    LONG $0xe6d4ddc5 // vpaddq ymm4, ymm4, ymm6
    LONG $0xf1f4d5c5 // vpmuludq ymm6, ymm5, ymm1
    LONG $0xd573c5c5; BYTE $0x20 // vpsrlq ymm7, ymm5, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xe8f4d5c5 // vpmuludq ymm5, ymm5, ymm0
    LONG $0xeed4d5c5 // vpaddq ymm5, ymm5, ymm6
    LONG $0x147ffec5; BYTE $0xc6 // vmovdqu yword [rsi + 8*rax], ymm2
    LONG $0x5c7ffec5; WORD $0x20c6 // vmovdqu yword [rsi + 8*rax + 32], ymm3
    LONG $0x647ffec5; WORD $0x40c6 // vmovdqu yword [rsi + 8*rax + 64], ymm4
    LONG $0x6c7ffec5; WORD $0x60c6 // vmovdqu yword [rsi + 8*rax + 96], ymm5
    QUAD $0x000080c7946ffec5; BYTE $0x00 // vmovdqu ymm2, yword [rdi + 8*rax + 128]
    QUAD $0x0000a0c79c6ffec5; BYTE $0x00 // vmovdqu ymm3, yword [rdi + 8*rax + 160]
    QUAD $0x0000c0c7a46ffec5; BYTE $0x00 // vmovdqu ymm4, yword [rdi + 8*rax + 192]
    QUAD $0x0000e0c7ac6ffec5; BYTE $0x00 // vmovdqu ymm5, yword [rdi + 8*rax + 224]
    LONG $0xf1f4edc5 // vpmuludq ymm6, ymm2, ymm1
    LONG $0xd273c5c5; BYTE $0x20 // vpsrlq ymm7, ymm2, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd0f4edc5 // vpmuludq ymm2, ymm2, ymm0
    LONG $0xd6d4edc5 // vpaddq ymm2, ymm2, ymm6
    LONG $0xf1f4e5c5 // vpmuludq ymm6, ymm3, ymm1
    LONG $0xd373c5c5; BYTE $0x20 // vpsrlq ymm7, ymm3, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd8f4e5c5 // vpmuludq ymm3, ymm3, ymm0
    LONG $0xded4e5c5 // vpaddq ymm3, ymm3, ymm6
    LONG $0xf1f4ddc5 // vpmuludq ymm6, ymm4, ymm1
    LONG $0xd473c5c5; BYTE $0x20 // vpsrlq ymm7, ymm4, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xe0f4ddc5 // vpmuludq ymm4, ymm4, ymm0
    LONG $0xe6d4ddc5 // vpaddq ymm4, ymm4, ymm6
    LONG $0xf1f4d5c5 // vpmuludq ymm6, ymm5, ymm1
    LONG $0xd573c5c5; BYTE $0x20 // vpsrlq ymm7, ymm5, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xe8f4d5c5 // vpmuludq ymm5, ymm5, ymm0
    LONG $0xeed4d5c5 // vpaddq ymm5, ymm5, ymm6
    QUAD $0x000080c6947ffec5; BYTE $0x00 // vmovdqu yword [rsi + 8*rax + 128], ymm2
    QUAD $0x0000a0c69c7ffec5; BYTE $0x00 // vmovdqu yword [rsi + 8*rax + 160], ymm3
    QUAD $0x0000c0c6a47ffec5; BYTE $0x00 // vmovdqu yword [rsi + 8*rax + 192], ymm4
    QUAD $0x0000e0c6ac7ffec5; BYTE $0x00 // vmovdqu yword [rsi + 8*rax + 224], ymm5
    LONG $0x20c08348 // add rax, 32
    LONG $0x02c28349 // add r10, 2
    JNE LBB6_12
    LONG $0x01c1f641 // test r9b, 1
    JE LBB6_15

LBB6_14:
    LONG $0x146ffec5; BYTE $0xc7 // vmovdqu ymm2, yword [rdi + 8*rax]
    LONG $0x5c6ffec5; WORD $0x20c7 // vmovdqu ymm3, yword [rdi + 8*rax + 32]
    LONG $0x646ffec5; WORD $0x40c7 // vmovdqu ymm4, yword [rdi + 8*rax + 64]
    LONG $0x6c6ffec5; WORD $0x60c7 // vmovdqu ymm5, yword [rdi + 8*rax + 96]
    LONG $0xf1f4edc5 // vpmuludq ymm6, ymm2, ymm1
    LONG $0xd273c5c5; BYTE $0x20 // vpsrlq ymm7, ymm2, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd0f4edc5 // vpmuludq ymm2, ymm2, ymm0
    LONG $0xd6d4edc5 // vpaddq ymm2, ymm2, ymm6
    LONG $0xf1f4e5c5 // vpmuludq ymm6, ymm3, ymm1
    LONG $0xd373c5c5; BYTE $0x20 // vpsrlq ymm7, ymm3, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xd8f4e5c5 // vpmuludq ymm3, ymm3, ymm0
    LONG $0xded4e5c5 // vpaddq ymm3, ymm3, ymm6
    LONG $0xf1f4ddc5 // vpmuludq ymm6, ymm4, ymm1
    LONG $0xd473c5c5; BYTE $0x20 // vpsrlq ymm7, ymm4, 32
    LONG $0xf8f4c5c5 // vpmuludq ymm7, ymm7, ymm0
    LONG $0xf7d4cdc5 // vpaddq ymm6, ymm6, ymm7
    LONG $0xf673cdc5; BYTE $0x20 // vpsllq ymm6, ymm6, 32
    LONG $0xe0f4ddc5 // vpmuludq ymm4, ymm4, ymm0
    LONG $0xe6d4ddc5 // vpaddq ymm4, ymm4, ymm6
    LONG $0xc9f4d5c5 // vpmuludq ymm1, ymm5, ymm1
    LONG $0xd573cdc5; BYTE $0x20 // vpsrlq ymm6, ymm5, 32
    LONG $0xf0f4cdc5 // vpmuludq ymm6, ymm6, ymm0
    LONG $0xced4f5c5 // vpaddq ymm1, ymm1, ymm6
    LONG $0xf173f5c5; BYTE $0x20 // vpsllq ymm1, ymm1, 32
    LONG $0xc0f4d5c5 // vpmuludq ymm0, ymm5, ymm0
    LONG $0xc1d4fdc5 // vpaddq ymm0, ymm0, ymm1
    LONG $0x147ffec5; BYTE $0xc6 // vmovdqu yword [rsi + 8*rax], ymm2
    LONG $0x5c7ffec5; WORD $0x20c6 // vmovdqu yword [rsi + 8*rax + 32], ymm3
    LONG $0x647ffec5; WORD $0x40c6 // vmovdqu yword [rsi + 8*rax + 64], ymm4
    LONG $0x447ffec5; WORD $0x60c6 // vmovdqu yword [rsi + 8*rax + 96], ymm0

LBB6_15:
    WORD $0x394d; BYTE $0xc3 // cmp r11, r8
    JNE LBB6_3

LBB6_16:
    VZEROUPPER
    RET

LBB6_10:
    WORD $0xc031 // xor eax, eax
    LONG $0x01c1f641 // test r9b, 1
    JNE LBB6_14
    JMP LBB6_15

TEXT ·_divide_constant_int64_int64_avx2(SB), $0-32

    MOVQ src+0(FP), DI
    MOVQ dest+8(FP), SI
    MOVQ len+16(FP), DX
    MOVQ factor+24(FP), CX

    WORD $0xd285 // test edx, edx
    JLE LBB7_8
    WORD $0x8941; BYTE $0xd1 // mov r9d, edx
    WORD $0xfa83; BYTE $0x01 // cmp edx, 1
    JNE LBB7_9
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d

LBB7_3:
    LONG $0x01c1f641 // test r9b, 1
    JE LBB7_8
    LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB7_5
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB7_7

LBB7_9:
    WORD $0x8945; BYTE $0xca // mov r10d, r9d
    LONG $0xfee28341 // and r10d, -2
    WORD $0x3145; BYTE $0xc0 // xor r8d, r8d
    JMP LBB7_10

LBB7_15:
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx

LBB7_16:
    LONG $0xc644894a; BYTE $0x08 // mov qword [rsi + 8*r8 + 8], rax
    LONG $0x02c08349 // add r8, 2
    WORD $0x394d; BYTE $0xc2 // cmp r10, r8
    JE LBB7_3

LBB7_10:
    LONG $0xc7048b4a // mov rax, qword [rdi + 8*r8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JE LBB7_11
    WORD $0x9948 // cqo
    WORD $0xf748; BYTE $0xf9 // idiv rcx
    JMP LBB7_13

LBB7_11:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

LBB7_13:
    LONG $0xc604894a // mov qword [rsi + 8*r8], rax
    LONG $0xc7448b4a; BYTE $0x08 // mov rax, qword [rdi + 8*r8 + 8]
    WORD $0x8948; BYTE $0xc2 // mov rdx, rax
    WORD $0x0948; BYTE $0xca // or rdx, rcx
    LONG $0x20eac148 // shr rdx, 32
    JNE LBB7_15
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx
    JMP LBB7_16

LBB7_5:
    WORD $0xd231 // xor edx, edx
    WORD $0xf1f7 // div ecx

LBB7_7:
    LONG $0xc604894a // mov qword [rsi + 8*r8], rax

LBB7_8:
    RET
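
// Note: the stubs above follow the usual c2goasm calling pattern: four
// word-sized arguments (src pointer, dest pointer, element count, constant
// factor) in a $0-32 frame. As a minimal sketch of the Go side under that
// assumption -- the declaration and the helper below are illustrative, not
// the package's actual wrapper code:
//
//	//go:noescape
//	func _multiply_constant_int32_int32_avx2(src, dest unsafe.Pointer, len, factor int)
//
//	// Reference semantics of the int32 -> int32 multiply kernel:
//	// dest[i] = src[i] * factor for i in [0, len).
//	func multiplyConstantInt32Int32(src, dest []int32, factor int32) {
//		for i, v := range src {
//			dest[i] = v * factor
//		}
//	}
//
// The divide kernels correspondingly compute dest[i] = src[i] / factor,
// dividing the sign-extended source value by the 64-bit factor.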