# internal/utils/_lib/min_max_avx2_amd64.s
.text
.intel_syntax noprefix
.file "min_max.c"
.section .rodata.cst32,"aM",@progbits,32
.p2align 5 # -- Begin function int8_max_min_avx2
# Constant pool for int8_max_min_avx2.
# .LCPI0_0: 32 bytes of 0x80 (-128 as a signed byte); seeds the vector max accumulators.
.LCPI0_0:
.zero 32,128
# .LCPI0_1: 32 bytes of 0x7f (127); seeds the vector min accumulators.
.LCPI0_1:
.zero 32,127
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
# .LCPI0_2: 16 bytes of 0x7f; XOR mask applied before the horizontal max reduction.
.LCPI0_2:
.zero 16,127
# .LCPI0_3: 16 bytes of 0x80; XOR mask applied before the horizontal min reduction.
.LCPI0_3:
.zero 16,128
.text
.globl int8_max_min_avx2
.p2align 4, 0x90
.type int8_max_min_avx2,@function
# ---------------------------------------------------------------------
# int8_max_min_avx2: scan an array of signed 8-bit values and store its
# minimum and maximum through two output pointers.
# NOTE(review): presumably compiled from a C function shaped like
#   void int8_max_min_avx2(int8_t *values, int len, int8_t *minout, int8_t *maxout)
# in min_max.c; the signature is inferred from register use, confirm there.
# In:   rdi = base address of the values
#       esi = element count (32-bit; a count <= 0 is treated as empty)
#       rdx = address that receives the minimum (1 byte)
#       rcx = address that receives the maximum (1 byte)
# Out:  no register result. For an empty input the seeds are stored:
#       minimum = 127, maximum = -128.
# Uses: rax, rsi, r8, r9, r10, ymm0-ymm7, flags (rbp is saved and restored)
# ---------------------------------------------------------------------
int8_max_min_avx2: # @int8_max_min_avx2
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8 # rsp is restored from rbp before returning; no stack slots are used here
test esi, esi
jle .LBB0_1 # count <= 0: store the seed values only
# %bb.2:
mov r9d, esi # r9 = element count, zero-extended
cmp esi, 63
ja .LBB0_4 # count >= 64: take the vector path
# %bb.3:
# Fewer than 64 elements: seed the scalar accumulators and let the scalar loop do all the work.
mov r8b, -128 # r8b = running maximum
mov sil, 127 # sil = running minimum
xor r10d, r10d # r10 = index of the next element to examine
jmp .LBB0_11
.LBB0_1:
# Empty input: only the seeds are stored.
mov sil, 127
mov r8b, -128
jmp .LBB0_12
.LBB0_4:
# Vector path set-up. One "block" is 64 bytes (two ymm loads).
mov r10d, r9d
and r10d, -64 # r10 = count rounded down to a multiple of 64 = elements handled by vector code
lea rax, [r10 - 64]
mov r8, rax
shr r8, 6
add r8, 1 # r8 = number of 64-byte blocks
test rax, rax
je .LBB0_5 # exactly one block: skip the two-blocks-per-iteration loop
# %bb.6:
mov rsi, r8
and rsi, -2
neg rsi # rsi = -(block count rounded down to even); incremented by 2 until it reaches zero
vmovdqa ymm1, ymmword ptr [rip + .LCPI0_0] # ymm1 = 32 x 0x80 (-128): max accumulator A
vmovdqa ymm0, ymmword ptr [rip + .LCPI0_1] # ymm0 = 32 x 0x7f (127): min accumulator A
xor eax, eax # rax = byte offset into the input
vmovdqa ymm2, ymm0 # ymm2 = min accumulator B
vmovdqa ymm3, ymm1 # ymm3 = max accumulator B
.p2align 4, 0x90
.LBB0_7: # =>This Inner Loop Header: Depth=1
# Main loop: 128 bytes (two blocks) per iteration, two independent min/max accumulator pairs.
vmovdqu ymm4, ymmword ptr [rdi + rax]
vmovdqu ymm5, ymmword ptr [rdi + rax + 32]
vmovdqu ymm6, ymmword ptr [rdi + rax + 64]
vmovdqu ymm7, ymmword ptr [rdi + rax + 96]
vpminsb ymm0, ymm0, ymm4
vpminsb ymm2, ymm2, ymm5
vpmaxsb ymm1, ymm1, ymm4
vpmaxsb ymm3, ymm3, ymm5
vpminsb ymm0, ymm0, ymm6
vpminsb ymm2, ymm2, ymm7
vpmaxsb ymm1, ymm1, ymm6
vpmaxsb ymm3, ymm3, ymm7
sub rax, -128 # rax += 128
add rsi, 2
jne .LBB0_7
# %bb.8:
test r8b, 1
je .LBB0_10 # even block count: nothing left for the single-block step
.LBB0_9:
# Odd block count: fold in the one remaining 64-byte block.
vmovdqu ymm4, ymmword ptr [rdi + rax]
vmovdqu ymm5, ymmword ptr [rdi + rax + 32]
vpmaxsb ymm3, ymm3, ymm5
vpmaxsb ymm1, ymm1, ymm4
vpminsb ymm2, ymm2, ymm5
vpminsb ymm0, ymm0, ymm4
.LBB0_10:
# Horizontal reduction of the maximum.
# XOR with 0x7f turns the largest signed byte into the smallest unsigned byte, so the
# word-minimum instruction vphminposuw can be used; the XOR is undone on the scalar result.
vpmaxsb ymm1, ymm1, ymm3 # merge the two max accumulators
vextracti128 xmm3, ymm1, 1
vpmaxsb xmm1, xmm1, xmm3 # 32 lanes -> 16 lanes
vpxor xmm1, xmm1, xmmword ptr [rip + .LCPI0_2]
vpminsb ymm0, ymm0, ymm2 # merge the two min accumulators (interleaved with the max reduction)
vpsrlw xmm2, xmm1, 8 # move the high byte of every word into the low byte
vpminub xmm1, xmm1, xmm2 # low byte of each word = min of its two bytes, high byte = 0
vphminposuw xmm1, xmm1 # smallest unsigned word ends up in the low word
vmovd r8d, xmm1
xor r8b, 127 # undo the mapping: r8b = maximum of the vector part
# Horizontal reduction of the minimum; XOR with 0x80 maps signed order onto unsigned order.
vextracti128 xmm1, ymm0, 1
vpminsb xmm0, xmm0, xmm1
vpxor xmm0, xmm0, xmmword ptr [rip + .LCPI0_3]
vpsrlw xmm1, xmm0, 8
vpminub xmm0, xmm0, xmm1
vphminposuw xmm0, xmm0
vmovd esi, xmm0
xor sil, -128 # undo the mapping: sil = minimum of the vector part
cmp r10, r9
je .LBB0_12 # count was a multiple of 64: no scalar tail
.p2align 4, 0x90
.LBB0_11: # =>This Inner Loop Header: Depth=1
# Scalar loop over elements r10 .. r9-1; sil = running min, r8b = running max.
movzx eax, byte ptr [rdi + r10]
cmp sil, al
movzx esi, sil
cmovg esi, eax # signed: min > value -> min = value
cmp r8b, al
movzx r8d, r8b
cmovl r8d, eax # signed: max < value -> max = value
add r10, 1
cmp r9, r10
jne .LBB0_11
.LBB0_12:
# Store the results and return.
mov byte ptr [rcx], r8b # maximum
mov byte ptr [rdx], sil # minimum
mov rsp, rbp
pop rbp
vzeroupper
ret
.LBB0_5:
# Single-block case: seed the accumulators exactly as above, then rejoin the
# common code (r8 is 1 here, so the branch to .LBB0_9 is taken).
vmovdqa ymm1, ymmword ptr [rip + .LCPI0_0] # ymm1 = 32 x 0x80 (-128): max accumulator A
vmovdqa ymm0, ymmword ptr [rip + .LCPI0_1] # ymm0 = 32 x 0x7f (127): min accumulator A
xor eax, eax
vmovdqa ymm2, ymm0
vmovdqa ymm3, ymm1
test r8b, 1
jne .LBB0_9
jmp .LBB0_10
.Lfunc_end0:
.size int8_max_min_avx2, .Lfunc_end0-int8_max_min_avx2
# -- End function
.globl uint8_max_min_avx2 # -- Begin function uint8_max_min_avx2
.p2align 4, 0x90
.type uint8_max_min_avx2,@function
# ---------------------------------------------------------------------
# uint8_max_min_avx2: scan an array of unsigned 8-bit values and store
# its minimum and maximum through two output pointers.
# NOTE(review): presumably compiled from a C function shaped like
#   void uint8_max_min_avx2(uint8_t *values, int len, uint8_t *minout, uint8_t *maxout)
# in min_max.c; the signature is inferred from register use, confirm there.
# In:   rdi = base address of the values
#       esi = element count (32-bit; a count <= 0 is treated as empty)
#       rdx = address that receives the minimum (1 byte)
#       rcx = address that receives the maximum (1 byte)
# Out:  no register result. For an empty input the seeds are stored:
#       minimum = 0xff, maximum = 0.
# Uses: rax, rsi, r8, r9, r10, ymm0-ymm7, flags (rbp is saved and restored)
# ---------------------------------------------------------------------
uint8_max_min_avx2: # @uint8_max_min_avx2
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8 # rsp is restored from rbp before returning; no stack slots are used here
test esi, esi
jle .LBB1_1 # count <= 0: store the seed values only
# %bb.2:
mov r9d, esi # r9 = element count, zero-extended
cmp esi, 63
ja .LBB1_4 # count >= 64: take the vector path
# %bb.3:
# Fewer than 64 elements: seed the scalar accumulators and use the scalar loop only.
mov sil, -1 # sil = running minimum (0xff)
xor r10d, r10d # r10 = index of the next element to examine
xor eax, eax # al = running maximum (0)
jmp .LBB1_11
.LBB1_1:
# Empty input: only the seeds are stored.
mov sil, -1
xor eax, eax
jmp .LBB1_12
.LBB1_4:
# Vector path set-up. One "block" is 64 bytes (two ymm loads).
mov r10d, r9d
and r10d, -64 # r10 = count rounded down to a multiple of 64 = elements handled by vector code
lea rax, [r10 - 64]
mov r8, rax
shr r8, 6
add r8, 1 # r8 = number of 64-byte blocks
test rax, rax
je .LBB1_5 # exactly one block: skip the two-blocks-per-iteration loop
# %bb.6:
mov rsi, r8
and rsi, -2
neg rsi # rsi = -(block count rounded down to even); incremented by 2 until it reaches zero
vpxor xmm0, xmm0, xmm0 # ymm0 = all zero: max accumulator A
vpcmpeqd ymm1, ymm1, ymm1 # ymm1 = all ones: min accumulator A
xor eax, eax # rax = byte offset into the input
vpcmpeqd ymm2, ymm2, ymm2 # ymm2 = all ones: min accumulator B
vpxor xmm3, xmm3, xmm3 # ymm3 = all zero: max accumulator B
.p2align 4, 0x90
.LBB1_7: # =>This Inner Loop Header: Depth=1
# Main loop: 128 bytes (two blocks) per iteration, two independent min/max accumulator pairs.
vmovdqu ymm4, ymmword ptr [rdi + rax]
vmovdqu ymm5, ymmword ptr [rdi + rax + 32]
vmovdqu ymm6, ymmword ptr [rdi + rax + 64]
vmovdqu ymm7, ymmword ptr [rdi + rax + 96]
vpminub ymm1, ymm1, ymm4
vpminub ymm2, ymm2, ymm5
vpmaxub ymm0, ymm0, ymm4
vpmaxub ymm3, ymm3, ymm5
vpminub ymm1, ymm1, ymm6
vpminub ymm2, ymm2, ymm7
vpmaxub ymm0, ymm0, ymm6
vpmaxub ymm3, ymm3, ymm7
sub rax, -128 # rax += 128
add rsi, 2
jne .LBB1_7
# %bb.8:
test r8b, 1
je .LBB1_10 # even block count: nothing left for the single-block step
.LBB1_9:
# Odd block count: fold in the one remaining 64-byte block.
vmovdqu ymm4, ymmword ptr [rdi + rax]
vmovdqu ymm5, ymmword ptr [rdi + rax + 32]
vpmaxub ymm3, ymm3, ymm5
vpmaxub ymm0, ymm0, ymm4
vpminub ymm2, ymm2, ymm5
vpminub ymm1, ymm1, ymm4
.LBB1_10:
# Horizontal reduction. The maximum is found as the minimum of the bitwise
# complement (vphminposuw only computes a minimum) and complemented back.
vpminub ymm1, ymm1, ymm2 # merge the two min accumulators
vpmaxub ymm0, ymm0, ymm3 # merge the two max accumulators
vextracti128 xmm2, ymm0, 1
vpmaxub xmm0, xmm0, xmm2 # 32 lanes -> 16 lanes
vpcmpeqd xmm2, xmm2, xmm2 # xmm2 = all ones
vpxor xmm0, xmm0, xmm2 # complement every byte
vpsrlw xmm2, xmm0, 8 # move the high byte of every word into the low byte
vpminub xmm0, xmm0, xmm2 # low byte of each word = min of its two bytes, high byte = 0
vphminposuw xmm0, xmm0 # smallest unsigned word ends up in the low word
vmovd eax, xmm0
not al # undo the complement: al = maximum of the vector part
vextracti128 xmm0, ymm1, 1
vpminub xmm0, xmm1, xmm0 # 32 lanes -> 16 lanes
vpsrlw xmm1, xmm0, 8
vpminub xmm0, xmm0, xmm1
vphminposuw xmm0, xmm0
vmovd esi, xmm0 # sil = minimum of the vector part
cmp r10, r9
je .LBB1_12 # count was a multiple of 64: no scalar tail
.p2align 4, 0x90
.LBB1_11: # =>This Inner Loop Header: Depth=1
# Scalar loop over elements r10 .. r9-1; sil = running min, al = running max.
movzx r8d, byte ptr [rdi + r10]
cmp sil, r8b
movzx esi, sil
cmovae esi, r8d # unsigned: min >= value -> min = value
cmp al, r8b
movzx eax, al
cmovbe eax, r8d # unsigned: max <= value -> max = value
add r10, 1
cmp r9, r10
jne .LBB1_11
.LBB1_12:
# Store the results and return.
mov byte ptr [rcx], al # maximum
mov byte ptr [rdx], sil # minimum
mov rsp, rbp
pop rbp
vzeroupper
ret
.LBB1_5:
# Single-block case: seed the accumulators exactly as above, then rejoin the
# common code (r8 is 1 here, so the branch to .LBB1_9 is taken).
vpxor xmm0, xmm0, xmm0
vpcmpeqd ymm1, ymm1, ymm1
xor eax, eax
vpcmpeqd ymm2, ymm2, ymm2
vpxor xmm3, xmm3, xmm3
test r8b, 1
jne .LBB1_9
jmp .LBB1_10
.Lfunc_end1:
.size uint8_max_min_avx2, .Lfunc_end1-uint8_max_min_avx2
# -- End function
.section .rodata.cst32,"aM",@progbits,32
.p2align 5 # -- Begin function int16_max_min_avx2
# Constant pool for int16_max_min_avx2.
# .LCPI2_0: sixteen words of 0x8000 (-32768 as a signed word); seeds the vector max accumulators.
.LCPI2_0:
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
# .LCPI2_1: sixteen words of 0x7fff (32767); seeds the vector min accumulators.
.LCPI2_1:
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.section .rodata.cst16,"aM",@progbits,16
.p2align 4
# .LCPI2_2: eight words of 0x7fff; XOR mask applied before the horizontal max reduction.
.LCPI2_2:
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
.short 32767 # 0x7fff
# .LCPI2_3: eight words of 0x8000; XOR mask applied before the horizontal min reduction.
.LCPI2_3:
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.short 32768 # 0x8000
.text
.globl int16_max_min_avx2
.p2align 4, 0x90
.type int16_max_min_avx2,@function
# ---------------------------------------------------------------------
# int16_max_min_avx2: scan an array of signed 16-bit values and store
# its minimum and maximum through two output pointers.
# NOTE(review): presumably compiled from a C function shaped like
#   void int16_max_min_avx2(int16_t *values, int len, int16_t *minout, int16_t *maxout)
# in min_max.c; the signature is inferred from register use, confirm there.
# In:   rdi = base address of the values
#       esi = element count (32-bit; a count <= 0 is treated as empty)
#       rdx = address that receives the minimum (2 bytes)
#       rcx = address that receives the maximum (2 bytes)
# Out:  no register result. For an empty input the seeds are stored:
#       minimum = 32767, maximum = -32768.
# Uses: rax, rsi, r8, r9, r10, ymm0-ymm7, flags (rbp is saved and restored)
# ---------------------------------------------------------------------
int16_max_min_avx2: # @int16_max_min_avx2
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8 # rsp is restored from rbp before returning; no stack slots are used here
test esi, esi
jle .LBB2_1 # count <= 0: store the seed values only
# %bb.2:
mov r9d, esi # r9 = element count, zero-extended
cmp esi, 31
ja .LBB2_4 # count >= 32: take the vector path
# %bb.3:
# Fewer than 32 elements: seed the scalar accumulators and use the scalar loop only.
mov r8w, -32768 # r8w = running maximum
mov si, 32767 # si = running minimum
xor r10d, r10d # r10 = index of the next element to examine
jmp .LBB2_11
.LBB2_1:
# Empty input: only the seeds are stored.
mov si, 32767
mov r8w, -32768
jmp .LBB2_12
.LBB2_4:
# Vector path set-up. One "block" is 32 elements (two ymm loads, 64 bytes).
mov r10d, r9d
and r10d, -32 # r10 = count rounded down to a multiple of 32 = elements handled by vector code
lea rax, [r10 - 32]
mov r8, rax
shr r8, 5
add r8, 1 # r8 = number of 32-element blocks
test rax, rax
je .LBB2_5 # exactly one block: skip the two-blocks-per-iteration loop
# %bb.6:
mov rsi, r8
and rsi, -2
neg rsi # rsi = -(block count rounded down to even); incremented by 2 until it reaches zero
vmovdqa ymm1, ymmword ptr [rip + .LCPI2_0] # ymm1 = 16 x 0x8000 (-32768): max accumulator A
vmovdqa ymm0, ymmword ptr [rip + .LCPI2_1] # ymm0 = 16 x 0x7fff (32767): min accumulator A
xor eax, eax # rax = element index into the input
vmovdqa ymm2, ymm0 # ymm2 = min accumulator B
vmovdqa ymm3, ymm1 # ymm3 = max accumulator B
.p2align 4, 0x90
.LBB2_7: # =>This Inner Loop Header: Depth=1
# Main loop: 64 elements (two blocks) per iteration, two independent min/max accumulator pairs.
vmovdqu ymm4, ymmword ptr [rdi + 2*rax]
vmovdqu ymm5, ymmword ptr [rdi + 2*rax + 32]
vmovdqu ymm6, ymmword ptr [rdi + 2*rax + 64]
vmovdqu ymm7, ymmword ptr [rdi + 2*rax + 96]
vpminsw ymm0, ymm0, ymm4
vpminsw ymm2, ymm2, ymm5
vpmaxsw ymm1, ymm1, ymm4
vpmaxsw ymm3, ymm3, ymm5
vpminsw ymm0, ymm0, ymm6
vpminsw ymm2, ymm2, ymm7
vpmaxsw ymm1, ymm1, ymm6
vpmaxsw ymm3, ymm3, ymm7
add rax, 64
add rsi, 2
jne .LBB2_7
# %bb.8:
test r8b, 1
je .LBB2_10 # even block count: nothing left for the single-block step
.LBB2_9:
# Odd block count: fold in the one remaining 32-element block.
vmovdqu ymm4, ymmword ptr [rdi + 2*rax]
vmovdqu ymm5, ymmword ptr [rdi + 2*rax + 32]
vpmaxsw ymm3, ymm3, ymm5
vpmaxsw ymm1, ymm1, ymm4
vpminsw ymm2, ymm2, ymm5
vpminsw ymm0, ymm0, ymm4
.LBB2_10:
# Horizontal reduction of the maximum.
# XOR with 0x7fff turns the largest signed word into the smallest unsigned word, so
# vphminposuw can be used; the XOR is undone on the scalar result.
vpmaxsw ymm1, ymm1, ymm3 # merge the two max accumulators
vextracti128 xmm3, ymm1, 1
vpmaxsw xmm1, xmm1, xmm3 # 16 lanes -> 8 lanes
vpxor xmm1, xmm1, xmmword ptr [rip + .LCPI2_2]
vpminsw ymm0, ymm0, ymm2 # merge the two min accumulators (interleaved with the max reduction)
vphminposuw xmm1, xmm1 # smallest unsigned word ends up in the low word
vmovd r8d, xmm1
xor r8d, 32767 # undo the mapping; only r8w is used afterwards
# Horizontal reduction of the minimum; XOR with 0x8000 maps signed order onto unsigned order.
vextracti128 xmm1, ymm0, 1
vpminsw xmm0, xmm0, xmm1
vpxor xmm0, xmm0, xmmword ptr [rip + .LCPI2_3]
vphminposuw xmm0, xmm0
vmovd esi, xmm0
xor esi, 32768 # undo the mapping; only si is used afterwards
cmp r10, r9
je .LBB2_12 # count was a multiple of 32: no scalar tail
.p2align 4, 0x90
.LBB2_11: # =>This Inner Loop Header: Depth=1
# Scalar loop over elements r10 .. r9-1; si = running min, r8w = running max.
movzx eax, word ptr [rdi + 2*r10]
cmp si, ax
cmovg esi, eax # signed: min > value -> min = value
cmp r8w, ax
cmovl r8d, eax # signed: max < value -> max = value
add r10, 1
cmp r9, r10
jne .LBB2_11
.LBB2_12:
# Store the results and return.
mov word ptr [rcx], r8w # maximum
mov word ptr [rdx], si # minimum
mov rsp, rbp
pop rbp
vzeroupper
ret
.LBB2_5:
# Single-block case: seed the accumulators exactly as above, then rejoin the
# common code (r8 is 1 here, so the branch to .LBB2_9 is taken).
vmovdqa ymm1, ymmword ptr [rip + .LCPI2_0] # ymm1 = 16 x 0x8000 (-32768): max accumulator A
vmovdqa ymm0, ymmword ptr [rip + .LCPI2_1] # ymm0 = 16 x 0x7fff (32767): min accumulator A
xor eax, eax
vmovdqa ymm2, ymm0
vmovdqa ymm3, ymm1
test r8b, 1
jne .LBB2_9
jmp .LBB2_10
.Lfunc_end2:
.size int16_max_min_avx2, .Lfunc_end2-int16_max_min_avx2
# -- End function
.globl uint16_max_min_avx2 # -- Begin function uint16_max_min_avx2
.p2align 4, 0x90
.type uint16_max_min_avx2,@function
# ---------------------------------------------------------------------
# uint16_max_min_avx2: scan an array of unsigned 16-bit values and store
# its minimum and maximum through two output pointers.
# NOTE(review): presumably compiled from a C function shaped like
#   void uint16_max_min_avx2(uint16_t *values, int len, uint16_t *minout, uint16_t *maxout)
# in min_max.c; the signature is inferred from register use, confirm there.
# In:   rdi = base address of the values
#       esi = element count (32-bit; a count <= 0 is treated as empty)
#       rdx = address that receives the minimum (2 bytes)
#       rcx = address that receives the maximum (2 bytes)
# Out:  no register result. For an empty input the seeds are stored:
#       minimum = 0xffff, maximum = 0.
# Uses: rax, rsi, r8, r9, r10, ymm0-ymm7, flags (rbp is saved and restored)
# Note: unlike the 8-bit routines, the minimum lives in r8w and the maximum in si.
# ---------------------------------------------------------------------
uint16_max_min_avx2: # @uint16_max_min_avx2
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8 # rsp is restored from rbp before returning; no stack slots are used here
test esi, esi
jle .LBB3_1 # count <= 0: store the seed values only
# %bb.2:
mov r9d, esi # r9 = element count, zero-extended
cmp esi, 31
ja .LBB3_4 # count >= 32: take the vector path
# %bb.3:
# Fewer than 32 elements: seed the scalar accumulators and use the scalar loop only.
mov r8w, -1 # r8w = running minimum (0xffff)
xor r10d, r10d # r10 = index of the next element to examine
xor esi, esi # si = running maximum (0)
jmp .LBB3_11
.LBB3_1:
# Empty input: only the seeds are stored.
mov r8w, -1
xor esi, esi
jmp .LBB3_12
.LBB3_4:
# Vector path set-up. One "block" is 32 elements (two ymm loads, 64 bytes).
mov r10d, r9d
and r10d, -32 # r10 = count rounded down to a multiple of 32 = elements handled by vector code
lea rax, [r10 - 32]
mov r8, rax
shr r8, 5
add r8, 1 # r8 = number of 32-element blocks
test rax, rax
je .LBB3_5 # exactly one block: skip the two-blocks-per-iteration loop
# %bb.6:
mov rsi, r8
and rsi, -2
neg rsi # rsi = -(block count rounded down to even); incremented by 2 until it reaches zero
vpxor xmm0, xmm0, xmm0 # ymm0 = all zero: max accumulator A
vpcmpeqd ymm1, ymm1, ymm1 # ymm1 = all ones: min accumulator A
xor eax, eax # rax = element index into the input
vpcmpeqd ymm2, ymm2, ymm2 # ymm2 = all ones: min accumulator B
vpxor xmm3, xmm3, xmm3 # ymm3 = all zero: max accumulator B
.p2align 4, 0x90
.LBB3_7: # =>This Inner Loop Header: Depth=1
# Main loop: 64 elements (two blocks) per iteration, two independent min/max accumulator pairs.
vmovdqu ymm4, ymmword ptr [rdi + 2*rax]
vmovdqu ymm5, ymmword ptr [rdi + 2*rax + 32]
vmovdqu ymm6, ymmword ptr [rdi + 2*rax + 64]
vmovdqu ymm7, ymmword ptr [rdi + 2*rax + 96]
vpminuw ymm1, ymm1, ymm4
vpminuw ymm2, ymm2, ymm5
vpmaxuw ymm0, ymm0, ymm4
vpmaxuw ymm3, ymm3, ymm5
vpminuw ymm1, ymm1, ymm6
vpminuw ymm2, ymm2, ymm7
vpmaxuw ymm0, ymm0, ymm6
vpmaxuw ymm3, ymm3, ymm7
add rax, 64
add rsi, 2
jne .LBB3_7
# %bb.8:
test r8b, 1
je .LBB3_10 # even block count: nothing left for the single-block step
.LBB3_9:
# Odd block count: fold in the one remaining 32-element block.
vmovdqu ymm4, ymmword ptr [rdi + 2*rax]
vmovdqu ymm5, ymmword ptr [rdi + 2*rax + 32]
vpmaxuw ymm3, ymm3, ymm5
vpmaxuw ymm0, ymm0, ymm4
vpminuw ymm2, ymm2, ymm5
vpminuw ymm1, ymm1, ymm4
.LBB3_10:
# Horizontal reduction. The maximum is found as the minimum of the bitwise
# complement (vphminposuw only computes a minimum) and complemented back.
vpminuw ymm1, ymm1, ymm2 # merge the two min accumulators
vpmaxuw ymm0, ymm0, ymm3 # merge the two max accumulators
vextracti128 xmm2, ymm0, 1
vpmaxuw xmm0, xmm0, xmm2 # 16 lanes -> 8 lanes
vpcmpeqd xmm2, xmm2, xmm2 # xmm2 = all ones
vpxor xmm0, xmm0, xmm2 # complement every word
vphminposuw xmm0, xmm0 # smallest unsigned word ends up in the low word
vmovd esi, xmm0
not esi # undo the complement; only si is used afterwards
vextracti128 xmm0, ymm1, 1
vpminuw xmm0, xmm1, xmm0 # 16 lanes -> 8 lanes
vphminposuw xmm0, xmm0
vmovd r8d, xmm0 # r8w = minimum of the vector part
cmp r10, r9
je .LBB3_12 # count was a multiple of 32: no scalar tail
.p2align 4, 0x90
.LBB3_11: # =>This Inner Loop Header: Depth=1
# Scalar loop over elements r10 .. r9-1; r8w = running min, si = running max.
movzx eax, word ptr [rdi + 2*r10]
cmp r8w, ax
cmovae r8d, eax # unsigned: min >= value -> min = value
cmp si, ax
cmovbe esi, eax # unsigned: max <= value -> max = value
add r10, 1
cmp r9, r10
jne .LBB3_11
.LBB3_12:
# Store the results and return.
mov word ptr [rcx], si # maximum
mov word ptr [rdx], r8w # minimum
mov rsp, rbp
pop rbp
vzeroupper
ret
.LBB3_5:
# Single-block case: seed the accumulators exactly as above, then rejoin the
# common code (r8 is 1 here, so the branch to .LBB3_9 is taken).
vpxor xmm0, xmm0, xmm0
vpcmpeqd ymm1, ymm1, ymm1
xor eax, eax
vpcmpeqd ymm2, ymm2, ymm2
vpxor xmm3, xmm3, xmm3
test r8b, 1
jne .LBB3_9
jmp .LBB3_10
.Lfunc_end3:
.size uint16_max_min_avx2, .Lfunc_end3-uint16_max_min_avx2
# -- End function
.section .rodata.cst4,"aM",@progbits,4
.p2align 2 # -- Begin function int32_max_min_avx2
# Constant pool for int32_max_min_avx2 (each value is broadcast to a full ymm register).
# .LCPI4_0: 0x80000000 (-2147483648 as a signed dword); seed for the max accumulators.
.LCPI4_0:
.long 2147483648 # 0x80000000
# .LCPI4_1: 0x7fffffff (2147483647); seed for the min accumulators.
.LCPI4_1:
.long 2147483647 # 0x7fffffff
.text
.globl int32_max_min_avx2
.p2align 4, 0x90
.type int32_max_min_avx2,@function
# ---------------------------------------------------------------------
# int32_max_min_avx2: scan an array of signed 32-bit values and store
# its minimum and maximum through two output pointers.
# NOTE(review): presumably compiled from a C function shaped like
#   void int32_max_min_avx2(int32_t *values, int len, int32_t *minout, int32_t *maxout)
# in min_max.c; the signature is inferred from register use, confirm there.
# In:   rdi = base address of the values
#       esi = element count (32-bit; a count <= 0 is treated as empty)
#       rdx = address that receives the minimum (4 bytes)
#       rcx = address that receives the maximum (4 bytes)
# Out:  no register result. For an empty input the seeds are stored:
#       minimum = 2147483647, maximum = -2147483648.
# Uses: rax, rsi, r8, r9, r10, ymm0-ymm11, flags (rbp is saved and restored)
# ---------------------------------------------------------------------
int32_max_min_avx2: # @int32_max_min_avx2
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8 # rsp is restored from rbp before returning; no stack slots are used here
test esi, esi
jle .LBB4_1 # count <= 0: store the seed values only
# %bb.2:
mov r8d, esi # r8 = element count, zero-extended
cmp esi, 31
ja .LBB4_4 # count >= 32: take the vector path
# %bb.3:
# Fewer than 32 elements: seed the scalar accumulators and use the scalar loop only.
mov r10d, -2147483648 # r10d = running maximum
mov eax, 2147483647 # eax = running minimum
xor r9d, r9d # r9 = index of the next element to examine
jmp .LBB4_7
.LBB4_1:
# Empty input: only the seeds are stored (esi is the register stored as the maximum).
mov eax, 2147483647
mov esi, -2147483648
jmp .LBB4_8
.LBB4_4:
# Vector path: four min accumulators (ymm0-ymm3) and four max accumulators (ymm4-ymm7).
mov r9d, r8d
vpbroadcastd ymm4, dword ptr [rip + .LCPI4_0] # ymm4 = 8 x 0x80000000: max seed
and r9d, -32 # r9 = count rounded down to a multiple of 32 = elements handled by vector code
vpbroadcastd ymm0, dword ptr [rip + .LCPI4_1] # ymm0 = 8 x 0x7fffffff: min seed
xor eax, eax # rax = element index into the input
vmovdqa ymm1, ymm0
vmovdqa ymm2, ymm0
vmovdqa ymm3, ymm0
vmovdqa ymm5, ymm4
vmovdqa ymm6, ymm4
vmovdqa ymm7, ymm4
.p2align 4, 0x90
.LBB4_5: # =>This Inner Loop Header: Depth=1
# Main loop: 32 elements (four ymm loads) per iteration.
vmovdqu ymm8, ymmword ptr [rdi + 4*rax]
vmovdqu ymm9, ymmword ptr [rdi + 4*rax + 32]
vmovdqu ymm10, ymmword ptr [rdi + 4*rax + 64]
vmovdqu ymm11, ymmword ptr [rdi + 4*rax + 96]
vpminsd ymm0, ymm0, ymm8
vpminsd ymm1, ymm1, ymm9
vpminsd ymm2, ymm2, ymm10
vpminsd ymm3, ymm3, ymm11
vpmaxsd ymm4, ymm4, ymm8
vpmaxsd ymm5, ymm5, ymm9
vpmaxsd ymm6, ymm6, ymm10
vpmaxsd ymm7, ymm7, ymm11
add rax, 32
cmp r9, rax
jne .LBB4_5
# %bb.6:
# Horizontal reduction of the maximum: 4 accumulators -> 1, then 8 -> 4 -> 2 -> 1 lanes.
vpmaxsd ymm4, ymm4, ymm5
vpmaxsd ymm4, ymm4, ymm6
vpmaxsd ymm4, ymm4, ymm7
vextracti128 xmm5, ymm4, 1
vpmaxsd xmm4, xmm4, xmm5
vpshufd xmm5, xmm4, 78 # xmm5 = xmm4[2,3,0,1]
vpmaxsd xmm4, xmm4, xmm5
vpshufd xmm5, xmm4, 229 # xmm5 = xmm4[1,1,2,3]
vpmaxsd xmm4, xmm4, xmm5
vmovd r10d, xmm4 # r10d = maximum of the vector part
# Horizontal reduction of the minimum, same shape.
vpminsd ymm0, ymm0, ymm1
vpminsd ymm0, ymm0, ymm2
vpminsd ymm0, ymm0, ymm3
vextracti128 xmm1, ymm0, 1
vpminsd xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1]
vpminsd xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 229 # xmm1 = xmm0[1,1,2,3]
vpminsd xmm0, xmm0, xmm1
vmovd eax, xmm0 # eax = minimum of the vector part
mov esi, r10d # esi = maximum, in the register the final store reads
cmp r9, r8
je .LBB4_8 # count was a multiple of 32: no scalar tail
.p2align 4, 0x90
.LBB4_7: # =>This Inner Loop Header: Depth=1
# Scalar loop over elements r9 .. r8-1; eax = running min, r10d = running max.
# esi holds the loaded value and, after the second cmov, the updated maximum.
mov esi, dword ptr [rdi + 4*r9]
cmp eax, esi
cmovg eax, esi # signed: min > value -> min = value
cmp r10d, esi
cmovge esi, r10d # signed: max >= value -> keep the old max
add r9, 1
mov r10d, esi
cmp r8, r9
jne .LBB4_7
.LBB4_8:
# Store the results and return.
mov dword ptr [rcx], esi # maximum
mov dword ptr [rdx], eax # minimum
mov rsp, rbp
pop rbp
vzeroupper
ret
.Lfunc_end4:
.size int32_max_min_avx2, .Lfunc_end4-int32_max_min_avx2
# -- End function
.globl uint32_max_min_avx2 # -- Begin function uint32_max_min_avx2
.p2align 4, 0x90
.type uint32_max_min_avx2,@function
# ---------------------------------------------------------------------
# uint32_max_min_avx2: scan an array of unsigned 32-bit values and store
# its minimum and maximum through two output pointers.
# NOTE(review): presumably compiled from a C function shaped like
#   void uint32_max_min_avx2(uint32_t *values, int len, uint32_t *minout, uint32_t *maxout)
# in min_max.c; the signature is inferred from register use, confirm there.
# In:   rdi = base address of the values
#       esi = element count (32-bit; a count <= 0 is treated as empty)
#       rdx = address that receives the minimum (4 bytes)
#       rcx = address that receives the maximum (4 bytes)
# Out:  no register result. For an empty input the seeds are stored:
#       minimum = 0xffffffff, maximum = 0.
# Uses: rax, rsi, r8, r9, r10, ymm0-ymm11, flags (rbp is saved and restored)
# ---------------------------------------------------------------------
uint32_max_min_avx2: # @uint32_max_min_avx2
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8 # rsp is restored from rbp before returning; no stack slots are used here
test esi, esi
jle .LBB5_1 # count <= 0: store the seed values only
# %bb.2:
mov r8d, esi # r8 = element count, zero-extended
cmp esi, 31
ja .LBB5_4 # count >= 32: take the vector path
# %bb.3:
# Fewer than 32 elements: seed the scalar accumulators and use the scalar loop only.
xor r9d, r9d # r9 = index of the next element to examine
mov eax, -1 # eax = running minimum (0xffffffff)
xor r10d, r10d # r10d = running maximum (0)
jmp .LBB5_7
.LBB5_1:
# Empty input: only the seeds are stored (esi is the register stored as the maximum).
mov eax, -1
xor esi, esi
jmp .LBB5_8
.LBB5_4:
# Vector path: four min accumulators (ymm0-ymm3, all ones) and four max accumulators (ymm4-ymm7, zero).
mov r9d, r8d
and r9d, -32 # r9 = count rounded down to a multiple of 32 = elements handled by vector code
vpxor xmm4, xmm4, xmm4
vpcmpeqd ymm0, ymm0, ymm0
xor eax, eax # rax = element index into the input
vpcmpeqd ymm1, ymm1, ymm1
vpcmpeqd ymm2, ymm2, ymm2
vpcmpeqd ymm3, ymm3, ymm3
vpxor xmm5, xmm5, xmm5
vpxor xmm6, xmm6, xmm6
vpxor xmm7, xmm7, xmm7
.p2align 4, 0x90
.LBB5_5: # =>This Inner Loop Header: Depth=1
# Main loop: 32 elements (four ymm loads) per iteration.
vmovdqu ymm8, ymmword ptr [rdi + 4*rax]
vmovdqu ymm9, ymmword ptr [rdi + 4*rax + 32]
vmovdqu ymm10, ymmword ptr [rdi + 4*rax + 64]
vmovdqu ymm11, ymmword ptr [rdi + 4*rax + 96]
vpminud ymm0, ymm0, ymm8
vpminud ymm1, ymm1, ymm9
vpminud ymm2, ymm2, ymm10
vpminud ymm3, ymm3, ymm11
vpmaxud ymm4, ymm4, ymm8
vpmaxud ymm5, ymm5, ymm9
vpmaxud ymm6, ymm6, ymm10
vpmaxud ymm7, ymm7, ymm11
add rax, 32
cmp r9, rax
jne .LBB5_5
# %bb.6:
# Horizontal reduction of the maximum: 4 accumulators -> 1, then 8 -> 4 -> 2 -> 1 lanes.
vpmaxud ymm4, ymm4, ymm5
vpmaxud ymm4, ymm4, ymm6
vpmaxud ymm4, ymm4, ymm7
vextracti128 xmm5, ymm4, 1
vpmaxud xmm4, xmm4, xmm5
vpshufd xmm5, xmm4, 78 # xmm5 = xmm4[2,3,0,1]
vpmaxud xmm4, xmm4, xmm5
vpshufd xmm5, xmm4, 229 # xmm5 = xmm4[1,1,2,3]
vpmaxud xmm4, xmm4, xmm5
vmovd r10d, xmm4 # r10d = maximum of the vector part
# Horizontal reduction of the minimum, same shape.
vpminud ymm0, ymm0, ymm1
vpminud ymm0, ymm0, ymm2
vpminud ymm0, ymm0, ymm3
vextracti128 xmm1, ymm0, 1
vpminud xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1]
vpminud xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 229 # xmm1 = xmm0[1,1,2,3]
vpminud xmm0, xmm0, xmm1
vmovd eax, xmm0 # eax = minimum of the vector part
mov esi, r10d # esi = maximum, in the register the final store reads
cmp r9, r8
je .LBB5_8 # count was a multiple of 32: no scalar tail
.p2align 4, 0x90
.LBB5_7: # =>This Inner Loop Header: Depth=1
# Scalar loop over elements r9 .. r8-1; eax = running min, r10d = running max.
# esi holds the loaded value and, after the second cmov, the updated maximum.
mov esi, dword ptr [rdi + 4*r9]
cmp eax, esi
cmovae eax, esi # unsigned: min >= value -> min = value
cmp r10d, esi
cmova esi, r10d # unsigned: max > value -> keep the old max
add r9, 1
mov r10d, esi
cmp r8, r9
jne .LBB5_7
.LBB5_8:
# Store the results and return.
mov dword ptr [rcx], esi # maximum
mov dword ptr [rdx], eax # minimum
mov rsp, rbp
pop rbp
vzeroupper
ret
.Lfunc_end5:
.size uint32_max_min_avx2, .Lfunc_end5-uint32_max_min_avx2
# -- End function
.section .rodata.cst8,"aM",@progbits,8
.p2align 3 # -- Begin function int64_max_min_avx2
# Constant pool for int64_max_min_avx2 (each value is broadcast to a full ymm register).
# .LCPI6_0: 0x8000000000000000 (smallest signed qword); seed for the max accumulators.
.LCPI6_0:
.quad -9223372036854775808 # 0x8000000000000000
# .LCPI6_1: 0x7fffffffffffffff (largest signed qword); seed for the min accumulators.
.LCPI6_1:
.quad 9223372036854775807 # 0x7fffffffffffffff
.text
.globl int64_max_min_avx2
.p2align 4, 0x90
.type int64_max_min_avx2,@function
# ---------------------------------------------------------------------
# int64_max_min_avx2: scan an array of signed 64-bit values and store
# its minimum and maximum through two output pointers.
# NOTE(review): presumably compiled from a C function shaped like
#   void int64_max_min_avx2(int64_t *values, int len, int64_t *minout, int64_t *maxout)
# in min_max.c; the signature is inferred from register use, confirm there.
# In:   rdi = base address of the values
#       esi = element count (32-bit; a count <= 0 is treated as empty)
#       rdx = address that receives the minimum (8 bytes)
#       rcx = address that receives the maximum (8 bytes)
# Out:  no register result. For an empty input the seeds are stored:
#       minimum = 0x7fffffffffffffff, maximum = 0x8000000000000000.
# Uses: rax, rsi, r8, r9, r10, ymm0-ymm12, flags (rbp is saved and restored)
# Note: there is no packed 64-bit min/max instruction in this code; each
#       min/max step is a vpcmpgtq (signed compare) followed by a vblendvpd,
#       which picks the third operand where the mask's sign bit is set and
#       the second operand elsewhere.
# ---------------------------------------------------------------------
int64_max_min_avx2: # @int64_max_min_avx2
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8 # rsp is restored from rbp before returning; no stack slots are used here
movabs rax, 9223372036854775807 # rax = min seed; rax + 1 wraps to the max seed
test esi, esi
jle .LBB6_1 # count <= 0: store the seed values only
# %bb.2:
mov r8d, esi # r8 = element count, zero-extended
cmp esi, 15
ja .LBB6_4 # count >= 16: take the vector path
# %bb.3:
# Fewer than 16 elements: seed the scalar accumulators and use the scalar loop only.
lea r10, [rax + 1] # r10 = running maximum = 0x8000000000000000
xor r9d, r9d # r9 = index of the next element to examine
jmp .LBB6_7
.LBB6_1:
# Empty input: only the seeds are stored (rsi is the register stored as the maximum).
lea rsi, [rax + 1]
jmp .LBB6_8
.LBB6_4:
# Vector path: min accumulators ymm0, ymm3, ymm2, ymm1 and max accumulators ymm4, ymm7, ymm6, ymm5
# (listed in the order they are paired with the four loads of each iteration).
mov r9d, r8d
vpbroadcastq ymm4, qword ptr [rip + .LCPI6_0] # ymm4 = 4 x 0x8000000000000000: max seed
and r9d, -16 # r9 = count rounded down to a multiple of 16 = elements handled by vector code
vpbroadcastq ymm0, qword ptr [rip + .LCPI6_1] # ymm0 = 4 x 0x7fffffffffffffff: min seed
xor eax, eax # rax = element index into the input
vmovdqa ymm3, ymm0
vmovdqa ymm2, ymm0
vmovdqa ymm1, ymm0
vmovdqa ymm7, ymm4
vmovdqa ymm6, ymm4
vmovdqa ymm5, ymm4
.p2align 4, 0x90
.LBB6_5: # =>This Inner Loop Header: Depth=1
# Main loop: 16 elements (four ymm loads) per iteration.
# Min updates: mask = value > min; the blend keeps min where the mask is set, else takes value.
vmovdqu ymm8, ymmword ptr [rdi + 8*rax]
vpcmpgtq ymm9, ymm8, ymm0
vblendvpd ymm0, ymm8, ymm0, ymm9
vmovdqu ymm9, ymmword ptr [rdi + 8*rax + 32]
vpcmpgtq ymm10, ymm9, ymm3
vblendvpd ymm3, ymm9, ymm3, ymm10
vmovdqu ymm10, ymmword ptr [rdi + 8*rax + 64]
vpcmpgtq ymm11, ymm10, ymm2
vblendvpd ymm2, ymm10, ymm2, ymm11
vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 96]
vpcmpgtq ymm12, ymm11, ymm1
vblendvpd ymm1, ymm11, ymm1, ymm12
# Max updates: mask = max > value; the blend keeps max where the mask is set, else takes value.
vpcmpgtq ymm12, ymm4, ymm8
vblendvpd ymm4, ymm8, ymm4, ymm12
vpcmpgtq ymm8, ymm7, ymm9
vblendvpd ymm7, ymm9, ymm7, ymm8
vpcmpgtq ymm8, ymm6, ymm10
vblendvpd ymm6, ymm10, ymm6, ymm8
vpcmpgtq ymm8, ymm5, ymm11
vblendvpd ymm5, ymm11, ymm5, ymm8
add rax, 16
cmp r9, rax
jne .LBB6_5
# %bb.6:
# Horizontal reduction of the maximum: 4 accumulators -> 1, then 4 -> 2 -> 1 lanes.
vpcmpgtq ymm8, ymm4, ymm7
vblendvpd ymm4, ymm7, ymm4, ymm8
vpcmpgtq ymm7, ymm4, ymm6
vblendvpd ymm4, ymm6, ymm4, ymm7
vpcmpgtq ymm6, ymm4, ymm5
vblendvpd ymm4, ymm5, ymm4, ymm6
vextractf128 xmm5, ymm4, 1
vpcmpgtq xmm6, xmm4, xmm5
vblendvpd xmm4, xmm5, xmm4, xmm6
vpermilps xmm5, xmm4, 78 # xmm5 = xmm4[2,3,0,1]
vpcmpgtq xmm6, xmm4, xmm5
vblendvpd xmm4, xmm5, xmm4, xmm6
vmovq r10, xmm4 # r10 = maximum of the vector part
# Horizontal reduction of the minimum, same shape with the compare operands swapped.
vpcmpgtq ymm4, ymm3, ymm0
vblendvpd ymm0, ymm3, ymm0, ymm4
vpcmpgtq ymm3, ymm2, ymm0
vblendvpd ymm0, ymm2, ymm0, ymm3
vpcmpgtq ymm2, ymm1, ymm0
vblendvpd ymm0, ymm1, ymm0, ymm2
vextractf128 xmm1, ymm0, 1
vpcmpgtq xmm2, xmm1, xmm0
vblendvpd xmm0, xmm1, xmm0, xmm2
vpermilps xmm1, xmm0, 78 # xmm1 = xmm0[2,3,0,1]
vpcmpgtq xmm2, xmm1, xmm0
vblendvpd xmm0, xmm1, xmm0, xmm2
vmovq rax, xmm0 # rax = minimum of the vector part
mov rsi, r10 # rsi = maximum, in the register the final store reads
cmp r9, r8
je .LBB6_8 # count was a multiple of 16: no scalar tail
.p2align 4, 0x90
.LBB6_7: # =>This Inner Loop Header: Depth=1
# Scalar loop over elements r9 .. r8-1; rax = running min, r10 = running max.
# rsi holds the loaded value and, after the second cmov, the updated maximum.
mov rsi, qword ptr [rdi + 8*r9]
cmp rax, rsi
cmovg rax, rsi # signed: min > value -> min = value
cmp r10, rsi
cmovge rsi, r10 # signed: max >= value -> keep the old max
add r9, 1
mov r10, rsi
cmp r8, r9
jne .LBB6_7
.LBB6_8:
# Store the results and return.
mov qword ptr [rcx], rsi # maximum
mov qword ptr [rdx], rax # minimum
mov rsp, rbp
pop rbp
vzeroupper
ret
.Lfunc_end6:
.size int64_max_min_avx2, .Lfunc_end6-int64_max_min_avx2
# -- End function
.section .rodata.cst8,"aM",@progbits,8
.p2align 3 # -- Begin function uint64_max_min_avx2
# Constant pool for uint64_max_min_avx2.
# .LCPI7_0: 0x8000000000000000, the sign-bit mask XORed into both operands of every
# compare so that the signed vpcmpgtq yields unsigned ordering.
.LCPI7_0:
.quad -9223372036854775808 # 0x8000000000000000
.text
.globl uint64_max_min_avx2
.p2align 4, 0x90
.type uint64_max_min_avx2,@function
# ---------------------------------------------------------------------
# uint64_max_min_avx2: scan an array of unsigned 64-bit values and store
# its minimum and maximum through two output pointers.
# NOTE(review): presumably compiled from a C function shaped like
#   void uint64_max_min_avx2(uint64_t *values, int len, uint64_t *minout, uint64_t *maxout)
# in min_max.c; the signature is inferred from register use, confirm there.
# In:   rdi = base address of the values
#       esi = element count (32-bit; a count <= 0 is treated as empty)
#       rdx = address that receives the minimum (8 bytes)
#       rcx = address that receives the maximum (8 bytes)
# Out:  no register result. For an empty input the seeds are stored:
#       minimum = 0xffffffffffffffff, maximum = 0.
# Uses: rax, rsi, r8, r9, r10, ymm0-ymm11, flags (rbp is saved and restored)
# Note: every min/max step is "flip sign bits, vpcmpgtq, vblendvpd". vblendvpd
#       picks the third operand where the mask's sign bit is set and the
#       second operand elsewhere. The accumulators themselves stay unflipped.
# ---------------------------------------------------------------------
uint64_max_min_avx2: # @uint64_max_min_avx2
# %bb.0:
push rbp
mov rbp, rsp
and rsp, -8 # rsp is restored from rbp before returning; no stack slots are used here
test esi, esi
jle .LBB7_1 # count <= 0: store the seed values only
# %bb.2:
mov r8d, esi # r8 = element count, zero-extended
cmp esi, 15
ja .LBB7_4 # count >= 16: take the vector path
# %bb.3:
# Fewer than 16 elements: seed the scalar accumulators and use the scalar loop only.
mov rax, -1 # rax = running minimum (all ones)
xor r9d, r9d # r9 = index of the next element to examine
xor r10d, r10d # r10 = running maximum (0)
jmp .LBB7_7
.LBB7_1:
# Empty input: only the seeds are stored (rsi is the register stored as the maximum).
mov rax, -1
xor esi, esi
jmp .LBB7_8
.LBB7_4:
# Vector path: min accumulators ymm1, ymm4, ymm3, ymm2 (all ones) and
# max accumulators ymm5, ymm8, ymm7, ymm6 (zero); ymm0 holds the sign-bit mask.
mov r9d, r8d
and r9d, -16 # r9 = count rounded down to a multiple of 16 = elements handled by vector code
vpxor xmm5, xmm5, xmm5
vpcmpeqd ymm1, ymm1, ymm1
xor eax, eax # rax = element index into the input
vpbroadcastq ymm0, qword ptr [rip + .LCPI7_0] # ymm0 = 4 x 0x8000000000000000: sign-bit mask
vpcmpeqd ymm4, ymm4, ymm4
vpcmpeqd ymm3, ymm3, ymm3
vpcmpeqd ymm2, ymm2, ymm2
vpxor xmm8, xmm8, xmm8
vpxor xmm7, xmm7, xmm7
vpxor xmm6, xmm6, xmm6
.p2align 4, 0x90
.LBB7_5: # =>This Inner Loop Header: Depth=1
# Main loop: 16 elements (four ymm loads) per iteration. For each load:
#   min: mask = value > min (unsigned) -> keep min where set, else take value
#   max: mask = max > value (unsigned) -> keep max where set, else take value
# Load 0 -> min ymm1, max ymm5.
vmovdqu ymm9, ymmword ptr [rdi + 8*rax]
vpxor ymm10, ymm1, ymm0
vpxor ymm11, ymm9, ymm0
vpcmpgtq ymm10, ymm11, ymm10
vblendvpd ymm1, ymm9, ymm1, ymm10
vpxor ymm10, ymm5, ymm0
vpcmpgtq ymm10, ymm10, ymm11
vblendvpd ymm5, ymm9, ymm5, ymm10
# Load 1 -> min ymm4, max ymm8.
vmovdqu ymm9, ymmword ptr [rdi + 8*rax + 32]
vpxor ymm10, ymm4, ymm0
vpxor ymm11, ymm9, ymm0
vpcmpgtq ymm10, ymm11, ymm10
vblendvpd ymm4, ymm9, ymm4, ymm10
vpxor ymm10, ymm8, ymm0
vpcmpgtq ymm10, ymm10, ymm11
vmovdqu ymm11, ymmword ptr [rdi + 8*rax + 64] # load 2 is issued early; ymm11 now holds its data
vblendvpd ymm8, ymm9, ymm8, ymm10
# Load 2 -> min ymm3, max ymm7.
vpxor ymm9, ymm3, ymm0
vpxor ymm10, ymm11, ymm0
vpcmpgtq ymm9, ymm10, ymm9
vblendvpd ymm3, ymm11, ymm3, ymm9
vpxor ymm9, ymm7, ymm0
vpcmpgtq ymm9, ymm9, ymm10
vblendvpd ymm7, ymm11, ymm7, ymm9
# Load 3 -> min ymm2, max ymm6.
vmovdqu ymm9, ymmword ptr [rdi + 8*rax + 96]
vpxor ymm10, ymm2, ymm0
vpxor ymm11, ymm9, ymm0
vpcmpgtq ymm10, ymm11, ymm10
vblendvpd ymm2, ymm9, ymm2, ymm10
vpxor ymm10, ymm6, ymm0
vpcmpgtq ymm10, ymm10, ymm11
vblendvpd ymm6, ymm9, ymm6, ymm10
add rax, 16
cmp r9, rax
jne .LBB7_5
# %bb.6:
# Horizontal reduction of the maximum: fold ymm8, ymm7, ymm6 into ymm5, then 4 -> 2 -> 1 lanes.
vpxor ymm9, ymm8, ymm0
vpxor ymm10, ymm5, ymm0
vpcmpgtq ymm9, ymm10, ymm9
vblendvpd ymm5, ymm8, ymm5, ymm9
vxorpd ymm8, ymm5, ymm0
vpxor ymm9, ymm7, ymm0
vpcmpgtq ymm8, ymm8, ymm9
vblendvpd ymm5, ymm7, ymm5, ymm8
vxorpd ymm7, ymm5, ymm0
vpxor ymm8, ymm6, ymm0
vpcmpgtq ymm7, ymm7, ymm8
vblendvpd ymm5, ymm6, ymm5, ymm7
vextractf128 xmm6, ymm5, 1
vxorpd xmm8, xmm6, xmm0
vxorpd xmm7, xmm5, xmm0
vpcmpgtq xmm7, xmm7, xmm8
vblendvpd xmm5, xmm6, xmm5, xmm7
vpermilps xmm6, xmm5, 78 # xmm6 = xmm5[2,3,0,1]
vxorpd xmm8, xmm5, xmm0
vxorpd xmm7, xmm6, xmm0
vpcmpgtq xmm7, xmm8, xmm7
vblendvpd xmm5, xmm6, xmm5, xmm7 # low qword of xmm5 = maximum of the vector part
# Horizontal reduction of the minimum: fold ymm4, ymm3, ymm2 into ymm1, then 4 -> 2 -> 1 lanes.
vpxor ymm6, ymm1, ymm0
vpxor ymm7, ymm4, ymm0
vpcmpgtq ymm6, ymm7, ymm6
vblendvpd ymm1, ymm4, ymm1, ymm6
vxorpd ymm4, ymm1, ymm0
vpxor ymm6, ymm3, ymm0
vpcmpgtq ymm4, ymm6, ymm4
vblendvpd ymm1, ymm3, ymm1, ymm4
vmovq r10, xmm5 # r10 = maximum of the vector part (scheduled in the middle of the min reduction)
vxorpd ymm3, ymm1, ymm0
vpxor ymm4, ymm2, ymm0
vpcmpgtq ymm3, ymm4, ymm3
vblendvpd ymm1, ymm2, ymm1, ymm3
vextractf128 xmm2, ymm1, 1
vxorpd xmm3, xmm1, xmm0
vxorpd xmm4, xmm2, xmm0
vpcmpgtq xmm3, xmm4, xmm3
vblendvpd xmm1, xmm2, xmm1, xmm3
vpermilps xmm2, xmm1, 78 # xmm2 = xmm1[2,3,0,1]
vxorpd xmm3, xmm1, xmm0
vxorpd xmm0, xmm2, xmm0 # the sign-bit mask in xmm0 is overwritten here; it is not needed afterwards
vpcmpgtq xmm0, xmm0, xmm3
vblendvpd xmm0, xmm2, xmm1, xmm0
vmovq rax, xmm0 # rax = minimum of the vector part
mov rsi, r10 # rsi = maximum, in the register the final store reads
cmp r9, r8
je .LBB7_8 # count was a multiple of 16: no scalar tail
.p2align 4, 0x90
.LBB7_7: # =>This Inner Loop Header: Depth=1
# Scalar loop over elements r9 .. r8-1; rax = running min, r10 = running max.
# rsi holds the loaded value and, after the second cmov, the updated maximum.
mov rsi, qword ptr [rdi + 8*r9]
cmp rax, rsi
cmovae rax, rsi # unsigned: min >= value -> min = value
cmp r10, rsi
cmova rsi, r10 # unsigned: max > value -> keep the old max
add r9, 1
mov r10, rsi
cmp r8, r9
jne .LBB7_7
.LBB7_8:
# Store the results and return.
mov qword ptr [rcx], rsi # maximum
mov qword ptr [rdx], rax # minimum
mov rsp, rbp
pop rbp
vzeroupper
ret
.Lfunc_end7:
.size uint64_max_min_avx2, .Lfunc_end7-uint64_max_min_avx2
# -- End function
.ident "Debian clang version 11.0.1-2"
.section ".note.GNU-stack","",@progbits
.addrsig