# arrow/bitutil/_lib/bitmap_ops_avx2_amd64.s
.text
.intel_syntax noprefix
.file "bitmap_ops.c"
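# Clang-generated AVX2 kernels for byte-wise bitmap AND / OR / AND-NOT / XOR.
# All four functions share the same System V AMD64 argument layout: rdi and rsi
# point at the two input buffers, rdx at the output buffer, and rcx holds the
# length in bytes. (The underlying C signature is presumably of the form
# void f(const uint8_t *in1, const uint8_t *in2, uint8_t *out, int64_t length);
# the parameter names are an assumption, not recorded in this file.)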
.globl bitmap_aligned_and_avx2 # -- Begin function bitmap_aligned_and_avx2
.p2align 4, 0x90
.type bitmap_aligned_and_avx2,@function
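# bitmap_aligned_and_avx2: for i in [0, rcx), stores ([rdi+i] & [rsi+i]) to [rdx+i].
# Lengths of at least 128 bytes whose output does not overlap either input take
# the vectorized path (.LBB0_10); everything else uses the scalar loops at .LBB0_3.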
bitmap_aligned_and_avx2: # @bitmap_aligned_and_avx2
# %bb.0:
push rbp
mov rbp, rsp
push rbx
and rsp, -8
test rcx, rcx
jle .LBB0_12
# %bb.1:
cmp rcx, 127
ja .LBB0_7
# %bb.2:
xor r10d, r10d
jmp .LBB0_3
.LBB0_7:
lea r9, [rdx + rcx]
lea rax, [rdi + rcx]
cmp rax, rdx
seta r11b
lea rax, [rsi + rcx]
cmp r9, rdi
seta bl
cmp rax, rdx
seta r8b
cmp r9, rsi
seta r9b
xor r10d, r10d
test r11b, bl
jne .LBB0_3
# %bb.8:
and r8b, r9b
jne .LBB0_3
# %bb.9:
mov r10, rcx
and r10, -128
xor r8d, r8d
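# Vectorized loop: four 32-byte ymm loads, ANDs and stores per iteration
# (128 bytes per pass). r10 is the length rounded down to a multiple of 128,
# r8 the running byte offset.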
.p2align 4, 0x90
.LBB0_10: # =>This Inner Loop Header: Depth=1
vmovups ymm0, ymmword ptr [rsi + r8]
vmovups ymm1, ymmword ptr [rsi + r8 + 32]
vmovups ymm2, ymmword ptr [rsi + r8 + 64]
vmovups ymm3, ymmword ptr [rsi + r8 + 96]
vandps ymm0, ymm0, ymmword ptr [rdi + r8]
vandps ymm1, ymm1, ymmword ptr [rdi + r8 + 32]
vandps ymm2, ymm2, ymmword ptr [rdi + r8 + 64]
vandps ymm3, ymm3, ymmword ptr [rdi + r8 + 96]
vmovups ymmword ptr [rdx + r8], ymm0
vmovups ymmword ptr [rdx + r8 + 32], ymm1
vmovups ymmword ptr [rdx + r8 + 64], ymm2
vmovups ymmword ptr [rdx + r8 + 96], ymm3
sub r8, -128
cmp r10, r8
jne .LBB0_10
# %bb.11:
cmp r10, rcx
je .LBB0_12
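# Scalar path / tail: processes the remaining bytes (or the whole buffer when
# the length is below 128 or the output aliases an input), first length % 4
# bytes one at a time, then groups of four.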
.LBB0_3:
mov r8, r10
not r8
add r8, rcx
mov r9, rcx
and r9, 3
je .LBB0_5
.p2align 4, 0x90
.LBB0_4: # =>This Inner Loop Header: Depth=1
movzx eax, byte ptr [rsi + r10]
and al, byte ptr [rdi + r10]
mov byte ptr [rdx + r10], al
add r10, 1
add r9, -1
jne .LBB0_4
.LBB0_5:
cmp r8, 3
jb .LBB0_12
.p2align 4, 0x90
.LBB0_6: # =>This Inner Loop Header: Depth=1
movzx eax, byte ptr [rsi + r10]
and al, byte ptr [rdi + r10]
mov byte ptr [rdx + r10], al
movzx eax, byte ptr [rsi + r10 + 1]
and al, byte ptr [rdi + r10 + 1]
mov byte ptr [rdx + r10 + 1], al
movzx eax, byte ptr [rsi + r10 + 2]
and al, byte ptr [rdi + r10 + 2]
mov byte ptr [rdx + r10 + 2], al
movzx eax, byte ptr [rsi + r10 + 3]
and al, byte ptr [rdi + r10 + 3]
mov byte ptr [rdx + r10 + 3], al
add r10, 4
cmp rcx, r10
jne .LBB0_6
.LBB0_12:
lea rsp, [rbp - 8]
pop rbx
pop rbp
vzeroupper
ret
.Lfunc_end0:
.size bitmap_aligned_and_avx2, .Lfunc_end0-bitmap_aligned_and_avx2
# -- End function
.globl bitmap_aligned_or_avx2 # -- Begin function bitmap_aligned_or_avx2
.p2align 4, 0x90
.type bitmap_aligned_or_avx2,@function
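# bitmap_aligned_or_avx2: identical structure to the AND kernel above, but each
# output byte is [rdi+i] | [rsi+i].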
bitmap_aligned_or_avx2: # @bitmap_aligned_or_avx2
# %bb.0:
push rbp
mov rbp, rsp
push rbx
and rsp, -8
test rcx, rcx
jle .LBB1_12
# %bb.1:
cmp rcx, 127
ja .LBB1_7
# %bb.2:
xor r10d, r10d
jmp .LBB1_3
.LBB1_7:
lea r9, [rdx + rcx]
lea rax, [rdi + rcx]
cmp rax, rdx
seta r11b
lea rax, [rsi + rcx]
cmp r9, rdi
seta bl
cmp rax, rdx
seta r8b
cmp r9, rsi
seta r9b
xor r10d, r10d
test r11b, bl
jne .LBB1_3
# %bb.8:
and r8b, r9b
jne .LBB1_3
# %bb.9:
mov r10, rcx
and r10, -128
xor r8d, r8d
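# Vectorized OR loop: 128 bytes per iteration using vorps.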
.p2align 4, 0x90
.LBB1_10: # =>This Inner Loop Header: Depth=1
vmovups ymm0, ymmword ptr [rsi + r8]
vmovups ymm1, ymmword ptr [rsi + r8 + 32]
vmovups ymm2, ymmword ptr [rsi + r8 + 64]
vmovups ymm3, ymmword ptr [rsi + r8 + 96]
vorps ymm0, ymm0, ymmword ptr [rdi + r8]
vorps ymm1, ymm1, ymmword ptr [rdi + r8 + 32]
vorps ymm2, ymm2, ymmword ptr [rdi + r8 + 64]
vorps ymm3, ymm3, ymmword ptr [rdi + r8 + 96]
vmovups ymmword ptr [rdx + r8], ymm0
vmovups ymmword ptr [rdx + r8 + 32], ymm1
vmovups ymmword ptr [rdx + r8 + 64], ymm2
vmovups ymmword ptr [rdx + r8 + 96], ymm3
sub r8, -128
cmp r10, r8
jne .LBB1_10
# %bb.11:
cmp r10, rcx
je .LBB1_12
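# Scalar OR path / tail, same layout as the AND tail above.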
.LBB1_3:
mov r8, r10
not r8
add r8, rcx
mov r9, rcx
and r9, 3
je .LBB1_5
.p2align 4, 0x90
.LBB1_4: # =>This Inner Loop Header: Depth=1
movzx eax, byte ptr [rsi + r10]
or al, byte ptr [rdi + r10]
mov byte ptr [rdx + r10], al
add r10, 1
add r9, -1
jne .LBB1_4
.LBB1_5:
cmp r8, 3
jb .LBB1_12
.p2align 4, 0x90
.LBB1_6: # =>This Inner Loop Header: Depth=1
movzx eax, byte ptr [rsi + r10]
or al, byte ptr [rdi + r10]
mov byte ptr [rdx + r10], al
movzx eax, byte ptr [rsi + r10 + 1]
or al, byte ptr [rdi + r10 + 1]
mov byte ptr [rdx + r10 + 1], al
movzx eax, byte ptr [rsi + r10 + 2]
or al, byte ptr [rdi + r10 + 2]
mov byte ptr [rdx + r10 + 2], al
movzx eax, byte ptr [rsi + r10 + 3]
or al, byte ptr [rdi + r10 + 3]
mov byte ptr [rdx + r10 + 3], al
add r10, 4
cmp rcx, r10
jne .LBB1_6
.LBB1_12:
lea rsp, [rbp - 8]
pop rbx
pop rbp
vzeroupper
ret
.Lfunc_end1:
.size bitmap_aligned_or_avx2, .Lfunc_end1-bitmap_aligned_or_avx2
# -- End function
.globl bitmap_aligned_and_not_avx2 # -- Begin function bitmap_aligned_and_not_avx2
.p2align 4, 0x90
.type bitmap_aligned_and_not_avx2,@function
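# bitmap_aligned_and_not_avx2: each output byte is [rdi+i] & ~[rsi+i]
# (vandnps negates its first source operand, which here is loaded from rsi).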
bitmap_aligned_and_not_avx2: # @bitmap_aligned_and_not_avx2
# %bb.0:
push rbp
mov rbp, rsp
push rbx
and rsp, -8
test rcx, rcx
jle .LBB2_12
# %bb.1:
cmp rcx, 127
ja .LBB2_7
# %bb.2:
xor r8d, r8d
jmp .LBB2_3
.LBB2_7:
lea r8, [rdx + rcx]
lea rax, [rdi + rcx]
cmp rax, rdx
seta r11b
lea rax, [rsi + rcx]
cmp r8, rdi
seta bl
cmp rax, rdx
seta r10b
cmp r8, rsi
seta r9b
xor r8d, r8d
test r11b, bl
jne .LBB2_3
# %bb.8:
and r10b, r9b
jne .LBB2_3
# %bb.9:
mov r8, rcx
and r8, -128
xor eax, eax
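# Vectorized AND-NOT loop: 128 bytes per iteration; rax is the byte offset.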
.p2align 4, 0x90
.LBB2_10: # =>This Inner Loop Header: Depth=1
vmovups ymm0, ymmword ptr [rsi + rax]
vmovups ymm1, ymmword ptr [rsi + rax + 32]
vmovups ymm2, ymmword ptr [rsi + rax + 64]
vmovups ymm3, ymmword ptr [rsi + rax + 96]
vandnps ymm0, ymm0, ymmword ptr [rdi + rax]
vandnps ymm1, ymm1, ymmword ptr [rdi + rax + 32]
vandnps ymm2, ymm2, ymmword ptr [rdi + rax + 64]
vandnps ymm3, ymm3, ymmword ptr [rdi + rax + 96]
vmovups ymmword ptr [rdx + rax], ymm0
vmovups ymmword ptr [rdx + rax + 32], ymm1
vmovups ymmword ptr [rdx + rax + 64], ymm2
vmovups ymmword ptr [rdx + rax + 96], ymm3
sub rax, -128
cmp r8, rax
jne .LBB2_10
# %bb.11:
cmp r8, rcx
je .LBB2_12
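# Scalar AND-NOT path / tail: a single-byte peel when the byte count is odd,
# then two bytes per iteration.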
.LBB2_3:
mov r9, r8
not r9
test cl, 1
je .LBB2_5
# %bb.4:
mov al, byte ptr [rsi + r8]
not al
and al, byte ptr [rdi + r8]
mov byte ptr [rdx + r8], al
or r8, 1
.LBB2_5:
add r9, rcx
je .LBB2_12
.p2align 4, 0x90
.LBB2_6: # =>This Inner Loop Header: Depth=1
movzx eax, byte ptr [rsi + r8]
not al
and al, byte ptr [rdi + r8]
mov byte ptr [rdx + r8], al
movzx eax, byte ptr [rsi + r8 + 1]
not al
and al, byte ptr [rdi + r8 + 1]
mov byte ptr [rdx + r8 + 1], al
add r8, 2
cmp rcx, r8
jne .LBB2_6
.LBB2_12:
lea rsp, [rbp - 8]
pop rbx
pop rbp
vzeroupper
ret
.Lfunc_end2:
.size bitmap_aligned_and_not_avx2, .Lfunc_end2-bitmap_aligned_and_not_avx2
# -- End function
.globl bitmap_aligned_xor_avx2 # -- Begin function bitmap_aligned_xor_avx2
.p2align 4, 0x90
.type bitmap_aligned_xor_avx2,@function
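# bitmap_aligned_xor_avx2: identical structure to the AND kernel; each output
# byte is [rdi+i] ^ [rsi+i].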
bitmap_aligned_xor_avx2: # @bitmap_aligned_xor_avx2
# %bb.0:
push rbp
mov rbp, rsp
push rbx
and rsp, -8
test rcx, rcx
jle .LBB3_12
# %bb.1:
cmp rcx, 127
ja .LBB3_7
# %bb.2:
xor r10d, r10d
jmp .LBB3_3
.LBB3_7:
lea r9, [rdx + rcx]
lea rax, [rdi + rcx]
cmp rax, rdx
seta r11b
lea rax, [rsi + rcx]
cmp r9, rdi
seta bl
cmp rax, rdx
seta r8b
cmp r9, rsi
seta r9b
xor r10d, r10d
test r11b, bl
jne .LBB3_3
# %bb.8:
and r8b, r9b
jne .LBB3_3
# %bb.9:
mov r10, rcx
and r10, -128
xor r8d, r8d
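# Vectorized XOR loop: 128 bytes per iteration using vxorps.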
.p2align 4, 0x90
.LBB3_10: # =>This Inner Loop Header: Depth=1
vmovups ymm0, ymmword ptr [rsi + r8]
vmovups ymm1, ymmword ptr [rsi + r8 + 32]
vmovups ymm2, ymmword ptr [rsi + r8 + 64]
vmovups ymm3, ymmword ptr [rsi + r8 + 96]
vxorps ymm0, ymm0, ymmword ptr [rdi + r8]
vxorps ymm1, ymm1, ymmword ptr [rdi + r8 + 32]
vxorps ymm2, ymm2, ymmword ptr [rdi + r8 + 64]
vxorps ymm3, ymm3, ymmword ptr [rdi + r8 + 96]
vmovups ymmword ptr [rdx + r8], ymm0
vmovups ymmword ptr [rdx + r8 + 32], ymm1
vmovups ymmword ptr [rdx + r8 + 64], ymm2
vmovups ymmword ptr [rdx + r8 + 96], ymm3
sub r8, -128
cmp r10, r8
jne .LBB3_10
# %bb.11:
cmp r10, rcx
je .LBB3_12
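# Scalar XOR path / tail, same layout as the AND tail above.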
.LBB3_3:
mov r8, r10
not r8
add r8, rcx
mov r9, rcx
and r9, 3
je .LBB3_5
.p2align 4, 0x90
.LBB3_4: # =>This Inner Loop Header: Depth=1
movzx eax, byte ptr [rsi + r10]
xor al, byte ptr [rdi + r10]
mov byte ptr [rdx + r10], al
add r10, 1
add r9, -1
jne .LBB3_4
.LBB3_5:
cmp r8, 3
jb .LBB3_12
.p2align 4, 0x90
.LBB3_6: # =>This Inner Loop Header: Depth=1
movzx eax, byte ptr [rsi + r10]
xor al, byte ptr [rdi + r10]
mov byte ptr [rdx + r10], al
movzx eax, byte ptr [rsi + r10 + 1]
xor al, byte ptr [rdi + r10 + 1]
mov byte ptr [rdx + r10 + 1], al
movzx eax, byte ptr [rsi + r10 + 2]
xor al, byte ptr [rdi + r10 + 2]
mov byte ptr [rdx + r10 + 2], al
movzx eax, byte ptr [rsi + r10 + 3]
xor al, byte ptr [rdi + r10 + 3]
mov byte ptr [rdx + r10 + 3], al
add r10, 4
cmp rcx, r10
jne .LBB3_6
.LBB3_12:
lea rsp, [rbp - 8]
pop rbx
pop rbp
vzeroupper
ret
.Lfunc_end3:
.size bitmap_aligned_xor_avx2, .Lfunc_end3-bitmap_aligned_xor_avx2
# -- End function
.ident "Ubuntu clang version 11.1.0-6"
.section ".note.GNU-stack","",@progbits
.addrsig