arrow/compute/internal/kernels/_lib/cast_numeric_sse4_amd64.s (8,837 lines of code) (raw):

.text .intel_syntax noprefix .file "cast_numeric.cc" .section .rodata.cst8,"aM",@progbits,8 .p2align 3 # -- Begin function cast_type_numeric_sse4 .LCPI0_0: .quad 0x43e0000000000000 # double 9.2233720368547758E+18 .section .rodata.cst16,"aM",@progbits,16 .p2align 4 .LCPI0_1: .byte 0 # 0x0 .byte 4 # 0x4 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .LCPI0_3: .long 0x4f000000 # float 2.14748365E+9 .long 0x4f000000 # float 2.14748365E+9 .long 0x4f000000 # float 2.14748365E+9 .long 0x4f000000 # float 2.14748365E+9 .LCPI0_4: .long 2147483648 # 0x80000000 .long 2147483648 # 0x80000000 .long 2147483648 # 0x80000000 .long 2147483648 # 0x80000000 .LCPI0_5: .byte 0 # 0x0 .byte 8 # 0x8 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .LCPI0_6: .quad 4841369599423283200 # 0x4330000000000000 .quad 4841369599423283200 # 0x4330000000000000 .LCPI0_7: .quad 4985484787499139072 # 0x4530000000000000 .quad 4985484787499139072 # 0x4530000000000000 .LCPI0_8: .quad 0x4530000000100000 # double 1.9342813118337666E+25 .quad 0x4530000000100000 # double 1.9342813118337666E+25 .LCPI0_9: .long 1127219200 # 0x43300000 .long 1160773632 # 0x45300000 .long 0 # 0x0 .long 0 # 0x0 .LCPI0_10: .quad 0x4330000000000000 # double 4503599627370496 .quad 0x4530000000000000 # double 1.9342813113834067E+25 .LCPI0_11: .quad 1 # 0x1 .quad 1 # 0x1 .LCPI0_12: .byte 0 # 0x0 .byte 1 # 0x1 .byte 4 # 0x4 .byte 5 # 0x5 .byte 8 # 0x8 .byte 9 # 0x9 .byte 12 # 0xc .byte 13 # 0xd .byte 8 # 0x8 .byte 9 # 0x9 .byte 12 # 0xc .byte 13 # 0xd .byte 12 # 0xc .byte 13 # 0xd .byte 14 # 0xe .byte 15 # 0xf .LCPI0_13: .byte 0 # 0x0 .byte 4 # 0x4 .byte 8 # 0x8 .byte 12 # 0xc .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .LCPI0_14: .long 1258291200 # 0x4b000000 .long 1258291200 # 0x4b000000 .long 1258291200 # 0x4b000000 .long 1258291200 # 0x4b000000 .LCPI0_15: 
.long 1392508928 # 0x53000000 .long 1392508928 # 0x53000000 .long 1392508928 # 0x53000000 .long 1392508928 # 0x53000000 .LCPI0_16: .long 0x53000080 # float 5.49764202E+11 .long 0x53000080 # float 5.49764202E+11 .long 0x53000080 # float 5.49764202E+11 .long 0x53000080 # float 5.49764202E+11 .LCPI0_17: .byte 0 # 0x0 .byte 2 # 0x2 .byte 4 # 0x4 .byte 6 # 0x6 .byte 8 # 0x8 .byte 10 # 0xa .byte 12 # 0xc .byte 14 # 0xe .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .zero 1 .section .rodata.cst4,"aM",@progbits,4 .p2align 2 .LCPI0_2: .long 0x5f000000 # float 9.22337203E+18 .text .globl cast_type_numeric_sse4 .p2align 4, 0x90 .type cast_type_numeric_sse4,@function cast_type_numeric_sse4: # @cast_type_numeric_sse4 # %bb.0: push rbp mov rbp, rsp and rsp, -8 cmp edi, 6 jg .LBB0_13 # %bb.1: cmp edi, 3 jle .LBB0_25 # %bb.2: cmp edi, 4 je .LBB0_45 # %bb.3: cmp edi, 5 je .LBB0_53 # %bb.4: cmp edi, 6 jne .LBB0_1526 # %bb.5: cmp esi, 6 jg .LBB0_93 # %bb.6: cmp esi, 3 jle .LBB0_163 # %bb.7: cmp esi, 4 je .LBB0_263 # %bb.8: cmp esi, 5 je .LBB0_266 # %bb.9: cmp esi, 6 jne .LBB0_1526 # %bb.10: test r8d, r8d jle .LBB0_1526 # %bb.11: mov r9d, r8d cmp r8d, 8 jb .LBB0_12 # %bb.443: lea rax, [rdx + 4*r9] cmp rax, rcx jbe .LBB0_761 # %bb.444: lea rax, [rcx + 4*r9] cmp rax, rdx jbe .LBB0_761 .LBB0_12: xor esi, esi .LBB0_1104: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1106 .LBB0_1105: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rsi] mov dword ptr [rcx + 4*rsi], eax add rsi, 1 add rdi, -1 jne .LBB0_1105 .LBB0_1106: cmp r8, 3 jb .LBB0_1526 .LBB0_1107: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rsi] mov dword ptr [rcx + 4*rsi], eax mov eax, dword ptr [rdx + 4*rsi + 4] mov dword ptr [rcx + 4*rsi + 4], eax mov eax, dword ptr [rdx + 4*rsi + 8] mov dword ptr [rcx + 4*rsi + 8], eax mov eax, dword ptr [rdx + 4*rsi + 12] mov dword ptr [rcx + 4*rsi + 12], eax add rsi, 4 cmp r9, rsi jne .LBB0_1107 jmp .LBB0_1526 .LBB0_13: cmp edi, 8 jle 
.LBB0_35 # %bb.14: cmp edi, 9 je .LBB0_61 # %bb.15: cmp edi, 11 je .LBB0_69 # %bb.16: cmp edi, 12 jne .LBB0_1526 # %bb.17: cmp esi, 6 jg .LBB0_100 # %bb.18: cmp esi, 3 jle .LBB0_168 # %bb.19: cmp esi, 4 je .LBB0_269 # %bb.20: cmp esi, 5 je .LBB0_272 # %bb.21: cmp esi, 6 jne .LBB0_1526 # %bb.22: test r8d, r8d jle .LBB0_1526 # %bb.23: mov esi, r8d lea rdi, [rsi - 1] mov r8d, esi and r8d, 3 cmp rdi, 3 jae .LBB0_446 # %bb.24: xor edi, edi jmp .LBB0_448 .LBB0_25: cmp edi, 2 je .LBB0_77 # %bb.26: cmp edi, 3 jne .LBB0_1526 # %bb.27: cmp esi, 6 jg .LBB0_107 # %bb.28: cmp esi, 3 jle .LBB0_173 # %bb.29: cmp esi, 4 je .LBB0_275 # %bb.30: cmp esi, 5 je .LBB0_278 # %bb.31: cmp esi, 6 jne .LBB0_1526 # %bb.32: test r8d, r8d jle .LBB0_1526 # %bb.33: mov r9d, r8d cmp r8d, 8 jb .LBB0_34 # %bb.451: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_763 # %bb.452: lea rax, [rcx + 4*r9] cmp rax, rdx jbe .LBB0_763 .LBB0_34: xor esi, esi .LBB0_1482: mov r8, rsi not r8 add r8, r9 mov rax, r9 and rax, 3 je .LBB0_1484 .LBB0_1483: # =>This Inner Loop Header: Depth=1 movsx edi, byte ptr [rdx + rsi] mov dword ptr [rcx + 4*rsi], edi add rsi, 1 add rax, -1 jne .LBB0_1483 .LBB0_1484: cmp r8, 3 jb .LBB0_1526 .LBB0_1485: # =>This Inner Loop Header: Depth=1 movsx eax, byte ptr [rdx + rsi] mov dword ptr [rcx + 4*rsi], eax movsx eax, byte ptr [rdx + rsi + 1] mov dword ptr [rcx + 4*rsi + 4], eax movsx eax, byte ptr [rdx + rsi + 2] mov dword ptr [rcx + 4*rsi + 8], eax movsx eax, byte ptr [rdx + rsi + 3] mov dword ptr [rcx + 4*rsi + 12], eax add rsi, 4 cmp r9, rsi jne .LBB0_1485 jmp .LBB0_1526 .LBB0_35: cmp edi, 7 je .LBB0_85 # %bb.36: cmp edi, 8 jne .LBB0_1526 # %bb.37: cmp esi, 6 jg .LBB0_114 # %bb.38: cmp esi, 3 jle .LBB0_178 # %bb.39: cmp esi, 4 je .LBB0_281 # %bb.40: cmp esi, 5 je .LBB0_284 # %bb.41: cmp esi, 6 jne .LBB0_1526 # %bb.42: test r8d, r8d jle .LBB0_1526 # %bb.43: mov r9d, r8d cmp r8d, 4 jae .LBB0_454 # %bb.44: xor esi, esi jmp .LBB0_948 .LBB0_45: cmp esi, 6 jg .LBB0_121 # %bb.46: cmp esi, 3 jle 
.LBB0_183 # %bb.47: cmp esi, 4 je .LBB0_287 # %bb.48: cmp esi, 5 je .LBB0_290 # %bb.49: cmp esi, 6 jne .LBB0_1526 # %bb.50: test r8d, r8d jle .LBB0_1526 # %bb.51: mov r9d, r8d cmp r8d, 8 jae .LBB0_457 # %bb.52: xor esi, esi jmp .LBB0_953 .LBB0_53: cmp esi, 6 jg .LBB0_128 # %bb.54: cmp esi, 3 jle .LBB0_188 # %bb.55: cmp esi, 4 je .LBB0_293 # %bb.56: cmp esi, 5 je .LBB0_296 # %bb.57: cmp esi, 6 jne .LBB0_1526 # %bb.58: test r8d, r8d jle .LBB0_1526 # %bb.59: mov r9d, r8d cmp r8d, 8 jae .LBB0_460 # %bb.60: xor esi, esi jmp .LBB0_958 .LBB0_61: cmp esi, 6 jg .LBB0_135 # %bb.62: cmp esi, 3 jle .LBB0_193 # %bb.63: cmp esi, 4 je .LBB0_299 # %bb.64: cmp esi, 5 je .LBB0_302 # %bb.65: cmp esi, 6 jne .LBB0_1526 # %bb.66: test r8d, r8d jle .LBB0_1526 # %bb.67: mov r9d, r8d cmp r8d, 4 jae .LBB0_463 # %bb.68: xor esi, esi jmp .LBB0_963 .LBB0_69: cmp esi, 6 jg .LBB0_142 # %bb.70: cmp esi, 3 jle .LBB0_198 # %bb.71: cmp esi, 4 je .LBB0_305 # %bb.72: cmp esi, 5 je .LBB0_308 # %bb.73: cmp esi, 6 jne .LBB0_1526 # %bb.74: test r8d, r8d jle .LBB0_1526 # %bb.75: mov r9d, r8d cmp r8d, 8 jae .LBB0_466 # %bb.76: xor esi, esi jmp .LBB0_968 .LBB0_77: cmp esi, 6 jg .LBB0_149 # %bb.78: cmp esi, 3 jle .LBB0_203 # %bb.79: cmp esi, 4 je .LBB0_311 # %bb.80: cmp esi, 5 je .LBB0_314 # %bb.81: cmp esi, 6 jne .LBB0_1526 # %bb.82: test r8d, r8d jle .LBB0_1526 # %bb.83: mov r9d, r8d cmp r8d, 8 jb .LBB0_84 # %bb.469: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_766 # %bb.470: lea rax, [rcx + 4*r9] cmp rax, rdx jbe .LBB0_766 .LBB0_84: xor esi, esi .LBB0_1490: mov r8, rsi not r8 add r8, r9 mov rax, r9 and rax, 3 je .LBB0_1492 .LBB0_1491: # =>This Inner Loop Header: Depth=1 movzx edi, byte ptr [rdx + rsi] mov dword ptr [rcx + 4*rsi], edi add rsi, 1 add rax, -1 jne .LBB0_1491 .LBB0_1492: cmp r8, 3 jb .LBB0_1526 .LBB0_1493: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov dword ptr [rcx + 4*rsi], eax movzx eax, byte ptr [rdx + rsi + 1] mov dword ptr [rcx + 4*rsi + 4], eax movzx eax, byte ptr 
[rdx + rsi + 2] mov dword ptr [rcx + 4*rsi + 8], eax movzx eax, byte ptr [rdx + rsi + 3] mov dword ptr [rcx + 4*rsi + 12], eax add rsi, 4 cmp r9, rsi jne .LBB0_1493 jmp .LBB0_1526 .LBB0_85: cmp esi, 6 jg .LBB0_156 # %bb.86: cmp esi, 3 jle .LBB0_208 # %bb.87: cmp esi, 4 je .LBB0_317 # %bb.88: cmp esi, 5 je .LBB0_320 # %bb.89: cmp esi, 6 jne .LBB0_1526 # %bb.90: test r8d, r8d jle .LBB0_1526 # %bb.91: mov r9d, r8d cmp r8d, 8 jb .LBB0_92 # %bb.472: lea rax, [rdx + 4*r9] cmp rax, rcx jbe .LBB0_769 # %bb.473: lea rax, [rcx + 4*r9] cmp rax, rdx jbe .LBB0_769 .LBB0_92: xor esi, esi .LBB0_1114: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1116 .LBB0_1115: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rsi] mov dword ptr [rcx + 4*rsi], eax add rsi, 1 add rdi, -1 jne .LBB0_1115 .LBB0_1116: cmp r8, 3 jb .LBB0_1526 .LBB0_1117: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rsi] mov dword ptr [rcx + 4*rsi], eax mov eax, dword ptr [rdx + 4*rsi + 4] mov dword ptr [rcx + 4*rsi + 4], eax mov eax, dword ptr [rdx + 4*rsi + 8] mov dword ptr [rcx + 4*rsi + 8], eax mov eax, dword ptr [rdx + 4*rsi + 12] mov dword ptr [rcx + 4*rsi + 12], eax add rsi, 4 cmp r9, rsi jne .LBB0_1117 jmp .LBB0_1526 .LBB0_93: cmp esi, 8 jle .LBB0_213 # %bb.94: cmp esi, 9 je .LBB0_323 # %bb.95: cmp esi, 11 je .LBB0_326 # %bb.96: cmp esi, 12 jne .LBB0_1526 # %bb.97: test r8d, r8d jle .LBB0_1526 # %bb.98: mov esi, r8d lea rdi, [rsi - 1] mov r8d, esi and r8d, 3 cmp rdi, 3 jae .LBB0_475 # %bb.99: xor edi, edi jmp .LBB0_477 .LBB0_100: cmp esi, 8 jle .LBB0_218 # %bb.101: cmp esi, 9 je .LBB0_329 # %bb.102: cmp esi, 11 je .LBB0_332 # %bb.103: cmp esi, 12 jne .LBB0_1526 # %bb.104: test r8d, r8d jle .LBB0_1526 # %bb.105: mov r9d, r8d cmp r8d, 4 jb .LBB0_106 # %bb.480: lea rax, [rdx + 8*r9] cmp rax, rcx jbe .LBB0_771 # %bb.481: lea rax, [rcx + 8*r9] cmp rax, rdx jbe .LBB0_771 .LBB0_106: xor esi, esi .LBB0_1124: mov edi, r9d sub edi, esi mov r8, rsi not r8 add r8, r9 and 
rdi, 7 je .LBB0_1126 .LBB0_1125: # =>This Inner Loop Header: Depth=1 mov rax, qword ptr [rdx + 8*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 add rdi, -1 jne .LBB0_1125 .LBB0_1126: cmp r8, 7 jb .LBB0_1526 .LBB0_1127: # =>This Inner Loop Header: Depth=1 mov rax, qword ptr [rdx + 8*rsi] mov qword ptr [rcx + 8*rsi], rax mov rax, qword ptr [rdx + 8*rsi + 8] mov qword ptr [rcx + 8*rsi + 8], rax mov rax, qword ptr [rdx + 8*rsi + 16] mov qword ptr [rcx + 8*rsi + 16], rax mov rax, qword ptr [rdx + 8*rsi + 24] mov qword ptr [rcx + 8*rsi + 24], rax mov rax, qword ptr [rdx + 8*rsi + 32] mov qword ptr [rcx + 8*rsi + 32], rax mov rax, qword ptr [rdx + 8*rsi + 40] mov qword ptr [rcx + 8*rsi + 40], rax mov rax, qword ptr [rdx + 8*rsi + 48] mov qword ptr [rcx + 8*rsi + 48], rax mov rax, qword ptr [rdx + 8*rsi + 56] mov qword ptr [rcx + 8*rsi + 56], rax add rsi, 8 cmp r9, rsi jne .LBB0_1127 jmp .LBB0_1526 .LBB0_107: cmp esi, 8 jle .LBB0_223 # %bb.108: cmp esi, 9 je .LBB0_335 # %bb.109: cmp esi, 11 je .LBB0_338 # %bb.110: cmp esi, 12 jne .LBB0_1526 # %bb.111: test r8d, r8d jle .LBB0_1526 # %bb.112: mov esi, r8d lea rdi, [rsi - 1] mov r8d, esi and r8d, 3 cmp rdi, 3 jae .LBB0_483 # %bb.113: xor edi, edi jmp .LBB0_485 .LBB0_114: cmp esi, 8 jle .LBB0_228 # %bb.115: cmp esi, 9 je .LBB0_341 # %bb.116: cmp esi, 11 je .LBB0_344 # %bb.117: cmp esi, 12 jne .LBB0_1526 # %bb.118: test r8d, r8d jle .LBB0_1526 # %bb.119: mov r9d, r8d cmp r8d, 4 jae .LBB0_488 # %bb.120: xor esi, esi jmp .LBB0_973 .LBB0_121: cmp esi, 8 jle .LBB0_233 # %bb.122: cmp esi, 9 je .LBB0_347 # %bb.123: cmp esi, 11 je .LBB0_350 # %bb.124: cmp esi, 12 jne .LBB0_1526 # %bb.125: test r8d, r8d jle .LBB0_1526 # %bb.126: mov esi, r8d lea rdi, [rsi - 1] mov r8d, esi and r8d, 3 cmp rdi, 3 jae .LBB0_491 # %bb.127: xor edi, edi jmp .LBB0_493 .LBB0_128: cmp esi, 8 jle .LBB0_238 # %bb.129: cmp esi, 9 je .LBB0_353 # %bb.130: cmp esi, 11 je .LBB0_356 # %bb.131: cmp esi, 12 jne .LBB0_1526 # %bb.132: test r8d, r8d jle .LBB0_1526 # 
%bb.133: mov esi, r8d lea rdi, [rsi - 1] mov r8d, esi and r8d, 3 cmp rdi, 3 jae .LBB0_496 # %bb.134: xor edi, edi jmp .LBB0_498 .LBB0_135: cmp esi, 8 jle .LBB0_243 # %bb.136: cmp esi, 9 je .LBB0_359 # %bb.137: cmp esi, 11 je .LBB0_362 # %bb.138: cmp esi, 12 jne .LBB0_1526 # %bb.139: test r8d, r8d jle .LBB0_1526 # %bb.140: mov esi, r8d lea rdi, [rsi - 1] mov eax, esi and eax, 3 cmp rdi, 3 jae .LBB0_501 # %bb.141: xor edi, edi jmp .LBB0_503 .LBB0_142: cmp esi, 8 jle .LBB0_248 # %bb.143: cmp esi, 9 je .LBB0_365 # %bb.144: cmp esi, 11 je .LBB0_368 # %bb.145: cmp esi, 12 jne .LBB0_1526 # %bb.146: test r8d, r8d jle .LBB0_1526 # %bb.147: mov r9d, r8d cmp r8d, 4 jae .LBB0_506 # %bb.148: xor esi, esi jmp .LBB0_979 .LBB0_149: cmp esi, 8 jle .LBB0_253 # %bb.150: cmp esi, 9 je .LBB0_371 # %bb.151: cmp esi, 11 je .LBB0_374 # %bb.152: cmp esi, 12 jne .LBB0_1526 # %bb.153: test r8d, r8d jle .LBB0_1526 # %bb.154: mov esi, r8d lea rdi, [rsi - 1] mov r8d, esi and r8d, 3 cmp rdi, 3 jae .LBB0_509 # %bb.155: xor edi, edi jmp .LBB0_511 .LBB0_156: cmp esi, 8 jle .LBB0_258 # %bb.157: cmp esi, 9 je .LBB0_377 # %bb.158: cmp esi, 11 je .LBB0_380 # %bb.159: cmp esi, 12 jne .LBB0_1526 # %bb.160: test r8d, r8d jle .LBB0_1526 # %bb.161: mov esi, r8d lea rdi, [rsi - 1] mov eax, esi and eax, 3 cmp rdi, 3 jae .LBB0_514 # %bb.162: xor edi, edi jmp .LBB0_516 .LBB0_163: cmp esi, 2 je .LBB0_383 # %bb.164: cmp esi, 3 jne .LBB0_1526 # %bb.165: test r8d, r8d jle .LBB0_1526 # %bb.166: mov r9d, r8d cmp r8d, 8 jb .LBB0_167 # %bb.519: lea rax, [rdx + 4*r9] cmp rax, rcx jbe .LBB0_773 # %bb.520: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_773 .LBB0_167: xor esi, esi .LBB0_1498: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1500 .LBB0_1499: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 4*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1499 .LBB0_1500: cmp r8, 3 jb .LBB0_1526 .LBB0_1501: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 4*rsi] mov 
byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + 4*rsi + 4] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + 4*rsi + 8] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + 4*rsi + 12] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1501 jmp .LBB0_1526 .LBB0_168: cmp esi, 2 je .LBB0_386 # %bb.169: cmp esi, 3 jne .LBB0_1526 # %bb.170: test r8d, r8d jle .LBB0_1526 # %bb.171: mov r9d, r8d cmp r8d, 4 jb .LBB0_172 # %bb.522: lea rax, [rdx + 8*r9] cmp rax, rcx jbe .LBB0_776 # %bb.523: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_776 .LBB0_172: xor esi, esi .LBB0_1506: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1508 .LBB0_1507: # =>This Inner Loop Header: Depth=1 cvttsd2si eax, qword ptr [rdx + 8*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1507 .LBB0_1508: cmp r8, 3 jb .LBB0_1526 .LBB0_1509: # =>This Inner Loop Header: Depth=1 cvttsd2si eax, qword ptr [rdx + 8*rsi] mov byte ptr [rcx + rsi], al cvttsd2si eax, qword ptr [rdx + 8*rsi + 8] mov byte ptr [rcx + rsi + 1], al cvttsd2si eax, qword ptr [rdx + 8*rsi + 16] mov byte ptr [rcx + rsi + 2], al cvttsd2si eax, qword ptr [rdx + 8*rsi + 24] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1509 jmp .LBB0_1526 .LBB0_173: cmp esi, 2 je .LBB0_389 # %bb.174: cmp esi, 3 jne .LBB0_1526 # %bb.175: test r8d, r8d jle .LBB0_1526 # %bb.176: mov r9d, r8d cmp r8d, 32 jb .LBB0_177 # %bb.525: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_779 # %bb.526: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_779 .LBB0_177: xor esi, esi .LBB0_1134: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1136 .LBB0_1135: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1135 .LBB0_1136: cmp r8, 3 jb .LBB0_1526 .LBB0_1137: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + rsi + 1] mov byte ptr [rcx + rsi + 1], 
al movzx eax, byte ptr [rdx + rsi + 2] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + rsi + 3] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1137 jmp .LBB0_1526 .LBB0_178: cmp esi, 2 je .LBB0_392 # %bb.179: cmp esi, 3 jne .LBB0_1526 # %bb.180: test r8d, r8d jle .LBB0_1526 # %bb.181: mov r9d, r8d cmp r8d, 4 jb .LBB0_182 # %bb.528: lea rax, [rdx + 8*r9] cmp rax, rcx jbe .LBB0_781 # %bb.529: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_781 .LBB0_182: xor esi, esi .LBB0_1322: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1324 .LBB0_1323: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 8*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1323 .LBB0_1324: cmp r8, 3 jb .LBB0_1526 .LBB0_1325: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 8*rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + 8*rsi + 8] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + 8*rsi + 16] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + 8*rsi + 24] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1325 jmp .LBB0_1526 .LBB0_183: cmp esi, 2 je .LBB0_395 # %bb.184: cmp esi, 3 jne .LBB0_1526 # %bb.185: test r8d, r8d jle .LBB0_1526 # %bb.186: mov r9d, r8d cmp r8d, 16 jb .LBB0_187 # %bb.531: lea rax, [rdx + 2*r9] cmp rax, rcx jbe .LBB0_784 # %bb.532: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_784 .LBB0_187: xor esi, esi .LBB0_1330: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1332 .LBB0_1331: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 2*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1331 .LBB0_1332: cmp r8, 3 jb .LBB0_1526 .LBB0_1333: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 2*rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + 2*rsi + 2] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + 2*rsi + 4] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + 2*rsi + 6] 
mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1333 jmp .LBB0_1526 .LBB0_188: cmp esi, 2 je .LBB0_398 # %bb.189: cmp esi, 3 jne .LBB0_1526 # %bb.190: test r8d, r8d jle .LBB0_1526 # %bb.191: mov r9d, r8d cmp r8d, 16 jb .LBB0_192 # %bb.534: lea rax, [rdx + 2*r9] cmp rax, rcx jbe .LBB0_787 # %bb.535: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_787 .LBB0_192: xor esi, esi .LBB0_1514: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1516 .LBB0_1515: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 2*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1515 .LBB0_1516: cmp r8, 3 jb .LBB0_1526 .LBB0_1517: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 2*rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + 2*rsi + 2] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + 2*rsi + 4] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + 2*rsi + 6] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1517 jmp .LBB0_1526 .LBB0_193: cmp esi, 2 je .LBB0_401 # %bb.194: cmp esi, 3 jne .LBB0_1526 # %bb.195: test r8d, r8d jle .LBB0_1526 # %bb.196: mov r9d, r8d cmp r8d, 4 jb .LBB0_197 # %bb.537: lea rax, [rdx + 8*r9] cmp rax, rcx jbe .LBB0_790 # %bb.538: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_790 .LBB0_197: xor esi, esi .LBB0_1338: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1340 .LBB0_1339: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 8*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1339 .LBB0_1340: cmp r8, 3 jb .LBB0_1526 .LBB0_1341: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 8*rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + 8*rsi + 8] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + 8*rsi + 16] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + 8*rsi + 24] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1341 jmp .LBB0_1526 .LBB0_198: cmp esi, 2 je 
.LBB0_404 # %bb.199: cmp esi, 3 jne .LBB0_1526 # %bb.200: test r8d, r8d jle .LBB0_1526 # %bb.201: mov r9d, r8d cmp r8d, 8 jb .LBB0_202 # %bb.540: lea rax, [rdx + 4*r9] cmp rax, rcx jbe .LBB0_793 # %bb.541: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_793 .LBB0_202: xor esi, esi .LBB0_1522: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1524 .LBB0_1523: # =>This Inner Loop Header: Depth=1 cvttss2si eax, dword ptr [rdx + 4*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1523 .LBB0_1524: cmp r8, 3 jb .LBB0_1526 .LBB0_1525: # =>This Inner Loop Header: Depth=1 cvttss2si eax, dword ptr [rdx + 4*rsi] mov byte ptr [rcx + rsi], al cvttss2si eax, dword ptr [rdx + 4*rsi + 4] mov byte ptr [rcx + rsi + 1], al cvttss2si eax, dword ptr [rdx + 4*rsi + 8] mov byte ptr [rcx + rsi + 2], al cvttss2si eax, dword ptr [rdx + 4*rsi + 12] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1525 jmp .LBB0_1526 .LBB0_203: cmp esi, 2 je .LBB0_407 # %bb.204: cmp esi, 3 jne .LBB0_1526 # %bb.205: test r8d, r8d jle .LBB0_1526 # %bb.206: mov r9d, r8d cmp r8d, 32 jb .LBB0_207 # %bb.543: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_796 # %bb.544: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_796 .LBB0_207: xor esi, esi .LBB0_1144: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1146 .LBB0_1145: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1145 .LBB0_1146: cmp r8, 3 jb .LBB0_1526 .LBB0_1147: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + rsi + 1] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + rsi + 2] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + rsi + 3] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1147 jmp .LBB0_1526 .LBB0_208: cmp esi, 2 je .LBB0_410 # %bb.209: cmp esi, 3 jne .LBB0_1526 # %bb.210: test r8d, r8d jle .LBB0_1526 # %bb.211: 
mov r9d, r8d cmp r8d, 8 jb .LBB0_212 # %bb.546: lea rax, [rdx + 4*r9] cmp rax, rcx jbe .LBB0_798 # %bb.547: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_798 .LBB0_212: xor esi, esi .LBB0_1346: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1348 .LBB0_1347: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 4*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1347 .LBB0_1348: cmp r8, 3 jb .LBB0_1526 .LBB0_1349: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 4*rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + 4*rsi + 4] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + 4*rsi + 8] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + 4*rsi + 12] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1349 jmp .LBB0_1526 .LBB0_213: cmp esi, 7 je .LBB0_413 # %bb.214: cmp esi, 8 jne .LBB0_1526 # %bb.215: test r8d, r8d jle .LBB0_1526 # %bb.216: mov r9d, r8d cmp r8d, 4 jae .LBB0_549 # %bb.217: xor esi, esi jmp .LBB0_807 .LBB0_218: cmp esi, 7 je .LBB0_416 # %bb.219: cmp esi, 8 jne .LBB0_1526 # %bb.220: test r8d, r8d jle .LBB0_1526 # %bb.221: mov r9d, r8d lea rax, [r9 - 1] mov r8d, r9d and r8d, 3 movabs r10, -9223372036854775808 cmp rax, 3 jae .LBB0_551 # %bb.222: xor eax, eax jmp .LBB0_553 .LBB0_223: cmp esi, 7 je .LBB0_419 # %bb.224: cmp esi, 8 jne .LBB0_1526 # %bb.225: test r8d, r8d jle .LBB0_1526 # %bb.226: mov r9d, r8d cmp r8d, 4 jb .LBB0_227 # %bb.556: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_808 # %bb.557: lea rax, [rcx + 8*r9] cmp rax, rdx jbe .LBB0_808 .LBB0_227: xor esi, esi .LBB0_1154: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1156 .LBB0_1155: # =>This Inner Loop Header: Depth=1 movsx rax, byte ptr [rdx + rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 add rdi, -1 jne .LBB0_1155 .LBB0_1156: cmp r8, 3 jb .LBB0_1526 .LBB0_1157: # =>This Inner Loop Header: Depth=1 movsx rax, byte ptr [rdx + rsi] mov qword ptr [rcx + 8*rsi], rax movsx rax, byte ptr [rdx 
+ rsi + 1] mov qword ptr [rcx + 8*rsi + 8], rax movsx rax, byte ptr [rdx + rsi + 2] mov qword ptr [rcx + 8*rsi + 16], rax movsx rax, byte ptr [rdx + rsi + 3] mov qword ptr [rcx + 8*rsi + 24], rax add rsi, 4 cmp r9, rsi jne .LBB0_1157 jmp .LBB0_1526 .LBB0_228: cmp esi, 7 je .LBB0_422 # %bb.229: cmp esi, 8 jne .LBB0_1526 # %bb.230: test r8d, r8d jle .LBB0_1526 # %bb.231: mov r9d, r8d cmp r8d, 4 jb .LBB0_232 # %bb.559: lea rax, [rdx + 8*r9] cmp rax, rcx jbe .LBB0_810 # %bb.560: lea rax, [rcx + 8*r9] cmp rax, rdx jbe .LBB0_810 .LBB0_232: xor esi, esi .LBB0_1164: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1166 .LBB0_1165: # =>This Inner Loop Header: Depth=1 mov rax, qword ptr [rdx + 8*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 add rdi, -1 jne .LBB0_1165 .LBB0_1166: cmp r8, 3 jb .LBB0_1526 .LBB0_1167: # =>This Inner Loop Header: Depth=1 mov rax, qword ptr [rdx + 8*rsi] mov qword ptr [rcx + 8*rsi], rax mov rax, qword ptr [rdx + 8*rsi + 8] mov qword ptr [rcx + 8*rsi + 8], rax mov rax, qword ptr [rdx + 8*rsi + 16] mov qword ptr [rcx + 8*rsi + 16], rax mov rax, qword ptr [rdx + 8*rsi + 24] mov qword ptr [rcx + 8*rsi + 24], rax add rsi, 4 cmp r9, rsi jne .LBB0_1167 jmp .LBB0_1526 .LBB0_233: cmp esi, 7 je .LBB0_425 # %bb.234: cmp esi, 8 jne .LBB0_1526 # %bb.235: test r8d, r8d jle .LBB0_1526 # %bb.236: mov r9d, r8d cmp r8d, 4 jae .LBB0_562 # %bb.237: xor esi, esi jmp .LBB0_818 .LBB0_238: cmp esi, 7 je .LBB0_428 # %bb.239: cmp esi, 8 jne .LBB0_1526 # %bb.240: test r8d, r8d jle .LBB0_1526 # %bb.241: mov r9d, r8d cmp r8d, 4 jae .LBB0_564 # %bb.242: xor esi, esi jmp .LBB0_825 .LBB0_243: cmp esi, 7 je .LBB0_431 # %bb.244: cmp esi, 8 jne .LBB0_1526 # %bb.245: test r8d, r8d jle .LBB0_1526 # %bb.246: mov r9d, r8d cmp r8d, 4 jb .LBB0_247 # %bb.566: lea rax, [rdx + 8*r9] cmp rax, rcx jbe .LBB0_826 # %bb.567: lea rax, [rcx + 8*r9] cmp rax, rdx jbe .LBB0_826 .LBB0_247: xor esi, esi .LBB0_1174: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1176 
.LBB0_1175: # =>This Inner Loop Header: Depth=1 mov rax, qword ptr [rdx + 8*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 add rdi, -1 jne .LBB0_1175 .LBB0_1176: cmp r8, 3 jb .LBB0_1526 .LBB0_1177: # =>This Inner Loop Header: Depth=1 mov rax, qword ptr [rdx + 8*rsi] mov qword ptr [rcx + 8*rsi], rax mov rax, qword ptr [rdx + 8*rsi + 8] mov qword ptr [rcx + 8*rsi + 8], rax mov rax, qword ptr [rdx + 8*rsi + 16] mov qword ptr [rcx + 8*rsi + 16], rax mov rax, qword ptr [rdx + 8*rsi + 24] mov qword ptr [rcx + 8*rsi + 24], rax add rsi, 4 cmp r9, rsi jne .LBB0_1177 jmp .LBB0_1526 .LBB0_248: cmp esi, 7 je .LBB0_434 # %bb.249: cmp esi, 8 jne .LBB0_1526 # %bb.250: test r8d, r8d jle .LBB0_1526 # %bb.251: mov r9d, r8d lea rax, [r9 - 1] mov r8d, r9d and r8d, 3 cmp rax, 3 jae .LBB0_569 # %bb.252: xor edi, edi jmp .LBB0_571 .LBB0_253: cmp esi, 7 je .LBB0_437 # %bb.254: cmp esi, 8 jne .LBB0_1526 # %bb.255: test r8d, r8d jle .LBB0_1526 # %bb.256: mov r9d, r8d cmp r8d, 4 jb .LBB0_257 # %bb.574: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_828 # %bb.575: lea rax, [rcx + 8*r9] cmp rax, rdx jbe .LBB0_828 .LBB0_257: xor esi, esi .LBB0_1184: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1186 .LBB0_1185: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 add rdi, -1 jne .LBB0_1185 .LBB0_1186: cmp r8, 3 jb .LBB0_1526 .LBB0_1187: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov qword ptr [rcx + 8*rsi], rax movzx eax, byte ptr [rdx + rsi + 1] mov qword ptr [rcx + 8*rsi + 8], rax movzx eax, byte ptr [rdx + rsi + 2] mov qword ptr [rcx + 8*rsi + 16], rax movzx eax, byte ptr [rdx + rsi + 3] mov qword ptr [rcx + 8*rsi + 24], rax add rsi, 4 cmp r9, rsi jne .LBB0_1187 jmp .LBB0_1526 .LBB0_258: cmp esi, 7 je .LBB0_440 # %bb.259: cmp esi, 8 jne .LBB0_1526 # %bb.260: test r8d, r8d jle .LBB0_1526 # %bb.261: mov r9d, r8d cmp r8d, 4 jae .LBB0_577 # %bb.262: xor esi, esi jmp .LBB0_836 .LBB0_263: test r8d, 
r8d jle .LBB0_1526 # %bb.264: mov r9d, r8d cmp r8d, 8 jae .LBB0_579 # %bb.265: xor esi, esi jmp .LBB0_984 .LBB0_266: test r8d, r8d jle .LBB0_1526 # %bb.267: mov r9d, r8d cmp r8d, 8 jae .LBB0_582 # %bb.268: xor esi, esi jmp .LBB0_989 .LBB0_269: test r8d, r8d jle .LBB0_1526 # %bb.270: mov r9d, r8d cmp r8d, 4 jae .LBB0_585 # %bb.271: xor esi, esi jmp .LBB0_994 .LBB0_272: test r8d, r8d jle .LBB0_1526 # %bb.273: mov r9d, r8d cmp r8d, 4 jae .LBB0_588 # %bb.274: xor esi, esi jmp .LBB0_999 .LBB0_275: test r8d, r8d jle .LBB0_1526 # %bb.276: mov r9d, r8d cmp r8d, 16 jb .LBB0_277 # %bb.591: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_837 # %bb.592: lea rax, [rcx + 2*r9] cmp rax, rdx jbe .LBB0_837 .LBB0_277: xor esi, esi .LBB0_1354: mov r8, rsi not r8 add r8, r9 mov rax, r9 and rax, 3 je .LBB0_1356 .LBB0_1355: # =>This Inner Loop Header: Depth=1 movsx edi, byte ptr [rdx + rsi] mov word ptr [rcx + 2*rsi], di add rsi, 1 add rax, -1 jne .LBB0_1355 .LBB0_1356: cmp r8, 3 jb .LBB0_1526 .LBB0_1357: # =>This Inner Loop Header: Depth=1 movsx eax, byte ptr [rdx + rsi] mov word ptr [rcx + 2*rsi], ax movsx eax, byte ptr [rdx + rsi + 1] mov word ptr [rcx + 2*rsi + 2], ax movsx eax, byte ptr [rdx + rsi + 2] mov word ptr [rcx + 2*rsi + 4], ax movsx eax, byte ptr [rdx + rsi + 3] mov word ptr [rcx + 2*rsi + 6], ax add rsi, 4 cmp r9, rsi jne .LBB0_1357 jmp .LBB0_1526 .LBB0_278: test r8d, r8d jle .LBB0_1526 # %bb.279: mov r9d, r8d cmp r8d, 16 jb .LBB0_280 # %bb.594: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_840 # %bb.595: lea rax, [rcx + 2*r9] cmp rax, rdx jbe .LBB0_840 .LBB0_280: xor esi, esi .LBB0_1362: mov r8, rsi not r8 add r8, r9 mov rax, r9 and rax, 3 je .LBB0_1364 .LBB0_1363: # =>This Inner Loop Header: Depth=1 movsx edi, byte ptr [rdx + rsi] mov word ptr [rcx + 2*rsi], di add rsi, 1 add rax, -1 jne .LBB0_1363 .LBB0_1364: cmp r8, 3 jb .LBB0_1526 .LBB0_1365: # =>This Inner Loop Header: Depth=1 movsx eax, byte ptr [rdx + rsi] mov word ptr [rcx + 2*rsi], ax movsx eax, byte ptr [rdx + rsi + 
1] mov word ptr [rcx + 2*rsi + 2], ax movsx eax, byte ptr [rdx + rsi + 2] mov word ptr [rcx + 2*rsi + 4], ax movsx eax, byte ptr [rdx + rsi + 3] mov word ptr [rcx + 2*rsi + 6], ax add rsi, 4 cmp r9, rsi jne .LBB0_1365 jmp .LBB0_1526 .LBB0_281: test r8d, r8d jle .LBB0_1526 # %bb.282: mov r9d, r8d cmp r8d, 4 jae .LBB0_597 # %bb.283: xor esi, esi jmp .LBB0_1004 .LBB0_284: test r8d, r8d jle .LBB0_1526 # %bb.285: mov r9d, r8d cmp r8d, 4 jae .LBB0_600 # %bb.286: xor esi, esi jmp .LBB0_1009 .LBB0_287: test r8d, r8d jle .LBB0_1526 # %bb.288: mov r9d, r8d cmp r8d, 16 jb .LBB0_289 # %bb.603: lea rax, [rdx + 2*r9] cmp rax, rcx jbe .LBB0_843 # %bb.604: lea rax, [rcx + 2*r9] cmp rax, rdx jbe .LBB0_843 .LBB0_289: xor esi, esi .LBB0_1194: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1196 .LBB0_1195: # =>This Inner Loop Header: Depth=1 movzx eax, word ptr [rdx + 2*rsi] mov word ptr [rcx + 2*rsi], ax add rsi, 1 add rdi, -1 jne .LBB0_1195 .LBB0_1196: cmp r8, 3 jb .LBB0_1526 .LBB0_1197: # =>This Inner Loop Header: Depth=1 movzx eax, word ptr [rdx + 2*rsi] mov word ptr [rcx + 2*rsi], ax movzx eax, word ptr [rdx + 2*rsi + 2] mov word ptr [rcx + 2*rsi + 2], ax movzx eax, word ptr [rdx + 2*rsi + 4] mov word ptr [rcx + 2*rsi + 4], ax movzx eax, word ptr [rdx + 2*rsi + 6] mov word ptr [rcx + 2*rsi + 6], ax add rsi, 4 cmp r9, rsi jne .LBB0_1197 jmp .LBB0_1526 .LBB0_290: test r8d, r8d jle .LBB0_1526 # %bb.291: mov r9d, r8d cmp r8d, 16 jb .LBB0_292 # %bb.606: lea rax, [rdx + 2*r9] cmp rax, rcx jbe .LBB0_845 # %bb.607: lea rax, [rcx + 2*r9] cmp rax, rdx jbe .LBB0_845 .LBB0_292: xor esi, esi .LBB0_1204: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1206 .LBB0_1205: # =>This Inner Loop Header: Depth=1 movzx eax, word ptr [rdx + 2*rsi] mov word ptr [rcx + 2*rsi], ax add rsi, 1 add rdi, -1 jne .LBB0_1205 .LBB0_1206: cmp r8, 3 jb .LBB0_1526 .LBB0_1207: # =>This Inner Loop Header: Depth=1 movzx eax, word ptr [rdx + 2*rsi] mov word ptr [rcx + 2*rsi], ax movzx eax, 
word ptr [rdx + 2*rsi + 2] mov word ptr [rcx + 2*rsi + 2], ax movzx eax, word ptr [rdx + 2*rsi + 4] mov word ptr [rcx + 2*rsi + 4], ax movzx eax, word ptr [rdx + 2*rsi + 6] mov word ptr [rcx + 2*rsi + 6], ax add rsi, 4 cmp r9, rsi jne .LBB0_1207 jmp .LBB0_1526 .LBB0_293: test r8d, r8d jle .LBB0_1526 # %bb.294: mov r9d, r8d cmp r8d, 16 jb .LBB0_295 # %bb.609: lea rax, [rdx + 2*r9] cmp rax, rcx jbe .LBB0_847 # %bb.610: lea rax, [rcx + 2*r9] cmp rax, rdx jbe .LBB0_847 .LBB0_295: xor esi, esi .LBB0_1214: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1216 .LBB0_1215: # =>This Inner Loop Header: Depth=1 movzx eax, word ptr [rdx + 2*rsi] mov word ptr [rcx + 2*rsi], ax add rsi, 1 add rdi, -1 jne .LBB0_1215 .LBB0_1216: cmp r8, 3 jb .LBB0_1526 .LBB0_1217: # =>This Inner Loop Header: Depth=1 movzx eax, word ptr [rdx + 2*rsi] mov word ptr [rcx + 2*rsi], ax movzx eax, word ptr [rdx + 2*rsi + 2] mov word ptr [rcx + 2*rsi + 2], ax movzx eax, word ptr [rdx + 2*rsi + 4] mov word ptr [rcx + 2*rsi + 4], ax movzx eax, word ptr [rdx + 2*rsi + 6] mov word ptr [rcx + 2*rsi + 6], ax add rsi, 4 cmp r9, rsi jne .LBB0_1217 jmp .LBB0_1526 .LBB0_296: test r8d, r8d jle .LBB0_1526 # %bb.297: mov r9d, r8d cmp r8d, 16 jb .LBB0_298 # %bb.612: lea rax, [rdx + 2*r9] cmp rax, rcx jbe .LBB0_849 # %bb.613: lea rax, [rcx + 2*r9] cmp rax, rdx jbe .LBB0_849 .LBB0_298: xor esi, esi .LBB0_1224: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1226 .LBB0_1225: # =>This Inner Loop Header: Depth=1 movzx eax, word ptr [rdx + 2*rsi] mov word ptr [rcx + 2*rsi], ax add rsi, 1 add rdi, -1 jne .LBB0_1225 .LBB0_1226: cmp r8, 3 jb .LBB0_1526 .LBB0_1227: # =>This Inner Loop Header: Depth=1 movzx eax, word ptr [rdx + 2*rsi] mov word ptr [rcx + 2*rsi], ax movzx eax, word ptr [rdx + 2*rsi + 2] mov word ptr [rcx + 2*rsi + 2], ax movzx eax, word ptr [rdx + 2*rsi + 4] mov word ptr [rcx + 2*rsi + 4], ax movzx eax, word ptr [rdx + 2*rsi + 6] mov word ptr [rcx + 2*rsi + 6], ax add rsi, 4 cmp r9, rsi 
jne .LBB0_1227 jmp .LBB0_1526 .LBB0_299: test r8d, r8d jle .LBB0_1526 # %bb.300: mov r9d, r8d cmp r8d, 4 jae .LBB0_615 # %bb.301: xor esi, esi jmp .LBB0_1014 .LBB0_302: test r8d, r8d jle .LBB0_1526 # %bb.303: mov r9d, r8d cmp r8d, 4 jae .LBB0_618 # %bb.304: xor esi, esi jmp .LBB0_1019 .LBB0_305: test r8d, r8d jle .LBB0_1526 # %bb.306: mov r9d, r8d cmp r8d, 8 jae .LBB0_621 # %bb.307: xor esi, esi jmp .LBB0_1024 .LBB0_308: test r8d, r8d jle .LBB0_1526 # %bb.309: mov r9d, r8d cmp r8d, 8 jae .LBB0_624 # %bb.310: xor esi, esi jmp .LBB0_1029 .LBB0_311: test r8d, r8d jle .LBB0_1526 # %bb.312: mov r9d, r8d cmp r8d, 16 jb .LBB0_313 # %bb.627: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_851 # %bb.628: lea rax, [rcx + 2*r9] cmp rax, rdx jbe .LBB0_851 .LBB0_313: xor esi, esi .LBB0_1370: mov r8, rsi not r8 add r8, r9 mov rax, r9 and rax, 3 je .LBB0_1372 .LBB0_1371: # =>This Inner Loop Header: Depth=1 movzx edi, byte ptr [rdx + rsi] mov word ptr [rcx + 2*rsi], di add rsi, 1 add rax, -1 jne .LBB0_1371 .LBB0_1372: cmp r8, 3 jb .LBB0_1526 .LBB0_1373: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov word ptr [rcx + 2*rsi], ax movzx eax, byte ptr [rdx + rsi + 1] mov word ptr [rcx + 2*rsi + 2], ax movzx eax, byte ptr [rdx + rsi + 2] mov word ptr [rcx + 2*rsi + 4], ax movzx eax, byte ptr [rdx + rsi + 3] mov word ptr [rcx + 2*rsi + 6], ax add rsi, 4 cmp r9, rsi jne .LBB0_1373 jmp .LBB0_1526 .LBB0_314: test r8d, r8d jle .LBB0_1526 # %bb.315: mov r9d, r8d cmp r8d, 16 jb .LBB0_316 # %bb.630: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_854 # %bb.631: lea rax, [rcx + 2*r9] cmp rax, rdx jbe .LBB0_854 .LBB0_316: xor esi, esi .LBB0_1378: mov r8, rsi not r8 add r8, r9 mov rax, r9 and rax, 3 je .LBB0_1380 .LBB0_1379: # =>This Inner Loop Header: Depth=1 movzx edi, byte ptr [rdx + rsi] mov word ptr [rcx + 2*rsi], di add rsi, 1 add rax, -1 jne .LBB0_1379 .LBB0_1380: cmp r8, 3 jb .LBB0_1526 .LBB0_1381: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov word 
ptr [rcx + 2*rsi], ax movzx eax, byte ptr [rdx + rsi + 1] mov word ptr [rcx + 2*rsi + 2], ax movzx eax, byte ptr [rdx + rsi + 2] mov word ptr [rcx + 2*rsi + 4], ax movzx eax, byte ptr [rdx + rsi + 3] mov word ptr [rcx + 2*rsi + 6], ax add rsi, 4 cmp r9, rsi jne .LBB0_1381 jmp .LBB0_1526 .LBB0_317: test r8d, r8d jle .LBB0_1526 # %bb.318: mov r9d, r8d cmp r8d, 8 jae .LBB0_633 # %bb.319: xor esi, esi jmp .LBB0_1034 .LBB0_320: test r8d, r8d jle .LBB0_1526 # %bb.321: mov r9d, r8d cmp r8d, 8 jae .LBB0_636 # %bb.322: xor esi, esi jmp .LBB0_1039 .LBB0_323: test r8d, r8d jle .LBB0_1526 # %bb.324: mov r9d, r8d cmp r8d, 4 jae .LBB0_639 # %bb.325: xor esi, esi jmp .LBB0_863 .LBB0_326: test r8d, r8d jle .LBB0_1526 # %bb.327: mov r9d, r8d cmp r8d, 8 jae .LBB0_641 # %bb.328: xor esi, esi jmp .LBB0_1044 .LBB0_329: test r8d, r8d jle .LBB0_1526 # %bb.330: mov esi, r8d lea rdi, [rsi - 1] mov r8d, esi and r8d, 3 cmp rdi, 3 jae .LBB0_644 # %bb.331: xor edi, edi jmp .LBB0_646 .LBB0_332: test r8d, r8d jle .LBB0_1526 # %bb.333: mov r9d, r8d cmp r8d, 4 jae .LBB0_649 # %bb.334: xor esi, esi jmp .LBB0_1049 .LBB0_335: test r8d, r8d jle .LBB0_1526 # %bb.336: mov r9d, r8d cmp r8d, 4 jb .LBB0_337 # %bb.652: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_864 # %bb.653: lea rax, [rcx + 8*r9] cmp rax, rdx jbe .LBB0_864 .LBB0_337: xor esi, esi .LBB0_1234: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1236 .LBB0_1235: # =>This Inner Loop Header: Depth=1 movsx rax, byte ptr [rdx + rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 add rdi, -1 jne .LBB0_1235 .LBB0_1236: cmp r8, 3 jb .LBB0_1526 .LBB0_1237: # =>This Inner Loop Header: Depth=1 movsx rax, byte ptr [rdx + rsi] mov qword ptr [rcx + 8*rsi], rax movsx rax, byte ptr [rdx + rsi + 1] mov qword ptr [rcx + 8*rsi + 8], rax movsx rax, byte ptr [rdx + rsi + 2] mov qword ptr [rcx + 8*rsi + 16], rax movsx rax, byte ptr [rdx + rsi + 3] mov qword ptr [rcx + 8*rsi + 24], rax add rsi, 4 cmp r9, rsi jne .LBB0_1237 jmp .LBB0_1526 .LBB0_338: test 
r8d, r8d jle .LBB0_1526 # %bb.339: mov r9d, r8d cmp r8d, 8 jb .LBB0_340 # %bb.655: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_866 # %bb.656: lea rax, [rcx + 4*r9] cmp rax, rdx jbe .LBB0_866 .LBB0_340: xor esi, esi .LBB0_1386: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1388 .LBB0_1387: # =>This Inner Loop Header: Depth=1 movsx eax, byte ptr [rdx + rsi] xorps xmm0, xmm0 cvtsi2ss xmm0, eax movss dword ptr [rcx + 4*rsi], xmm0 add rsi, 1 add rdi, -1 jne .LBB0_1387 .LBB0_1388: cmp r8, 3 jb .LBB0_1526 .LBB0_1389: # =>This Inner Loop Header: Depth=1 movsx eax, byte ptr [rdx + rsi] xorps xmm0, xmm0 cvtsi2ss xmm0, eax movss dword ptr [rcx + 4*rsi], xmm0 movsx eax, byte ptr [rdx + rsi + 1] xorps xmm0, xmm0 cvtsi2ss xmm0, eax movss dword ptr [rcx + 4*rsi + 4], xmm0 movsx eax, byte ptr [rdx + rsi + 2] xorps xmm0, xmm0 cvtsi2ss xmm0, eax movss dword ptr [rcx + 4*rsi + 8], xmm0 movsx eax, byte ptr [rdx + rsi + 3] xorps xmm0, xmm0 cvtsi2ss xmm0, eax movss dword ptr [rcx + 4*rsi + 12], xmm0 add rsi, 4 cmp r9, rsi jne .LBB0_1389 jmp .LBB0_1526 .LBB0_341: test r8d, r8d jle .LBB0_1526 # %bb.342: mov r9d, r8d cmp r8d, 4 jb .LBB0_343 # %bb.658: lea rax, [rdx + 8*r9] cmp rax, rcx jbe .LBB0_869 # %bb.659: lea rax, [rcx + 8*r9] cmp rax, rdx jbe .LBB0_869 .LBB0_343: xor esi, esi .LBB0_1244: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1246 .LBB0_1245: # =>This Inner Loop Header: Depth=1 mov rax, qword ptr [rdx + 8*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 add rdi, -1 jne .LBB0_1245 .LBB0_1246: cmp r8, 3 jb .LBB0_1526 .LBB0_1247: # =>This Inner Loop Header: Depth=1 mov rax, qword ptr [rdx + 8*rsi] mov qword ptr [rcx + 8*rsi], rax mov rax, qword ptr [rdx + 8*rsi + 8] mov qword ptr [rcx + 8*rsi + 8], rax mov rax, qword ptr [rdx + 8*rsi + 16] mov qword ptr [rcx + 8*rsi + 16], rax mov rax, qword ptr [rdx + 8*rsi + 24] mov qword ptr [rcx + 8*rsi + 24], rax add rsi, 4 cmp r9, rsi jne .LBB0_1247 jmp .LBB0_1526 .LBB0_344: test r8d, r8d jle .LBB0_1526 # 
%bb.345: mov r9d, r8d cmp r8d, 4 jae .LBB0_661 # %bb.346: xor esi, esi jmp .LBB0_1056 .LBB0_347: test r8d, r8d jle .LBB0_1526 # %bb.348: mov r9d, r8d cmp r8d, 4 jae .LBB0_664 # %bb.349: xor esi, esi jmp .LBB0_877 .LBB0_350: test r8d, r8d jle .LBB0_1526 # %bb.351: mov r9d, r8d cmp r8d, 8 jae .LBB0_666 # %bb.352: xor esi, esi jmp .LBB0_1062 .LBB0_353: test r8d, r8d jle .LBB0_1526 # %bb.354: mov r9d, r8d cmp r8d, 4 jae .LBB0_669 # %bb.355: xor esi, esi jmp .LBB0_884 .LBB0_356: test r8d, r8d jle .LBB0_1526 # %bb.357: mov r9d, r8d cmp r8d, 8 jae .LBB0_671 # %bb.358: xor esi, esi jmp .LBB0_1067 .LBB0_359: test r8d, r8d jle .LBB0_1526 # %bb.360: mov r9d, r8d cmp r8d, 4 jb .LBB0_361 # %bb.674: lea rax, [rdx + 8*r9] cmp rax, rcx jbe .LBB0_885 # %bb.675: lea rax, [rcx + 8*r9] cmp rax, rdx jbe .LBB0_885 .LBB0_361: xor esi, esi .LBB0_1254: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1256 .LBB0_1255: # =>This Inner Loop Header: Depth=1 mov rax, qword ptr [rdx + 8*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 add rdi, -1 jne .LBB0_1255 .LBB0_1256: cmp r8, 3 jb .LBB0_1526 .LBB0_1257: # =>This Inner Loop Header: Depth=1 mov rax, qword ptr [rdx + 8*rsi] mov qword ptr [rcx + 8*rsi], rax mov rax, qword ptr [rdx + 8*rsi + 8] mov qword ptr [rcx + 8*rsi + 8], rax mov rax, qword ptr [rdx + 8*rsi + 16] mov qword ptr [rcx + 8*rsi + 16], rax mov rax, qword ptr [rdx + 8*rsi + 24] mov qword ptr [rcx + 8*rsi + 24], rax add rsi, 4 cmp r9, rsi jne .LBB0_1257 jmp .LBB0_1526 .LBB0_362: test r8d, r8d jle .LBB0_1526 # %bb.363: mov esi, r8d lea rdi, [rsi - 1] mov eax, esi and eax, 3 cmp rdi, 3 jae .LBB0_677 # %bb.364: xor edi, edi jmp .LBB0_679 .LBB0_365: test r8d, r8d jle .LBB0_1526 # %bb.366: mov esi, r8d lea rdi, [rsi - 1] mov r8d, esi and r8d, 3 cmp rdi, 3 jae .LBB0_682 # %bb.367: xor edi, edi jmp .LBB0_684 .LBB0_368: test r8d, r8d jle .LBB0_1526 # %bb.369: mov r9d, r8d cmp r8d, 8 jb .LBB0_370 # %bb.687: lea rax, [rdx + 4*r9] cmp rax, rcx jbe .LBB0_887 # %bb.688: lea rax, 
[rcx + 4*r9] cmp rax, rdx jbe .LBB0_887 .LBB0_370: xor esi, esi .LBB0_1264: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 7 je .LBB0_1266 .LBB0_1265: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rsi] mov dword ptr [rcx + 4*rsi], eax add rsi, 1 add rdi, -1 jne .LBB0_1265 .LBB0_1266: cmp r8, 7 jb .LBB0_1526 .LBB0_1267: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rsi] mov dword ptr [rcx + 4*rsi], eax mov eax, dword ptr [rdx + 4*rsi + 4] mov dword ptr [rcx + 4*rsi + 4], eax mov eax, dword ptr [rdx + 4*rsi + 8] mov dword ptr [rcx + 4*rsi + 8], eax mov eax, dword ptr [rdx + 4*rsi + 12] mov dword ptr [rcx + 4*rsi + 12], eax mov eax, dword ptr [rdx + 4*rsi + 16] mov dword ptr [rcx + 4*rsi + 16], eax mov eax, dword ptr [rdx + 4*rsi + 20] mov dword ptr [rcx + 4*rsi + 20], eax mov eax, dword ptr [rdx + 4*rsi + 24] mov dword ptr [rcx + 4*rsi + 24], eax mov eax, dword ptr [rdx + 4*rsi + 28] mov dword ptr [rcx + 4*rsi + 28], eax add rsi, 8 cmp r9, rsi jne .LBB0_1267 jmp .LBB0_1526 .LBB0_371: test r8d, r8d jle .LBB0_1526 # %bb.372: mov r9d, r8d cmp r8d, 4 jb .LBB0_373 # %bb.690: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_889 # %bb.691: lea rax, [rcx + 8*r9] cmp rax, rdx jbe .LBB0_889 .LBB0_373: xor esi, esi .LBB0_1274: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1276 .LBB0_1275: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 add rdi, -1 jne .LBB0_1275 .LBB0_1276: cmp r8, 3 jb .LBB0_1526 .LBB0_1277: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov qword ptr [rcx + 8*rsi], rax movzx eax, byte ptr [rdx + rsi + 1] mov qword ptr [rcx + 8*rsi + 8], rax movzx eax, byte ptr [rdx + rsi + 2] mov qword ptr [rcx + 8*rsi + 16], rax movzx eax, byte ptr [rdx + rsi + 3] mov qword ptr [rcx + 8*rsi + 24], rax add rsi, 4 cmp r9, rsi jne .LBB0_1277 jmp .LBB0_1526 .LBB0_374: test r8d, r8d jle .LBB0_1526 # %bb.375: mov r9d, r8d cmp r8d, 8 jb .LBB0_376 # 
%bb.693: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_891 # %bb.694: lea rax, [rcx + 4*r9] cmp rax, rdx jbe .LBB0_891 .LBB0_376: xor esi, esi .LBB0_1394: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1396 .LBB0_1395: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] xorps xmm0, xmm0 cvtsi2ss xmm0, eax movss dword ptr [rcx + 4*rsi], xmm0 add rsi, 1 add rdi, -1 jne .LBB0_1395 .LBB0_1396: cmp r8, 3 jb .LBB0_1526 .LBB0_1397: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] xorps xmm0, xmm0 cvtsi2ss xmm0, eax movss dword ptr [rcx + 4*rsi], xmm0 movzx eax, byte ptr [rdx + rsi + 1] xorps xmm0, xmm0 cvtsi2ss xmm0, eax movss dword ptr [rcx + 4*rsi + 4], xmm0 movzx eax, byte ptr [rdx + rsi + 2] xorps xmm0, xmm0 cvtsi2ss xmm0, eax movss dword ptr [rcx + 4*rsi + 8], xmm0 movzx eax, byte ptr [rdx + rsi + 3] xorps xmm0, xmm0 cvtsi2ss xmm0, eax movss dword ptr [rcx + 4*rsi + 12], xmm0 add rsi, 4 cmp r9, rsi jne .LBB0_1397 jmp .LBB0_1526 .LBB0_377: test r8d, r8d jle .LBB0_1526 # %bb.378: mov r9d, r8d cmp r8d, 4 jae .LBB0_696 # %bb.379: xor esi, esi jmp .LBB0_900 .LBB0_380: test r8d, r8d jle .LBB0_1526 # %bb.381: mov r9d, r8d cmp r8d, 8 jae .LBB0_698 # %bb.382: xor esi, esi jmp .LBB0_1072 .LBB0_383: test r8d, r8d jle .LBB0_1526 # %bb.384: mov r9d, r8d cmp r8d, 8 jb .LBB0_385 # %bb.701: lea rax, [rdx + 4*r9] cmp rax, rcx jbe .LBB0_901 # %bb.702: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_901 .LBB0_385: xor esi, esi .LBB0_1402: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1404 .LBB0_1403: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 4*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1403 .LBB0_1404: cmp r8, 3 jb .LBB0_1526 .LBB0_1405: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 4*rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + 4*rsi + 4] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + 4*rsi + 8] mov byte ptr [rcx + rsi + 2], al movzx eax, 
byte ptr [rdx + 4*rsi + 12] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1405 jmp .LBB0_1526 .LBB0_386: test r8d, r8d jle .LBB0_1526 # %bb.387: mov r9d, r8d cmp r8d, 4 jb .LBB0_388 # %bb.704: lea rax, [rdx + 8*r9] cmp rax, rcx jbe .LBB0_904 # %bb.705: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_904 .LBB0_388: xor esi, esi .LBB0_1410: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1412 .LBB0_1411: # =>This Inner Loop Header: Depth=1 cvttsd2si eax, qword ptr [rdx + 8*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1411 .LBB0_1412: cmp r8, 3 jb .LBB0_1526 .LBB0_1413: # =>This Inner Loop Header: Depth=1 cvttsd2si eax, qword ptr [rdx + 8*rsi] mov byte ptr [rcx + rsi], al cvttsd2si eax, qword ptr [rdx + 8*rsi + 8] mov byte ptr [rcx + rsi + 1], al cvttsd2si eax, qword ptr [rdx + 8*rsi + 16] mov byte ptr [rcx + rsi + 2], al cvttsd2si eax, qword ptr [rdx + 8*rsi + 24] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1413 jmp .LBB0_1526 .LBB0_389: test r8d, r8d jle .LBB0_1526 # %bb.390: mov r9d, r8d cmp r8d, 32 jb .LBB0_391 # %bb.707: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_907 # %bb.708: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_907 .LBB0_391: xor esi, esi .LBB0_1284: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1286 .LBB0_1285: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1285 .LBB0_1286: cmp r8, 3 jb .LBB0_1526 .LBB0_1287: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + rsi + 1] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + rsi + 2] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + rsi + 3] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1287 jmp .LBB0_1526 .LBB0_392: test r8d, r8d jle .LBB0_1526 # %bb.393: mov r9d, r8d cmp r8d, 4 jb .LBB0_394 # %bb.710: lea rax, [rdx + 8*r9] cmp 
rax, rcx jbe .LBB0_909 # %bb.711: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_909 .LBB0_394: xor esi, esi .LBB0_1418: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1420 .LBB0_1419: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 8*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1419 .LBB0_1420: cmp r8, 3 jb .LBB0_1526 .LBB0_1421: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 8*rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + 8*rsi + 8] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + 8*rsi + 16] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + 8*rsi + 24] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1421 jmp .LBB0_1526 .LBB0_395: test r8d, r8d jle .LBB0_1526 # %bb.396: mov r9d, r8d cmp r8d, 16 jb .LBB0_397 # %bb.713: lea rax, [rdx + 2*r9] cmp rax, rcx jbe .LBB0_912 # %bb.714: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_912 .LBB0_397: xor esi, esi .LBB0_1426: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1428 .LBB0_1427: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 2*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1427 .LBB0_1428: cmp r8, 3 jb .LBB0_1526 .LBB0_1429: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 2*rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + 2*rsi + 2] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + 2*rsi + 4] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + 2*rsi + 6] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1429 jmp .LBB0_1526 .LBB0_398: test r8d, r8d jle .LBB0_1526 # %bb.399: mov r9d, r8d cmp r8d, 16 jb .LBB0_400 # %bb.716: lea rax, [rdx + 2*r9] cmp rax, rcx jbe .LBB0_915 # %bb.717: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_915 .LBB0_400: xor esi, esi .LBB0_1434: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1436 .LBB0_1435: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr 
[rdx + 2*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1435 .LBB0_1436: cmp r8, 3 jb .LBB0_1526 .LBB0_1437: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 2*rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + 2*rsi + 2] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + 2*rsi + 4] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + 2*rsi + 6] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1437 jmp .LBB0_1526 .LBB0_401: test r8d, r8d jle .LBB0_1526 # %bb.402: mov r9d, r8d cmp r8d, 4 jb .LBB0_403 # %bb.719: lea rax, [rdx + 8*r9] cmp rax, rcx jbe .LBB0_918 # %bb.720: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_918 .LBB0_403: xor esi, esi .LBB0_1442: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1444 .LBB0_1443: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 8*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1443 .LBB0_1444: cmp r8, 3 jb .LBB0_1526 .LBB0_1445: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 8*rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + 8*rsi + 8] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + 8*rsi + 16] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + 8*rsi + 24] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1445 jmp .LBB0_1526 .LBB0_404: test r8d, r8d jle .LBB0_1526 # %bb.405: mov r9d, r8d cmp r8d, 8 jb .LBB0_406 # %bb.722: lea rax, [rdx + 4*r9] cmp rax, rcx jbe .LBB0_921 # %bb.723: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_921 .LBB0_406: xor esi, esi .LBB0_1450: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1452 .LBB0_1451: # =>This Inner Loop Header: Depth=1 cvttss2si eax, dword ptr [rdx + 4*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1451 .LBB0_1452: cmp r8, 3 jb .LBB0_1526 .LBB0_1453: # =>This Inner Loop Header: Depth=1 cvttss2si eax, dword ptr [rdx + 4*rsi] mov byte ptr [rcx + rsi], al cvttss2si eax, 
dword ptr [rdx + 4*rsi + 4] mov byte ptr [rcx + rsi + 1], al cvttss2si eax, dword ptr [rdx + 4*rsi + 8] mov byte ptr [rcx + rsi + 2], al cvttss2si eax, dword ptr [rdx + 4*rsi + 12] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1453 jmp .LBB0_1526 .LBB0_407: test r8d, r8d jle .LBB0_1526 # %bb.408: mov r9d, r8d cmp r8d, 32 jb .LBB0_409 # %bb.725: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_924 # %bb.726: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_924 .LBB0_409: xor esi, esi .LBB0_1294: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1296 .LBB0_1295: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1295 .LBB0_1296: cmp r8, 3 jb .LBB0_1526 .LBB0_1297: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + rsi + 1] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + rsi + 2] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + rsi + 3] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1297 jmp .LBB0_1526 .LBB0_410: test r8d, r8d jle .LBB0_1526 # %bb.411: mov r9d, r8d cmp r8d, 8 jb .LBB0_412 # %bb.728: lea rax, [rdx + 4*r9] cmp rax, rcx jbe .LBB0_926 # %bb.729: lea rax, [rcx + r9] cmp rax, rdx jbe .LBB0_926 .LBB0_412: xor esi, esi .LBB0_1458: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1460 .LBB0_1459: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 4*rsi] mov byte ptr [rcx + rsi], al add rsi, 1 add rdi, -1 jne .LBB0_1459 .LBB0_1460: cmp r8, 3 jb .LBB0_1526 .LBB0_1461: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + 4*rsi] mov byte ptr [rcx + rsi], al movzx eax, byte ptr [rdx + 4*rsi + 4] mov byte ptr [rcx + rsi + 1], al movzx eax, byte ptr [rdx + 4*rsi + 8] mov byte ptr [rcx + rsi + 2], al movzx eax, byte ptr [rdx + 4*rsi + 12] mov byte ptr [rcx + rsi + 3], al add rsi, 4 cmp r9, rsi jne .LBB0_1461 jmp .LBB0_1526 
# ---------------------------------------------------------------------------
# Cast cases .LBB0_413, .LBB0_416, .LBB0_419 and the head of .LBB0_422.
# Register roles as used by the code below: rdx = source base address,
# rcx = destination base address, r8d = element count on entry (copied to
# r9 as "n"), rsi = element index.  Each case first branches to .LBB0_1526
# when the count is <= 0.
# NOTE(review): .LBB0_1526 is outside this chunk; presumably the common exit.
#
# .LBB0_413  32-bit -> 32-bit element copy (dword load, dword store).
#            If n >= 8 and the buffers do not overlap
#            (src + 4*n <= dst or dst + 4*n <= src, unsigned compares) control
#            leaves for .LBB0_929 (not visible here; presumably the vector
#            path - confirm).  Otherwise the scalar code runs from rsi = 0:
#            .LBB0_1304 computes r8 = n - rsi - 1 and rdi = n & 3,
#            .LBB0_1305 copies rdi elements one at a time, and, when
#            r8 >= 3, .LBB0_1307 copies four elements per iteration until
#            rsi == n.
# .LBB0_416  Dispatch only: n >= 4 -> .LBB0_734, else rsi = 0 -> .LBB0_1077.
# .LBB0_419  Sign-extending 8-bit -> 32-bit conversion (movsx byte, dword
#            store).  Same shape as .LBB0_413; the overlap test uses
#            src + n and dst + 4*n, and the non-overlapping n >= 8 case goes
#            to .LBB0_931.  Scalar loops: .LBB0_1467 (peel, counter in rax)
#            and .LBB0_1469 (unrolled by four).
# .LBB0_422  Dispatch only: n >= 4 -> .LBB0_740; the n < 4 fall-through
#            follows the trailing "# %bb.424:" marker.
# ---------------------------------------------------------------------------
.LBB0_413: test r8d, r8d jle .LBB0_1526 # %bb.414: mov r9d, r8d cmp r8d, 8 jb .LBB0_415 # %bb.731: lea rax, [rdx + 4*r9] cmp rax, rcx jbe .LBB0_929 # %bb.732: lea rax, [rcx + 4*r9] cmp rax, rdx jbe .LBB0_929 .LBB0_415: xor esi, esi .LBB0_1304: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1306 .LBB0_1305: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rsi] mov dword ptr [rcx + 4*rsi], eax add rsi, 1 add rdi, -1 jne .LBB0_1305 .LBB0_1306: cmp r8, 3 jb .LBB0_1526 .LBB0_1307: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rsi] mov dword ptr [rcx + 4*rsi], eax mov eax, dword ptr [rdx + 4*rsi + 4] mov dword ptr [rcx + 4*rsi + 4], eax mov eax, dword ptr [rdx + 4*rsi + 8] mov dword ptr [rcx + 4*rsi + 8], eax mov eax, dword ptr [rdx + 4*rsi + 12] mov dword ptr [rcx + 4*rsi + 12], eax add rsi, 4 cmp r9, rsi jne .LBB0_1307 jmp .LBB0_1526 .LBB0_416: test r8d, r8d jle .LBB0_1526 # %bb.417: mov r9d, r8d cmp r8d, 4 jae .LBB0_734 # %bb.418: xor esi, esi jmp .LBB0_1077 .LBB0_419: test r8d, r8d jle .LBB0_1526 # %bb.420: mov r9d, r8d cmp r8d, 8 jb .LBB0_421 # %bb.737: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_931 # %bb.738: lea rax, [rcx + 4*r9] cmp rax, rdx jbe .LBB0_931 .LBB0_421: xor esi, esi .LBB0_1466: mov r8, rsi not r8 add r8, r9 mov rax, r9 and rax, 3 je .LBB0_1468 .LBB0_1467: # =>This Inner Loop Header: Depth=1 movsx edi, byte ptr [rdx + rsi] mov dword ptr [rcx + 4*rsi], edi add rsi, 1 add rax, -1 jne .LBB0_1467 .LBB0_1468: cmp r8, 3 jb .LBB0_1526 .LBB0_1469: # =>This Inner Loop Header: Depth=1 movsx eax, byte ptr [rdx + rsi] mov dword ptr [rcx + 4*rsi], eax movsx eax, byte ptr [rdx + rsi + 1] mov dword ptr [rcx + 4*rsi + 4], eax movsx eax, byte ptr [rdx + rsi + 2] mov dword ptr [rcx + 4*rsi + 8], eax movsx eax, byte ptr [rdx + rsi + 3] mov dword ptr [rcx + 4*rsi + 12], eax add rsi, 4 cmp r9, rsi jne .LBB0_1469 jmp .LBB0_1526 .LBB0_422: test r8d, r8d jle .LBB0_1526 # %bb.423: mov r9d, r8d cmp r8d, 4 jae .LBB0_740 # %bb.424: 
# ---------------------------------------------------------------------------
# The first two instructions finish case .LBB0_422 for n < 4: start index
# rsi = 0, then jump to .LBB0_943 (not visible in this chunk).
#
# Register roles as used below: rdx = source base address, rcx = destination
# base address, r8d = element count on entry (copied to r9 as "n"),
# rsi = element index.  Each case branches to .LBB0_1526 when the count
# is <= 0.
# NOTE(review): .LBB0_1526 is outside this chunk; presumably the common exit.
#
# .LBB0_425  Dispatch only: n >= 8 -> .LBB0_743, else rsi = 0 -> .LBB0_1082.
# .LBB0_428  Dispatch only: n >= 8 -> .LBB0_746, else rsi = 0 -> .LBB0_1087.
# .LBB0_431  Dispatch only: n >= 4 -> .LBB0_749, else rsi = 0 -> .LBB0_1092.
# .LBB0_434  Dispatch only: n >= 8 -> .LBB0_752, else rsi = 0 -> .LBB0_1097.
# .LBB0_437  Zero-extending 8-bit -> 32-bit conversion (movzx byte, dword
#            store).  If n >= 8 and the buffers do not overlap
#            (src + n <= dst or dst + 4*n <= src) control leaves for
#            .LBB0_934 (presumably the vector path - confirm).  Otherwise
#            scalar from rsi = 0: .LBB0_1474 computes r8 = n - rsi - 1 and
#            rax = n & 3, .LBB0_1475 converts rax elements one at a time,
#            and, when r8 >= 3, .LBB0_1477 converts four per iteration
#            until rsi == n.
# .LBB0_440  32-bit -> 32-bit element copy with the same structure
#            (overlap test on src + 4*n / dst + 4*n, non-overlapping n >= 8
#            case -> .LBB0_937, peel loop .LBB0_1315, unrolled loop
#            .LBB0_1317).
# ---------------------------------------------------------------------------
xor esi, esi jmp .LBB0_943 .LBB0_425: test r8d, r8d jle .LBB0_1526 # %bb.426: mov r9d, r8d cmp r8d, 8 jae .LBB0_743 # %bb.427: xor esi, esi jmp .LBB0_1082 .LBB0_428: test r8d, r8d jle .LBB0_1526 # %bb.429: mov r9d, r8d cmp r8d, 8 jae .LBB0_746 # %bb.430: xor esi, esi jmp .LBB0_1087 .LBB0_431: test r8d, r8d jle .LBB0_1526 # %bb.432: mov r9d, r8d cmp r8d, 4 jae .LBB0_749 # %bb.433: xor esi, esi jmp .LBB0_1092 .LBB0_434: test r8d, r8d jle .LBB0_1526 # %bb.435: mov r9d, r8d cmp r8d, 8 jae .LBB0_752 # %bb.436: xor esi, esi jmp .LBB0_1097 .LBB0_437: test r8d, r8d jle .LBB0_1526 # %bb.438: mov r9d, r8d cmp r8d, 8 jb .LBB0_439 # %bb.755: lea rax, [rdx + r9] cmp rax, rcx jbe .LBB0_934 # %bb.756: lea rax, [rcx + 4*r9] cmp rax, rdx jbe .LBB0_934 .LBB0_439: xor esi, esi .LBB0_1474: mov r8, rsi not r8 add r8, r9 mov rax, r9 and rax, 3 je .LBB0_1476 .LBB0_1475: # =>This Inner Loop Header: Depth=1 movzx edi, byte ptr [rdx + rsi] mov dword ptr [rcx + 4*rsi], edi add rsi, 1 add rax, -1 jne .LBB0_1475 .LBB0_1476: cmp r8, 3 jb .LBB0_1526 .LBB0_1477: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] mov dword ptr [rcx + 4*rsi], eax movzx eax, byte ptr [rdx + rsi + 1] mov dword ptr [rcx + 4*rsi + 4], eax movzx eax, byte ptr [rdx + rsi + 2] mov dword ptr [rcx + 4*rsi + 8], eax movzx eax, byte ptr [rdx + rsi + 3] mov dword ptr [rcx + 4*rsi + 12], eax add rsi, 4 cmp r9, rsi jne .LBB0_1477 jmp .LBB0_1526 .LBB0_440: test r8d, r8d jle .LBB0_1526 # %bb.441: mov r9d, r8d cmp r8d, 8 jb .LBB0_442 # %bb.758: lea rax, [rdx + 4*r9] cmp rax, rcx jbe .LBB0_937 # %bb.759: lea rax, [rcx + 4*r9] cmp rax, rdx jbe .LBB0_937 .LBB0_442: xor esi, esi .LBB0_1314: mov r8, rsi not r8 add r8, r9 mov rdi, r9 and rdi, 3 je .LBB0_1316 .LBB0_1315: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rsi] mov dword ptr [rcx + 4*rsi], eax add rsi, 1 add rdi, -1 jne .LBB0_1315 .LBB0_1316: cmp r8, 3 jb .LBB0_1526 .LBB0_1317: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 
4*rsi] mov dword ptr [rcx + 4*rsi], eax mov eax, dword ptr [rdx + 4*rsi + 4] mov dword ptr [rcx + 4*rsi + 4], eax mov eax, dword ptr [rdx + 4*rsi + 8] mov dword ptr [rcx + 4*rsi + 8], eax mov eax, dword ptr [rdx + 4*rsi + 12] mov dword ptr [rcx + 4*rsi + 12], eax add rsi, 4 cmp r9, rsi jne .LBB0_1317 jmp .LBB0_1526 .LBB0_446: and esi, -4 xor edi, edi .LBB0_447: # =>This Inner Loop Header: Depth=1 cvttsd2si rax, qword ptr [rdx + 8*rdi] mov dword ptr [rcx + 4*rdi], eax cvttsd2si rax, qword ptr [rdx + 8*rdi + 8] mov dword ptr [rcx + 4*rdi + 4], eax cvttsd2si rax, qword ptr [rdx + 8*rdi + 16] mov dword ptr [rcx + 4*rdi + 8], eax cvttsd2si rax, qword ptr [rdx + 8*rdi + 24] mov dword ptr [rcx + 4*rdi + 12], eax add rdi, 4 cmp rsi, rdi jne .LBB0_447 .LBB0_448: test r8, r8 je .LBB0_1526 # %bb.449: lea rcx, [rcx + 4*rdi] lea rdx, [rdx + 8*rdi] xor esi, esi .LBB0_450: # =>This Inner Loop Header: Depth=1 cvttsd2si rax, qword ptr [rdx + 8*rsi] mov dword ptr [rcx + 4*rsi], eax add rsi, 1 cmp r8, rsi jne .LBB0_450 jmp .LBB0_1526 .LBB0_454: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_944 # %bb.455: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_456: # =>This Inner Loop Header: Depth=1 movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movdqu xmmword ptr [rcx + 4*rdi + 16], xmm0 add rdi, 8 add rax, 2 jne .LBB0_456 jmp .LBB0_945 .LBB0_457: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_949 # %bb.458: mov rax, r8 and rax, -2 neg rax xor edi, edi 
# ---------------------------------------------------------------------------
# Vector loops.  rdx = source base address, rcx = destination base address,
# rdi = element index, rax = negative iteration counter that steps by 2
# toward zero (each iteration handles two vector groups).
# NOTE(review): r9d is read as the element count here; it is set by the
# dispatch code that jumps to these labels - confirm at the jump sites.
#
# .LBB0_459  Loop body for the setup at .LBB0_457: zero-extends 16-bit
#            elements to 32-bit with pmovzxwd.  Four 4-element loads and
#            four 16-byte stores per iteration, i.e. 16 elements
#            (rdi += 16).  Leaves for .LBB0_950 when rax reaches zero
#            (not visible here; presumably handles the leftover group and
#            scalar tail - confirm).
# .LBB0_460  Setup for the sign-extending 16-bit -> 32-bit loop:
#            rsi = n & -8; r8 = (rsi - 8) / 8 + 1 = number of 8-element
#            groups; when rsi == 8 (a single group) jump to .LBB0_954;
#            otherwise rax = -(r8 & -2) and rdi = 0.
# .LBB0_462  Same loop shape as .LBB0_459 using pmovsxwd; exits to .LBB0_955.
# .LBB0_463  Setup for the 64-bit -> 32-bit narrowing loop:
#            rsi = n & -4; r8 = number of 4-element groups; a single group
#            jumps to .LBB0_959; otherwise rax = -(r8 & -2) and rdi = 0.
# .LBB0_465  Loads two 16-byte vectors (four qwords), pshufd 232 gathers the
#            low dword of each qword into lanes 0-1, punpcklqdq joins the two
#            halves, and the four resulting dwords are stored.  Done twice
#            per iteration (8 elements).
# ---------------------------------------------------------------------------
.LBB0_459: # =>This Inner Loop Header: Depth=1 pmovzxwd xmm0, qword ptr [rdx + 2*rdi] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero pmovzxwd xmm1, qword ptr [rdx + 2*rdi + 8] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 pmovzxwd xmm0, qword ptr [rdx + 2*rdi + 16] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero pmovzxwd xmm1, qword ptr [rdx + 2*rdi + 24] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero movdqu xmmword ptr [rcx + 4*rdi + 32], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_459 jmp .LBB0_950 .LBB0_460: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_954 # %bb.461: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_462: # =>This Inner Loop Header: Depth=1 pmovsxwd xmm0, qword ptr [rdx + 2*rdi] pmovsxwd xmm1, qword ptr [rdx + 2*rdi + 8] movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 pmovsxwd xmm0, qword ptr [rdx + 2*rdi + 16] pmovsxwd xmm1, qword ptr [rdx + 2*rdi + 24] movdqu xmmword ptr [rcx + 4*rdi + 32], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_462 jmp .LBB0_955 .LBB0_463: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_959 # %bb.464: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_465: # =>This Inner Loop Header: Depth=1 movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movdqu xmmword ptr 
[rcx + 4*rdi + 16], xmm0 add rdi, 8 add rax, 2 jne .LBB0_465 jmp .LBB0_960 .LBB0_466: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_964 # %bb.467: mov rax, r8 and rax, -2 neg rax xor edi, edi movaps xmm1, xmmword ptr [rip + .LCPI0_3] # xmm1 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9] movaps xmm2, xmmword ptr [rip + .LCPI0_4] # xmm2 = [2147483648,2147483648,2147483648,2147483648] .LBB0_468: # =>This Inner Loop Header: Depth=1 movups xmm3, xmmword ptr [rdx + 4*rdi] movups xmm4, xmmword ptr [rdx + 4*rdi + 16] movaps xmm0, xmm3 cmpltps xmm0, xmm1 cvttps2dq xmm5, xmm3 subps xmm3, xmm1 cvttps2dq xmm3, xmm3 xorps xmm3, xmm2 blendvps xmm3, xmm5, xmm0 movaps xmm0, xmm4 cmpltps xmm0, xmm1 cvttps2dq xmm5, xmm4 subps xmm4, xmm1 cvttps2dq xmm4, xmm4 xorps xmm4, xmm2 blendvps xmm4, xmm5, xmm0 movups xmmword ptr [rcx + 4*rdi], xmm3 movups xmmword ptr [rcx + 4*rdi + 16], xmm4 movups xmm3, xmmword ptr [rdx + 4*rdi + 32] movaps xmm0, xmm3 cmpltps xmm0, xmm1 cvttps2dq xmm4, xmm3 subps xmm3, xmm1 cvttps2dq xmm3, xmm3 xorps xmm3, xmm2 blendvps xmm3, xmm4, xmm0 movups xmm4, xmmword ptr [rdx + 4*rdi + 48] movaps xmm0, xmm4 cmpltps xmm0, xmm1 cvttps2dq xmm5, xmm4 subps xmm4, xmm1 cvttps2dq xmm4, xmm4 xorps xmm4, xmm2 blendvps xmm4, xmm5, xmm0 movups xmmword ptr [rcx + 4*rdi + 32], xmm3 movups xmmword ptr [rcx + 4*rdi + 48], xmm4 add rdi, 16 add rax, 2 jne .LBB0_468 jmp .LBB0_965 .LBB0_475: and esi, -4 xor edi, edi .LBB0_476: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rdi] xorps xmm0, xmm0 cvtsi2sd xmm0, rax movsd qword ptr [rcx + 8*rdi], xmm0 mov eax, dword ptr [rdx + 4*rdi + 4] xorps xmm0, xmm0 cvtsi2sd xmm0, rax movsd qword ptr [rcx + 8*rdi + 8], xmm0 mov eax, dword ptr [rdx + 4*rdi + 8] xorps xmm0, xmm0 cvtsi2sd xmm0, rax movsd qword ptr [rcx + 8*rdi + 16], xmm0 mov eax, dword ptr [rdx + 4*rdi + 12] xorps xmm0, xmm0 cvtsi2sd xmm0, rax movsd qword ptr [rcx + 8*rdi + 24], xmm0 add rdi, 4 cmp rsi, 
rdi jne .LBB0_476 .LBB0_477: test r8, r8 je .LBB0_1526 # %bb.478: lea rcx, [rcx + 8*rdi] lea rdx, [rdx + 4*rdi] xor esi, esi .LBB0_479: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rsi] xorps xmm0, xmm0 cvtsi2sd xmm0, rax movsd qword ptr [rcx + 8*rsi], xmm0 add rsi, 1 cmp r8, rsi jne .LBB0_479 jmp .LBB0_1526 .LBB0_483: and esi, -4 xor edi, edi .LBB0_484: # =>This Inner Loop Header: Depth=1 movsx eax, byte ptr [rdx + rdi] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi], xmm0 movsx eax, byte ptr [rdx + rdi + 1] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi + 8], xmm0 movsx eax, byte ptr [rdx + rdi + 2] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi + 16], xmm0 movsx eax, byte ptr [rdx + rdi + 3] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi + 24], xmm0 add rdi, 4 cmp rsi, rdi jne .LBB0_484 .LBB0_485: test r8, r8 je .LBB0_1526 # %bb.486: lea rcx, [rcx + 8*rdi] add rdx, rdi xor esi, esi .LBB0_487: # =>This Inner Loop Header: Depth=1 movsx eax, byte ptr [rdx + rsi] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rsi], xmm0 add rsi, 1 cmp r8, rsi jne .LBB0_487 jmp .LBB0_1526 .LBB0_488: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_969 # %bb.489: mov rax, r8 and rax, -2 neg rax xor edi, edi pxor xmm0, xmm0 movdqa xmm1, xmmword ptr [rip + .LCPI0_6] # xmm1 = [4841369599423283200,4841369599423283200] movdqa xmm2, xmmword ptr [rip + .LCPI0_7] # xmm2 = [4985484787499139072,4985484787499139072] movapd xmm3, xmmword ptr [rip + .LCPI0_8] # xmm3 = [1.9342813118337666E+25,1.9342813118337666E+25] .LBB0_490: # =>This Inner Loop Header: Depth=1 movdqu xmm4, xmmword ptr [rdx + 8*rdi] movdqu xmm5, xmmword ptr [rdx + 8*rdi + 16] movdqa xmm6, xmm4 pblendw xmm6, xmm0, 204 # xmm6 = xmm6[0,1],xmm0[2,3],xmm6[4,5],xmm0[6,7] por xmm6, xmm1 psrlq xmm4, 32 por xmm4, xmm2 subpd xmm4, xmm3 addpd xmm4, xmm6 movdqa xmm6, xmm5 pblendw xmm6, 
xmm0, 204 # xmm6 = xmm6[0,1],xmm0[2,3],xmm6[4,5],xmm0[6,7] por xmm6, xmm1 psrlq xmm5, 32 por xmm5, xmm2 subpd xmm5, xmm3 addpd xmm5, xmm6 movupd xmmword ptr [rcx + 8*rdi], xmm4 movupd xmmword ptr [rcx + 8*rdi + 16], xmm5 movdqu xmm4, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm5, xmmword ptr [rdx + 8*rdi + 48] movdqa xmm6, xmm4 pblendw xmm6, xmm0, 204 # xmm6 = xmm6[0,1],xmm0[2,3],xmm6[4,5],xmm0[6,7] por xmm6, xmm1 psrlq xmm4, 32 por xmm4, xmm2 subpd xmm4, xmm3 addpd xmm4, xmm6 movdqa xmm6, xmm5 pblendw xmm6, xmm0, 204 # xmm6 = xmm6[0,1],xmm0[2,3],xmm6[4,5],xmm0[6,7] por xmm6, xmm1 psrlq xmm5, 32 por xmm5, xmm2 subpd xmm5, xmm3 addpd xmm5, xmm6 movupd xmmword ptr [rcx + 8*rdi + 32], xmm4 movupd xmmword ptr [rcx + 8*rdi + 48], xmm5 add rdi, 8 add rax, 2 jne .LBB0_490 jmp .LBB0_970 .LBB0_491: and esi, -4 xor edi, edi .LBB0_492: # =>This Inner Loop Header: Depth=1 movzx eax, word ptr [rdx + 2*rdi] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi], xmm0 movzx eax, word ptr [rdx + 2*rdi + 2] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi + 8], xmm0 movzx eax, word ptr [rdx + 2*rdi + 4] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi + 16], xmm0 movzx eax, word ptr [rdx + 2*rdi + 6] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi + 24], xmm0 add rdi, 4 cmp rsi, rdi jne .LBB0_492 .LBB0_493: test r8, r8 je .LBB0_1526 # %bb.494: lea rcx, [rcx + 8*rdi] lea rdx, [rdx + 2*rdi] xor esi, esi .LBB0_495: # =>This Inner Loop Header: Depth=1 movzx eax, word ptr [rdx + 2*rsi] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rsi], xmm0 add rsi, 1 cmp r8, rsi jne .LBB0_495 jmp .LBB0_1526 .LBB0_496: and esi, -4 xor edi, edi .LBB0_497: # =>This Inner Loop Header: Depth=1 movsx eax, word ptr [rdx + 2*rdi] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi], xmm0 movsx eax, word ptr [rdx + 2*rdi + 2] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi + 8], xmm0 movsx eax, word ptr [rdx + 
2*rdi + 4] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi + 16], xmm0 movsx eax, word ptr [rdx + 2*rdi + 6] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi + 24], xmm0 add rdi, 4 cmp rsi, rdi jne .LBB0_497 .LBB0_498: test r8, r8 je .LBB0_1526 # %bb.499: lea rcx, [rcx + 8*rdi] lea rdx, [rdx + 2*rdi] xor esi, esi .LBB0_500: # =>This Inner Loop Header: Depth=1 movsx eax, word ptr [rdx + 2*rsi] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rsi], xmm0 add rsi, 1 cmp r8, rsi jne .LBB0_500 jmp .LBB0_1526 .LBB0_501: and esi, -4 xor edi, edi .LBB0_502: # =>This Inner Loop Header: Depth=1 xorps xmm0, xmm0 cvtsi2sd xmm0, qword ptr [rdx + 8*rdi] movsd qword ptr [rcx + 8*rdi], xmm0 xorps xmm0, xmm0 cvtsi2sd xmm0, qword ptr [rdx + 8*rdi + 8] movsd qword ptr [rcx + 8*rdi + 8], xmm0 xorps xmm0, xmm0 cvtsi2sd xmm0, qword ptr [rdx + 8*rdi + 16] movsd qword ptr [rcx + 8*rdi + 16], xmm0 xorps xmm0, xmm0 cvtsi2sd xmm0, qword ptr [rdx + 8*rdi + 24] movsd qword ptr [rcx + 8*rdi + 24], xmm0 add rdi, 4 cmp rsi, rdi jne .LBB0_502 .LBB0_503: test rax, rax je .LBB0_1526 # %bb.504: lea rcx, [rcx + 8*rdi] lea rdx, [rdx + 8*rdi] xor esi, esi .LBB0_505: # =>This Inner Loop Header: Depth=1 xorps xmm0, xmm0 cvtsi2sd xmm0, qword ptr [rdx + 8*rsi] movsd qword ptr [rcx + 8*rsi], xmm0 add rsi, 1 cmp rax, rsi jne .LBB0_505 jmp .LBB0_1526 .LBB0_506: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_975 # %bb.507: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_508: # =>This Inner Loop Header: Depth=1 cvtps2pd xmm0, qword ptr [rdx + 4*rdi] cvtps2pd xmm1, qword ptr [rdx + 4*rdi + 8] movups xmmword ptr [rcx + 8*rdi], xmm0 movups xmmword ptr [rcx + 8*rdi + 16], xmm1 cvtps2pd xmm0, qword ptr [rdx + 4*rdi + 16] cvtps2pd xmm1, qword ptr [rdx + 4*rdi + 24] movupd xmmword ptr [rcx + 8*rdi + 32], xmm0 movupd xmmword ptr [rcx + 8*rdi + 48], xmm1 add rdi, 8 add rax, 2 jne .LBB0_508 jmp .LBB0_976 .LBB0_509: and esi, 
-4 xor edi, edi .LBB0_510: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rdi] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi], xmm0 movzx eax, byte ptr [rdx + rdi + 1] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi + 8], xmm0 movzx eax, byte ptr [rdx + rdi + 2] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi + 16], xmm0 movzx eax, byte ptr [rdx + rdi + 3] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rdi + 24], xmm0 add rdi, 4 cmp rsi, rdi jne .LBB0_510 .LBB0_511: test r8, r8 je .LBB0_1526 # %bb.512: lea rcx, [rcx + 8*rdi] add rdx, rdi xor esi, esi .LBB0_513: # =>This Inner Loop Header: Depth=1 movzx eax, byte ptr [rdx + rsi] xorps xmm0, xmm0 cvtsi2sd xmm0, eax movsd qword ptr [rcx + 8*rsi], xmm0 add rsi, 1 cmp r8, rsi jne .LBB0_513 jmp .LBB0_1526 .LBB0_514: and esi, -4 xor edi, edi .LBB0_515: # =>This Inner Loop Header: Depth=1 xorps xmm0, xmm0 cvtsi2sd xmm0, dword ptr [rdx + 4*rdi] movsd qword ptr [rcx + 8*rdi], xmm0 xorps xmm0, xmm0 cvtsi2sd xmm0, dword ptr [rdx + 4*rdi + 4] movsd qword ptr [rcx + 8*rdi + 8], xmm0 xorps xmm0, xmm0 cvtsi2sd xmm0, dword ptr [rdx + 4*rdi + 8] movsd qword ptr [rcx + 8*rdi + 16], xmm0 xorps xmm0, xmm0 cvtsi2sd xmm0, dword ptr [rdx + 4*rdi + 12] movsd qword ptr [rcx + 8*rdi + 24], xmm0 add rdi, 4 cmp rsi, rdi jne .LBB0_515 .LBB0_516: test rax, rax je .LBB0_1526 # %bb.517: lea rcx, [rcx + 8*rdi] lea rdx, [rdx + 4*rdi] xor esi, esi .LBB0_518: # =>This Inner Loop Header: Depth=1 xorps xmm0, xmm0 cvtsi2sd xmm0, dword ptr [rdx + 4*rsi] movsd qword ptr [rcx + 8*rsi], xmm0 add rsi, 1 cmp rax, rsi jne .LBB0_518 jmp .LBB0_1526 .LBB0_549: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_801 # %bb.550: xor eax, eax jmp .LBB0_803 .LBB0_551: and r9d, -4 xor eax, eax movsd xmm0, qword ptr [rip + .LCPI0_0] # xmm0 = mem[0],zero .LBB0_552: # =>This Inner Loop Header: Depth=1 movsd xmm1, qword 
ptr [rdx + 8*rax] # xmm1 = mem[0],zero movapd xmm2, xmm1 subsd xmm2, xmm0 cvttsd2si rdi, xmm2 xor rdi, r10 cvttsd2si rsi, xmm1 ucomisd xmm0, xmm1 cmovbe rsi, rdi mov qword ptr [rcx + 8*rax], rsi movsd xmm1, qword ptr [rdx + 8*rax + 8] # xmm1 = mem[0],zero movapd xmm2, xmm1 subsd xmm2, xmm0 cvttsd2si rsi, xmm2 xor rsi, r10 cvttsd2si rdi, xmm1 ucomisd xmm0, xmm1 cmovbe rdi, rsi mov qword ptr [rcx + 8*rax + 8], rdi movsd xmm1, qword ptr [rdx + 8*rax + 16] # xmm1 = mem[0],zero movapd xmm2, xmm1 subsd xmm2, xmm0 cvttsd2si rsi, xmm2 xor rsi, r10 cvttsd2si rdi, xmm1 ucomisd xmm0, xmm1 cmovbe rdi, rsi mov qword ptr [rcx + 8*rax + 16], rdi movsd xmm1, qword ptr [rdx + 8*rax + 24] # xmm1 = mem[0],zero movapd xmm2, xmm1 subsd xmm2, xmm0 cvttsd2si rsi, xmm2 xor rsi, r10 cvttsd2si rdi, xmm1 ucomisd xmm0, xmm1 cmovbe rdi, rsi mov qword ptr [rcx + 8*rax + 24], rdi add rax, 4 cmp r9, rax jne .LBB0_552 .LBB0_553: test r8, r8 je .LBB0_1526 # %bb.554: lea rcx, [rcx + 8*rax] lea rax, [rdx + 8*rax] xor edx, edx movsd xmm0, qword ptr [rip + .LCPI0_0] # xmm0 = mem[0],zero .LBB0_555: # =>This Inner Loop Header: Depth=1 movsd xmm1, qword ptr [rax + 8*rdx] # xmm1 = mem[0],zero movapd xmm2, xmm1 subsd xmm2, xmm0 cvttsd2si rsi, xmm2 xor rsi, r10 cvttsd2si rdi, xmm1 ucomisd xmm0, xmm1 cmovbe rdi, rsi mov qword ptr [rcx + 8*rdx], rdi add rdx, 1 cmp r8, rdx jne .LBB0_555 jmp .LBB0_1526 .LBB0_562: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_812 # %bb.563: xor eax, eax jmp .LBB0_814 .LBB0_564: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_819 # %bb.565: xor eax, eax jmp .LBB0_821 .LBB0_569: and r9d, -4 xor edi, edi movss xmm0, dword ptr [rip + .LCPI0_2] # xmm0 = mem[0],zero,zero,zero movabs r10, -9223372036854775808 .LBB0_570: # =>This Inner Loop Header: Depth=1 movss xmm1, dword ptr [rdx + 4*rdi] # xmm1 = mem[0],zero,zero,zero movaps xmm2, 
xmm1 subss xmm2, xmm0 cvttss2si rsi, xmm2 xor rsi, r10 cvttss2si rax, xmm1 ucomiss xmm0, xmm1 cmovbe rax, rsi mov qword ptr [rcx + 8*rdi], rax movss xmm1, dword ptr [rdx + 4*rdi + 4] # xmm1 = mem[0],zero,zero,zero movaps xmm2, xmm1 subss xmm2, xmm0 cvttss2si rax, xmm2 xor rax, r10 cvttss2si rsi, xmm1 ucomiss xmm0, xmm1 cmovbe rsi, rax mov qword ptr [rcx + 8*rdi + 8], rsi movss xmm1, dword ptr [rdx + 4*rdi + 8] # xmm1 = mem[0],zero,zero,zero movaps xmm2, xmm1 subss xmm2, xmm0 cvttss2si rax, xmm2 xor rax, r10 cvttss2si rsi, xmm1 ucomiss xmm0, xmm1 cmovbe rsi, rax mov qword ptr [rcx + 8*rdi + 16], rsi movss xmm1, dword ptr [rdx + 4*rdi + 12] # xmm1 = mem[0],zero,zero,zero movaps xmm2, xmm1 subss xmm2, xmm0 cvttss2si rax, xmm2 xor rax, r10 cvttss2si rsi, xmm1 ucomiss xmm0, xmm1 cmovbe rsi, rax mov qword ptr [rcx + 8*rdi + 24], rsi add rdi, 4 cmp r9, rdi jne .LBB0_570 .LBB0_571: test r8, r8 je .LBB0_1526 # %bb.572: lea rax, [rcx + 8*rdi] lea rcx, [rdx + 4*rdi] xor edx, edx movss xmm0, dword ptr [rip + .LCPI0_2] # xmm0 = mem[0],zero,zero,zero movabs r9, -9223372036854775808 .LBB0_573: # =>This Inner Loop Header: Depth=1 movss xmm1, dword ptr [rcx + 4*rdx] # xmm1 = mem[0],zero,zero,zero movaps xmm2, xmm1 subss xmm2, xmm0 cvttss2si rdi, xmm2 xor rdi, r9 cvttss2si rsi, xmm1 ucomiss xmm0, xmm1 cmovbe rsi, rdi mov qword ptr [rax + 8*rdx], rsi add rdx, 1 cmp r8, rdx jne .LBB0_573 jmp .LBB0_1526 .LBB0_577: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_830 # %bb.578: xor eax, eax jmp .LBB0_832 .LBB0_579: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_980 # %bb.580: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_12] # xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] .LBB0_581: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 4*rdi] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] pshufb xmm1, xmm0 
pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + 2*rdi], xmm1 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 add rdi, 16 add rax, 2 jne .LBB0_581 jmp .LBB0_981 .LBB0_582: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_985 # %bb.583: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_12] # xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] .LBB0_584: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 4*rdi] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + 2*rdi], xmm1 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 add rdi, 16 add rax, 2 jne .LBB0_584 jmp .LBB0_986 .LBB0_585: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_990 # %bb.586: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_587: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + 8*rdi] movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] cvttpd2dq xmm0, xmm0 cvttpd2dq xmm1, xmm1 pshuflw xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3,4,5,6,7] pshuflw xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3,4,5,6,7] movd dword ptr [rcx + 2*rdi], xmm0 movd dword ptr [rcx + 2*rdi + 4], xmm1 movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] movupd xmm1, xmmword ptr [rdx + 8*rdi + 48] cvttpd2dq xmm0, xmm0 pshuflw xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3,4,5,6,7] cvttpd2dq xmm1, xmm1 pshuflw xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3,4,5,6,7] movd dword ptr [rcx + 2*rdi + 8], xmm0 movd dword ptr [rcx + 2*rdi + 12], xmm1 
add rdi, 8 add rax, 2 jne .LBB0_587 jmp .LBB0_991 .LBB0_588: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_995 # %bb.589: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_590: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + 8*rdi] movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] cvttpd2dq xmm0, xmm0 cvttpd2dq xmm1, xmm1 pshuflw xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3,4,5,6,7] pshuflw xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3,4,5,6,7] movd dword ptr [rcx + 2*rdi], xmm0 movd dword ptr [rcx + 2*rdi + 4], xmm1 movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] movupd xmm1, xmmword ptr [rdx + 8*rdi + 48] cvttpd2dq xmm0, xmm0 pshuflw xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3,4,5,6,7] cvttpd2dq xmm1, xmm1 pshuflw xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3,4,5,6,7] movd dword ptr [rcx + 2*rdi + 8], xmm0 movd dword ptr [rcx + 2*rdi + 12], xmm1 add rdi, 8 add rax, 2 jne .LBB0_590 jmp .LBB0_996 .LBB0_597: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1000 # %bb.598: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_599: # =>This Inner Loop Header: Depth=1 movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshuflw xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3,4,5,6,7] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] pshuflw xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3,4,5,6,7] movd dword ptr [rcx + 2*rdi], xmm0 movd dword ptr [rcx + 2*rdi + 4], xmm1 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshuflw xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3,4,5,6,7] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] pshuflw xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3,4,5,6,7] movd dword ptr [rcx + 2*rdi + 8], xmm0 movd dword ptr [rcx + 2*rdi + 12], xmm1 add rdi, 8 add rax, 2 jne .LBB0_599 jmp .LBB0_1001 .LBB0_600: mov esi, r9d and esi, -4 lea rax, [rsi - 4] 
mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1005 # %bb.601: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_602: # =>This Inner Loop Header: Depth=1 movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshuflw xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3,4,5,6,7] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] pshuflw xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3,4,5,6,7] movd dword ptr [rcx + 2*rdi], xmm0 movd dword ptr [rcx + 2*rdi + 4], xmm1 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshuflw xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3,4,5,6,7] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] pshuflw xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3,4,5,6,7] movd dword ptr [rcx + 2*rdi + 8], xmm0 movd dword ptr [rcx + 2*rdi + 12], xmm1 add rdi, 8 add rax, 2 jne .LBB0_602 jmp .LBB0_1006 .LBB0_615: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1010 # %bb.616: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_617: # =>This Inner Loop Header: Depth=1 movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshuflw xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3,4,5,6,7] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] pshuflw xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3,4,5,6,7] movd dword ptr [rcx + 2*rdi], xmm0 movd dword ptr [rcx + 2*rdi + 4], xmm1 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshuflw xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3,4,5,6,7] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] pshuflw xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3,4,5,6,7] movd dword ptr [rcx + 2*rdi + 8], xmm0 movd dword ptr [rcx + 2*rdi + 12], xmm1 add rdi, 8 add rax, 2 jne .LBB0_617 jmp .LBB0_1011 .LBB0_618: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov 
r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1015 # %bb.619: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_620: # =>This Inner Loop Header: Depth=1 movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshuflw xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3,4,5,6,7] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] pshuflw xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3,4,5,6,7] movd dword ptr [rcx + 2*rdi], xmm0 movd dword ptr [rcx + 2*rdi + 4], xmm1 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshuflw xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3,4,5,6,7] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] pshuflw xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3,4,5,6,7] movd dword ptr [rcx + 2*rdi + 8], xmm0 movd dword ptr [rcx + 2*rdi + 12], xmm1 add rdi, 8 add rax, 2 jne .LBB0_620 jmp .LBB0_1016 .LBB0_621: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1020 # %bb.622: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_623: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 4*rdi] movups xmm1, xmmword ptr [rdx + 4*rdi + 16] cvttps2dq xmm0, xmm0 cvttps2dq xmm1, xmm1 packusdw xmm0, xmm1 movdqu xmmword ptr [rcx + 2*rdi], xmm0 movups xmm0, xmmword ptr [rdx + 4*rdi + 32] movups xmm1, xmmword ptr [rdx + 4*rdi + 48] cvttps2dq xmm0, xmm0 cvttps2dq xmm1, xmm1 packusdw xmm0, xmm1 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm0 add rdi, 16 add rax, 2 jne .LBB0_623 jmp .LBB0_1021 .LBB0_624: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1025 # %bb.625: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_626: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 4*rdi] movups xmm1, xmmword ptr [rdx + 4*rdi + 16] cvttps2dq xmm0, xmm0 cvttps2dq xmm1, xmm1 packssdw xmm0, xmm1 movdqu xmmword ptr [rcx + 2*rdi], xmm0 movups 
xmm0, xmmword ptr [rdx + 4*rdi + 32] movups xmm1, xmmword ptr [rdx + 4*rdi + 48] cvttps2dq xmm0, xmm0 cvttps2dq xmm1, xmm1 packssdw xmm0, xmm1 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm0 add rdi, 16 add rax, 2 jne .LBB0_626 jmp .LBB0_1026 .LBB0_633: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1030 # %bb.634: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_12] # xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] .LBB0_635: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 4*rdi] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + 2*rdi], xmm1 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 add rdi, 16 add rax, 2 jne .LBB0_635 jmp .LBB0_1031 .LBB0_636: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1035 # %bb.637: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_12] # xmm0 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] .LBB0_638: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 4*rdi] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + 2*rdi], xmm1 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 add rdi, 16 add rax, 2 jne .LBB0_638 jmp .LBB0_1036 .LBB0_639: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_857 # %bb.640: xor eax, eax jmp .LBB0_859 .LBB0_641: mov 
esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1040 # %bb.642: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_14] # xmm0 = [1258291200,1258291200,1258291200,1258291200] movdqa xmm1, xmmword ptr [rip + .LCPI0_15] # xmm1 = [1392508928,1392508928,1392508928,1392508928] movaps xmm2, xmmword ptr [rip + .LCPI0_16] # xmm2 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11] .LBB0_643: # =>This Inner Loop Header: Depth=1 movdqu xmm3, xmmword ptr [rdx + 4*rdi] movdqu xmm4, xmmword ptr [rdx + 4*rdi + 16] movdqa xmm5, xmm3 pblendw xmm5, xmm0, 170 # xmm5 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] psrld xmm3, 16 pblendw xmm3, xmm1, 170 # xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] subps xmm3, xmm2 addps xmm3, xmm5 movdqa xmm5, xmm4 pblendw xmm5, xmm0, 170 # xmm5 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] psrld xmm4, 16 pblendw xmm4, xmm1, 170 # xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] subps xmm4, xmm2 addps xmm4, xmm5 movups xmmword ptr [rcx + 4*rdi], xmm3 movups xmmword ptr [rcx + 4*rdi + 16], xmm4 movdqu xmm3, xmmword ptr [rdx + 4*rdi + 32] movdqu xmm4, xmmword ptr [rdx + 4*rdi + 48] movdqa xmm5, xmm3 pblendw xmm5, xmm0, 170 # xmm5 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] psrld xmm3, 16 pblendw xmm3, xmm1, 170 # xmm3 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4],xmm1[5],xmm3[6],xmm1[7] subps xmm3, xmm2 addps xmm3, xmm5 movdqa xmm5, xmm4 pblendw xmm5, xmm0, 170 # xmm5 = xmm5[0],xmm0[1],xmm5[2],xmm0[3],xmm5[4],xmm0[5],xmm5[6],xmm0[7] psrld xmm4, 16 pblendw xmm4, xmm1, 170 # xmm4 = xmm4[0],xmm1[1],xmm4[2],xmm1[3],xmm4[4],xmm1[5],xmm4[6],xmm1[7] subps xmm4, xmm2 addps xmm4, xmm5 movups xmmword ptr [rcx + 4*rdi + 32], xmm3 movups xmmword ptr [rcx + 4*rdi + 48], xmm4 add rdi, 16 add rax, 2 jne .LBB0_643 jmp .LBB0_1041 .LBB0_644: and esi, -4 xor edi, edi .LBB0_645: # 
=>This Inner Loop Header: Depth=1 cvttsd2si rax, qword ptr [rdx + 8*rdi] mov qword ptr [rcx + 8*rdi], rax cvttsd2si rax, qword ptr [rdx + 8*rdi + 8] mov qword ptr [rcx + 8*rdi + 8], rax cvttsd2si rax, qword ptr [rdx + 8*rdi + 16] mov qword ptr [rcx + 8*rdi + 16], rax cvttsd2si rax, qword ptr [rdx + 8*rdi + 24] mov qword ptr [rcx + 8*rdi + 24], rax add rdi, 4 cmp rsi, rdi jne .LBB0_645 .LBB0_646: test r8, r8 je .LBB0_1526 # %bb.647: lea rcx, [rcx + 8*rdi] lea rdx, [rdx + 8*rdi] xor esi, esi .LBB0_648: # =>This Inner Loop Header: Depth=1 cvttsd2si rax, qword ptr [rdx + 8*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 cmp r8, rsi jne .LBB0_648 jmp .LBB0_1526 .LBB0_649: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1045 # %bb.650: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_651: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + 8*rdi] movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] cvtpd2ps xmm0, xmm0 cvtpd2ps xmm1, xmm1 unpcklpd xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movupd xmmword ptr [rcx + 4*rdi], xmm0 movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] movupd xmm1, xmmword ptr [rdx + 8*rdi + 48] cvtpd2ps xmm0, xmm0 cvtpd2ps xmm1, xmm1 unpcklpd xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movupd xmmword ptr [rcx + 4*rdi + 16], xmm0 add rdi, 8 add rax, 2 jne .LBB0_651 jmp .LBB0_1046 .LBB0_661: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1050 # %bb.662: mov r10, r8 and r10, -2 neg r10 xor edi, edi movdqa xmm2, xmmword ptr [rip + .LCPI0_11] # xmm2 = [1,1] .LBB0_663: # =>This Inner Loop Header: Depth=1 movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqa xmm1, xmm0 pand xmm1, xmm2 movdqa xmm3, xmm0 psrlq xmm3, 1 por xmm3, xmm1 pxor xmm4, xmm4 pcmpgtq xmm4, xmm0 blendvpd xmm0, xmm3, xmm0 pextrq rax, xmm0, 1 xorps xmm5, xmm5 cvtsi2ss xmm5, rax movq rax, xmm0 xorps xmm3, xmm3 cvtsi2ss xmm3, rax movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] insertps xmm3, xmm5, 28 # xmm3 = 
xmm3[0],xmm5[0],zero,zero movaps xmm5, xmm3 addps xmm5, xmm3 pshufd xmm0, xmm4, 237 # xmm0 = xmm4[1,3,2,3] blendvps xmm3, xmm5, xmm0 movdqa xmm0, xmm1 pand xmm0, xmm2 movdqa xmm4, xmm1 psrlq xmm4, 1 por xmm4, xmm0 xorps xmm5, xmm5 pcmpgtq xmm5, xmm1 movdqa xmm0, xmm1 blendvpd xmm1, xmm4, xmm0 pextrq rax, xmm1, 1 xorps xmm0, xmm0 cvtsi2ss xmm0, rax movq rax, xmm1 xorps xmm1, xmm1 cvtsi2ss xmm1, rax insertps xmm1, xmm0, 28 # xmm1 = xmm1[0],xmm0[0],zero,zero movaps xmm4, xmm1 addps xmm4, xmm1 pshufd xmm0, xmm5, 237 # xmm0 = xmm5[1,3,2,3] blendvps xmm1, xmm4, xmm0 movlhps xmm3, xmm1 # xmm3 = xmm3[0],xmm1[0] movups xmmword ptr [rcx + 4*rdi], xmm3 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] movdqa xmm1, xmm0 pand xmm1, xmm2 movdqa xmm3, xmm0 psrlq xmm3, 1 por xmm3, xmm1 xorps xmm4, xmm4 pcmpgtq xmm4, xmm0 blendvpd xmm0, xmm3, xmm0 pextrq rax, xmm0, 1 xorps xmm5, xmm5 cvtsi2ss xmm5, rax movq rax, xmm0 xorps xmm3, xmm3 cvtsi2ss xmm3, rax movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] insertps xmm3, xmm5, 28 # xmm3 = xmm3[0],xmm5[0],zero,zero movaps xmm5, xmm3 addps xmm5, xmm3 pshufd xmm0, xmm4, 237 # xmm0 = xmm4[1,3,2,3] blendvps xmm3, xmm5, xmm0 movdqa xmm0, xmm1 pand xmm0, xmm2 movdqa xmm4, xmm1 psrlq xmm4, 1 por xmm4, xmm0 xorps xmm5, xmm5 pcmpgtq xmm5, xmm1 movdqa xmm0, xmm1 blendvpd xmm1, xmm4, xmm0 pextrq rax, xmm1, 1 xorps xmm0, xmm0 cvtsi2ss xmm0, rax movq rax, xmm1 xorps xmm1, xmm1 cvtsi2ss xmm1, rax insertps xmm1, xmm0, 28 # xmm1 = xmm1[0],xmm0[0],zero,zero movaps xmm4, xmm1 addps xmm4, xmm1 pshufd xmm0, xmm5, 237 # xmm0 = xmm5[1,3,2,3] blendvps xmm1, xmm4, xmm0 movlhps xmm3, xmm1 # xmm3 = xmm3[0],xmm1[0] movups xmmword ptr [rcx + 4*rdi + 16], xmm3 add rdi, 8 add r10, 2 jne .LBB0_663 jmp .LBB0_1051 .LBB0_664: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_871 # %bb.665: xor eax, eax jmp .LBB0_873 .LBB0_666: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test 
rax, rax je .LBB0_1058 # %bb.667: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_668: # =>This Inner Loop Header: Depth=1 pmovzxwd xmm0, qword ptr [rdx + 2*rdi] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero pmovzxwd xmm1, qword ptr [rdx + 2*rdi + 8] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi], xmm0 movups xmmword ptr [rcx + 4*rdi + 16], xmm1 pmovzxwd xmm0, qword ptr [rdx + 2*rdi + 16] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero pmovzxwd xmm1, qword ptr [rdx + 2*rdi + 24] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi + 32], xmm0 movups xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_668 jmp .LBB0_1059 .LBB0_669: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_878 # %bb.670: xor eax, eax jmp .LBB0_880 .LBB0_671: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1063 # %bb.672: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_673: # =>This Inner Loop Header: Depth=1 pmovsxwd xmm0, qword ptr [rdx + 2*rdi] pmovsxwd xmm1, qword ptr [rdx + 2*rdi + 8] cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi], xmm0 movups xmmword ptr [rcx + 4*rdi + 16], xmm1 pmovsxwd xmm0, qword ptr [rdx + 2*rdi + 16] pmovsxwd xmm1, qword ptr [rdx + 2*rdi + 24] cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi + 32], xmm0 movups xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_673 jmp .LBB0_1064 .LBB0_677: and esi, -4 xor edi, edi .LBB0_678: # =>This Inner Loop Header: Depth=1 xorps xmm0, xmm0 cvtsi2ss xmm0, qword ptr [rdx + 8*rdi] movss dword ptr [rcx + 4*rdi], xmm0 xorps xmm0, xmm0 cvtsi2ss xmm0, qword ptr [rdx + 8*rdi + 8] movss dword ptr [rcx + 4*rdi + 4], xmm0 xorps xmm0, xmm0 cvtsi2ss 
xmm0, qword ptr [rdx + 8*rdi + 16] movss dword ptr [rcx + 4*rdi + 8], xmm0 xorps xmm0, xmm0 cvtsi2ss xmm0, qword ptr [rdx + 8*rdi + 24] movss dword ptr [rcx + 4*rdi + 12], xmm0 add rdi, 4 cmp rsi, rdi jne .LBB0_678 .LBB0_679: test rax, rax je .LBB0_1526 # %bb.680: lea rcx, [rcx + 4*rdi] lea rdx, [rdx + 8*rdi] xor esi, esi .LBB0_681: # =>This Inner Loop Header: Depth=1 xorps xmm0, xmm0 cvtsi2ss xmm0, qword ptr [rdx + 8*rsi] movss dword ptr [rcx + 4*rsi], xmm0 add rsi, 1 cmp rax, rsi jne .LBB0_681 jmp .LBB0_1526 .LBB0_682: and esi, -4 xor edi, edi .LBB0_683: # =>This Inner Loop Header: Depth=1 cvttss2si rax, dword ptr [rdx + 4*rdi] mov qword ptr [rcx + 8*rdi], rax cvttss2si rax, dword ptr [rdx + 4*rdi + 4] mov qword ptr [rcx + 8*rdi + 8], rax cvttss2si rax, dword ptr [rdx + 4*rdi + 8] mov qword ptr [rcx + 8*rdi + 16], rax cvttss2si rax, dword ptr [rdx + 4*rdi + 12] mov qword ptr [rcx + 8*rdi + 24], rax add rdi, 4 cmp rsi, rdi jne .LBB0_683 .LBB0_684: test r8, r8 je .LBB0_1526 # %bb.685: lea rcx, [rcx + 8*rdi] lea rdx, [rdx + 4*rdi] xor esi, esi .LBB0_686: # =>This Inner Loop Header: Depth=1 cvttss2si rax, dword ptr [rdx + 4*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 cmp r8, rsi jne .LBB0_686 jmp .LBB0_1526 .LBB0_696: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_894 # %bb.697: xor eax, eax jmp .LBB0_896 .LBB0_698: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1068 # %bb.699: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_700: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 4*rdi] movups xmm1, xmmword ptr [rdx + 4*rdi + 16] cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi], xmm0 movups xmmword ptr [rcx + 4*rdi + 16], xmm1 movups xmm0, xmmword ptr [rdx + 4*rdi + 32] movups xmm1, xmmword ptr [rdx + 4*rdi + 48] cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi + 32], 
xmm0 movups xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_700 jmp .LBB0_1069 .LBB0_734: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1073 # %bb.735: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_736: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + 8*rdi] movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] cvttpd2dq xmm0, xmm0 cvttpd2dq xmm1, xmm1 unpcklpd xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movupd xmmword ptr [rcx + 4*rdi], xmm0 movupd xmm0, xmmword ptr [rdx + 8*rdi + 32] movupd xmm1, xmmword ptr [rdx + 8*rdi + 48] cvttpd2dq xmm0, xmm0 cvttpd2dq xmm1, xmm1 unpcklpd xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movupd xmmword ptr [rcx + 4*rdi + 16], xmm0 add rdi, 8 add rax, 2 jne .LBB0_736 jmp .LBB0_1074 .LBB0_740: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_939 # %bb.741: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_742: # =>This Inner Loop Header: Depth=1 movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movdqu xmmword ptr [rcx + 4*rdi + 16], xmm0 add rdi, 8 add rax, 2 jne .LBB0_742 jmp .LBB0_940 .LBB0_743: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1078 # %bb.744: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_745: # =>This Inner Loop Header: Depth=1 pmovzxwd xmm0, qword ptr [rdx + 2*rdi] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero pmovzxwd xmm1, qword ptr [rdx + 2*rdi + 8] # xmm1 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 pmovzxwd xmm0, qword ptr [rdx + 2*rdi + 16] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero pmovzxwd xmm1, qword ptr [rdx + 2*rdi + 24] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero movdqu xmmword ptr [rcx + 4*rdi + 32], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_745 jmp .LBB0_1079 .LBB0_746: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1083 # %bb.747: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_748: # =>This Inner Loop Header: Depth=1 pmovsxwd xmm0, qword ptr [rdx + 2*rdi] pmovsxwd xmm1, qword ptr [rdx + 2*rdi + 8] movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 pmovsxwd xmm0, qword ptr [rdx + 2*rdi + 16] pmovsxwd xmm1, qword ptr [rdx + 2*rdi + 24] movdqu xmmword ptr [rcx + 4*rdi + 32], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_748 jmp .LBB0_1084 .LBB0_749: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1088 # %bb.750: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_751: # =>This Inner Loop Header: Depth=1 movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmm0, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 48] pshufd xmm0, xmm0, 232 # xmm0 = xmm0[0,2,2,3] pshufd xmm1, xmm1, 232 # xmm1 = xmm1[0,2,2,3] punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movdqu xmmword ptr [rcx + 4*rdi + 16], xmm0 add rdi, 8 add rax, 2 jne .LBB0_751 jmp .LBB0_1089 .LBB0_752: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1093 
# %bb.753: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_754: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 4*rdi] movups xmm1, xmmword ptr [rdx + 4*rdi + 16] cvttps2dq xmm0, xmm0 cvttps2dq xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi], xmm0 movups xmmword ptr [rcx + 4*rdi + 16], xmm1 movups xmm0, xmmword ptr [rdx + 4*rdi + 32] movups xmm1, xmmword ptr [rdx + 4*rdi + 48] cvttps2dq xmm0, xmm0 cvttps2dq xmm1, xmm1 movupd xmmword ptr [rcx + 4*rdi + 32], xmm0 movupd xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_754 jmp .LBB0_1094 .LBB0_761: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov rdi, rax shr rdi, 3 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 24 jae .LBB0_1098 # %bb.762: xor eax, eax jmp .LBB0_1100 .LBB0_763: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1478 # %bb.764: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_765: # =>This Inner Loop Header: Depth=1 pmovsxbd xmm0, dword ptr [rdx + rdi] pmovsxbd xmm1, dword ptr [rdx + rdi + 4] movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 pmovsxbd xmm0, dword ptr [rdx + rdi + 8] pmovsxbd xmm1, dword ptr [rdx + rdi + 12] movdqu xmmword ptr [rcx + 4*rdi + 32], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_765 jmp .LBB0_1479 .LBB0_766: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1486 # %bb.767: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_768: # =>This Inner Loop Header: Depth=1 pmovzxbd xmm0, dword ptr [rdx + rdi] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero pmovzxbd xmm1, dword ptr [rdx + rdi + 4] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 pmovzxbd xmm0, dword ptr [rdx + rdi + 8] # xmm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero pmovzxbd xmm1, dword ptr [rdx + rdi + 12] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero movdqu xmmword ptr [rcx + 4*rdi + 32], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_768 jmp .LBB0_1487 .LBB0_769: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov rdi, rax shr rdi, 3 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 24 jae .LBB0_1108 # %bb.770: xor eax, eax jmp .LBB0_1110 .LBB0_771: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_1118 # %bb.772: xor eax, eax jmp .LBB0_1120 .LBB0_773: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1494 # %bb.774: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_13] # xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> .LBB0_775: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 4*rdi] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] pshufb xmm1, xmm0 pshufb xmm2, xmm0 movd dword ptr [rcx + rdi], xmm1 movd dword ptr [rcx + rdi + 4], xmm2 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] pshufb xmm1, xmm0 pshufb xmm2, xmm0 movd dword ptr [rcx + rdi + 8], xmm1 movd dword ptr [rcx + rdi + 12], xmm2 add rdi, 16 add rax, 2 jne .LBB0_775 jmp .LBB0_1495 .LBB0_776: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1502 # %bb.777: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_1] # xmm0 = <0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u> .LBB0_778: # =>This Inner Loop Header: Depth=1 movupd xmm1, xmmword ptr [rdx + 8*rdi] movupd xmm2, xmmword ptr [rdx + 8*rdi + 16] cvttpd2dq xmm1, xmm1 cvttpd2dq xmm2, xmm2 pshufb xmm1, xmm0 pextrw word ptr [rcx + rdi], xmm1, 0 pshufb xmm2, xmm0 pextrw word ptr [rcx + rdi 
+ 2], xmm2, 0 movupd xmm1, xmmword ptr [rdx + 8*rdi + 32] movupd xmm2, xmmword ptr [rdx + 8*rdi + 48] cvttpd2dq xmm1, xmm1 cvttpd2dq xmm2, xmm2 pshufb xmm1, xmm0 pextrw word ptr [rcx + rdi + 4], xmm1, 0 pshufb xmm2, xmm0 pextrw word ptr [rcx + rdi + 6], xmm2, 0 add rdi, 8 add rax, 2 jne .LBB0_778 jmp .LBB0_1503 .LBB0_779: mov esi, r9d and esi, -32 lea rax, [rsi - 32] mov rdi, rax shr rdi, 5 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 96 jae .LBB0_1128 # %bb.780: xor eax, eax jmp .LBB0_1130 .LBB0_781: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1318 # %bb.782: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_5] # xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> .LBB0_783: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 8*rdi] movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] pshufb xmm1, xmm0 pextrw word ptr [rcx + rdi], xmm1, 0 pshufb xmm2, xmm0 pextrw word ptr [rcx + rdi + 2], xmm2, 0 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] pshufb xmm1, xmm0 pextrw word ptr [rcx + rdi + 4], xmm1, 0 pshufb xmm2, xmm0 pextrw word ptr [rcx + rdi + 6], xmm2, 0 add rdi, 8 add rax, 2 jne .LBB0_783 jmp .LBB0_1319 .LBB0_784: mov esi, r9d and esi, -16 lea rax, [rsi - 16] mov r8, rax shr r8, 4 add r8, 1 test rax, rax je .LBB0_1326 # %bb.785: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_17] # xmm0 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> .LBB0_786: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 2*rdi] movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + rdi], xmm1 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + rdi + 16], xmm1 add rdi, 32 add rax, 2 
jne .LBB0_786 jmp .LBB0_1327 .LBB0_787: mov esi, r9d and esi, -16 lea rax, [rsi - 16] mov r8, rax shr r8, 4 add r8, 1 test rax, rax je .LBB0_1510 # %bb.788: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_17] # xmm0 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> .LBB0_789: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 2*rdi] movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + rdi], xmm1 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + rdi + 16], xmm1 add rdi, 32 add rax, 2 jne .LBB0_789 jmp .LBB0_1511 .LBB0_790: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1334 # %bb.791: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_5] # xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> .LBB0_792: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 8*rdi] movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] pshufb xmm1, xmm0 pextrw word ptr [rcx + rdi], xmm1, 0 pshufb xmm2, xmm0 pextrw word ptr [rcx + rdi + 2], xmm2, 0 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] pshufb xmm1, xmm0 pextrw word ptr [rcx + rdi + 4], xmm1, 0 pshufb xmm2, xmm0 pextrw word ptr [rcx + rdi + 6], xmm2, 0 add rdi, 8 add rax, 2 jne .LBB0_792 jmp .LBB0_1335 .LBB0_793: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1518 # %bb.794: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_795: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 4*rdi] movups xmm1, xmmword ptr [rdx + 4*rdi + 16] cvttps2dq xmm0, xmm0 packssdw xmm0, xmm0 packsswb xmm0, xmm0 cvttps2dq xmm1, xmm1 packssdw xmm1, xmm1 packsswb xmm1, xmm1 movd dword 
ptr [rcx + rdi], xmm0 movd dword ptr [rcx + rdi + 4], xmm1 movups xmm0, xmmword ptr [rdx + 4*rdi + 32] movups xmm1, xmmword ptr [rdx + 4*rdi + 48] cvttps2dq xmm0, xmm0 packssdw xmm0, xmm0 packsswb xmm0, xmm0 cvttps2dq xmm1, xmm1 packssdw xmm1, xmm1 packsswb xmm1, xmm1 movd dword ptr [rcx + rdi + 8], xmm0 movd dword ptr [rcx + rdi + 12], xmm1 add rdi, 16 add rax, 2 jne .LBB0_795 jmp .LBB0_1519 .LBB0_796: mov esi, r9d and esi, -32 lea rax, [rsi - 32] mov rdi, rax shr rdi, 5 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 96 jae .LBB0_1138 # %bb.797: xor eax, eax jmp .LBB0_1140 .LBB0_798: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1342 # %bb.799: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_13] # xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> .LBB0_800: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 4*rdi] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] pshufb xmm1, xmm0 pshufb xmm2, xmm0 movd dword ptr [rcx + rdi], xmm1 movd dword ptr [rcx + rdi + 4], xmm2 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] pshufb xmm1, xmm0 pshufb xmm2, xmm0 movd dword ptr [rcx + rdi + 8], xmm1 movd dword ptr [rcx + rdi + 12], xmm2 add rdi, 16 add rax, 2 jne .LBB0_800 jmp .LBB0_1343 .LBB0_808: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_1148 # %bb.809: xor eax, eax jmp .LBB0_1150 .LBB0_810: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_1158 # %bb.811: xor eax, eax jmp .LBB0_1160 .LBB0_826: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_1168 # %bb.827: xor eax, eax jmp .LBB0_1170 .LBB0_828: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae 
.LBB0_1178 # %bb.829: xor eax, eax jmp .LBB0_1180 .LBB0_837: mov esi, r9d and esi, -16 lea rax, [rsi - 16] mov r8, rax shr r8, 4 add r8, 1 test rax, rax je .LBB0_1350 # %bb.838: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_839: # =>This Inner Loop Header: Depth=1 pmovsxbw xmm0, qword ptr [rdx + rdi] pmovsxbw xmm1, qword ptr [rdx + rdi + 8] movdqu xmmword ptr [rcx + 2*rdi], xmm0 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 pmovsxbw xmm0, qword ptr [rdx + rdi + 16] pmovsxbw xmm1, qword ptr [rdx + rdi + 24] movdqu xmmword ptr [rcx + 2*rdi + 32], xmm0 movdqu xmmword ptr [rcx + 2*rdi + 48], xmm1 add rdi, 32 add rax, 2 jne .LBB0_839 jmp .LBB0_1351 .LBB0_840: mov esi, r9d and esi, -16 lea rax, [rsi - 16] mov r8, rax shr r8, 4 add r8, 1 test rax, rax je .LBB0_1358 # %bb.841: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_842: # =>This Inner Loop Header: Depth=1 pmovsxbw xmm0, qword ptr [rdx + rdi] pmovsxbw xmm1, qword ptr [rdx + rdi + 8] movdqu xmmword ptr [rcx + 2*rdi], xmm0 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 pmovsxbw xmm0, qword ptr [rdx + rdi + 16] pmovsxbw xmm1, qword ptr [rdx + rdi + 24] movdqu xmmword ptr [rcx + 2*rdi + 32], xmm0 movdqu xmmword ptr [rcx + 2*rdi + 48], xmm1 add rdi, 32 add rax, 2 jne .LBB0_842 jmp .LBB0_1359 .LBB0_843: mov esi, r9d and esi, -16 lea rax, [rsi - 16] mov rdi, rax shr rdi, 4 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 48 jae .LBB0_1188 # %bb.844: xor eax, eax jmp .LBB0_1190 .LBB0_845: mov esi, r9d and esi, -16 lea rax, [rsi - 16] mov rdi, rax shr rdi, 4 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 48 jae .LBB0_1198 # %bb.846: xor eax, eax jmp .LBB0_1200 .LBB0_847: mov esi, r9d and esi, -16 lea rax, [rsi - 16] mov rdi, rax shr rdi, 4 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 48 jae .LBB0_1208 # %bb.848: xor eax, eax jmp .LBB0_1210 .LBB0_849: mov esi, r9d and esi, -16 lea rax, [rsi - 16] mov rdi, rax shr rdi, 4 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 48 jae .LBB0_1218 # %bb.850: xor eax, eax jmp .LBB0_1220 .LBB0_851: 
mov esi, r9d and esi, -16 lea rax, [rsi - 16] mov r8, rax shr r8, 4 add r8, 1 test rax, rax je .LBB0_1366 # %bb.852: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_853: # =>This Inner Loop Header: Depth=1 pmovzxbw xmm0, qword ptr [rdx + rdi] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero pmovzxbw xmm1, qword ptr [rdx + rdi + 8] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero movdqu xmmword ptr [rcx + 2*rdi], xmm0 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 pmovzxbw xmm0, qword ptr [rdx + rdi + 16] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero pmovzxbw xmm1, qword ptr [rdx + rdi + 24] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero movdqu xmmword ptr [rcx + 2*rdi + 32], xmm0 movdqu xmmword ptr [rcx + 2*rdi + 48], xmm1 add rdi, 32 add rax, 2 jne .LBB0_853 jmp .LBB0_1367 .LBB0_854: mov esi, r9d and esi, -16 lea rax, [rsi - 16] mov r8, rax shr r8, 4 add r8, 1 test rax, rax je .LBB0_1374 # %bb.855: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_856: # =>This Inner Loop Header: Depth=1 pmovzxbw xmm0, qword ptr [rdx + rdi] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero pmovzxbw xmm1, qword ptr [rdx + rdi + 8] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero movdqu xmmword ptr [rcx + 2*rdi], xmm0 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 pmovzxbw xmm0, qword ptr [rdx + rdi + 16] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero pmovzxbw xmm1, qword ptr [rdx + rdi + 24] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero movdqu xmmword ptr [rcx + 2*rdi + 32], xmm0 movdqu xmmword ptr [rcx + 2*rdi + 48], xmm1 add rdi, 32 
add rax, 2 jne .LBB0_856 jmp .LBB0_1375 .LBB0_864: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_1228 # %bb.865: xor eax, eax jmp .LBB0_1230 .LBB0_866: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1382 # %bb.867: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_868: # =>This Inner Loop Header: Depth=1 pmovsxbd xmm0, dword ptr [rdx + rdi] pmovsxbd xmm1, dword ptr [rdx + rdi + 4] cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi], xmm0 movups xmmword ptr [rcx + 4*rdi + 16], xmm1 pmovsxbd xmm0, dword ptr [rdx + rdi + 8] pmovsxbd xmm1, dword ptr [rdx + rdi + 12] cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi + 32], xmm0 movups xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_868 jmp .LBB0_1383 .LBB0_869: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_1238 # %bb.870: xor eax, eax jmp .LBB0_1240 .LBB0_885: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_1248 # %bb.886: xor eax, eax jmp .LBB0_1250 .LBB0_887: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov rdi, rax shr rdi, 3 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 24 jae .LBB0_1258 # %bb.888: xor eax, eax jmp .LBB0_1260 .LBB0_889: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov rdi, rax shr rdi, 2 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 12 jae .LBB0_1268 # %bb.890: xor eax, eax jmp .LBB0_1270 .LBB0_891: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1390 # %bb.892: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_893: # =>This Inner Loop Header: Depth=1 pmovzxbd xmm0, dword ptr [rdx + rdi] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero pmovzxbd xmm1, dword ptr 
[rdx + rdi + 4] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi], xmm0 movups xmmword ptr [rcx + 4*rdi + 16], xmm1 pmovzxbd xmm0, dword ptr [rdx + rdi + 8] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero pmovzxbd xmm1, dword ptr [rdx + rdi + 12] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi + 32], xmm0 movups xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_893 jmp .LBB0_1391 .LBB0_901: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1398 # %bb.902: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_13] # xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> .LBB0_903: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 4*rdi] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] pshufb xmm1, xmm0 pshufb xmm2, xmm0 movd dword ptr [rcx + rdi], xmm1 movd dword ptr [rcx + rdi + 4], xmm2 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] pshufb xmm1, xmm0 pshufb xmm2, xmm0 movd dword ptr [rcx + rdi + 8], xmm1 movd dword ptr [rcx + rdi + 12], xmm2 add rdi, 16 add rax, 2 jne .LBB0_903 jmp .LBB0_1399 .LBB0_904: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1406 # %bb.905: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_1] # xmm0 = <0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u> .LBB0_906: # =>This Inner Loop Header: Depth=1 movupd xmm1, xmmword ptr [rdx + 8*rdi] movupd xmm2, xmmword ptr [rdx + 8*rdi + 16] cvttpd2dq xmm1, xmm1 cvttpd2dq xmm2, xmm2 pshufb xmm1, xmm0 pextrw word ptr [rcx + rdi], xmm1, 0 pshufb xmm2, xmm0 pextrw word ptr [rcx + rdi + 2], xmm2, 0 movupd xmm1, 
xmmword ptr [rdx + 8*rdi + 32] movupd xmm2, xmmword ptr [rdx + 8*rdi + 48] cvttpd2dq xmm1, xmm1 cvttpd2dq xmm2, xmm2 pshufb xmm1, xmm0 pextrw word ptr [rcx + rdi + 4], xmm1, 0 pshufb xmm2, xmm0 pextrw word ptr [rcx + rdi + 6], xmm2, 0 add rdi, 8 add rax, 2 jne .LBB0_906 jmp .LBB0_1407 .LBB0_907: mov esi, r9d and esi, -32 lea rax, [rsi - 32] mov rdi, rax shr rdi, 5 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 96 jae .LBB0_1278 # %bb.908: xor eax, eax jmp .LBB0_1280 .LBB0_909: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1414 # %bb.910: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_5] # xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> .LBB0_911: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 8*rdi] movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] pshufb xmm1, xmm0 pextrw word ptr [rcx + rdi], xmm1, 0 pshufb xmm2, xmm0 pextrw word ptr [rcx + rdi + 2], xmm2, 0 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] pshufb xmm1, xmm0 pextrw word ptr [rcx + rdi + 4], xmm1, 0 pshufb xmm2, xmm0 pextrw word ptr [rcx + rdi + 6], xmm2, 0 add rdi, 8 add rax, 2 jne .LBB0_911 jmp .LBB0_1415 .LBB0_912: mov esi, r9d and esi, -16 lea rax, [rsi - 16] mov r8, rax shr r8, 4 add r8, 1 test rax, rax je .LBB0_1422 # %bb.913: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_17] # xmm0 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> .LBB0_914: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 2*rdi] movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + rdi], xmm1 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + rdi + 16], xmm1 add rdi, 32 add rax, 2 jne .LBB0_914 jmp 
.LBB0_1423 .LBB0_915: mov esi, r9d and esi, -16 lea rax, [rsi - 16] mov r8, rax shr r8, 4 add r8, 1 test rax, rax je .LBB0_1430 # %bb.916: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_17] # xmm0 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> .LBB0_917: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 2*rdi] movdqu xmm2, xmmword ptr [rdx + 2*rdi + 16] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + rdi], xmm1 movdqu xmm1, xmmword ptr [rdx + 2*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 2*rdi + 48] pshufb xmm1, xmm0 pshufb xmm2, xmm0 punpcklqdq xmm1, xmm2 # xmm1 = xmm1[0],xmm2[0] movdqu xmmword ptr [rcx + rdi + 16], xmm1 add rdi, 32 add rax, 2 jne .LBB0_917 jmp .LBB0_1431 .LBB0_918: mov esi, r9d and esi, -4 lea rax, [rsi - 4] mov r8, rax shr r8, 2 add r8, 1 test rax, rax je .LBB0_1438 # %bb.919: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_5] # xmm0 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> .LBB0_920: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 8*rdi] movdqu xmm2, xmmword ptr [rdx + 8*rdi + 16] pshufb xmm1, xmm0 pextrw word ptr [rcx + rdi], xmm1, 0 pshufb xmm2, xmm0 pextrw word ptr [rcx + rdi + 2], xmm2, 0 movdqu xmm1, xmmword ptr [rdx + 8*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 8*rdi + 48] pshufb xmm1, xmm0 pextrw word ptr [rcx + rdi + 4], xmm1, 0 pshufb xmm2, xmm0 pextrw word ptr [rcx + rdi + 6], xmm2, 0 add rdi, 8 add rax, 2 jne .LBB0_920 jmp .LBB0_1439 .LBB0_921: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1446 # %bb.922: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_923: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 4*rdi] movups xmm1, xmmword ptr [rdx + 4*rdi + 16] cvttps2dq xmm0, xmm0 packusdw xmm0, xmm0 packuswb xmm0, xmm0 cvttps2dq xmm1, xmm1 packusdw xmm1, xmm1 packuswb xmm1, xmm1 movd dword ptr [rcx + rdi], 
xmm0 movd dword ptr [rcx + rdi + 4], xmm1 movups xmm0, xmmword ptr [rdx + 4*rdi + 32] movups xmm1, xmmword ptr [rdx + 4*rdi + 48] cvttps2dq xmm0, xmm0 packusdw xmm0, xmm0 packuswb xmm0, xmm0 cvttps2dq xmm1, xmm1 packusdw xmm1, xmm1 packuswb xmm1, xmm1 movd dword ptr [rcx + rdi + 8], xmm0 movd dword ptr [rcx + rdi + 12], xmm1 add rdi, 16 add rax, 2 jne .LBB0_923 jmp .LBB0_1447 .LBB0_924: mov esi, r9d and esi, -32 lea rax, [rsi - 32] mov rdi, rax shr rdi, 5 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 96 jae .LBB0_1288 # %bb.925: xor eax, eax jmp .LBB0_1290 .LBB0_926: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1454 # %bb.927: mov rax, r8 and rax, -2 neg rax xor edi, edi movdqa xmm0, xmmword ptr [rip + .LCPI0_13] # xmm0 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> .LBB0_928: # =>This Inner Loop Header: Depth=1 movdqu xmm1, xmmword ptr [rdx + 4*rdi] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 16] pshufb xmm1, xmm0 pshufb xmm2, xmm0 movd dword ptr [rcx + rdi], xmm1 movd dword ptr [rcx + rdi + 4], xmm2 movdqu xmm1, xmmword ptr [rdx + 4*rdi + 32] movdqu xmm2, xmmword ptr [rdx + 4*rdi + 48] pshufb xmm1, xmm0 pshufb xmm2, xmm0 movd dword ptr [rcx + rdi + 8], xmm1 movd dword ptr [rcx + rdi + 12], xmm2 add rdi, 16 add rax, 2 jne .LBB0_928 jmp .LBB0_1455 .LBB0_929: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov rdi, rax shr rdi, 3 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 24 jae .LBB0_1298 # %bb.930: xor eax, eax jmp .LBB0_1300 .LBB0_931: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1462 # %bb.932: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_933: # =>This Inner Loop Header: Depth=1 pmovsxbd xmm0, dword ptr [rdx + rdi] pmovsxbd xmm1, dword ptr [rdx + rdi + 4] movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 pmovsxbd xmm0, dword ptr [rdx + rdi + 8] pmovsxbd xmm1, dword ptr [rdx + rdi + 12] movdqu xmmword ptr [rcx + 4*rdi + 32], xmm0 movdqu 
xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_933 jmp .LBB0_1463 .LBB0_934: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov r8, rax shr r8, 3 add r8, 1 test rax, rax je .LBB0_1470 # %bb.935: mov rax, r8 and rax, -2 neg rax xor edi, edi .LBB0_936: # =>This Inner Loop Header: Depth=1 pmovzxbd xmm0, dword ptr [rdx + rdi] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero pmovzxbd xmm1, dword ptr [rdx + rdi + 4] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 pmovzxbd xmm0, dword ptr [rdx + rdi + 8] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero pmovzxbd xmm1, dword ptr [rdx + rdi + 12] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero movdqu xmmword ptr [rcx + 4*rdi + 32], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 48], xmm1 add rdi, 16 add rax, 2 jne .LBB0_936 jmp .LBB0_1471 .LBB0_937: mov esi, r9d and esi, -8 lea rax, [rsi - 8] mov rdi, rax shr rdi, 3 add rdi, 1 mov r8d, edi and r8d, 3 cmp rax, 24 jae .LBB0_1308 # %bb.938: xor eax, eax jmp .LBB0_1310 .LBB0_801: and rdi, -4 neg rdi xor eax, eax .LBB0_802: # =>This Inner Loop Header: Depth=1 pmovzxdq xmm0, qword ptr [rdx + 4*rax] # xmm0 = mem[0],zero,mem[1],zero pmovzxdq xmm1, qword ptr [rdx + 4*rax + 8] # xmm1 = mem[0],zero,mem[1],zero movdqu xmmword ptr [rcx + 8*rax], xmm0 movdqu xmmword ptr [rcx + 8*rax + 16], xmm1 pmovzxdq xmm0, qword ptr [rdx + 4*rax + 16] # xmm0 = mem[0],zero,mem[1],zero pmovzxdq xmm1, qword ptr [rdx + 4*rax + 24] # xmm1 = mem[0],zero,mem[1],zero movdqu xmmword ptr [rcx + 8*rax + 32], xmm0 movdqu xmmword ptr [rcx + 8*rax + 48], xmm1 pmovzxdq xmm0, qword ptr [rdx + 4*rax + 32] # xmm0 = mem[0],zero,mem[1],zero pmovzxdq xmm1, qword ptr [rdx + 4*rax + 40] # xmm1 = mem[0],zero,mem[1],zero movdqu xmmword ptr 
[rcx + 8*rax + 64], xmm0 movdqu xmmword ptr [rcx + 8*rax + 80], xmm1 pmovzxdq xmm0, qword ptr [rdx + 4*rax + 48] # xmm0 = mem[0],zero,mem[1],zero pmovzxdq xmm1, qword ptr [rdx + 4*rax + 56] # xmm1 = mem[0],zero,mem[1],zero movdqu xmmword ptr [rcx + 8*rax + 96], xmm0 movdqu xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_802 .LBB0_803: test r8, r8 je .LBB0_806 # %bb.804: lea rax, [4*rax + 8] neg r8 .LBB0_805: # =>This Inner Loop Header: Depth=1 pmovzxdq xmm0, qword ptr [rdx + rax - 8] # xmm0 = mem[0],zero,mem[1],zero pmovzxdq xmm1, qword ptr [rdx + rax] # xmm1 = mem[0],zero,mem[1],zero movdqu xmmword ptr [rcx + 2*rax - 16], xmm0 movdqu xmmword ptr [rcx + 2*rax], xmm1 add rax, 16 inc r8 jne .LBB0_805 .LBB0_806: cmp rsi, r9 je .LBB0_1526 .LBB0_807: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 cmp r9, rsi jne .LBB0_807 jmp .LBB0_1526 .LBB0_812: and rdi, -4 neg rdi xor eax, eax .LBB0_813: # =>This Inner Loop Header: Depth=1 pmovzxwq xmm0, dword ptr [rdx + 2*rax] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero pmovzxwq xmm1, dword ptr [rdx + 2*rax + 4] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero movdqu xmmword ptr [rcx + 8*rax], xmm0 movdqu xmmword ptr [rcx + 8*rax + 16], xmm1 pmovzxwq xmm0, dword ptr [rdx + 2*rax + 8] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero pmovzxwq xmm1, dword ptr [rdx + 2*rax + 12] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero movdqu xmmword ptr [rcx + 8*rax + 32], xmm0 movdqu xmmword ptr [rcx + 8*rax + 48], xmm1 pmovzxwq xmm0, dword ptr [rdx + 2*rax + 16] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero pmovzxwq xmm1, dword ptr [rdx + 2*rax + 20] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero movdqu xmmword ptr [rcx + 8*rax + 64], xmm0 movdqu xmmword ptr [rcx + 8*rax + 80], xmm1 pmovzxwq xmm0, dword ptr [rdx + 2*rax + 24] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero pmovzxwq xmm1, dword ptr [rdx + 2*rax + 28] 
# xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero movdqu xmmword ptr [rcx + 8*rax + 96], xmm0 movdqu xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_813 .LBB0_814: test r8, r8 je .LBB0_817 # %bb.815: lea rdi, [rcx + 8*rax] add rdi, 16 lea r10, [rdx + 2*rax] add r10, 4 xor eax, eax .LBB0_816: # =>This Inner Loop Header: Depth=1 pmovzxwq xmm0, dword ptr [r10 + 8*rax - 4] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero pmovzxwq xmm1, dword ptr [r10 + 8*rax] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero movdqu xmmword ptr [rdi - 16], xmm0 movdqu xmmword ptr [rdi], xmm1 add rdi, 32 add rax, 1 cmp r8, rax jne .LBB0_816 .LBB0_817: cmp rsi, r9 je .LBB0_1526 .LBB0_818: # =>This Inner Loop Header: Depth=1 movzx eax, word ptr [rdx + 2*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 cmp r9, rsi jne .LBB0_818 jmp .LBB0_1526 .LBB0_819: and rdi, -4 neg rdi xor eax, eax .LBB0_820: # =>This Inner Loop Header: Depth=1 pmovsxwq xmm0, dword ptr [rdx + 2*rax] pmovsxwq xmm1, dword ptr [rdx + 2*rax + 4] movdqu xmmword ptr [rcx + 8*rax], xmm0 movdqu xmmword ptr [rcx + 8*rax + 16], xmm1 pmovsxwq xmm0, dword ptr [rdx + 2*rax + 8] pmovsxwq xmm1, dword ptr [rdx + 2*rax + 12] movdqu xmmword ptr [rcx + 8*rax + 32], xmm0 movdqu xmmword ptr [rcx + 8*rax + 48], xmm1 pmovsxwq xmm0, dword ptr [rdx + 2*rax + 16] pmovsxwq xmm1, dword ptr [rdx + 2*rax + 20] movdqu xmmword ptr [rcx + 8*rax + 64], xmm0 movdqu xmmword ptr [rcx + 8*rax + 80], xmm1 pmovsxwq xmm0, dword ptr [rdx + 2*rax + 24] pmovsxwq xmm1, dword ptr [rdx + 2*rax + 28] movdqu xmmword ptr [rcx + 8*rax + 96], xmm0 movdqu xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_820 .LBB0_821: test r8, r8 je .LBB0_824 # %bb.822: lea rdi, [rcx + 8*rax] add rdi, 16 lea r10, [rdx + 2*rax] add r10, 4 xor eax, eax .LBB0_823: # =>This Inner Loop Header: Depth=1 pmovsxwq xmm0, dword ptr [r10 + 8*rax - 4] pmovsxwq xmm1, dword ptr [r10 + 8*rax] movdqu xmmword ptr [rdi - 16], xmm0 movdqu xmmword ptr 
[rdi], xmm1 add rdi, 32 add rax, 1 cmp r8, rax jne .LBB0_823 .LBB0_824: cmp rsi, r9 je .LBB0_1526 .LBB0_825: # =>This Inner Loop Header: Depth=1 movsx rax, word ptr [rdx + 2*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 cmp r9, rsi jne .LBB0_825 jmp .LBB0_1526 .LBB0_830: and rdi, -4 neg rdi xor eax, eax .LBB0_831: # =>This Inner Loop Header: Depth=1 pmovsxdq xmm0, qword ptr [rdx + 4*rax] pmovsxdq xmm1, qword ptr [rdx + 4*rax + 8] movdqu xmmword ptr [rcx + 8*rax], xmm0 movdqu xmmword ptr [rcx + 8*rax + 16], xmm1 pmovsxdq xmm0, qword ptr [rdx + 4*rax + 16] pmovsxdq xmm1, qword ptr [rdx + 4*rax + 24] movdqu xmmword ptr [rcx + 8*rax + 32], xmm0 movdqu xmmword ptr [rcx + 8*rax + 48], xmm1 pmovsxdq xmm0, qword ptr [rdx + 4*rax + 32] pmovsxdq xmm1, qword ptr [rdx + 4*rax + 40] movdqu xmmword ptr [rcx + 8*rax + 64], xmm0 movdqu xmmword ptr [rcx + 8*rax + 80], xmm1 pmovsxdq xmm0, qword ptr [rdx + 4*rax + 48] pmovsxdq xmm1, qword ptr [rdx + 4*rax + 56] movdqu xmmword ptr [rcx + 8*rax + 96], xmm0 movdqu xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_831 .LBB0_832: test r8, r8 je .LBB0_835 # %bb.833: lea rax, [4*rax + 8] neg r8 .LBB0_834: # =>This Inner Loop Header: Depth=1 pmovsxdq xmm0, qword ptr [rdx + rax - 8] pmovsxdq xmm1, qword ptr [rdx + rax] movdqu xmmword ptr [rcx + 2*rax - 16], xmm0 movdqu xmmword ptr [rcx + 2*rax], xmm1 add rax, 16 inc r8 jne .LBB0_834 .LBB0_835: cmp rsi, r9 je .LBB0_1526 .LBB0_836: # =>This Inner Loop Header: Depth=1 movsxd rax, dword ptr [rdx + 4*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 cmp r9, rsi jne .LBB0_836 jmp .LBB0_1526 .LBB0_857: and rdi, -4 neg rdi xor eax, eax .LBB0_858: # =>This Inner Loop Header: Depth=1 pmovzxdq xmm0, qword ptr [rdx + 4*rax] # xmm0 = mem[0],zero,mem[1],zero pmovzxdq xmm1, qword ptr [rdx + 4*rax + 8] # xmm1 = mem[0],zero,mem[1],zero movdqu xmmword ptr [rcx + 8*rax], xmm0 movdqu xmmword ptr [rcx + 8*rax + 16], xmm1 pmovzxdq xmm0, qword ptr [rdx + 4*rax + 16] # xmm0 = 
mem[0],zero,mem[1],zero pmovzxdq xmm1, qword ptr [rdx + 4*rax + 24] # xmm1 = mem[0],zero,mem[1],zero movdqu xmmword ptr [rcx + 8*rax + 32], xmm0 movdqu xmmword ptr [rcx + 8*rax + 48], xmm1 pmovzxdq xmm0, qword ptr [rdx + 4*rax + 32] # xmm0 = mem[0],zero,mem[1],zero pmovzxdq xmm1, qword ptr [rdx + 4*rax + 40] # xmm1 = mem[0],zero,mem[1],zero movdqu xmmword ptr [rcx + 8*rax + 64], xmm0 movdqu xmmword ptr [rcx + 8*rax + 80], xmm1 pmovzxdq xmm0, qword ptr [rdx + 4*rax + 48] # xmm0 = mem[0],zero,mem[1],zero pmovzxdq xmm1, qword ptr [rdx + 4*rax + 56] # xmm1 = mem[0],zero,mem[1],zero movdqu xmmword ptr [rcx + 8*rax + 96], xmm0 movdqu xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_858 .LBB0_859: test r8, r8 je .LBB0_862 # %bb.860: lea rax, [4*rax + 8] neg r8 .LBB0_861: # =>This Inner Loop Header: Depth=1 pmovzxdq xmm0, qword ptr [rdx + rax - 8] # xmm0 = mem[0],zero,mem[1],zero pmovzxdq xmm1, qword ptr [rdx + rax] # xmm1 = mem[0],zero,mem[1],zero movdqu xmmword ptr [rcx + 2*rax - 16], xmm0 movdqu xmmword ptr [rcx + 2*rax], xmm1 add rax, 16 inc r8 jne .LBB0_861 .LBB0_862: cmp rsi, r9 je .LBB0_1526 .LBB0_863: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rdx + 4*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 cmp r9, rsi jne .LBB0_863 jmp .LBB0_1526 .LBB0_871: and rdi, -4 neg rdi xor eax, eax .LBB0_872: # =>This Inner Loop Header: Depth=1 pmovzxwq xmm0, dword ptr [rdx + 2*rax] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero pmovzxwq xmm1, dword ptr [rdx + 2*rax + 4] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero movdqu xmmword ptr [rcx + 8*rax], xmm0 movdqu xmmword ptr [rcx + 8*rax + 16], xmm1 pmovzxwq xmm0, dword ptr [rdx + 2*rax + 8] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero pmovzxwq xmm1, dword ptr [rdx + 2*rax + 12] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero movdqu xmmword ptr [rcx + 8*rax + 32], xmm0 movdqu xmmword ptr [rcx + 8*rax + 48], xmm1 pmovzxwq xmm0, dword ptr [rdx + 2*rax + 16] # xmm0 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero pmovzxwq xmm1, dword ptr [rdx + 2*rax + 20] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero movdqu xmmword ptr [rcx + 8*rax + 64], xmm0 movdqu xmmword ptr [rcx + 8*rax + 80], xmm1 pmovzxwq xmm0, dword ptr [rdx + 2*rax + 24] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero pmovzxwq xmm1, dword ptr [rdx + 2*rax + 28] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero movdqu xmmword ptr [rcx + 8*rax + 96], xmm0 movdqu xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_872 .LBB0_873: test r8, r8 je .LBB0_876 # %bb.874: lea rdi, [rcx + 8*rax] add rdi, 16 lea r10, [rdx + 2*rax] add r10, 4 xor eax, eax .LBB0_875: # =>This Inner Loop Header: Depth=1 pmovzxwq xmm0, dword ptr [r10 + 8*rax - 4] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero pmovzxwq xmm1, dword ptr [r10 + 8*rax] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero movdqu xmmword ptr [rdi - 16], xmm0 movdqu xmmword ptr [rdi], xmm1 add rdi, 32 add rax, 1 cmp r8, rax jne .LBB0_875 .LBB0_876: cmp rsi, r9 je .LBB0_1526 .LBB0_877: # =>This Inner Loop Header: Depth=1 movzx eax, word ptr [rdx + 2*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 cmp r9, rsi jne .LBB0_877 jmp .LBB0_1526 .LBB0_878: and rdi, -4 neg rdi xor eax, eax .LBB0_879: # =>This Inner Loop Header: Depth=1 pmovsxwq xmm0, dword ptr [rdx + 2*rax] pmovsxwq xmm1, dword ptr [rdx + 2*rax + 4] movdqu xmmword ptr [rcx + 8*rax], xmm0 movdqu xmmword ptr [rcx + 8*rax + 16], xmm1 pmovsxwq xmm0, dword ptr [rdx + 2*rax + 8] pmovsxwq xmm1, dword ptr [rdx + 2*rax + 12] movdqu xmmword ptr [rcx + 8*rax + 32], xmm0 movdqu xmmword ptr [rcx + 8*rax + 48], xmm1 pmovsxwq xmm0, dword ptr [rdx + 2*rax + 16] pmovsxwq xmm1, dword ptr [rdx + 2*rax + 20] movdqu xmmword ptr [rcx + 8*rax + 64], xmm0 movdqu xmmword ptr [rcx + 8*rax + 80], xmm1 pmovsxwq xmm0, dword ptr [rdx + 2*rax + 24] pmovsxwq xmm1, dword ptr [rdx + 2*rax + 28] movdqu xmmword ptr [rcx + 8*rax + 96], xmm0 movdqu xmmword ptr [rcx + 
8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_879 .LBB0_880: test r8, r8 je .LBB0_883 # %bb.881: lea rdi, [rcx + 8*rax] add rdi, 16 lea r10, [rdx + 2*rax] add r10, 4 xor eax, eax .LBB0_882: # =>This Inner Loop Header: Depth=1 pmovsxwq xmm0, dword ptr [r10 + 8*rax - 4] pmovsxwq xmm1, dword ptr [r10 + 8*rax] movdqu xmmword ptr [rdi - 16], xmm0 movdqu xmmword ptr [rdi], xmm1 add rdi, 32 add rax, 1 cmp r8, rax jne .LBB0_882 .LBB0_883: cmp rsi, r9 je .LBB0_1526 .LBB0_884: # =>This Inner Loop Header: Depth=1 movsx rax, word ptr [rdx + 2*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 cmp r9, rsi jne .LBB0_884 jmp .LBB0_1526 .LBB0_894: and rdi, -4 neg rdi xor eax, eax .LBB0_895: # =>This Inner Loop Header: Depth=1 pmovsxdq xmm0, qword ptr [rdx + 4*rax] pmovsxdq xmm1, qword ptr [rdx + 4*rax + 8] movdqu xmmword ptr [rcx + 8*rax], xmm0 movdqu xmmword ptr [rcx + 8*rax + 16], xmm1 pmovsxdq xmm0, qword ptr [rdx + 4*rax + 16] pmovsxdq xmm1, qword ptr [rdx + 4*rax + 24] movdqu xmmword ptr [rcx + 8*rax + 32], xmm0 movdqu xmmword ptr [rcx + 8*rax + 48], xmm1 pmovsxdq xmm0, qword ptr [rdx + 4*rax + 32] pmovsxdq xmm1, qword ptr [rdx + 4*rax + 40] movdqu xmmword ptr [rcx + 8*rax + 64], xmm0 movdqu xmmword ptr [rcx + 8*rax + 80], xmm1 pmovsxdq xmm0, qword ptr [rdx + 4*rax + 48] pmovsxdq xmm1, qword ptr [rdx + 4*rax + 56] movdqu xmmword ptr [rcx + 8*rax + 96], xmm0 movdqu xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_895 .LBB0_896: test r8, r8 je .LBB0_899 # %bb.897: lea rax, [4*rax + 8] neg r8 .LBB0_898: # =>This Inner Loop Header: Depth=1 pmovsxdq xmm0, qword ptr [rdx + rax - 8] pmovsxdq xmm1, qword ptr [rdx + rax] movdqu xmmword ptr [rcx + 2*rax - 16], xmm0 movdqu xmmword ptr [rcx + 2*rax], xmm1 add rax, 16 inc r8 jne .LBB0_898 .LBB0_899: cmp rsi, r9 je .LBB0_1526 .LBB0_900: # =>This Inner Loop Header: Depth=1 movsxd rax, dword ptr [rdx + 4*rsi] mov qword ptr [rcx + 8*rsi], rax add rsi, 1 cmp r9, rsi jne .LBB0_900 jmp .LBB0_1526 .LBB0_939: xor edi, edi 
# ---------------------------------------------------------------------------
# Remainder handling for a run of element-wise cast kernels (interior of
# cast_type_numeric_sse4; the function entry and the dispatch that jumps to
# these labels are outside this excerpt).
#
# Every kernel below has the same shape:
#   .LBB0_N   : xor edi, edi     - entry that starts the vector index at 0
#   .LBB0_N+1 : test r8b, 1      - if bit 0 of r8 is set, run ONE more vector
#                                  step at element index rdi
#   .LBB0_N+3 : cmp rsi, r9      - nothing left when rsi == r9
#   .LBB0_N+4 : scalar loop, one element per iteration, rsi counts up to r9
#
# Register use as seen in this excerpt:
#   rdx = source base (every load),  rcx = destination base (every store)
#   rdi = element index of the vector step, rsi = scalar element index
#   r9  = scalar loop bound
#   r8  = only its low bit is read here
#         NOTE(review): presumably the number of leftover vector steps,
#         computed by code outside this excerpt - confirm.
# Source/destination element widths are read off the index scale factors.
# ---------------------------------------------------------------------------

# --- 8-byte -> 4-byte elements: keep the low dword of each qword -----------
.LBB0_940:
	test	r8b, 1
	je	.LBB0_942
# %bb.941:
	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
	movdqu	xmmword ptr [rcx + 4*rdi], xmm0 # 4 narrowed elements
.LBB0_942:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_943:                              # =>This Inner Loop Header: Depth=1
	mov	eax, dword ptr [rdx + 8*rsi]    # low 32 bits of the 64-bit element
	mov	dword ptr [rcx + 4*rsi], eax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_943
	jmp	.LBB0_1526

# --- 8-byte -> 4-byte elements (same code as above, separate kernel) -------
.LBB0_944:
	xor	edi, edi
.LBB0_945:
	test	r8b, 1
	je	.LBB0_947
# %bb.946:
	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
	movdqu	xmmword ptr [rcx + 4*rdi], xmm0
.LBB0_947:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_948:                              # =>This Inner Loop Header: Depth=1
	mov	eax, dword ptr [rdx + 8*rsi]
	mov	dword ptr [rcx + 4*rsi], eax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_948
	jmp	.LBB0_1526

# --- 2-byte -> 4-byte elements, zero-extended ------------------------------
.LBB0_949:
	xor	edi, edi
.LBB0_950:
	test	r8b, 1
	je	.LBB0_952
# %bb.951:
	pmovzxwd	xmm0, qword ptr [rdx + 2*rdi]   # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
	pmovzxwd	xmm1, qword ptr [rdx + 2*rdi + 8] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
	movdqu	xmmword ptr [rcx + 4*rdi], xmm0
	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm1
.LBB0_952:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_953:                              # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdx + 2*rsi]
	mov	dword ptr [rcx + 4*rsi], eax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_953
	jmp	.LBB0_1526

# --- 2-byte -> 4-byte elements, sign-extended ------------------------------
.LBB0_954:
	xor	edi, edi
.LBB0_955:
	test	r8b, 1
	je	.LBB0_957
# %bb.956:
	pmovsxwd	xmm0, qword ptr [rdx + 2*rdi]
	pmovsxwd	xmm1, qword ptr [rdx + 2*rdi + 8]
	movdqu	xmmword ptr [rcx + 4*rdi], xmm0
	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm1
.LBB0_957:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_958:                              # =>This Inner Loop Header: Depth=1
	movsx	eax, word ptr [rdx + 2*rsi]
	mov	dword ptr [rcx + 4*rsi], eax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_958
	jmp	.LBB0_1526

# --- 8-byte -> 4-byte elements: keep the low dword -------------------------
.LBB0_959:
	xor	edi, edi
.LBB0_960:
	test	r8b, 1
	je	.LBB0_962
# %bb.961:
	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
	movdqu	xmmword ptr [rcx + 4*rdi], xmm0
.LBB0_962:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_963:                              # =>This Inner Loop Header: Depth=1
	mov	eax, dword ptr [rdx + 8*rsi]
	mov	dword ptr [rcx + 4*rsi], eax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_963
	jmp	.LBB0_1526

# --- float32 -> 32-bit integer, full unsigned range ------------------------
# Vector step: lanes below 2^31 (.LCPI0_3) use the plain truncating convert;
# the other lanes are converted after subtracting 2^31 and then get bit 31
# (.LCPI0_4 = 0x80000000) flipped back in. blendvps picks per lane using the
# compare mask held in xmm0.
# NOTE(review): destination is presumably uint32 - confirm against the
# dispatch code.
.LBB0_964:
	xor	edi, edi
.LBB0_965:
	test	r8b, 1
	je	.LBB0_967
# %bb.966:
	movups	xmm1, xmmword ptr [rdx + 4*rdi]
	movups	xmm2, xmmword ptr [rdx + 4*rdi + 16]
	movaps	xmm3, xmmword ptr [rip + .LCPI0_3] # xmm3 = [2.14748365E+9,2.14748365E+9,2.14748365E+9,2.14748365E+9]
	movaps	xmm0, xmm1
	cmpltps	xmm0, xmm3                      # mask: lane < 2^31
	cvttps2dq	xmm4, xmm1              # direct conversion
	subps	xmm1, xmm3
	cvttps2dq	xmm1, xmm1              # conversion of (x - 2^31)
	movaps	xmm5, xmmword ptr [rip + .LCPI0_4] # xmm5 = [2147483648,2147483648,2147483648,2147483648]
	xorps	xmm1, xmm5                      # add 2^31 back by flipping bit 31
	blendvps	xmm1, xmm4, xmm0        # mask set -> take direct conversion
	movaps	xmm0, xmm2                      # same sequence for the second vector
	cmpltps	xmm0, xmm3
	cvttps2dq	xmm4, xmm2
	subps	xmm2, xmm3
	cvttps2dq	xmm2, xmm2
	xorps	xmm2, xmm5
	blendvps	xmm2, xmm4, xmm0
	movups	xmmword ptr [rcx + 4*rdi], xmm1
	movups	xmmword ptr [rcx + 4*rdi + 16], xmm2
.LBB0_967:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_968:                              # =>This Inner Loop Header: Depth=1
	cvttss2si	rax, dword ptr [rdx + 4*rsi] # 64-bit truncating convert
	mov	dword ptr [rcx + 4*rsi], eax    # only the low 32 bits are stored
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_968
	jmp	.LBB0_1526

# --- unsigned 64-bit integer -> float64 ------------------------------------
# Vector step: the low 32 bits are OR-ed into the mantissa of 2^52
# (.LCPI0_6), the high 32 bits into the mantissa of 2^84 (.LCPI0_7);
# subtracting 2^84 + 2^52 (.LCPI0_8) from the high part and adding the low
# part yields the converted value.
.LBB0_969:
	xor	edi, edi
.LBB0_970:
	test	r8b, 1
	je	.LBB0_972
# %bb.971:
	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	pxor	xmm2, xmm2
	movdqa	xmm3, xmm0
	pblendw	xmm3, xmm2, 204                 # xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
	movdqa	xmm4, xmmword ptr [rip + .LCPI0_6] # xmm4 = [4841369599423283200,4841369599423283200]
	por	xmm3, xmm4                      # low halves as doubles biased by 2^52
	psrlq	xmm0, 32
	movdqa	xmm5, xmmword ptr [rip + .LCPI0_7] # xmm5 = [4985484787499139072,4985484787499139072]
	por	xmm0, xmm5                      # high halves as doubles biased by 2^84
	movapd	xmm6, xmmword ptr [rip + .LCPI0_8] # xmm6 = [1.9342813118337666E+25,1.9342813118337666E+25]
	subpd	xmm0, xmm6
	addpd	xmm0, xmm3
	pblendw	xmm2, xmm1, 51                  # xmm2 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
	por	xmm2, xmm4
	psrlq	xmm1, 32
	por	xmm1, xmm5
	subpd	xmm1, xmm6
	addpd	xmm1, xmm2
	movupd	xmmword ptr [rcx + 8*rdi], xmm0
	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm1
.LBB0_972:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_973:
	# Loop-invariant constants for the scalar version of the same trick:
	# exponent words for 2^52 / 2^84 and the matching biases to subtract.
	movapd	xmm0, xmmword ptr [rip + .LCPI0_9] # xmm0 = [1127219200,1160773632,0,0]
	movapd	xmm1, xmmword ptr [rip + .LCPI0_10] # xmm1 = [4.503599627370496E+15,1.9342813113834067E+25]
.LBB0_974:                              # =>This Inner Loop Header: Depth=1
	movsd	xmm2, qword ptr [rdx + 8*rsi]   # xmm2 = mem[0],zero
	unpcklps	xmm2, xmm0              # xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
	subpd	xmm2, xmm1                      # remove the 2^52 and 2^84 biases
	movapd	xmm3, xmm2
	unpckhpd	xmm3, xmm2              # xmm3 = xmm3[1],xmm2[1]
	addsd	xmm3, xmm2                      # high part + low part
	movsd	qword ptr [rcx + 8*rsi], xmm3
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_974
	jmp	.LBB0_1526

# --- float32 -> float64 ----------------------------------------------------
.LBB0_975:
	xor	edi, edi
.LBB0_976:
	test	r8b, 1
	je	.LBB0_978
# %bb.977:
	cvtps2pd	xmm0, qword ptr [rdx + 4*rdi]
	cvtps2pd	xmm1, qword ptr [rdx + 4*rdi + 8]
	movupd	xmmword ptr [rcx + 8*rdi], xmm0
	movupd	xmmword ptr [rcx + 8*rdi + 16], xmm1
.LBB0_978:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_979:                              # =>This Inner Loop Header: Depth=1
	movss	xmm0, dword ptr [rdx + 4*rsi]   # xmm0 = mem[0],zero,zero,zero
	cvtss2sd	xmm0, xmm0
	movsd	qword ptr [rcx + 8*rsi], xmm0
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_979
	jmp	.LBB0_1526

# --- 4-byte -> 2-byte elements: keep the low word of each dword ------------
# The pshufb mask .LCPI0_12 gathers bytes 0,1,4,5,8,9,12,13 into the low
# qword of each vector; punpcklqdq joins the two low qwords.
.LBB0_980:
	xor	edi, edi
.LBB0_981:
	test	r8b, 1
	je	.LBB0_983
# %bb.982:
	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
	movdqa	xmm2, xmmword ptr [rip + .LCPI0_12] # xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
	pshufb	xmm0, xmm2
	pshufb	xmm1, xmm2
	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
	movdqu	xmmword ptr [rcx + 2*rdi], xmm0
.LBB0_983:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_984:                              # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdx + 4*rsi]     # low 16 bits of the 32-bit element
	mov	word ptr [rcx + 2*rsi], ax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_984
	jmp	.LBB0_1526

# --- 4-byte -> 2-byte elements (same code as above, separate kernel) -------
.LBB0_985:
	xor	edi, edi
.LBB0_986:
	test	r8b, 1
	je	.LBB0_988
# %bb.987:
	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
	movdqa	xmm2, xmmword ptr [rip + .LCPI0_12] # xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
	pshufb	xmm0, xmm2
	pshufb	xmm1, xmm2
	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
	movdqu	xmmword ptr [rcx + 2*rdi], xmm0
.LBB0_988:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_989:                              # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdx + 4*rsi]
	mov	word ptr [rcx + 2*rsi], ax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_989
	jmp	.LBB0_1526

# --- float64 -> 2-byte integer (truncating convert, low word kept) ---------
# Vector step handles 4 doubles: each cvttpd2dq yields two dwords, pshuflw
# packs their low words together, and movd stores the resulting 4 bytes.
.LBB0_990:
	xor	edi, edi
.LBB0_991:
	test	r8b, 1
	je	.LBB0_993
# %bb.992:
	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	cvttpd2dq	xmm0, xmm0
	cvttpd2dq	xmm1, xmm1
	pshuflw	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3,4,5,6,7]
	pshuflw	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3,4,5,6,7]
	movd	dword ptr [rcx + 2*rdi], xmm0
	movd	dword ptr [rcx + 2*rdi + 4], xmm1
.LBB0_993:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_994:                              # =>This Inner Loop Header: Depth=1
	cvttsd2si	eax, qword ptr [rdx + 8*rsi]
	mov	word ptr [rcx + 2*rsi], ax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_994
	jmp	.LBB0_1526

# --- float64 -> 2-byte integer (same code as above, separate kernel) -------
.LBB0_995:
	xor	edi, edi
.LBB0_996:
	test	r8b, 1
	je	.LBB0_998
# %bb.997:
	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	cvttpd2dq	xmm0, xmm0
	cvttpd2dq	xmm1, xmm1
	pshuflw	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3,4,5,6,7]
	pshuflw	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3,4,5,6,7]
	movd	dword ptr [rcx + 2*rdi], xmm0
	movd	dword ptr [rcx + 2*rdi + 4], xmm1
.LBB0_998:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_999:                              # =>This Inner Loop Header: Depth=1
	cvttsd2si	eax, qword ptr [rdx + 8*rsi]
	mov	word ptr [rcx + 2*rsi], ax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_999
	jmp	.LBB0_1526

# --- 8-byte -> 2-byte elements: keep the low word of each qword ------------
# pshufd gathers the low dwords, pshuflw then gathers their low words; each
# movd stores two narrowed elements. Four identical kernels follow.
.LBB0_1000:
	xor	edi, edi
.LBB0_1001:
	test	r8b, 1
	je	.LBB0_1003
# %bb.1002:
	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
	pshuflw	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3,4,5,6,7]
	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
	pshuflw	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3,4,5,6,7]
	movd	dword ptr [rcx + 2*rdi], xmm0
	movd	dword ptr [rcx + 2*rdi + 4], xmm1
.LBB0_1003:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1004:                             # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdx + 8*rsi]     # low 16 bits of the 64-bit element
	mov	word ptr [rcx + 2*rsi], ax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1004
	jmp	.LBB0_1526
.LBB0_1005:
	xor	edi, edi
.LBB0_1006:
	test	r8b, 1
	je	.LBB0_1008
# %bb.1007:
	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
	pshuflw	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3,4,5,6,7]
	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
	pshuflw	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3,4,5,6,7]
	movd	dword ptr [rcx + 2*rdi], xmm0
	movd	dword ptr [rcx + 2*rdi + 4], xmm1
.LBB0_1008:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1009:                             # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdx + 8*rsi]
	mov	word ptr [rcx + 2*rsi], ax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1009
	jmp	.LBB0_1526
.LBB0_1010:
	xor	edi, edi
.LBB0_1011:
	test	r8b, 1
	je	.LBB0_1013
# %bb.1012:
	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
	pshuflw	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3,4,5,6,7]
	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
	pshuflw	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3,4,5,6,7]
	movd	dword ptr [rcx + 2*rdi], xmm0
	movd	dword ptr [rcx + 2*rdi + 4], xmm1
.LBB0_1013:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1014:                             # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdx + 8*rsi]
	mov	word ptr [rcx + 2*rsi], ax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1014
	jmp	.LBB0_1526
.LBB0_1015:
	xor	edi, edi
.LBB0_1016:
	test	r8b, 1
	je	.LBB0_1018
# %bb.1017:
	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
	pshuflw	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3,4,5,6,7]
	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
	pshuflw	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3,4,5,6,7]
	movd	dword ptr [rcx + 2*rdi], xmm0
	movd	dword ptr [rcx + 2*rdi + 4], xmm1
.LBB0_1018:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1019:                             # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdx + 8*rsi]
	mov	word ptr [rcx + 2*rsi], ax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1019
	jmp	.LBB0_1526

# --- float32 -> 2-byte integer, packed with unsigned saturation ------------
# NOTE(review): packusdw saturates each converted dword to 0..65535, whereas
# the scalar loop below stores the low 16 bits of the converted value. The
# two paths give different results for inputs outside the 16-bit range;
# confirm whether callers rely on either behaviour.
.LBB0_1020:
	xor	edi, edi
.LBB0_1021:
	test	r8b, 1
	je	.LBB0_1023
# %bb.1022:
	movups	xmm0, xmmword ptr [rdx + 4*rdi]
	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
	cvttps2dq	xmm0, xmm0
	cvttps2dq	xmm1, xmm1
	packusdw	xmm0, xmm1
	movdqu	xmmword ptr [rcx + 2*rdi], xmm0
.LBB0_1023:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1024:                             # =>This Inner Loop Header: Depth=1
	cvttss2si	eax, dword ptr [rdx + 4*rsi]
	mov	word ptr [rcx + 2*rsi], ax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1024
	jmp	.LBB0_1526

# --- float32 -> 2-byte integer, packed with signed saturation --------------
# NOTE(review): same vector/scalar difference as the previous kernel, here
# with packssdw (saturates to -32768..32767).
.LBB0_1025:
	xor	edi, edi
.LBB0_1026:
	test	r8b, 1
	je	.LBB0_1028
# %bb.1027:
	movups	xmm0, xmmword ptr [rdx + 4*rdi]
	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
	cvttps2dq	xmm0, xmm0
	cvttps2dq	xmm1, xmm1
	packssdw	xmm0, xmm1
	movdqu	xmmword ptr [rcx + 2*rdi], xmm0
.LBB0_1028:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1029:                             # =>This Inner Loop Header: Depth=1
	cvttss2si	eax, dword ptr [rdx + 4*rsi]
	mov	word ptr [rcx + 2*rsi], ax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1029
	jmp	.LBB0_1526

# --- 4-byte -> 2-byte elements: keep the low word (two more kernels) -------
.LBB0_1030:
	xor	edi, edi
.LBB0_1031:
	test	r8b, 1
	je	.LBB0_1033
# %bb.1032:
	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
	movdqa	xmm2, xmmword ptr [rip + .LCPI0_12] # xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
	pshufb	xmm0, xmm2
	pshufb	xmm1, xmm2
	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
	movdqu	xmmword ptr [rcx + 2*rdi], xmm0
.LBB0_1033:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1034:                             # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdx + 4*rsi]
	mov	word ptr [rcx + 2*rsi], ax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1034
	jmp	.LBB0_1526
.LBB0_1035:
	xor	edi, edi
.LBB0_1036:
	test	r8b, 1
	je	.LBB0_1038
# %bb.1037:
	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
	movdqa	xmm2, xmmword ptr [rip + .LCPI0_12] # xmm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
	pshufb	xmm0, xmm2
	pshufb	xmm1, xmm2
	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
	movdqu	xmmword ptr [rcx + 2*rdi], xmm0
.LBB0_1038:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1039:                             # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdx + 4*rsi]
	mov	word ptr [rcx + 2*rsi], ax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1039
	jmp	.LBB0_1526

# --- unsigned 32-bit integer -> float32 ------------------------------------
# Vector step: the low 16 bits of each lane are blended with the upper word
# of .LCPI0_14 and the high 16 bits (after psrld 16) with the upper word of
# .LCPI0_15, so both halves become floats with a fixed exponent; the result
# is (high - .LCPI0_16) + low.
.LBB0_1040:
	xor	edi, edi
.LBB0_1041:
	test	r8b, 1
	je	.LBB0_1043
# %bb.1042:
	movdqu	xmm0, xmmword ptr [rdx + 4*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 4*rdi + 16]
	movdqa	xmm2, xmmword ptr [rip + .LCPI0_14] # xmm2 = [1258291200,1258291200,1258291200,1258291200]
	movdqa	xmm3, xmm0
	pblendw	xmm3, xmm2, 170                 # xmm3 = xmm3[0],xmm2[1],xmm3[2],xmm2[3],xmm3[4],xmm2[5],xmm3[6],xmm2[7]
	psrld	xmm0, 16
	movdqa	xmm4, xmmword ptr [rip + .LCPI0_15] # xmm4 = [1392508928,1392508928,1392508928,1392508928]
	pblendw	xmm0, xmm4, 170                 # xmm0 = xmm0[0],xmm4[1],xmm0[2],xmm4[3],xmm0[4],xmm4[5],xmm0[6],xmm4[7]
	movaps	xmm5, xmmword ptr [rip + .LCPI0_16] # xmm5 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
	subps	xmm0, xmm5
	addps	xmm0, xmm3
	pblendw	xmm2, xmm1, 85                  # xmm2 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
	psrld	xmm1, 16
	pblendw	xmm1, xmm4, 170                 # xmm1 = xmm1[0],xmm4[1],xmm1[2],xmm4[3],xmm1[4],xmm4[5],xmm1[6],xmm4[7]
	subps	xmm1, xmm5
	addps	xmm1, xmm2
	movups	xmmword ptr [rcx + 4*rdi], xmm0
	movups	xmmword ptr [rcx + 4*rdi + 16], xmm1
.LBB0_1043:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1044:                             # =>This Inner Loop Header: Depth=1
	mov	eax, dword ptr [rdx + 4*rsi]    # 32-bit load zero-extends into rax
	xorps	xmm0, xmm0                      # clear xmm0 before the partial write
	cvtsi2ss	xmm0, rax               # 64-bit signed convert of a value < 2^32
	movss	dword ptr [rcx + 4*rsi], xmm0
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1044
	jmp	.LBB0_1526

# --- float64 -> float32 ----------------------------------------------------
# (the remainder check and scalar loop of this kernel follow at .LBB0_1048)
.LBB0_1045:
	xor	edi, edi
.LBB0_1046:
	test	r8b, 1
	je	.LBB0_1048
# %bb.1047:
	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	cvtpd2ps	xmm0, xmm0
	cvtpd2ps	xmm1, xmm1
	unpcklpd	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
	movupd	xmmword ptr [rcx + 4*rdi], xmm0
# ---------------------------------------------------------------------------
# Continuation of the remainder kernels of cast_type_numeric_sse4.
# Register use as seen in this excerpt: rdx = source base, rcx = destination
# base, rdi = element index of the single vector step, rsi = scalar element
# index, r9 = scalar loop bound; bit 0 of r8 selects whether the vector step
# runs. .LBB0_1526 is the shared function exit.
# ---------------------------------------------------------------------------

# --- float64 -> float32: scalar remainder (the vector step precedes this) --
.LBB0_1048:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1049:                             # =>This Inner Loop Header: Depth=1
	movsd	xmm0, qword ptr [rdx + 8*rsi]   # xmm0 = mem[0],zero
	cvtsd2ss	xmm0, xmm0
	movss	dword ptr [rcx + 4*rsi], xmm0
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1049
	jmp	.LBB0_1526

# --- unsigned 64-bit integer -> float32 ------------------------------------
# cvtsi2ss is a signed conversion, so elements with bit 63 set are first
# halved with the dropped low bit OR-ed back in, converted, and then doubled.
# The vector step does this for 4 elements: blendvpd picks the halved value
# for lanes whose sign bit is set, pcmpgtq builds the "was negative" mask,
# and blendvps picks the doubled result for those lanes.
.LBB0_1050:
	xor	edi, edi
.LBB0_1051:
	test	r8b, 1
	je	.LBB0_1053
# %bb.1052:
	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
	movdqa	xmm3, xmmword ptr [rip + .LCPI0_11] # xmm3 = [1,1]
	movdqa	xmm1, xmm0
	movdqa	xmm2, xmm0
	movdqa	xmm4, xmm0
	pand	xmm4, xmm3                      # low bit of each element
	psrlq	xmm1, 1
	por	xmm1, xmm4                      # (x >> 1) | (x & 1)
	blendvpd	xmm2, xmm1, xmm0        # sign bit of x set -> use halved value
	pextrq	rax, xmm2, 1
	xorps	xmm4, xmm4
	cvtsi2ss	xmm4, rax               # convert element 1
	movq	rax, xmm2
	xorps	xmm2, xmm2
	cvtsi2ss	xmm2, rax               # convert element 0
	pxor	xmm5, xmm5
	pcmpgtq	xmm5, xmm0                      # mask: 0 > x (signed), i.e. bit 63 set
	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	insertps	xmm2, xmm4, 28          # xmm2 = xmm2[0],xmm4[0],zero,zero
	movaps	xmm4, xmm2
	addps	xmm4, xmm2                      # doubled results
	pxor	xmm6, xmm6
	pshufd	xmm0, xmm5, 237                 # xmm0 = xmm5[1,3,2,3]
	blendvps	xmm2, xmm4, xmm0        # halved lanes take the doubled result
	pand	xmm3, xmm1                      # same sequence for elements 2 and 3
	movdqa	xmm4, xmm1
	psrlq	xmm4, 1
	por	xmm4, xmm3
	pcmpgtq	xmm6, xmm1
	movdqa	xmm0, xmm1
	blendvpd	xmm1, xmm4, xmm0
	pextrq	rax, xmm1, 1
	xorps	xmm0, xmm0
	cvtsi2ss	xmm0, rax
	movq	rax, xmm1
	xorps	xmm1, xmm1
	cvtsi2ss	xmm1, rax
	insertps	xmm1, xmm0, 28          # xmm1 = xmm1[0],xmm0[0],zero,zero
	movaps	xmm3, xmm1
	addps	xmm3, xmm1
	pshufd	xmm0, xmm6, 237                 # xmm0 = xmm6[1,3,2,3]
	blendvps	xmm1, xmm3, xmm0
	movlhps	xmm2, xmm1                      # xmm2 = xmm2[0],xmm1[0]
	movups	xmmword ptr [rcx + 4*rdi], xmm2 # 4 converted floats
.LBB0_1053:
	cmp	rsi, r9
	jne	.LBB0_1056
	jmp	.LBB0_1526
.LBB0_1054:                             #   in Loop: Header=BB0_1056 Depth=1
	# Bit 63 clear: the signed conversion is already exact for this value.
	xorps	xmm0, xmm0
	cvtsi2ss	xmm0, rax
	movss	dword ptr [rcx + 4*rsi], xmm0
	add	rsi, 1
	cmp	r9, rsi
	je	.LBB0_1526
.LBB0_1056:                             # =>This Inner Loop Header: Depth=1
	mov	rax, qword ptr [rdx + 8*rsi]
	test	rax, rax
	jns	.LBB0_1054
# %bb.1057:                             #   in Loop: Header=BB0_1056 Depth=1
	# Bit 63 set: halve (keeping the low bit), convert, then double.
	mov	rdi, rax
	shr	rdi
	and	eax, 1
	or	rax, rdi
	xorps	xmm0, xmm0
	cvtsi2ss	xmm0, rax
	addss	xmm0, xmm0
	movss	dword ptr [rcx + 4*rsi], xmm0
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1056
	jmp	.LBB0_1526

# --- 2-byte integer (zero-extended) -> float32 -----------------------------
.LBB0_1058:
	xor	edi, edi
.LBB0_1059:
	test	r8b, 1
	je	.LBB0_1061
# %bb.1060:
	pmovzxwd	xmm0, qword ptr [rdx + 2*rdi]   # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
	pmovzxwd	xmm1, qword ptr [rdx + 2*rdi + 8] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
	cvtdq2ps	xmm0, xmm0
	cvtdq2ps	xmm1, xmm1
	movups	xmmword ptr [rcx + 4*rdi], xmm0
	movups	xmmword ptr [rcx + 4*rdi + 16], xmm1
.LBB0_1061:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1062:                             # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdx + 2*rsi]
	xorps	xmm0, xmm0
	cvtsi2ss	xmm0, eax
	movss	dword ptr [rcx + 4*rsi], xmm0
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1062
	jmp	.LBB0_1526

# --- 2-byte integer (sign-extended) -> float32 -----------------------------
.LBB0_1063:
	xor	edi, edi
.LBB0_1064:
	test	r8b, 1
	je	.LBB0_1066
# %bb.1065:
	pmovsxwd	xmm0, qword ptr [rdx + 2*rdi]
	pmovsxwd	xmm1, qword ptr [rdx + 2*rdi + 8]
	cvtdq2ps	xmm0, xmm0
	cvtdq2ps	xmm1, xmm1
	movups	xmmword ptr [rcx + 4*rdi], xmm0
	movups	xmmword ptr [rcx + 4*rdi + 16], xmm1
.LBB0_1066:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1067:                             # =>This Inner Loop Header: Depth=1
	movsx	eax, word ptr [rdx + 2*rsi]
	xorps	xmm0, xmm0
	cvtsi2ss	xmm0, eax
	movss	dword ptr [rcx + 4*rsi], xmm0
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1067
	jmp	.LBB0_1526

# --- signed 32-bit integer -> float32 --------------------------------------
.LBB0_1068:
	xor	edi, edi
.LBB0_1069:
	test	r8b, 1
	je	.LBB0_1071
# %bb.1070:
	movups	xmm0, xmmword ptr [rdx + 4*rdi]
	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
	cvtdq2ps	xmm0, xmm0
	cvtdq2ps	xmm1, xmm1
	movups	xmmword ptr [rcx + 4*rdi], xmm0
	movups	xmmword ptr [rcx + 4*rdi + 16], xmm1
.LBB0_1071:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1072:                             # =>This Inner Loop Header: Depth=1
	xorps	xmm0, xmm0
	cvtsi2ss	xmm0, dword ptr [rdx + 4*rsi]
	movss	dword ptr [rcx + 4*rsi], xmm0
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1072
	jmp	.LBB0_1526

# --- float64 -> 32-bit integer (truncating) --------------------------------
.LBB0_1073:
	xor	edi, edi
.LBB0_1074:
	test	r8b, 1
	je	.LBB0_1076
# %bb.1075:
	movupd	xmm0, xmmword ptr [rdx + 8*rdi]
	movupd	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	cvttpd2dq	xmm0, xmm0
	cvttpd2dq	xmm1, xmm1
	unpcklpd	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
	movupd	xmmword ptr [rcx + 4*rdi], xmm0
.LBB0_1076:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1077:                             # =>This Inner Loop Header: Depth=1
	cvttsd2si	eax, qword ptr [rdx + 8*rsi]
	mov	dword ptr [rcx + 4*rsi], eax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1077
	jmp	.LBB0_1526

# --- 2-byte -> 4-byte elements, zero-extended ------------------------------
.LBB0_1078:
	xor	edi, edi
.LBB0_1079:
	test	r8b, 1
	je	.LBB0_1081
# %bb.1080:
	pmovzxwd	xmm0, qword ptr [rdx + 2*rdi]   # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
	pmovzxwd	xmm1, qword ptr [rdx + 2*rdi + 8] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
	movdqu	xmmword ptr [rcx + 4*rdi], xmm0
	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm1
.LBB0_1081:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1082:                             # =>This Inner Loop Header: Depth=1
	movzx	eax, word ptr [rdx + 2*rsi]
	mov	dword ptr [rcx + 4*rsi], eax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1082
	jmp	.LBB0_1526

# --- 2-byte -> 4-byte elements, sign-extended ------------------------------
.LBB0_1083:
	xor	edi, edi
.LBB0_1084:
	test	r8b, 1
	je	.LBB0_1086
# %bb.1085:
	pmovsxwd	xmm0, qword ptr [rdx + 2*rdi]
	pmovsxwd	xmm1, qword ptr [rdx + 2*rdi + 8]
	movdqu	xmmword ptr [rcx + 4*rdi], xmm0
	movdqu	xmmword ptr [rcx + 4*rdi + 16], xmm1
.LBB0_1086:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1087:                             # =>This Inner Loop Header: Depth=1
	movsx	eax, word ptr [rdx + 2*rsi]
	mov	dword ptr [rcx + 4*rsi], eax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1087
	jmp	.LBB0_1526

# --- 8-byte -> 4-byte elements: keep the low dword -------------------------
.LBB0_1088:
	xor	edi, edi
.LBB0_1089:
	test	r8b, 1
	je	.LBB0_1091
# %bb.1090:
	movdqu	xmm0, xmmword ptr [rdx + 8*rdi]
	movdqu	xmm1, xmmword ptr [rdx + 8*rdi + 16]
	pshufd	xmm0, xmm0, 232                 # xmm0 = xmm0[0,2,2,3]
	pshufd	xmm1, xmm1, 232                 # xmm1 = xmm1[0,2,2,3]
	punpcklqdq	xmm0, xmm1              # xmm0 = xmm0[0],xmm1[0]
	movdqu	xmmword ptr [rcx + 4*rdi], xmm0
.LBB0_1091:
	cmp	rsi, r9
	je	.LBB0_1526
.LBB0_1092:                             # =>This Inner Loop Header: Depth=1
	mov	eax, dword ptr [rdx + 8*rsi]
	mov	dword ptr [rcx + 4*rsi], eax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1092
	jmp	.LBB0_1526

# --- float32 -> signed 32-bit integer (truncating) -------------------------
# (the scalar loop of this kernel follows at .LBB0_1097)
.LBB0_1093:
	xor	edi, edi
.LBB0_1094:
	test	r8b, 1
	je	.LBB0_1096
# %bb.1095:
	movups	xmm0, xmmword ptr [rdx + 4*rdi]
	movups	xmm1, xmmword ptr [rdx + 4*rdi + 16]
	cvttps2dq	xmm0, xmm0
	cvttps2dq	xmm1, xmm1
	movupd	xmmword ptr [rcx + 4*rdi], xmm0
	movupd	xmmword ptr [rcx + 4*rdi + 16], xmm1
.LBB0_1096:
	cmp	rsi, r9
	je	.LBB0_1526
# --- float32 -> signed 32-bit integer: scalar remainder loop ---------------
# rdx = source base, rcx = destination base, rsi = element index, r9 = bound.
# Falls through into the shared function exit when the loop finishes.
.LBB0_1097:                             # =>This Inner Loop Header: Depth=1
	cvttss2si	eax, dword ptr [rdx + 4*rsi]
	mov	dword ptr [rcx + 4*rsi], eax
	add	rsi, 1
	cmp	r9, rsi
	jne	.LBB0_1097

# --- Shared function exit: every kernel in this function jumps here --------
# Restores rsp from rbp and pops the saved rbp (the matching prologue is
# outside this excerpt).
.LBB0_1526:
	mov	rsp, rbp
	pop	rbp
	ret

# ---------------------------------------------------------------------------
# Same-width copy kernels (source and destination use the same index scale).
# Shape of each kernel:
#   .LBB0_N   : rdi = -(rdi & -4); rax = 0
#   .LBB0_N+1 : main loop, 128 bytes per iteration (four 32-byte steps);
#               rdi counts up by 4 until it reaches zero
#   .LBB0_N+2 : r8 leftover 32-byte steps, if any (r8 is negated and counted
#               up to zero)
#   .LBB0_N+5 : if rsi != r9, jump to a scalar tail located elsewhere
# NOTE(review): rdi and r8 are set up by code outside this excerpt -
# presumably rdi is the number of 32-byte steps and r8 that number modulo 4;
# confirm.
# ---------------------------------------------------------------------------

# --- 4-byte element copy ---------------------------------------------------
.LBB0_1098:
	and	rdi, -4
	neg	rdi
	xor	eax, eax
.LBB0_1099:                             # =>This Inner Loop Header: Depth=1
	movups	xmm0, xmmword ptr [rdx + 4*rax]
	movups	xmm1, xmmword ptr [rdx + 4*rax + 16]
	movups	xmmword ptr [rcx + 4*rax], xmm0
	movups	xmmword ptr [rcx + 4*rax + 16], xmm1
	movups	xmm0, xmmword ptr [rdx + 4*rax + 32]
	movups	xmm1, xmmword ptr [rdx + 4*rax + 48]
	movups	xmmword ptr [rcx + 4*rax + 32], xmm0
	movups	xmmword ptr [rcx + 4*rax + 48], xmm1
	movups	xmm0, xmmword ptr [rdx + 4*rax + 64]
	movups	xmm1, xmmword ptr [rdx + 4*rax + 80]
	movups	xmmword ptr [rcx + 4*rax + 64], xmm0
	movups	xmmword ptr [rcx + 4*rax + 80], xmm1
	movupd	xmm0, xmmword ptr [rdx + 4*rax + 96]
	movupd	xmm1, xmmword ptr [rdx + 4*rax + 112]
	movupd	xmmword ptr [rcx + 4*rax + 96], xmm0
	movupd	xmmword ptr [rcx + 4*rax + 112], xmm1
	add	rax, 32                         # 32 elements * 4 bytes = 128 bytes
	add	rdi, 4
	jne	.LBB0_1099
.LBB0_1100:
	test	r8, r8
	je	.LBB0_1103
# %bb.1101:
	lea	rax, [4*rax + 16]               # element index -> byte offset, biased by 16
	neg	r8
.LBB0_1102:                             # =>This Inner Loop Header: Depth=1
	movupd	xmm0, xmmword ptr [rdx + rax - 16]
	movupd	xmm1, xmmword ptr [rdx + rax]
	movupd	xmmword ptr [rcx + rax - 16], xmm0
	movupd	xmmword ptr [rcx + rax], xmm1
	add	rax, 32
	inc	r8
	jne	.LBB0_1102
.LBB0_1103:
	cmp	rsi, r9
	je	.LBB0_1526
	jmp	.LBB0_1104

# --- 4-byte element copy (same code, separate kernel) ----------------------
.LBB0_1108:
	and	rdi, -4
	neg	rdi
	xor	eax, eax
.LBB0_1109:                             # =>This Inner Loop Header: Depth=1
	movups	xmm0, xmmword ptr [rdx + 4*rax]
	movups	xmm1, xmmword ptr [rdx + 4*rax + 16]
	movups	xmmword ptr [rcx + 4*rax], xmm0
	movups	xmmword ptr [rcx + 4*rax + 16], xmm1
	movups	xmm0, xmmword ptr [rdx + 4*rax + 32]
	movups	xmm1, xmmword ptr [rdx + 4*rax + 48]
	movups	xmmword ptr [rcx + 4*rax + 32], xmm0
	movups	xmmword ptr [rcx + 4*rax + 48], xmm1
	movups	xmm0, xmmword ptr [rdx + 4*rax + 64]
	movups	xmm1, xmmword ptr [rdx + 4*rax + 80]
	movups	xmmword ptr [rcx + 4*rax + 64], xmm0
	movups	xmmword ptr [rcx + 4*rax + 80], xmm1
	movupd	xmm0, xmmword ptr [rdx + 4*rax + 96]
	movupd	xmm1, xmmword ptr [rdx + 4*rax + 112]
	movupd	xmmword ptr [rcx + 4*rax + 96], xmm0
	movupd	xmmword ptr [rcx + 4*rax + 112], xmm1
	add	rax, 32
	add	rdi, 4
	jne	.LBB0_1109
.LBB0_1110:
	test	r8, r8
	je	.LBB0_1113
# %bb.1111:
	lea	rax, [4*rax + 16]
	neg	r8
.LBB0_1112:                             # =>This Inner Loop Header: Depth=1
	movupd	xmm0, xmmword ptr [rdx + rax - 16]
	movupd	xmm1, xmmword ptr [rdx + rax]
	movupd	xmmword ptr [rcx + rax - 16], xmm0
	movupd	xmmword ptr [rcx + rax], xmm1
	add	rax, 32
	inc	r8
	jne	.LBB0_1112
.LBB0_1113:
	cmp	rsi, r9
	je	.LBB0_1526
	jmp	.LBB0_1114

# --- 8-byte element copy ---------------------------------------------------
.LBB0_1118:
	and	rdi, -4
	neg	rdi
	xor	eax, eax
.LBB0_1119:                             # =>This Inner Loop Header: Depth=1
	movups	xmm0, xmmword ptr [rdx + 8*rax]
	movups	xmm1, xmmword ptr [rdx + 8*rax + 16]
	movups	xmmword ptr [rcx + 8*rax], xmm0
	movups	xmmword ptr [rcx + 8*rax + 16], xmm1
	movups	xmm0, xmmword ptr [rdx + 8*rax + 32]
	movups	xmm1, xmmword ptr [rdx + 8*rax + 48]
	movups	xmmword ptr [rcx + 8*rax + 32], xmm0
	movups	xmmword ptr [rcx + 8*rax + 48], xmm1
	movups	xmm0, xmmword ptr [rdx + 8*rax + 64]
	movups	xmm1, xmmword ptr [rdx + 8*rax + 80]
	movups	xmmword ptr [rcx + 8*rax + 64], xmm0
	movups	xmmword ptr [rcx + 8*rax + 80], xmm1
	movupd	xmm0, xmmword ptr [rdx + 8*rax + 96]
	movupd	xmm1, xmmword ptr [rdx + 8*rax + 112]
	movupd	xmmword ptr [rcx + 8*rax + 96], xmm0
	movupd	xmmword ptr [rcx + 8*rax + 112], xmm1
	add	rax, 16                         # 16 elements * 8 bytes = 128 bytes
	add	rdi, 4
	jne	.LBB0_1119
.LBB0_1120:
	test	r8, r8
	je	.LBB0_1123
# %bb.1121:
	lea	rax, [8*rax + 16]               # element index -> byte offset, biased by 16
	neg	r8
.LBB0_1122:                             # =>This Inner Loop Header: Depth=1
	movupd	xmm0, xmmword ptr [rdx + rax - 16]
	movupd	xmm1, xmmword ptr [rdx + rax]
	movupd	xmmword ptr [rcx + rax - 16], xmm0
	movupd	xmmword ptr [rcx + rax], xmm1
	add	rax, 32
	inc	r8
	jne	.LBB0_1122
.LBB0_1123:
	cmp	rsi, r9
	je	.LBB0_1526
	jmp	.LBB0_1124

# --- 1-byte element copy (rax is a byte offset throughout) -----------------
.LBB0_1128:
	and	rdi, -4
	neg	rdi
	xor	eax, eax
.LBB0_1129:                             # =>This Inner Loop Header: Depth=1
	movups	xmm0, xmmword ptr [rdx + rax]
	movups	xmm1, xmmword ptr [rdx + rax + 16]
	movups	xmmword ptr [rcx + rax], xmm0
	movups	xmmword ptr [rcx + rax + 16], xmm1
	movups	xmm0, xmmword ptr [rdx + rax + 32]
	movups	xmm1, xmmword ptr [rdx + rax + 48]
	movups	xmmword ptr [rcx + rax + 32], xmm0
	movups	xmmword ptr [rcx + rax + 48], xmm1
	movups	xmm0, xmmword ptr [rdx + rax + 64]
	movups	xmm1, xmmword ptr [rdx + rax + 80]
	movups	xmmword ptr [rcx + rax + 64], xmm0
	movups	xmmword ptr [rcx + rax + 80], xmm1
	movupd	xmm0, xmmword ptr [rdx + rax + 96]
	movupd	xmm1, xmmword ptr [rdx + rax + 112]
	movupd	xmmword ptr [rcx + rax + 96], xmm0
	movupd	xmmword ptr [rcx + rax + 112], xmm1
	sub	rax, -128                       # rax += 128 (-128 fits a 1-byte immediate)
	add	rdi, 4
	jne	.LBB0_1129
.LBB0_1130:
	test	r8, r8
	je	.LBB0_1133
# %bb.1131:
	add	rax, 16
	neg	r8
.LBB0_1132:                             # =>This Inner Loop Header: Depth=1
	movupd	xmm0, xmmword ptr [rdx + rax - 16]
	movupd	xmm1, xmmword ptr [rdx + rax]
	movupd	xmmword ptr [rcx + rax - 16], xmm0
	movupd	xmmword ptr [rcx + rax], xmm1
	add	rax, 32
	inc	r8
	jne	.LBB0_1132
.LBB0_1133:
	cmp	rsi, r9
	je	.LBB0_1526
	jmp	.LBB0_1134

# --- 1-byte element copy (same code, separate kernel) ----------------------
.LBB0_1138:
	and	rdi, -4
	neg	rdi
	xor	eax, eax
.LBB0_1139:                             # =>This Inner Loop Header: Depth=1
	movups	xmm0, xmmword ptr [rdx + rax]
	movups	xmm1, xmmword ptr [rdx + rax + 16]
	movups	xmmword ptr [rcx + rax], xmm0
	movups	xmmword ptr [rcx + rax + 16], xmm1
	movups	xmm0, xmmword ptr [rdx + rax + 32]
	movups	xmm1, xmmword ptr [rdx + rax + 48]
	movups	xmmword ptr [rcx + rax + 32], xmm0
	movups	xmmword ptr [rcx + rax + 48], xmm1
	movups	xmm0, xmmword ptr [rdx + rax + 64]
	movups	xmm1, xmmword ptr [rdx + rax + 80]
	movups	xmmword ptr [rcx + rax + 64], xmm0
	movups	xmmword ptr [rcx + rax + 80], xmm1
	movupd	xmm0, xmmword ptr [rdx + rax + 96]
	movupd	xmm1, xmmword ptr [rdx + rax + 112]
	movupd	xmmword ptr [rcx + rax + 96], xmm0
	movupd	xmmword ptr [rcx + rax + 112], xmm1
	sub	rax, -128
	add	rdi, 4
	jne	.LBB0_1139
.LBB0_1140:
	test	r8, r8
	je	.LBB0_1143
# %bb.1141:
	add	rax, 16
	neg	r8
.LBB0_1142:                             # =>This Inner Loop Header: Depth=1
	movupd	xmm0, xmmword ptr [rdx + rax - 16]
	movupd	xmm1, xmmword ptr [rdx + rax]
	movupd	xmmword ptr [rcx + rax - 16], xmm0
	movupd	xmmword ptr [rcx + rax], xmm1
	add	rax, 32
	inc	r8
	jne	.LBB0_1142
.LBB0_1143:
	cmp	rsi, r9
	je	.LBB0_1526
	jmp	.LBB0_1144

# --- 1-byte -> 8-byte elements, sign-extended ------------------------------
# Main loop: 16 source bytes -> 16 qwords per iteration (8 pmovsxbq, two
# elements each). The leftover loop handles 4 elements per step, using r10
# as the source cursor and rdi as the destination cursor.
.LBB0_1148:
	and	rdi, -4
	neg	rdi
	xor	eax, eax
.LBB0_1149:                             # =>This Inner Loop Header: Depth=1
	pmovsxbq	xmm0, word ptr [rdx + rax]
	pmovsxbq	xmm1, word ptr [rdx + rax + 2]
	movdqu	xmmword ptr [rcx + 8*rax], xmm0
	movdqu	xmmword ptr [rcx + 8*rax + 16], xmm1
	pmovsxbq	xmm0, word ptr [rdx + rax + 4]
	pmovsxbq	xmm1, word ptr [rdx + rax + 6]
	movdqu	xmmword ptr [rcx + 8*rax + 32], xmm0
	movdqu	xmmword ptr [rcx + 8*rax + 48], xmm1
	pmovsxbq	xmm0, word ptr [rdx + rax + 8]
	pmovsxbq	xmm1, word ptr [rdx + rax + 10]
	movdqu	xmmword ptr [rcx + 8*rax + 64], xmm0
	movdqu	xmmword ptr [rcx + 8*rax + 80], xmm1
	pmovsxbq	xmm0, word ptr [rdx + rax + 12]
	pmovsxbq	xmm1, word ptr [rdx + rax + 14]
	movdqu	xmmword ptr [rcx + 8*rax + 96], xmm0
	movdqu	xmmword ptr [rcx + 8*rax + 112], xmm1
	add	rax, 16
	add	rdi, 4
	jne	.LBB0_1149
.LBB0_1150:
	test	r8, r8
	je	.LBB0_1153
# %bb.1151:
	lea	rdi, [rcx + 8*rax]              # destination cursor, biased by 16 below
	add	rdi, 16
	lea	r10, [rax + rdx]                # source cursor, biased by 2 below
	add	r10, 2
	xor	eax, eax                        # rax now counts leftover steps up to r8
.LBB0_1152:                             # =>This Inner Loop Header: Depth=1
	pmovsxbq	xmm0, word ptr [r10 + 4*rax - 2]
	pmovsxbq	xmm1, word ptr [r10 + 4*rax]
	movdqu	xmmword ptr [rdi - 16], xmm0
	movdqu	xmmword ptr [rdi], xmm1
	add	rdi, 32
	add	rax, 1
	cmp	r8, rax
	jne	.LBB0_1152
.LBB0_1153:
	cmp	rsi, r9
	je	.LBB0_1526
	jmp	.LBB0_1154

# --- 8-byte element copy (the loop body continues past this excerpt) -------
.LBB0_1158:
	and	rdi, -4
	neg	rdi
	xor	eax, eax
.LBB0_1159:                             # =>This Inner Loop Header: Depth=1
	movups	xmm0, xmmword ptr [rdx + 8*rax]
	movups	xmm1, xmmword ptr [rdx + 8*rax + 16]
	movups	xmmword ptr [rcx + 8*rax], xmm0
	movups	xmmword ptr [rcx + 8*rax + 16], xmm1
	movups	xmm0, xmmword ptr [rdx + 8*rax + 32]
	movups	xmm1, xmmword ptr [rdx + 8*rax + 48]
	movups	xmmword ptr [rcx + 8*rax + 32], xmm0
	movups	xmmword ptr [rcx + 8*rax + 48], xmm1
	movups	xmm0, xmmword ptr [rdx + 8*rax + 64]
	movups	xmm1, xmmword ptr [rdx + 8*rax + 80]
	movups	xmmword ptr [rcx + 8*rax + 64], xmm0
	movups	xmmword ptr [rcx + 8*rax + 80], xmm1
	movupd	xmm0, xmmword ptr [rdx + 8*rax + 96]
movupd xmm1, xmmword ptr [rdx + 8*rax + 112] movupd xmmword ptr [rcx + 8*rax + 96], xmm0 movupd xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_1159 .LBB0_1160: test r8, r8 je .LBB0_1163 # %bb.1161: lea rax, [8*rax + 16] neg r8 .LBB0_1162: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1162 .LBB0_1163: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1164 .LBB0_1168: and rdi, -4 neg rdi xor eax, eax .LBB0_1169: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 8*rax] movups xmm1, xmmword ptr [rdx + 8*rax + 16] movups xmmword ptr [rcx + 8*rax], xmm0 movups xmmword ptr [rcx + 8*rax + 16], xmm1 movups xmm0, xmmword ptr [rdx + 8*rax + 32] movups xmm1, xmmword ptr [rdx + 8*rax + 48] movups xmmword ptr [rcx + 8*rax + 32], xmm0 movups xmmword ptr [rcx + 8*rax + 48], xmm1 movups xmm0, xmmword ptr [rdx + 8*rax + 64] movups xmm1, xmmword ptr [rdx + 8*rax + 80] movups xmmword ptr [rcx + 8*rax + 64], xmm0 movups xmmword ptr [rcx + 8*rax + 80], xmm1 movupd xmm0, xmmword ptr [rdx + 8*rax + 96] movupd xmm1, xmmword ptr [rdx + 8*rax + 112] movupd xmmword ptr [rcx + 8*rax + 96], xmm0 movupd xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_1169 .LBB0_1170: test r8, r8 je .LBB0_1173 # %bb.1171: lea rax, [8*rax + 16] neg r8 .LBB0_1172: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1172 .LBB0_1173: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1174 .LBB0_1178: and rdi, -4 neg rdi xor eax, eax .LBB0_1179: # =>This Inner Loop Header: Depth=1 pmovzxbq xmm0, word ptr [rdx + rax] # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero pmovzxbq xmm1, word ptr [rdx + rax 
+ 2] # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero movdqu xmmword ptr [rcx + 8*rax], xmm0 movdqu xmmword ptr [rcx + 8*rax + 16], xmm1 pmovzxbq xmm0, word ptr [rdx + rax + 4] # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero pmovzxbq xmm1, word ptr [rdx + rax + 6] # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero movdqu xmmword ptr [rcx + 8*rax + 32], xmm0 movdqu xmmword ptr [rcx + 8*rax + 48], xmm1 pmovzxbq xmm0, word ptr [rdx + rax + 8] # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero pmovzxbq xmm1, word ptr [rdx + rax + 10] # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero movdqu xmmword ptr [rcx + 8*rax + 64], xmm0 movdqu xmmword ptr [rcx + 8*rax + 80], xmm1 pmovzxbq xmm0, word ptr [rdx + rax + 12] # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero pmovzxbq xmm1, word ptr [rdx + rax + 14] # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero movdqu xmmword ptr [rcx + 8*rax + 96], xmm0 movdqu xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_1179 .LBB0_1180: test r8, r8 je .LBB0_1183 # %bb.1181: lea rdi, [rcx + 8*rax] add rdi, 16 lea r10, [rax + rdx] add r10, 2 xor eax, eax .LBB0_1182: # =>This Inner Loop Header: Depth=1 pmovzxbq xmm0, word ptr [r10 + 4*rax - 2] # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero pmovzxbq xmm1, word ptr [r10 + 4*rax] # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero movdqu xmmword ptr [rdi - 16], xmm0 movdqu xmmword ptr [rdi], xmm1 add rdi, 32 add rax, 1 cmp r8, rax jne .LBB0_1182 .LBB0_1183: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1184 .LBB0_1188: and rdi, -4 neg rdi xor eax, eax .LBB0_1189: # =>This Inner Loop Header: Depth=1 movups 
xmm0, xmmword ptr [rdx + 2*rax] movups xmm1, xmmword ptr [rdx + 2*rax + 16] movups xmmword ptr [rcx + 2*rax], xmm0 movups xmmword ptr [rcx + 2*rax + 16], xmm1 movups xmm0, xmmword ptr [rdx + 2*rax + 32] movups xmm1, xmmword ptr [rdx + 2*rax + 48] movups xmmword ptr [rcx + 2*rax + 32], xmm0 movups xmmword ptr [rcx + 2*rax + 48], xmm1 movups xmm0, xmmword ptr [rdx + 2*rax + 64] movups xmm1, xmmword ptr [rdx + 2*rax + 80] movups xmmword ptr [rcx + 2*rax + 64], xmm0 movups xmmword ptr [rcx + 2*rax + 80], xmm1 movupd xmm0, xmmword ptr [rdx + 2*rax + 96] movupd xmm1, xmmword ptr [rdx + 2*rax + 112] movupd xmmword ptr [rcx + 2*rax + 96], xmm0 movupd xmmword ptr [rcx + 2*rax + 112], xmm1 add rax, 64 add rdi, 4 jne .LBB0_1189 .LBB0_1190: test r8, r8 je .LBB0_1193 # %bb.1191: add rax, rax add rax, 16 neg r8 .LBB0_1192: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1192 .LBB0_1193: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1194 .LBB0_1198: and rdi, -4 neg rdi xor eax, eax .LBB0_1199: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 2*rax] movups xmm1, xmmword ptr [rdx + 2*rax + 16] movups xmmword ptr [rcx + 2*rax], xmm0 movups xmmword ptr [rcx + 2*rax + 16], xmm1 movups xmm0, xmmword ptr [rdx + 2*rax + 32] movups xmm1, xmmword ptr [rdx + 2*rax + 48] movups xmmword ptr [rcx + 2*rax + 32], xmm0 movups xmmword ptr [rcx + 2*rax + 48], xmm1 movups xmm0, xmmword ptr [rdx + 2*rax + 64] movups xmm1, xmmword ptr [rdx + 2*rax + 80] movups xmmword ptr [rcx + 2*rax + 64], xmm0 movups xmmword ptr [rcx + 2*rax + 80], xmm1 movupd xmm0, xmmword ptr [rdx + 2*rax + 96] movupd xmm1, xmmword ptr [rdx + 2*rax + 112] movupd xmmword ptr [rcx + 2*rax + 96], xmm0 movupd xmmword ptr [rcx + 2*rax + 112], xmm1 add rax, 64 add rdi, 4 jne .LBB0_1199 .LBB0_1200: test r8, r8 je .LBB0_1203 # %bb.1201: add rax, rax add 
rax, 16 neg r8 .LBB0_1202: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1202 .LBB0_1203: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1204 .LBB0_1208: and rdi, -4 neg rdi xor eax, eax .LBB0_1209: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 2*rax] movups xmm1, xmmword ptr [rdx + 2*rax + 16] movups xmmword ptr [rcx + 2*rax], xmm0 movups xmmword ptr [rcx + 2*rax + 16], xmm1 movups xmm0, xmmword ptr [rdx + 2*rax + 32] movups xmm1, xmmword ptr [rdx + 2*rax + 48] movups xmmword ptr [rcx + 2*rax + 32], xmm0 movups xmmword ptr [rcx + 2*rax + 48], xmm1 movups xmm0, xmmword ptr [rdx + 2*rax + 64] movups xmm1, xmmword ptr [rdx + 2*rax + 80] movups xmmword ptr [rcx + 2*rax + 64], xmm0 movups xmmword ptr [rcx + 2*rax + 80], xmm1 movupd xmm0, xmmword ptr [rdx + 2*rax + 96] movupd xmm1, xmmword ptr [rdx + 2*rax + 112] movupd xmmword ptr [rcx + 2*rax + 96], xmm0 movupd xmmword ptr [rcx + 2*rax + 112], xmm1 add rax, 64 add rdi, 4 jne .LBB0_1209 .LBB0_1210: test r8, r8 je .LBB0_1213 # %bb.1211: add rax, rax add rax, 16 neg r8 .LBB0_1212: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1212 .LBB0_1213: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1214 .LBB0_1218: and rdi, -4 neg rdi xor eax, eax .LBB0_1219: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 2*rax] movups xmm1, xmmword ptr [rdx + 2*rax + 16] movups xmmword ptr [rcx + 2*rax], xmm0 movups xmmword ptr [rcx + 2*rax + 16], xmm1 movups xmm0, xmmword ptr [rdx + 2*rax + 32] movups xmm1, xmmword ptr [rdx + 2*rax + 48] movups xmmword ptr [rcx + 2*rax + 32], xmm0 movups xmmword ptr [rcx + 2*rax + 48], xmm1 movups xmm0, xmmword ptr [rdx + 2*rax + 64] movups 
xmm1, xmmword ptr [rdx + 2*rax + 80] movups xmmword ptr [rcx + 2*rax + 64], xmm0 movups xmmword ptr [rcx + 2*rax + 80], xmm1 movupd xmm0, xmmword ptr [rdx + 2*rax + 96] movupd xmm1, xmmword ptr [rdx + 2*rax + 112] movupd xmmword ptr [rcx + 2*rax + 96], xmm0 movupd xmmword ptr [rcx + 2*rax + 112], xmm1 add rax, 64 add rdi, 4 jne .LBB0_1219 .LBB0_1220: test r8, r8 je .LBB0_1223 # %bb.1221: add rax, rax add rax, 16 neg r8 .LBB0_1222: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1222 .LBB0_1223: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1224 .LBB0_1228: and rdi, -4 neg rdi xor eax, eax .LBB0_1229: # =>This Inner Loop Header: Depth=1 pmovsxbq xmm0, word ptr [rdx + rax] pmovsxbq xmm1, word ptr [rdx + rax + 2] movdqu xmmword ptr [rcx + 8*rax], xmm0 movdqu xmmword ptr [rcx + 8*rax + 16], xmm1 pmovsxbq xmm0, word ptr [rdx + rax + 4] pmovsxbq xmm1, word ptr [rdx + rax + 6] movdqu xmmword ptr [rcx + 8*rax + 32], xmm0 movdqu xmmword ptr [rcx + 8*rax + 48], xmm1 pmovsxbq xmm0, word ptr [rdx + rax + 8] pmovsxbq xmm1, word ptr [rdx + rax + 10] movdqu xmmword ptr [rcx + 8*rax + 64], xmm0 movdqu xmmword ptr [rcx + 8*rax + 80], xmm1 pmovsxbq xmm0, word ptr [rdx + rax + 12] pmovsxbq xmm1, word ptr [rdx + rax + 14] movdqu xmmword ptr [rcx + 8*rax + 96], xmm0 movdqu xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_1229 .LBB0_1230: test r8, r8 je .LBB0_1233 # %bb.1231: lea rdi, [rcx + 8*rax] add rdi, 16 lea r10, [rax + rdx] add r10, 2 xor eax, eax .LBB0_1232: # =>This Inner Loop Header: Depth=1 pmovsxbq xmm0, word ptr [r10 + 4*rax - 2] pmovsxbq xmm1, word ptr [r10 + 4*rax] movdqu xmmword ptr [rdi - 16], xmm0 movdqu xmmword ptr [rdi], xmm1 add rdi, 32 add rax, 1 cmp r8, rax jne .LBB0_1232 .LBB0_1233: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1234 .LBB0_1238: and rdi, -4 neg rdi xor eax, eax .LBB0_1239: 
# =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 8*rax] movups xmm1, xmmword ptr [rdx + 8*rax + 16] movups xmmword ptr [rcx + 8*rax], xmm0 movups xmmword ptr [rcx + 8*rax + 16], xmm1 movups xmm0, xmmword ptr [rdx + 8*rax + 32] movups xmm1, xmmword ptr [rdx + 8*rax + 48] movups xmmword ptr [rcx + 8*rax + 32], xmm0 movups xmmword ptr [rcx + 8*rax + 48], xmm1 movups xmm0, xmmword ptr [rdx + 8*rax + 64] movups xmm1, xmmword ptr [rdx + 8*rax + 80] movups xmmword ptr [rcx + 8*rax + 64], xmm0 movups xmmword ptr [rcx + 8*rax + 80], xmm1 movupd xmm0, xmmword ptr [rdx + 8*rax + 96] movupd xmm1, xmmword ptr [rdx + 8*rax + 112] movupd xmmword ptr [rcx + 8*rax + 96], xmm0 movupd xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_1239 .LBB0_1240: test r8, r8 je .LBB0_1243 # %bb.1241: lea rax, [8*rax + 16] neg r8 .LBB0_1242: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1242 .LBB0_1243: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1244 .LBB0_1248: and rdi, -4 neg rdi xor eax, eax .LBB0_1249: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 8*rax] movups xmm1, xmmword ptr [rdx + 8*rax + 16] movups xmmword ptr [rcx + 8*rax], xmm0 movups xmmword ptr [rcx + 8*rax + 16], xmm1 movups xmm0, xmmword ptr [rdx + 8*rax + 32] movups xmm1, xmmword ptr [rdx + 8*rax + 48] movups xmmword ptr [rcx + 8*rax + 32], xmm0 movups xmmword ptr [rcx + 8*rax + 48], xmm1 movups xmm0, xmmword ptr [rdx + 8*rax + 64] movups xmm1, xmmword ptr [rdx + 8*rax + 80] movups xmmword ptr [rcx + 8*rax + 64], xmm0 movups xmmword ptr [rcx + 8*rax + 80], xmm1 movupd xmm0, xmmword ptr [rdx + 8*rax + 96] movupd xmm1, xmmword ptr [rdx + 8*rax + 112] movupd xmmword ptr [rcx + 8*rax + 96], xmm0 movupd xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_1249 .LBB0_1250: test r8, r8 je 
.LBB0_1253 # %bb.1251: lea rax, [8*rax + 16] neg r8 .LBB0_1252: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1252 .LBB0_1253: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1254 .LBB0_1258: and rdi, -4 neg rdi xor eax, eax .LBB0_1259: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 4*rax] movups xmm1, xmmword ptr [rdx + 4*rax + 16] movups xmmword ptr [rcx + 4*rax], xmm0 movups xmmword ptr [rcx + 4*rax + 16], xmm1 movups xmm0, xmmword ptr [rdx + 4*rax + 32] movups xmm1, xmmword ptr [rdx + 4*rax + 48] movups xmmword ptr [rcx + 4*rax + 32], xmm0 movups xmmword ptr [rcx + 4*rax + 48], xmm1 movups xmm0, xmmword ptr [rdx + 4*rax + 64] movups xmm1, xmmword ptr [rdx + 4*rax + 80] movups xmmword ptr [rcx + 4*rax + 64], xmm0 movups xmmword ptr [rcx + 4*rax + 80], xmm1 movupd xmm0, xmmword ptr [rdx + 4*rax + 96] movupd xmm1, xmmword ptr [rdx + 4*rax + 112] movupd xmmword ptr [rcx + 4*rax + 96], xmm0 movupd xmmword ptr [rcx + 4*rax + 112], xmm1 add rax, 32 add rdi, 4 jne .LBB0_1259 .LBB0_1260: test r8, r8 je .LBB0_1263 # %bb.1261: lea rax, [4*rax + 16] neg r8 .LBB0_1262: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1262 .LBB0_1263: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1264 .LBB0_1268: and rdi, -4 neg rdi xor eax, eax .LBB0_1269: # =>This Inner Loop Header: Depth=1 pmovzxbq xmm0, word ptr [rdx + rax] # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero pmovzxbq xmm1, word ptr [rdx + rax + 2] # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero movdqu xmmword ptr [rcx + 8*rax], xmm0 movdqu xmmword ptr [rcx + 8*rax + 16], xmm1 pmovzxbq xmm0, 
word ptr [rdx + rax + 4] # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero pmovzxbq xmm1, word ptr [rdx + rax + 6] # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero movdqu xmmword ptr [rcx + 8*rax + 32], xmm0 movdqu xmmword ptr [rcx + 8*rax + 48], xmm1 pmovzxbq xmm0, word ptr [rdx + rax + 8] # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero pmovzxbq xmm1, word ptr [rdx + rax + 10] # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero movdqu xmmword ptr [rcx + 8*rax + 64], xmm0 movdqu xmmword ptr [rcx + 8*rax + 80], xmm1 pmovzxbq xmm0, word ptr [rdx + rax + 12] # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero pmovzxbq xmm1, word ptr [rdx + rax + 14] # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero movdqu xmmword ptr [rcx + 8*rax + 96], xmm0 movdqu xmmword ptr [rcx + 8*rax + 112], xmm1 add rax, 16 add rdi, 4 jne .LBB0_1269 .LBB0_1270: test r8, r8 je .LBB0_1273 # %bb.1271: lea rdi, [rcx + 8*rax] add rdi, 16 lea r10, [rax + rdx] add r10, 2 xor eax, eax .LBB0_1272: # =>This Inner Loop Header: Depth=1 pmovzxbq xmm0, word ptr [r10 + 4*rax - 2] # xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero pmovzxbq xmm1, word ptr [r10 + 4*rax] # xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero movdqu xmmword ptr [rdi - 16], xmm0 movdqu xmmword ptr [rdi], xmm1 add rdi, 32 add rax, 1 cmp r8, rax jne .LBB0_1272 .LBB0_1273: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1274 .LBB0_1278: and rdi, -4 neg rdi xor eax, eax .LBB0_1279: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + rax] movups xmm1, xmmword ptr [rdx + rax + 16] movups xmmword ptr [rcx + rax], xmm0 movups xmmword ptr [rcx + rax + 16], xmm1 movups xmm0, xmmword ptr [rdx + rax + 32] 
movups xmm1, xmmword ptr [rdx + rax + 48] movups xmmword ptr [rcx + rax + 32], xmm0 movups xmmword ptr [rcx + rax + 48], xmm1 movups xmm0, xmmword ptr [rdx + rax + 64] movups xmm1, xmmword ptr [rdx + rax + 80] movups xmmword ptr [rcx + rax + 64], xmm0 movups xmmword ptr [rcx + rax + 80], xmm1 movupd xmm0, xmmword ptr [rdx + rax + 96] movupd xmm1, xmmword ptr [rdx + rax + 112] movupd xmmword ptr [rcx + rax + 96], xmm0 movupd xmmword ptr [rcx + rax + 112], xmm1 sub rax, -128 add rdi, 4 jne .LBB0_1279 .LBB0_1280: test r8, r8 je .LBB0_1283 # %bb.1281: add rax, 16 neg r8 .LBB0_1282: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1282 .LBB0_1283: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1284 .LBB0_1288: and rdi, -4 neg rdi xor eax, eax .LBB0_1289: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + rax] movups xmm1, xmmword ptr [rdx + rax + 16] movups xmmword ptr [rcx + rax], xmm0 movups xmmword ptr [rcx + rax + 16], xmm1 movups xmm0, xmmword ptr [rdx + rax + 32] movups xmm1, xmmword ptr [rdx + rax + 48] movups xmmword ptr [rcx + rax + 32], xmm0 movups xmmword ptr [rcx + rax + 48], xmm1 movups xmm0, xmmword ptr [rdx + rax + 64] movups xmm1, xmmword ptr [rdx + rax + 80] movups xmmword ptr [rcx + rax + 64], xmm0 movups xmmword ptr [rcx + rax + 80], xmm1 movupd xmm0, xmmword ptr [rdx + rax + 96] movupd xmm1, xmmword ptr [rdx + rax + 112] movupd xmmword ptr [rcx + rax + 96], xmm0 movupd xmmword ptr [rcx + rax + 112], xmm1 sub rax, -128 add rdi, 4 jne .LBB0_1289 .LBB0_1290: test r8, r8 je .LBB0_1293 # %bb.1291: add rax, 16 neg r8 .LBB0_1292: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1292 .LBB0_1293: cmp rsi, r9 je 
.LBB0_1526 jmp .LBB0_1294 .LBB0_1298: and rdi, -4 neg rdi xor eax, eax .LBB0_1299: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 4*rax] movups xmm1, xmmword ptr [rdx + 4*rax + 16] movups xmmword ptr [rcx + 4*rax], xmm0 movups xmmword ptr [rcx + 4*rax + 16], xmm1 movups xmm0, xmmword ptr [rdx + 4*rax + 32] movups xmm1, xmmword ptr [rdx + 4*rax + 48] movups xmmword ptr [rcx + 4*rax + 32], xmm0 movups xmmword ptr [rcx + 4*rax + 48], xmm1 movups xmm0, xmmword ptr [rdx + 4*rax + 64] movups xmm1, xmmword ptr [rdx + 4*rax + 80] movups xmmword ptr [rcx + 4*rax + 64], xmm0 movups xmmword ptr [rcx + 4*rax + 80], xmm1 movupd xmm0, xmmword ptr [rdx + 4*rax + 96] movupd xmm1, xmmword ptr [rdx + 4*rax + 112] movupd xmmword ptr [rcx + 4*rax + 96], xmm0 movupd xmmword ptr [rcx + 4*rax + 112], xmm1 add rax, 32 add rdi, 4 jne .LBB0_1299 .LBB0_1300: test r8, r8 je .LBB0_1303 # %bb.1301: lea rax, [4*rax + 16] neg r8 .LBB0_1302: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1302 .LBB0_1303: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1304 .LBB0_1308: and rdi, -4 neg rdi xor eax, eax .LBB0_1309: # =>This Inner Loop Header: Depth=1 movups xmm0, xmmword ptr [rdx + 4*rax] movups xmm1, xmmword ptr [rdx + 4*rax + 16] movups xmmword ptr [rcx + 4*rax], xmm0 movups xmmword ptr [rcx + 4*rax + 16], xmm1 movups xmm0, xmmword ptr [rdx + 4*rax + 32] movups xmm1, xmmword ptr [rdx + 4*rax + 48] movups xmmword ptr [rcx + 4*rax + 32], xmm0 movups xmmword ptr [rcx + 4*rax + 48], xmm1 movups xmm0, xmmword ptr [rdx + 4*rax + 64] movups xmm1, xmmword ptr [rdx + 4*rax + 80] movups xmmword ptr [rcx + 4*rax + 64], xmm0 movups xmmword ptr [rcx + 4*rax + 80], xmm1 movupd xmm0, xmmword ptr [rdx + 4*rax + 96] movupd xmm1, xmmword ptr [rdx + 4*rax + 112] movupd xmmword ptr [rcx + 4*rax + 96], xmm0 movupd xmmword ptr [rcx + 
4*rax + 112], xmm1 add rax, 32 add rdi, 4 jne .LBB0_1309 .LBB0_1310: test r8, r8 je .LBB0_1313 # %bb.1311: lea rax, [4*rax + 16] neg r8 .LBB0_1312: # =>This Inner Loop Header: Depth=1 movupd xmm0, xmmword ptr [rdx + rax - 16] movupd xmm1, xmmword ptr [rdx + rax] movupd xmmword ptr [rcx + rax - 16], xmm0 movupd xmmword ptr [rcx + rax], xmm1 add rax, 32 inc r8 jne .LBB0_1312 .LBB0_1313: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1314 .LBB0_1318: xor edi, edi .LBB0_1319: test r8b, 1 je .LBB0_1321 # %bb.1320: movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_5] # xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> pshufb xmm0, xmm2 pextrw word ptr [rcx + rdi], xmm0, 0 pshufb xmm1, xmm2 pextrw word ptr [rcx + rdi + 2], xmm1, 0 .LBB0_1321: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1322 .LBB0_1326: xor edi, edi .LBB0_1327: test r8b, 1 je .LBB0_1329 # %bb.1328: movdqu xmm0, xmmword ptr [rdx + 2*rdi] movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_17] # xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> pshufb xmm0, xmm2 pshufb xmm1, xmm2 punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movdqu xmmword ptr [rcx + rdi], xmm0 .LBB0_1329: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1330 .LBB0_1334: xor edi, edi .LBB0_1335: test r8b, 1 je .LBB0_1337 # %bb.1336: movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_5] # xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> pshufb xmm0, xmm2 pextrw word ptr [rcx + rdi], xmm0, 0 pshufb xmm1, xmm2 pextrw word ptr [rcx + rdi + 2], xmm1, 0 .LBB0_1337: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1338 .LBB0_1342: xor edi, edi .LBB0_1343: test r8b, 1 je .LBB0_1345 # %bb.1344: movdqu xmm0, xmmword ptr [rdx + 4*rdi] movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_13] # xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> pshufb xmm0, xmm2 pshufb xmm1, xmm2 movd dword ptr [rcx + rdi], xmm0 movd dword ptr 
[rcx + rdi + 4], xmm1 .LBB0_1345: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1346 .LBB0_1350: xor edi, edi .LBB0_1351: test r8b, 1 je .LBB0_1353 # %bb.1352: pmovsxbw xmm0, qword ptr [rdx + rdi] pmovsxbw xmm1, qword ptr [rdx + rdi + 8] movdqu xmmword ptr [rcx + 2*rdi], xmm0 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 .LBB0_1353: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1354 .LBB0_1358: xor edi, edi .LBB0_1359: test r8b, 1 je .LBB0_1361 # %bb.1360: pmovsxbw xmm0, qword ptr [rdx + rdi] pmovsxbw xmm1, qword ptr [rdx + rdi + 8] movdqu xmmword ptr [rcx + 2*rdi], xmm0 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 .LBB0_1361: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1362 .LBB0_1366: xor edi, edi .LBB0_1367: test r8b, 1 je .LBB0_1369 # %bb.1368: pmovzxbw xmm0, qword ptr [rdx + rdi] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero pmovzxbw xmm1, qword ptr [rdx + rdi + 8] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero movdqu xmmword ptr [rcx + 2*rdi], xmm0 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 .LBB0_1369: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1370 .LBB0_1374: xor edi, edi .LBB0_1375: test r8b, 1 je .LBB0_1377 # %bb.1376: pmovzxbw xmm0, qword ptr [rdx + rdi] # xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero pmovzxbw xmm1, qword ptr [rdx + rdi + 8] # xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero movdqu xmmword ptr [rcx + 2*rdi], xmm0 movdqu xmmword ptr [rcx + 2*rdi + 16], xmm1 .LBB0_1377: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1378 .LBB0_1382: xor edi, edi .LBB0_1383: test r8b, 1 je .LBB0_1385 # %bb.1384: pmovsxbd xmm0, dword ptr [rdx + rdi] pmovsxbd xmm1, dword ptr [rdx + rdi + 4] cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi], xmm0 movups xmmword ptr [rcx + 4*rdi + 16], xmm1 .LBB0_1385: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1386 .LBB0_1390: xor 
edi, edi .LBB0_1391: test r8b, 1 je .LBB0_1393 # %bb.1392: pmovzxbd xmm0, dword ptr [rdx + rdi] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero pmovzxbd xmm1, dword ptr [rdx + rdi + 4] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero cvtdq2ps xmm0, xmm0 cvtdq2ps xmm1, xmm1 movups xmmword ptr [rcx + 4*rdi], xmm0 movups xmmword ptr [rcx + 4*rdi + 16], xmm1 .LBB0_1393: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1394 .LBB0_1398: xor edi, edi .LBB0_1399: test r8b, 1 je .LBB0_1401 # %bb.1400: movdqu xmm0, xmmword ptr [rdx + 4*rdi] movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_13] # xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> pshufb xmm0, xmm2 pshufb xmm1, xmm2 movd dword ptr [rcx + rdi], xmm0 movd dword ptr [rcx + rdi + 4], xmm1 .LBB0_1401: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1402 .LBB0_1406: xor edi, edi .LBB0_1407: test r8b, 1 je .LBB0_1409 # %bb.1408: movupd xmm0, xmmword ptr [rdx + 8*rdi] cvttpd2dq xmm0, xmm0 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_1] # xmm2 = <0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u> cvttpd2dq xmm1, xmm1 pshufb xmm0, xmm2 pextrw word ptr [rcx + rdi], xmm0, 0 pshufb xmm1, xmm2 pextrw word ptr [rcx + rdi + 2], xmm1, 0 .LBB0_1409: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1410 .LBB0_1414: xor edi, edi .LBB0_1415: test r8b, 1 je .LBB0_1417 # %bb.1416: movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_5] # xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> pshufb xmm0, xmm2 pextrw word ptr [rcx + rdi], xmm0, 0 pshufb xmm1, xmm2 pextrw word ptr [rcx + rdi + 2], xmm1, 0 .LBB0_1417: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1418 .LBB0_1422: xor edi, edi .LBB0_1423: test r8b, 1 je .LBB0_1425 # %bb.1424: movdqu xmm0, xmmword ptr [rdx + 2*rdi] movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_17] # xmm2 = 
<0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> pshufb xmm0, xmm2 pshufb xmm1, xmm2 punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movdqu xmmword ptr [rcx + rdi], xmm0 .LBB0_1425: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1426 .LBB0_1430: xor edi, edi .LBB0_1431: test r8b, 1 je .LBB0_1433 # %bb.1432: movdqu xmm0, xmmword ptr [rdx + 2*rdi] movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_17] # xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> pshufb xmm0, xmm2 pshufb xmm1, xmm2 punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movdqu xmmword ptr [rcx + rdi], xmm0 .LBB0_1433: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1434 .LBB0_1438: xor edi, edi .LBB0_1439: test r8b, 1 je .LBB0_1441 # %bb.1440: movdqu xmm0, xmmword ptr [rdx + 8*rdi] movdqu xmm1, xmmword ptr [rdx + 8*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_5] # xmm2 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> pshufb xmm0, xmm2 pextrw word ptr [rcx + rdi], xmm0, 0 pshufb xmm1, xmm2 pextrw word ptr [rcx + rdi + 2], xmm1, 0 .LBB0_1441: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1442 .LBB0_1446: xor edi, edi .LBB0_1447: test r8b, 1 je .LBB0_1449 # %bb.1448: movups xmm0, xmmword ptr [rdx + 4*rdi] movups xmm1, xmmword ptr [rdx + 4*rdi + 16] cvttps2dq xmm0, xmm0 packusdw xmm0, xmm0 packuswb xmm0, xmm0 cvttps2dq xmm1, xmm1 packusdw xmm1, xmm1 packuswb xmm1, xmm1 movd dword ptr [rcx + rdi], xmm0 movd dword ptr [rcx + rdi + 4], xmm1 .LBB0_1449: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1450 .LBB0_1454: xor edi, edi .LBB0_1455: test r8b, 1 je .LBB0_1457 # %bb.1456: movdqu xmm0, xmmword ptr [rdx + 4*rdi] movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_13] # xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> pshufb xmm0, xmm2 pshufb xmm1, xmm2 movd dword ptr [rcx + rdi], xmm0 movd dword ptr [rcx + rdi + 4], xmm1 .LBB0_1457: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1458 .LBB0_1462: xor edi, edi .LBB0_1463: test r8b, 1 je .LBB0_1465 # %bb.1464: pmovsxbd xmm0, dword ptr [rdx + rdi] pmovsxbd xmm1, dword ptr [rdx + rdi + 4] 
movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 .LBB0_1465: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1466 .LBB0_1470: xor edi, edi .LBB0_1471: test r8b, 1 je .LBB0_1473 # %bb.1472: pmovzxbd xmm0, dword ptr [rdx + rdi] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero pmovzxbd xmm1, dword ptr [rdx + rdi + 4] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 .LBB0_1473: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1474 .LBB0_1478: xor edi, edi .LBB0_1479: test r8b, 1 je .LBB0_1481 # %bb.1480: pmovsxbd xmm0, dword ptr [rdx + rdi] pmovsxbd xmm1, dword ptr [rdx + rdi + 4] movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 .LBB0_1481: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1482 .LBB0_1486: xor edi, edi .LBB0_1487: test r8b, 1 je .LBB0_1489 # %bb.1488: pmovzxbd xmm0, dword ptr [rdx + rdi] # xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero pmovzxbd xmm1, dword ptr [rdx + rdi + 4] # xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero movdqu xmmword ptr [rcx + 4*rdi], xmm0 movdqu xmmword ptr [rcx + 4*rdi + 16], xmm1 .LBB0_1489: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1490 .LBB0_1494: xor edi, edi .LBB0_1495: test r8b, 1 je .LBB0_1497 # %bb.1496: movdqu xmm0, xmmword ptr [rdx + 4*rdi] movdqu xmm1, xmmword ptr [rdx + 4*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_13] # xmm2 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u> pshufb xmm0, xmm2 pshufb xmm1, xmm2 movd dword ptr [rcx + rdi], xmm0 movd dword ptr [rcx + rdi + 4], xmm1 .LBB0_1497: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1498 .LBB0_1502: xor edi, edi .LBB0_1503: test r8b, 1 je .LBB0_1505 # %bb.1504: movupd xmm0, xmmword ptr [rdx + 8*rdi] cvttpd2dq xmm0, xmm0 movupd xmm1, xmmword ptr [rdx + 8*rdi + 16] movdqa xmm2, xmmword ptr 
[rip + .LCPI0_1] # xmm2 = <0,4,u,u,u,u,u,u,u,u,u,u,u,u,u,u> cvttpd2dq xmm1, xmm1 pshufb xmm0, xmm2 pextrw word ptr [rcx + rdi], xmm0, 0 pshufb xmm1, xmm2 pextrw word ptr [rcx + rdi + 2], xmm1, 0 .LBB0_1505: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1506 .LBB0_1510: xor edi, edi .LBB0_1511: test r8b, 1 je .LBB0_1513 # %bb.1512: movdqu xmm0, xmmword ptr [rdx + 2*rdi] movdqu xmm1, xmmword ptr [rdx + 2*rdi + 16] movdqa xmm2, xmmword ptr [rip + .LCPI0_17] # xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u> pshufb xmm0, xmm2 pshufb xmm1, xmm2 punpcklqdq xmm0, xmm1 # xmm0 = xmm0[0],xmm1[0] movdqu xmmword ptr [rcx + rdi], xmm0 .LBB0_1513: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1514 .LBB0_1518: xor edi, edi .LBB0_1519: test r8b, 1 je .LBB0_1521 # %bb.1520: movups xmm0, xmmword ptr [rdx + 4*rdi] movups xmm1, xmmword ptr [rdx + 4*rdi + 16] cvttps2dq xmm0, xmm0 packssdw xmm0, xmm0 packsswb xmm0, xmm0 cvttps2dq xmm1, xmm1 packssdw xmm1, xmm1 packsswb xmm1, xmm1 movd dword ptr [rcx + rdi], xmm0 movd dword ptr [rcx + rdi + 4], xmm1 .LBB0_1521: cmp rsi, r9 je .LBB0_1526 jmp .LBB0_1522 .Lfunc_end0: .size cast_type_numeric_sse4, .Lfunc_end0-cast_type_numeric_sse4 # -- End function .ident "Ubuntu clang version 11.1.0-6" .section ".note.GNU-stack","",@progbits .addrsig