parquet/internal/utils/_lib/bit_packing_avx2.s (4,012 lines of code) (raw):

# Assembly listing for "bit_packing_avx2.c" (GNU as directives, Intel syntax without register prefixes).
# NOTE(review): the .LCPI/.LBB label scheme and "# %bb.N" markers suggest this is compiler output - confirm before hand-editing.
#
# .rodata.cst8 - 8-byte constants (.LCPI0_0 ... .LCPI0_135) referenced by unpack32_avx2.
#   Most entries repeat one low-bit mask in both 32-bit halves (see the hex annotations,
#   e.g. 0x7fffffff7fffffff ... 0x100000001). The unpack loops broadcast such a constant with
#   vpbroadcastq and AND it into every 32-bit lane after the per-lane shift, e.g. .LCPI0_135 on the
#   path taken when ecx == 1, .LCPI0_105 when ecx == 9, .LCPI0_76 when ecx == 17.
#   NOTE(review): .LCPI0_35, .LCPI0_36 and .LCPI0_80 do not have that mask shape; their use is not
#   visible in this part of the file - verify against the rest of the function.
#
# .rodata.cst32 - 32-byte-aligned vectors of eight dwords; this line holds .LCPI0_1 and .LCPI0_2.
#   .LCPI0_2 = [8..15] is loaded into a ymm register and used as the vpsrlvd per-lane shift counts
#   for the second group of eight outputs on the ecx == 1 path.
#   NOTE(review): the consumer of .LCPI0_1 = [24..17] is not visible here - presumably also shift counts; confirm.
.text .intel_syntax noprefix .file "bit_packing_avx2.c" .section .rodata.cst8,"aM",@progbits,8 .p2align 3 # -- Begin function unpack32_avx2 .LCPI0_0: .quad 9223372034707292159 # 0x7fffffff7fffffff .LCPI0_8: .quad 4611686015206162431 # 0x3fffffff3fffffff .LCPI0_12: .quad 2305843005455597567 # 0x1fffffff1fffffff .LCPI0_23: .quad 1152921500580315135 # 0xfffffff0fffffff .LCPI0_25: .quad 576460748142673919 # 0x7ffffff07ffffff .LCPI0_34: .quad 288230371923853311 # 0x3ffffff03ffffff .LCPI0_35: .quad 42949672976 # 0xa00000010 .LCPI0_36: .quad 94489280528 # 0x1600000010 .LCPI0_38: .quad 144115183814443007 # 0x1ffffff01ffffff .LCPI0_49: .quad 36028792732385279 # 0x7fffff007fffff .LCPI0_56: .quad 18014394218708991 # 0x3fffff003fffff .LCPI0_59: .quad 9007194961870847 # 0x1fffff001fffff .LCPI0_66: .quad 4503595333451775 # 0xfffff000fffff .LCPI0_68: .quad 2251795519242239 # 0x7ffff0007ffff .LCPI0_73: .quad 1125895612137471 # 0x3ffff0003ffff .LCPI0_76: .quad 562945658585087 # 0x1ffff0001ffff .LCPI0_80: .quad 68719476736 # 0x1000000000 .LCPI0_82: .quad 140733193420799 # 0x7fff00007fff .LCPI0_87: .quad 70364449226751 # 0x3fff00003fff .LCPI0_90: .quad 35180077129727 # 0x1fff00001fff .LCPI0_95: .quad 17587891081215 # 0xfff00000fff .LCPI0_97: .quad 8791798056959 # 0x7ff000007ff .LCPI0_102: .quad 4393751544831 # 0x3ff000003ff .LCPI0_105: .quad 2194728288767 # 0x1ff000001ff .LCPI0_112: .quad 545460846719 # 0x7f0000007f .LCPI0_117: .quad 270582939711 # 0x3f0000003f .LCPI0_120: .quad 133143986207 # 0x1f0000001f .LCPI0_125: .quad 64424509455 # 0xf0000000f .LCPI0_127: .quad 30064771079 # 0x700000007 .LCPI0_132: .quad 12884901891 # 0x300000003 .LCPI0_135: .quad 4294967297 # 0x100000001 .section .rodata.cst32,"aM",@progbits,32 .p2align 5 .LCPI0_1: .long 24 # 0x18 .long 23 # 0x17 .long 22 # 0x16 .long 21 # 0x15 .long 20 # 0x14 .long 19 # 0x13 .long 18 # 0x12 .long 17 # 0x11 .LCPI0_2: .long 8 # 0x8 .long 9 # 0x9 .long 10 # 0xa .long 11 # 0xb .long 12 # 0xc .long 13 # 0xd .long 14 # 0xe .long 15 
# Constant-pool vectors .LCPI0_3 ... .LCPI0_42: eight .long (dword) entries each, i.e. one 256-bit value per label.
#   Where their use is visible, unpack32_avx2 loads them with vmovdqa into a ymm register and passes
#   them to vpsrlvd as per-lane right-shift counts: .LCPI0_4 = [16..23] on the ecx == 1 path, and
#   .LCPI0_28, .LCPI0_39, .LCPI0_42 on the ecx == 25 path. A 0 entry leaves that lane unshifted.
#   NOTE(review): the consumers of the other vectors on this line are not visible in this part of
#   the file - presumably the same shift-count role for other ecx values; confirm.
#   The leading "# 0xf" below is the hex annotation belonging to the final ".long 15" of .LCPI0_2.
# 0xf .LCPI0_3: .long 16 # 0x10 .long 15 # 0xf .long 14 # 0xe .long 13 # 0xd .long 12 # 0xc .long 11 # 0xb .long 10 # 0xa .long 9 # 0x9 .LCPI0_4: .long 16 # 0x10 .long 17 # 0x11 .long 18 # 0x12 .long 19 # 0x13 .long 20 # 0x14 .long 21 # 0x15 .long 22 # 0x16 .long 23 # 0x17 .LCPI0_7: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 1 # 0x1 .LCPI0_11: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 2 # 0x2 .LCPI0_15: .long 0 # 0x0 .long 0 # 0x0 .long 2 # 0x2 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .LCPI0_18: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 1 # 0x1 .long 0 # 0x0 .long 0 # 0x0 .LCPI0_21: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 3 # 0x3 .LCPI0_22: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 4 # 0x4 .LCPI0_24: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 2 # 0x2 .long 0 # 0x0 .LCPI0_28: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 4 # 0x4 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .LCPI0_31: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 1 # 0x1 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .LCPI0_32: .long 0 # 0x0 .long 3 # 0x3 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 5 # 0x5 .LCPI0_33: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 2 # 0x2 .long 0 # 0x0 .long 0 # 0x0 .LCPI0_37: .long 0 # 0x0 .long 0 # 0x0 .long 4 # 0x4 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 6 # 0x6 .LCPI0_39: .long 0 # 0x0 .long 1 # 0x1 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 5 # 0x5 .long 0 # 0x0 .long 0 # 0x0 .LCPI0_42: .long 0 # 0x0 .long 0 # 0x0 .long 2 # 0x2 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 6 # 0x6 .long 0 # 0x0 
.LCPI0_45: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 3 # 0x3 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 7 # 0x7 .LCPI0_48: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 5 # 0x5 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 1 # 0x1 .LCPI0_52: .long 0 # 0x0 .long 0 # 0x0 .long 6 # 0x6 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 2 # 0x2 .long 0 # 0x0 .LCPI0_53: .long 0 # 0x0 .long 7 # 0x7 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 3 # 0x3 .long 0 # 0x0 .long 0 # 0x0 .LCPI0_54: .long 8 # 0x8 .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 4 # 0x4 .long 0 # 0x0 .long 0 # 0x0 .long 9 # 0x9 .LCPI0_55: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 2 # 0x2 .long 0 # 0x0 .long 0 # 0x0 .long 4 # 0x4 .long 0 # 0x0 .LCPI0_57: .long 0 # 0x0 .long 6 # 0x6 .long 0 # 0x0 .long 0 # 0x0 .long 8 # 0x8 .long 0 # 0x0 .long 0 # 0x0 .long 10 # 0xa .LCPI0_58: .long 0 # 0x0 .long 0 # 0x0 .long 10 # 0xa .long 0 # 0x0 .long 0 # 0x0 .long 9 # 0x9 .long 0 # 0x0 .long 0 # 0x0 .LCPI0_60: .long 8 # 0x8 .long 0 # 0x0 .long 0 # 0x0 .long 7 # 0x7 .long 0 # 0x0 .long 0 # 0x0 .long 6 # 0x6 .long 0 # 0x0 .LCPI0_61: .long 0 # 0x0 .long 5 # 0x5 .long 0 # 0x0 .long 0 # 0x0 .long 4 # 0x4 .long 0 # 0x0 .long 0 # 0x0 .long 3 # 0x3 .LCPI0_64: .long 0 # 0x0 .long 0 # 0x0 .long 2 # 0x2 .long 0 # 0x0 .long 0 # 0x0 .long 1 # 0x1 .long 0 # 0x0 .long 11 # 0xb .LCPI0_65: .long 0 # 0x0 .long 0 # 0x0 .long 8 # 0x8 .long 0 # 0x0 .long 0 # 0x0 .long 4 # 0x4 .long 0 # 0x0 .long 12 # 0xc .LCPI0_67: .long 0 # 0x0 .long 0 # 0x0 .long 6 # 0x6 .long 0 # 0x0 .long 12 # 0xc .long 0 # 0x0 .long 0 # 0x0 .long 5 # 0x5 .LCPI0_69: .long 0 # 0x0 .long 11 # 0xb .long 0 # 0x0 .long 0 # 0x0 .long 4 # 0x4 .long 0 # 0x0 .long 10 # 0xa .long 0 # 0x0 .LCPI0_70: .long 0 # 0x0 .long 3 # 0x3 .long 0 # 0x0 .long 9 # 0x9 .long 0 # 0x0 .long 0 # 0x0 .long 2 # 0x2 .long 0 # 0x0 .LCPI0_71: .long 8 # 0x8 .long 0 # 0x0 .long 0 # 0x0 .long 1 # 0x1 .long 0 # 0x0 .long 7 # 0x7 .long 0 # 0x0 .long 13 # 0xd .LCPI0_72: .long 0 # 
0x0 .long 0 # 0x0 .long 4 # 0x4 .long 0 # 0x0 .long 8 # 0x8 .long 0 # 0x0 .long 12 # 0xc .long 0 # 0x0 .LCPI0_74: .long 0 # 0x0 .long 2 # 0x2 .long 0 # 0x0 .long 6 # 0x6 .long 0 # 0x0 .long 10 # 0xa .long 0 # 0x0 .long 14 # 0xe .LCPI0_75: .long 0 # 0x0 .long 0 # 0x0 .long 2 # 0x2 .long 0 # 0x0 .long 4 # 0x4 .long 0 # 0x0 .long 6 # 0x6 .long 0 # 0x0 .LCPI0_77: .long 8 # 0x8 .long 0 # 0x0 .long 10 # 0xa .long 0 # 0x0 .long 12 # 0xc .long 0 # 0x0 .long 14 # 0xe .long 0 # 0x0 .LCPI0_78: .long 0 # 0x0 .long 1 # 0x1 .long 0 # 0x0 .long 3 # 0x3 .long 0 # 0x0 .long 5 # 0x5 .long 0 # 0x0 .long 7 # 0x7 .LCPI0_79: .long 0 # 0x0 .long 9 # 0x9 .long 0 # 0x0 .long 11 # 0xb .long 0 # 0x0 .long 13 # 0xd .long 0 # 0x0 .long 15 # 0xf .LCPI0_81: .long 0 # 0x0 .long 15 # 0xf .long 0 # 0x0 .long 13 # 0xd .long 0 # 0x0 .long 11 # 0xb .long 0 # 0x0 .long 9 # 0x9 .LCPI0_83: .long 0 # 0x0 .long 7 # 0x7 .long 0 # 0x0 .long 5 # 0x5 .long 0 # 0x0 .long 3 # 0x3 .long 0 # 0x0 .long 1 # 0x1 .LCPI0_84: .long 16 # 0x10 .long 0 # 0x0 .long 14 # 0xe .long 0 # 0x0 .long 12 # 0xc .long 0 # 0x0 .long 10 # 0xa .long 0 # 0x0 .LCPI0_85: .long 8 # 0x8 .long 0 # 0x0 .long 6 # 0x6 .long 0 # 0x0 .long 4 # 0x4 .long 0 # 0x0 .long 2 # 0x2 .long 17 # 0x11 .LCPI0_86: .long 0 # 0x0 .long 14 # 0xe .long 0 # 0x0 .long 10 # 0xa .long 0 # 0x0 .long 6 # 0x6 .long 0 # 0x0 .long 2 # 0x2 .LCPI0_88: .long 16 # 0x10 .long 0 # 0x0 .long 12 # 0xc .long 0 # 0x0 .long 8 # 0x8 .long 0 # 0x0 .long 4 # 0x4 .long 18 # 0x12 .LCPI0_89: .long 0 # 0x0 .long 13 # 0xd .long 0 # 0x0 .long 7 # 0x7 .long 0 # 0x0 .long 1 # 0x1 .long 14 # 0xe .long 0 # 0x0 .LCPI0_91: .long 8 # 0x8 .long 0 # 0x0 .long 2 # 0x2 .long 15 # 0xf .long 0 # 0x0 .long 9 # 0x9 .long 0 # 0x0 .long 3 # 0x3 .LCPI0_92: .long 16 # 0x10 .long 0 # 0x0 .long 10 # 0xa .long 0 # 0x0 .long 4 # 0x4 .long 17 # 0x11 .long 0 # 0x0 .long 11 # 0xb .LCPI0_93: .long 0 # 0x0 .long 5 # 0x5 .long 18 # 0x12 .long 0 # 0x0 .long 12 # 0xc .long 0 # 0x0 .long 6 # 0x6 .long 19 # 0x13 .LCPI0_94: 
.long 0 # 0x0 .long 12 # 0xc .long 0 # 0x0 .long 4 # 0x4 .long 16 # 0x10 .long 0 # 0x0 .long 8 # 0x8 .long 20 # 0x14 .LCPI0_96: .long 0 # 0x0 .long 11 # 0xb .long 0 # 0x0 .long 1 # 0x1 .long 12 # 0xc .long 0 # 0x0 .long 2 # 0x2 .long 13 # 0xd .LCPI0_98: .long 0 # 0x0 .long 3 # 0x3 .long 14 # 0xe .long 0 # 0x0 .long 4 # 0x4 .long 15 # 0xf .long 0 # 0x0 .long 5 # 0x5 .LCPI0_99: .long 16 # 0x10 .long 0 # 0x0 .long 6 # 0x6 .long 17 # 0x11 .long 0 # 0x0 .long 7 # 0x7 .long 18 # 0x12 .long 0 # 0x0 .LCPI0_100: .long 8 # 0x8 .long 19 # 0x13 .long 0 # 0x0 .long 9 # 0x9 .long 20 # 0x14 .long 0 # 0x0 .long 10 # 0xa .long 21 # 0x15 .LCPI0_101: .long 0 # 0x0 .long 10 # 0xa .long 20 # 0x14 .long 0 # 0x0 .long 8 # 0x8 .long 18 # 0x12 .long 0 # 0x0 .long 6 # 0x6 .LCPI0_103: .long 16 # 0x10 .long 0 # 0x0 .long 4 # 0x4 .long 14 # 0xe .long 0 # 0x0 .long 2 # 0x2 .long 12 # 0xc .long 22 # 0x16 .LCPI0_104: .long 0 # 0x0 .long 9 # 0x9 .long 18 # 0x12 .long 0 # 0x0 .long 4 # 0x4 .long 13 # 0xd .long 22 # 0x16 .long 0 # 0x0 .LCPI0_106: .long 8 # 0x8 .long 17 # 0x11 .long 0 # 0x0 .long 3 # 0x3 .long 12 # 0xc .long 21 # 0x15 .long 0 # 0x0 .long 7 # 0x7 .LCPI0_107: .long 16 # 0x10 .long 0 # 0x0 .long 2 # 0x2 .long 11 # 0xb .long 20 # 0x14 .long 0 # 0x0 .long 6 # 0x6 .long 15 # 0xf .LCPI0_108: .long 0 # 0x0 .long 1 # 0x1 .long 10 # 0xa .long 19 # 0x13 .long 0 # 0x0 .long 5 # 0x5 .long 14 # 0xe .long 23 # 0x17 .LCPI0_111: .long 0 # 0x0 .long 7 # 0x7 .long 14 # 0xe .long 21 # 0x15 .long 0 # 0x0 .long 3 # 0x3 .long 10 # 0xa .long 17 # 0x11 .LCPI0_113: .long 24 # 0x18 .long 0 # 0x0 .long 6 # 0x6 .long 13 # 0xd .long 20 # 0x14 .long 0 # 0x0 .long 2 # 0x2 .long 9 # 0x9 .LCPI0_114: .long 16 # 0x10 .long 23 # 0x17 .long 0 # 0x0 .long 5 # 0x5 .long 12 # 0xc .long 19 # 0x13 .long 0 # 0x0 .long 1 # 0x1 .LCPI0_115: .long 8 # 0x8 .long 15 # 0xf .long 22 # 0x16 .long 0 # 0x0 .long 4 # 0x4 .long 11 # 0xb .long 18 # 0x12 .long 25 # 0x19 .LCPI0_116: .long 0 # 0x0 .long 6 # 0x6 .long 12 # 0xc .long 18 # 0x12 
.long 24 # 0x18 .long 0 # 0x0 .long 4 # 0x4 .long 10 # 0xa .LCPI0_118: .long 16 # 0x10 .long 22 # 0x16 .long 0 # 0x0 .long 2 # 0x2 .long 8 # 0x8 .long 14 # 0xe .long 20 # 0x14 .long 26 # 0x1a .LCPI0_119: .long 0 # 0x0 .long 5 # 0x5 .long 10 # 0xa .long 15 # 0xf .long 20 # 0x14 .long 25 # 0x19 .long 0 # 0x0 .long 3 # 0x3 .LCPI0_121: .long 8 # 0x8 .long 13 # 0xd .long 18 # 0x12 .long 23 # 0x17 .long 0 # 0x0 .long 1 # 0x1 .long 6 # 0x6 .long 11 # 0xb .LCPI0_122: .long 16 # 0x10 .long 21 # 0x15 .long 26 # 0x1a .long 0 # 0x0 .long 4 # 0x4 .long 9 # 0x9 .long 14 # 0xe .long 19 # 0x13 .LCPI0_123: .long 24 # 0x18 .long 0 # 0x0 .long 2 # 0x2 .long 7 # 0x7 .long 12 # 0xc .long 17 # 0x11 .long 22 # 0x16 .long 27 # 0x1b .LCPI0_124: .long 0 # 0x0 .long 4 # 0x4 .long 8 # 0x8 .long 12 # 0xc .long 16 # 0x10 .long 20 # 0x14 .long 24 # 0x18 .long 28 # 0x1c .LCPI0_126: .long 0 # 0x0 .long 3 # 0x3 .long 6 # 0x6 .long 9 # 0x9 .long 12 # 0xc .long 15 # 0xf .long 18 # 0x12 .long 21 # 0x15 .LCPI0_128: .long 24 # 0x18 .long 27 # 0x1b .long 0 # 0x0 .long 1 # 0x1 .long 4 # 0x4 .long 7 # 0x7 .long 10 # 0xa .long 13 # 0xd .LCPI0_129: .long 16 # 0x10 .long 19 # 0x13 .long 22 # 0x16 .long 25 # 0x19 .long 28 # 0x1c .long 0 # 0x0 .long 2 # 0x2 .long 5 # 0x5 .LCPI0_130: .long 8 # 0x8 .long 11 # 0xb .long 14 # 0xe .long 17 # 0x11 .long 20 # 0x14 .long 23 # 0x17 .long 26 # 0x1a .long 29 # 0x1d .LCPI0_131: .long 0 # 0x0 .long 2 # 0x2 .long 4 # 0x4 .long 6 # 0x6 .long 8 # 0x8 .long 10 # 0xa .long 12 # 0xc .long 14 # 0xe .LCPI0_133: .long 16 # 0x10 .long 18 # 0x12 .long 20 # 0x14 .long 22 # 0x16 .long 24 # 0x18 .long 26 # 0x1a .long 28 # 0x1c .long 30 # 0x1e .LCPI0_134: .long 0 # 0x0 .long 1 # 0x1 .long 2 # 0x2 .long 3 # 0x3 .long 4 # 0x4 .long 5 # 0x5 .long 6 # 0x6 .long 7 # 0x7 .LCPI0_136: .long 24 # 0x18 .long 25 # 0x19 .long 26 # 0x1a .long 27 # 0x1b .long 28 # 0x1c .long 29 # 0x1d .long 30 # 0x1e .long 31 # 0x1f .section .rodata.cst16,"aM",@progbits,16 .p2align 4 .LCPI0_5: .long 8 # 0x8 .long 7 # 
0x7 .long 6 # 0x6 .long 5 # 0x5 .LCPI0_6: .long 24 # 0x18 .long 25 # 0x19 .long 26 # 0x1a .long 27 # 0x1b .LCPI0_9: .long 16 # 0x10 .long 14 # 0xe .long 12 # 0xc .long 10 # 0xa .LCPI0_10: .long 16 # 0x10 .long 18 # 0x12 .long 20 # 0x14 .long 22 # 0x16 .LCPI0_13: .long 8 # 0x8 .long 5 # 0x5 .zero 4 .zero 4 .LCPI0_14: .long 24 # 0x18 .long 27 # 0x1b .zero 4 .zero 4 .LCPI0_16: .long 16 # 0x10 .long 13 # 0xd .long 10 # 0xa .long 7 # 0x7 .LCPI0_17: .long 16 # 0x10 .long 19 # 0x13 .long 22 # 0x16 .long 25 # 0x19 .LCPI0_19: .long 24 # 0x18 .long 21 # 0x15 .long 18 # 0x12 .long 15 # 0xf .LCPI0_20: .long 8 # 0x8 .long 11 # 0xb .long 14 # 0xe .long 17 # 0x11 .LCPI0_26: .long 24 # 0x18 .long 19 # 0x13 .long 14 # 0xe .long 9 # 0x9 .LCPI0_27: .long 8 # 0x8 .long 13 # 0xd .long 18 # 0x12 .long 23 # 0x17 .LCPI0_29: .long 16 # 0x10 .long 11 # 0xb .zero 4 .zero 4 .LCPI0_30: .long 16 # 0x10 .long 21 # 0x15 .zero 4 .zero 4 .LCPI0_40: .long 16 # 0x10 .long 9 # 0x9 .zero 4 .zero 4 .LCPI0_41: .long 16 # 0x10 .long 23 # 0x17 .zero 4 .zero 4 .LCPI0_43: .long 24 # 0x18 .long 17 # 0x11 .zero 4 .zero 4 .LCPI0_44: .long 8 # 0x8 .long 15 # 0xf .zero 4 .zero 4 .LCPI0_46: .long 0 # 0x0 .long 0 # 0x0 .long 0 # 0x0 .long 8 # 0x8 .LCPI0_50: .long 24 # 0x18 .long 15 # 0xf .zero 4 .zero 4 .LCPI0_51: .long 8 # 0x8 .long 17 # 0x11 .zero 4 .zero 4 .LCPI0_62: .long 24 # 0x18 .long 13 # 0xd .zero 4 .zero 4 .LCPI0_63: .long 8 # 0x8 .long 19 # 0x13 .zero 4 .zero 4 .LCPI0_109: .long 0 # 0x0 .long 8 # 0x8 .long 16 # 0x10 .long 24 # 0x18 .section .rodata.cst4,"aM",@progbits,4 .p2align 2 .LCPI0_47: .long 16777215 # 0xffffff .LCPI0_110: .long 255 # 0xff .text .globl unpack32_avx2 .p2align 4, 0x90 .type unpack32_avx2,@function unpack32_avx2: # @unpack32_avx2 # %bb.0: push rbp mov rbp, rsp push r15 push r14 push r12 push rbx and rsp, -16 # kill: def $edx killed $edx def $rdx mov r15, rsi mov rbx, rdi lea r14d, [rdx + 31] test edx, edx cmovns r14d, edx sar r14d, 5 cmp ecx, 15 jle .LBB0_1 # %bb.48: cmp ecx, 23 jle 
.LBB0_49 # %bb.72: cmp ecx, 27 jle .LBB0_73 # %bb.84: cmp ecx, 29 jle .LBB0_85 # %bb.90: cmp ecx, 30 je .LBB0_99 # %bb.91: cmp ecx, 31 je .LBB0_96 # %bb.92: cmp ecx, 32 jne .LBB0_147 # %bb.93: cmp edx, 32 jl .LBB0_147 # %bb.94: mov r12d, r14d .p2align 4, 0x90 .LBB0_95: # =>This Inner Loop Header: Depth=1 mov edx, 128 mov rdi, r15 mov rsi, rbx call clib·_memcpy(SB) sub rbx, -128 sub r15, -128 add r12, -1 jne .LBB0_95 jmp .LBB0_147 .LBB0_1: cmp ecx, 7 jg .LBB0_25 # %bb.2: cmp ecx, 3 jg .LBB0_14 # %bb.3: cmp ecx, 1 jg .LBB0_9 # %bb.4: test ecx, ecx je .LBB0_144 # %bb.5: cmp ecx, 1 jne .LBB0_147 # %bb.6: cmp edx, 32 jl .LBB0_147 # %bb.7: mov eax, r14d add r15, 96 xor ecx, ecx vpbroadcastq ymm0, qword ptr [rip + .LCPI0_135] # ymm0 = [4294967297,4294967297,4294967297,4294967297] vmovdqa ymm1, ymmword ptr [rip + .LCPI0_134] # ymm1 = [0,1,2,3,4,5,6,7] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_2] # ymm2 = [8,9,10,11,12,13,14,15] vmovdqa ymm3, ymmword ptr [rip + .LCPI0_4] # ymm3 = [16,17,18,19,20,21,22,23] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_136] # ymm4 = [24,25,26,27,28,29,30,31] .p2align 4, 0x90 .LBB0_8: # =>This Inner Loop Header: Depth=1 vpbroadcastd ymm5, dword ptr [rbx + 4*rcx] vpsrlvd ymm5, ymm5, ymm1 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm5 vpbroadcastd ymm5, dword ptr [rbx + 4*rcx] vpsrlvd ymm5, ymm5, ymm2 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm5 vpbroadcastd ymm5, dword ptr [rbx + 4*rcx] vpsrlvd ymm5, ymm5, ymm3 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm5 vpbroadcastd ymm5, dword ptr [rbx + 4*rcx] vpsrlvd ymm5, ymm5, ymm4 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15], ymm5 add rcx, 1 sub r15, -128 cmp rax, rcx jne .LBB0_8 jmp .LBB0_147 .LBB0_49: cmp ecx, 19 jg .LBB0_61 # %bb.50: cmp ecx, 17 jg .LBB0_56 # %bb.51: cmp ecx, 16 je .LBB0_120 # %bb.52: cmp ecx, 17 jne .LBB0_147 # %bb.53: cmp edx, 32 jl .LBB0_147 # %bb.54: mov r8d, r14d add r15, 96 add rbx, 64 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_76] # ymm0 
= [562945658585087,562945658585087,562945658585087,562945658585087] vmovdqa ymm1, ymmword ptr [rip + .LCPI0_75] # ymm1 = [0,0,2,0,4,0,6,0] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_77] # ymm2 = [8,0,10,0,12,0,14,0] vmovdqa ymm3, ymmword ptr [rip + .LCPI0_78] # ymm3 = [0,1,0,3,0,5,0,7] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_79] # ymm4 = [0,9,0,11,0,13,0,15] .p2align 4, 0x90 .LBB0_55: # =>This Inner Loop Header: Depth=1 mov ecx, dword ptr [rbx - 52] mov r10d, dword ptr [rbx - 48] shld r10d, ecx, 9 mov esi, dword ptr [rbx - 56] mov edi, ecx shld edi, esi, 11 mov r9d, dword ptr [rbx - 64] mov edx, dword ptr [rbx - 60] mov eax, edx shld eax, r9d, 15 vmovd xmm5, esi shld esi, edx, 13 vpinsrd xmm5, xmm5, edi, 1 vpinsrd xmm5, xmm5, ecx, 2 vpinsrd xmm5, xmm5, r10d, 3 vmovd xmm6, r9d vpinsrd xmm6, xmm6, eax, 1 vpinsrd xmm6, xmm6, edx, 2 vpinsrd xmm6, xmm6, esi, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm1 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm5 mov eax, dword ptr [rbx - 36] mov r10d, dword ptr [rbx - 32] shld r10d, eax, 1 mov edx, dword ptr [rbx - 40] mov esi, eax shld esi, edx, 3 mov r9d, dword ptr [rbx - 48] mov ecx, dword ptr [rbx - 44] mov edi, ecx shld edi, r9d, 7 vmovd xmm5, edx shld edx, ecx, 5 vpinsrd xmm5, xmm5, esi, 1 vpinsrd xmm5, xmm5, eax, 2 vpinsrd xmm5, xmm5, r10d, 3 vmovd xmm6, r9d vpinsrd xmm6, xmm6, edi, 1 vpinsrd xmm6, xmm6, ecx, 2 vpinsrd xmm6, xmm6, edx, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm2 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm5 mov r9d, dword ptr [rbx - 16] mov r11d, dword ptr [rbx - 20] mov edx, r9d shld edx, r11d, 10 mov r10d, dword ptr [rbx - 24] mov edi, r11d shld edi, r10d, 12 mov eax, dword ptr [rbx - 28] mov esi, r10d shld esi, eax, 14 mov ecx, dword ptr [rbx - 32] shrd ecx, eax, 16 vmovd xmm5, edi vpinsrd xmm5, xmm5, r11d, 1 vpinsrd xmm5, xmm5, edx, 2 vpinsrd xmm5, xmm5, r9d, 3 vmovd xmm6, ecx vpinsrd xmm6, xmm6, eax, 1 vpinsrd xmm6, xmm6, esi, 2 vpinsrd xmm6, xmm6, 
r10d, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm3 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm5 mov r9d, dword ptr [rbx] mov r11d, dword ptr [rbx - 4] mov edx, r9d shld edx, r11d, 2 mov r10d, dword ptr [rbx - 8] mov edi, r11d shld edi, r10d, 4 mov eax, dword ptr [rbx - 16] mov esi, dword ptr [rbx - 12] mov ecx, r10d shld ecx, esi, 6 shrd eax, esi, 24 vmovd xmm5, edi vpinsrd xmm5, xmm5, r11d, 1 vpinsrd xmm5, xmm5, edx, 2 vpinsrd xmm5, xmm5, r9d, 3 vmovd xmm6, eax vpinsrd xmm6, xmm6, esi, 1 vpinsrd xmm6, xmm6, ecx, 2 vpinsrd xmm6, xmm6, r10d, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm4 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15], ymm5 sub r15, -128 add rbx, 68 add r8, -1 jne .LBB0_55 jmp .LBB0_147 .LBB0_25: cmp ecx, 11 jg .LBB0_37 # %bb.26: cmp ecx, 9 jg .LBB0_32 # %bb.27: cmp ecx, 8 je .LBB0_132 # %bb.28: cmp ecx, 9 jne .LBB0_147 # %bb.29: cmp edx, 32 jl .LBB0_147 # %bb.30: mov r8d, r14d add r15, 96 add rbx, 32 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_105] # ymm0 = [2194728288767,2194728288767,2194728288767,2194728288767] vmovdqa ymm1, ymmword ptr [rip + .LCPI0_104] # ymm1 = [0,9,18,0,4,13,22,0] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_106] # ymm2 = [8,17,0,3,12,21,0,7] vmovdqa ymm3, ymmword ptr [rip + .LCPI0_107] # ymm3 = [16,0,2,11,20,0,6,15] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_108] # ymm4 = [0,1,10,19,0,5,14,23] .p2align 4, 0x90 .LBB0_31: # =>This Inner Loop Header: Depth=1 mov ecx, dword ptr [rbx - 32] mov edx, dword ptr [rbx - 28] mov esi, dword ptr [rbx - 24] shld esi, edx, 1 vmovd xmm5, edx vpinsrd xmm5, xmm5, edx, 1 vpinsrd xmm5, xmm5, edx, 2 shld edx, ecx, 5 vpinsrd xmm5, xmm5, esi, 3 vmovd xmm6, ecx vpinsrd xmm6, xmm6, ecx, 1 vpinsrd xmm6, xmm6, ecx, 2 vpinsrd xmm6, xmm6, edx, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm1 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm5 mov ecx, dword ptr [rbx - 16] mov edx, dword ptr [rbx - 24] mov esi, dword ptr [rbx - 20] mov edi, ecx shld 
edi, esi, 2 mov eax, esi shld eax, edx, 6 vmovd xmm5, esi vpinsrd xmm5, xmm5, esi, 1 vpinsrd xmm5, xmm5, edi, 2 vpinsrd xmm5, xmm5, ecx, 3 vmovd xmm6, edx vpinsrd xmm6, xmm6, edx, 1 vpinsrd xmm6, xmm6, eax, 2 vpinsrd xmm6, xmm6, esi, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm2 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm5 mov eax, dword ptr [rbx - 8] mov ecx, dword ptr [rbx - 16] mov edx, dword ptr [rbx - 12] mov esi, eax shld esi, edx, 3 mov edi, edx shld edi, ecx, 7 vmovd xmm5, edx vpinsrd xmm5, xmm5, esi, 1 vpinsrd xmm5, xmm5, eax, 2 vpinsrd xmm5, xmm5, eax, 3 vmovd xmm6, ecx vpinsrd xmm6, xmm6, edi, 1 vpinsrd xmm6, xmm6, edx, 2 vpinsrd xmm6, xmm6, edx, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm3 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm5 mov eax, dword ptr [rbx] mov ecx, dword ptr [rbx - 8] mov edx, dword ptr [rbx - 4] mov esi, eax shld esi, edx, 4 shrd ecx, edx, 24 vmovd xmm5, esi vpinsrd xmm5, xmm5, eax, 1 vpinsrd xmm5, xmm5, eax, 2 vpinsrd xmm5, xmm5, eax, 3 vmovd xmm6, ecx vpinsrd xmm6, xmm6, edx, 1 vpinsrd xmm6, xmm6, edx, 2 vpinsrd xmm6, xmm6, edx, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm4 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15], ymm5 sub r15, -128 add rbx, 36 add r8, -1 jne .LBB0_31 jmp .LBB0_147 .LBB0_73: cmp ecx, 25 jg .LBB0_79 # %bb.74: cmp ecx, 24 je .LBB0_108 # %bb.75: cmp ecx, 25 jne .LBB0_147 # %bb.76: cmp edx, 32 jl .LBB0_147 # %bb.77: mov r8d, r14d add r15, 96 add rbx, 96 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_38] # ymm0 = [144115183814443007,144115183814443007,144115183814443007,144115183814443007] vmovdqa ymm9, ymmword ptr [rip + .LCPI0_28] # ymm9 = [0,0,0,0,4,0,0,0] vmovdqa ymm10, ymmword ptr [rip + .LCPI0_39] # ymm10 = [0,1,0,0,0,5,0,0] vmovdqa xmm11, xmmword ptr [rip + .LCPI0_40] # xmm11 = <16,9,u,u> vmovdqa xmm4, xmmword ptr [rip + .LCPI0_41] # xmm4 = <16,23,u,u> vmovdqa ymm5, ymmword ptr [rip + .LCPI0_42] # ymm5 = [0,0,2,0,0,0,6,0] vmovdqa xmm6, 
xmmword ptr [rip + .LCPI0_43] # xmm6 = <24,17,u,u> vmovdqa xmm7, xmmword ptr [rip + .LCPI0_44] # xmm7 = <8,15,u,u> vmovdqa ymm8, ymmword ptr [rip + .LCPI0_45] # ymm8 = [0,0,0,3,0,0,0,7] .p2align 4, 0x90 .LBB0_78: # =>This Inner Loop Header: Depth=1 mov ecx, dword ptr [rbx - 76] mov r9d, dword ptr [rbx - 72] shld r9d, ecx, 17 mov esi, dword ptr [rbx - 80] shld ecx, esi, 10 mov edi, dword ptr [rbx - 84] shld esi, edi, 3 mov eax, dword ptr [rbx - 88] vmovd xmm1, edi shld edi, eax, 21 mov r10d, dword ptr [rbx - 96] mov edx, dword ptr [rbx - 92] shld eax, edx, 14 shld edx, r10d, 7 vpinsrd xmm1, xmm1, esi, 1 vmovd xmm2, r10d vpinsrd xmm1, xmm1, ecx, 2 vpinsrd xmm2, xmm2, edx, 1 vpinsrd xmm1, xmm1, r9d, 3 vpinsrd xmm2, xmm2, eax, 2 vpinsrd xmm2, xmm2, edi, 3 vinserti128 ymm1, ymm2, xmm1, 1 vpsrlvd ymm1, ymm1, ymm9 vpand ymm1, ymm1, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm1 mov r11d, dword ptr [rbx - 52] mov r9d, dword ptr [rbx - 48] shld r9d, r11d, 9 mov r10d, dword ptr [rbx - 56] shld r11d, r10d, 2 mov esi, dword ptr [rbx - 60] mov edi, r10d mov ecx, dword ptr [rbx - 64] shld edi, esi, 20 mov edx, dword ptr [rbx - 72] mov eax, dword ptr [rbx - 68] shld esi, ecx, 13 shrd edx, eax, 8 shld ecx, eax, 6 vmovd xmm1, edi vpinsrd xmm1, xmm1, r10d, 1 vmovd xmm2, edx vpinsrd xmm1, xmm1, r11d, 2 vpinsrd xmm2, xmm2, eax, 1 vpinsrd xmm1, xmm1, r9d, 3 vpinsrd xmm2, xmm2, ecx, 2 vpinsrd xmm2, xmm2, esi, 3 vinserti128 ymm1, ymm2, xmm1, 1 vpsrlvd ymm1, ymm1, ymm10 vpand ymm1, ymm1, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm1 mov eax, dword ptr [rbx - 28] mov r9d, dword ptr [rbx - 24] shld r9d, eax, 1 mov edx, dword ptr [rbx - 32] mov esi, eax shld esi, edx, 19 mov edi, dword ptr [rbx - 40] mov ecx, dword ptr [rbx - 36] shld edx, ecx, 12 shld ecx, edi, 5 vmovq xmm1, qword ptr [rbx - 48] # xmm1 = mem[0],zero vpsrlvd xmm2, xmm1, xmm11 vpshufd xmm1, xmm1, 229 # xmm1 = xmm1[1,1,2,3] vpinsrd xmm1, xmm1, edi, 1 vpsllvd xmm1, xmm1, xmm4 vpor xmm1, xmm2, xmm1 vmovd xmm2, edx vpinsrd xmm2, xmm2, 
esi, 1 vpinsrd xmm2, xmm2, eax, 2 vpinsrd xmm2, xmm2, r9d, 3 vpinsrd xmm1, xmm1, edi, 2 vpinsrd xmm1, xmm1, ecx, 3 vinserti128 ymm1, ymm1, xmm2, 1 vpsrlvd ymm1, ymm1, ymm5 vpand ymm1, ymm1, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm1 mov r9d, dword ptr [rbx] mov ecx, dword ptr [rbx - 4] mov edx, r9d shld edx, ecx, 18 mov esi, dword ptr [rbx - 8] shld ecx, esi, 11 mov r10d, dword ptr [rbx - 16] mov edi, dword ptr [rbx - 12] shld esi, edi, 4 mov eax, edi shld eax, r10d, 22 vmovq xmm1, qword ptr [rbx - 24] # xmm1 = mem[0],zero vpsrlvd xmm2, xmm1, xmm6 vpshufd xmm1, xmm1, 229 # xmm1 = xmm1[1,1,2,3] vpinsrd xmm1, xmm1, r10d, 1 vpsllvd xmm1, xmm1, xmm7 vmovd xmm3, esi vpinsrd xmm3, xmm3, ecx, 1 vpor xmm1, xmm2, xmm1 vpinsrd xmm2, xmm3, edx, 2 vpinsrd xmm2, xmm2, r9d, 3 vpinsrd xmm1, xmm1, eax, 2 vpinsrd xmm1, xmm1, edi, 3 vinserti128 ymm1, ymm1, xmm2, 1 vpsrlvd ymm1, ymm1, ymm8 vpand ymm1, ymm1, ymm0 vmovdqu ymmword ptr [r15], ymm1 sub r15, -128 add rbx, 100 add r8, -1 jne .LBB0_78 jmp .LBB0_147 .LBB0_14: cmp ecx, 5 jg .LBB0_20 # %bb.15: cmp ecx, 4 je .LBB0_138 # %bb.16: cmp ecx, 5 jne .LBB0_147 # %bb.17: cmp edx, 32 jl .LBB0_147 # %bb.18: mov eax, r14d add r15, 96 add rbx, 16 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_120] # ymm0 = [133143986207,133143986207,133143986207,133143986207] vmovdqa ymm1, ymmword ptr [rip + .LCPI0_119] # ymm1 = [0,5,10,15,20,25,0,3] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_121] # ymm2 = [8,13,18,23,0,1,6,11] vmovdqa ymm3, ymmword ptr [rip + .LCPI0_122] # ymm3 = [16,21,26,0,4,9,14,19] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_123] # ymm4 = [24,0,2,7,12,17,22,27] .p2align 4, 0x90 .LBB0_19: # =>This Inner Loop Header: Depth=1 mov ecx, dword ptr [rbx - 16] mov edx, dword ptr [rbx - 12] mov esi, edx shld esi, ecx, 2 vmovd xmm5, ecx vpbroadcastd xmm6, xmm5 vpinsrd xmm5, xmm5, ecx, 1 vpinsrd xmm5, xmm5, esi, 2 vpinsrd xmm5, xmm5, edx, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm1 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm5 mov 
ecx, dword ptr [rbx - 12] mov edx, dword ptr [rbx - 8] mov esi, edx shld esi, ecx, 4 vmovd xmm5, ecx vpbroadcastd xmm5, xmm5 vmovd xmm6, esi vpinsrd xmm6, xmm6, edx, 1 vpinsrd xmm6, xmm6, edx, 2 vpinsrd xmm6, xmm6, edx, 3 vinserti128 ymm5, ymm5, xmm6, 1 vpsrlvd ymm5, ymm5, ymm2 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm5 mov ecx, dword ptr [rbx - 8] mov edx, dword ptr [rbx - 4] vmovd xmm5, edx shld edx, ecx, 1 vmovd xmm6, ecx vpinsrd xmm6, xmm6, ecx, 1 vpinsrd xmm6, xmm6, ecx, 2 vpinsrd xmm6, xmm6, edx, 3 vpbroadcastd xmm5, xmm5 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm3 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm5 mov ecx, dword ptr [rbx - 4] mov edx, dword ptr [rbx] mov esi, edx shld esi, ecx, 3 vmovd xmm5, ecx vpinsrd xmm5, xmm5, esi, 1 vpinsrd xmm5, xmm5, edx, 2 vpinsrd xmm5, xmm5, edx, 3 vmovd xmm6, edx vpbroadcastd xmm6, xmm6 vinserti128 ymm5, ymm5, xmm6, 1 vpsrlvd ymm5, ymm5, ymm4 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15], ymm5 sub r15, -128 add rbx, 20 add rax, -1 jne .LBB0_19 jmp .LBB0_147 .LBB0_61: cmp ecx, 21 jg .LBB0_67 # %bb.62: cmp ecx, 20 je .LBB0_114 # %bb.63: cmp ecx, 21 jne .LBB0_147 # %bb.64: cmp edx, 32 jl .LBB0_147 # %bb.65: mov r8d, r14d add r15, 96 add rbx, 80 vmovdqa ymm8, ymmword ptr [rip + .LCPI0_58] # ymm8 = [0,0,10,0,0,9,0,0] vpbroadcastq ymm1, qword ptr [rip + .LCPI0_59] # ymm1 = [9007194961870847,9007194961870847,9007194961870847,9007194961870847] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_60] # ymm2 = [8,0,0,7,0,0,6,0] vmovdqa ymm3, ymmword ptr [rip + .LCPI0_61] # ymm3 = [0,5,0,0,4,0,0,3] vmovdqa xmm4, xmmword ptr [rip + .LCPI0_62] # xmm4 = <24,13,u,u> vmovdqa xmm5, xmmword ptr [rip + .LCPI0_63] # xmm5 = <8,19,u,u> vmovdqa ymm6, ymmword ptr [rip + .LCPI0_64] # ymm6 = [0,0,2,0,0,1,0,11] .p2align 4, 0x90 .LBB0_66: # =>This Inner Loop Header: Depth=1 mov ecx, dword ptr [rbx - 64] mov r9d, dword ptr [rbx - 60] shld r9d, ecx, 13 mov r11d, dword ptr [rbx - 68] shld ecx, r11d, 2 mov edi, dword 
ptr [rbx - 72] mov esi, r11d shld esi, edi, 12 mov r10d, dword ptr [rbx - 80] mov eax, dword ptr [rbx - 76] shld edi, eax, 1 mov edx, eax shld edx, r10d, 11 vmovd xmm7, r10d vmovd xmm0, esi vpinsrd xmm7, xmm7, edx, 1 vpinsrd xmm0, xmm0, r11d, 1 vpinsrd xmm7, xmm7, eax, 2 vpinsrd xmm0, xmm0, ecx, 2 vpinsrd xmm7, xmm7, edi, 3 vpinsrd xmm0, xmm0, r9d, 3 vinserti128 ymm0, ymm7, xmm0, 1 vpsrlvd ymm0, ymm0, ymm8 vpand ymm0, ymm0, ymm1 vmovdqu ymmword ptr [r15 - 96], ymm0 mov r10d, dword ptr [rbx - 44] mov r9d, dword ptr [rbx - 40] shld r9d, r10d, 5 mov edx, dword ptr [rbx - 48] mov esi, r10d shld esi, edx, 15 mov ecx, dword ptr [rbx - 52] shld edx, ecx, 4 mov r11d, dword ptr [rbx - 60] mov eax, dword ptr [rbx - 56] mov edi, ecx shld edi, eax, 14 shld eax, r11d, 3 vmovd xmm0, r11d vmovd xmm7, edx vpinsrd xmm0, xmm0, eax, 1 vpinsrd xmm7, xmm7, esi, 1 vpinsrd xmm0, xmm0, edi, 2 vpinsrd xmm7, xmm7, r10d, 2 vpinsrd xmm0, xmm0, ecx, 3 vpinsrd xmm7, xmm7, r9d, 3 vinserti128 ymm0, ymm0, xmm7, 1 vpsrlvd ymm0, ymm0, ymm2 vpand ymm0, ymm0, ymm1 vmovdqu ymmword ptr [r15 - 64], ymm0 mov r9d, dword ptr [rbx - 20] mov ecx, dword ptr [rbx - 24] mov r10d, r9d shld r10d, ecx, 18 mov esi, dword ptr [rbx - 28] shld ecx, esi, 7 mov edi, dword ptr [rbx - 32] vmovd xmm0, esi shld esi, edi, 17 mov eax, dword ptr [rbx - 40] mov edx, dword ptr [rbx - 36] shld edi, edx, 6 shrd eax, edx, 16 vpinsrd xmm0, xmm0, ecx, 1 vmovd xmm7, eax vpinsrd xmm0, xmm0, r10d, 2 vpinsrd xmm7, xmm7, edx, 1 vpinsrd xmm0, xmm0, r9d, 3 vpinsrd xmm7, xmm7, edi, 2 vpinsrd xmm7, xmm7, esi, 3 vinserti128 ymm0, ymm7, xmm0, 1 vpsrlvd ymm0, ymm0, ymm3 vpand ymm0, ymm0, ymm1 vmovdqu ymmword ptr [r15 - 32], ymm0 mov r9d, dword ptr [rbx] mov eax, dword ptr [rbx - 4] mov edx, r9d shld edx, eax, 10 mov esi, dword ptr [rbx - 12] mov edi, dword ptr [rbx - 8] mov ecx, eax shld ecx, edi, 20 shld edi, esi, 9 vmovq xmm0, qword ptr [rbx - 20] # xmm0 = mem[0],zero vpsrlvd xmm7, xmm0, xmm4 vpshufd xmm0, xmm0, 229 # xmm0 = xmm0[1,1,2,3] 
vpinsrd xmm0, xmm0, esi, 1 vpsllvd xmm0, xmm0, xmm5 vpor xmm0, xmm7, xmm0 vmovd xmm7, ecx vpinsrd xmm7, xmm7, eax, 1 vpinsrd xmm7, xmm7, edx, 2 vpinsrd xmm7, xmm7, r9d, 3 vpinsrd xmm0, xmm0, esi, 2 vpinsrd xmm0, xmm0, edi, 3 vinserti128 ymm0, ymm0, xmm7, 1 vpsrlvd ymm0, ymm0, ymm6 vpand ymm0, ymm0, ymm1 vmovdqu ymmword ptr [r15], ymm0 sub r15, -128 add rbx, 84 add r8, -1 jne .LBB0_66 jmp .LBB0_147 .LBB0_37: cmp ecx, 13 jg .LBB0_43 # %bb.38: cmp ecx, 12 je .LBB0_126 # %bb.39: cmp ecx, 13 jne .LBB0_147 # %bb.40: cmp edx, 32 jl .LBB0_147 # %bb.41: mov r8d, r14d add r15, 96 add rbx, 48 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_90] # ymm0 = [35180077129727,35180077129727,35180077129727,35180077129727] vmovdqa ymm1, ymmword ptr [rip + .LCPI0_89] # ymm1 = [0,13,0,7,0,1,14,0] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_91] # ymm2 = [8,0,2,15,0,9,0,3] vmovdqa ymm3, ymmword ptr [rip + .LCPI0_92] # ymm3 = [16,0,10,0,4,17,0,11] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_93] # ymm4 = [0,5,18,0,12,0,6,19] .p2align 4, 0x90 .LBB0_42: # =>This Inner Loop Header: Depth=1 mov eax, dword ptr [rbx - 40] mov r9d, dword ptr [rbx - 36] shld r9d, eax, 5 mov esi, dword ptr [rbx - 48] mov edx, dword ptr [rbx - 44] mov ecx, eax shld ecx, edx, 12 mov edi, edx shld edi, esi, 6 vmovd xmm5, ecx vpinsrd xmm5, xmm5, eax, 1 vpinsrd xmm5, xmm5, eax, 2 vpinsrd xmm5, xmm5, r9d, 3 vmovd xmm6, esi vpinsrd xmm6, xmm6, esi, 1 vpinsrd xmm6, xmm6, edi, 2 vpinsrd xmm6, xmm6, edx, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm1 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm5 mov r9d, dword ptr [rbx - 24] mov ecx, dword ptr [rbx - 28] mov edx, r9d shld edx, ecx, 10 mov esi, dword ptr [rbx - 32] mov edi, ecx shld edi, esi, 4 mov r10d, dword ptr [rbx - 36] mov eax, esi shld eax, r10d, 11 vmovd xmm5, edi vpinsrd xmm5, xmm5, ecx, 1 vpinsrd xmm5, xmm5, edx, 2 vpinsrd xmm5, xmm5, r9d, 3 vmovd xmm6, r10d vpinsrd xmm6, xmm6, eax, 1 vpinsrd xmm6, xmm6, esi, 2 vpinsrd xmm6, xmm6, esi, 3 vinserti128 ymm5, 
ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm2 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm5 mov r9d, dword ptr [rbx - 12] mov ecx, dword ptr [rbx - 16] mov edx, r9d shld edx, ecx, 2 mov esi, dword ptr [rbx - 24] mov eax, dword ptr [rbx - 20] vmovd xmm5, ecx vpinsrd xmm5, xmm5, ecx, 1 shld ecx, eax, 9 mov edi, eax shld edi, esi, 3 vpinsrd xmm5, xmm5, edx, 2 vpinsrd xmm5, xmm5, r9d, 3 vmovd xmm6, esi vpinsrd xmm6, xmm6, edi, 1 vpinsrd xmm6, xmm6, eax, 2 vpinsrd xmm6, xmm6, ecx, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm3 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm5 mov eax, dword ptr [rbx] mov ecx, dword ptr [rbx - 4] mov edx, eax shld edx, ecx, 7 mov esi, dword ptr [rbx - 8] vmovd xmm5, ecx shld ecx, esi, 1 mov edi, dword ptr [rbx - 12] shrd edi, esi, 24 vmovd xmm6, edi vpinsrd xmm6, xmm6, esi, 1 vpinsrd xmm6, xmm6, esi, 2 vpinsrd xmm6, xmm6, ecx, 3 vpinsrd xmm5, xmm5, edx, 1 vpinsrd xmm5, xmm5, eax, 2 vpinsrd xmm5, xmm5, eax, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm4 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15], ymm5 sub r15, -128 add rbx, 52 add r8, -1 jne .LBB0_42 jmp .LBB0_147 .LBB0_85: cmp ecx, 28 je .LBB0_102 # %bb.86: cmp ecx, 29 jne .LBB0_147 # %bb.87: cmp edx, 32 jl .LBB0_147 # %bb.88: mov r8d, r14d add r15, 96 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_12] # ymm0 = [2305843005455597567,2305843005455597567,2305843005455597567,2305843005455597567] vmovdqa xmm8, xmmword ptr [rip + .LCPI0_13] # xmm8 = <8,5,u,u> vmovdqa xmm10, xmmword ptr [rip + .LCPI0_14] # xmm10 = <24,27,u,u> vmovdqa ymm11, ymmword ptr [rip + .LCPI0_15] # ymm11 = [0,0,2,0,0,0,0,0] vmovdqa xmm12, xmmword ptr [rip + .LCPI0_16] # xmm12 = [16,13,10,7] vmovdqa xmm5, xmmword ptr [rip + .LCPI0_17] # xmm5 = [16,19,22,25] vmovdqa ymm6, ymmword ptr [rip + .LCPI0_18] # ymm6 = [0,0,0,0,0,1,0,0] vmovdqa xmm7, xmmword ptr [rip + .LCPI0_19] # xmm7 = [24,21,18,15] vmovdqa xmm1, xmmword ptr [rip + .LCPI0_20] # xmm1 = [8,11,14,17] vmovdqa ymm9, ymmword 
ptr [rip + .LCPI0_21] # ymm9 = [0,0,0,0,0,0,0,3] .p2align 4, 0x90 .LBB0_89: # =>This Inner Loop Header: Depth=1 mov r11d, dword ptr [rbx + 24] mov r9d, dword ptr [rbx + 28] shld r9d, r11d, 21 mov esi, dword ptr [rbx + 20] shld r11d, esi, 18 mov edi, dword ptr [rbx + 16] shld esi, edi, 15 mov eax, dword ptr [rbx + 12] shld edi, eax, 12 mov edx, dword ptr [rbx + 8] shld eax, edx, 9 mov r10d, dword ptr [rbx] mov ecx, dword ptr [rbx + 4] shld edx, ecx, 6 shld ecx, r10d, 3 vmovd xmm2, r10d vmovd xmm3, edi vpinsrd xmm2, xmm2, ecx, 1 vpinsrd xmm3, xmm3, esi, 1 vpinsrd xmm2, xmm2, edx, 2 vpinsrd xmm3, xmm3, r11d, 2 vpinsrd xmm2, xmm2, eax, 3 vpinsrd xmm3, xmm3, r9d, 3 vinserti128 ymm2, ymm2, xmm3, 1 vpand ymm2, ymm2, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm2 mov eax, dword ptr [rbx + 52] mov r9d, dword ptr [rbx + 56] shld r9d, eax, 13 mov edx, dword ptr [rbx + 48] shld eax, edx, 10 mov esi, dword ptr [rbx + 44] shld edx, esi, 7 mov edi, dword ptr [rbx + 36] mov ecx, dword ptr [rbx + 40] shld esi, ecx, 4 shld ecx, edi, 1 vmovq xmm2, qword ptr [rbx + 28] # xmm2 = mem[0],zero vpsrlvd xmm3, xmm2, xmm8 vpshufd xmm2, xmm2, 229 # xmm2 = xmm2[1,1,2,3] vpinsrd xmm2, xmm2, edi, 1 vpsllvd xmm2, xmm2, xmm10 vpor xmm2, xmm3, xmm2 vmovd xmm3, esi vpinsrd xmm3, xmm3, edx, 1 vpinsrd xmm3, xmm3, eax, 2 vpinsrd xmm3, xmm3, r9d, 3 vpinsrd xmm2, xmm2, edi, 2 vpinsrd xmm2, xmm2, ecx, 3 vinserti128 ymm2, ymm2, xmm3, 1 vpsrlvd ymm2, ymm2, ymm11 vpand ymm2, ymm2, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm2 mov eax, dword ptr [rbx + 80] mov ecx, dword ptr [rbx + 84] shld ecx, eax, 5 mov edx, dword ptr [rbx + 76] mov esi, dword ptr [rbx + 72] shld eax, edx, 2 mov edi, edx shld edi, esi, 28 vmovdqu xmm2, xmmword ptr [rbx + 56] vpsrlvd xmm3, xmm2, xmm12 vpshufd xmm2, xmm2, 249 # xmm2 = xmm2[1,2,3,3] vpinsrd xmm2, xmm2, esi, 3 vmovd xmm4, edi vpinsrd xmm4, xmm4, edx, 1 vpinsrd xmm4, xmm4, eax, 2 vpsllvd xmm2, xmm2, xmm5 vpinsrd xmm4, xmm4, ecx, 3 vpor xmm2, xmm3, xmm2 vinserti128 ymm2, ymm2, xmm4, 1 
vpsrlvd ymm2, ymm2, ymm6 vpand ymm2, ymm2, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm2 mov eax, dword ptr [rbx + 112] mov ecx, dword ptr [rbx + 108] mov edx, eax shld edx, ecx, 26 mov esi, dword ptr [rbx + 104] shld ecx, esi, 23 mov edi, dword ptr [rbx + 100] vmovdqu xmm2, xmmword ptr [rbx + 84] shld esi, edi, 20 vpsrlvd xmm3, xmm2, xmm7 vpshufd xmm2, xmm2, 249 # xmm2 = xmm2[1,2,3,3] vpinsrd xmm2, xmm2, edi, 3 vmovd xmm4, esi vpinsrd xmm4, xmm4, ecx, 1 vpsllvd xmm2, xmm2, xmm1 vpinsrd xmm4, xmm4, edx, 2 vpinsrd xmm4, xmm4, eax, 3 vpor xmm2, xmm3, xmm2 vinserti128 ymm2, ymm2, xmm4, 1 vpsrlvd ymm2, ymm2, ymm9 vpand ymm2, ymm2, ymm0 vmovdqu ymmword ptr [r15], ymm2 add rbx, 116 sub r15, -128 add r8, -1 jne .LBB0_89 jmp .LBB0_147 .LBB0_9: cmp ecx, 2 je .LBB0_141 # %bb.10: cmp ecx, 3 jne .LBB0_147 # %bb.11: cmp edx, 32 jl .LBB0_147 # %bb.12: mov eax, r14d add r15, 96 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_127] # ymm0 = [30064771079,30064771079,30064771079,30064771079] vmovdqa ymm1, ymmword ptr [rip + .LCPI0_126] # ymm1 = [0,3,6,9,12,15,18,21] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_128] # ymm2 = [24,27,0,1,4,7,10,13] vmovdqa ymm3, ymmword ptr [rip + .LCPI0_129] # ymm3 = [16,19,22,25,28,0,2,5] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_130] # ymm4 = [8,11,14,17,20,23,26,29] .p2align 4, 0x90 .LBB0_13: # =>This Inner Loop Header: Depth=1 vpbroadcastd ymm5, dword ptr [rbx] vpsrlvd ymm5, ymm5, ymm1 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm5 mov ecx, dword ptr [rbx] mov edx, dword ptr [rbx + 4] mov esi, edx shld esi, ecx, 2 vmovd xmm5, ecx vpinsrd xmm5, xmm5, ecx, 1 vpinsrd xmm5, xmm5, esi, 2 vpinsrd xmm5, xmm5, edx, 3 vmovd xmm6, edx vpbroadcastd xmm6, xmm6 vinserti128 ymm5, ymm5, xmm6, 1 vpsrlvd ymm5, ymm5, ymm2 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm5 mov ecx, dword ptr [rbx + 4] mov edx, dword ptr [rbx + 8] mov esi, edx shld esi, ecx, 1 vmovd xmm5, ecx vpbroadcastd xmm6, xmm5 vpinsrd xmm5, xmm5, esi, 1 vpinsrd xmm5, xmm5, edx, 2 vpinsrd 
xmm5, xmm5, edx, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm3 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm5 vpbroadcastd ymm5, dword ptr [rbx + 8] vpsrlvd ymm5, ymm5, ymm4 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15], ymm5 sub r15, -128 add rbx, 12 add rax, -1 jne .LBB0_13 jmp .LBB0_147 .LBB0_56: cmp ecx, 18 je .LBB0_117 # %bb.57: cmp ecx, 19 jne .LBB0_147 # %bb.58: cmp edx, 32 jl .LBB0_147 # %bb.59: mov r8d, r14d add r15, 96 add rbx, 72 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_68] # ymm0 = [2251795519242239,2251795519242239,2251795519242239,2251795519242239] vmovdqa ymm1, ymmword ptr [rip + .LCPI0_67] # ymm1 = [0,0,6,0,12,0,0,5] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_69] # ymm2 = [0,11,0,0,4,0,10,0] vmovdqa ymm3, ymmword ptr [rip + .LCPI0_70] # ymm3 = [0,3,0,9,0,0,2,0] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_71] # ymm4 = [8,0,0,1,0,7,0,13] .p2align 4, 0x90 .LBB0_60: # =>This Inner Loop Header: Depth=1 mov r9d, dword ptr [rbx - 56] mov edx, dword ptr [rbx - 60] mov esi, r9d shld esi, edx, 14 mov edi, dword ptr [rbx - 64] mov r10d, dword ptr [rbx - 72] shld edx, edi, 1 mov eax, dword ptr [rbx - 68] mov ecx, eax shld ecx, r10d, 13 vmovd xmm5, edi shld edi, eax, 7 vpinsrd xmm5, xmm5, edx, 1 vmovd xmm6, r10d vpinsrd xmm5, xmm5, esi, 2 vpinsrd xmm6, xmm6, ecx, 1 vpinsrd xmm5, xmm5, r9d, 3 vpinsrd xmm6, xmm6, eax, 2 vpinsrd xmm6, xmm6, edi, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm1 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm5 mov r10d, dword ptr [rbx - 40] mov r9d, dword ptr [rbx - 36] shld r9d, r10d, 3 mov edx, dword ptr [rbx - 44] mov esi, r10d shld esi, edx, 9 mov edi, dword ptr [rbx - 48] vmovd xmm5, edx shld edx, edi, 15 mov ecx, dword ptr [rbx - 56] mov eax, dword ptr [rbx - 52] shld edi, eax, 2 shrd ecx, eax, 24 vpinsrd xmm5, xmm5, esi, 1 vmovd xmm6, ecx vpinsrd xmm5, xmm5, r10d, 2 vpinsrd xmm6, xmm6, eax, 1 vpinsrd xmm5, xmm5, r9d, 3 vpinsrd xmm6, xmm6, edi, 2 vpinsrd xmm6, xmm6, edx, 3 
vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm2 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm5 mov r10d, dword ptr [rbx - 20] mov r9d, dword ptr [rbx - 16] shld r9d, r10d, 11 mov edx, dword ptr [rbx - 24] mov esi, r10d mov r11d, dword ptr [rbx - 28] shld esi, edx, 17 mov ecx, dword ptr [rbx - 36] mov eax, dword ptr [rbx - 32] shld edx, r11d, 4 mov edi, r11d shld edi, eax, 10 shrd ecx, eax, 16 vmovd xmm5, edx vpinsrd xmm5, xmm5, esi, 1 vmovd xmm6, ecx vpinsrd xmm5, xmm5, r10d, 2 vpinsrd xmm6, xmm6, eax, 1 vpinsrd xmm5, xmm5, r9d, 3 vpinsrd xmm6, xmm6, edi, 2 vpinsrd xmm6, xmm6, r11d, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm3 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm5 mov r9d, dword ptr [rbx] mov r11d, dword ptr [rbx - 4] mov edx, r9d shld edx, r11d, 6 mov ecx, dword ptr [rbx - 8] mov edi, r11d shld edi, ecx, 12 mov r10d, dword ptr [rbx - 16] mov eax, dword ptr [rbx - 12] mov esi, ecx shld esi, eax, 18 shld eax, r10d, 5 vmovd xmm5, r10d vmovd xmm6, edi vpinsrd xmm5, xmm5, eax, 1 vpinsrd xmm6, xmm6, r11d, 1 vpinsrd xmm5, xmm5, esi, 2 vpinsrd xmm6, xmm6, edx, 2 vpinsrd xmm5, xmm5, ecx, 3 vpinsrd xmm6, xmm6, r9d, 3 vinserti128 ymm5, ymm5, xmm6, 1 vpsrlvd ymm5, ymm5, ymm4 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15], ymm5 sub r15, -128 add rbx, 76 add r8, -1 jne .LBB0_60 jmp .LBB0_147 .LBB0_32: cmp ecx, 10 je .LBB0_129 # %bb.33: cmp ecx, 11 jne .LBB0_147 # %bb.34: cmp edx, 32 jl .LBB0_147 # %bb.35: mov r8d, r14d add r15, 96 add rbx, 40 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_97] # ymm0 = [8791798056959,8791798056959,8791798056959,8791798056959] vmovdqa ymm1, ymmword ptr [rip + .LCPI0_96] # ymm1 = [0,11,0,1,12,0,2,13] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_98] # ymm2 = [0,3,14,0,4,15,0,5] vmovdqa ymm3, ymmword ptr [rip + .LCPI0_99] # ymm3 = [16,0,6,17,0,7,18,0] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_100] # ymm4 = [8,19,0,9,20,0,10,21] .p2align 4, 0x90 .LBB0_36: # =>This Inner Loop Header: Depth=1 mov ecx, 
dword ptr [rbx - 32] mov edx, dword ptr [rbx - 40] mov esi, dword ptr [rbx - 36] mov edi, ecx shld edi, esi, 9 mov eax, esi shld eax, edx, 10 vmovd xmm5, esi vpinsrd xmm5, xmm5, edi, 1 vpinsrd xmm5, xmm5, ecx, 2 vpinsrd xmm5, xmm5, ecx, 3 vmovd xmm6, edx vpinsrd xmm6, xmm6, edx, 1 vpinsrd xmm6, xmm6, eax, 2 vpinsrd xmm6, xmm6, esi, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm1 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm5 mov eax, dword ptr [rbx - 20] mov ecx, dword ptr [rbx - 24] mov edx, eax shld edx, ecx, 6 mov esi, dword ptr [rbx - 32] mov edi, dword ptr [rbx - 28] vmovd xmm5, ecx vpinsrd xmm5, xmm5, ecx, 1 shld ecx, edi, 7 shrd esi, edi, 24 vpinsrd xmm5, xmm5, edx, 2 vpinsrd xmm5, xmm5, eax, 3 vmovd xmm6, esi vpinsrd xmm6, xmm6, edi, 1 vpinsrd xmm6, xmm6, edi, 2 vpinsrd xmm6, xmm6, ecx, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm2 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm5 mov eax, dword ptr [rbx - 12] mov ecx, dword ptr [rbx - 8] shld ecx, eax, 3 mov r9d, dword ptr [rbx - 20] mov esi, dword ptr [rbx - 16] mov edi, eax shld edi, esi, 4 mov edx, esi shld edx, r9d, 5 vmovd xmm5, edi vpinsrd xmm5, xmm5, eax, 1 vpinsrd xmm5, xmm5, eax, 2 vpinsrd xmm5, xmm5, ecx, 3 vmovd xmm6, r9d vpinsrd xmm6, xmm6, edx, 1 vpinsrd xmm6, xmm6, esi, 2 vpinsrd xmm6, xmm6, esi, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm3 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm5 mov eax, dword ptr [rbx] mov ecx, dword ptr [rbx - 8] mov edx, dword ptr [rbx - 4] mov esi, eax shld esi, edx, 1 mov edi, edx shld edi, ecx, 2 vmovd xmm5, edx vpinsrd xmm5, xmm5, esi, 1 vpinsrd xmm5, xmm5, eax, 2 vpinsrd xmm5, xmm5, eax, 3 vmovd xmm6, ecx vpinsrd xmm6, xmm6, ecx, 1 vpinsrd xmm6, xmm6, edi, 2 vpinsrd xmm6, xmm6, edx, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm4 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15], ymm5 sub r15, -128 add rbx, 44 add r8, -1 jne .LBB0_36 jmp .LBB0_147 .LBB0_79: cmp ecx, 26 je 
.LBB0_105 # %bb.80: cmp ecx, 27 jne .LBB0_147 # %bb.81: cmp edx, 32 jl .LBB0_147 # %bb.82: mov r8d, r14d add r15, 96 add rbx, 104 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_25] # ymm0 = [576460748142673919,576460748142673919,576460748142673919,576460748142673919] vmovdqa ymm9, ymmword ptr [rip + .LCPI0_24] # ymm9 = [0,0,0,0,0,0,2,0] vmovdqa xmm10, xmmword ptr [rip + .LCPI0_26] # xmm10 = [24,19,14,9] vmovdqa xmm11, xmmword ptr [rip + .LCPI0_27] # xmm11 = [8,13,18,23] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_28] # ymm4 = [0,0,0,0,4,0,0,0] vmovdqa xmm5, xmmword ptr [rip + .LCPI0_29] # xmm5 = <16,11,u,u> vmovdqa xmm6, xmmword ptr [rip + .LCPI0_30] # xmm6 = <16,21,u,u> vmovdqa ymm7, ymmword ptr [rip + .LCPI0_31] # ymm7 = [0,0,0,1,0,0,0,0] vmovdqa ymm8, ymmword ptr [rip + .LCPI0_32] # ymm8 = [0,3,0,0,0,0,0,5] .p2align 4, 0x90 .LBB0_83: # =>This Inner Loop Header: Depth=1 mov r10d, dword ptr [rbx - 84] mov r9d, dword ptr [rbx - 80] shld r9d, r10d, 3 mov esi, dword ptr [rbx - 88] mov edi, r10d shld edi, esi, 25 mov eax, dword ptr [rbx - 92] shld esi, eax, 20 mov edx, dword ptr [rbx - 96] shld eax, edx, 15 mov r11d, dword ptr [rbx - 104] mov ecx, dword ptr [rbx - 100] shld edx, ecx, 10 shld ecx, r11d, 5 vmovd xmm1, r11d vmovd xmm2, esi vpinsrd xmm1, xmm1, ecx, 1 vpinsrd xmm2, xmm2, edi, 1 vpinsrd xmm1, xmm1, edx, 2 vpinsrd xmm2, xmm2, r10d, 2 vpinsrd xmm1, xmm1, eax, 3 vpinsrd xmm2, xmm2, r9d, 3 vinserti128 ymm1, ymm1, xmm2, 1 vpsrlvd ymm1, ymm1, ymm9 vpand ymm1, ymm1, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm1 mov eax, dword ptr [rbx - 56] mov ecx, dword ptr [rbx - 52] shld ecx, eax, 11 mov edx, dword ptr [rbx - 60] mov esi, dword ptr [rbx - 64] shld eax, edx, 6 shld edx, esi, 1 vmovdqu xmm1, xmmword ptr [rbx - 80] vpsrlvd xmm2, xmm1, xmm10 vpshufd xmm1, xmm1, 249 # xmm1 = xmm1[1,2,3,3] vmovd xmm3, esi vpinsrd xmm1, xmm1, esi, 3 vpinsrd xmm3, xmm3, edx, 1 vpinsrd xmm3, xmm3, eax, 2 vpsllvd xmm1, xmm1, xmm11 vpinsrd xmm3, xmm3, ecx, 3 vpor xmm1, xmm2, xmm1 vinserti128 ymm1, 
ymm1, xmm3, 1 vpsrlvd ymm1, ymm1, ymm4 vpand ymm1, ymm1, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm1 mov eax, dword ptr [rbx - 28] mov r9d, dword ptr [rbx - 24] shld r9d, eax, 19 mov edx, dword ptr [rbx - 32] shld eax, edx, 14 mov esi, dword ptr [rbx - 36] shld edx, esi, 9 mov r10d, dword ptr [rbx - 44] mov edi, dword ptr [rbx - 40] shld esi, edi, 4 mov ecx, edi shld ecx, r10d, 26 vmovq xmm1, qword ptr [rbx - 52] # xmm1 = mem[0],zero vpsrlvd xmm2, xmm1, xmm5 vpshufd xmm1, xmm1, 229 # xmm1 = xmm1[1,1,2,3] vpinsrd xmm1, xmm1, r10d, 1 vpsllvd xmm1, xmm1, xmm6 vmovd xmm3, esi vpinsrd xmm3, xmm3, edx, 1 vpor xmm1, xmm2, xmm1 vpinsrd xmm2, xmm3, eax, 2 vpinsrd xmm2, xmm2, r9d, 3 vpinsrd xmm1, xmm1, ecx, 2 vpinsrd xmm1, xmm1, edi, 3 vinserti128 ymm1, ymm1, xmm2, 1 vpsrlvd ymm1, ymm1, ymm7 vpand ymm1, ymm1, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm1 mov r9d, dword ptr [rbx] mov r11d, dword ptr [rbx - 4] mov r10d, r9d shld r10d, r11d, 22 mov esi, dword ptr [rbx - 8] shld r11d, esi, 17 mov edi, dword ptr [rbx - 12] mov eax, dword ptr [rbx - 16] shld esi, edi, 12 mov edx, dword ptr [rbx - 24] mov ecx, dword ptr [rbx - 20] shld edi, eax, 7 shrd edx, ecx, 8 shld eax, ecx, 2 vmovd xmm1, esi vpinsrd xmm1, xmm1, r11d, 1 vmovd xmm2, edx vpinsrd xmm1, xmm1, r10d, 2 vpinsrd xmm2, xmm2, ecx, 1 vpinsrd xmm1, xmm1, r9d, 3 vpinsrd xmm2, xmm2, eax, 2 vpinsrd xmm2, xmm2, edi, 3 vinserti128 ymm1, ymm2, xmm1, 1 vpsrlvd ymm1, ymm1, ymm8 vpand ymm1, ymm1, ymm0 vmovdqu ymmword ptr [r15], ymm1 sub r15, -128 add rbx, 108 add r8, -1 jne .LBB0_83 jmp .LBB0_147 .LBB0_20: cmp ecx, 6 je .LBB0_135 # %bb.21: cmp ecx, 7 jne .LBB0_147 # %bb.22: cmp edx, 32 jl .LBB0_147 # %bb.23: mov r8d, r14d add r15, 96 add rbx, 24 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_112] # ymm0 = [545460846719,545460846719,545460846719,545460846719] vmovdqa ymm1, ymmword ptr [rip + .LCPI0_111] # ymm1 = [0,7,14,21,0,3,10,17] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_113] # ymm2 = [24,0,6,13,20,0,2,9] vmovdqa ymm3, ymmword ptr [rip + 
.LCPI0_114] # ymm3 = [16,23,0,5,12,19,0,1] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_115] # ymm4 = [8,15,22,0,4,11,18,25] .p2align 4, 0x90 .LBB0_24: # =>This Inner Loop Header: Depth=1 mov ecx, dword ptr [rbx - 24] mov edx, dword ptr [rbx - 20] mov esi, edx shld esi, ecx, 4 vmovd xmm5, ecx vmovd xmm6, esi vpinsrd xmm6, xmm6, edx, 1 vpinsrd xmm6, xmm6, edx, 2 vpinsrd xmm6, xmm6, edx, 3 vpbroadcastd xmm5, xmm5 vinserti128 ymm5, ymm5, xmm6, 1 vpsrlvd ymm5, ymm5, ymm1 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm5 mov ecx, dword ptr [rbx - 12] mov edx, dword ptr [rbx - 20] mov esi, dword ptr [rbx - 16] mov edi, ecx shld edi, esi, 5 mov eax, esi shld eax, edx, 1 vmovd xmm5, esi vpinsrd xmm5, xmm5, edi, 1 vpinsrd xmm5, xmm5, ecx, 2 vpinsrd xmm5, xmm5, ecx, 3 vmovd xmm6, edx vpinsrd xmm6, xmm6, eax, 1 vpinsrd xmm6, xmm6, esi, 2 vpinsrd xmm6, xmm6, esi, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm2 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm5 mov eax, dword ptr [rbx - 4] mov ecx, dword ptr [rbx - 12] mov edx, dword ptr [rbx - 8] mov esi, eax shld esi, edx, 6 mov edi, edx shld edi, ecx, 2 vmovd xmm5, edx vpinsrd xmm5, xmm5, edx, 1 vpinsrd xmm5, xmm5, esi, 2 vpinsrd xmm5, xmm5, eax, 3 vmovd xmm6, ecx vpinsrd xmm6, xmm6, ecx, 1 vpinsrd xmm6, xmm6, edi, 2 vpinsrd xmm6, xmm6, edx, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm3 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm5 mov eax, dword ptr [rbx - 4] mov ecx, dword ptr [rbx] mov edx, ecx shld edx, eax, 3 vmovd xmm5, ecx vmovd xmm6, eax vpinsrd xmm6, xmm6, eax, 1 vpinsrd xmm6, xmm6, eax, 2 vpinsrd xmm6, xmm6, edx, 3 vpbroadcastd xmm5, xmm5 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm4 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15], ymm5 sub r15, -128 add rbx, 28 add r8, -1 jne .LBB0_24 jmp .LBB0_147 .LBB0_67: cmp ecx, 22 je .LBB0_111 # %bb.68: cmp ecx, 23 jne .LBB0_147 # %bb.69: cmp edx, 32 jl .LBB0_147 # %bb.70: mov r8d, r14d add r15, 96 add rbx, 88 
vmovdqa ymm8, ymmword ptr [rip + .LCPI0_48] # ymm8 = [0,0,0,5,0,0,0,1] vpbroadcastq ymm1, qword ptr [rip + .LCPI0_49] # ymm1 = [36028792732385279,36028792732385279,36028792732385279,36028792732385279] vmovdqa xmm2, xmmword ptr [rip + .LCPI0_50] # xmm2 = <24,15,u,u> vmovdqa xmm3, xmmword ptr [rip + .LCPI0_51] # xmm3 = <8,17,u,u> vmovdqa ymm4, ymmword ptr [rip + .LCPI0_52] # ymm4 = [0,0,6,0,0,0,2,0] vmovdqa ymm5, ymmword ptr [rip + .LCPI0_53] # ymm5 = [0,7,0,0,0,3,0,0] vmovdqa ymm6, ymmword ptr [rip + .LCPI0_54] # ymm6 = [8,0,0,0,4,0,0,9] .p2align 4, 0x90 .LBB0_71: # =>This Inner Loop Header: Depth=1 mov r9d, dword ptr [rbx - 68] mov edx, dword ptr [rbx - 72] mov r11d, r9d shld r11d, edx, 22 mov edi, dword ptr [rbx - 76] shld edx, edi, 13 mov esi, dword ptr [rbx - 80] shld edi, esi, 4 mov r10d, dword ptr [rbx - 88] mov ecx, dword ptr [rbx - 84] mov eax, esi shld eax, ecx, 18 shld ecx, r10d, 9 vmovd xmm7, r10d vmovd xmm0, edi vpinsrd xmm7, xmm7, ecx, 1 vpinsrd xmm0, xmm0, edx, 1 vpinsrd xmm7, xmm7, eax, 2 vpinsrd xmm0, xmm0, r11d, 2 vpinsrd xmm7, xmm7, esi, 3 vpinsrd xmm0, xmm0, r9d, 3 vinserti128 ymm0, ymm7, xmm0, 1 vpsrlvd ymm0, ymm0, ymm8 vpand ymm0, ymm0, ymm1 vmovdqu ymmword ptr [r15 - 96], ymm0 mov eax, dword ptr [rbx - 48] mov r9d, dword ptr [rbx - 44] shld r9d, eax, 7 mov edx, dword ptr [rbx - 52] mov esi, eax shld esi, edx, 21 mov edi, dword ptr [rbx - 60] mov ecx, dword ptr [rbx - 56] shld edx, ecx, 12 shld ecx, edi, 3 vmovq xmm0, qword ptr [rbx - 68] # xmm0 = mem[0],zero vpsrlvd xmm7, xmm0, xmm2 vpshufd xmm0, xmm0, 229 # xmm0 = xmm0[1,1,2,3] vpinsrd xmm0, xmm0, edi, 1 vpsllvd xmm0, xmm0, xmm3 vpor xmm0, xmm7, xmm0 vmovd xmm7, edx vpinsrd xmm7, xmm7, esi, 1 vpinsrd xmm7, xmm7, eax, 2 vpinsrd xmm7, xmm7, r9d, 3 vpinsrd xmm0, xmm0, edi, 2 vpinsrd xmm0, xmm0, ecx, 3 vinserti128 ymm0, ymm0, xmm7, 1 vpsrlvd ymm0, ymm0, ymm4 vpand ymm0, ymm0, ymm1 vmovdqu ymmword ptr [r15 - 64], ymm0 mov r11d, dword ptr [rbx - 24] mov r9d, dword ptr [rbx - 20] shld r9d, r11d, 15 
mov r10d, dword ptr [rbx - 28] shld r11d, r10d, 6 mov esi, dword ptr [rbx - 32] mov edi, r10d mov ecx, dword ptr [rbx - 36] shld edi, esi, 20 mov edx, dword ptr [rbx - 44] mov eax, dword ptr [rbx - 40] shld esi, ecx, 11 shrd edx, eax, 16 shld ecx, eax, 2 vmovd xmm0, edi vpinsrd xmm0, xmm0, r10d, 1 vmovd xmm7, edx vpinsrd xmm0, xmm0, r11d, 2 vpinsrd xmm7, xmm7, eax, 1 vpinsrd xmm0, xmm0, r9d, 3 vpinsrd xmm7, xmm7, ecx, 2 vpinsrd xmm7, xmm7, esi, 3 vinserti128 ymm0, ymm7, xmm0, 1 vpsrlvd ymm0, ymm0, ymm5 vpand ymm0, ymm0, ymm1 vmovdqu ymmword ptr [r15 - 32], ymm0 mov r9d, dword ptr [rbx] mov ecx, dword ptr [rbx - 4] mov edx, r9d shld edx, ecx, 14 mov esi, dword ptr [rbx - 8] shld ecx, esi, 5 mov edi, dword ptr [rbx - 12] vmovd xmm0, esi shld esi, edi, 19 mov r10d, dword ptr [rbx - 20] mov eax, dword ptr [rbx - 16] shld edi, eax, 10 shld eax, r10d, 1 vpinsrd xmm0, xmm0, ecx, 1 vmovd xmm7, r10d vpinsrd xmm0, xmm0, edx, 2 vpinsrd xmm7, xmm7, eax, 1 vpinsrd xmm0, xmm0, r9d, 3 vpinsrd xmm7, xmm7, edi, 2 vpinsrd xmm7, xmm7, esi, 3 vinserti128 ymm0, ymm7, xmm0, 1 vpsrlvd ymm0, ymm0, ymm6 vpand ymm0, ymm0, ymm1 vmovdqu ymmword ptr [r15], ymm0 sub r15, -128 add rbx, 92 add r8, -1 jne .LBB0_71 jmp .LBB0_147 .LBB0_43: cmp ecx, 14 je .LBB0_123 # %bb.44: cmp ecx, 15 jne .LBB0_147 # %bb.45: cmp edx, 32 jl .LBB0_147 # %bb.46: mov r8d, r14d add r15, 96 add rbx, 56 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_82] # ymm0 = [140733193420799,140733193420799,140733193420799,140733193420799] vmovdqa ymm1, ymmword ptr [rip + .LCPI0_81] # ymm1 = [0,15,0,13,0,11,0,9] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_83] # ymm2 = [0,7,0,5,0,3,0,1] vmovdqa ymm3, ymmword ptr [rip + .LCPI0_84] # ymm3 = [16,0,14,0,12,0,10,0] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_85] # ymm4 = [8,0,6,0,4,0,2,17] .p2align 4, 0x90 .LBB0_47: # =>This Inner Loop Header: Depth=1 mov r9d, dword ptr [rbx - 44] mov eax, dword ptr [rbx - 48] mov esi, r9d shld esi, eax, 6 mov r10d, dword ptr [rbx - 52] mov edx, eax shld edx, r10d, 4 
mov ecx, dword ptr [rbx - 56] mov edi, r10d shld edi, ecx, 2 vmovd xmm5, edx vpinsrd xmm5, xmm5, eax, 1 vpinsrd xmm5, xmm5, esi, 2 vpinsrd xmm5, xmm5, r9d, 3 vmovd xmm6, ecx vpinsrd xmm6, xmm6, ecx, 1 vpinsrd xmm6, xmm6, edi, 2 vpinsrd xmm6, xmm6, r10d, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm1 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm5 mov r9d, dword ptr [rbx - 28] mov r11d, dword ptr [rbx - 32] mov edx, r9d shld edx, r11d, 14 mov r10d, dword ptr [rbx - 36] mov edi, r11d shld edi, r10d, 12 mov eax, dword ptr [rbx - 44] mov esi, dword ptr [rbx - 40] mov ecx, r10d shld ecx, esi, 10 shrd eax, esi, 24 vmovd xmm5, edi vpinsrd xmm5, xmm5, r11d, 1 vpinsrd xmm5, xmm5, edx, 2 vpinsrd xmm5, xmm5, r9d, 3 vmovd xmm6, eax vpinsrd xmm6, xmm6, esi, 1 vpinsrd xmm6, xmm6, ecx, 2 vpinsrd xmm6, xmm6, r10d, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm2 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm5 mov eax, dword ptr [rbx - 16] mov r10d, dword ptr [rbx - 12] shld r10d, eax, 7 mov edx, dword ptr [rbx - 20] mov esi, eax shld esi, edx, 5 mov r9d, dword ptr [rbx - 28] mov ecx, dword ptr [rbx - 24] mov edi, ecx shld edi, r9d, 1 vmovd xmm5, edx shld edx, ecx, 3 vpinsrd xmm5, xmm5, esi, 1 vpinsrd xmm5, xmm5, eax, 2 vpinsrd xmm5, xmm5, r10d, 3 vmovd xmm6, r9d vpinsrd xmm6, xmm6, edi, 1 vpinsrd xmm6, xmm6, ecx, 2 vpinsrd xmm6, xmm6, edx, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm3 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm5 mov r9d, dword ptr [rbx] mov ecx, dword ptr [rbx - 4] mov edx, r9d shld edx, ecx, 13 mov eax, dword ptr [rbx - 8] vmovd xmm5, ecx shld ecx, eax, 11 mov edi, dword ptr [rbx - 12] mov esi, eax shld esi, edi, 9 vmovd xmm6, edi vpinsrd xmm6, xmm6, esi, 1 vpinsrd xmm6, xmm6, eax, 2 vpinsrd xmm6, xmm6, ecx, 3 vpinsrd xmm5, xmm5, edx, 1 vpinsrd xmm5, xmm5, r9d, 2 vpinsrd xmm5, xmm5, r9d, 3 vinserti128 ymm5, ymm6, xmm5, 1 vpsrlvd ymm5, ymm5, ymm4 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr 
[r15], ymm5 sub r15, -128 add rbx, 60 add r8, -1 jne .LBB0_47 jmp .LBB0_147 .LBB0_96: cmp edx, 32 jl .LBB0_147 # %bb.97: mov r8d, r14d vpbroadcastq ymm0, qword ptr [rip + .LCPI0_0] # ymm0 = [9223372034707292159,9223372034707292159,9223372034707292159,9223372034707292159] add r15, 96 vmovdqa ymm8, ymmword ptr [rip + .LCPI0_1] # ymm8 = [24,23,22,21,20,19,18,17] vmovdqa ymm9, ymmword ptr [rip + .LCPI0_2] # ymm9 = [8,9,10,11,12,13,14,15] vmovdqa ymm10, ymmword ptr [rip + .LCPI0_3] # ymm10 = [16,15,14,13,12,11,10,9] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_4] # ymm4 = [16,17,18,19,20,21,22,23] vmovdqa xmm5, xmmword ptr [rip + .LCPI0_5] # xmm5 = [8,7,6,5] vmovdqa xmm6, xmmword ptr [rip + .LCPI0_6] # xmm6 = [24,25,26,27] vmovdqa ymm7, ymmword ptr [rip + .LCPI0_7] # ymm7 = [0,0,0,0,0,0,0,1] .p2align 4, 0x90 .LBB0_98: # =>This Inner Loop Header: Depth=1 mov r10d, dword ptr [rbx + 24] mov r9d, dword ptr [rbx + 28] shld r9d, r10d, 7 mov esi, dword ptr [rbx + 20] shld r10d, esi, 6 mov edi, dword ptr [rbx + 16] shld esi, edi, 5 mov eax, dword ptr [rbx + 12] shld edi, eax, 4 mov edx, dword ptr [rbx + 8] shld eax, edx, 3 mov ecx, dword ptr [rbx + 4] shld edx, ecx, 2 mov r11d, dword ptr [rbx] shld ecx, r11d, 1 vmovd xmm1, edi vpinsrd xmm1, xmm1, esi, 1 vpinsrd xmm1, xmm1, r10d, 2 vpinsrd xmm1, xmm1, r9d, 3 vmovd xmm2, r11d vpinsrd xmm2, xmm2, ecx, 1 vpinsrd xmm2, xmm2, edx, 2 vpinsrd xmm2, xmm2, eax, 3 vinserti128 ymm1, ymm2, xmm1, 1 vpand ymm1, ymm1, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm1 vmovdqu ymm1, ymmword ptr [rbx + 28] vpsrlvd ymm1, ymm1, ymm8 vmovdqu xmm2, xmmword ptr [rbx + 44] vpshufd xmm3, xmm2, 249 # xmm3 = xmm2[1,2,3,3] vpinsrd xmm3, xmm3, dword ptr [rbx + 60], 3 vpalignr xmm2, xmm2, xmmword ptr [rbx + 28], 4 # xmm2 = mem[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] vinserti128 ymm2, ymm2, xmm3, 1 vpsllvd ymm2, ymm2, ymm9 vpor ymm1, ymm1, ymm2 vpand ymm1, ymm1, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm1 vmovdqu ymm1, ymmword ptr [rbx + 60] vmovdqu xmm2, xmmword ptr 
[rbx + 76] vpshufd xmm3, xmm2, 249 # xmm3 = xmm2[1,2,3,3] vpinsrd xmm3, xmm3, dword ptr [rbx + 92], 3 vpsrlvd ymm1, ymm1, ymm10 vpalignr xmm2, xmm2, xmmword ptr [rbx + 60], 4 # xmm2 = mem[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] vinserti128 ymm2, ymm2, xmm3, 1 vpsllvd ymm2, ymm2, ymm4 vpor ymm1, ymm1, ymm2 vpand ymm1, ymm1, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm1 mov eax, dword ptr [rbx + 120] mov ecx, dword ptr [rbx + 116] mov edx, eax shld edx, ecx, 30 mov esi, dword ptr [rbx + 112] shld ecx, esi, 29 mov edi, dword ptr [rbx + 108] shld esi, edi, 28 vmovdqu xmm1, xmmword ptr [rbx + 92] vpsrlvd xmm2, xmm1, xmm5 vpshufd xmm1, xmm1, 249 # xmm1 = xmm1[1,2,3,3] vpinsrd xmm1, xmm1, edi, 3 vpsllvd xmm1, xmm1, xmm6 vmovd xmm3, esi vpinsrd xmm3, xmm3, ecx, 1 vpinsrd xmm3, xmm3, edx, 2 vpinsrd xmm3, xmm3, eax, 3 vpor xmm1, xmm2, xmm1 vinserti128 ymm1, ymm1, xmm3, 1 vpsrlvd ymm1, ymm1, ymm7 vpand ymm1, ymm1, ymm0 vmovdqu ymmword ptr [r15], ymm1 add rbx, 124 sub r15, -128 add r8, -1 jne .LBB0_98 jmp .LBB0_147 .LBB0_144: cmp edx, 32 jl .LBB0_147 # %bb.145: mov ebx, r14d .p2align 4, 0x90 .LBB0_146: # =>This Inner Loop Header: Depth=1 mov edx, 128 mov rdi, r15 xor esi, esi call clib·_memset(SB) sub r15, -128 add rbx, -1 jne .LBB0_146 jmp .LBB0_147 .LBB0_120: cmp edx, 32 jl .LBB0_147 # %bb.121: mov eax, r14d xor ecx, ecx vpbroadcastq ymm0, qword ptr [rip + .LCPI0_80] # ymm0 = [68719476736,68719476736,68719476736,68719476736] vpxor xmm1, xmm1, xmm1 .p2align 4, 0x90 .LBB0_122: # =>This Inner Loop Header: Depth=1 vmovdqu xmm2, xmmword ptr [rbx + rcx] vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3] vpshufd ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1,4,4,5,5] vpsrlvd ymm2, ymm2, ymm0 vpblendw ymm2, ymm2, ymm1, 170 # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] vmovdqu ymmword ptr [r15 + 2*rcx], ymm2 vmovdqu xmm2, xmmword ptr [rbx + rcx + 16] vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3] 
vpshufd ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1,4,4,5,5] vpsrlvd ymm2, ymm2, ymm0 vpblendw ymm2, ymm2, ymm1, 170 # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] vmovdqu ymmword ptr [r15 + 2*rcx + 32], ymm2 vmovdqu xmm2, xmmword ptr [rbx + rcx + 32] vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3] vpshufd ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1,4,4,5,5] vpsrlvd ymm2, ymm2, ymm0 vpblendw ymm2, ymm2, ymm1, 170 # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] vmovdqu ymmword ptr [r15 + 2*rcx + 64], ymm2 vmovdqu xmm2, xmmword ptr [rbx + rcx + 48] vpermq ymm2, ymm2, 216 # ymm2 = ymm2[0,2,1,3] vpshufd ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1,4,4,5,5] vpsrlvd ymm2, ymm2, ymm0 vpblendw ymm2, ymm2, ymm1, 170 # ymm2 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4],ymm1[5],ymm2[6],ymm1[7],ymm2[8],ymm1[9],ymm2[10],ymm1[11],ymm2[12],ymm1[13],ymm2[14],ymm1[15] vmovdqu ymmword ptr [r15 + 2*rcx + 96], ymm2 add rcx, 64 add rax, -1 jne .LBB0_122 jmp .LBB0_147 .LBB0_132: cmp edx, 32 jl .LBB0_147 # %bb.133: mov eax, r14d xor ecx, ecx vbroadcasti128 ymm0, xmmword ptr [rip + .LCPI0_109] # ymm0 = [0,8,16,24,0,8,16,24] # ymm0 = mem[0,1,0,1] vpbroadcastd ymm1, dword ptr [rip + .LCPI0_110] # ymm1 = [255,255,255,255,255,255,255,255] .p2align 4, 0x90 .LBB0_134: # =>This Inner Loop Header: Depth=1 vmovq xmm2, qword ptr [rbx + rcx] # xmm2 = mem[0],zero vpshufd xmm2, xmm2, 80 # xmm2 = xmm2[0,0,1,1] vpermq ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1] vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 + 4*rcx], ymm2 vmovq xmm2, qword ptr [rbx + rcx + 8] # xmm2 = mem[0],zero vpshufd xmm2, xmm2, 80 # xmm2 = xmm2[0,0,1,1] vpermq ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1] vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 + 4*rcx + 32], ymm2 vmovq xmm2, qword ptr [rbx + rcx + 16] # xmm2 = mem[0],zero 
vpshufd xmm2, xmm2, 80 # xmm2 = xmm2[0,0,1,1] vpermq ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1] vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 + 4*rcx + 64], ymm2 vmovq xmm2, qword ptr [rbx + rcx + 24] # xmm2 = mem[0],zero vpshufd xmm2, xmm2, 80 # xmm2 = xmm2[0,0,1,1] vpermq ymm2, ymm2, 80 # ymm2 = ymm2[0,0,1,1] vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 + 4*rcx + 96], ymm2 add rcx, 32 add rax, -1 jne .LBB0_134 jmp .LBB0_147 .LBB0_108: cmp edx, 32 jl .LBB0_147 # %bb.109: mov r8d, r14d add r15, 96 add rbx, 92 vbroadcasti128 ymm0, xmmword ptr [rip + .LCPI0_46] # ymm0 = [0,0,0,8,0,0,0,8] # ymm0 = mem[0,1,0,1] vpbroadcastd ymm1, dword ptr [rip + .LCPI0_47] # ymm1 = [16777215,16777215,16777215,16777215,16777215,16777215,16777215,16777215] .p2align 4, 0x90 .LBB0_110: # =>This Inner Loop Header: Depth=1 mov r9d, dword ptr [rbx - 72] mov edx, dword ptr [rbx - 76] mov esi, r9d mov edi, dword ptr [rbx - 80] mov r10d, dword ptr [rbx - 84] shld esi, edx, 16 mov r11d, dword ptr [rbx - 92] mov eax, dword ptr [rbx - 88] shld edx, edi, 8 mov ecx, r10d shld ecx, eax, 16 shld eax, r11d, 8 vmovd xmm2, edi vmovd xmm3, r11d vpinsrd xmm2, xmm2, edx, 1 vpinsrd xmm3, xmm3, eax, 1 vpinsrd xmm2, xmm2, esi, 2 vpinsrd xmm3, xmm3, ecx, 2 vpinsrd xmm2, xmm2, r9d, 3 vpinsrd xmm3, xmm3, r10d, 3 vinserti128 ymm2, ymm3, xmm2, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 - 96], ymm2 mov r9d, dword ptr [rbx - 48] mov ecx, dword ptr [rbx - 52] mov edx, r9d mov esi, dword ptr [rbx - 56] mov r10d, dword ptr [rbx - 60] shld edx, ecx, 16 mov r11d, dword ptr [rbx - 68] mov edi, dword ptr [rbx - 64] shld ecx, esi, 8 mov eax, r10d shld eax, edi, 16 shld edi, r11d, 8 vmovd xmm2, esi vmovd xmm3, r11d vpinsrd xmm2, xmm2, ecx, 1 vpinsrd xmm3, xmm3, edi, 1 vpinsrd xmm2, xmm2, edx, 2 vpinsrd xmm3, xmm3, eax, 2 vpinsrd xmm2, xmm2, r9d, 3 vpinsrd xmm3, xmm3, r10d, 3 vinserti128 ymm2, ymm3, xmm2, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, 
ymm2, ymm1 vmovdqu ymmword ptr [r15 - 64], ymm2 mov r9d, dword ptr [rbx - 24] mov ecx, dword ptr [rbx - 28] mov edx, r9d mov esi, dword ptr [rbx - 32] mov r10d, dword ptr [rbx - 36] shld edx, ecx, 16 mov r11d, dword ptr [rbx - 44] mov edi, dword ptr [rbx - 40] shld ecx, esi, 8 mov eax, r10d shld eax, edi, 16 shld edi, r11d, 8 vmovd xmm2, esi vmovd xmm3, r11d vpinsrd xmm2, xmm2, ecx, 1 vpinsrd xmm3, xmm3, edi, 1 vpinsrd xmm2, xmm2, edx, 2 vpinsrd xmm3, xmm3, eax, 2 vpinsrd xmm2, xmm2, r9d, 3 vpinsrd xmm3, xmm3, r10d, 3 vinserti128 ymm2, ymm3, xmm2, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 - 32], ymm2 mov r9d, dword ptr [rbx] mov ecx, dword ptr [rbx - 4] mov edx, r9d mov esi, dword ptr [rbx - 8] mov r10d, dword ptr [rbx - 12] shld edx, ecx, 16 mov r11d, dword ptr [rbx - 20] mov edi, dword ptr [rbx - 16] shld ecx, esi, 8 mov eax, r10d shld eax, edi, 16 shld edi, r11d, 8 vmovd xmm2, esi vpinsrd xmm2, xmm2, ecx, 1 vmovd xmm3, r11d vpinsrd xmm2, xmm2, edx, 2 vpinsrd xmm3, xmm3, edi, 1 vpinsrd xmm2, xmm2, r9d, 3 vpinsrd xmm3, xmm3, eax, 2 vpinsrd xmm3, xmm3, r10d, 3 vinserti128 ymm2, ymm3, xmm2, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15], ymm2 sub r15, -128 add rbx, 96 add r8, -1 jne .LBB0_110 jmp .LBB0_147 .LBB0_138: cmp edx, 32 jl .LBB0_147 # %bb.139: mov eax, r14d xor ecx, ecx vmovdqa ymm0, ymmword ptr [rip + .LCPI0_124] # ymm0 = [0,4,8,12,16,20,24,28] vpbroadcastq ymm1, qword ptr [rip + .LCPI0_125] # ymm1 = [64424509455,64424509455,64424509455,64424509455] .p2align 4, 0x90 .LBB0_140: # =>This Inner Loop Header: Depth=1 vpbroadcastd ymm2, dword ptr [rbx + rcx] vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 + 8*rcx], ymm2 vpbroadcastd ymm2, dword ptr [rbx + rcx + 4] vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 + 8*rcx + 32], ymm2 vpbroadcastd ymm2, dword ptr [rbx + rcx + 8] vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 + 
8*rcx + 64], ymm2 vpbroadcastd ymm2, dword ptr [rbx + rcx + 12] vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 + 8*rcx + 96], ymm2 add rcx, 16 add rax, -1 jne .LBB0_140 jmp .LBB0_147 .LBB0_114: cmp edx, 32 jl .LBB0_147 # %bb.115: mov r8d, r14d add r15, 96 add rbx, 76 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_65] # ymm0 = [0,0,8,0,0,4,0,12] vpbroadcastq ymm1, qword ptr [rip + .LCPI0_66] # ymm1 = [4503595333451775,4503595333451775,4503595333451775,4503595333451775] .p2align 4, 0x90 .LBB0_116: # =>This Inner Loop Header: Depth=1 mov r9d, dword ptr [rbx - 60] mov r11d, dword ptr [rbx - 64] mov esi, r9d shld esi, r11d, 8 mov edi, dword ptr [rbx - 68] mov edx, r11d shld edx, edi, 16 mov eax, dword ptr [rbx - 72] shld edi, eax, 4 mov r10d, dword ptr [rbx - 76] mov ecx, eax shld ecx, r10d, 12 vmovd xmm2, edx vpinsrd xmm2, xmm2, r11d, 1 vpinsrd xmm2, xmm2, esi, 2 vpinsrd xmm2, xmm2, r9d, 3 vmovd xmm3, r10d vpinsrd xmm3, xmm3, ecx, 1 vpinsrd xmm3, xmm3, eax, 2 vpinsrd xmm3, xmm3, edi, 3 vinserti128 ymm2, ymm3, xmm2, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 - 96], ymm2 mov r9d, dword ptr [rbx - 40] mov r11d, dword ptr [rbx - 44] mov edx, r9d shld edx, r11d, 8 mov esi, dword ptr [rbx - 48] mov edi, r11d shld edi, esi, 16 mov r10d, dword ptr [rbx - 56] mov ecx, dword ptr [rbx - 52] shld esi, ecx, 4 mov eax, ecx shld eax, r10d, 12 vmovd xmm2, edi vpinsrd xmm2, xmm2, r11d, 1 vpinsrd xmm2, xmm2, edx, 2 vpinsrd xmm2, xmm2, r9d, 3 vmovd xmm3, r10d vpinsrd xmm3, xmm3, eax, 1 vpinsrd xmm3, xmm3, ecx, 2 vpinsrd xmm3, xmm3, esi, 3 vinserti128 ymm2, ymm3, xmm2, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 - 64], ymm2 mov r9d, dword ptr [rbx - 20] mov r11d, dword ptr [rbx - 24] mov edx, r9d shld edx, r11d, 8 mov esi, dword ptr [rbx - 28] mov edi, r11d shld edi, esi, 16 mov ecx, dword ptr [rbx - 32] shld esi, ecx, 4 mov r10d, dword ptr [rbx - 36] mov eax, ecx shld eax, r10d, 12 vmovd xmm2, edi vpinsrd 
xmm2, xmm2, r11d, 1 vpinsrd xmm2, xmm2, edx, 2 vpinsrd xmm2, xmm2, r9d, 3 vmovd xmm3, r10d vpinsrd xmm3, xmm3, eax, 1 vpinsrd xmm3, xmm3, ecx, 2 vpinsrd xmm3, xmm3, esi, 3 vinserti128 ymm2, ymm3, xmm2, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 - 32], ymm2 mov r9d, dword ptr [rbx] mov r11d, dword ptr [rbx - 4] mov edx, r9d shld edx, r11d, 8 mov esi, dword ptr [rbx - 8] mov edi, r11d shld edi, esi, 16 mov r10d, dword ptr [rbx - 16] mov ecx, dword ptr [rbx - 12] shld esi, ecx, 4 mov eax, ecx shld eax, r10d, 12 vmovd xmm2, edi vpinsrd xmm2, xmm2, r11d, 1 vpinsrd xmm2, xmm2, edx, 2 vpinsrd xmm2, xmm2, r9d, 3 vmovd xmm3, r10d vpinsrd xmm3, xmm3, eax, 1 vpinsrd xmm3, xmm3, ecx, 2 vpinsrd xmm3, xmm3, esi, 3 vinserti128 ymm2, ymm3, xmm2, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15], ymm2 sub r15, -128 add rbx, 80 add r8, -1 jne .LBB0_116 jmp .LBB0_147 .LBB0_126: cmp edx, 32 jl .LBB0_147 # %bb.127: mov r8d, r14d add r15, 96 add rbx, 44 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_94] # ymm0 = [0,12,0,4,16,0,8,20] vpbroadcastq ymm1, qword ptr [rip + .LCPI0_95] # ymm1 = [17587891081215,17587891081215,17587891081215,17587891081215] .p2align 4, 0x90 .LBB0_128: # =>This Inner Loop Header: Depth=1 mov ecx, dword ptr [rbx - 36] mov edx, dword ptr [rbx - 44] mov esi, dword ptr [rbx - 40] mov edi, ecx shld edi, esi, 4 mov eax, esi shld eax, edx, 8 vmovd xmm2, esi vpinsrd xmm2, xmm2, edi, 1 vpinsrd xmm2, xmm2, ecx, 2 vpinsrd xmm2, xmm2, ecx, 3 vmovd xmm3, edx vpinsrd xmm3, xmm3, edx, 1 vpinsrd xmm3, xmm3, eax, 2 vpinsrd xmm3, xmm3, esi, 3 vinserti128 ymm2, ymm3, xmm2, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 - 96], ymm2 mov eax, dword ptr [rbx - 24] mov ecx, dword ptr [rbx - 32] mov edx, dword ptr [rbx - 28] mov esi, eax shld esi, edx, 4 mov edi, edx shld edi, ecx, 8 vmovd xmm2, edx vpinsrd xmm2, xmm2, esi, 1 vpinsrd xmm2, xmm2, eax, 2 vpinsrd xmm2, xmm2, eax, 3 vmovd xmm3, ecx vpinsrd xmm3, 
xmm3, ecx, 1 vpinsrd xmm3, xmm3, edi, 2 vpinsrd xmm3, xmm3, edx, 3 vinserti128 ymm2, ymm3, xmm2, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 - 64], ymm2 mov eax, dword ptr [rbx - 12] mov ecx, dword ptr [rbx - 20] mov edx, dword ptr [rbx - 16] mov esi, eax shld esi, edx, 4 mov edi, edx shld edi, ecx, 8 vmovd xmm2, edx vpinsrd xmm2, xmm2, esi, 1 vpinsrd xmm2, xmm2, eax, 2 vpinsrd xmm2, xmm2, eax, 3 vmovd xmm3, ecx vpinsrd xmm3, xmm3, ecx, 1 vpinsrd xmm3, xmm3, edi, 2 vpinsrd xmm3, xmm3, edx, 3 vinserti128 ymm2, ymm3, xmm2, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 - 32], ymm2 mov eax, dword ptr [rbx] mov ecx, dword ptr [rbx - 8] mov edx, dword ptr [rbx - 4] mov esi, eax shld esi, edx, 4 mov edi, edx shld edi, ecx, 8 vmovd xmm2, edx vpinsrd xmm2, xmm2, esi, 1 vpinsrd xmm2, xmm2, eax, 2 vpinsrd xmm2, xmm2, eax, 3 vmovd xmm3, ecx vpinsrd xmm3, xmm3, ecx, 1 vpinsrd xmm3, xmm3, edi, 2 vpinsrd xmm3, xmm3, edx, 3 vinserti128 ymm2, ymm3, xmm2, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15], ymm2 sub r15, -128 add rbx, 48 add r8, -1 jne .LBB0_128 jmp .LBB0_147 .LBB0_102: cmp edx, 32 jl .LBB0_147 # %bb.103: mov r8d, r14d add r15, 96 add rbx, 108 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_22] # ymm0 = [0,0,0,0,0,0,0,4] vpbroadcastq ymm1, qword ptr [rip + .LCPI0_23] # ymm1 = [1152921500580315135,1152921500580315135,1152921500580315135,1152921500580315135] .p2align 4, 0x90 .LBB0_104: # =>This Inner Loop Header: Depth=1 mov r9d, dword ptr [rbx - 84] mov edx, dword ptr [rbx - 88] mov r10d, r9d shld r10d, edx, 24 mov edi, dword ptr [rbx - 92] shld edx, edi, 20 mov eax, dword ptr [rbx - 96] shld edi, eax, 16 mov ecx, dword ptr [rbx - 100] shld eax, ecx, 12 mov r11d, dword ptr [rbx - 108] mov esi, dword ptr [rbx - 104] shld ecx, esi, 8 shld esi, r11d, 4 vmovd xmm2, r11d vmovd xmm3, edi vpinsrd xmm2, xmm2, esi, 1 vpinsrd xmm3, xmm3, edx, 1 vpinsrd xmm2, xmm2, ecx, 2 vpinsrd xmm3, xmm3, r10d, 2 
vpinsrd xmm2, xmm2, eax, 3 vpinsrd xmm3, xmm3, r9d, 3 vinserti128 ymm2, ymm2, xmm3, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 - 96], ymm2 mov r9d, dword ptr [rbx - 56] mov ecx, dword ptr [rbx - 60] mov r10d, r9d shld r10d, ecx, 24 mov esi, dword ptr [rbx - 64] shld ecx, esi, 20 mov edi, dword ptr [rbx - 68] shld esi, edi, 16 mov eax, dword ptr [rbx - 72] shld edi, eax, 12 mov r11d, dword ptr [rbx - 80] mov edx, dword ptr [rbx - 76] shld eax, edx, 8 shld edx, r11d, 4 vmovd xmm2, r11d vmovd xmm3, esi vpinsrd xmm2, xmm2, edx, 1 vpinsrd xmm3, xmm3, ecx, 1 vpinsrd xmm2, xmm2, eax, 2 vpinsrd xmm3, xmm3, r10d, 2 vpinsrd xmm2, xmm2, edi, 3 vpinsrd xmm3, xmm3, r9d, 3 vinserti128 ymm2, ymm2, xmm3, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 - 64], ymm2 mov r9d, dword ptr [rbx - 28] mov ecx, dword ptr [rbx - 32] mov r10d, r9d shld r10d, ecx, 24 mov esi, dword ptr [rbx - 36] shld ecx, esi, 20 mov edi, dword ptr [rbx - 40] shld esi, edi, 16 mov eax, dword ptr [rbx - 44] shld edi, eax, 12 mov r11d, dword ptr [rbx - 52] mov edx, dword ptr [rbx - 48] shld eax, edx, 8 shld edx, r11d, 4 vmovd xmm2, r11d vmovd xmm3, esi vpinsrd xmm2, xmm2, edx, 1 vpinsrd xmm3, xmm3, ecx, 1 vpinsrd xmm2, xmm2, eax, 2 vpinsrd xmm3, xmm3, r10d, 2 vpinsrd xmm2, xmm2, edi, 3 vpinsrd xmm3, xmm3, r9d, 3 vinserti128 ymm2, ymm2, xmm3, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15 - 32], ymm2 mov r9d, dword ptr [rbx] mov ecx, dword ptr [rbx - 4] mov r10d, r9d shld r10d, ecx, 24 mov esi, dword ptr [rbx - 8] shld ecx, esi, 20 mov edi, dword ptr [rbx - 12] shld esi, edi, 16 mov eax, dword ptr [rbx - 16] shld edi, eax, 12 mov r11d, dword ptr [rbx - 24] mov edx, dword ptr [rbx - 20] shld eax, edx, 8 shld edx, r11d, 4 vmovd xmm2, r11d vmovd xmm3, esi vpinsrd xmm2, xmm2, edx, 1 vpinsrd xmm3, xmm3, ecx, 1 vpinsrd xmm2, xmm2, eax, 2 vpinsrd xmm3, xmm3, r10d, 2 vpinsrd xmm2, xmm2, edi, 3 vpinsrd xmm3, xmm3, r9d, 3 vinserti128 
ymm2, ymm2, xmm3, 1 vpsrlvd ymm2, ymm2, ymm0 vpand ymm2, ymm2, ymm1 vmovdqu ymmword ptr [r15], ymm2 sub r15, -128 add rbx, 112 add r8, -1 jne .LBB0_104 jmp .LBB0_147 .LBB0_141: cmp edx, 32 jl .LBB0_147 # %bb.142: mov eax, r14d add r15, 96 xor ecx, ecx vmovdqa ymm0, ymmword ptr [rip + .LCPI0_131] # ymm0 = [0,2,4,6,8,10,12,14] vpbroadcastq ymm1, qword ptr [rip + .LCPI0_132] # ymm1 = [12884901891,12884901891,12884901891,12884901891] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_133] # ymm2 = [16,18,20,22,24,26,28,30] .p2align 4, 0x90 .LBB0_143: # =>This Inner Loop Header: Depth=1 vpbroadcastd ymm3, dword ptr [rbx + 8*rcx] vpsrlvd ymm3, ymm3, ymm0 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 96], ymm3 vpbroadcastd ymm3, dword ptr [rbx + 8*rcx] vpsrlvd ymm3, ymm3, ymm2 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 64], ymm3 vpbroadcastd ymm3, dword ptr [rbx + 8*rcx + 4] vpsrlvd ymm3, ymm3, ymm0 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 32], ymm3 vpbroadcastd ymm3, dword ptr [rbx + 8*rcx + 4] vpsrlvd ymm3, ymm3, ymm2 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15], ymm3 add rcx, 1 sub r15, -128 cmp rax, rcx jne .LBB0_143 jmp .LBB0_147 .LBB0_117: cmp edx, 32 jl .LBB0_147 # %bb.118: mov r8d, r14d add r15, 96 add rbx, 68 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_72] # ymm0 = [0,0,4,0,8,0,12,0] vpbroadcastq ymm1, qword ptr [rip + .LCPI0_73] # ymm1 = [1125895612137471,1125895612137471,1125895612137471,1125895612137471] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_74] # ymm2 = [0,2,0,6,0,10,0,14] .p2align 4, 0x90 .LBB0_119: # =>This Inner Loop Header: Depth=1 mov ecx, dword ptr [rbx - 56] mov r10d, dword ptr [rbx - 52] shld r10d, ecx, 2 mov esi, dword ptr [rbx - 60] mov edi, ecx shld edi, esi, 6 mov r9d, dword ptr [rbx - 68] mov edx, dword ptr [rbx - 64] mov eax, edx shld eax, r9d, 14 vmovd xmm3, esi shld esi, edx, 10 vpinsrd xmm3, xmm3, edi, 1 vpinsrd xmm3, xmm3, ecx, 2 vpinsrd xmm3, xmm3, r10d, 3 vmovd xmm4, r9d vpinsrd xmm4, xmm4, eax, 1 vpinsrd xmm4, xmm4, edx, 2 
vpinsrd xmm4, xmm4, esi, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm0 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 96], ymm3 mov r9d, dword ptr [rbx - 36] mov r11d, dword ptr [rbx - 40] mov edx, r9d shld edx, r11d, 4 mov r10d, dword ptr [rbx - 44] mov edi, r11d shld edi, r10d, 8 mov eax, dword ptr [rbx - 52] mov esi, dword ptr [rbx - 48] mov ecx, r10d shld ecx, esi, 12 shrd eax, esi, 16 vmovd xmm3, edi vpinsrd xmm3, xmm3, r11d, 1 vpinsrd xmm3, xmm3, edx, 2 vpinsrd xmm3, xmm3, r9d, 3 vmovd xmm4, eax vpinsrd xmm4, xmm4, esi, 1 vpinsrd xmm4, xmm4, ecx, 2 vpinsrd xmm4, xmm4, r10d, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm2 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 64], ymm3 mov eax, dword ptr [rbx - 20] mov r10d, dword ptr [rbx - 16] shld r10d, eax, 2 mov edx, dword ptr [rbx - 24] mov esi, eax shld esi, edx, 6 mov r9d, dword ptr [rbx - 32] mov ecx, dword ptr [rbx - 28] mov edi, ecx shld edi, r9d, 14 vmovd xmm3, edx shld edx, ecx, 10 vpinsrd xmm3, xmm3, esi, 1 vpinsrd xmm3, xmm3, eax, 2 vpinsrd xmm3, xmm3, r10d, 3 vmovd xmm4, r9d vpinsrd xmm4, xmm4, edi, 1 vpinsrd xmm4, xmm4, ecx, 2 vpinsrd xmm4, xmm4, edx, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm0 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 32], ymm3 mov r9d, dword ptr [rbx] mov r11d, dword ptr [rbx - 4] mov edx, r9d shld edx, r11d, 4 mov r10d, dword ptr [rbx - 8] mov edi, r11d shld edi, r10d, 8 mov eax, dword ptr [rbx - 16] mov esi, dword ptr [rbx - 12] mov ecx, r10d shld ecx, esi, 12 shrd eax, esi, 16 vmovd xmm3, edi vpinsrd xmm3, xmm3, r11d, 1 vpinsrd xmm3, xmm3, edx, 2 vpinsrd xmm3, xmm3, r9d, 3 vmovd xmm4, eax vpinsrd xmm4, xmm4, esi, 1 vpinsrd xmm4, xmm4, ecx, 2 vpinsrd xmm4, xmm4, r10d, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm2 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15], ymm3 sub r15, -128 add rbx, 72 add r8, -1 jne .LBB0_119 jmp .LBB0_147 .LBB0_129: cmp edx, 32 jl .LBB0_147 # %bb.130: mov r8d, r14d add r15, 96 add rbx, 36 
vmovdqa ymm0, ymmword ptr [rip + .LCPI0_101] # ymm0 = [0,10,20,0,8,18,0,6] vpbroadcastq ymm1, qword ptr [rip + .LCPI0_102] # ymm1 = [4393751544831,4393751544831,4393751544831,4393751544831] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_103] # ymm2 = [16,0,4,14,0,2,12,22] .p2align 4, 0x90 .LBB0_131: # =>This Inner Loop Header: Depth=1 mov ecx, dword ptr [rbx - 28] mov edx, dword ptr [rbx - 36] mov esi, dword ptr [rbx - 32] mov edi, ecx shld edi, esi, 4 vmovd xmm3, esi vpinsrd xmm3, xmm3, esi, 1 shld esi, edx, 2 vpinsrd xmm3, xmm3, edi, 2 vpinsrd xmm3, xmm3, ecx, 3 vmovd xmm4, edx vpinsrd xmm4, xmm4, edx, 1 vpinsrd xmm4, xmm4, edx, 2 vpinsrd xmm4, xmm4, esi, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm0 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 96], ymm3 mov ecx, dword ptr [rbx - 20] mov edx, dword ptr [rbx - 24] mov esi, ecx shld esi, edx, 8 mov edi, dword ptr [rbx - 28] mov eax, edx shld eax, edi, 6 vmovd xmm3, esi vpinsrd xmm3, xmm3, ecx, 1 vpinsrd xmm3, xmm3, ecx, 2 vpinsrd xmm3, xmm3, ecx, 3 vmovd xmm4, edi vpinsrd xmm4, xmm4, eax, 1 vpinsrd xmm4, xmm4, edx, 2 vpinsrd xmm4, xmm4, edx, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm2 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 64], ymm3 mov eax, dword ptr [rbx - 8] mov ecx, dword ptr [rbx - 16] mov edx, dword ptr [rbx - 12] mov esi, eax shld esi, edx, 4 vmovd xmm3, edx vpinsrd xmm3, xmm3, edx, 1 shld edx, ecx, 2 vpinsrd xmm3, xmm3, esi, 2 vpinsrd xmm3, xmm3, eax, 3 vmovd xmm4, ecx vpinsrd xmm4, xmm4, ecx, 1 vpinsrd xmm4, xmm4, ecx, 2 vpinsrd xmm4, xmm4, edx, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm0 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 32], ymm3 mov eax, dword ptr [rbx] mov ecx, dword ptr [rbx - 8] mov edx, dword ptr [rbx - 4] mov esi, eax shld esi, edx, 8 mov edi, edx shld edi, ecx, 6 vmovd xmm3, esi vpinsrd xmm3, xmm3, eax, 1 vpinsrd xmm3, xmm3, eax, 2 vpinsrd xmm3, xmm3, eax, 3 vmovd xmm4, ecx vpinsrd xmm4, xmm4, edi, 1 vpinsrd xmm4, xmm4, edx, 2 
vpinsrd xmm4, xmm4, edx, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm2 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15], ymm3 sub r15, -128 add rbx, 40 add r8, -1 jne .LBB0_131 jmp .LBB0_147 .LBB0_105: cmp edx, 32 jl .LBB0_147 # %bb.106: mov r8d, r14d add r15, 96 add rbx, 100 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_34] # ymm0 = [288230371923853311,288230371923853311,288230371923853311,288230371923853311] vpbroadcastq xmm1, qword ptr [rip + .LCPI0_35] # xmm1 = [42949672976,42949672976] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_33] # ymm2 = [0,0,0,0,0,2,0,0] vpbroadcastq xmm3, qword ptr [rip + .LCPI0_36] # xmm3 = [94489280528,94489280528] vmovdqa ymm4, ymmword ptr [rip + .LCPI0_37] # ymm4 = [0,0,4,0,0,0,0,6] .p2align 4, 0x90 .LBB0_107: # =>This Inner Loop Header: Depth=1 mov ecx, dword ptr [rbx - 80] mov r9d, dword ptr [rbx - 76] shld r9d, ecx, 10 mov r11d, dword ptr [rbx - 84] shld ecx, r11d, 4 mov edi, dword ptr [rbx - 88] mov esi, r11d shld esi, edi, 24 mov edx, dword ptr [rbx - 92] shld edi, edx, 18 mov r10d, dword ptr [rbx - 100] mov eax, dword ptr [rbx - 96] shld edx, eax, 12 shld eax, r10d, 6 vmovd xmm5, r10d vmovd xmm6, esi vpinsrd xmm5, xmm5, eax, 1 vpinsrd xmm6, xmm6, r11d, 1 vpinsrd xmm5, xmm5, edx, 2 vpinsrd xmm6, xmm6, ecx, 2 vpinsrd xmm5, xmm5, edi, 3 vpinsrd xmm6, xmm6, r9d, 3 vinserti128 ymm5, ymm5, xmm6, 1 vpsrlvd ymm5, ymm5, ymm2 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm5 mov r9d, dword ptr [rbx - 52] mov ecx, dword ptr [rbx - 56] mov edx, r9d shld edx, ecx, 20 mov esi, dword ptr [rbx - 60] shld ecx, esi, 14 mov edi, dword ptr [rbx - 68] mov eax, dword ptr [rbx - 64] shld esi, eax, 8 shld eax, edi, 2 vmovq xmm5, qword ptr [rbx - 76] # xmm5 = mem[0],zero vpsrlvd xmm6, xmm5, xmm1 vpshufd xmm5, xmm5, 229 # xmm5 = xmm5[1,1,2,3] vpinsrd xmm5, xmm5, edi, 1 vpsllvd xmm5, xmm5, xmm3 vpor xmm5, xmm6, xmm5 vmovd xmm6, esi vpinsrd xmm6, xmm6, ecx, 1 vpinsrd xmm6, xmm6, edx, 2 vpinsrd xmm6, xmm6, r9d, 3 vpinsrd xmm5, xmm5, edi, 2 
vpinsrd xmm5, xmm5, eax, 3 vinserti128 ymm5, ymm5, xmm6, 1 vpsrlvd ymm5, ymm5, ymm4 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm5 mov eax, dword ptr [rbx - 28] mov r9d, dword ptr [rbx - 24] shld r9d, eax, 10 mov r11d, dword ptr [rbx - 32] shld eax, r11d, 4 mov esi, dword ptr [rbx - 36] mov edi, r11d shld edi, esi, 24 mov ecx, dword ptr [rbx - 40] shld esi, ecx, 18 mov r10d, dword ptr [rbx - 48] mov edx, dword ptr [rbx - 44] shld ecx, edx, 12 shld edx, r10d, 6 vmovd xmm5, r10d vmovd xmm6, edi vpinsrd xmm5, xmm5, edx, 1 vpinsrd xmm6, xmm6, r11d, 1 vpinsrd xmm5, xmm5, ecx, 2 vpinsrd xmm6, xmm6, eax, 2 vpinsrd xmm5, xmm5, esi, 3 vpinsrd xmm6, xmm6, r9d, 3 vinserti128 ymm5, ymm5, xmm6, 1 vpsrlvd ymm5, ymm5, ymm2 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm5 mov r9d, dword ptr [rbx] mov ecx, dword ptr [rbx - 4] mov edx, r9d shld edx, ecx, 20 mov esi, dword ptr [rbx - 8] shld ecx, esi, 14 mov edi, dword ptr [rbx - 16] mov eax, dword ptr [rbx - 12] shld esi, eax, 8 shld eax, edi, 2 vmovq xmm5, qword ptr [rbx - 24] # xmm5 = mem[0],zero vpsrlvd xmm6, xmm5, xmm1 vpshufd xmm5, xmm5, 229 # xmm5 = xmm5[1,1,2,3] vpinsrd xmm5, xmm5, edi, 1 vpsllvd xmm5, xmm5, xmm3 vpor xmm5, xmm6, xmm5 vmovd xmm6, esi vpinsrd xmm6, xmm6, ecx, 1 vpinsrd xmm6, xmm6, edx, 2 vpinsrd xmm6, xmm6, r9d, 3 vpinsrd xmm5, xmm5, edi, 2 vpinsrd xmm5, xmm5, eax, 3 vinserti128 ymm5, ymm5, xmm6, 1 vpsrlvd ymm5, ymm5, ymm4 vpand ymm5, ymm5, ymm0 vmovdqu ymmword ptr [r15], ymm5 sub r15, -128 add rbx, 104 add r8, -1 jne .LBB0_107 jmp .LBB0_147 .LBB0_135: cmp edx, 32 jl .LBB0_147 # %bb.136: mov eax, r14d add r15, 96 add rbx, 20 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_116] # ymm0 = [0,6,12,18,24,0,4,10] vpbroadcastq ymm1, qword ptr [rip + .LCPI0_117] # ymm1 = [270582939711,270582939711,270582939711,270582939711] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_118] # ymm2 = [16,22,0,2,8,14,20,26] .p2align 4, 0x90 .LBB0_137: # =>This Inner Loop Header: Depth=1 mov ecx, dword ptr [rbx - 20] mov edx, 
dword ptr [rbx - 16] mov esi, edx shld esi, ecx, 2 vmovd xmm3, ecx vpbroadcastd xmm4, xmm3 vpinsrd xmm3, xmm3, esi, 1 vpinsrd xmm3, xmm3, edx, 2 vpinsrd xmm3, xmm3, edx, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm0 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 96], ymm3 mov ecx, dword ptr [rbx - 16] mov edx, dword ptr [rbx - 12] mov esi, edx shld esi, ecx, 4 vmovd xmm3, ecx vpinsrd xmm3, xmm3, ecx, 1 vpinsrd xmm3, xmm3, esi, 2 vpinsrd xmm3, xmm3, edx, 3 vmovd xmm4, edx vpbroadcastd xmm4, xmm4 vinserti128 ymm3, ymm3, xmm4, 1 vpsrlvd ymm3, ymm3, ymm2 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 64], ymm3 mov ecx, dword ptr [rbx - 8] mov edx, dword ptr [rbx - 4] mov esi, edx shld esi, ecx, 2 vmovd xmm3, ecx vpinsrd xmm4, xmm3, esi, 1 vpinsrd xmm4, xmm4, edx, 2 vpbroadcastd xmm3, xmm3 vpinsrd xmm4, xmm4, edx, 3 vinserti128 ymm3, ymm3, xmm4, 1 vpsrlvd ymm3, ymm3, ymm0 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 32], ymm3 mov ecx, dword ptr [rbx - 4] mov edx, dword ptr [rbx] mov esi, edx shld esi, ecx, 4 vmovd xmm3, ecx vpinsrd xmm3, xmm3, ecx, 1 vpinsrd xmm3, xmm3, esi, 2 vpinsrd xmm3, xmm3, edx, 3 vmovd xmm4, edx vpbroadcastd xmm4, xmm4 vinserti128 ymm3, ymm3, xmm4, 1 vpsrlvd ymm3, ymm3, ymm2 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15], ymm3 sub r15, -128 add rbx, 24 add rax, -1 jne .LBB0_137 jmp .LBB0_147 .LBB0_111: cmp edx, 32 jl .LBB0_147 # %bb.112: mov r8d, r14d add r15, 96 add rbx, 84 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_55] # ymm0 = [0,0,0,2,0,0,4,0] vpbroadcastq ymm1, qword ptr [rip + .LCPI0_56] # ymm1 = [18014394218708991,18014394218708991,18014394218708991,18014394218708991] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_57] # ymm2 = [0,6,0,0,8,0,0,10] .p2align 4, 0x90 .LBB0_113: # =>This Inner Loop Header: Depth=1 mov r10d, dword ptr [rbx - 68] mov r9d, dword ptr [rbx - 64] shld r9d, r10d, 6 mov esi, dword ptr [rbx - 72] mov edi, r10d shld edi, esi, 18 mov edx, dword ptr [rbx - 76] shld esi, edx, 8 mov r11d, dword ptr [rbx - 84] 
mov ecx, dword ptr [rbx - 80] mov eax, edx shld eax, ecx, 20 shld ecx, r11d, 10 vmovd xmm3, r11d vmovd xmm4, esi vpinsrd xmm3, xmm3, ecx, 1 vpinsrd xmm4, xmm4, edi, 1 vpinsrd xmm3, xmm3, eax, 2 vpinsrd xmm4, xmm4, r10d, 2 vpinsrd xmm3, xmm3, edx, 3 vpinsrd xmm4, xmm4, r9d, 3 vinserti128 ymm3, ymm3, xmm4, 1 vpsrlvd ymm3, ymm3, ymm0 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 96], ymm3 mov r9d, dword ptr [rbx - 44] mov ecx, dword ptr [rbx - 48] mov r10d, r9d shld r10d, ecx, 12 mov esi, dword ptr [rbx - 52] shld ecx, esi, 2 mov edi, dword ptr [rbx - 56] vmovd xmm3, esi shld esi, edi, 14 mov eax, dword ptr [rbx - 64] mov edx, dword ptr [rbx - 60] shld edi, edx, 4 shrd eax, edx, 16 vpinsrd xmm3, xmm3, ecx, 1 vmovd xmm4, eax vpinsrd xmm3, xmm3, r10d, 2 vpinsrd xmm4, xmm4, edx, 1 vpinsrd xmm3, xmm3, r9d, 3 vpinsrd xmm4, xmm4, edi, 2 vpinsrd xmm4, xmm4, esi, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm2 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 64], ymm3 mov r10d, dword ptr [rbx - 24] mov r9d, dword ptr [rbx - 20] shld r9d, r10d, 6 mov edx, dword ptr [rbx - 28] mov esi, r10d shld esi, edx, 18 mov ecx, dword ptr [rbx - 32] shld edx, ecx, 8 mov r11d, dword ptr [rbx - 40] mov eax, dword ptr [rbx - 36] mov edi, ecx shld edi, eax, 20 shld eax, r11d, 10 vmovd xmm3, r11d vmovd xmm4, edx vpinsrd xmm3, xmm3, eax, 1 vpinsrd xmm4, xmm4, esi, 1 vpinsrd xmm3, xmm3, edi, 2 vpinsrd xmm4, xmm4, r10d, 2 vpinsrd xmm3, xmm3, ecx, 3 vpinsrd xmm4, xmm4, r9d, 3 vinserti128 ymm3, ymm3, xmm4, 1 vpsrlvd ymm3, ymm3, ymm0 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 32], ymm3 mov r9d, dword ptr [rbx] mov ecx, dword ptr [rbx - 4] mov r10d, r9d shld r10d, ecx, 12 mov esi, dword ptr [rbx - 8] shld ecx, esi, 2 mov edi, dword ptr [rbx - 12] vmovd xmm3, esi shld esi, edi, 14 mov eax, dword ptr [rbx - 20] mov edx, dword ptr [rbx - 16] shld edi, edx, 4 shrd eax, edx, 16 vpinsrd xmm3, xmm3, ecx, 1 vmovd xmm4, eax vpinsrd xmm3, xmm3, r10d, 2 vpinsrd xmm4, xmm4, edx, 1 vpinsrd 
xmm3, xmm3, r9d, 3 vpinsrd xmm4, xmm4, edi, 2 vpinsrd xmm4, xmm4, esi, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm2 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15], ymm3 sub r15, -128 add rbx, 88 add r8, -1 jne .LBB0_113 jmp .LBB0_147 .LBB0_123: cmp edx, 32 jl .LBB0_147 # %bb.124: mov r8d, r14d add r15, 96 add rbx, 52 vmovdqa ymm0, ymmword ptr [rip + .LCPI0_86] # ymm0 = [0,14,0,10,0,6,0,2] vpbroadcastq ymm1, qword ptr [rip + .LCPI0_87] # ymm1 = [70364449226751,70364449226751,70364449226751,70364449226751] vmovdqa ymm2, ymmword ptr [rip + .LCPI0_88] # ymm2 = [16,0,12,0,8,0,4,18] .p2align 4, 0x90 .LBB0_125: # =>This Inner Loop Header: Depth=1 mov r9d, dword ptr [rbx - 40] mov ecx, dword ptr [rbx - 44] mov esi, r9d shld esi, ecx, 12 mov edi, dword ptr [rbx - 52] mov r10d, dword ptr [rbx - 48] mov edx, ecx shld edx, r10d, 8 mov eax, r10d shld eax, edi, 4 vmovd xmm3, edx vpinsrd xmm3, xmm3, ecx, 1 vpinsrd xmm3, xmm3, esi, 2 vpinsrd xmm3, xmm3, r9d, 3 vmovd xmm4, edi vpinsrd xmm4, xmm4, edi, 1 vpinsrd xmm4, xmm4, eax, 2 vpinsrd xmm4, xmm4, r10d, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm0 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 96], ymm3 mov eax, dword ptr [rbx - 28] mov ecx, dword ptr [rbx - 32] mov edx, eax shld edx, ecx, 10 mov r9d, dword ptr [rbx - 40] mov esi, dword ptr [rbx - 36] vmovd xmm3, ecx shld ecx, esi, 6 mov edi, esi shld edi, r9d, 2 vmovd xmm4, r9d vpinsrd xmm4, xmm4, edi, 1 vpinsrd xmm4, xmm4, esi, 2 vpinsrd xmm4, xmm4, ecx, 3 vpinsrd xmm3, xmm3, edx, 1 vpinsrd xmm3, xmm3, eax, 2 vpinsrd xmm3, xmm3, eax, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm2 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 64], ymm3 mov r9d, dword ptr [rbx - 12] mov eax, dword ptr [rbx - 16] mov edx, r9d shld edx, eax, 12 mov esi, dword ptr [rbx - 24] mov r10d, dword ptr [rbx - 20] mov ecx, eax shld ecx, r10d, 8 mov edi, r10d shld edi, esi, 4 vmovd xmm3, ecx vpinsrd xmm3, xmm3, eax, 1 vpinsrd xmm3, xmm3, edx, 2 vpinsrd xmm3, 
xmm3, r9d, 3 vmovd xmm4, esi vpinsrd xmm4, xmm4, esi, 1 vpinsrd xmm4, xmm4, edi, 2 vpinsrd xmm4, xmm4, r10d, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm0 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15 - 32], ymm3 mov r9d, dword ptr [rbx] mov ecx, dword ptr [rbx - 4] mov edx, r9d shld edx, ecx, 10 mov eax, dword ptr [rbx - 8] vmovd xmm3, ecx shld ecx, eax, 6 mov edi, dword ptr [rbx - 12] mov esi, eax shld esi, edi, 2 vmovd xmm4, edi vpinsrd xmm4, xmm4, esi, 1 vpinsrd xmm4, xmm4, eax, 2 vpinsrd xmm4, xmm4, ecx, 3 vpinsrd xmm3, xmm3, edx, 1 vpinsrd xmm3, xmm3, r9d, 2 vpinsrd xmm3, xmm3, r9d, 3 vinserti128 ymm3, ymm4, xmm3, 1 vpsrlvd ymm3, ymm3, ymm2 vpand ymm3, ymm3, ymm1 vmovdqu ymmword ptr [r15], ymm3 sub r15, -128 add rbx, 56 add r8, -1 jne .LBB0_125 jmp .LBB0_147 .LBB0_99: cmp edx, 32 jl .LBB0_147 # %bb.100: mov r8d, r14d add r15, 96 vpbroadcastq ymm0, qword ptr [rip + .LCPI0_8] # ymm0 = [4611686015206162431,4611686015206162431,4611686015206162431,4611686015206162431] add rbx, 116 vmovdqa xmm1, xmmword ptr [rip + .LCPI0_9] # xmm1 = [16,14,12,10] vmovdqa xmm2, xmmword ptr [rip + .LCPI0_10] # xmm2 = [16,18,20,22] vmovdqa ymm3, ymmword ptr [rip + .LCPI0_11] # ymm3 = [0,0,0,0,0,0,0,2] .p2align 4, 0x90 .LBB0_101: # =>This Inner Loop Header: Depth=1 mov r11d, dword ptr [rbx - 92] mov r9d, dword ptr [rbx - 88] shld r9d, r11d, 14 mov esi, dword ptr [rbx - 96] shld r11d, esi, 12 mov edi, dword ptr [rbx - 100] shld esi, edi, 10 mov eax, dword ptr [rbx - 104] shld edi, eax, 8 mov edx, dword ptr [rbx - 108] shld eax, edx, 6 mov r10d, dword ptr [rbx - 116] mov ecx, dword ptr [rbx - 112] shld edx, ecx, 4 shld ecx, r10d, 2 vmovd xmm4, r10d vmovd xmm5, edi vpinsrd xmm4, xmm4, ecx, 1 vpinsrd xmm5, xmm5, esi, 1 vpinsrd xmm4, xmm4, edx, 2 vpinsrd xmm5, xmm5, r11d, 2 vpinsrd xmm4, xmm4, eax, 3 vpinsrd xmm5, xmm5, r9d, 3 vinserti128 ymm4, ymm4, xmm5, 1 vpand ymm4, ymm4, ymm0 vmovdqu ymmword ptr [r15 - 96], ymm4 mov eax, dword ptr [rbx - 60] mov ecx, dword ptr [rbx - 64] 
mov edx, eax shld edx, ecx, 28 mov esi, dword ptr [rbx - 68] mov edi, dword ptr [rbx - 72] shld ecx, esi, 26 shld esi, edi, 24 vmovdqu xmm4, xmmword ptr [rbx - 88] vpsrlvd xmm5, xmm4, xmm1 vpshufd xmm4, xmm4, 249 # xmm4 = xmm4[1,2,3,3] vpinsrd xmm4, xmm4, edi, 3 vmovd xmm6, esi vpinsrd xmm6, xmm6, ecx, 1 vpinsrd xmm6, xmm6, edx, 2 vpsllvd xmm4, xmm4, xmm2 vpinsrd xmm6, xmm6, eax, 3 vpor xmm4, xmm5, xmm4 vinserti128 ymm4, ymm4, xmm6, 1 vpsrlvd ymm4, ymm4, ymm3 vpand ymm4, ymm4, ymm0 vmovdqu ymmword ptr [r15 - 64], ymm4 mov r11d, dword ptr [rbx - 32] mov r9d, dword ptr [rbx - 28] shld r9d, r11d, 14 mov edx, dword ptr [rbx - 36] shld r11d, edx, 12 mov esi, dword ptr [rbx - 40] shld edx, esi, 10 mov edi, dword ptr [rbx - 44] shld esi, edi, 8 mov ecx, dword ptr [rbx - 48] shld edi, ecx, 6 mov r10d, dword ptr [rbx - 56] mov eax, dword ptr [rbx - 52] shld ecx, eax, 4 shld eax, r10d, 2 vmovd xmm4, r10d vmovd xmm5, esi vpinsrd xmm4, xmm4, eax, 1 vpinsrd xmm5, xmm5, edx, 1 vpinsrd xmm4, xmm4, ecx, 2 vpinsrd xmm5, xmm5, r11d, 2 vpinsrd xmm4, xmm4, edi, 3 vpinsrd xmm5, xmm5, r9d, 3 vinserti128 ymm4, ymm4, xmm5, 1 vpand ymm4, ymm4, ymm0 vmovdqu ymmword ptr [r15 - 32], ymm4 mov eax, dword ptr [rbx] mov ecx, dword ptr [rbx - 4] mov edx, eax shld edx, ecx, 28 mov esi, dword ptr [rbx - 8] shld ecx, esi, 26 mov edi, dword ptr [rbx - 12] vmovdqu xmm4, xmmword ptr [rbx - 28] shld esi, edi, 24 vpsrlvd xmm5, xmm4, xmm1 vpshufd xmm4, xmm4, 249 # xmm4 = xmm4[1,2,3,3] vpinsrd xmm4, xmm4, edi, 3 vmovd xmm6, esi vpinsrd xmm6, xmm6, ecx, 1 vpsllvd xmm4, xmm4, xmm2 vpinsrd xmm6, xmm6, edx, 2 vpinsrd xmm6, xmm6, eax, 3 vpor xmm4, xmm5, xmm4 vinserti128 ymm4, ymm4, xmm6, 1 vpsrlvd ymm4, ymm4, ymm3 vpand ymm4, ymm4, ymm0 vmovdqu ymmword ptr [r15], ymm4 sub r15, -128 add rbx, 120 add r8, -1 jne .LBB0_101 .LBB0_147: shl r14d, 5 mov eax, r14d lea rsp, [rbp - 32] pop rbx pop r12 pop r14 pop r15 pop rbp vzeroupper ret .Lfunc_end0: .size unpack32_avx2, .Lfunc_end0-unpack32_avx2 # -- End function .ident 
"Debian clang version 11.1.0-++20210428103820+1fdec59bffc1-1~exp1~20210428204437.162" .section ".note.GNU-stack","",@progbits .addrsig