bench/asm_amd64.s

//go:build amd64 && !noasm
// +build amd64,!noasm

/*
Copyright (c) 2009 The Go Authors. All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

   * Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
   * Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
   * Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "textflag.h"

// func inthash(i int, h uintptr) uintptr
//
// Hashes a single 64-bit integer with AESENC rounds.
//   i+0(FP)    integer to hash
//   h+8(FP)    hash seed
//   ret+16(FP) low 64 bits of the final state
// Uses X0 and X1 only; reads the first 16 bytes of ·aeskeysched.
TEXT ·inthash(SB),NOSPLIT,$0-24
	MOVQ	i+0(FP), X0		// integer
	MOVQ	h+8(FP), X1		// hash seed
	PXOR	·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0			// scramble seed
	PXOR	X0, X1			// xor data with seed
	AESENC	X1, X1			// scramble combo 3 times
	AESENC	X1, X1
	AESENC	X1, X1
	MOVQ	X1, ret+16(FP)		// return X1
	RET

// Alternative inthash implementation, kept for reference only.
// It is commented out and therefore not assembled.
// func inthash(i int, h uintptr) uintptr
/*TEXT ·inthash(SB),NOSPLIT,$0-24
	MOVQ	h+8(FP), X0	// hash seed
	MOVQ	i+0(FP), AX	// data
	PINSRQ	$1, AX, X0	// 64 bit data key to high order 64 bits of X0
	AESENC	·aeskeysched+0(SB), X0
	AESENC	·aeskeysched+16(SB), X0
	AESENC	·aeskeysched+0(SB), X0
	MOVQ	X0, ret+16(FP)
	RET
*/

// func strhash(s string, h uintptr) uintptr
//
// Loads the string pointer, its length and the seed into the registers
// aeshashbody expects, then tail-jumps to it. Because this is a JMP and
// both frames are $0, aeshashbody addresses this function's argument
// frame and stores the result directly into ret+24(FP).
TEXT ·strhash(SB),NOSPLIT,$0-32
	MOVQ	s_base+0(FP), AX	// string data
	MOVQ	s_len+8(FP), CX		// length of string
	MOVQ	h+16(FP), BX		// hash seed
	JMP	aeshashbody(SB)

// aeshashbody hashes CX bytes starting at AX.
//
// On entry:
//   AX: data
//   BX: hash seed
//   CX: length
// On exit:
//   The low 64 bits of the final state are stored to ret+24(FP), i.e.
//   the result slot of the strhash frame this code was jumped into from.
//   AX is NOT the return value: it is advanced or overwritten on several
//   paths.
// Clobbers AX, CX, flags and X0-X15 (which registers depends on the
// length class). Dispatches on length: 0, 1-15, 16, 17-32, 33-64,
// 65-128 and 129+ bytes.
TEXT aeshashbody(SB),NOSPLIT,$0-0
	// Fill an SSE register with our seeds.
	MOVQ	BX, X0			// 64 bits of per-table hash seed
	PINSRW	$4, CX, X0		// 16 bits of length
	PSHUFHW	$0, X0, X0		// repeat length 4 times total
	MOVO	X0, X1			// save unscrambled seed
	PXOR	·aeskeysched(SB), X0	// xor in per-process seed
	AESENC	X0, X0			// scramble seed

	// Unsigned compares: CX is a length.
	CMPQ	CX, $16
	JB	aes0to15
	JE	aes16
	CMPQ	CX, $32
	JBE	aes17to32
	CMPQ	CX, $64
	JBE	aes33to64
	CMPQ	CX, $128
	JBE	aes65to128
	JMP	aes129plus

aes0to15:
	TESTQ	CX, CX
	JE	aes0

	// AX+16 has zero in bits 4-11 exactly when AX lies in the last
	// 16 bytes of a 4K page.
	ADDQ	$16, AX
	TESTW	$0xff0, AX
	JE	endofpage

	// 16 bytes loaded at this address won't cross
	// a page boundary, so we can load it directly.
	MOVOU	-16(AX), X1
	ADDQ	CX, CX			// CX = 2*len, so CX*8 = 16*len
	MOVQ	$masks<>(SB), AX
	PAND	(AX)(CX*8), X1		// keep only the low len bytes
final1:
	PXOR	X0, X1			// xor data with seed
	AESENC	X1, X1			// scramble combo 3 times
	AESENC	X1, X1
	AESENC	X1, X1
	MOVQ	X1, ret+24(FP)		// return X1
	RET

endofpage:
	// address ends in 1111xxxx. Might be up against
	// a page boundary, so load ending at last byte.
	// Then shift bytes down using pshufb.
	MOVOU	-32(AX)(CX*1), X1	// AX was advanced by 16 above
	ADDQ	CX, CX			// CX = 2*len, so CX*8 = 16*len
	MOVQ	$shifts<>(SB), AX
	PSHUFB	(AX)(CX*8), X1
	JMP	final1

aes0:
	// Return scrambled input seed
	AESENC	X0, X0
	MOVQ	X0, ret+24(FP)		// return X0
	RET

aes16:
	MOVOU	(AX), X1
	JMP	final1

aes17to32:
	// make second starting seed
	PXOR	·aeskeysched+16(SB), X1
	AESENC	X1, X1

	// load data to be hashed: first and last 16 bytes (may overlap)
	MOVOU	(AX), X2
	MOVOU	-16(AX)(CX*1), X3

	// xor with seed
	PXOR	X0, X2
	PXOR	X1, X3

	// scramble 3 times
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X2, X2
	AESENC	X3, X3

	// combine results
	PXOR	X3, X2
	MOVQ	X2, ret+24(FP)		// return X2
	RET

aes33to64:
	// make 3 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	PXOR	·aeskeysched+16(SB), X1
	PXOR	·aeskeysched+32(SB), X2
	PXOR	·aeskeysched+48(SB), X3
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3

	// load data: first 32 and last 32 bytes (may overlap)
	MOVOU	(AX), X4
	MOVOU	16(AX), X5
	MOVOU	-32(AX)(CX*1), X6
	MOVOU	-16(AX)(CX*1), X7

	// xor with seed
	PXOR	X0, X4
	PXOR	X1, X5
	PXOR	X2, X6
	PXOR	X3, X7

	// scramble 3 times
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// combine results
	PXOR	X6, X4
	PXOR	X7, X5
	PXOR	X5, X4
	MOVQ	X4, ret+24(FP)		// return X4
	RET

aes65to128:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	·aeskeysched+16(SB), X1
	PXOR	·aeskeysched+32(SB), X2
	PXOR	·aeskeysched+48(SB), X3
	PXOR	·aeskeysched+64(SB), X4
	PXOR	·aeskeysched+80(SB), X5
	PXOR	·aeskeysched+96(SB), X6
	PXOR	·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// load data: first 64 and last 64 bytes (may overlap)
	MOVOU	(AX), X8
	MOVOU	16(AX), X9
	MOVOU	32(AX), X10
	MOVOU	48(AX), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor with seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// scramble 3 times
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	// X15 must be zero on return
	PXOR	X15, X15
	MOVQ	X8, ret+24(FP)		// return X8
	RET

aes129plus:
	// make 7 more starting seeds
	MOVO	X1, X2
	MOVO	X1, X3
	MOVO	X1, X4
	MOVO	X1, X5
	MOVO	X1, X6
	MOVO	X1, X7
	PXOR	·aeskeysched+16(SB), X1
	PXOR	·aeskeysched+32(SB), X2
	PXOR	·aeskeysched+48(SB), X3
	PXOR	·aeskeysched+64(SB), X4
	PXOR	·aeskeysched+80(SB), X5
	PXOR	·aeskeysched+96(SB), X6
	PXOR	·aeskeysched+112(SB), X7
	AESENC	X1, X1
	AESENC	X2, X2
	AESENC	X3, X3
	AESENC	X4, X4
	AESENC	X5, X5
	AESENC	X6, X6
	AESENC	X7, X7

	// start with last (possibly overlapping) block
	MOVOU	-128(AX)(CX*1), X8
	MOVOU	-112(AX)(CX*1), X9
	MOVOU	-96(AX)(CX*1), X10
	MOVOU	-80(AX)(CX*1), X11
	MOVOU	-64(AX)(CX*1), X12
	MOVOU	-48(AX)(CX*1), X13
	MOVOU	-32(AX)(CX*1), X14
	MOVOU	-16(AX)(CX*1), X15

	// xor in seed
	PXOR	X0, X8
	PXOR	X1, X9
	PXOR	X2, X10
	PXOR	X3, X11
	PXOR	X4, X12
	PXOR	X5, X13
	PXOR	X6, X14
	PXOR	X7, X15

	// compute number of remaining 128-byte blocks: (len-1)/128,
	// which is at least 1 here because len >= 129
	DECQ	CX
	SHRQ	$7, CX

aesloop:
	// scramble state
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// scramble state, xor in a block
	// (X0-X7 are reused as load buffers; the seeds are no longer needed)
	MOVOU	(AX), X0
	MOVOU	16(AX), X1
	MOVOU	32(AX), X2
	MOVOU	48(AX), X3
	AESENC	X0, X8
	AESENC	X1, X9
	AESENC	X2, X10
	AESENC	X3, X11
	MOVOU	64(AX), X4
	MOVOU	80(AX), X5
	MOVOU	96(AX), X6
	MOVOU	112(AX), X7
	AESENC	X4, X12
	AESENC	X5, X13
	AESENC	X6, X14
	AESENC	X7, X15

	ADDQ	$128, AX
	DECQ	CX
	JNE	aesloop

	// 3 more scrambles to finish
	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	AESENC	X8, X8
	AESENC	X9, X9
	AESENC	X10, X10
	AESENC	X11, X11
	AESENC	X12, X12
	AESENC	X13, X13
	AESENC	X14, X14
	AESENC	X15, X15

	// combine results
	PXOR	X12, X8
	PXOR	X13, X9
	PXOR	X14, X10
	PXOR	X15, X11
	PXOR	X10, X8
	PXOR	X11, X9
	PXOR	X9, X8
	// X15 must be zero on return
	PXOR	X15, X15
	MOVQ	X8, ret+24(FP)		// return X8
	RET

// simple mask to get rid of data in the high part of the register.
// Entry n (16 bytes at offset 16*n) has its low n bytes set to 0xff and
// the rest zero; aeshashbody indexes it with the data length (1-15).
DATA masks<>+0x00(SB)/8, $0x0000000000000000	// n = 0
DATA masks<>+0x08(SB)/8, $0x0000000000000000
DATA masks<>+0x10(SB)/8, $0x00000000000000ff	// n = 1
DATA masks<>+0x18(SB)/8, $0x0000000000000000
DATA masks<>+0x20(SB)/8, $0x000000000000ffff	// n = 2
DATA masks<>+0x28(SB)/8, $0x0000000000000000
DATA masks<>+0x30(SB)/8, $0x0000000000ffffff	// n = 3
DATA masks<>+0x38(SB)/8, $0x0000000000000000
DATA masks<>+0x40(SB)/8, $0x00000000ffffffff	// n = 4
DATA masks<>+0x48(SB)/8, $0x0000000000000000
DATA masks<>+0x50(SB)/8, $0x000000ffffffffff	// n = 5
DATA masks<>+0x58(SB)/8, $0x0000000000000000
DATA masks<>+0x60(SB)/8, $0x0000ffffffffffff	// n = 6
DATA masks<>+0x68(SB)/8, $0x0000000000000000
DATA masks<>+0x70(SB)/8, $0x00ffffffffffffff	// n = 7
DATA masks<>+0x78(SB)/8, $0x0000000000000000
DATA masks<>+0x80(SB)/8, $0xffffffffffffffff	// n = 8
DATA masks<>+0x88(SB)/8, $0x0000000000000000
DATA masks<>+0x90(SB)/8, $0xffffffffffffffff	// n = 9
DATA masks<>+0x98(SB)/8, $0x00000000000000ff
DATA masks<>+0xa0(SB)/8, $0xffffffffffffffff	// n = 10
DATA masks<>+0xa8(SB)/8, $0x000000000000ffff
DATA masks<>+0xb0(SB)/8, $0xffffffffffffffff	// n = 11
DATA masks<>+0xb8(SB)/8, $0x0000000000ffffff
DATA masks<>+0xc0(SB)/8, $0xffffffffffffffff	// n = 12
DATA masks<>+0xc8(SB)/8, $0x00000000ffffffff
DATA masks<>+0xd0(SB)/8, $0xffffffffffffffff	// n = 13
DATA masks<>+0xd8(SB)/8, $0x000000ffffffffff
DATA masks<>+0xe0(SB)/8, $0xffffffffffffffff	// n = 14
DATA masks<>+0xe8(SB)/8, $0x0000ffffffffffff
DATA masks<>+0xf0(SB)/8, $0xffffffffffffffff	// n = 15
DATA masks<>+0xf8(SB)/8, $0x00ffffffffffffff
GLOBL masks<>(SB),RODATA,$256

// these are arguments to pshufb.
// They move data down from
// the high bytes of the register to the low bytes of the register.
// index is how many bytes to move.
//
// Layout: entry n is the 16 bytes at offset 16*n. Its low n control
// bytes select source bytes 16-n .. 15; the remaining control bytes are
// 0xff, which PSHUFB turns into zero output bytes. Entry 0 is all zero
// and looks unused: aeshashbody branches to aes0 for a zero length
// before it indexes this table.

// n = 0
DATA shifts<>+0x00(SB)/8, $0x0000000000000000
DATA shifts<>+0x08(SB)/8, $0x0000000000000000
// n = 1
DATA shifts<>+0x10(SB)/8, $0xffffffffffffff0f
DATA shifts<>+0x18(SB)/8, $0xffffffffffffffff
// n = 2
DATA shifts<>+0x20(SB)/8, $0xffffffffffff0f0e
DATA shifts<>+0x28(SB)/8, $0xffffffffffffffff
// n = 3
DATA shifts<>+0x30(SB)/8, $0xffffffffff0f0e0d
DATA shifts<>+0x38(SB)/8, $0xffffffffffffffff
// n = 4
DATA shifts<>+0x40(SB)/8, $0xffffffff0f0e0d0c
DATA shifts<>+0x48(SB)/8, $0xffffffffffffffff
// n = 5
DATA shifts<>+0x50(SB)/8, $0xffffff0f0e0d0c0b
DATA shifts<>+0x58(SB)/8, $0xffffffffffffffff
// n = 6
DATA shifts<>+0x60(SB)/8, $0xffff0f0e0d0c0b0a
DATA shifts<>+0x68(SB)/8, $0xffffffffffffffff
// n = 7
DATA shifts<>+0x70(SB)/8, $0xff0f0e0d0c0b0a09
DATA shifts<>+0x78(SB)/8, $0xffffffffffffffff
// n = 8
DATA shifts<>+0x80(SB)/8, $0x0f0e0d0c0b0a0908
DATA shifts<>+0x88(SB)/8, $0xffffffffffffffff
// n = 9
DATA shifts<>+0x90(SB)/8, $0x0e0d0c0b0a090807
DATA shifts<>+0x98(SB)/8, $0xffffffffffffff0f
// n = 10
DATA shifts<>+0xa0(SB)/8, $0x0d0c0b0a09080706
DATA shifts<>+0xa8(SB)/8, $0xffffffffffff0f0e
// n = 11
DATA shifts<>+0xb0(SB)/8, $0x0c0b0a0908070605
DATA shifts<>+0xb8(SB)/8, $0xffffffffff0f0e0d
// n = 12
DATA shifts<>+0xc0(SB)/8, $0x0b0a090807060504
DATA shifts<>+0xc8(SB)/8, $0xffffffff0f0e0d0c
// n = 13
DATA shifts<>+0xd0(SB)/8, $0x0a09080706050403
DATA shifts<>+0xd8(SB)/8, $0xffffff0f0e0d0c0b
// n = 14
DATA shifts<>+0xe0(SB)/8, $0x0908070605040302
DATA shifts<>+0xe8(SB)/8, $0xffff0f0e0d0c0b0a
// n = 15
DATA shifts<>+0xf0(SB)/8, $0x0807060504030201
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09

GLOBL shifts<>(SB),RODATA,$256

bench/asm_amd64.s (426 lines of code) (raw):