#! /usr/bin/env perl
# Copyright (C) 2023 Intel Corporation
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
# This implementation is based on the AES-XTS code (AVX512VAES + VPCLMULQDQ)
# from Intel(R) Intelligent Storage Acceleration Library Crypto Version
# (https://github.com/intel/isa-l_crypto).
#
######################################################################
# The main building block of the loop is code that encrypts/decrypts
# 8/16 blocks of data, stitched together with the generation of the tweaks
# for the next 8/16 blocks, using the VAES and VPCLMULQDQ instructions at
# the full width of the ZMM registers. The main loop is selected based on
# the input length:
# main_loop_run_16 encrypts/decrypts 16 blocks in parallel and is selected
# when input length >= 256 bytes (16 blocks).
# main_loop_run_8 encrypts/decrypts 8 blocks in parallel and is selected
# when 128 bytes <= input length < 256 bytes (8-15 blocks).
# Input lengths < 128 bytes (fewer than 8 blocks) are handled by do_n_blocks.
#
# This implementation mainly uses vpshrdq from the AVX512_VBMI2 extension,
# vaesenc/vaesdec/vaesenclast/vaesdeclast from VAES, and vpclmulqdq from
# VPCLMULQDQ, all operating on full-width ZMM registers.
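#
# Throughout, the XTS tweak update is a multiplication by x in GF(2^128)
# with the reduction polynomial x^128 + x^7 + x^2 + x + 1:
#   T_{i+1} = (T_i << 1) ^ (0x87 if bit 127 of T_i was set, else 0)
# The scalar shl/adc/cmovc/xor sequences and the vector
# vpsrldq/vpclmulqdq/vpslldq sequences below implement (powers of) this
# recurrence.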
# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) { die "Not enough arguments provided.
Two arguments are necessary: the flavour and the output file path."; }
$flavour = shift;
$output = shift;
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$avx512vaes = 1;
for (@ARGV) { $avx512vaes = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
#======================================================================
if ($avx512vaes) {
my $GP_STORAGE = $win64 ? (16 * 33) : (16 * 23); # stack offset for saving rbx (plus rdi/rsi on Windows)
my $XMM_STORAGE = $win64 ? (16 * 23) : 0; # stack offset for saving xmm6:xmm15 (Windows only)
my $VARIABLE_OFFSET = $win64 ? (16 * 8 + 16 * 15 + 16 * 10 + 8 * 3) :
(16 * 8 + 16 * 15 + 8 * 1);
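# Stack frame layout (offsets from the 64-byte-aligned %rsp):
#   0x00-0x7f  : eight 16-byte tweak values
#   0x80-0x16f : fifteen 16-byte round keys copied from key1
#   $GP_STORAGE  : saved rbx (and rdi/rsi on Windows)
#   $XMM_STORAGE : saved xmm6:xmm15 (Windows only)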
my $TW = "%rsp";
my $TWTEMPH = "%rbx";
my $TWTEMPL = "%rax";
my $ZPOLY = "%zmm25";
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Function arguments abstraction
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
my ($key2, $key1, $tweak, $length, $input, $output);
if ($win64) {
$input = "%rcx";
$output = "%rdx";
$length = "%r8";
$key1 = "%r9";
$key2 = "%r10";
$tweak = "%r11";
} else {
$input = "%rdi";
$output = "%rsi";
$length = "%rdx";
$key1 = "%rcx";
$key2 = "%r8";
$tweak = "%r9";
}
# arguments for temp parameters
my ($tmp1, $gf_poly_8b, $gf_poly_8b_temp);
if ($win64) {
$tmp1 = "%r10";
$gf_poly_8b = "%rdi";
$gf_poly_8b_temp = "%rsi";
} else {
$tmp1 = "%r8";
$gf_poly_8b = "%r10";
$gf_poly_8b_temp = "%r11";
}
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;;; Helper functions
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Generates "random" local labels
sub random_string() {
my @chars = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
my $length = 15;
my $str;
map { $str .= $chars[rand(@chars)] } 1 .. $length;
return $str;
}
# ; Seed the RNG so the labels are generated deterministically
srand(12345);
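# A minimal reference model (defined here only as documentation; nothing in
# this file calls it) of the scalar tweak-update idiom used throughout the
# generated code (shl/adc/cmovc/xor): multiply a 128-bit tweak, held as two
# 64-bit halves, by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.
# Assumes a 64-bit perl.
sub xts_double_tweak_model {
my ($lo, $hi) = @_;
my $carry = ($hi >> 63) & 1; # bit shifted out of bit 127
$hi = (($hi << 1) & 0xffffffffffffffff) | (($lo >> 63) & 1);
$lo = (($lo << 1) & 0xffffffffffffffff) ^ ($carry ? 0x87 : 0);
return ($lo, $hi);
}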
sub encrypt_tweak_for_encryption {
my $key2 = $_[0];
my $state_tweak = $_[1];
my $key1 = $_[2];
my $raw_key = $_[3];
my $tmp = $_[4];
my $ptr_key2 = $_[5];
my $ptr_key1 = $_[6];
my $ptr_expanded_keys = $_[7];
$code.=<<___;
vmovdqu ($ptr_key2), $key2
vpxor $key2, $state_tweak, $state_tweak # AddRoundKey(ARK) for tweak encryption
vmovdqu ($ptr_key1), $key1
vmovdqa $key1, 0x80($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x10($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 1 for tweak encryption
vmovdqu 0x10($ptr_key1), $key1
vmovdqa $key1, 0x90($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x20($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 2 for tweak encryption
vmovdqu 0x20($ptr_key1), $key1
vmovdqa $key1, 0xa0($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x30($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 3 for tweak encryption
vmovdqu 0x30($ptr_key1), $key1
vmovdqa $key1, 0xb0($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x40($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 4 for tweak encryption
vmovdqu 0x40($ptr_key1), $key1
vmovdqa $key1, 0xc0($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x50($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 5 for tweak encryption
vmovdqu 0x50($ptr_key1), $key1
vmovdqa $key1, 0xd0($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x60($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 6 for tweak encryption
vmovdqu 0x60($ptr_key1), $key1
vmovdqa $key1, 0xe0($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x70($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 7 for tweak encryption
vmovdqu 0x70($ptr_key1), $key1
vmovdqa $key1, 0xf0($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x80($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 8 for tweak encryption
vmovdqu 0x80($ptr_key1), $key1
vmovdqa $key1, 0x100($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x90($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 9 for tweak encryption
vmovdqu 0x90($ptr_key1), $key1
vmovdqa $key1, 0x110($ptr_expanded_keys) # store round keys in stack
vmovdqu 0xa0($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 10 for tweak encryption
vmovdqu 0xa0($ptr_key1), $key1
vmovdqa $key1, 0x120($ptr_expanded_keys) # store round keys in stack
vmovdqu 0xb0($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 11 for tweak encryption
vmovdqu 0xb0($ptr_key1), $key1
vmovdqa $key1, 0x130($ptr_expanded_keys) # store round keys in stack
vmovdqu 0xc0($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 12 for tweak encryption
vmovdqu 0xc0($ptr_key1), $key1
vmovdqa $key1, 0x140($ptr_expanded_keys) # store round keys in stack
vmovdqu 0xd0($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 13 for tweak encryption
vmovdqu 0xd0($ptr_key1), $key1
vmovdqa $key1, 0x150($ptr_expanded_keys) # store round keys in stack
vmovdqu 0xe0($ptr_key2), $key2
vaesenclast $key2, $state_tweak, $state_tweak # round 14 for tweak encryption
vmovdqu 0xe0($ptr_key1), $key1
vmovdqa $key1, 0x160($ptr_expanded_keys) # store round keys in stack
vmovdqa $state_tweak, ($ptr_expanded_keys) # Store the encrypted Tweak value
___
}
sub initialize {
my @st;
$st[0] = $_[0];
$st[1] = $_[1];
$st[2] = $_[2];
$st[3] = $_[3];
$st[4] = $_[4];
$st[5] = $_[5];
$st[6] = $_[6];
$st[7] = $_[7];
my @tw;
$tw[0] = $_[8];
$tw[1] = $_[9];
$tw[2] = $_[10];
$tw[3] = $_[11];
$tw[4] = $_[12];
$tw[5] = $_[13];
$tw[6] = $_[14];
my $num_initial_blocks = $_[15];
$code .= <<___;
vmovdqa 0x0($TW), $tw[0]
mov 0x0($TW), $TWTEMPL
mov 0x08($TW), $TWTEMPH
vmovdqu 0x0($input), $st[0]
___
if ($num_initial_blocks >= 2) {
for (my $i = 1; $i < $num_initial_blocks; $i++) {
$code .= "xor $gf_poly_8b_temp, $gf_poly_8b_temp\n";
$code .= "shl \$1, $TWTEMPL\n";
$code .= "adc $TWTEMPH, $TWTEMPH\n";
$code .= "cmovc $gf_poly_8b, $gf_poly_8b_temp\n";
$code .= "xor $gf_poly_8b_temp, $TWTEMPL\n";
my $offset = $i * 16;
$code .= "mov $TWTEMPL, $offset($TW)\n";
$code .= "mov $TWTEMPH, `$offset + 8`($TW)\n";
$code .= "vmovdqa $offset($TW), $tw[$i]\n";
$code .= "vmovdqu $offset($input), $st[$i]\n";
}
}
}
# encrypt initial blocks of AES
# 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
# next 8 Tweak values are generated
sub encrypt_initial {
my @st;
$st[0] = $_[0];
$st[1] = $_[1];
$st[2] = $_[2];
$st[3] = $_[3];
$st[4] = $_[4];
$st[5] = $_[5];
$st[6] = $_[6];
$st[7] = $_[7];
my @tw;
$tw[0] = $_[8];
$tw[1] = $_[9];
$tw[2] = $_[10];
$tw[3] = $_[11];
$tw[4] = $_[12];
$tw[5] = $_[13];
$tw[6] = $_[14];
my $t0 = $_[15];
my $num_blocks = $_[16];
my $lt128 = $_[17];
# num_blocks blocks encrypted
# num_blocks can be 1, 2, 3, 4, 5, 6, 7
# xor Tweak value
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vpxor $tw[$i], $st[$i], $st[$i]\n";
}
$code .= "vmovdqa 0x80($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vpxor $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
___
}
# round 1
$code .= "vmovdqa 0x90($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x0($TW) # next Tweak1 generated
mov $TWTEMPH, 0x08($TW)
xor $gf_poly_8b_temp, $gf_poly_8b_temp
___
}
# round 2
$code .= "vmovdqa 0xa0($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x10($TW) # next Tweak2 generated
___
}
# round 3
$code .= "vmovdqa 0xb0($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
mov $TWTEMPH, 0x18($TW)
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
___
}
# round 4
$code .= "vmovdqa 0xc0($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x20($TW) # next Tweak3 generated
mov $TWTEMPH, 0x28($TW)
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
___
}
# round 5
$code .= "vmovdqa 0xd0($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x30($TW) # next Tweak4 generated
mov $TWTEMPH, 0x38($TW)
___
}
# round 6
$code .= "vmovdqa 0xe0($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x40($TW) # next Tweak5 generated
mov $TWTEMPH, 0x48($TW)
___
}
# round 7
$code .= "vmovdqa 0xf0($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x50($TW) # next Tweak6 generated
mov $TWTEMPH, 0x58($TW)
___
}
# round 8
$code .= "vmovdqa 0x100($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x60($TW) # next Tweak7 generated
mov $TWTEMPH, 0x68($TW)
___
}
# round 9
$code .= "vmovdqa 0x110($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x70($TW) # next Tweak8 generated
mov $TWTEMPH, 0x78($TW)
___
}
# round 10
$code .= "vmovdqa 0x120($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 11
$code .= "vmovdqa 0x130($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 12
$code .= "vmovdqa 0x140($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 13
$code .= "vmovdqa 0x150($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 14
$code .= "vmovdqa 0x160($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesenclast $t0, $st[$i], $st[$i]\n";
}
# xor Tweak values
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vpxor $tw[$i], $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
# load next Tweak values
$code .= <<___;
vmovdqa 0x0($TW), $tw[0]
vmovdqa 0x10($TW), $tw[1]
vmovdqa 0x20($TW), $tw[2]
vmovdqa 0x30($TW), $tw[3]
vmovdqa 0x40($TW), $tw[4]
vmovdqa 0x50($TW), $tw[5]
vmovdqa 0x60($TW), $tw[6]
___
}
}
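# The tweak is *encrypted* with key2 even on the decrypt path (XTS always
# encrypts the tweak). key1 (presumably an inverse-cipher key schedule
# prepared by the caller) is copied to the same stack slots as on the
# encrypt path; the copy below simply walks key1 from its last round key
# to its first, storing one round key per AES round of the tweak
# encryption.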
sub encrypt_tweak_for_decryption {
my $key2 = $_[0];
my $state_tweak = $_[1];
my $key1 = $_[2];
my $raw_key = $_[3];
my $tmp = $_[4];
my $ptr_key2 = $_[5];
my $ptr_key1 = $_[6];
my $ptr_expanded_keys = $_[7];
$code.=<<___;
vmovdqu ($ptr_key2), $key2
vpxor $key2, $state_tweak, $state_tweak # ARK for tweak encryption
vmovdqu 0xe0($ptr_key1), $key1
vmovdqa $key1, 0x160($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x10($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 1 for tweak encryption
vmovdqu 0xd0($ptr_key1), $key1
vmovdqa $key1, 0x150($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x20($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 2 for tweak encryption
vmovdqu 0xc0($ptr_key1), $key1
vmovdqa $key1, 0x140($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x30($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 3 for tweak encryption
vmovdqu 0xb0($ptr_key1), $key1
vmovdqa $key1, 0x130($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x40($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 4 for tweak encryption
vmovdqu 0xa0($ptr_key1), $key1
vmovdqa $key1, 0x120($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x50($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 5 for tweak encryption
vmovdqu 0x90($ptr_key1), $key1
vmovdqa $key1, 0x110($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x60($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 6 for tweak encryption
vmovdqu 0x80($ptr_key1), $key1
vmovdqa $key1, 0x100($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x70($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 7 for tweak encryption
vmovdqu 0x70($ptr_key1), $key1
vmovdqa $key1, 0xf0($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x80($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 8 for tweak encryption
vmovdqu 0x60($ptr_key1), $key1
vmovdqa $key1, 0xe0($ptr_expanded_keys) # store round keys in stack
vmovdqu 0x90($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 9 for tweak encryption
vmovdqu 0x50($ptr_key1), $key1
vmovdqa $key1, 0xd0($ptr_expanded_keys) # store round keys in stack
vmovdqu 0xa0($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 10 for tweak encryption
vmovdqu 0x40($ptr_key1), $key1
vmovdqa $key1, 0xc0($ptr_expanded_keys) # store round keys in stack
vmovdqu 0xb0($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 11 for tweak encryption
vmovdqu 0x30($ptr_key1), $key1
vmovdqa $key1, 0xb0($ptr_expanded_keys) # store round keys in stack
vmovdqu 0xc0($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 12 for tweak encryption
vmovdqu 0x20($ptr_key1), $key1
vmovdqa $key1, 0xa0($ptr_expanded_keys) # store round keys in stack
vmovdqu 0xd0($ptr_key2), $key2
vaesenc $key2, $state_tweak, $state_tweak # round 13 for tweak encryption
vmovdqu 0x10($ptr_key1), $key1
vmovdqa $key1, 0x90($ptr_expanded_keys) # store round keys in stack
vmovdqu 0xe0($ptr_key2), $key2
vaesenclast $key2, $state_tweak, $state_tweak # round 14 for tweak encryption
vmovdqu ($ptr_key1), $key1
vmovdqa $key1, 0x80($ptr_expanded_keys) # store round keys in stack
vmovdqa $state_tweak, ($ptr_expanded_keys) # Store the encrypted Tweak value
___
}
# decrypt initial blocks of AES
# 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
# next 8 Tweak values are generated
sub decrypt_initial {
my @st;
$st[0] = $_[0];
$st[1] = $_[1];
$st[2] = $_[2];
$st[3] = $_[3];
$st[4] = $_[4];
$st[5] = $_[5];
$st[6] = $_[6];
$st[7] = $_[7];
my @tw;
$tw[0] = $_[8];
$tw[1] = $_[9];
$tw[2] = $_[10];
$tw[3] = $_[11];
$tw[4] = $_[12];
$tw[5] = $_[13];
$tw[6] = $_[14];
my $t0 = $_[15];
my $num_blocks = $_[16];
my $lt128 = $_[17];
# num_blocks blocks decrypted
# num_blocks can be 1, 2, 3, 4, 5, 6, 7
# xor Tweak value
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vpxor $tw[$i], $st[$i], $st[$i]\n";
}
$code .= "vmovdqa 0x80($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vpxor $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
___
}
# round 1
$code .= "vmovdqa 0x90($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, ($TW) # next Tweak1 generated
mov $TWTEMPH, 0x08($TW)
xor $gf_poly_8b_temp, $gf_poly_8b_temp
___
}
# round 2
$code .= "vmovdqa 0xa0($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x10($TW) # next Tweak2 generated
___
}
# round 3
$code .= "vmovdqa 0xb0($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
mov $TWTEMPH, 0x18($TW)
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
___
}
# round 4
$code .= "vmovdqa 0xc0($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x20($TW) # next Tweak3 generated
mov $TWTEMPH, 0x28($TW)
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
___
}
# round 5
$code .= "vmovdqa 0xd0($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x30($TW) # next Tweak4 generated
mov $TWTEMPH, 0x38($TW)
___
}
# round 6
$code .= "vmovdqa 0xe0($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x40($TW) # next Tweak5 generated
mov $TWTEMPH, 0x48($TW)
___
}
# round 7
$code .= "vmovdqa 0xf0($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x50($TW) # next Tweak6 generated
mov $TWTEMPH, 0x58($TW)
___
}
# round 8
$code .= "vmovdqa 0x100($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x60($TW) # next Tweak7 generated
mov $TWTEMPH, 0x68($TW)
___
}
# round 9
$code .= "vmovdqa 0x110($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
$code .= <<___;
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL, 0x70($TW) # next Tweak8 generated
mov $TWTEMPH, 0x78($TW)
___
}
# round 10
$code .= "vmovdqa 0x120($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 11
$code .= "vmovdqa 0x130($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 12
$code .= "vmovdqa 0x140($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 13
$code .= "vmovdqa 0x150($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 14
$code .= "vmovdqa 0x160($TW), $t0\n";
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vaesdeclast $t0, $st[$i], $st[$i]\n";
}
# xor Tweak values
for (my $i = 0; $i < $num_blocks; $i++) {
$code .= "vpxor $tw[$i], $st[$i], $st[$i]\n";
}
if (0 == $lt128) {
# load next Tweak values
$code .= <<___;
vmovdqa ($TW), $tw[0]
vmovdqa 0x10($TW), $tw[1]
vmovdqa 0x20($TW), $tw[2]
vmovdqa 0x30($TW), $tw[3]
vmovdqa 0x40($TW), $tw[4]
vmovdqa 0x50($TW), $tw[5]
vmovdqa 0x60($TW), $tw[6]
___
}
}
# Encrypt 8 blocks in parallel
# generate next 8 tweak values
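# The next batch of tweaks is computed alongside the AES rounds. With 8
# tweaks in flight, each one must advance by x^8, computed per 128-bit
# lane as T*x^8 = (T << 8) ^ (top_byte(T) x 0x87): vpsrldq isolates the
# top byte of each lane, vpclmulqdq multiplies it by the polynomial, and
# vpslldq supplies the byte-shifted value.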
sub encrypt_by_eight_zmm {
my $st1 = $_[0];
my $st2 = $_[1];
my $tw1 = $_[2];
my $tw2 = $_[3];
my $t0 = $_[4];
my $last_eight = $_[5];
$code .= <<___;
# xor Tweak values
vpxorq $tw1, $st1, $st1
vpxorq $tw2, $st2, $st2
# ARK
vbroadcasti32x4 0x80($TW), $t0
vpxorq $t0, $st1, $st1
vpxorq $t0, $st2, $st2
___
if (0 == $last_eight) {
$code .= <<___;
vpsrldq \$0xf, $tw1, %zmm13
vpclmulqdq \$0x0, $ZPOLY, %zmm13, %zmm14
vpslldq \$0x1, $tw1, %zmm15
vpxord %zmm14, %zmm15, %zmm15
___
}
# round 1
$code .= <<___;
vbroadcasti32x4 0x90($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
# round 2
vbroadcasti32x4 0xa0($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
# round 3
vbroadcasti32x4 0xb0($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
___
if (0 == $last_eight) {
$code .= <<___;
vpsrldq \$0xf, $tw2, %zmm13
vpclmulqdq \$0x0, $ZPOLY, %zmm13, %zmm14
vpslldq \$0x1, $tw2, %zmm16
vpxord %zmm14, %zmm16, %zmm16
___
}
$code .= <<___;
# round 4
vbroadcasti32x4 0xc0($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
# round 5
vbroadcasti32x4 0xd0($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
# round 6
vbroadcasti32x4 0xe0($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
# round 7
vbroadcasti32x4 0xf0($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
# round 8
vbroadcasti32x4 0x100($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
# round 9
vbroadcasti32x4 0x110($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
# round 10
vbroadcasti32x4 0x120($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
# round 11
vbroadcasti32x4 0x130($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
# round 12
vbroadcasti32x4 0x140($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
# round 13
vbroadcasti32x4 0x150($TW), $t0
vaesenc $t0, $st1, $st1
vaesenc $t0, $st2, $st2
# round 14
vbroadcasti32x4 0x160($TW), $t0
vaesenclast $t0, $st1, $st1
vaesenclast $t0, $st2, $st2
# xor Tweak values
vpxorq $tw1, $st1, $st1
vpxorq $tw2, $st2, $st2
# load next Tweak values
vmovdqa32 %zmm15, $tw1
vmovdqa32 %zmm16, $tw2
___
}
# Decrypt 8 blocks in parallel
# generate next 8 tweak values
sub decrypt_by_eight_zmm {
my $st1 = $_[0];
my $st2 = $_[1];
my $tw1 = $_[2];
my $tw2 = $_[3];
my $t0 = $_[4];
my $last_eight = $_[5];
$code .= <<___;
# xor Tweak values
vpxorq $tw1, $st1, $st1
vpxorq $tw2, $st2, $st2
# ARK
vbroadcasti32x4 0x80($TW), $t0
vpxorq $t0, $st1, $st1
vpxorq $t0, $st2, $st2
___
if (0 == $last_eight) {
$code .= <<___;
vpsrldq \$0xf, $tw1, %zmm13
vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14
vpslldq \$0x1, $tw1, %zmm15
vpxord %zmm14, %zmm15, %zmm15
___
}
# round 1
$code .= <<___;
vbroadcasti32x4 0x90($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
# round 2
vbroadcasti32x4 0xa0($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
# round 3
vbroadcasti32x4 0xb0($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
___
if (0 == $last_eight) {
$code .= <<___;
vpsrldq \$0xf, $tw2, %zmm13
vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14
vpslldq \$0x1, $tw2, %zmm16
vpxord %zmm14, %zmm16, %zmm16
___
}
$code .= <<___;
# round 4
vbroadcasti32x4 0xc0($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
# round 5
vbroadcasti32x4 0xd0($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
# round 6
vbroadcasti32x4 0xe0($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
# round 7
vbroadcasti32x4 0xf0($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
# round 8
vbroadcasti32x4 0x100($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
# round 9
vbroadcasti32x4 0x110($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
# round 10
vbroadcasti32x4 0x120($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
# round 11
vbroadcasti32x4 0x130($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
# round 12
vbroadcasti32x4 0x140($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
# round 13
vbroadcasti32x4 0x150($TW), $t0
vaesdec $t0, $st1, $st1
vaesdec $t0, $st2, $st2
# round 14
vbroadcasti32x4 0x160($TW), $t0
vaesdeclast $t0, $st1, $st1
vaesdeclast $t0, $st2, $st2
# xor Tweak values
vpxorq $tw1, $st1, $st1
vpxorq $tw2, $st2, $st2
# load next Tweak values
vmovdqa32 %zmm15, $tw1
vmovdqa32 %zmm16, $tw2
___
}
# Encrypt 16 blocks in parallel
# generate next 16 tweak values
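# With 16 tweaks in flight (four ZMM registers holding the x^0..x^15
# multiples), the next batch is built from two chained x^8 steps:
# tw[2]*x^8 and tw[3]*x^8 (zmm15/zmm16) become the new tw[0]/tw[1], and
# multiplying those by x^8 again (zmm17/zmm18) yields the new tw[2]/tw[3].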
sub encrypt_by_16_zmm {
my @st;
$st[0] = $_[0];
$st[1] = $_[1];
$st[2] = $_[2];
$st[3] = $_[3];
my @tw;
$tw[0] = $_[4];
$tw[1] = $_[5];
$tw[2] = $_[6];
$tw[3] = $_[7];
my $t0 = $_[8];
my $last_eight = $_[9];
# xor Tweak values
for (my $i = 0; $i < 4; $i++) {
$code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n";
}
# ARK
$code .= "vbroadcasti32x4 0x80($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vpxorq $t0, $st[$i], $st[$i]\n";
}
if (0 == $last_eight) {
$code .= <<___;
vpsrldq \$0xf, $tw[2], %zmm13
vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14
vpslldq \$0x1, $tw[2], %zmm15
vpxord %zmm14, %zmm15, %zmm15
___
}
# round 1
$code .= "vbroadcasti32x4 0x90($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 2
$code .= "vbroadcasti32x4 0xa0($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 3
$code .= "vbroadcasti32x4 0xb0($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
if (0 == $last_eight) {
$code .= <<___;
vpsrldq \$0xf, $tw[3], %zmm13
vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14
vpslldq \$0x1, $tw[3], %zmm16
vpxord %zmm14, %zmm16, %zmm16
___
}
# round 4
$code .= "vbroadcasti32x4 0xc0($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 5
$code .= "vbroadcasti32x4 0xd0($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 6
$code .= "vbroadcasti32x4 0xe0($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
if (0 == $last_eight) {
$code .= <<___;
vpsrldq \$0xf, %zmm15, %zmm13
vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14
vpslldq \$0x1, %zmm15, %zmm17
vpxord %zmm14, %zmm17, %zmm17
___
}
# round 7
$code .= "vbroadcasti32x4 0xf0($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 8
$code .= "vbroadcasti32x4 0x100($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 9
$code .= "vbroadcasti32x4 0x110($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
if (0 == $last_eight) {
$code .= <<___;
vpsrldq \$0xf, %zmm16, %zmm13
vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14
vpslldq \$0x1, %zmm16, %zmm18
vpxord %zmm14, %zmm18, %zmm18
___
}
# round 10
$code .= "vbroadcasti32x4 0x120($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 11
$code .= "vbroadcasti32x4 0x130($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 12
$code .= "vbroadcasti32x4 0x140($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 13
$code .= "vbroadcasti32x4 0x150($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenc $t0, $st[$i], $st[$i]\n";
}
# round 14
$code .= "vbroadcasti32x4 0x160($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesenclast $t0, $st[$i], $st[$i]\n";
}
# xor Tweak values
for (my $i = 0; $i < 4; $i++) {
$code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n";
}
$code .= <<___;
# load next Tweak values
vmovdqa32 %zmm15, $tw[0]
vmovdqa32 %zmm16, $tw[1]
vmovdqa32 %zmm17, $tw[2]
vmovdqa32 %zmm18, $tw[3]
___
}
# Decrypt 16 blocks in parallel
# generate next 16 tweak values
sub decrypt_by_16_zmm {
my @st;
$st[0] = $_[0];
$st[1] = $_[1];
$st[2] = $_[2];
$st[3] = $_[3];
my @tw;
$tw[0] = $_[4];
$tw[1] = $_[5];
$tw[2] = $_[6];
$tw[3] = $_[7];
my $t0 = $_[8];
my $last_eight = $_[9];
# xor Tweak values
for (my $i = 0; $i < 4; $i++) {
$code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n";
}
# ARK
$code .= "vbroadcasti32x4 0x80($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vpxorq $t0, $st[$i], $st[$i]\n";
}
if (0 == $last_eight) {
$code .= <<___;
vpsrldq \$0xf, $tw[2], %zmm13
vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14
vpslldq \$0x1, $tw[2], %zmm15
vpxord %zmm14, %zmm15, %zmm15
___
}
# round 1
$code .= "vbroadcasti32x4 0x90($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 2
$code .= "vbroadcasti32x4 0xa0($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 3
$code .= "vbroadcasti32x4 0xb0($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
if (0 == $last_eight) {
$code .= <<___;
vpsrldq \$0xf, $tw[3], %zmm13
vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14
vpslldq \$0x1, $tw[3], %zmm16
vpxord %zmm14, %zmm16, %zmm16
___
}
# round 4
$code .= "vbroadcasti32x4 0xc0($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 5
$code .= "vbroadcasti32x4 0xd0($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 6
$code .= "vbroadcasti32x4 0xe0($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
if (0 == $last_eight) {
$code .= <<___;
vpsrldq \$0xf, %zmm15, %zmm13
vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14
vpslldq \$0x1, %zmm15, %zmm17
vpxord %zmm14, %zmm17, %zmm17
___
}
# round 7
$code .= "vbroadcasti32x4 0xf0($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 8
$code .= "vbroadcasti32x4 0x100($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 9
$code .= "vbroadcasti32x4 0x110($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
if (0 == $last_eight) {
$code .= <<___;
vpsrldq \$0xf, %zmm16, %zmm13
vpclmulqdq \$0x0,$ZPOLY, %zmm13, %zmm14
vpslldq \$0x1, %zmm16, %zmm18
vpxord %zmm14, %zmm18, %zmm18
___
}
# round 10
$code .= "vbroadcasti32x4 0x120($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 11
$code .= "vbroadcasti32x4 0x130($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 12
$code .= "vbroadcasti32x4 0x140($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 13
$code .= "vbroadcasti32x4 0x150($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdec $t0, $st[$i], $st[$i]\n";
}
# round 14
$code .= "vbroadcasti32x4 0x160($TW), $t0\n";
for (my $i = 0; $i < 4; $i++) {
$code .= "vaesdeclast $t0, $st[$i], $st[$i]\n";
}
# xor Tweak values
for (my $i = 0; $i < 4; $i++) {
$code .= "vpxorq $tw[$i], $st[$i], $st[$i]\n";
}
$code .= <<___;
# load next Tweak values
vmovdqa32 %zmm15, $tw[0]
vmovdqa32 %zmm16, $tw[1]
vmovdqa32 %zmm17, $tw[2]
vmovdqa32 %zmm18, $tw[3]
___
}
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void aes_hw_xts_encrypt_avx512(
# ; const uint8_t *in, // input data
# ; uint8_t *out, // output data
# ; size_t length, // sector size, in bytes
# ; const AES_KEY *key1, // expanded AES-256 key (15 x 16-byte round keys) used for "ECB" encryption
# ; const AES_KEY *key2, // expanded AES-256 key (15 x 16-byte round keys) used for tweak encryption
# ; const uint8_t iv[16]) // initial tweak value, 16 bytes
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
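# A hypothetical C-side call sketch (the buffer and key names below are
# illustrative, not taken from the real callers):
#   AES_KEY data_key, tweak_key;
#   AES_set_encrypt_key(key1_bytes, 256, &data_key);  // key1: data ("ECB") key
#   AES_set_encrypt_key(key2_bytes, 256, &tweak_key); // key2: tweak key
#   aes_hw_xts_encrypt_avx512(in, out, length, &data_key, &tweak_key, iv);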
my $rndsuffix = &random_string();
$code .= <<___;
#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
.text
___
{
$code.=<<___;
.globl aes_hw_xts_encrypt_avx512
.hidden aes_hw_xts_encrypt_avx512
.type aes_hw_xts_encrypt_avx512,\@abi-omnipotent
.align 32
aes_hw_xts_encrypt_avx512:
.cfi_startproc
endbranch
___
}
$code .= "push %rbp\n";
$code .= "mov %rsp,%rbp\n";
$code .= "sub \$$VARIABLE_OFFSET,%rsp\n";
$code .= "and \$0xffffffffffffffc0,%rsp\n";
$code .= "mov %rbx,$GP_STORAGE($TW)\n";
if ($win64) {
$code .= "mov %rdi,$GP_STORAGE + 8*1($TW)\n";
$code .= "mov %rsi,$GP_STORAGE + 8*2($TW)\n";
$code .= "vmovdqa %xmm6, $XMM_STORAGE + 16*0($TW)\n";
$code .= "vmovdqa %xmm7, $XMM_STORAGE + 16*1($TW)\n";
$code .= "vmovdqa %xmm8, $XMM_STORAGE + 16*2($TW)\n";
$code .= "vmovdqa %xmm9, $XMM_STORAGE + 16*3($TW)\n";
$code .= "vmovdqa %xmm10, $XMM_STORAGE + 16*4($TW)\n";
$code .= "vmovdqa %xmm11, $XMM_STORAGE + 16*5($TW)\n";
$code .= "vmovdqa %xmm12, $XMM_STORAGE + 16*6($TW)\n";
$code .= "vmovdqa %xmm13, $XMM_STORAGE + 16*7($TW)\n";
$code .= "vmovdqa %xmm14, $XMM_STORAGE + 16*8($TW)\n";
$code .= "vmovdqa %xmm15, $XMM_STORAGE + 16*9($TW)\n";
}
$code .= "mov \$0x87, $gf_poly_8b\n";
$code .= "vmovdqu ($tweak),%xmm1\n"; # read initial tweak values
$code .= "vpxor %xmm4,%xmm4,%xmm4\n"; # for key expansion
encrypt_tweak_for_encryption("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4",
$key2, $key1, $TW);
if ($win64) {
$code .= "mov $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer
$code .= "mov $output, 8 + 8*6(%rbp)\n"; # plaintext pointer
}
{
$code.=<<___;
cmp \$0x80,$length
jl .L_less_than_128_bytes_${rndsuffix}
vpbroadcastq $gf_poly_8b,$ZPOLY
cmp \$0x100,$length
jge .L_start_by16_${rndsuffix}
cmp \$0x80,$length
jge .L_start_by8_${rndsuffix}
.L_do_n_blocks_${rndsuffix}:
cmp \$0x0,$length
je .L_ret_${rndsuffix}
cmp \$0x70,$length
jge .L_remaining_num_blocks_is_7_${rndsuffix}
cmp \$0x60,$length
jge .L_remaining_num_blocks_is_6_${rndsuffix}
cmp \$0x50,$length
jge .L_remaining_num_blocks_is_5_${rndsuffix}
cmp \$0x40,$length
jge .L_remaining_num_blocks_is_4_${rndsuffix}
cmp \$0x30,$length
jge .L_remaining_num_blocks_is_3_${rndsuffix}
cmp \$0x20,$length
jge .L_remaining_num_blocks_is_2_${rndsuffix}
cmp \$0x10,$length
jge .L_remaining_num_blocks_is_1_${rndsuffix}
# _remaining_num_blocks_is_0:
vmovdqa %xmm0,%xmm8 # xmm0 held the last cipher block from the main loop
vmovdqa %xmm9,%xmm0 # xmm9 holds the next tweak
jmp .L_steal_cipher_${rndsuffix}
.L_remaining_num_blocks_is_7_${rndsuffix}:
mov \$0xffffffffffffffff,$tmp1
shr \$0x10,$tmp1
kmovq $tmp1,%k1
vmovdqu8 ($input),%zmm1
vmovdqu8 0x40($input),%zmm2{%k1}
add \$0x70,$input
___
}
encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);
{
$code .= <<___;
vmovdqu8 %zmm1,($output)
vmovdqu8 %zmm2,0x40($output){%k1}
add \$0x70,$output
vextracti32x4 \$0x2,%zmm2,%xmm8
vextracti32x4 \$0x3,%zmm10,%xmm0
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_${rndsuffix}
.L_remaining_num_blocks_is_6_${rndsuffix}:
vmovdqu8 ($input),%zmm1
vmovdqu8 0x40($input),%ymm2
add \$0x60,$input
___
}
encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);
{
$code .= <<___;
vmovdqu8 %zmm1,($output)
vmovdqu8 %ymm2,0x40($output)
add \$0x60,$output
vextracti32x4 \$0x1,%zmm2,%xmm8
vextracti32x4 \$0x2,%zmm10,%xmm0
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_${rndsuffix}
.L_remaining_num_blocks_is_5_${rndsuffix}:
vmovdqu8 ($input),%zmm1
vmovdqu 0x40($input),%xmm2
add \$0x50,$input
___
}
encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);
{
$code .= <<___;
vmovdqu8 %zmm1,($output)
vmovdqu %xmm2,0x40($output)
add \$0x50,$output
vmovdqa %xmm2,%xmm8
vextracti32x4 \$0x1,%zmm10,%xmm0
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_${rndsuffix}
.L_remaining_num_blocks_is_4_${rndsuffix}:
vmovdqu8 ($input),%zmm1
add \$0x40,$input
___
}
encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);
{
$code .= <<___;
vmovdqu8 %zmm1,($output)
add \$0x40,$output
vextracti32x4 \$0x3,%zmm1,%xmm8
vextracti32x4 \$0x0,%zmm10,%xmm0
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_${rndsuffix}
___
}
{
$code .= <<___;
.L_remaining_num_blocks_is_3_${rndsuffix}:
vextracti32x4 \$0x1,%zmm9,%xmm10
vextracti32x4 \$0x2,%zmm9,%xmm11
vmovdqu ($input),%xmm1
vmovdqu 0x10($input),%xmm2
vmovdqu 0x20($input),%xmm3
add \$0x30,$input
___
}
encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
add \$0x30,$output
vmovdqa %xmm3,%xmm8
vextracti32x4 \$0x3,%zmm9,%xmm0
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_${rndsuffix}
___
}
{
$code .= <<___;
.L_remaining_num_blocks_is_2_${rndsuffix}:
vextracti32x4 \$0x1,%zmm9,%xmm10
vmovdqu ($input),%xmm1
vmovdqu 0x10($input),%xmm2
add \$0x20,$input
___
}
encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
add \$0x20,$output
vmovdqa %xmm2,%xmm8
vextracti32x4 \$0x2,%zmm9,%xmm0
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_${rndsuffix}
___
}
{
$code .= <<___;
.L_remaining_num_blocks_is_1_${rndsuffix}:
vmovdqu ($input),%xmm1
add \$0x10,$input
___
}
encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
add \$0x10,$output
vmovdqa %xmm1,%xmm8
vextracti32x4 \$0x1,%zmm9,%xmm0
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_${rndsuffix}
.L_start_by16_${rndsuffix}:
vbroadcasti32x4 (%rsp),%zmm0
vbroadcasti32x4 shufb_15_7(%rip),%zmm8
mov \$0xaa,$tmp1
kmovq $tmp1,%k2
# Mult tweak by 2^{3, 2, 1, 0}
vpshufb %zmm8,%zmm0,%zmm1
vpsllvq const_dq3210(%rip),%zmm0,%zmm4
vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3
vpxorq %zmm2,%zmm4,%zmm4{%k2}
vpxord %zmm4,%zmm3,%zmm9
# Mult tweak by 2^{7, 6, 5, 4}
vpsllvq const_dq7654(%rip),%zmm0,%zmm5
vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7
vpxorq %zmm6,%zmm5,%zmm5{%k2}
vpxord %zmm5,%zmm7,%zmm10
# Make the next 8 tweak values by multiplying each by 2^8
vpsrldq \$0xf,%zmm9,%zmm13
vpclmulqdq \$0x0,%zmm25,%zmm13,%zmm14
vpslldq \$0x1,%zmm9,%zmm11
vpxord %zmm14,%zmm11,%zmm11
vpsrldq \$0xf,%zmm10,%zmm15
vpclmulqdq \$0x0,%zmm25,%zmm15,%zmm16
vpslldq \$0x1,%zmm10,%zmm12
vpxord %zmm16,%zmm12,%zmm12
.L_main_loop_run_16_${rndsuffix}:
vmovdqu8 ($input),%zmm1
vmovdqu8 0x40($input),%zmm2
vmovdqu8 0x80($input),%zmm3
vmovdqu8 0xc0($input),%zmm4
add \$0x100,$input
___
}
encrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9",
"%zmm10", "%zmm11", "%zmm12", "%zmm0", 0);
{
$code .= <<___;
vmovdqu8 %zmm1,($output)
vmovdqu8 %zmm2,0x40($output)
vmovdqu8 %zmm3,0x80($output)
vmovdqu8 %zmm4,0xc0($output)
add \$0x100,$output
sub \$0x100,$length
cmp \$0x100,$length
jge .L_main_loop_run_16_${rndsuffix}
cmp \$0x80,$length
jge .L_main_loop_run_8_${rndsuffix}
vextracti32x4 \$0x3,%zmm4,%xmm0
jmp .L_do_n_blocks_${rndsuffix}
.L_start_by8_${rndsuffix}:
# Make the first 7 tweak values
vbroadcasti32x4 (%rsp),%zmm0
vbroadcasti32x4 shufb_15_7(%rip),%zmm8
mov \$0xaa,$tmp1
kmovq $tmp1,%k2
# Mult tweak by 2^{3, 2, 1, 0}
vpshufb %zmm8,%zmm0,%zmm1
vpsllvq const_dq3210(%rip),%zmm0,%zmm4
vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3
vpxorq %zmm2,%zmm4,%zmm4{%k2}
vpxord %zmm4,%zmm3,%zmm9
# Mult tweak by 2^{7, 6, 5, 4}
vpsllvq const_dq7654(%rip),%zmm0,%zmm5
vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7
vpxorq %zmm6,%zmm5,%zmm5{%k2}
vpxord %zmm5,%zmm7,%zmm10
.L_main_loop_run_8_${rndsuffix}:
vmovdqu8 ($input),%zmm1
vmovdqu8 0x40($input),%zmm2
add \$0x80,$input
___
}
encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0);
{
$code .= <<___;
vmovdqu8 %zmm1,($output)
vmovdqu8 %zmm2,0x40($output)
add \$0x80,$output
sub \$0x80,$length
cmp \$0x80,$length
jge .L_main_loop_run_8_${rndsuffix}
vextracti32x4 \$0x3,%zmm2,%xmm0
jmp .L_do_n_blocks_${rndsuffix}
.L_steal_cipher_next_${rndsuffix}:
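# One more tweak is needed for the partial tail: advance the saved tweak
# (kept in $TWTEMPL/$TWTEMPH) by one multiply-by-x step and materialize it
# at ($TW) for the stealing code below.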
xor $gf_poly_8b_temp,$gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH,$TWTEMPH
cmovc $gf_poly_8b,$gf_poly_8b_temp
xor $gf_poly_8b_temp,$TWTEMPL
mov $TWTEMPL,($TW)
mov $TWTEMPH,0x8($TW)
vmovdqa ($TW),%xmm0
.L_steal_cipher_${rndsuffix}:
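# Ciphertext stealing: %xmm8 holds the last full ciphertext block and
# %xmm0 the tweak for the final partial block. The first bytes of %xmm8
# become the final partial ciphertext, its remaining bytes are merged with
# the partial plaintext tail (using the vpshufb_shf_table masks), and the
# merged block is encrypted with one more full AES pass over the stacked
# round keys and stored as the last full output block.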
vmovdqa %xmm8,%xmm2
lea vpshufb_shf_table(%rip),$TWTEMPL
vmovdqu ($TWTEMPL,$length,1),%xmm10
vpshufb %xmm10,%xmm8,%xmm8
vmovdqu -0x10($input,$length,1),%xmm3
vmovdqu %xmm8,-0x10($output,$length,1)
lea vpshufb_shf_table(%rip),$TWTEMPL
add \$16, $TWTEMPL
sub $length,$TWTEMPL
vmovdqu ($TWTEMPL),%xmm10
vpxor mask1(%rip),%xmm10,%xmm10
vpshufb %xmm10,%xmm3,%xmm3
vpblendvb %xmm10,%xmm2,%xmm3,%xmm3
vpxor %xmm0,%xmm3,%xmm8
vpxor 0x80(%rsp),%xmm8,%xmm8
vaesenc 0x90(%rsp),%xmm8,%xmm8
vaesenc 0xa0(%rsp),%xmm8,%xmm8
vaesenc 0xb0(%rsp),%xmm8,%xmm8
vaesenc 0xc0(%rsp),%xmm8,%xmm8
vaesenc 0xd0(%rsp),%xmm8,%xmm8
vaesenc 0xe0(%rsp),%xmm8,%xmm8
vaesenc 0xf0(%rsp),%xmm8,%xmm8
vaesenc 0x100(%rsp),%xmm8,%xmm8
vaesenc 0x110(%rsp),%xmm8,%xmm8
vaesenc 0x120(%rsp),%xmm8,%xmm8
vaesenc 0x130(%rsp),%xmm8,%xmm8
vaesenc 0x140(%rsp),%xmm8,%xmm8
vaesenc 0x150(%rsp),%xmm8,%xmm8
vaesenclast 0x160(%rsp),%xmm8,%xmm8
vpxor %xmm0,%xmm8,%xmm8
vmovdqu %xmm8,-0x10($output)
___
}
{
$code .= <<___;
.L_ret_${rndsuffix}:
mov $GP_STORAGE($TW),%rbx
xor $tmp1,$tmp1
mov $tmp1,$GP_STORAGE($TW)
# Zero-out the whole of `%zmm0`.
vpxorq %zmm0,%zmm0,%zmm0
___
}
if ($win64) {
$code .= <<___;
mov $GP_STORAGE + 8*1($TW),%rdi
mov $tmp1,$GP_STORAGE + 8*1($TW)
mov $GP_STORAGE + 8*2($TW),%rsi
mov $tmp1,$GP_STORAGE + 8*2($TW)
vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6
vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7
vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8
vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9
# Zero the 64 bytes we just restored to the xmm registers.
vmovdqa64 %zmm0,$XMM_STORAGE($TW)
vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10
vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11
vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12
vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13
# And again.
vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW)
vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14
vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15
# The last store is only 32 bytes (256 bits), so we use `%ymm` as the
# source operand.
vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW)
___
}
{
$code .= <<___;
# Zero out the stack area holding `key1`'s round keys, 64 bytes at a time.
vmovdqa64 %zmm0,0x80(%rsp)
vmovdqa64 %zmm0,0xc0(%rsp)
vmovdqa64 %zmm0,0x100(%rsp)
# The area is not a multiple of 64 bytes, so use a kmask register to
# move only 48 bytes (6 quadwords).
mov \$0x3f,$tmp1
kmovq $tmp1,%k2
vmovdqa64 %zmm0,0x140(%rsp){%k2}
mov %rbp,%rsp
pop %rbp
vzeroupper
ret
.L_less_than_128_bytes_${rndsuffix}:
cmp \$0x10,$length
jb .L_ret_${rndsuffix}
mov $length,$tmp1
and \$0x70,$tmp1
cmp \$0x60,$tmp1
je .L_num_blocks_is_6_${rndsuffix}
cmp \$0x50,$tmp1
je .L_num_blocks_is_5_${rndsuffix}
cmp \$0x40,$tmp1
je .L_num_blocks_is_4_${rndsuffix}
cmp \$0x30,$tmp1
je .L_num_blocks_is_3_${rndsuffix}
cmp \$0x20,$tmp1
je .L_num_blocks_is_2_${rndsuffix}
cmp \$0x10,$tmp1
je .L_num_blocks_is_1_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_7_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 7);
$code .= "add \$0x70,$input\n";
encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
vmovdqu %xmm4,0x30($output)
vmovdqu %xmm5,0x40($output)
vmovdqu %xmm6,0x50($output)
vmovdqu %xmm7,0x60($output)
add \$0x70,$output
vmovdqa %xmm7,%xmm8
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_next_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_6_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 6);
$code .= "add \$0x60,$input\n";
encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
vmovdqu %xmm4,0x30($output)
vmovdqu %xmm5,0x40($output)
vmovdqu %xmm6,0x50($output)
add \$0x60,$output
vmovdqa %xmm6,%xmm8
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_next_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_5_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 5);
$code .= "add \$0x50,$input\n";
encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
vmovdqu %xmm4,0x30($output)
vmovdqu %xmm5,0x40($output)
add \$0x50,$output
vmovdqa %xmm5,%xmm8
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_next_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_4_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 4);
$code .= "add \$0x40, $input\n";
encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
vmovdqu %xmm4,0x30($output)
add \$0x40,$output
vmovdqa %xmm4,%xmm8
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_next_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_3_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 3);
$code .= "add \$0x30,$input\n";
encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
add \$0x30,$output
vmovdqa %xmm3,%xmm8
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_next_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_2_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 2);
$code .= "add \$0x20,$input\n";
encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
add \$0x20,$output
vmovdqa %xmm2,%xmm8
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_next_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_1_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 1);
$code .= "add \$0x10,$input\n";
encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
add \$0x10,$output
vmovdqa %xmm1,%xmm8
and \$0xf,$length
je .L_ret_${rndsuffix}
jmp .L_steal_cipher_next_${rndsuffix}
.cfi_endproc
___
}
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;void aes_hw_xts_decrypt_avx512(
# ; const uint8_t *in, // input data
# ; uint8_t *out, // output data
# ; size_t length, // sector size, in bytes
# ; const AES_KEY *key1, // expanded AES-256 key (15 x 16-byte round keys) used for "ECB" decryption
# ; const AES_KEY *key2, // expanded AES-256 key (15 x 16-byte round keys) used for tweak encryption
# ; const uint8_t iv[16]) // initial tweak value, 16 bytes
# ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
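# Note: with ciphertext stealing, the XTS decrypt path swaps the tweak
# order of the last two blocks (the last full block uses the *next* tweak
# and the partial tail uses the current one), which is why several of the
# "_remain" paths below shuffle tweak lanes with vextracti32x4/vinserti32x4
# before processing the final blocks.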
my $rndsuffix = &random_string();
{
$code.=<<___;
.globl aes_hw_xts_decrypt_avx512
.hidden aes_hw_xts_decrypt_avx512
.type aes_hw_xts_decrypt_avx512,\@abi-omnipotent
.align 32
aes_hw_xts_decrypt_avx512:
.cfi_startproc
endbranch
___
}
$code .= "push %rbp\n";
$code .= "mov %rsp,%rbp\n";
$code .= "sub \$$VARIABLE_OFFSET,%rsp\n";
$code .= "and \$0xffffffffffffffc0,%rsp\n";
$code .= "mov %rbx,$GP_STORAGE($TW)\n";
if ($win64) {
$code .= "mov %rdi,$GP_STORAGE + 8*1($TW)\n";
$code .= "mov %rsi,$GP_STORAGE + 8*2($TW)\n";
$code .= "vmovdqa %xmm6, $XMM_STORAGE + 16*0($TW)\n";
$code .= "vmovdqa %xmm7, $XMM_STORAGE + 16*1($TW)\n";
$code .= "vmovdqa %xmm8, $XMM_STORAGE + 16*2($TW)\n";
$code .= "vmovdqa %xmm9, $XMM_STORAGE + 16*3($TW)\n";
$code .= "vmovdqa %xmm10, $XMM_STORAGE + 16*4($TW)\n";
$code .= "vmovdqa %xmm11, $XMM_STORAGE + 16*5($TW)\n";
$code .= "vmovdqa %xmm12, $XMM_STORAGE + 16*6($TW)\n";
$code .= "vmovdqa %xmm13, $XMM_STORAGE + 16*7($TW)\n";
$code .= "vmovdqa %xmm14, $XMM_STORAGE + 16*8($TW)\n";
$code .= "vmovdqa %xmm15, $XMM_STORAGE + 16*9($TW)\n";
}
$code .= "mov \$0x87, $gf_poly_8b\n";
$code .= "vmovdqu ($tweak),%xmm1\n"; # read initial tweak values
$code .= "vpxor %xmm4,%xmm4,%xmm4\n"; # for key expansion
encrypt_tweak_for_decryption("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4",
$key2, $key1, $TW);
if ($win64) {
$code .= "mov $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer
$code .= "mov $output, 8 + 8*6(%rbp)\n"; # plaintext pointer
}
{
$code.=<<___;
cmp \$0x80,$length
jb .L_less_than_128_bytes_${rndsuffix}
vpbroadcastq $gf_poly_8b,$ZPOLY
cmp \$0x100,$length
jge .L_start_by16_${rndsuffix}
jmp .L_start_by8_${rndsuffix}
.L_do_n_blocks_${rndsuffix}:
cmp \$0x0,$length
je .L_ret_${rndsuffix}
cmp \$0x70,$length
jge .L_remaining_num_blocks_is_7_${rndsuffix}
cmp \$0x60,$length
jge .L_remaining_num_blocks_is_6_${rndsuffix}
cmp \$0x50,$length
jge .L_remaining_num_blocks_is_5_${rndsuffix}
cmp \$0x40,$length
jge .L_remaining_num_blocks_is_4_${rndsuffix}
cmp \$0x30,$length
jge .L_remaining_num_blocks_is_3_${rndsuffix}
cmp \$0x20,$length
jge .L_remaining_num_blocks_is_2_${rndsuffix}
cmp \$0x10,$length
jge .L_remaining_num_blocks_is_1_${rndsuffix}
# _remaining_num_blocks_is_0:
vmovdqu %xmm5, %xmm1
# xmm5 holds the last full block, to be decrypted with the next tweak
___
}
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);
{
$code .= <<___;
vmovdqu %xmm1, -0x10($output)
vmovdqa %xmm1, %xmm8
# Calc previous tweak
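# i.e. divide the tweak by x in GF(2^128): conditionally fold the
# polynomial back in, shift the 128-bit value right by one bit, and move
# the former bit 0 into bit 127. The vpshrdq below is emitted as raw .byte
# values, presumably so that assemblers without AVX512_VBMI2 support can
# still process this file.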
mov \$0x1,$tmp1
kmovq $tmp1, %k1
vpsllq \$0x3f,%xmm9,%xmm13
vpsraq \$0x3f,%xmm13,%xmm14
vpandq %xmm25,%xmm14,%xmm5
vpxorq %xmm5,%xmm9,%xmm9{%k1}
vpsrldq \$0x8,%xmm9,%xmm10
.byte 98, 211, 181, 8, 115, 194, 1 #vpshrdq \$0x1,%xmm10,%xmm9,%xmm0
vpslldq \$0x8,%xmm13,%xmm13
vpxorq %xmm13,%xmm0,%xmm0
jmp .L_steal_cipher_${rndsuffix}
.L_remaining_num_blocks_is_7_${rndsuffix}:
mov \$0xffffffffffffffff,$tmp1
shr \$0x10,$tmp1
kmovq $tmp1,%k1
vmovdqu8 ($input),%zmm1
vmovdqu8 0x40($input),%zmm2{%k1}
add \$0x70,$input
and \$0xf,$length
je .L_done_7_remain_${rndsuffix}
vextracti32x4 \$0x2,%zmm10,%xmm12
vextracti32x4 \$0x3,%zmm10,%xmm13
vinserti32x4 \$0x2,%xmm13,%zmm10,%zmm10
___
}
decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);
{
$code .= <<___;
vmovdqu8 %zmm1, ($output)
vmovdqu8 %zmm2, 0x40($output){%k1}
add \$0x70, $output
vextracti32x4 \$0x2,%zmm2,%xmm8
vmovdqa %xmm12,%xmm0
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_7_remain_${rndsuffix}:\n";
decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);
{
$code .= <<___;
vmovdqu8 %zmm1, ($output)
vmovdqu8 %zmm2, 0x40($output){%k1}
jmp .L_ret_${rndsuffix}
.L_remaining_num_blocks_is_6_${rndsuffix}:
vmovdqu8 ($input),%zmm1
vmovdqu8 0x40($input),%ymm2
add \$0x60,$input
and \$0xf, $length
je .L_done_6_remain_${rndsuffix}
vextracti32x4 \$0x1,%zmm10,%xmm12
vextracti32x4 \$0x2,%zmm10,%xmm13
vinserti32x4 \$0x1,%xmm13,%zmm10,%zmm10
___
}
decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);
{
$code .= <<___;
vmovdqu8 %zmm1, ($output)
vmovdqu8 %ymm2, 0x40($output)
add \$0x60,$output
vextracti32x4 \$0x1,%zmm2,%xmm8
vmovdqa %xmm12,%xmm0
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_6_remain_${rndsuffix}:\n";
decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);
{
$code .= <<___;
vmovdqu8 %zmm1, ($output)
vmovdqu8 %ymm2,0x40($output)
jmp .L_ret_${rndsuffix}
.L_remaining_num_blocks_is_5_${rndsuffix}:
vmovdqu8 ($input),%zmm1
vmovdqu 0x40($input),%xmm2
add \$0x50,$input
and \$0xf,$length
je .L_done_5_remain_${rndsuffix}
vmovdqa %xmm10,%xmm12
vextracti32x4 \$0x1,%zmm10,%xmm10
___
}
decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);
{
$code .= <<___;
vmovdqu8 %zmm1, ($output)
vmovdqu %xmm2, 0x40($output)
add \$0x50, $output
vmovdqa %xmm2,%xmm8
vmovdqa %xmm12,%xmm0
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_5_remain_${rndsuffix}:\n";
decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);
{
$code .= <<___;
vmovdqu8 %zmm1, ($output)
vmovdqu8 %xmm2, 0x40($output)
jmp .L_ret_${rndsuffix}
.L_remaining_num_blocks_is_4_${rndsuffix}:
vmovdqu8 ($input),%zmm1
add \$0x40,$input
and \$0xf, $length
je .L_done_4_remain_${rndsuffix}
vextracti32x4 \$0x3,%zmm9,%xmm12
vinserti32x4 \$0x3,%xmm10,%zmm9,%zmm9
___
}
decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);
{
$code .= <<___;
vmovdqu8 %zmm1,($output)
add \$0x40,$output
vextracti32x4 \$0x3,%zmm1,%xmm8
vmovdqa %xmm12,%xmm0
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_4_remain_${rndsuffix}:\n";
decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);
{
$code .= <<___;
vmovdqu8 %zmm1, ($output)
jmp .L_ret_${rndsuffix}
.L_remaining_num_blocks_is_3_${rndsuffix}:
vmovdqu ($input),%xmm1
vmovdqu 0x10($input),%xmm2
vmovdqu 0x20($input),%xmm3
add \$0x30,$input
and \$0xf,$length
je .L_done_3_remain_${rndsuffix}
vextracti32x4 \$0x2,%zmm9,%xmm13
vextracti32x4 \$0x1,%zmm9,%xmm10
vextracti32x4 \$0x3,%zmm9,%xmm11
___
}
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
add \$0x30,$output
vmovdqa %xmm3,%xmm8
vmovdqa %xmm13,%xmm0
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_3_remain_${rndsuffix}:\n";
$code .= "vextracti32x4 \$0x1,%zmm9,%xmm10\n";
$code .= "vextracti32x4 \$0x2,%zmm9,%xmm11\n";
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
jmp .L_ret_${rndsuffix}
.L_remaining_num_blocks_is_2_${rndsuffix}:
vmovdqu ($input),%xmm1
vmovdqu 0x10($input),%xmm2
add \$0x20,$input
and \$0xf,$length
je .L_done_2_remain_${rndsuffix}
vextracti32x4 \$0x2,%zmm9,%xmm10
vextracti32x4 \$0x1,%zmm9,%xmm12
___
}
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
add \$0x20,$output
vmovdqa %xmm2,%xmm8
vmovdqa %xmm12,%xmm0
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_2_remain_${rndsuffix}:\n";
$code .= "vextracti32x4 \$0x1,%zmm9,%xmm10\n";
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
jmp .L_ret_${rndsuffix}
.L_remaining_num_blocks_is_1_${rndsuffix}:
vmovdqu ($input),%xmm1
add \$0x10,$input
and \$0xf,$length
je .L_done_1_remain_${rndsuffix}
vextracti32x4 \$0x1,%zmm9,%xmm11
___
}
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm11", "%xmm10", "%xmm9", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
add \$0x10,$output
vmovdqa %xmm1,%xmm8
vmovdqa %xmm9,%xmm0
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_1_remain_${rndsuffix}:\n";
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);
{
$code .= <<___;
vmovdqu %xmm1, ($output)
jmp .L_ret_${rndsuffix}
.L_start_by16_${rndsuffix}:
vbroadcasti32x4 ($TW),%zmm0
vbroadcasti32x4 shufb_15_7(%rip),%zmm8
mov \$0xaa,$tmp1
kmovq $tmp1,%k2
# Mult tweak by 2^{3, 2, 1, 0}
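# Each 128-bit lane of zmm0 holds a copy of the tweak; lane i must
# become tweak * x^i in GF(2^128). vpsllvq shifts each lane left by i
# bits, the masked vpxorq (k2 = 0xaa) carries the bits crossing the
# qword boundary into the high qword, and vpclmulqdq against the XTS
# polynomial folds the bits shifted out of bit 127 back into the low
# qword.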
vpshufb %zmm8,%zmm0,%zmm1
vpsllvq const_dq3210(%rip),%zmm0,%zmm4
vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
vpclmulqdq \$0x0,$ZPOLY,%zmm2,%zmm3
vpxorq %zmm2,%zmm4,%zmm4{%k2}
vpxord %zmm4,%zmm3,%zmm9
# Mult tweak by 2^{7, 6, 5, 4}
vpsllvq const_dq7654(%rip),%zmm0,%zmm5
vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7
vpxorq %zmm6,%zmm5,%zmm5{%k2}
vpxord %zmm5,%zmm7,%zmm10
# Make the next 8 tweak values by multiplying each by x^8
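# A one-byte left shift (vpslldq) multiplies each lane by x^8; the
# byte shifted out (captured by vpsrldq) is reduced with vpclmulqdq
# against the polynomial and folded back in.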
vpsrldq \$0xf,%zmm9,%zmm13
vpclmulqdq \$0x0,%zmm25,%zmm13,%zmm14
vpslldq \$0x1,%zmm9,%zmm11
vpxord %zmm14,%zmm11,%zmm11
vpsrldq \$0xf,%zmm10,%zmm15
vpclmulqdq \$0x0,%zmm25,%zmm15,%zmm16
vpslldq \$0x1,%zmm10,%zmm12
vpxord %zmm16,%zmm12,%zmm12
.L_main_loop_run_16_${rndsuffix}:
vmovdqu8 ($input),%zmm1
vmovdqu8 0x40($input),%zmm2
vmovdqu8 0x80($input),%zmm3
vmovdqu8 0xc0($input),%zmm4
vmovdqu8 0xf0($input),%xmm5
add \$0x100,$input
___
}
decrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9",
"%zmm10", "%zmm11", "%zmm12", "%zmm0", 0);
{
$code .= <<___;
vmovdqu8 %zmm1,($output)
vmovdqu8 %zmm2,0x40($output)
vmovdqu8 %zmm3,0x80($output)
vmovdqu8 %zmm4,0xc0($output)
add \$0x100,$output
sub \$0x100,$length
cmp \$0x100,$length
jge .L_main_loop_run_16_${rndsuffix}
cmp \$0x80,$length
jge .L_main_loop_run_8_${rndsuffix}
jmp .L_do_n_blocks_${rndsuffix}
.L_start_by8_${rndsuffix}:
# Make the first 7 tweak values
vbroadcasti32x4 ($TW),%zmm0
vbroadcasti32x4 shufb_15_7(%rip),%zmm8
mov \$0xaa,$tmp1
kmovq $tmp1,%k2
# Mult tweak by 2^{3, 2, 1, 0}
vpshufb %zmm8,%zmm0,%zmm1
vpsllvq const_dq3210(%rip),%zmm0,%zmm4
vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
vpclmulqdq \$0x0,%zmm25,%zmm2,%zmm3
vpxorq %zmm2,%zmm4,%zmm4{%k2}
vpxord %zmm4,%zmm3,%zmm9
# Mult tweak by 2^{7, 6, 5, 4}
vpsllvq const_dq7654(%rip),%zmm0,%zmm5
vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
vpclmulqdq \$0x0,%zmm25,%zmm6,%zmm7
vpxorq %zmm6,%zmm5,%zmm5{%k2}
vpxord %zmm5,%zmm7,%zmm10
.L_main_loop_run_8_${rndsuffix}:
vmovdqu8 ($input),%zmm1
vmovdqu8 0x40($input),%zmm2
vmovdqu8 0x70($input),%xmm5
add \$0x80,$input
___
}
decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0);
{
$code .= <<___;
vmovdqu8 %zmm1,($output)
vmovdqu8 %zmm2,0x40($output)
add \$0x80,$output
sub \$0x80,$length
cmp \$0x80,$length
jge .L_main_loop_run_8_${rndsuffix}
jmp .L_do_n_blocks_${rndsuffix}
.L_steal_cipher_${rndsuffix}:
# Start of ciphertext stealing: xmm8 = last cipher block, xmm0 = next tweak
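# On entry, xmm8 holds the block produced from the last full
# ciphertext block and xmm0 the tweak for the tail. The
# vpshufb_shf_table masks below shift xmm8 so its leading bytes become
# the final partial output block, then splice the partial input block
# onto the remaining bytes of xmm8 before the last AES pass.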
vmovdqa %xmm8,%xmm2
# shift xmm8 to the left by 16-N_val bytes
lea vpshufb_shf_table(%rip),$TWTEMPL
vmovdqu ($TWTEMPL,$length,1),%xmm10
vpshufb %xmm10,%xmm8,%xmm8
vmovdqu -0x10($input,$length,1),%xmm3
vmovdqu %xmm8,-0x10($output,$length,1)
# shift xmm3 to the right by 16-N_val bytes
lea vpshufb_shf_table(%rip), $TWTEMPL
add \$16, $TWTEMPL
sub $length,$TWTEMPL
vmovdqu ($TWTEMPL),%xmm10
vpxor mask1(%rip),%xmm10,%xmm10
vpshufb %xmm10,%xmm3,%xmm3
vpblendvb %xmm10,%xmm2,%xmm3,%xmm3
# xor Tweak value
vpxor %xmm0,%xmm3,%xmm8
# decrypt last block with cipher stealing
vpxor 0x80(%rsp),%xmm8,%xmm8
vaesdec 0x90(%rsp),%xmm8,%xmm8
vaesdec 0xa0(%rsp),%xmm8,%xmm8
vaesdec 0xb0(%rsp),%xmm8,%xmm8
vaesdec 0xc0(%rsp),%xmm8,%xmm8
vaesdec 0xd0(%rsp),%xmm8,%xmm8
vaesdec 0xe0(%rsp),%xmm8,%xmm8
vaesdec 0xf0(%rsp),%xmm8,%xmm8
vaesdec 0x100(%rsp),%xmm8,%xmm8
vaesdec 0x110(%rsp),%xmm8,%xmm8
vaesdec 0x120(%rsp),%xmm8,%xmm8
vaesdec 0x130(%rsp),%xmm8,%xmm8
vaesdec 0x140(%rsp),%xmm8,%xmm8
vaesdec 0x150(%rsp),%xmm8,%xmm8
vaesdeclast 0x160(%rsp),%xmm8,%xmm8
# xor Tweak value
vpxor %xmm0,%xmm8,%xmm8
.L_done_${rndsuffix}:
# store the last output block
vmovdqu %xmm8,-0x10($output)
___
}
{
$code .= <<___;
.L_ret_${rndsuffix}:
mov $GP_STORAGE($TW),%rbx
xor $tmp1,$tmp1
mov $tmp1,$GP_STORAGE($TW)
# Zero-out the whole of `%zmm0`.
vpxorq %zmm0,%zmm0,%zmm0
___
}
if ($win64) {
$code .= <<___;
mov $GP_STORAGE + 8*1($TW),%rdi
mov $tmp1,$GP_STORAGE + 8*1($TW)
mov $GP_STORAGE + 8*2($TW),%rsi
mov $tmp1,$GP_STORAGE + 8*2($TW)
vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6
vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7
vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8
vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9
# Zero the 64 bytes we just restored to the xmm registers.
vmovdqa64 %zmm0,$XMM_STORAGE($TW)
vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10
vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11
vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12
vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13
# Zero the next 64 bytes we just restored.
vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW)
vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14
vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15
# The last chunk is only 32 bytes (256 bits), so we use `%ymm0` as the
# source operand.
vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW)
___
}
{
$code .= <<___;
# Zero out the stack area holding the `key1` round keys, 64 bytes at a time.
vmovdqa64 %zmm0,0x80(%rsp)
vmovdqa64 %zmm0,0xc0(%rsp)
vmovdqa64 %zmm0,0x100(%rsp)
# The key-schedule area is not a multiple of 64 bytes, so we use a
# kmask register to store only the remaining 48 bytes (6 quad-words).
mov \$0x3f,$tmp1
kmovq $tmp1,%k2
vmovdqa64 %zmm0,0x140(%rsp){%k2}
mov %rbp,%rsp
pop %rbp
vzeroupper
ret
.L_less_than_128_bytes_${rndsuffix}:
cmp \$0x10,$length
jb .L_ret_${rndsuffix}
mov $length,$tmp1
and \$0x70,$tmp1
cmp \$0x60,$tmp1
je .L_num_blocks_is_6_${rndsuffix}
cmp \$0x50,$tmp1
je .L_num_blocks_is_5_${rndsuffix}
cmp \$0x40,$tmp1
je .L_num_blocks_is_4_${rndsuffix}
cmp \$0x30,$tmp1
je .L_num_blocks_is_3_${rndsuffix}
cmp \$0x20,$tmp1
je .L_num_blocks_is_2_${rndsuffix}
cmp \$0x10,$tmp1
je .L_num_blocks_is_1_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_7_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 7);
{
$code .= <<___;
add \$0x70,$input
and \$0xf,$length
je .L_done_7_${rndsuffix}
.L_steal_cipher_7_${rndsuffix}:
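# Advance the tweak: multiply the 128-bit value in $TWTEMPH:$TWTEMPL by
# x in GF(2^128) - shift left one bit and, on carry out of bit 127,
# xor the reduction constant (in $gf_poly_8b) into the low qword.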
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL,0x10($TW)
mov $TWTEMPH,0x18($TW)
vmovdqa64 %xmm15,%xmm16
vmovdqa 0x10(%rsp),%xmm15
___
}
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
vmovdqu %xmm4,0x30($output)
vmovdqu %xmm5,0x40($output)
vmovdqu %xmm6,0x50($output)
add \$0x70,$output
vmovdqa64 %xmm16,%xmm0
vmovdqa %xmm7,%xmm8
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_7_${rndsuffix}:\n";
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
vmovdqu %xmm4,0x30($output)
vmovdqu %xmm5,0x40($output)
vmovdqu %xmm6,0x50($output)
add \$0x70,$output
vmovdqa %xmm7,%xmm8
jmp .L_done_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_6_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 6);
{
$code .= <<___;
add \$0x60,$input
and \$0xf,$length
je .L_done_6_${rndsuffix}
.L_steal_cipher_6_${rndsuffix}:
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL,0x10($TW)
mov $TWTEMPH,0x18($TW)
vmovdqa64 %xmm14,%xmm15
vmovdqa 0x10(%rsp),%xmm14
___
}
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
vmovdqu %xmm4,0x30($output)
vmovdqu %xmm5,0x40($output)
add \$0x60,$output
vmovdqa %xmm15,%xmm0
vmovdqa %xmm6,%xmm8
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_6_${rndsuffix}:\n";
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
vmovdqu %xmm4,0x30($output)
vmovdqu %xmm5,0x40($output)
add \$0x60,$output
vmovdqa %xmm6,%xmm8
jmp .L_done_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_5_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 5);
{
$code .= <<___;
add \$0x50,$input
and \$0xf,$length
je .L_done_5_${rndsuffix}
.L_steal_cipher_5_${rndsuffix}:
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL,0x10($TW)
mov $TWTEMPH,0x18($TW)
vmovdqa64 %xmm13,%xmm14
vmovdqa 0x10($TW),%xmm13
___
}
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
vmovdqu %xmm4,0x30($output)
add \$0x50,$output
vmovdqa %xmm14,%xmm0
vmovdqa %xmm5,%xmm8
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_5_${rndsuffix}:\n";
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
vmovdqu %xmm4,0x30($output)
add \$0x50,$output
vmovdqa %xmm5,%xmm8
jmp .L_done_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_4_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 4);
{
$code .= <<___;
add \$0x40,$input
and \$0xf,$length
je .L_done_4_${rndsuffix}
.L_steal_cipher_4_${rndsuffix}:
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL,0x10($TW)
mov $TWTEMPH,0x18($TW)
vmovdqa64 %xmm12,%xmm13
vmovdqa 0x10($TW),%xmm12
___
}
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
add \$0x40,$output
vmovdqa %xmm13,%xmm0
vmovdqa %xmm4,%xmm8
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_4_${rndsuffix}:\n";
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
vmovdqu %xmm3,0x20($output)
add \$0x40,$output
vmovdqa %xmm4,%xmm8
jmp .L_done_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_3_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 3);
{
$code .= <<___;
add \$0x30,$input
and \$0xf,$length
je .L_done_3_${rndsuffix}
.L_steal_cipher_3_${rndsuffix}:
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL,0x10($TW)
mov $TWTEMPH,0x18($TW)
vmovdqa64 %xmm11,%xmm12
vmovdqa 0x10($TW),%xmm11
___
}
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
add \$0x30,$output
vmovdqa %xmm12,%xmm0
vmovdqa %xmm3,%xmm8
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_3_${rndsuffix}:\n";
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
vmovdqu %xmm2,0x10($output)
add \$0x30,$output
vmovdqa %xmm3,%xmm8
jmp .L_done_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_2_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 2);
{
$code .= <<___;
add \$0x20,$input
and \$0xf,$length
je .L_done_2_${rndsuffix}
.L_steal_cipher_2_${rndsuffix}:
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL,0x10($TW)
mov $TWTEMPH,0x18($TW)
vmovdqa64 %xmm10,%xmm11
vmovdqa 0x10($TW),%xmm10
___
}
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
add \$0x20,$output
vmovdqa %xmm11,%xmm0
vmovdqa %xmm2,%xmm8
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_2_${rndsuffix}:\n";
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1);
{
$code .= <<___;
vmovdqu %xmm1,($output)
add \$0x20,$output
vmovdqa %xmm2,%xmm8
jmp .L_done_${rndsuffix}
___
}
$code .= "\n.L_num_blocks_is_1_${rndsuffix}:\n";
initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", 1);
{
$code .= <<___;
add \$0x10,$input
and \$0xf,$length
je .L_done_1_${rndsuffix}
.L_steal_cipher_1_${rndsuffix}:
xor $gf_poly_8b_temp, $gf_poly_8b_temp
shl \$1, $TWTEMPL
adc $TWTEMPH, $TWTEMPH
cmovc $gf_poly_8b, $gf_poly_8b_temp
xor $gf_poly_8b_temp, $TWTEMPL
mov $TWTEMPL,0x10($TW)
mov $TWTEMPH,0x18($TW)
vmovdqa64 %xmm9,%xmm10
vmovdqa 0x10($TW),%xmm9
___
}
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);
{
$code .= <<___;
add \$0x10,$output
vmovdqa %xmm10,%xmm0
vmovdqa %xmm1,%xmm8
jmp .L_steal_cipher_${rndsuffix}
___
}
$code .= "\n.L_done_1_${rndsuffix}:\n";
decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
"%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
"%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);
{
$code .= <<___;
add \$0x10,$output
vmovdqa %xmm1,%xmm8
jmp .L_done_${rndsuffix}
.cfi_endproc
___
}
$code .= <<___;
.section .rodata
.align 16
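# Sliding-window shuffle masks for ciphertext stealing: a 16-byte load
# at offset N_val (1..15) yields a vpshufb mask that shifts a block by
# 16 - N_val bytes; mask bytes with the top bit set produce zero.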
vpshufb_shf_table:
.quad 0x8786858483828100, 0x8f8e8d8c8b8a8988
.quad 0x0706050403020100, 0x000e0d0c0b0a0908
mask1:
.quad 0x8080808080808080, 0x8080808080808080
const_dq3210:
.quad 0, 0, 1, 1, 2, 2, 3, 3
const_dq5678:
.quad 8, 8, 7, 7, 6, 6, 5, 5
const_dq7654:
.quad 4, 4, 5, 5, 6, 6, 7, 7
const_dq1234:
.quad 4, 4, 3, 3, 2, 2, 1, 1
shufb_15_7:
.byte 15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff
.byte 0xff, 0xff, 0xff, 0xff, 0xff
.text
#endif
___
} else {
$code .= <<___;
.text
.globl aes_hw_xts_encrypt_avx512
.globl aes_hw_xts_decrypt_avx512
aes_hw_xts_encrypt_avx512:
aes_hw_xts_decrypt_avx512:
.byte 0x0f,0x0b # ud2
ret
___
}
# Bits 7 & 4 hold bits 3 & 4 of the src1 register's index, inverted.
# Bits 6 & 5 hold bits 4 & 3 of the dst register's index, inverted.
# Bits 1 & 0 (the opcode-map field) are fixed to 10 for the vaesenc*/
# vaesdec* instructions and to 11 for vpclmulqdq.
sub evex_byte1 {
my ($mm, $src1, $dst) = @_;
# set default to zero
$src1 = 0 if (!defined($src1));
$dst = 0 if (!defined($dst));
my $byte = 0xf0 | $mm;
if (($src1 & 0x8) > 0) {
$byte = $byte & 0x7f;
}
if (($src1 & 0x10) > 0) {
$byte = $byte & 0xef;
}
if (($dst & 0x8) > 0) {
$byte = $byte & 0xdf;
}
if (($dst & 0x10) > 0) {
$byte = $byte & 0xbf;
}
return $byte;
}
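# Worked example: evex_byte1(0x02, 8, 9), e.g. a vaesdec with %xmm8 in
# ModRM.reg and %xmm9 in ModRM.r/m, clears bit 7 (8 & 0x8) and bit 5
# (9 & 0x8): 0xf2 -> 0x72 -> 0x52.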
# Bits 6->3 contain the lower 4 bits of the src2 register's index in
# inverted form. Bits 2->0 are fixed to 101.
sub evex_byte2 {
my $src2 = shift;
$src2 = ($src2 & 0x0f) ^ 0x0f;
return (($src2 << 3) | 0x05);
}
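# Worked example: evex_byte2(9) inverts the low four bits of 9
# (0b1001 -> 0b0110) and appends 0b101: (0x06 << 3) | 0x05 = 0x35.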
# Bits 6 & 5 encode the operand register type (vector length) and bit 3
# holds bit 4 of the src2 register's index in inverted form.
sub evex_byte3 {
my ($type, $src2) = @_;
my $byte = 0x0; # default for xmm registers
if ($type eq 'y') {
$byte = 0x01;
} elsif ($type eq 'z') {
$byte = 0x02;
}
$byte = $byte << 5;
if (!($src2 & 0x10)) {
$byte = $byte | 0x08;
}
return $byte;
}
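# Worked example: evex_byte3('z', 9) sets the length bits to 0b10
# (512-bit) and, since bit 4 of 9 is clear, sets the inverted high bit:
# (0x02 << 5) | 0x08 = 0x48.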
sub vpclmulqdq {
my $line = shift;
my @opcode = (0x62);
my $inst_type = 0x03; #vpclmulqdq
my %opcodelet = (
"vpclmulqdq" => 0x44,
);
if ($line=~/(vpclmul[a-z]+)\s+\$0x([0-9a-f]+),\s*%([xyz])mm([0-9]+),\s*%[xyz]mm([0-9]+),\s*%[xyz]mm([0-9]+)/) {
return undef if (!defined($opcodelet{$1}));
my $byte1 = evex_byte1($inst_type, $6, $4);
my $byte2 = evex_byte2($5);
my $byte3 = evex_byte3($3, $5);
my $modrm = 0xc0 | (($4 & 7) | (($6 & 7) << 3));
push @opcode,$byte1,$byte2,$byte3;
push @opcode,($opcodelet{$1});
push @opcode,$modrm;
push @opcode,hex($2);
return ".byte\t".join(',',@opcode);
}
return $line;
}
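# Worked example: the "vpclmulqdq $0x0,%zmm25,%zmm2,%zmm3" used in the
# tweak setup above encodes as
# .byte 0x62,0x93,0x6d,0x48,0x44,0xd9,0x00.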
sub vaesni {
my $line = shift;
my @opcode = (0x62);
my $inst_type = 0x02; # vaesenc
my ($byte1, $byte2, $byte3);
my %opcodelet = (
"vaesenc" => 0xdc, "vaesdec" => 0xde,
"vaesenclast" => 0xdd, "vaesdeclast" => 0xdf,
);
if ($line=~/(vaes[a-z]+)\s+%([xyz])mm([0-9]+),\s*%[xyz]mm([0-9]+),\s*%[xyz]mm([0-9]*)/) {
return undef if (!defined($opcodelet{$1}));
$byte1 = evex_byte1($inst_type, $5, $3);
$byte2 = evex_byte2($4);
$byte3 = evex_byte3($2, $4);
my $modrm = 0xc0 | ((($5 & 7) << 3) | ($3 & 7));
push @opcode,$byte1,$byte2,$byte3;
push @opcode,($opcodelet{$1});
push @opcode,$modrm;
return ".byte\t".join(',',@opcode);
} elsif ($line=~/(vaes[a-z]+)\s+0x([0-9a-f]+)\(%rsp\),\s*%([xyz])mm([0-9]+),\s*%[xyz]mm([0-9]+)/) {
return undef if (!defined($opcodelet{$1}));
$byte1 = evex_byte1($inst_type,$5);
$byte2 = evex_byte2($5);
$byte3 = evex_byte3($3, $5);
push @opcode,$byte1,$byte2,$byte3;
push @opcode,($opcodelet{$1});
my $rsp = 0x04;
my $modrm = 0x80 | ((($5 & 7) << 3) | $rsp);
push @opcode,$modrm;
push @opcode,0x24;
push @opcode, (hex($2) & 0xFF), ((hex($2) >> 8) & 0xFF);
push @opcode, ((hex($2) >> 16) & 0xFF), ((hex($2) >> 24) & 0xFF);
return ".byte\t".join(',',@opcode);
}
return $line;
}
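# Worked example: the memory form "vaesdec 0x90(%rsp),%xmm8,%xmm8"
# used in the cipher-stealing path encodes as
# .byte 0x62,0x72,0x3d,0x08,0xde,0x84,0x24,0x90,0x00,0x00,0x00.
# The substitutions below hand-encode the VAES/VPCLMULQDQ instructions
# as .byte sequences, presumably so the generated file assembles even
# with toolchains whose assemblers predate these extensions.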
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(vpclmul.*)$/vpclmulqdq($1)/gem;
$code =~ s/\b(vaesenc.*)$/vaesni($1)/gem;
$code =~ s/\b(vaesdec.*)$/vaesni($1)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";