#! /usr/bin/env perl
# Copyright (C) 2023 Intel Corporation
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# This implementation is based on the AES-XTS code (AVX512VAES + VPCLMULQDQ)
# from Intel(R) Intelligent Storage Acceleration Library Crypto Version
# (https://github.com/intel/isa-l_crypto).
#
######################################################################
# The main building block of the loop is code that encrypts/decrypts
# 8/16 blocks of data, stitched together with generation of the tweaks
# for the next 8/16 blocks, utilizing VAES and VPCLMULQDQ instructions at
# the full width of the ZMM registers. The main loop is selected based on
# the input length:
# main_loop_run_16 encrypts/decrypts 16 blocks in parallel and is selected
# when input length >= 256 bytes (16 blocks).
# main_loop_run_8 encrypts/decrypts 8 blocks in parallel and is selected
# when 128 bytes <= input length < 256 bytes (8-15 blocks).
# Input lengths < 128 bytes (fewer than 8 blocks) are handled by do_n_blocks.
#
# This implementation mainly relies on the EVEX-encoded vaesenc/vaesdec
# (VAES) and vpclmulqdq (VPCLMULQDQ) instructions on ZMM registers,
# together with AVX-512F/BW integer shifts, broadcasts and masked moves.
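
# As a reading aid only, the length dispatch described above boils down to
# the following sketch (pick_main_loop is our name and is never called):
sub pick_main_loop {
  my ($len) = @_;                              # input length in bytes
  return "main_loop_run_16" if ($len >= 256);  # 16 or more blocks
  return "main_loop_run_8"  if ($len >= 128);  # 8 to 15 blocks
  return "do_n_blocks";                        # fewer than 8 blocks
}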

# The first two arguments should always be the flavour and output file path.
if ($#ARGV < 1) {
  die "Not enough arguments provided.\n" .
      "Two arguments are necessary: the flavour and the output file path.";
}

$flavour = shift;
$output  = shift;

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$avx512vaes = 1;
for (@ARGV) { $avx512vaes = 0 if (/-DMY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX/); }

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

#======================================================================

if ($avx512vaes) {

  my $GP_STORAGE  = $win64 ? (16 * 33) : (16 * 23);  # save area for rbx (plus rdi/rsi on Windows)
  my $XMM_STORAGE = $win64 ? (16 * 23) : 0;          # save area for xmm6:xmm15 (Windows only)
  my $VARIABLE_OFFSET = $win64 ? (16 * 8 + 16 * 15 + 16 * 10 + 8 * 3) :
                                 (16 * 8 + 16 * 15 + 8 * 1);

  my $TW = "%rsp";
  my $TWTEMPH = "%rbx";
  my $TWTEMPL = "%rax";
  my $ZPOLY = "%zmm25";

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; Function arguments abstraction
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  my ($key2, $key1, $tweak, $length, $input, $output);

  if ($win64) {
    $input    = "%rcx";
    $output   = "%rdx";
    $length   = "%r8";
    $key1     = "%r9";
    $key2     = "%r10";
    $tweak    = "%r11";
  } else {
    $input    = "%rdi";
    $output   = "%rsi";
    $length   = "%rdx";
    $key1     = "%rcx";
    $key2     = "%r8";
    $tweak    = "%r9";
  }

  # registers used for temporary values
  my ($tmp1, $gf_poly_8b, $gf_poly_8b_temp);
  if ($win64) {
    $tmp1                = "%r10";
    $gf_poly_8b       = "%rdi";
    $gf_poly_8b_temp  = "%rsi";
  } else {
    $tmp1                = "%r8";
    $gf_poly_8b       = "%r10";
    $gf_poly_8b_temp  = "%r11";
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;;; Helper functions
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  # Generates "random" local labels
  sub random_string() {
    my @chars  = ('a' .. 'z', 'A' .. 'Z', '0' .. '9', '_');
    my $length = 15;
    my $str;
    map { $str .= $chars[rand(@chars)] } 1 .. $length;
    return $str;
  }

  # ; Seed the RNG so the labels are generated deterministically
  srand(12345);

  sub encrypt_tweak_for_encryption {
    my $key2               = $_[0];
    my $state_tweak        = $_[1];
    my $key1               = $_[2];
    my $raw_key            = $_[3];
    my $tmp                = $_[4];
    my $ptr_key2           = $_[5];
    my $ptr_key1           = $_[6];
    my $ptr_expanded_keys  = $_[7];

    $code.=<<___;
    vmovdqu  ($ptr_key2), $key2
    vpxor    $key2, $state_tweak, $state_tweak  # AddRoundKey(ARK) for tweak encryption

    vmovdqu  ($ptr_key1), $key1
    vmovdqa  $key1, 0x80($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x10($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 1 for tweak encryption

    vmovdqu  0x10($ptr_key1), $key1
    vmovdqa  $key1, 0x90($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x20($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 2 for tweak encryption

    vmovdqu  0x20($ptr_key1), $key1
    vmovdqa  $key1, 0xa0($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x30($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 3 for tweak encryption

    vmovdqu  0x30($ptr_key1), $key1
    vmovdqa  $key1, 0xb0($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x40($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 4 for tweak encryption

    vmovdqu  0x40($ptr_key1), $key1
    vmovdqa  $key1, 0xc0($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x50($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 5 for tweak encryption

    vmovdqu  0x50($ptr_key1), $key1
    vmovdqa  $key1, 0xd0($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x60($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 6 for tweak encryption

    vmovdqu  0x60($ptr_key1), $key1
    vmovdqa  $key1, 0xe0($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x70($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 7 for tweak encryption

    vmovdqu  0x70($ptr_key1), $key1
    vmovdqa  $key1, 0xf0($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x80($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 8 for tweak encryption

    vmovdqu  0x80($ptr_key1), $key1
    vmovdqa  $key1, 0x100($ptr_expanded_keys)   # store round keys in stack

    vmovdqu  0x90($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 9 for tweak encryption

    vmovdqu  0x90($ptr_key1), $key1
    vmovdqa  $key1, 0x110($ptr_expanded_keys)   # store round keys in stack

    vmovdqu  0xa0($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 10 for tweak encryption

    vmovdqu  0xa0($ptr_key1), $key1
    vmovdqa  $key1, 0x120($ptr_expanded_keys)   # store round keys in stack

    vmovdqu  0xb0($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 11 for tweak encryption

    vmovdqu  0xb0($ptr_key1), $key1
    vmovdqa  $key1, 0x130($ptr_expanded_keys)   # store round keys in stack

    vmovdqu  0xc0($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 12 for tweak encryption

    vmovdqu  0xc0($ptr_key1), $key1
    vmovdqa  $key1, 0x140($ptr_expanded_keys)   # store round keys in stack

    vmovdqu  0xd0($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 13 for tweak encryption

    vmovdqu  0xd0($ptr_key1), $key1
    vmovdqa  $key1, 0x150($ptr_expanded_keys)   # store round keys in stack

    vmovdqu  0xe0($ptr_key2), $key2
    vaesenclast  $key2, $state_tweak, $state_tweak # round 14 for tweak encryption

    vmovdqu  0xe0($ptr_key1), $key1
    vmovdqa  $key1, 0x160($ptr_expanded_keys)    # store round keys in stack

    vmovdqa  $state_tweak, ($ptr_expanded_keys)  # Store the encrypted Tweak value
___
  }
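
  # All of the magic offsets above follow one rule: round key N of key1 is
  # cached on the stack at 0x80 + 16*N, so round 0 sits at 0x80 and round 14
  # at 0x160. A small helper stating that rule (for readability only; the
  # generator keeps the literal offsets):
  sub round_key_slot {
    my ($round) = @_;           # 0 .. 14 for AES-256
    return 0x80 + 16 * $round;  # byte offset from $ptr_expanded_keys
  }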

  sub initialize {
    my @st;
    $st[0] = $_[0];
    $st[1] = $_[1];
    $st[2] = $_[2];
    $st[3] = $_[3];
    $st[4] = $_[4];
    $st[5] = $_[5];
    $st[6] = $_[6];
    $st[7] = $_[7];

    my @tw;
    $tw[0] = $_[8];
    $tw[1] = $_[9];
    $tw[2] = $_[10];
    $tw[3] = $_[11];
    $tw[4] = $_[12];
    $tw[5] = $_[13];
    $tw[6] = $_[14];
    my $num_initial_blocks = $_[15];

    $code .= <<___;
    vmovdqa  0x0($TW), $tw[0]
    mov      0x0($TW), $TWTEMPL
    mov      0x08($TW), $TWTEMPH
    vmovdqu  0x0($input), $st[0]
___

    if ($num_initial_blocks >= 2) {
      for (my $i = 1; $i < $num_initial_blocks; $i++) {
        $code .= "xor      $gf_poly_8b_temp, $gf_poly_8b_temp\n";
        $code .= "shl      \$1, $TWTEMPL\n";
        $code .= "adc      $TWTEMPH, $TWTEMPH\n";
        $code .= "cmovc    $gf_poly_8b, $gf_poly_8b_temp\n";
        $code .= "xor      $gf_poly_8b_temp, $TWTEMPL\n";

        my $offset = $i * 16;
        $code .= "mov      $TWTEMPL, $offset($TW)\n";
        $code .= "mov      $TWTEMPH, `$offset + 8`($TW)\n";
        $code .= "vmovdqa  $offset($TW), $tw[$i]\n";
        $code .= "vmovdqu  $offset($input), $st[$i]\n";
      }
    }
  }
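
  # The shl/adc/cmovc/xor run emitted above multiplies the 128-bit tweak by
  # x in GF(2^128), reducing by the XTS polynomial 0x87. A reference model
  # in plain Perl (illustration only; assumes a 64-bit perl):
  sub xts_tweak_times_x {
    my ($lo, $hi) = @_;                  # tweak as two little-endian 64-bit halves
    my $carry = ($hi >> 63) & 1;         # bit carried out of the top
    $hi = (($hi << 1) | ($lo >> 63)) & 0xffffffffffffffff;
    $lo = ($lo << 1) & 0xffffffffffffffff;
    $lo ^= 0x87 if $carry;               # reduce by x^128 + x^7 + x^2 + x + 1
    return ($lo, $hi);
  }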

  # encrypt initial blocks of AES
  # 1, 2, 3, 4, 5, 6 or 7 blocks are encrypted
  # next 8 Tweak values are generated
  sub encrypt_initial {
    my @st;
    $st[0] = $_[0];
    $st[1] = $_[1];
    $st[2] = $_[2];
    $st[3] = $_[3];
    $st[4] = $_[4];
    $st[5] = $_[5];
    $st[6] = $_[6];
    $st[7] = $_[7];

    my @tw;
    $tw[0] = $_[8];
    $tw[1] = $_[9];
    $tw[2] = $_[10];
    $tw[3] = $_[11];
    $tw[4] = $_[12];
    $tw[5] = $_[13];
    $tw[6] = $_[14];
    my $t0 = $_[15];
    my $num_blocks = $_[16];
    my $lt128 = $_[17];

    # num_blocks blocks encrypted
    # num_blocks can be 1, 2, 3, 4, 5, 6, 7

    # xor Tweak value
    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n";
    }
    $code .= "vmovdqa  0x80($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vpxor $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
___
    }
    # round 1
    $code .= "vmovdqa 0x90($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x0($TW)     # next Tweak1 generated
      mov     $TWTEMPH, 0x08($TW)
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
___
    }

    # round 2
    $code .= "vmovdqa 0xa0($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x10($TW) # next Tweak2 generated
___
    }

    # round 3
    $code .= "vmovdqa 0xb0($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      mov     $TWTEMPH, 0x18($TW)
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
___
    }

    # round 4
    $code .= "vmovdqa 0xc0($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x20($TW) # next Tweak3 generated
      mov     $TWTEMPH, 0x28($TW)
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
___
    }

    # round 5
    $code .= "vmovdqa 0xd0($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x30($TW) # next Tweak4 generated
      mov     $TWTEMPH, 0x38($TW)
___
    }

    # round 6
    $code .= "vmovdqa 0xe0($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x40($TW) # next Tweak5 generated
      mov     $TWTEMPH, 0x48($TW)
___
    }

    # round 7
    $code .= "vmovdqa 0xf0($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x50($TW) # next Tweak6 generated
      mov     $TWTEMPH, 0x58($TW)
___
    }

    # round 8
    $code .= "vmovdqa 0x100($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x60($TW) # next Tweak7 generated
      mov     $TWTEMPH, 0x68($TW)
___
    }

    # round 9
    $code .= "vmovdqa 0x110($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x70($TW) # next Tweak8 generated
      mov     $TWTEMPH, 0x78($TW)
___
    }

    # round 10
    $code .= "vmovdqa 0x120($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 11
    $code .= "vmovdqa 0x130($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 12
    $code .= "vmovdqa 0x140($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 13
    $code .= "vmovdqa 0x150($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 14
    $code .= "vmovdqa 0x160($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesenclast $t0, $st[$i], $st[$i]\n";
    }

    # xor Tweak values
    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      # load next Tweak values
      $code .= <<___;
      vmovdqa  0x0($TW), $tw[0]
      vmovdqa  0x10($TW), $tw[1]
      vmovdqa  0x20($TW), $tw[2]
      vmovdqa  0x30($TW), $tw[3]
      vmovdqa  0x40($TW), $tw[4]
      vmovdqa  0x50($TW), $tw[5]
      vmovdqa  0x60($TW), $tw[6]
___
    }
  }

  sub encrypt_tweak_for_decryption {
    my $key2               = $_[0];
    my $state_tweak        = $_[1];
    my $key1               = $_[2];
    my $raw_key            = $_[3];
    my $tmp                = $_[4];
    my $ptr_key2           = $_[5];
    my $ptr_key1           = $_[6];
    my $ptr_expanded_keys  = $_[7];

    $code.=<<___;
    vmovdqu  ($ptr_key2), $key2
    vpxor    $key2, $state_tweak, $state_tweak  # ARK for tweak encryption

    vmovdqu  0xe0($ptr_key1), $key1
    vmovdqa  $key1, 0x160($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x10($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 1 for tweak encryption

    vmovdqu  0xd0($ptr_key1), $key1
    vmovdqa  $key1, 0x150($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x20($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 2 for tweak encryption

    vmovdqu  0xc0($ptr_key1), $key1
    vmovdqa  $key1, 0x140($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x30($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 3 for tweak encryption

    vmovdqu  0xb0($ptr_key1), $key1
    vmovdqa  $key1, 0x130($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x40($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 4 for tweak encryption

    vmovdqu  0xa0($ptr_key1), $key1
    vmovdqa  $key1, 0x120($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x50($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 5 for tweak encryption

    vmovdqu  0x90($ptr_key1), $key1
    vmovdqa  $key1, 0x110($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x60($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 6 for tweak encryption

    vmovdqu  0x80($ptr_key1), $key1
    vmovdqa  $key1, 0x100($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x70($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 7 for tweak encryption

    vmovdqu  0x70($ptr_key1), $key1
    vmovdqa  $key1, 0xf0($ptr_expanded_keys)    # store round keys in stack

    vmovdqu  0x80($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 8 for tweak encryption

    vmovdqu  0x60($ptr_key1), $key1
    vmovdqa  $key1, 0xe0($ptr_expanded_keys)   # store round keys in stack

    vmovdqu  0x90($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 9 for tweak encryption

    vmovdqu  0x50($ptr_key1), $key1
    vmovdqa  $key1, 0xd0($ptr_expanded_keys)   # store round keys in stack

    vmovdqu  0xa0($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 10 for tweak encryption

    vmovdqu  0x40($ptr_key1), $key1
    vmovdqa  $key1, 0xc0($ptr_expanded_keys)   # store round keys in stack

    vmovdqu  0xb0($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 11 for tweak encryption

    vmovdqu  0x30($ptr_key1), $key1
    vmovdqa  $key1, 0xb0($ptr_expanded_keys)   # store round keys in stack

    vmovdqu  0xc0($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 12 for tweak encryption

    vmovdqu  0x20($ptr_key1), $key1
    vmovdqa  $key1, 0xa0($ptr_expanded_keys)   # store round keys in stack

    vmovdqu  0xd0($ptr_key2), $key2
    vaesenc  $key2, $state_tweak, $state_tweak  # round 13 for tweak encryption

    vmovdqu  0x10($ptr_key1), $key1
    vmovdqa  $key1, 0x90($ptr_expanded_keys)   # store round keys in stack

    vmovdqu  0xe0($ptr_key2), $key2
    vaesenclast  $key2, $state_tweak, $state_tweak # round 14 for tweak encryption

    vmovdqu  ($ptr_key1), $key1
    vmovdqa  $key1, 0x80($ptr_expanded_keys)    # store round keys in stack

    vmovdqa  $state_tweak, ($ptr_expanded_keys)  # Store the encrypted Tweak value
___
  }

  # decrypt initial blocks of AES
  # 1, 2, 3, 4, 5, 6 or 7 blocks are decrypted
  # next 8 Tweak values are generated
  sub decrypt_initial {
    my @st;
    $st[0] = $_[0];
    $st[1] = $_[1];
    $st[2] = $_[2];
    $st[3] = $_[3];
    $st[4] = $_[4];
    $st[5] = $_[5];
    $st[6] = $_[6];
    $st[7] = $_[7];

    my @tw;
    $tw[0] = $_[8];
    $tw[1] = $_[9];
    $tw[2] = $_[10];
    $tw[3] = $_[11];
    $tw[4] = $_[12];
    $tw[5] = $_[13];
    $tw[6] = $_[14];
    my $t0 = $_[15];
    my $num_blocks = $_[16];
    my $lt128 = $_[17];

    # num_blocks blocks decrypted
    # num_blocks can be 1, 2, 3, 4, 5, 6, 7

    #  xor Tweak value
    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n";
    }

    $code .= "vmovdqa  0x80($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vpxor $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
___
    }
    # round 1
    $code .= "vmovdqa 0x90($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, ($TW)     # next Tweak1 generated
      mov     $TWTEMPH, 0x08($TW)
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
___
    }

    # round 2
    $code .= "vmovdqa 0xa0($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x10($TW) # next Tweak2 generated
___
    }

    # round 3
    $code .= "vmovdqa 0xb0($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      mov     $TWTEMPH, 0x18($TW)
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
___
    }

    # round 4
    $code .= "vmovdqa 0xc0($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x20($TW) # next Tweak3 generated
      mov     $TWTEMPH, 0x28($TW)
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
___
    }

    # round 5
    $code .= "vmovdqa 0xd0($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x30($TW) # next Tweak4 generated
      mov     $TWTEMPH, 0x38($TW)
___
    }

    # round 6
    $code .= "vmovdqa 0xe0($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x40($TW) # next Tweak5 generated
      mov     $TWTEMPH, 0x48($TW)
___
    }

    # round 7
    $code .= "vmovdqa 0xf0($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x50($TW) # next Tweak6 generated
      mov     $TWTEMPH, 0x58($TW)
___
    }

    # round 8
    $code .= "vmovdqa 0x100($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x60($TW) # next Tweak7 generated
      mov     $TWTEMPH, 0x68($TW)
___
    }

    # round 9
    $code .= "vmovdqa 0x110($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      $code .= <<___;
      xor     $gf_poly_8b_temp, $gf_poly_8b_temp
      shl     \$1, $TWTEMPL
      adc     $TWTEMPH, $TWTEMPH
      cmovc   $gf_poly_8b, $gf_poly_8b_temp
      xor     $gf_poly_8b_temp, $TWTEMPL
      mov     $TWTEMPL, 0x70($TW) # next Tweak8 generated
      mov     $TWTEMPH, 0x78($TW)
___
    }

    # round 10
    $code .= "vmovdqa 0x120($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 11
    $code .= "vmovdqa 0x130($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 12
    $code .= "vmovdqa 0x140($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 13
    $code .= "vmovdqa 0x150($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 14
    $code .= "vmovdqa 0x160($TW), $t0\n";

    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n";
    }

    # xor Tweak values
    for (my $i = 0; $i < $num_blocks; $i++) {
      $code .= "vpxor $tw[$i], $st[$i], $st[$i]\n";
    }

    if (0 == $lt128) {
      # load next Tweak values
      $code .= <<___;
      vmovdqa  0x0($TW), $tw[0]
      vmovdqa  0x10($TW), $tw[1]
      vmovdqa  0x20($TW), $tw[2]
      vmovdqa  0x30($TW), $tw[3]
      vmovdqa  0x40($TW), $tw[4]
      vmovdqa  0x50($TW), $tw[5]
      vmovdqa  0x60($TW), $tw[6]
___
    }
  }

  # Encrypt 8 blocks in parallel
  # generate next 8 tweak values
  sub encrypt_by_eight_zmm {
    my $st1 = $_[0];
    my $st2 = $_[1];
    my $tw1 = $_[2];
    my $tw2 = $_[3];
    my $t0 = $_[4];
    my $last_eight = $_[5];

    $code .= <<___;
    # xor Tweak values
    vpxorq    $tw1, $st1, $st1
    vpxorq    $tw2, $st2, $st2

    # ARK
    vbroadcasti32x4 0x80($TW), $t0
    vpxorq    $t0, $st1, $st1
    vpxorq    $t0, $st2, $st2
___

    if (0 == $last_eight) {
      $code .= <<___;
      vpsrldq		\$0xf, $tw1, %zmm13
      vpclmulqdq	\$0x0, $ZPOLY, %zmm13, %zmm14
      vpslldq		\$0x1, $tw1, %zmm15
      vpxord		%zmm14, %zmm15, %zmm15
___
    }
    # round 1
    $code .= <<___;
    vbroadcasti32x4 0x90($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2

    # round 2
    vbroadcasti32x4 0xa0($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2

    # round 3
    vbroadcasti32x4 0xb0($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2
___

    if (0 == $last_eight) {
      $code .= <<___;
      vpsrldq		\$0xf, $tw2, %zmm13
      vpclmulqdq	\$0x0, $ZPOLY, %zmm13, %zmm14
      vpslldq		\$0x1, $tw2, %zmm16
      vpxord		%zmm14, %zmm16, %zmm16
___
    }

    $code .= <<___;
    # round 4
    vbroadcasti32x4 0xc0($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2

    # round 5
    vbroadcasti32x4 0xd0($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2

    # round 6
    vbroadcasti32x4 0xe0($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2

    # round 7
    vbroadcasti32x4 0xf0($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2

    # round 8
    vbroadcasti32x4 0x100($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2

    # round 9
    vbroadcasti32x4 0x110($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2

    # round 10
    vbroadcasti32x4 0x120($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2

    # round 11
    vbroadcasti32x4 0x130($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2

    # round 12
    vbroadcasti32x4 0x140($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2

    # round 13
    vbroadcasti32x4 0x150($TW), $t0
    vaesenc  $t0, $st1, $st1
    vaesenc  $t0, $st2, $st2

    # round 14
    vbroadcasti32x4 0x160($TW), $t0
    vaesenclast  $t0, $st1, $st1
    vaesenclast  $t0, $st2, $st2

    # xor Tweak values
    vpxorq    $tw1, $st1, $st1
    vpxorq    $tw2, $st2, $st2

    # load next Tweak values
    vmovdqa32  %zmm15, $tw1
    vmovdqa32  %zmm16, $tw2
___
  }
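
  # The vpsrldq/vpclmulqdq/vpslldq/vpxord quartets interleaved above advance
  # all four tweaks held in a zmm register by eight doublings at once
  # (multiply by x^8 in GF(2^128)): the top byte of each 128-bit lane is
  # carried out, multiplied by the 0x87 polynomial, and folded back into the
  # byte-shifted lane. A hypothetical emitter for one such step (sketch only;
  # the subs here keep the sequence inlined to control instruction scheduling):
  sub emit_tweaks_times_x8 {
    my ($src, $dst, $tmp, $prod) = @_;    # zmm register names
    $code .= <<___;
    vpsrldq	\$0xf, $src, $tmp
    vpclmulqdq	\$0x0, $ZPOLY, $tmp, $prod
    vpslldq	\$0x1, $src, $dst
    vpxord	$prod, $dst, $dst
___
  }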

  # Decrypt 8 blocks in parallel
  # generate next 8 tweak values
  sub decrypt_by_eight_zmm {
    my $st1 = $_[0];
    my $st2 = $_[1];
    my $tw1 = $_[2];
    my $tw2 = $_[3];
    my $t0 = $_[4];
    my $last_eight = $_[5];

    $code .= <<___;
    # xor Tweak values
    vpxorq    $tw1, $st1, $st1
    vpxorq    $tw2, $st2, $st2

    # ARK
    vbroadcasti32x4 0x80($TW), $t0
    vpxorq    $t0, $st1, $st1
    vpxorq    $t0, $st2, $st2
___

    if (0 == $last_eight) {
      $code .= <<___;
      vpsrldq		\$0xf, $tw1, %zmm13
      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
      vpslldq		\$0x1, $tw1, %zmm15
      vpxord		%zmm14, %zmm15, %zmm15
___
    }
    # round 1
    $code .= <<___;
    vbroadcasti32x4 0x90($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2

    # round 2
    vbroadcasti32x4 0xa0($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2

    # round 3
    vbroadcasti32x4 0xb0($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2
___

    if (0 == $last_eight) {
      $code .= <<___;
      vpsrldq		\$0xf, $tw2, %zmm13
      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
      vpslldq		\$0x1, $tw2, %zmm16
      vpxord		%zmm14, %zmm16, %zmm16
___
    }

    $code .= <<___;
    # round 4
    vbroadcasti32x4 0xc0($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2

    # round 5
    vbroadcasti32x4 0xd0($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2

    # round 6
    vbroadcasti32x4 0xe0($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2

    # round 7
    vbroadcasti32x4 0xf0($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2

    # round 8
    vbroadcasti32x4 0x100($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2

    # round 9
    vbroadcasti32x4 0x110($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2

    # round 10
    vbroadcasti32x4 0x120($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2

    # round 11
    vbroadcasti32x4 0x130($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2

    # round 12
    vbroadcasti32x4 0x140($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2

    # round 13
    vbroadcasti32x4 0x150($TW), $t0
    vaesdec  $t0, $st1, $st1
    vaesdec  $t0, $st2, $st2

    # round 14
    vbroadcasti32x4 0x160($TW), $t0
    vaesdeclast  $t0, $st1, $st1
    vaesdeclast  $t0, $st2, $st2

    # xor Tweak values
    vpxorq    $tw1, $st1, $st1
    vpxorq    $tw2, $st2, $st2

    # load next Tweak values
    vmovdqa32  %zmm15, $tw1
    vmovdqa32  %zmm16, $tw2
___
  }

  # Encrypt 16 blocks in parallel
  # generate next 16 tweak values
  sub encrypt_by_16_zmm {
    my @st;
    $st[0] = $_[0];
    $st[1] = $_[1];
    $st[2] = $_[2];
    $st[3] = $_[3];

    my @tw;
    $tw[0] = $_[4];
    $tw[1] = $_[5];
    $tw[2] = $_[6];
    $tw[3] = $_[7];

    my $t0 = $_[8];
    my $last_eight = $_[9];

    # xor Tweak values
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vpxorq    $tw[$i], $st[$i], $st[$i]\n";
    }

    # ARK
    $code .= "vbroadcasti32x4 0x80($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vpxorq $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $last_eight) {
      $code .= <<___;
      vpsrldq		\$0xf, $tw[2], %zmm13
      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
      vpslldq		\$0x1, $tw[2], %zmm15
      vpxord		%zmm14, %zmm15, %zmm15
___
    }

    # round 1
    $code .= "vbroadcasti32x4 0x90($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 2
    $code .= "vbroadcasti32x4 0xa0($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 3
    $code .= "vbroadcasti32x4 0xb0($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $last_eight) {
      $code .= <<___;
      vpsrldq		\$0xf, $tw[3], %zmm13
      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
      vpslldq		\$0x1, $tw[3], %zmm16
      vpxord		%zmm14, %zmm16, %zmm16
___
    }
    # round 4
    $code .= "vbroadcasti32x4 0xc0($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 5
    $code .= "vbroadcasti32x4 0xd0($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 6
    $code .= "vbroadcasti32x4 0xe0($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $last_eight) {
      $code .= <<___;
      vpsrldq		\$0xf, %zmm15, %zmm13
      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
      vpslldq		\$0x1, %zmm15, %zmm17
      vpxord		%zmm14, %zmm17, %zmm17
___
    }
    # round 7
    $code .= "vbroadcasti32x4 0xf0($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 8
    $code .= "vbroadcasti32x4 0x100($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 9
    $code .= "vbroadcasti32x4 0x110($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $last_eight) {
      $code .= <<___;
      vpsrldq		\$0xf, %zmm16, %zmm13
      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
      vpslldq		\$0x1, %zmm16, %zmm18
      vpxord		%zmm14, %zmm18, %zmm18
___
    }
    # round 10
    $code .= "vbroadcasti32x4 0x120($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 11
    $code .= "vbroadcasti32x4 0x130($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 12
    $code .= "vbroadcasti32x4 0x140($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 13
    $code .= "vbroadcasti32x4 0x150($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenc $t0, $st[$i], $st[$i]\n";
    }

    # round 14
    $code .= "vbroadcasti32x4 0x160($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesenclast $t0, $st[$i], $st[$i]\n";
    }


    # xor Tweak values
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vpxorq    $tw[$i], $st[$i], $st[$i]\n";
    }

    $code .= <<___;
    # load next Tweak values
    vmovdqa32  %zmm15, $tw[0]
    vmovdqa32  %zmm16, $tw[1]
    vmovdqa32  %zmm17, $tw[2]
    vmovdqa32  %zmm18, $tw[3]
___
  }

  # Decrypt 16 blocks in parallel
  # generate next 16 tweak values
  sub decrypt_by_16_zmm {
    my @st;
    $st[0] = $_[0];
    $st[1] = $_[1];
    $st[2] = $_[2];
    $st[3] = $_[3];

    my @tw;
    $tw[0] = $_[4];
    $tw[1] = $_[5];
    $tw[2] = $_[6];
    $tw[3] = $_[7];

    my $t0 = $_[8];
    my $last_eight = $_[9];

    # xor Tweak values
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vpxorq    $tw[$i], $st[$i], $st[$i]\n";
    }

    # ARK
    $code .= "vbroadcasti32x4 0x80($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vpxorq $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $last_eight) {
      $code .= <<___;
      vpsrldq		\$0xf, $tw[2], %zmm13
      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
      vpslldq		\$0x1, $tw[2], %zmm15
      vpxord		%zmm14, %zmm15, %zmm15
___
    }

    # round 1
    $code .= "vbroadcasti32x4 0x90($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 2
    $code .= "vbroadcasti32x4 0xa0($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 3
    $code .= "vbroadcasti32x4 0xb0($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $last_eight) {
      $code .= <<___;
      vpsrldq		\$0xf, $tw[3], %zmm13
      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
      vpslldq		\$0x1, $tw[3], %zmm16
      vpxord		%zmm14, %zmm16, %zmm16
___
    }
    # round 4
    $code .= "vbroadcasti32x4 0xc0($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 5
    $code .= "vbroadcasti32x4 0xd0($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 6
    $code .= "vbroadcasti32x4 0xe0($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $last_eight) {
      $code .= <<___;
      vpsrldq		\$0xf, %zmm15, %zmm13
      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
      vpslldq		\$0x1, %zmm15, %zmm17
      vpxord		%zmm14, %zmm17, %zmm17
___
    }
    # round 7
    $code .= "vbroadcasti32x4 0xf0($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 8
    $code .= "vbroadcasti32x4 0x100($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 9
    $code .= "vbroadcasti32x4 0x110($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    if (0 == $last_eight) {
      $code .= <<___;
      vpsrldq		\$0xf, %zmm16, %zmm13
      vpclmulqdq	\$0x0,$ZPOLY, %zmm13, %zmm14
      vpslldq		\$0x1, %zmm16, %zmm18
      vpxord		%zmm14, %zmm18, %zmm18
___
    }
    # round 10
    $code .= "vbroadcasti32x4 0x120($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 11
    $code .= "vbroadcasti32x4 0x130($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 12
    $code .= "vbroadcasti32x4 0x140($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 13
    $code .= "vbroadcasti32x4 0x150($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdec $t0, $st[$i], $st[$i]\n";
    }

    # round 14
    $code .= "vbroadcasti32x4 0x160($TW), $t0\n";
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vaesdeclast $t0, $st[$i], $st[$i]\n";
    }

    # xor Tweak values
    for (my $i = 0; $i < 4; $i++) {
      $code .= "vpxorq    $tw[$i], $st[$i], $st[$i]\n";
    }

    $code .= <<___;
    # load next Tweak values
    vmovdqa32  %zmm15, $tw[0]
    vmovdqa32  %zmm16, $tw[1]
    vmovdqa32  %zmm17, $tw[2]
    vmovdqa32  %zmm18, $tw[3]
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;void aes_hw_xts_encrypt_avx512(
  # ;               const uint8_t *in,        // input data
  # ;               uint8_t *out,             // output data
  # ;               size_t length,            // sector size, in bytes
  # ;               const AES_KEY *key1,      // key used for "ECB" encryption, 16*2 bytes
  # ;               const AES_KEY *key2,      // key used for tweaking, 16*2 bytes
  # ;               const uint8_t iv[16])      // initial tweak value, 16 bytes
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

  my $rndsuffix = &random_string();

  $code .= <<___;
#ifndef MY_ASSEMBLER_IS_TOO_OLD_FOR_512AVX
.text
___

  {
  $code.=<<___;
  .globl	aes_hw_xts_encrypt_avx512
  .hidden	aes_hw_xts_encrypt_avx512
  .type	aes_hw_xts_encrypt_avx512,\@abi-omnipotent
  .align	32
  aes_hw_xts_encrypt_avx512:
  .cfi_startproc
          endbranch
___
  }
  $code .= "push 	 %rbp\n";
  $code .= "mov 	 %rsp,%rbp\n";
  $code .= "sub 	 \$$VARIABLE_OFFSET,%rsp\n";
  $code .= "and 	 \$0xffffffffffffffc0,%rsp\n";
  $code .= "mov 	 %rbx,$GP_STORAGE($TW)\n";

  if ($win64) {
    $code .= "mov 	 %rdi,$GP_STORAGE + 8*1($TW)\n";
    $code .= "mov 	 %rsi,$GP_STORAGE + 8*2($TW)\n";
    $code .= "vmovdqa      %xmm6, $XMM_STORAGE + 16*0($TW)\n";
    $code .= "vmovdqa      %xmm7, $XMM_STORAGE + 16*1($TW)\n";
    $code .= "vmovdqa      %xmm8, $XMM_STORAGE + 16*2($TW)\n";
    $code .= "vmovdqa      %xmm9, $XMM_STORAGE + 16*3($TW)\n";
    $code .= "vmovdqa      %xmm10, $XMM_STORAGE + 16*4($TW)\n";
    $code .= "vmovdqa      %xmm11, $XMM_STORAGE + 16*5($TW)\n";
    $code .= "vmovdqa      %xmm12, $XMM_STORAGE + 16*6($TW)\n";
    $code .= "vmovdqa      %xmm13, $XMM_STORAGE + 16*7($TW)\n";
    $code .= "vmovdqa      %xmm14, $XMM_STORAGE + 16*8($TW)\n";
    $code .= "vmovdqa      %xmm15, $XMM_STORAGE + 16*9($TW)\n";
  }

  $code .= "mov 	 \$0x87, $gf_poly_8b\n";
  $code .= "vmovdqu 	 ($tweak),%xmm1\n";      # read initial tweak values
  $code .= "vpxor 	 %xmm4,%xmm4,%xmm4\n";   # for key expansion

  encrypt_tweak_for_encryption("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4",
                               $key2, $key1, $TW);

  if ($win64) {
    $code .= "mov	 $input, 8 + 8*5(%rbp)\n";  # ciphertext pointer
    $code .= "mov        $output, 8 + 8*6(%rbp)\n"; # plaintext pointer
  }

  {
  $code.=<<___;

  cmp 	 \$0x80,$length
  jl 	 .L_less_than_128_bytes_${rndsuffix}
  vpbroadcastq 	 $gf_poly_8b,$ZPOLY
  cmp 	 \$0x100,$length
  jge 	 .L_start_by16_${rndsuffix}
  cmp 	 \$0x80,$length
  jge 	 .L_start_by8_${rndsuffix}

  .L_do_n_blocks_${rndsuffix}:
  cmp 	 \$0x0,$length
  je 	 .L_ret_${rndsuffix}
  cmp 	 \$0x70,$length
  jge 	 .L_remaining_num_blocks_is_7_${rndsuffix}
  cmp 	 \$0x60,$length
  jge 	 .L_remaining_num_blocks_is_6_${rndsuffix}
  cmp 	 \$0x50,$length
  jge 	 .L_remaining_num_blocks_is_5_${rndsuffix}
  cmp 	 \$0x40,$length
  jge 	 .L_remaining_num_blocks_is_4_${rndsuffix}
  cmp 	 \$0x30,$length
  jge 	 .L_remaining_num_blocks_is_3_${rndsuffix}
  cmp 	 \$0x20,$length
  jge 	 .L_remaining_num_blocks_is_2_${rndsuffix}
  cmp 	 \$0x10,$length
  jge 	 .L_remaining_num_blocks_is_1_${rndsuffix}
  vmovdqa 	 %xmm0,%xmm8
  vmovdqa 	 %xmm9,%xmm0
  jmp 	 .L_steal_cipher_${rndsuffix}

  .L_remaining_num_blocks_is_7_${rndsuffix}:
  mov 	 \$0xffffffffffffffff,$tmp1
  shr 	 \$0x10,$tmp1
  kmovq 	 $tmp1,%k1
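  # %k1 = 48-bit byte mask: only 3 of the 4 blocks in the second zmm are live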
  vmovdqu8 	 ($input),%zmm1
  vmovdqu8 	 0x40($input),%zmm2{%k1}
  add 	 \$0x70,$input
___
  }

  encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);

  {
  $code .= <<___;
  vmovdqu8 	 %zmm1,($output)
  vmovdqu8 	 %zmm2,0x40($output){%k1}
  add 	 \$0x70,$output
  vextracti32x4 	 \$0x2,%zmm2,%xmm8
  vextracti32x4 	 \$0x3,%zmm10,%xmm0
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_${rndsuffix}

  .L_remaining_num_blocks_is_6_${rndsuffix}:
  vmovdqu8 	 ($input),%zmm1
  vmovdqu8 	 0x40($input),%ymm2
  add 	 \$0x60,$input
___
  }

  encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);

  {
  $code .= <<___;
  vmovdqu8 	 %zmm1,($output)
  vmovdqu8 	 %ymm2,0x40($output)
  add 	 \$0x60,$output
  vextracti32x4 	 \$0x1,%zmm2,%xmm8
  vextracti32x4 	 \$0x2,%zmm10,%xmm0
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_${rndsuffix}

  .L_remaining_num_blocks_is_5_${rndsuffix}:
  vmovdqu8 	 ($input),%zmm1
  vmovdqu 	 0x40($input),%xmm2
  add 	 \$0x50,$input
___
  }

  encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);

  {
  $code .= <<___;
  vmovdqu8 	 %zmm1,($output)
  vmovdqu 	 %xmm2,0x40($output)
  add 	 \$0x50,$output
  vmovdqa 	 %xmm2,%xmm8
  vextracti32x4 	 \$0x1,%zmm10,%xmm0
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_${rndsuffix}

  .L_remaining_num_blocks_is_4_${rndsuffix}:
  vmovdqu8 	 ($input),%zmm1
  add 	 \$0x40,$input
___
  }

  encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);

  {
  $code .= <<___;
  vmovdqu8 	 %zmm1,($output)
  add 	 \$0x40,$output
  vextracti32x4 	 \$0x3,%zmm1,%xmm8
  vextracti32x4 	 \$0x0,%zmm10,%xmm0
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_${rndsuffix}
___
  }

  {
  $code .= <<___;
  .L_remaining_num_blocks_is_3_${rndsuffix}:
  vextracti32x4 	 \$0x1,%zmm9,%xmm10
  vextracti32x4 	 \$0x2,%zmm9,%xmm11
  vmovdqu 	 ($input),%xmm1
  vmovdqu 	 0x10($input),%xmm2
  vmovdqu 	 0x20($input),%xmm3
  add 	 \$0x30,$input
___
  }

  encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  add 	 \$0x30,$output
  vmovdqa 	 %xmm3,%xmm8
  vextracti32x4 	 \$0x3,%zmm9,%xmm0
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_${rndsuffix}
___
  }

  {
  $code .= <<___;
  .L_remaining_num_blocks_is_2_${rndsuffix}:
  vextracti32x4 	 \$0x1,%zmm9,%xmm10
  vmovdqu 	 ($input),%xmm1
  vmovdqu 	 0x10($input),%xmm2
  add 	 \$0x20,$input
___
  }

  encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  add 	 \$0x20,$output
  vmovdqa 	 %xmm2,%xmm8
  vextracti32x4 	 \$0x2,%zmm9,%xmm0
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_${rndsuffix}
___
  }

  {
  $code .= <<___;
  .L_remaining_num_blocks_is_1_${rndsuffix}:
  vmovdqu 	 ($input),%xmm1
  add 	 \$0x10,$input
___
  }

  encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);
  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  add 	 \$0x10,$output
  vmovdqa 	 %xmm1,%xmm8
  vextracti32x4 	 \$0x1,%zmm9,%xmm0
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_${rndsuffix}

  .L_start_by16_${rndsuffix}:
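  # Derive the first 16 tweaks from the encrypted tweak: zmm9/zmm10 receive
  # T*x^0 .. T*x^7, then a further x^8 step fills zmm11/zmm12 with
  # T*x^8 .. T*x^15.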
  vbroadcasti32x4 	 (%rsp),%zmm0
  vbroadcasti32x4 shufb_15_7(%rip),%zmm8
  mov 	 \$0xaa,$tmp1
  kmovq 	 $tmp1,%k2
  vpshufb 	 %zmm8,%zmm0,%zmm1
  vpsllvq const_dq3210(%rip),%zmm0,%zmm4
  vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
  vpclmulqdq 	 \$0x0,%zmm25,%zmm2,%zmm3
  vpxorq 	 %zmm2,%zmm4,%zmm4{%k2}
  vpxord 	 %zmm4,%zmm3,%zmm9
  vpsllvq const_dq7654(%rip),%zmm0,%zmm5
  vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
  vpclmulqdq 	 \$0x0,%zmm25,%zmm6,%zmm7
  vpxorq 	 %zmm6,%zmm5,%zmm5{%k2}
  vpxord 	 %zmm5,%zmm7,%zmm10
  vpsrldq 	 \$0xf,%zmm9,%zmm13
  vpclmulqdq 	 \$0x0,%zmm25,%zmm13,%zmm14
  vpslldq 	 \$0x1,%zmm9,%zmm11
  vpxord 	 %zmm14,%zmm11,%zmm11
  vpsrldq 	 \$0xf,%zmm10,%zmm15
  vpclmulqdq 	 \$0x0,%zmm25,%zmm15,%zmm16
  vpslldq 	 \$0x1,%zmm10,%zmm12
  vpxord 	 %zmm16,%zmm12,%zmm12

  .L_main_loop_run_16_${rndsuffix}:
  vmovdqu8 	 ($input),%zmm1
  vmovdqu8 	 0x40($input),%zmm2
  vmovdqu8 	 0x80($input),%zmm3
  vmovdqu8 	 0xc0($input),%zmm4
  add 	 \$0x100,$input
___
  }

  encrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9",
                    "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0);

  {
  $code .= <<___;
  vmovdqu8 	 %zmm1,($output)
  vmovdqu8 	 %zmm2,0x40($output)
  vmovdqu8 	 %zmm3,0x80($output)
  vmovdqu8 	 %zmm4,0xc0($output)
  add 	 \$0x100,$output
  sub 	 \$0x100,$length
  cmp 	 \$0x100,$length
  jge 	 .L_main_loop_run_16_${rndsuffix}
  cmp 	 \$0x80,$length
  jge 	 .L_main_loop_run_8_${rndsuffix}
  vextracti32x4 	 \$0x3,%zmm4,%xmm0
  jmp 	 .L_do_n_blocks_${rndsuffix}

  .L_start_by8_${rndsuffix}:
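  # Same derivation as in .L_start_by16, stopping at the first 8 tweaks
  # (zmm9/zmm10); the 8-block loop advances them as it goes.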
  vbroadcasti32x4 	 (%rsp),%zmm0
  vbroadcasti32x4 shufb_15_7(%rip),%zmm8
  mov 	 \$0xaa,$tmp1
  kmovq 	 $tmp1,%k2
  vpshufb 	 %zmm8,%zmm0,%zmm1
  vpsllvq const_dq3210(%rip),%zmm0,%zmm4
  vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
  vpclmulqdq 	 \$0x0,%zmm25,%zmm2,%zmm3
  vpxorq 	 %zmm2,%zmm4,%zmm4{%k2}
  vpxord 	 %zmm4,%zmm3,%zmm9
  vpsllvq const_dq7654(%rip),%zmm0,%zmm5
  vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
  vpclmulqdq 	 \$0x0,%zmm25,%zmm6,%zmm7
  vpxorq 	 %zmm6,%zmm5,%zmm5{%k2}
  vpxord 	 %zmm5,%zmm7,%zmm10

  .L_main_loop_run_8_${rndsuffix}:
  vmovdqu8 	 ($input),%zmm1
  vmovdqu8 	 0x40($input),%zmm2
  add 	 \$0x80,$input
___
  }

  encrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0);

  {
  $code .= <<___;
  vmovdqu8 	 %zmm1,($output)
  vmovdqu8 	 %zmm2,0x40($output)
  add 	 \$0x80,$output
  sub 	 \$0x80,$length
  cmp 	 \$0x80,$length
  jge 	 .L_main_loop_run_8_${rndsuffix}
  vextracti32x4 	 \$0x3,%zmm2,%xmm0
  jmp 	 .L_do_n_blocks_${rndsuffix}

  .L_steal_cipher_next_${rndsuffix}:
  xor 	 $gf_poly_8b_temp,$gf_poly_8b_temp
  shl 	 \$1, $TWTEMPL
  adc 	 $TWTEMPH,$TWTEMPH
  cmovc  $gf_poly_8b,$gf_poly_8b_temp
  xor 	 $gf_poly_8b_temp,$TWTEMPL
  mov 	 $TWTEMPL,($TW)
  mov 	 $TWTEMPH,0x8($TW)
  vmovdqa 	 ($TW),%xmm0

  .L_steal_cipher_${rndsuffix}:
  vmovdqa 	 %xmm8,%xmm2
  lea vpshufb_shf_table(%rip),$TWTEMPL
  vmovdqu 	 ($TWTEMPL,$length,1),%xmm10
  vpshufb 	 %xmm10,%xmm8,%xmm8
  vmovdqu 	 -0x10($input,$length,1),%xmm3
  vmovdqu 	 %xmm8,-0x10($output,$length,1)
  lea vpshufb_shf_table(%rip),$TWTEMPL
  add \$16, $TWTEMPL
  sub 	 $length,$TWTEMPL
  vmovdqu 	 ($TWTEMPL),%xmm10
  vpxor mask1(%rip),%xmm10,%xmm10
  vpshufb 	 %xmm10,%xmm3,%xmm3
  vpblendvb 	 %xmm10,%xmm2,%xmm3,%xmm3
  vpxor 	 %xmm0,%xmm3,%xmm8
  vpxor 	 0x80(%rsp),%xmm8,%xmm8
  vaesenc 	 0x90(%rsp),%xmm8,%xmm8
  vaesenc 	 0xa0(%rsp),%xmm8,%xmm8
  vaesenc 	 0xb0(%rsp),%xmm8,%xmm8
  vaesenc 	 0xc0(%rsp),%xmm8,%xmm8
  vaesenc 	 0xd0(%rsp),%xmm8,%xmm8
  vaesenc 	 0xe0(%rsp),%xmm8,%xmm8
  vaesenc 	 0xf0(%rsp),%xmm8,%xmm8
  vaesenc 	 0x100(%rsp),%xmm8,%xmm8
  vaesenc 	 0x110(%rsp),%xmm8,%xmm8
  vaesenc 	 0x120(%rsp),%xmm8,%xmm8
  vaesenc 	 0x130(%rsp),%xmm8,%xmm8
  vaesenc 	 0x140(%rsp),%xmm8,%xmm8
  vaesenc 	 0x150(%rsp),%xmm8,%xmm8
  vaesenclast 	 0x160(%rsp),%xmm8,%xmm8
  vpxor 	 %xmm0,%xmm8,%xmm8
  vmovdqu 	 %xmm8,-0x10($output)
___
  }
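
  # What .L_steal_cipher implements, as a byte-level Perl model (sketch only:
  # tweak masking and the shuffle-table mechanics are elided, and
  # $encrypt_block is an assumed stand-in for the AES-256 rounds). The last
  # full ciphertext block donates its tail to pad the final partial
  # plaintext block, and its first n bytes become the final partial output:
  sub xts_steal_model {
    my ($prev_ct, $partial_pt, $encrypt_block) = @_;
    my $n = length($partial_pt);                      # 1 .. 15 trailing bytes
    my $tail_ct = substr($prev_ct, 0, $n);            # final partial output block
    my $padded  = $partial_pt . substr($prev_ct, $n); # steal (16 - n) bytes
    return ($encrypt_block->($padded), $tail_ct);     # (new last full block, tail)
  }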

  {
  $code .= <<___;
  .L_ret_${rndsuffix}:
  mov 	 $GP_STORAGE($TW),%rbx
  xor    $tmp1,$tmp1
  mov    $tmp1,$GP_STORAGE($TW)
  # Zero-out the whole of `%zmm0`.
  vpxorq %zmm0,%zmm0,%zmm0
___
  }

  if ($win64) {
    $code .= <<___;
    mov $GP_STORAGE + 8*1($TW),%rdi
    mov $tmp1,$GP_STORAGE + 8*1($TW)
    mov $GP_STORAGE + 8*2($TW),%rsi
    mov $tmp1,$GP_STORAGE + 8*2($TW)

    vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6
    vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7
    vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8
    vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9

    # Zero the 64 bytes we just restored to the xmm registers.
    vmovdqa64 %zmm0,$XMM_STORAGE($TW)

    vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10
    vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11
    vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12
    vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13

    # And again.
    vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW)

    vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14
    vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15

    # Last round is only 32 bytes (256-bits), so we use `%ymm` as the
    # source operand.
    vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW)
___
  }

  {
  $code .= <<___;
  # Zero-out the stack frames used for `key1`, 64 bytes at a time.
  vmovdqa64    %zmm0,0x80(%rsp)
  vmovdqa64    %zmm0,0xc0(%rsp)
  vmovdqa64    %zmm0,0x100(%rsp)

  # Stack usage is not divisible by 64, so we use a kmask register to
  # only mov 48 of the bytes (6 quad-words).
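  # (The 15 cached round keys span 0x80 .. 0x170: 240 bytes = 3 * 64 + 48.)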
  mov       \$0x3f,$tmp1
  kmovq     $tmp1,%k2
  vmovdqa64 %zmm0,0x140(%rsp){%k2}

  mov %rbp,%rsp
  pop %rbp
  vzeroupper
  ret

  .L_less_than_128_bytes_${rndsuffix}:
  cmp 	 \$0x10,$length
  jb 	 .L_ret_${rndsuffix}
  mov 	 $length,$tmp1
  and 	 \$0x70,$tmp1
  cmp 	 \$0x60,$tmp1
  je 	 .L_num_blocks_is_6_${rndsuffix}
  cmp 	 \$0x50,$tmp1
  je 	 .L_num_blocks_is_5_${rndsuffix}
  cmp 	 \$0x40,$tmp1
  je 	 .L_num_blocks_is_4_${rndsuffix}
  cmp 	 \$0x30,$tmp1
  je 	 .L_num_blocks_is_3_${rndsuffix}
  cmp 	 \$0x20,$tmp1
  je 	 .L_num_blocks_is_2_${rndsuffix}
  cmp 	 \$0x10,$tmp1
  je 	 .L_num_blocks_is_1_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_7_${rndsuffix}:\n";
  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 7);

  $code .= "add      \$0x70,$input\n";

  encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  vmovdqu 	 %xmm4,0x30($output)
  vmovdqu 	 %xmm5,0x40($output)
  vmovdqu 	 %xmm6,0x50($output)
  vmovdqu 	 %xmm7,0x60($output)
  add 	 \$0x70,$output
  vmovdqa 	 %xmm7,%xmm8
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_next_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_6_${rndsuffix}:\n";
  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 6);

  $code .= "add      \$0x60,$input\n";

  encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  vmovdqu 	 %xmm4,0x30($output)
  vmovdqu 	 %xmm5,0x40($output)
  vmovdqu 	 %xmm6,0x50($output)
  add 	 \$0x60,$output
  vmovdqa 	 %xmm6,%xmm8
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_next_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_5_${rndsuffix}:\n";
  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 5);

  $code .= "add      \$0x50,$input\n";

  encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  vmovdqu 	 %xmm4,0x30($output)
  vmovdqu 	 %xmm5,0x40($output)
  add 	 \$0x50,$output
  vmovdqa 	 %xmm5,%xmm8
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_next_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_4_${rndsuffix}:\n";

  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 4);

  $code .= "add      \$0x40, $input\n";

  encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  vmovdqu 	 %xmm4,0x30($output)
  add 	 \$0x40,$output
  vmovdqa 	 %xmm4,%xmm8
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_next_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_3_${rndsuffix}:\n";

  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 3);

  $code .= "add      \$0x30,$input\n";

  encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  add 	 \$0x30,$output
  vmovdqa 	 %xmm3,%xmm8
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_next_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_2_${rndsuffix}:\n";

  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 2);

  $code .= "add      \$0x20,$input\n";

  encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  add 	 \$0x20,$output
  vmovdqa 	 %xmm2,%xmm8
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_next_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_1_${rndsuffix}:\n";

  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 1);

  $code .= "add      \$0x10,$input\n";

  encrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  add 	 \$0x10,$output
  vmovdqa 	 %xmm1,%xmm8
  and 	 \$0xf,$length
  je 	 .L_ret_${rndsuffix}
  jmp 	 .L_steal_cipher_next_${rndsuffix}
  .cfi_endproc
___
  }

  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
  # ;void aes_hw_xts_decrypt_avx512(
  # ;               const uint8_t *in,        // input data
  # ;               uint8_t *out,             // output data
  # ;               size_t length,            // sector size, in bytes
  # ;               const AES_KEY *key1,      // key used for "ECB" decryption
  # ;               const AES_KEY *key2,      // key used for tweak encryption
  # ;               const uint8_t iv[16])     // initial tweak value, 16 bytes
  # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
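  #
  # A usage sketch from C, for orientation only (the key-schedule setup
  # below is an assumption about the caller, not taken from this file):
  #
  #   AES_KEY dkey, tkey;
  #   AES_set_decrypt_key(key1_bytes, 256, &dkey);   /* block key */
  #   AES_set_encrypt_key(key2_bytes, 256, &tkey);   /* tweak key */
  #   aes_hw_xts_decrypt_avx512(in, out, len, &dkey, &tkey, iv);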

  my $rndsuffix = &random_string();

  {
  $code.=<<___;
  .globl	aes_hw_xts_decrypt_avx512
  .hidden	aes_hw_xts_decrypt_avx512
  .type	aes_hw_xts_decrypt_avx512,\@abi-omnipotent
  .align	32
  aes_hw_xts_decrypt_avx512:
  .cfi_startproc
          endbranch
___
  }
  $code .= "push 	 %rbp\n";
  $code .= "mov 	 %rsp,%rbp\n";
  $code .= "sub 	 \$$VARIABLE_OFFSET,%rsp\n";
  $code .= "and 	 \$0xffffffffffffffc0,%rsp\n";
  $code .= "mov 	 %rbx,$GP_STORAGE($TW)\n";

  if ($win64) {
    $code .= "mov 	 %rdi,$GP_STORAGE + 8*1($TW)\n";
    $code .= "mov 	 %rsi,$GP_STORAGE + 8*2($TW)\n";
    $code .= "vmovdqa      %xmm6, $XMM_STORAGE + 16*0($TW)\n";
    $code .= "vmovdqa      %xmm7, $XMM_STORAGE + 16*1($TW)\n";
    $code .= "vmovdqa      %xmm8, $XMM_STORAGE + 16*2($TW)\n";
    $code .= "vmovdqa      %xmm9, $XMM_STORAGE + 16*3($TW)\n";
    $code .= "vmovdqa      %xmm10, $XMM_STORAGE + 16*4($TW)\n";
    $code .= "vmovdqa      %xmm11, $XMM_STORAGE + 16*5($TW)\n";
    $code .= "vmovdqa      %xmm12, $XMM_STORAGE + 16*6($TW)\n";
    $code .= "vmovdqa      %xmm13, $XMM_STORAGE + 16*7($TW)\n";
    $code .= "vmovdqa      %xmm14, $XMM_STORAGE + 16*8($TW)\n";
    $code .= "vmovdqa      %xmm15, $XMM_STORAGE + 16*9($TW)\n";
  }

  $code .= "mov 	 \$0x87, $gf_poly_8b\n";
  $code .= "vmovdqu 	 ($tweak),%xmm1\n";      # read initial tweak values
  $code .= "vpxor 	 %xmm4,%xmm4,%xmm4\n"; # for key expansion

  encrypt_tweak_for_decryption("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4",
                               $key2, $key1, $TW);

  if ($win64) {
    $code .= "mov	 $input, 8 + 8*5(%rbp)\n"; # ciphertext pointer
    $code .= "mov        $output, 8 + 8*6(%rbp)\n"; # plaintext pointer
  }

  {
  $code.=<<___;

  cmp 	 \$0x80,$length
  jb 	 .L_less_than_128_bytes_${rndsuffix}
  vpbroadcastq 	 $gf_poly_8b,$ZPOLY
  cmp 	 \$0x100,$length
  jge 	 .L_start_by16_${rndsuffix}
  jmp 	 .L_start_by8_${rndsuffix}

  .L_do_n_blocks_${rndsuffix}:
  cmp 	 \$0x0,$length
  je 	 .L_ret_${rndsuffix}
  cmp 	 \$0x70,$length
  jge 	 .L_remaining_num_blocks_is_7_${rndsuffix}
  cmp 	 \$0x60,$length
  jge 	 .L_remaining_num_blocks_is_6_${rndsuffix}
  cmp 	 \$0x50,$length
  jge 	 .L_remaining_num_blocks_is_5_${rndsuffix}
  cmp 	 \$0x40,$length
  jge 	 .L_remaining_num_blocks_is_4_${rndsuffix}
  cmp 	 \$0x30,$length
  jge 	 .L_remaining_num_blocks_is_3_${rndsuffix}
  cmp 	 \$0x20,$length
  jge 	 .L_remaining_num_blocks_is_2_${rndsuffix}
  cmp 	 \$0x10,$length
  jge 	 .L_remaining_num_blocks_is_1_${rndsuffix}

  # _remaining_num_blocks_is_0:
  vmovdqu		%xmm5, %xmm1
  # xmm5 holds the last full block, to be decrypted with the next tweak
___
  }
  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);

  {
  $code .= <<___;
  vmovdqu %xmm1, -0x10($output)
  vmovdqa %xmm1, %xmm8

  # Calc previous tweak
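  # (previous tweak = tweak * x^-1 in GF(2^128): shift the 128-bit
  # value right by one bit across both qwords and, when the bit
  # shifted out is set, fold the reduction polynomial back in)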
  mov		\$0x1,$tmp1
  kmovq		$tmp1, %k1
  vpsllq	\$0x3f,%xmm9,%xmm13
  vpsraq	\$0x3f,%xmm13,%xmm14
  vpandq	%xmm25,%xmm14,%xmm5
  vpxorq        %xmm5,%xmm9,%xmm9{%k1}
  vpsrldq       \$0x8,%xmm9,%xmm10
  .byte 98, 211, 181, 8, 115, 194, 1 #vpshrdq \$0x1,%xmm10,%xmm9,%xmm0
  vpslldq       \$0x8,%xmm13,%xmm13
  vpxorq        %xmm13,%xmm0,%xmm0
  jmp           .L_steal_cipher_${rndsuffix}

  .L_remaining_num_blocks_is_7_${rndsuffix}:
  mov 	 \$0xffffffffffffffff,$tmp1
  shr 	 \$0x10,$tmp1
  kmovq 	 $tmp1,%k1
  vmovdqu8 	 ($input),%zmm1
  vmovdqu8 	 0x40($input),%zmm2{%k1}
  add 	         \$0x70,$input
  and            \$0xf,$length
  je             .L_done_7_remain_${rndsuffix}
  vextracti32x4   \$0x2,%zmm10,%xmm12
  vextracti32x4   \$0x3,%zmm10,%xmm13
  vinserti32x4    \$0x2,%xmm13,%zmm10,%zmm10
___
  }

  decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);

  {
  $code .= <<___;
  vmovdqu8 	 %zmm1, ($output)
  vmovdqu8 	 %zmm2, 0x40($output){%k1}
  add 	         \$0x70, $output
  vextracti32x4  \$0x2,%zmm2,%xmm8
  vmovdqa        %xmm12,%xmm0
  jmp            .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_7_remain_${rndsuffix}:\n";
  decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);

  {
  $code .= <<___;
  vmovdqu8        %zmm1, ($output)
  vmovdqu8        %zmm2, 0x40($output){%k1}
  jmp     .L_ret_${rndsuffix}

  .L_remaining_num_blocks_is_6_${rndsuffix}:
  vmovdqu8 	 ($input),%zmm1
  vmovdqu8 	 0x40($input),%ymm2
  add 	         \$0x60,$input
  and            \$0xf, $length
  je             .L_done_6_remain_${rndsuffix}
  vextracti32x4   \$0x1,%zmm10,%xmm12
  vextracti32x4   \$0x2,%zmm10,%xmm13
  vinserti32x4    \$0x1,%xmm13,%zmm10,%zmm10
___
  }

  decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);

  {
  $code .= <<___;
  vmovdqu8 	 %zmm1, ($output)
  vmovdqu8 	 %ymm2, 0x40($output)
  add 	         \$0x60,$output
  vextracti32x4  \$0x1,%zmm2,%xmm8
  vmovdqa        %xmm12,%xmm0
  jmp            .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_6_remain_${rndsuffix}:\n";
  decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);

  {
  $code .= <<___;
  vmovdqu8        %zmm1, ($output)
  vmovdqu8        %ymm2,0x40($output)
  jmp             .L_ret_${rndsuffix}

  .L_remaining_num_blocks_is_5_${rndsuffix}:
  vmovdqu8 	 ($input),%zmm1
  vmovdqu 	 0x40($input),%xmm2
  add 	         \$0x50,$input
  and            \$0xf,$length
  je             .L_done_5_remain_${rndsuffix}
  vmovdqa        %xmm10,%xmm12
  vextracti32x4  \$0x1,%zmm10,%xmm10
___
  }

  decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);

  {
  $code .= <<___;
  vmovdqu8         %zmm1, ($output)
  vmovdqu          %xmm2, 0x40($output)
  add              \$0x50, $output
  vmovdqa          %xmm2,%xmm8
  vmovdqa          %xmm12,%xmm0
  jmp              .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_5_remain_${rndsuffix}:\n";
  decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);

  {
  $code .= <<___;
  vmovdqu8        %zmm1, ($output)
  vmovdqu8        %xmm2, 0x40($output)
  jmp             .L_ret_${rndsuffix}

  .L_remaining_num_blocks_is_4_${rndsuffix}:
  vmovdqu8 	 ($input),%zmm1
  add 	         \$0x40,$input
  and            \$0xf, $length
  je             .L_done_4_remain_${rndsuffix}
  vextracti32x4   \$0x3,%zmm9,%xmm12
  vinserti32x4    \$0x3,%xmm10,%zmm9,%zmm9
___
  }

  decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);

  {
  $code .= <<___;
  vmovdqu8        %zmm1,($output)
  add             \$0x40,$output
  vextracti32x4   \$0x3,%zmm1,%xmm8
  vmovdqa         %xmm12,%xmm0
  jmp             .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_4_remain_${rndsuffix}:\n";
  decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 1);

  {
  $code .= <<___;
  vmovdqu8        %zmm1, ($output)
  jmp             .L_ret_${rndsuffix}

  .L_remaining_num_blocks_is_3_${rndsuffix}:
  vmovdqu         ($input),%xmm1
  vmovdqu         0x10($input),%xmm2
  vmovdqu         0x20($input),%xmm3
  add             \$0x30,$input
  and             \$0xf,$length
  je              .L_done_3_remain_${rndsuffix}
  vextracti32x4   \$0x2,%zmm9,%xmm13
  vextracti32x4   \$0x1,%zmm9,%xmm10
  vextracti32x4   \$0x3,%zmm9,%xmm11
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  add 	         \$0x30,$output
  vmovdqa 	 %xmm3,%xmm8
  vmovdqa        %xmm13,%xmm0
  jmp 	         .L_steal_cipher_${rndsuffix}
___
  }
  $code .= "\n.L_done_3_remain_${rndsuffix}:\n";
  $code .= "vextracti32x4   \$0x1,%zmm9,%xmm10\n";
  $code .= "vextracti32x4   \$0x2,%zmm9,%xmm11\n";

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1);

  {
  $code .= <<___;
  vmovdqu %xmm1,($output)
  vmovdqu %xmm2,0x10($output)
  vmovdqu %xmm3,0x20($output)
  jmp     .L_ret_${rndsuffix}

  .L_remaining_num_blocks_is_2_${rndsuffix}:
  vmovdqu         ($input),%xmm1
  vmovdqu         0x10($input),%xmm2
  add             \$0x20,$input
  and             \$0xf,$length
  je              .L_done_2_remain_${rndsuffix}
  vextracti32x4   \$0x2,%zmm9,%xmm10
  vextracti32x4   \$0x1,%zmm9,%xmm12
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  add 	         \$0x20,$output
  vmovdqa 	 %xmm2,%xmm8
  vmovdqa 	 %xmm12,%xmm0
  jmp 	         .L_steal_cipher_${rndsuffix}
___
  }
  $code .= "\n.L_done_2_remain_${rndsuffix}:\n";
  $code .= "vextracti32x4   \$0x1,%zmm9,%xmm10\n";

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1);

  {
  $code .= <<___;
  vmovdqu   %xmm1,($output)
  vmovdqu   %xmm2,0x10($output)
  jmp       .L_ret_${rndsuffix}

  .L_remaining_num_blocks_is_1_${rndsuffix}:
  vmovdqu 	 ($input),%xmm1
  add 	         \$0x10,$input
  and            \$0xf,$length
  je             .L_done_1_remain_${rndsuffix}
  vextracti32x4  \$0x1,%zmm9,%xmm11
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm11", "%xmm10", "%xmm9", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);
  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  add 	         \$0x10,$output
  vmovdqa 	 %xmm1,%xmm8
  vmovdqa 	 %xmm9,%xmm0
  jmp 	         .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_1_remain_${rndsuffix}:\n";

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);

  {
  $code .= <<___;
  vmovdqu   %xmm1, ($output)
  jmp       .L_ret_${rndsuffix}

  .L_start_by16_${rndsuffix}:
  vbroadcasti32x4 	 ($TW),%zmm0
  vbroadcasti32x4 shufb_15_7(%rip),%zmm8
  mov 	 \$0xaa,$tmp1
  kmovq 	 $tmp1,%k2

  # Mult tweak by 2^{3, 2, 1, 0}
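  # (shufb_15_7 pre-gathers the top byte of each tweak qword so the
  # bits shifted out by vpsllvq can be recovered: vpsrlvq aligns them,
  # vpclmulqdq folds them through the polynomial, and the masked xors
  # merge the results into four tweak multiples per zmm register)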
  vpshufb 	 %zmm8,%zmm0,%zmm1
  vpsllvq const_dq3210(%rip),%zmm0,%zmm4
  vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
  vpclmulqdq 	 \$0x0,$ZPOLY,%zmm2,%zmm3
  vpxorq 	 %zmm2,%zmm4,%zmm4{%k2}
  vpxord 	 %zmm4,%zmm3,%zmm9

  # Mult tweak by 2^{7, 6, 5, 4}
  vpsllvq const_dq7654(%rip),%zmm0,%zmm5
  vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
  vpclmulqdq 	 \$0x0,%zmm25,%zmm6,%zmm7
  vpxorq 	 %zmm6,%zmm5,%zmm5{%k2}
  vpxord 	 %zmm5,%zmm7,%zmm10

  # Make the next 8 tweak values by multiplying each by 2^8
  vpsrldq 	 \$0xf,%zmm9,%zmm13
  vpclmulqdq 	 \$0x0,%zmm25,%zmm13,%zmm14
  vpslldq 	 \$0x1,%zmm9,%zmm11
  vpxord 	 %zmm14,%zmm11,%zmm11

  vpsrldq 	 \$0xf,%zmm10,%zmm15
  vpclmulqdq 	 \$0x0,%zmm25,%zmm15,%zmm16
  vpslldq 	 \$0x1,%zmm10,%zmm12
  vpxord 	 %zmm16,%zmm12,%zmm12

  .L_main_loop_run_16_${rndsuffix}:
  vmovdqu8 	 ($input),%zmm1
  vmovdqu8 	 0x40($input),%zmm2
  vmovdqu8 	 0x80($input),%zmm3
  vmovdqu8 	 0xc0($input),%zmm4
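  # keep the last full block in xmm5 in case ciphertext stealing is needed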
  vmovdqu8 	 0xf0($input),%xmm5
  add 	 \$0x100,$input
___
  }

  decrypt_by_16_zmm("%zmm1", "%zmm2", "%zmm3", "%zmm4", "%zmm9",
                    "%zmm10", "%zmm11", "%zmm12", "%zmm0", 0);

  {
  $code .= <<___;
  vmovdqu8 	 %zmm1,($output)
  vmovdqu8 	 %zmm2,0x40($output)
  vmovdqu8 	 %zmm3,0x80($output)
  vmovdqu8 	 %zmm4,0xc0($output)
  add 	 \$0x100,$output
  sub 	 \$0x100,$length
  cmp 	 \$0x100,$length
  jge 	 .L_main_loop_run_16_${rndsuffix}

  cmp 	 \$0x80,$length
  jge 	 .L_main_loop_run_8_${rndsuffix}
  jmp 	 .L_do_n_blocks_${rndsuffix}

  .L_start_by8_${rndsuffix}:
  # Make first 7 tweak values
  vbroadcasti32x4 	 ($TW),%zmm0
  vbroadcasti32x4 shufb_15_7(%rip),%zmm8
  mov 	 \$0xaa,$tmp1
  kmovq 	 $tmp1,%k2

  # Mult tweak by 2^{3, 2, 1, 0}
  vpshufb 	 %zmm8,%zmm0,%zmm1
  vpsllvq const_dq3210(%rip),%zmm0,%zmm4
  vpsrlvq const_dq5678(%rip),%zmm1,%zmm2
  vpclmulqdq 	 \$0x0,%zmm25,%zmm2,%zmm3
  vpxorq 	 %zmm2,%zmm4,%zmm4{%k2}
  vpxord 	 %zmm4,%zmm3,%zmm9

  # Mult tweak by 2^{7, 6, 5, 4}
  vpsllvq const_dq7654(%rip),%zmm0,%zmm5
  vpsrlvq const_dq1234(%rip),%zmm1,%zmm6
  vpclmulqdq 	 \$0x0,%zmm25,%zmm6,%zmm7
  vpxorq 	 %zmm6,%zmm5,%zmm5{%k2}
  vpxord 	 %zmm5,%zmm7,%zmm10

  .L_main_loop_run_8_${rndsuffix}:
  vmovdqu8 	 ($input),%zmm1
  vmovdqu8 	 0x40($input),%zmm2
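  # as in the by16 loop, keep the last block in xmm5 for possible stealing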
  vmovdqu8 	 0x70($input),%xmm5
  add 	         \$0x80,$input
___
  }


  decrypt_by_eight_zmm("%zmm1", "%zmm2", "%zmm9", "%zmm10", "%zmm0", 0);

  {
  $code .= <<___;
  vmovdqu8 	 %zmm1,($output)
  vmovdqu8 	 %zmm2,0x40($output)
  add 	 \$0x80,$output
  sub 	 \$0x80,$length
  cmp 	 \$0x80,$length
  jge 	 .L_main_loop_run_8_${rndsuffix}
  jmp 	 .L_do_n_blocks_${rndsuffix}

  .L_steal_cipher_${rndsuffix}:
  # start cipher stealing: xmm8 = last block produced, xmm0 = tweak for
  # the stolen block
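  # Ciphertext stealing: N = length % 16 tail bytes remain.  The first
  # N bytes of the block just produced become the output tail, while
  # its remaining 16-N bytes are appended to the N-byte input tail to
  # rebuild a full block; that block is decrypted with the tweak in
  # xmm0 and stored as the final full output block below.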
  vmovdqa 	 %xmm8,%xmm2

  # shift xmm8 to the left by 16-N_val bytes
  lea vpshufb_shf_table(%rip),$TWTEMPL
  vmovdqu 	 ($TWTEMPL,$length,1),%xmm10
  vpshufb 	 %xmm10,%xmm8,%xmm8


  vmovdqu 	 -0x10($input,$length,1),%xmm3
  vmovdqu 	 %xmm8,-0x10($output,$length,1)

  # shift xmm3 to the right by 16-N_val bytes
  lea vpshufb_shf_table(%rip), $TWTEMPL
  add \$16, $TWTEMPL
  sub 	 $length,$TWTEMPL
  vmovdqu 	 ($TWTEMPL),%xmm10
  vpxor mask1(%rip),%xmm10,%xmm10
  vpshufb 	 %xmm10,%xmm3,%xmm3

  vpblendvb 	 %xmm10,%xmm2,%xmm3,%xmm3

  # xor Tweak value
  vpxor 	 %xmm0,%xmm3,%xmm8

  # decrypt last block with cipher stealing
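  # (AES-256: whitening key, 13 rounds, and the final round; the
  # expanded key1 schedule is read from 0x80(%rsp)..0x160(%rsp))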
  vpxor 	 0x80(%rsp),%xmm8,%xmm8
  vaesdec 	 0x90(%rsp),%xmm8,%xmm8
  vaesdec 	 0xa0(%rsp),%xmm8,%xmm8
  vaesdec 	 0xb0(%rsp),%xmm8,%xmm8
  vaesdec 	 0xc0(%rsp),%xmm8,%xmm8
  vaesdec 	 0xd0(%rsp),%xmm8,%xmm8
  vaesdec 	 0xe0(%rsp),%xmm8,%xmm8
  vaesdec 	 0xf0(%rsp),%xmm8,%xmm8
  vaesdec 	 0x100(%rsp),%xmm8,%xmm8
  vaesdec 	 0x110(%rsp),%xmm8,%xmm8
  vaesdec 	 0x120(%rsp),%xmm8,%xmm8
  vaesdec 	 0x130(%rsp),%xmm8,%xmm8
  vaesdec 	 0x140(%rsp),%xmm8,%xmm8
  vaesdec 	 0x150(%rsp),%xmm8,%xmm8
  vaesdeclast 	 0x160(%rsp),%xmm8,%xmm8

  # xor Tweak value
  vpxor 	 %xmm0,%xmm8,%xmm8

  .L_done_${rndsuffix}:
  # store last ciphertext value
  vmovdqu 	 %xmm8,-0x10($output)
___
  }

  {
  $code .= <<___;
  .L_ret_${rndsuffix}:
  mov 	 $GP_STORAGE($TW),%rbx
  xor    $tmp1,$tmp1
  mov    $tmp1,$GP_STORAGE($TW)
  # Zero-out the whole of `%zmm0`.
  vpxorq %zmm0,%zmm0,%zmm0
___
  }

  if ($win64) {
    $code .= <<___;
    mov $GP_STORAGE + 8*1($TW),%rdi
    mov $tmp1,$GP_STORAGE + 8*1($TW)
    mov $GP_STORAGE + 8*2($TW),%rsi
    mov $tmp1,$GP_STORAGE + 8*2($TW)

    vmovdqa $XMM_STORAGE + 16 * 0($TW), %xmm6
    vmovdqa $XMM_STORAGE + 16 * 1($TW), %xmm7
    vmovdqa $XMM_STORAGE + 16 * 2($TW), %xmm8
    vmovdqa $XMM_STORAGE + 16 * 3($TW), %xmm9

    # Zero the 64 bytes we just restored to the xmm registers.
    vmovdqa64 %zmm0,$XMM_STORAGE($TW)

    vmovdqa $XMM_STORAGE + 16 * 4($TW), %xmm10
    vmovdqa $XMM_STORAGE + 16 * 5($TW), %xmm11
    vmovdqa $XMM_STORAGE + 16 * 6($TW), %xmm12
    vmovdqa $XMM_STORAGE + 16 * 7($TW), %xmm13

    # And again.
    vmovdqa64 %zmm0,$XMM_STORAGE + 16 * 4($TW)

    vmovdqa $XMM_STORAGE + 16 * 8($TW), %xmm14
    vmovdqa $XMM_STORAGE + 16 * 9($TW), %xmm15

  # The last chunk is only 32 bytes (256 bits), so we use `%ymm` as the
  # source operand.
    vmovdqa %ymm0,$XMM_STORAGE + 16 * 8($TW)
___
  }

  {
  $code .= <<___;
  # Zero-out the stack frames used for `key1`, 64 bytes at a time.
  vmovdqa64    %zmm0,0x80(%rsp)
  vmovdqa64    %zmm0,0xc0(%rsp)
  vmovdqa64    %zmm0,0x100(%rsp)

  # Stack usage is not divisible by 64, so we use a kmask register to
  # store only 48 of the bytes (0x3f selects six quad-word lanes).
  mov       \$0x3f,$tmp1
  kmovq     $tmp1,%k2
  vmovdqa64 %zmm0,0x140(%rsp){%k2}

  mov %rbp,%rsp
  pop %rbp
  vzeroupper
  ret

  .L_less_than_128_bytes_${rndsuffix}:
  cmp 	 \$0x10,$length
  jb 	 .L_ret_${rndsuffix}

  mov 	 $length,$tmp1
  and 	 \$0x70,$tmp1
  cmp 	 \$0x60,$tmp1
  je 	 .L_num_blocks_is_6_${rndsuffix}
  cmp 	 \$0x50,$tmp1
  je 	 .L_num_blocks_is_5_${rndsuffix}
  cmp 	 \$0x40,$tmp1
  je 	 .L_num_blocks_is_4_${rndsuffix}
  cmp 	 \$0x30,$tmp1
  je 	 .L_num_blocks_is_3_${rndsuffix}
  cmp 	 \$0x20,$tmp1
  je 	 .L_num_blocks_is_2_${rndsuffix}
  cmp 	 \$0x10,$tmp1
  je 	 .L_num_blocks_is_1_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_7_${rndsuffix}:\n";
  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 7);

  {
  $code .= <<___;
  add    \$0x70,$input
  and    \$0xf,$length
  je      .L_done_7_${rndsuffix}

  .L_steal_cipher_7_${rndsuffix}:
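   # advance the tweak by one multiply-by-x in GF(2^128): shift the low
   # qword left, carry into the high qword, and on carry-out fold the
   # polynomial 0x87 back into the low qword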
   xor         $gf_poly_8b_temp, $gf_poly_8b_temp
   shl         \$1, $TWTEMPL
   adc         $TWTEMPH, $TWTEMPH
   cmovc       $gf_poly_8b, $gf_poly_8b_temp
   xor         $gf_poly_8b_temp, $TWTEMPL
   mov         $TWTEMPL,0x10($TW)
   mov         $TWTEMPH,0x18($TW)
   vmovdqa64   %xmm15,%xmm16
   vmovdqa     0x10(%rsp),%xmm15
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  vmovdqu 	 %xmm4,0x30($output)
  vmovdqu 	 %xmm5,0x40($output)
  vmovdqu 	 %xmm6,0x50($output)
  add 	         \$0x70,$output
  vmovdqa64 	 %xmm16,%xmm0
  vmovdqa 	 %xmm7,%xmm8
  jmp 	         .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_7_${rndsuffix}:\n";
  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 7, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  vmovdqu 	 %xmm4,0x30($output)
  vmovdqu 	 %xmm5,0x40($output)
  vmovdqu 	 %xmm6,0x50($output)
  add 	         \$0x70,$output
  vmovdqa 	 %xmm7,%xmm8
  jmp 	         .L_done_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_6_${rndsuffix}:\n";
  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 6);

  {
  $code .= <<___;
  add    \$0x60,$input
  and    \$0xf,$length
  je      .L_done_6_${rndsuffix}

  .L_steal_cipher_6_${rndsuffix}:
   xor         $gf_poly_8b_temp, $gf_poly_8b_temp
   shl         \$1, $TWTEMPL
   adc         $TWTEMPH, $TWTEMPH
   cmovc       $gf_poly_8b, $gf_poly_8b_temp
   xor         $gf_poly_8b_temp, $TWTEMPL
   mov         $TWTEMPL,0x10($TW)
   mov         $TWTEMPH,0x18($TW)
   vmovdqa64   %xmm14,%xmm15
   vmovdqa     0x10(%rsp),%xmm14
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  vmovdqu 	 %xmm4,0x30($output)
  vmovdqu 	 %xmm5,0x40($output)
  add 	         \$0x60,$output
  vmovdqa 	 %xmm15,%xmm0
  vmovdqa 	 %xmm6,%xmm8
  jmp 	         .L_steal_cipher_${rndsuffix}
___
  }
  $code .= "\n.L_done_6_${rndsuffix}:\n";
  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 6, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  vmovdqu 	 %xmm4,0x30($output)
  vmovdqu 	 %xmm5,0x40($output)
  add 	         \$0x60,$output
  vmovdqa 	 %xmm6,%xmm8
  jmp 	         .L_done_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_5_${rndsuffix}:\n";
  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 5);

  {
  $code .= <<___;
  add    \$0x50,$input
  and    \$0xf,$length
  je      .L_done_5_${rndsuffix}

  .L_steal_cipher_5_${rndsuffix}:
   xor         $gf_poly_8b_temp, $gf_poly_8b_temp
   shl         \$1, $TWTEMPL
   adc         $TWTEMPH, $TWTEMPH
   cmovc       $gf_poly_8b, $gf_poly_8b_temp
   xor         $gf_poly_8b_temp, $TWTEMPL
   mov         $TWTEMPL,0x10($TW)
   mov         $TWTEMPH,0x18($TW)
   vmovdqa64   %xmm13,%xmm14
   vmovdqa     0x10($TW),%xmm13
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  vmovdqu 	 %xmm4,0x30($output)
  add 	         \$0x50,$output
  vmovdqa 	 %xmm14,%xmm0
  vmovdqa 	 %xmm5,%xmm8
  jmp 	         .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_5_${rndsuffix}:\n";
  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 5, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  vmovdqu 	 %xmm4,0x30($output)
  add 	         \$0x50,$output
  vmovdqa 	 %xmm5,%xmm8
  jmp 	         .L_done_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_4_${rndsuffix}:\n";

  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 4);

  {
  $code .= <<___;
  add    \$0x40,$input
  and    \$0xf,$length
  je      .L_done_4_${rndsuffix}

  .L_steal_cipher_4_${rndsuffix}:
   xor         $gf_poly_8b_temp, $gf_poly_8b_temp
   shl         \$1, $TWTEMPL
   adc         $TWTEMPH, $TWTEMPH
   cmovc       $gf_poly_8b, $gf_poly_8b_temp
   xor         $gf_poly_8b_temp, $TWTEMPL
   mov         $TWTEMPL,0x10($TW)
   mov         $TWTEMPH,0x18($TW)
   vmovdqa64   %xmm12,%xmm13
   vmovdqa     0x10($TW),%xmm12
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  add 	         \$0x40,$output
  vmovdqa 	 %xmm13,%xmm0
  vmovdqa 	 %xmm4,%xmm8
  jmp 	         .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_4_${rndsuffix}:\n";
  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 4, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  vmovdqu 	 %xmm3,0x20($output)
  add 	         \$0x40,$output
  vmovdqa 	 %xmm4,%xmm8
  jmp 	         .L_done_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_3_${rndsuffix}:\n";

  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 3);

  {
  $code .= <<___;
  add    \$0x30,$input
  and    \$0xf,$length
  je      .L_done_3_${rndsuffix}

  .L_steal_cipher_3_${rndsuffix}:
   xor         $gf_poly_8b_temp, $gf_poly_8b_temp
   shl         \$1, $TWTEMPL
   adc         $TWTEMPH, $TWTEMPH
   cmovc       $gf_poly_8b, $gf_poly_8b_temp
   xor         $gf_poly_8b_temp, $TWTEMPL
   mov         $TWTEMPL,0x10($TW)
   mov         $TWTEMPH,0x18($TW)
   vmovdqa64   %xmm11,%xmm12
   vmovdqa     0x10($TW),%xmm11
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  add 	         \$0x30,$output
  vmovdqa 	 %xmm12,%xmm0
  vmovdqa 	 %xmm3,%xmm8
  jmp 	         .L_steal_cipher_${rndsuffix}
___
  }
  $code .= "\n.L_done_3_${rndsuffix}:\n";
  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 3, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  vmovdqu 	 %xmm2,0x10($output)
  add 	         \$0x30,$output
  vmovdqa 	 %xmm3,%xmm8
  jmp 	         .L_done_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_2_${rndsuffix}:\n";

  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 2);

  {
  $code .= <<___;
  add    \$0x20,$input
  and    \$0xf,$length
  je      .L_done_2_${rndsuffix}

  .L_steal_cipher_2_${rndsuffix}:
   xor         $gf_poly_8b_temp, $gf_poly_8b_temp
   shl         \$1, $TWTEMPL
   adc         $TWTEMPH, $TWTEMPH
   cmovc       $gf_poly_8b, $gf_poly_8b_temp
   xor         $gf_poly_8b_temp, $TWTEMPL
   mov         $TWTEMPL,0x10($TW)
   mov         $TWTEMPH,0x18($TW)
   vmovdqa64   %xmm10,%xmm11
   vmovdqa     0x10($TW),%xmm10
___
  }

  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  add 	         \$0x20,$output
  vmovdqa 	 %xmm11,%xmm0
  vmovdqa 	 %xmm2,%xmm8
  jmp 	         .L_steal_cipher_${rndsuffix}
___
  }

  $code .= "\n.L_done_2_${rndsuffix}:\n";
  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 2, 1);

  {
  $code .= <<___;
  vmovdqu 	 %xmm1,($output)
  add 	         \$0x20,$output
  vmovdqa 	 %xmm2,%xmm8
  jmp 	         .L_done_${rndsuffix}
___
  }

  $code .= "\n.L_num_blocks_is_1_${rndsuffix}:\n";

  initialize("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
             "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
             "%xmm13", "%xmm14", "%xmm15", 1);

  {
  $code .= <<___;
  add    \$0x10,$input
  and    \$0xf,$length
  je      .L_done_1_${rndsuffix}

  .L_steal_cipher_1_${rndsuffix}:
   xor         $gf_poly_8b_temp, $gf_poly_8b_temp
   shl         \$1, $TWTEMPL
   adc         $TWTEMPH, $TWTEMPH
   cmovc       $gf_poly_8b, $gf_poly_8b_temp
   xor         $gf_poly_8b_temp, $TWTEMPL
   mov         $TWTEMPL,0x10($TW)
   mov         $TWTEMPH,0x18($TW)
   vmovdqa64   %xmm9,%xmm10
   vmovdqa     0x10($TW),%xmm9
___
  }
  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);

  {
  $code .= <<___;
  add 	         \$0x10,$output
  vmovdqa 	 %xmm10,%xmm0
  vmovdqa 	 %xmm1,%xmm8
  jmp 	         .L_steal_cipher_${rndsuffix}
___
  }
  $code .= "\n.L_done_1_${rndsuffix}:\n";
  decrypt_initial("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6",
                  "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", "%xmm12",
                  "%xmm13", "%xmm14", "%xmm15", "%xmm0", 1, 1);

  {
  $code .= <<___;
  add 	         \$0x10,$output
  vmovdqa 	 %xmm1,%xmm8
  jmp 	         .L_done_${rndsuffix}
  .cfi_endproc
___
  }

  $code .= <<___;
  .section .rodata
  .align 16

  vpshufb_shf_table:
    .quad 0x8786858483828100, 0x8f8e8d8c8b8a8988
    .quad 0x0706050403020100, 0x000e0d0c0b0a0908
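  # (a 16-byte load at vpshufb_shf_table+N gives a vpshufb mask that
  # shifts a register left by 16-N bytes; loading at offset 16-N and
  # xoring with mask1 gives the complementary right-shift mask, which
  # also drives the vpblendvb in the stealing code)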

  mask1:
    .quad 0x8080808080808080, 0x8080808080808080

  const_dq3210:
    .quad 0, 0, 1, 1, 2, 2, 3, 3
  const_dq5678:
    .quad 8, 8, 7, 7, 6, 6, 5, 5
  const_dq7654:
    .quad 4, 4, 5, 5, 6, 6, 7, 7
  const_dq1234:
    .quad 4, 4, 3, 3, 2, 2, 1, 1

  shufb_15_7:
    .byte  15, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 7, 0xff, 0xff
    .byte  0xff, 0xff, 0xff, 0xff, 0xff

.text
#endif
___
} else {
    $code .= <<___;
    .text
    .globl  aes_hw_xts_encrypt_avx512
    .globl  aes_hw_xts_decrypt_avx512

    aes_hw_xts_encrypt_avx512:
    aes_hw_xts_decrypt_avx512:
    .byte   0x0f,0x0b    # ud2
    ret
___
}

# Bits 7 & 4 contain the src1 register's MSB in inverted form
# Bits 6 & 5 contian the dst register's MSB in inverted form
# Bits 1 & 0 is fixed to 10 for vaesenc* instrcutions and 11
# for vpclmulqdq instruction
sub evex_byte1 {
  my ($mm, $src1, $dst) = @_;
  # set default to zero
  $src1 = 0 if (!defined($src1));
  $dst = 0 if (!defined($dst));

  my $byte = 0xf0 | $mm;

  if (($src1 & 0x8) > 0) {
      $byte = $byte & 0x7f;
  }
  if (($src1 & 0x10) > 0) {
      $byte = $byte & 0xef;
  }

  if (($dst & 0x8) > 0) {
      $byte = $byte & 0xdf;
  }
  if (($dst & 0x10) > 0) {
      $byte = $byte & 0xbf;
  }
  return $byte;
}

# Bits 6->3 contians the lower 4 bits of src2 register in inverted form
# Bits 0->2 is fixed to 101
sub evex_byte2 {
  my $src2 = shift;
  $src2 = ($src2 & 0x0f) ^ 0x0f;
  return (($src2 << 3) | 0x05);
}

# Bits 6 & 5 tells about the operand register types and bit 3 contains
# the src2 register's MSB in inverted form
sub evex_byte3 {
  my ($type, $src2) = @_;
  my $byte = 0x0; # default for xmm registers
  if ($type eq 'y') {
	$byte = 0x01;
  } elsif ($type eq 'z') {
	$byte = 0x02;
  }

  $byte = $byte << 5;

  if (!($src2 & 0x10)) {
      $byte = $byte | 0x08;
  }
  return $byte;
}
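# Worked example for the three helpers above: for
# "vaesdec %xmm1,%xmm2,%xmm3" (rm = xmm1, vvvv = xmm2, reg = xmm3),
#   evex_byte1(0x02, 3, 1) == 0xf2
#   evex_byte2(2)          == 0x6d
#   evex_byte3('x', 2)     == 0x08
# so vaesni() below emits the EVEX encoding
# .byte 0x62,0xf2,0x6d,0x08,0xde,0xd9 (opcode 0xde, ModRM 0xd9;
# printed in decimal: 98,242,109,8,222,217).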

sub vpclmulqdq {
  my $line = shift;
  my @opcode = (0x62);
  my $inst_type = 0x03; #vpclmulqdq
  my %opcodelet = (
     "vpclmulqdq" => 0x44,
  );
  if ($line=~/(vpclmul[a-z]+)\s+\$0x([0-9]+),\s*%([xyz])mm([0-9]+),\s*%[xyz]mm([0-9]+),\s*%[xyz]mm([0-9]+)/) {
        return undef if (!defined($opcodelet{$1}));
        my $byte1 = evex_byte1($inst_type, $6, $4);
        my $byte2 = evex_byte2($5);
        my $byte3 = evex_byte3($3, $5);
        my $modrm = 0xc0 | (($4 & 7) | (($6 & 7) << 3));
	push @opcode,$byte1,$byte2,$byte3;
	push @opcode,($opcodelet{$1});
	push @opcode,$modrm;
	push @opcode,hex($2);
        return ".byte\t".join(',',@opcode);
  }
  return $line;
}

sub vaesni {
  my $line = shift;
  my @opcode = (0x62);
  my $inst_type = 0x02; # vaesenc
  my ($byte1, $byte2, $byte3);
  my %opcodelet = (
     "vaesenc" => 0xdc, "vaesdec" => 0xde,
     "vaesenclast" => 0xdd, "vaesdeclast" => 0xdf,
  );
  if ($line=~/(vaes[a-z]+)\s+%([xyz])mm([0-9]+),\s*%[xyz]mm([0-9]+),\s*%[xyz]mm([0-9]*)/) {
        return undef if (!defined($opcodelet{$1}));
        $byte1 = evex_byte1($inst_type, $5, $3);
        $byte2 = evex_byte2($4);
        $byte3 = evex_byte3($2, $4);
        my $modrm = 0xc0 | ((($5 & 7) << 3) | ($3 & 7));
	push @opcode,$byte1,$byte2,$byte3;
	push @opcode,($opcodelet{$1});
	push @opcode,$modrm;
        return ".byte\t".join(',',@opcode);
  } elsif ($line=~/(vaes[a-z]+)\s+0x([0-9a-f]+)\(%rsp\),\s*%([xyz])mm([0-9]+),\s*%[xyz]mm([0-9]+)/) {
        return undef if (!defined($opcodelet{$1}));
        $byte1 = evex_byte1($inst_type,$5);
        $byte2 = evex_byte2($5);
        $byte3 = evex_byte3($3, $5);
        push @opcode,$byte1,$byte2,$byte3;
        push @opcode,($opcodelet{$1});
        my $rsp = 0x04;
        my $modrm = 0x80 | ((($5 & 7) << 3) | $rsp);
        push @opcode,$modrm;
        push @opcode,0x24;
        push @opcode, (hex($2) & 0xFF), ((hex($2) >> 8) & 0xFF);
        push @opcode, ((hex($2) >> 16) & 0xFF), ((hex($2) >> 24) & 0xFF);
        return ".byte\t".join(',',@opcode);
  }
  return $line;
}
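# Post-process the generated assembly: rewrite vpclmulqdq and vaes*
# mnemonics into explicit EVEX .byte sequences so the module assembles
# even with toolchains that lack VAES/VPCLMULQDQ support.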

$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(vpclmul.*).*$/vpclmulqdq($1)/gem;
$code =~ s/\b(vaesenc.*).*$/vaesni($1)/gem;
$code =~ s/\b(vaesdec.*).*$/vaesni($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";
