Index: head/crypto/openssl/crypto/aes/asm/aesni-mb-x86_64.pl =================================================================== --- head/crypto/openssl/crypto/aes/asm/aesni-mb-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/aes/asm/aesni-mb-x86_64.pl (revision 364822) @@ -1,1474 +1,1474 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # Multi-buffer AES-NI procedures process several independent buffers # in parallel by interleaving independent instructions. 
#
# Cycles per byte for interleave factor 4:
#
#			asymptotic	measured
#			---------------------------
# Westmere		5.00/4=1.25	5.13/4=1.28
# Atom			15.0/4=3.75	?15.7/4=3.93
# Sandy Bridge		5.06/4=1.27	5.18/4=1.29
# Ivy Bridge		5.06/4=1.27	5.14/4=1.29
# Haswell		4.44/4=1.11	4.44/4=1.11
# Bulldozer		5.75/4=1.44	5.76/4=1.44
#
# Cycles per byte for interleave factor 8 (not implemented for
# pre-AVX processors, where higher interleave factor incidentally
# doesn't result in improvement):
#
#			asymptotic	measured
#			---------------------------
# Sandy Bridge		5.06/8=0.64	7.10/8=0.89(*)
# Ivy Bridge		5.06/8=0.64	7.14/8=0.89(*)
# Haswell		5.00/8=0.63	5.00/8=0.63
# Bulldozer		5.75/8=0.72	5.77/8=0.72
#
# (*)	Sandy/Ivy Bridge are known to handle high interleave factors
#	suboptimally;

# Command line is "[flavour] output".  A first argument that contains
# a dot is taken to be the output file name, and flavour is left undefined.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected by flavour or by an ".asm" output name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator, first next to this script,
# then under ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the toolchain.  $avx ends up 0, 1 or 2 depending on the reported
# assembler/compiler version; the code visible below only tests it for
# truth to decide whether the AVX code paths are emitted.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# "clang" is intentionally not anchored to the start of a line, so a
# version banner with text in front of "clang version" still matches
# (the previous revision of this test used "^clang").
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped through the
# translator.  Fail loudly if the translator cannot be started instead
# of silently producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# void aesni_multi_cbc_encrypt (
#     struct {	void *inp,*out; int blocks; double iv[2]; } inp[8];
#     const AES_KEY *key,
#     int num);		/* 1 or 2 */
#

# Register assignment for the 4x (non-AVX) code paths.
$inp="%rdi";	# 1st arg
$key="%rsi";	# 2nd arg
$num="%edx";

@inptr=map("%r$_",(8..11));	# per-lane input pointers
@outptr=map("%r$_",(12..15));	# per-lane output pointers

($rndkey0,$rndkey1)=("%xmm0","%xmm1");
@out=map("%xmm$_",(2..5));
@inp=map("%xmm$_",(6..9));
($counters,$mask,$zero)=map("%xmm$_",(10..12));

($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	aesni_multi_cbc_encrypt
.type	aesni_multi_cbc_encrypt,\@function,3
.align	32
aesni_multi_cbc_encrypt:
.cfi_startproc
___
# Only when the AVX code was generated: if num is at least 2 and the AVX
# capability bit is set, divert to the 8x implementation.
$code.=<<___ if ($avx);
	cmp	\$2,$num
	jb	.Lenc_non_avx
	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
	test	\$`1<<28`,%ecx			# AVX bit
	jnz	_avx_cbc_enc_shortcut
	jmp	.Lenc_non_avx
.align	16
.Lenc_non_avx:
___
# Common prologue: %rax keeps the entry %rsp while the callee-saved
# general-purpose registers are pushed.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 only: save %xmm6-%xmm15 below the pushed registers.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Frame setup.  The descriptor pointer is advanced by 40*2 once, which is
# why every descriptor access below carries a "-40*2" term.
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters

	sub	\$48,%rsp
	and	\$-64,%rsp
	mov	%rax,16(%rsp)			# original %rsp
.cfi_cfa_expression	%rsp+16,deref,+8

.Lenc4x_body:
	movdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	40*2($inp),$inp

.Lenc4x_loop_grande:
	mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Load the four 40-byte descriptors: input pointer at +0, output pointer
# at +8, block count at +16 and IV at +24.  $num accumulates the largest
# block count; a lane whose count is not positive has its input pointer
# replaced by %rsp.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`40*$i+16-40*2`($inp),$one	# borrow $one for number of blocks
	mov	`40*$i+0-40*2`($inp),@inptr[$i]
	cmp	$num,$one
	mov	`40*$i+8-40*2`($inp),@outptr[$i]
	cmovg	$one,$num			# find maximum
	test	$one,$one
	movdqu	`40*$i+24-40*2`($inp),@out[$i]	# load IV
	mov	$one,`32+4*$i`(%rsp)		# initialize counters
	cmovle	%rsp,@inptr[$i]			# cancel input
___
}
# First blocks are xored with the 0-round key and the IVs, then the
# per-block loop starts.  The loop head issues one input prefetch per
# lane, @inptr[0] through @inptr[3]; the fourth prefetch used to repeat
# @inptr[2], leaving the last lane without a prefetch.
$code.=<<___;
	test	$num,$num
	jz	.Lenc4x_done

	movups	0x10-0x78($key),$rndkey1
	pxor	$zero,@out[0]
	movups	0x20-0x78($key),$rndkey0
	pxor	$zero,@out[1]
	mov	0xf0-0x78($key),$rounds
	pxor	$zero,@out[2]
	movdqu	(@inptr[0]),@inp[0]		# load inputs
	pxor	$zero,@out[3]
	movdqu	(@inptr[1]),@inp[1]
	pxor	@inp[0],@out[0]
	movdqu	(@inptr[2]),@inp[2]
	pxor	@inp[1],@out[1]
	movdqu	(@inptr[3]),@inp[3]
	pxor	@inp[2],@out[2]
	pxor	@inp[3],@out[3]
	movdqa	32(%rsp),$counters		# load counters
	xor	$offset,$offset
	jmp	.Loop_enc4x

.align	32
.Loop_enc4x:
	add	\$16,$offset
	lea	16(%rsp),$sink			# sink pointer
	mov	\$1,$one			# constant of 1
	sub	$offset,$sink

	aesenc	$rndkey1,@out[0]
	prefetcht0	31(@inptr[0],$offset)	# prefetch input
	prefetcht0	31(@inptr[1],$offset)
	aesenc	$rndkey1,@out[1]
	prefetcht0	31(@inptr[2],$offset)
	prefetcht0	31(@inptr[3],$offset)
	aesenc	$rndkey1,@out[2]
	aesenc	$rndkey1,@out[3]
	movups	0x30-0x78($key),$rndkey1
___
# Four more rounds, alternating between the two round-key registers.
# Each iteration also compares lane $i's counter with 1 and redirects
# that lane's pointers to the sink when the comparison says so.
for($i=0;$i<4;$i++) {
my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
    $code.=<<___;
	cmp	`32+4*$i`(%rsp),$one
	aesenc	$rndkey,@out[0]
	aesenc	$rndkey,@out[1]
	aesenc	$rndkey,@out[2]
	cmovge	$sink,@inptr[$i]		# cancel input
	cmovg	$sink,@outptr[$i]		# sink output
	aesenc	$rndkey,@out[3]
	movups	`0x40+16*$i-0x78`($key),$rndkey
___
}
# Two further rounds, interleaved with output prefetches and the counter
# update; the compare of the round count against 11 sets the flags that
# the branches following this block consume.
$code.=<<___;
	movdqa	$counters,$mask
	aesenc	$rndkey0,@out[0]
	prefetcht0	15(@outptr[0],$offset)	# prefetch output
	prefetcht0	15(@outptr[1],$offset)
	aesenc	$rndkey0,@out[1]
	prefetcht0	15(@outptr[2],$offset)
	prefetcht0	15(@outptr[3],$offset)
	aesenc	$rndkey0,@out[2]
	aesenc	$rndkey0,@out[3]
	movups	0x80-0x78($key),$rndkey0
	pxor	$zero,$zero

	aesenc	$rndkey1,@out[0]
	pcmpgtd	$zero,$mask
	movdqu	-0x78($key),$zero		# reload 0-round key
	aesenc	$rndkey1,@out[1]
	paddd	$mask,$counters			# decrement counters
	movdqa	$counters,32(%rsp)		# update counters
	aesenc	$rndkey1,@out[2]
	aesenc	$rndkey1,@out[3]
	movups	0x90-0x78($key),$rndkey1

	cmp	\$11,$rounds

	aesenc	$rndkey0,@out[0]
	aesenc	$rndkey0,@out[1]
	aesenc	$rndkey0,@out[2]
	aesenc	$rndkey0,@out[3]
	movups	0xa0-0x78($key),$rndkey0
	jb	.Lenc4x_tail			# round count below 11: no optional rounds

	aesenc	$rndkey1,@out[0]
	aesenc	$rndkey1,@out[1]
	aesenc	$rndkey1,@out[2]
	aesenc	$rndkey1,@out[3]
	movups	0xb0-0x78($key),$rndkey1

	aesenc	$rndkey0,@out[0]
	aesenc	$rndkey0,@out[1]
	aesenc	$rndkey0,@out[2]
	aesenc	$rndkey0,@out[3]
	movups	0xc0-0x78($key),$rndkey0

	je	.Lenc4x_tail			# round count equal to 11: skip last optional pair

	aesenc	$rndkey1,@out[0]
	aesenc	$rndkey1,@out[1]
	aesenc	$rndkey1,@out[2]
	aesenc	$rndkey1,@out[3]
	movups	0xd0-0x78($key),$rndkey1

	aesenc	$rndkey0,@out[0]
	aesenc	$rndkey0,@out[1]
	aesenc	$rndkey0,@out[2]
	aesenc	$rndkey0,@out[3]
	movups	0xe0-0x78($key),$rndkey0
	jmp	.Lenc4x_tail

.align	32
.Lenc4x_tail:
	# final two rounds, interleaved with loading the next input blocks
	# and xoring them with the 0-round key
	aesenc	$rndkey1,@out[0]
	aesenc	$rndkey1,@out[1]
	aesenc	$rndkey1,@out[2]
	aesenc	$rndkey1,@out[3]
	movdqu	(@inptr[0],$offset),@inp[0]
	movdqu	0x10-0x78($key),$rndkey1

	aesenclast	$rndkey0,@out[0]
	movdqu	(@inptr[1],$offset),@inp[1]
	pxor	$zero,@inp[0]
	aesenclast	$rndkey0,@out[1]
	movdqu	(@inptr[2],$offset),@inp[2]
	pxor	$zero,@inp[1]
	aesenclast	$rndkey0,@out[2]
	movdqu	(@inptr[3],$offset),@inp[3]
	pxor	$zero,@inp[2]
	aesenclast	$rndkey0,@out[3]
	movdqu	0x20-0x78($key),$rndkey0
	pxor	$zero,@inp[3]

	# store each finished block, then xor it with the next input block
	# so it becomes the starting state of the following iteration
	movups	@out[0],-16(@outptr[0],$offset)
	pxor	@inp[0],@out[0]
	movups	@out[1],-16(@outptr[1],$offset)
	pxor	@inp[1],@out[1]
	movups	@out[2],-16(@outptr[2],$offset)
	pxor	@inp[2],@out[2]
	movups	@out[3],-16(@outptr[3],$offset)
	pxor	@inp[3],@out[3]

	dec	$num
	jnz	.Loop_enc4x

	mov	16(%rsp),%rax			# original %rsp
.cfi_def_cfa	%rax,8
	mov	24(%rsp),$num

	#pxor	@inp[0],@out[0]
	#pxor	@inp[1],@out[1]
	#movdqu	@out[0],`40*0+24-40*2`($inp)	# output iv FIX ME!
	#pxor	@inp[2],@out[2]
	#movdqu	@out[1],`40*1+24-40*2`($inp)
	#pxor	@inp[3],@out[3]
	#movdqu	@out[2],`40*2+24-40*2`($inp)	# won't fix, let caller
	#movdqu	@out[3],`40*3+24-40*2`($inp)	# figure this out...
	lea	`40*4`($inp),$inp
	dec	$num
	jnz	.Lenc4x_loop_grande

.Lenc4x_done:
___
# Win64 only: reload %xmm6-%xmm12 from the save area addressed relative
# to %rax (the original %rsp).  The %xmm13-%xmm15 reloads stay commented
# out; the matching prologue marks those registers as unused.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	#movaps	-0x68(%rax),%xmm13
	#movaps	-0x58(%rax),%xmm14
	#movaps	-0x48(%rax),%xmm15
___
# Encrypt epilogue: restore the six pushed registers, reset %rsp and
# return.  The decrypt entry point follows, declared with the same
# three-argument \@function convention.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lenc4x_epilogue:
	ret
.cfi_endproc
.size	aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt

.globl	aesni_multi_cbc_decrypt
.type	aesni_multi_cbc_decrypt,\@function,3
.align	32
aesni_multi_cbc_decrypt:
.cfi_startproc
___
# Only when the AVX code was generated: if num is at least 2 and the AVX
# capability bit is set, divert to the 8x decrypt implementation.
$code.=<<___ if ($avx);
	cmp	\$2,$num
	jb	.Ldec_non_avx
	mov	OPENSSL_ia32cap_P+4(%rip),%ecx
	test	\$`1<<28`,%ecx			# AVX bit
	jnz	_avx_cbc_dec_shortcut
	jmp	.Ldec_non_avx
.align	16
.Ldec_non_avx:
___
# Decrypt prologue: %rax keeps the entry %rsp while the callee-saved
# general-purpose registers are pushed.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 only: save %xmm6-%xmm15 below the pushed registers.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,0x60(%rsp)
	movaps	%xmm13,-0x68(%rax)	# not used, saved to share se_handler
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Frame setup.  The descriptor pointer is advanced by 40*2 once before
# the outer loop label.
$code.=<<___;
	# stack layout
	#
	# +0	output sink
	# +16	input sink [original %rsp and $num]
	# +32	counters

	sub	\$48,%rsp
	and	\$-64,%rsp
	mov	%rax,16(%rsp)			# original %rsp
.cfi_cfa_expression	%rsp+16,deref,+8

.Ldec4x_body:
	movdqu	($key),$zero			# 0-round key
	lea	0x78($key),$key			# size optimization
	lea	40*2($inp),$inp

.Ldec4x_loop_grande:
	mov	$num,24(%rsp)			# original $num
	xor	$num,$num
___
# Per-lane descriptor loading: the loop body follows.
for($i=0;$i<4;$i++)
{ $code.=<<___; mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks mov `40*$i+0-40*2`($inp),@inptr[$i] cmp $num,$one mov `40*$i+8-40*2`($inp),@outptr[$i] cmovg $one,$num # find maximum test $one,$one movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@inptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldec4x_done movups 0x10-0x78($key),$rndkey1 movups 0x20-0x78($key),$rndkey0 mov 0xf0-0x78($key),$rounds movdqu (@inptr[0]),@out[0] # load inputs movdqu (@inptr[1]),@out[1] pxor $zero,@out[0] movdqu (@inptr[2]),@out[2] pxor $zero,@out[1] movdqu (@inptr[3]),@out[3] pxor $zero,@out[2] pxor $zero,@out[3] movdqa 32(%rsp),$counters # load counters xor $offset,$offset jmp .Loop_dec4x .align 32 .Loop_dec4x: add \$16,$offset lea 16(%rsp),$sink # sink pointer mov \$1,$one # constant of 1 sub $offset,$sink aesdec $rndkey1,@out[0] prefetcht0 31(@inptr[0],$offset) # prefetch input prefetcht0 31(@inptr[1],$offset) aesdec $rndkey1,@out[1] prefetcht0 31(@inptr[2],$offset) prefetcht0 31(@inptr[3],$offset) aesdec $rndkey1,@out[2] aesdec $rndkey1,@out[3] movups 0x30-0x78($key),$rndkey1 ___ for($i=0;$i<4;$i++) { my $rndkey = ($i&1) ? 
$rndkey1 : $rndkey0; $code.=<<___; cmp `32+4*$i`(%rsp),$one aesdec $rndkey,@out[0] aesdec $rndkey,@out[1] aesdec $rndkey,@out[2] cmovge $sink,@inptr[$i] # cancel input cmovg $sink,@outptr[$i] # sink output aesdec $rndkey,@out[3] movups `0x40+16*$i-0x78`($key),$rndkey ___ } $code.=<<___; movdqa $counters,$mask aesdec $rndkey0,@out[0] prefetcht0 15(@outptr[0],$offset) # prefetch output prefetcht0 15(@outptr[1],$offset) aesdec $rndkey0,@out[1] prefetcht0 15(@outptr[2],$offset) prefetcht0 15(@outptr[3],$offset) aesdec $rndkey0,@out[2] aesdec $rndkey0,@out[3] movups 0x80-0x78($key),$rndkey0 pxor $zero,$zero aesdec $rndkey1,@out[0] pcmpgtd $zero,$mask movdqu -0x78($key),$zero # reload 0-round key aesdec $rndkey1,@out[1] paddd $mask,$counters # decrement counters movdqa $counters,32(%rsp) # update counters aesdec $rndkey1,@out[2] aesdec $rndkey1,@out[3] movups 0x90-0x78($key),$rndkey1 cmp \$11,$rounds aesdec $rndkey0,@out[0] aesdec $rndkey0,@out[1] aesdec $rndkey0,@out[2] aesdec $rndkey0,@out[3] movups 0xa0-0x78($key),$rndkey0 jb .Ldec4x_tail aesdec $rndkey1,@out[0] aesdec $rndkey1,@out[1] aesdec $rndkey1,@out[2] aesdec $rndkey1,@out[3] movups 0xb0-0x78($key),$rndkey1 aesdec $rndkey0,@out[0] aesdec $rndkey0,@out[1] aesdec $rndkey0,@out[2] aesdec $rndkey0,@out[3] movups 0xc0-0x78($key),$rndkey0 je .Ldec4x_tail aesdec $rndkey1,@out[0] aesdec $rndkey1,@out[1] aesdec $rndkey1,@out[2] aesdec $rndkey1,@out[3] movups 0xd0-0x78($key),$rndkey1 aesdec $rndkey0,@out[0] aesdec $rndkey0,@out[1] aesdec $rndkey0,@out[2] aesdec $rndkey0,@out[3] movups 0xe0-0x78($key),$rndkey0 jmp .Ldec4x_tail .align 32 .Ldec4x_tail: aesdec $rndkey1,@out[0] aesdec $rndkey1,@out[1] aesdec $rndkey1,@out[2] pxor $rndkey0,@inp[0] pxor $rndkey0,@inp[1] aesdec $rndkey1,@out[3] movdqu 0x10-0x78($key),$rndkey1 pxor $rndkey0,@inp[2] pxor $rndkey0,@inp[3] movdqu 0x20-0x78($key),$rndkey0 aesdeclast @inp[0],@out[0] aesdeclast @inp[1],@out[1] movdqu -16(@inptr[0],$offset),@inp[0] # load next IV movdqu 
-16(@inptr[1],$offset),@inp[1] aesdeclast @inp[2],@out[2] aesdeclast @inp[3],@out[3] movdqu -16(@inptr[2],$offset),@inp[2] movdqu -16(@inptr[3],$offset),@inp[3] movups @out[0],-16(@outptr[0],$offset) movdqu (@inptr[0],$offset),@out[0] movups @out[1],-16(@outptr[1],$offset) movdqu (@inptr[1],$offset),@out[1] pxor $zero,@out[0] movups @out[2],-16(@outptr[2],$offset) movdqu (@inptr[2],$offset),@out[2] pxor $zero,@out[1] movups @out[3],-16(@outptr[3],$offset) movdqu (@inptr[3],$offset),@out[3] pxor $zero,@out[2] pxor $zero,@out[3] dec $num jnz .Loop_dec4x mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 mov 24(%rsp),$num lea `40*4`($inp),$inp dec $num jnz .Ldec4x_loop_grande .Ldec4x_done: ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 #movaps -0x68(%rax),%xmm13 #movaps -0x58(%rax),%xmm14 #movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Ldec4x_epilogue: ret .cfi_endproc .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt ___ if ($avx) {{{ my @ptr=map("%r$_",(8..15)); my $offload=$sink; my @out=map("%xmm$_",(2..9)); my @inp=map("%xmm$_",(10..13)); my ($counters,$zero)=("%xmm14","%xmm15"); $code.=<<___; .type aesni_multi_cbc_encrypt_avx,\@function,3 .align 32 aesni_multi_cbc_encrypt_avx: .cfi_startproc _avx_cbc_enc_shortcut: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps 
%xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) movaps %xmm12,-0x78(%rax) movaps %xmm13,-0x68(%rax) movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) ___ $code.=<<___; # stack layout # # +0 output sink # +16 input sink [original %rsp and $num] # +32 counters # +64 distances between inputs and outputs # +128 off-load area for @inp[0..3] sub \$192,%rsp and \$-128,%rsp mov %rax,16(%rsp) # original %rsp .cfi_cfa_expression %rsp+16,deref,+8 .Lenc8x_body: vzeroupper vmovdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization lea 40*4($inp),$inp shr \$1,$num .Lenc8x_loop_grande: #mov $num,24(%rsp) # original $num xor $num,$num ___ for($i=0;$i<8;$i++) { my $temp = $i ? $offload : $offset; $code.=<<___; mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer cmp $num,$one mov `40*$i+8-40*4`($inp),$temp # output pointer cmovg $one,$num # find maximum test $one,$one vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@ptr[$i] # cancel input sub @ptr[$i],$temp # distance between input and output mov $temp,`64+8*$i`(%rsp) # initialize distances ___ } $code.=<<___; test $num,$num jz .Lenc8x_done vmovups 0x10-0x78($key),$rndkey1 vmovups 0x20-0x78($key),$rndkey0 mov 0xf0-0x78($key),$rounds vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round lea 128(%rsp),$offload # offload area vpxor (@ptr[1]),$zero,@inp[1] vpxor (@ptr[2]),$zero,@inp[2] vpxor (@ptr[3]),$zero,@inp[3] vpxor @inp[0],@out[0],@out[0] vpxor (@ptr[4]),$zero,@inp[0] vpxor @inp[1],@out[1],@out[1] vpxor (@ptr[5]),$zero,@inp[1] vpxor @inp[2],@out[2],@out[2] vpxor (@ptr[6]),$zero,@inp[2] vpxor @inp[3],@out[3],@out[3] vpxor (@ptr[7]),$zero,@inp[3] vpxor @inp[0],@out[4],@out[4] mov \$1,$one # constant of 1 vpxor @inp[1],@out[5],@out[5] vpxor @inp[2],@out[6],@out[6] vpxor @inp[3],@out[7],@out[7] jmp .Loop_enc8x .align 32 .Loop_enc8x: ___ for($i=0;$i<8;$i++) { my 
$rndkey=($i&1)?$rndkey0:$rndkey1; $code.=<<___; vaesenc $rndkey,@out[0],@out[0] cmp 32+4*$i(%rsp),$one ___ $code.=<<___ if ($i); mov 64+8*$i(%rsp),$offset ___ $code.=<<___; vaesenc $rndkey,@out[1],@out[1] prefetcht0 31(@ptr[$i]) # prefetch input vaesenc $rndkey,@out[2],@out[2] ___ $code.=<<___ if ($i>1); prefetcht0 15(@ptr[$i-2]) # prefetch output ___ $code.=<<___; vaesenc $rndkey,@out[3],@out[3] lea (@ptr[$i],$offset),$offset cmovge %rsp,@ptr[$i] # cancel input vaesenc $rndkey,@out[4],@out[4] cmovg %rsp,$offset # sink output vaesenc $rndkey,@out[5],@out[5] sub @ptr[$i],$offset vaesenc $rndkey,@out[6],@out[6] vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round mov $offset,64+8*$i(%rsp) vaesenc $rndkey,@out[7],@out[7] vmovups `16*(3+$i)-0x78`($key),$rndkey lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output ___ $code.=<<___ if ($i<4) vmovdqu @inp[$i%4],`16*$i`($offload) # off-load ___ } $code.=<<___; vmovdqu 32(%rsp),$counters prefetcht0 15(@ptr[$i-2]) # prefetch output prefetcht0 15(@ptr[$i-1]) cmp \$11,$rounds jb .Lenc8x_tail vaesenc $rndkey1,@out[0],@out[0] vaesenc $rndkey1,@out[1],@out[1] vaesenc $rndkey1,@out[2],@out[2] vaesenc $rndkey1,@out[3],@out[3] vaesenc $rndkey1,@out[4],@out[4] vaesenc $rndkey1,@out[5],@out[5] vaesenc $rndkey1,@out[6],@out[6] vaesenc $rndkey1,@out[7],@out[7] vmovups 0xb0-0x78($key),$rndkey1 vaesenc $rndkey0,@out[0],@out[0] vaesenc $rndkey0,@out[1],@out[1] vaesenc $rndkey0,@out[2],@out[2] vaesenc $rndkey0,@out[3],@out[3] vaesenc $rndkey0,@out[4],@out[4] vaesenc $rndkey0,@out[5],@out[5] vaesenc $rndkey0,@out[6],@out[6] vaesenc $rndkey0,@out[7],@out[7] vmovups 0xc0-0x78($key),$rndkey0 je .Lenc8x_tail vaesenc $rndkey1,@out[0],@out[0] vaesenc $rndkey1,@out[1],@out[1] vaesenc $rndkey1,@out[2],@out[2] vaesenc $rndkey1,@out[3],@out[3] vaesenc $rndkey1,@out[4],@out[4] vaesenc $rndkey1,@out[5],@out[5] vaesenc $rndkey1,@out[6],@out[6] vaesenc $rndkey1,@out[7],@out[7] vmovups 0xd0-0x78($key),$rndkey1 vaesenc 
$rndkey0,@out[0],@out[0] vaesenc $rndkey0,@out[1],@out[1] vaesenc $rndkey0,@out[2],@out[2] vaesenc $rndkey0,@out[3],@out[3] vaesenc $rndkey0,@out[4],@out[4] vaesenc $rndkey0,@out[5],@out[5] vaesenc $rndkey0,@out[6],@out[6] vaesenc $rndkey0,@out[7],@out[7] vmovups 0xe0-0x78($key),$rndkey0 .Lenc8x_tail: vaesenc $rndkey1,@out[0],@out[0] vpxor $zero,$zero,$zero vaesenc $rndkey1,@out[1],@out[1] vaesenc $rndkey1,@out[2],@out[2] vpcmpgtd $zero,$counters,$zero vaesenc $rndkey1,@out[3],@out[3] vaesenc $rndkey1,@out[4],@out[4] vpaddd $counters,$zero,$zero # decrement counters vmovdqu 48(%rsp),$counters vaesenc $rndkey1,@out[5],@out[5] mov 64(%rsp),$offset # pre-load 1st offset vaesenc $rndkey1,@out[6],@out[6] vaesenc $rndkey1,@out[7],@out[7] vmovups 0x10-0x78($key),$rndkey1 vaesenclast $rndkey0,@out[0],@out[0] vmovdqa $zero,32(%rsp) # update counters vpxor $zero,$zero,$zero vaesenclast $rndkey0,@out[1],@out[1] vaesenclast $rndkey0,@out[2],@out[2] vpcmpgtd $zero,$counters,$zero vaesenclast $rndkey0,@out[3],@out[3] vaesenclast $rndkey0,@out[4],@out[4] vpaddd $zero,$counters,$counters # decrement counters vmovdqu -0x78($key),$zero # 0-round vaesenclast $rndkey0,@out[5],@out[5] vaesenclast $rndkey0,@out[6],@out[6] vmovdqa $counters,48(%rsp) # update counters vaesenclast $rndkey0,@out[7],@out[7] vmovups 0x20-0x78($key),$rndkey0 vmovups @out[0],-16(@ptr[0]) # write output sub $offset,@ptr[0] # switch to input vpxor 0x00($offload),@out[0],@out[0] vmovups @out[1],-16(@ptr[1]) sub `64+1*8`(%rsp),@ptr[1] vpxor 0x10($offload),@out[1],@out[1] vmovups @out[2],-16(@ptr[2]) sub `64+2*8`(%rsp),@ptr[2] vpxor 0x20($offload),@out[2],@out[2] vmovups @out[3],-16(@ptr[3]) sub `64+3*8`(%rsp),@ptr[3] vpxor 0x30($offload),@out[3],@out[3] vmovups @out[4],-16(@ptr[4]) sub `64+4*8`(%rsp),@ptr[4] vpxor @inp[0],@out[4],@out[4] vmovups @out[5],-16(@ptr[5]) sub `64+5*8`(%rsp),@ptr[5] vpxor @inp[1],@out[5],@out[5] vmovups @out[6],-16(@ptr[6]) sub `64+6*8`(%rsp),@ptr[6] vpxor @inp[2],@out[6],@out[6] vmovups 
@out[7],-16(@ptr[7]) sub `64+7*8`(%rsp),@ptr[7] vpxor @inp[3],@out[7],@out[7] dec $num jnz .Loop_enc8x mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 #mov 24(%rsp),$num #lea `40*8`($inp),$inp #dec $num #jnz .Lenc8x_loop_grande .Lenc8x_done: vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lenc8x_epilogue: ret .cfi_endproc .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx .type aesni_multi_cbc_decrypt_avx,\@function,3 .align 32 aesni_multi_cbc_decrypt_avx: .cfi_startproc _avx_cbc_dec_shortcut: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) movaps %xmm12,-0x78(%rax) movaps %xmm13,-0x68(%rax) movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) ___ $code.=<<___; # stack layout # # +0 output sink # +16 input sink [original %rsp and $num] # +32 counters # +64 distances between inputs and outputs # +128 off-load area for @inp[0..3] # +192 IV/input offload sub \$256,%rsp and \$-256,%rsp sub \$192,%rsp mov %rax,16(%rsp) # original %rsp .cfi_cfa_expression %rsp+16,deref,+8 .Ldec8x_body: vzeroupper vmovdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization lea 40*4($inp),$inp shr \$1,$num 
.Ldec8x_loop_grande: #mov $num,24(%rsp) # original $num xor $num,$num ___ for($i=0;$i<8;$i++) { my $temp = $i ? $offload : $offset; $code.=<<___; mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer cmp $num,$one mov `40*$i+8-40*4`($inp),$temp # output pointer cmovg $one,$num # find maximum test $one,$one vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@ptr[$i] # cancel input sub @ptr[$i],$temp # distance between input and output mov $temp,`64+8*$i`(%rsp) # initialize distances vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV ___ } $code.=<<___; test $num,$num jz .Ldec8x_done vmovups 0x10-0x78($key),$rndkey1 vmovups 0x20-0x78($key),$rndkey0 mov 0xf0-0x78($key),$rounds lea 192+128(%rsp),$offload # offload area vmovdqu (@ptr[0]),@out[0] # load inputs vmovdqu (@ptr[1]),@out[1] vmovdqu (@ptr[2]),@out[2] vmovdqu (@ptr[3]),@out[3] vmovdqu (@ptr[4]),@out[4] vmovdqu (@ptr[5]),@out[5] vmovdqu (@ptr[6]),@out[6] vmovdqu (@ptr[7]),@out[7] vmovdqu @out[0],0x00($offload) # offload inputs vpxor $zero,@out[0],@out[0] # xor inputs with 0-round vmovdqu @out[1],0x10($offload) vpxor $zero,@out[1],@out[1] vmovdqu @out[2],0x20($offload) vpxor $zero,@out[2],@out[2] vmovdqu @out[3],0x30($offload) vpxor $zero,@out[3],@out[3] vmovdqu @out[4],0x40($offload) vpxor $zero,@out[4],@out[4] vmovdqu @out[5],0x50($offload) vpxor $zero,@out[5],@out[5] vmovdqu @out[6],0x60($offload) vpxor $zero,@out[6],@out[6] vmovdqu @out[7],0x70($offload) vpxor $zero,@out[7],@out[7] xor \$0x80,$offload mov \$1,$one # constant of 1 jmp .Loop_dec8x .align 32 .Loop_dec8x: ___ for($i=0;$i<8;$i++) { my $rndkey=($i&1)?$rndkey0:$rndkey1; $code.=<<___; vaesdec $rndkey,@out[0],@out[0] cmp 32+4*$i(%rsp),$one ___ $code.=<<___ if ($i); mov 64+8*$i(%rsp),$offset ___ $code.=<<___; vaesdec $rndkey,@out[1],@out[1] prefetcht0 31(@ptr[$i]) # prefetch input vaesdec $rndkey,@out[2],@out[2] ___ $code.=<<___ if 
($i>1); prefetcht0 15(@ptr[$i-2]) # prefetch output ___ $code.=<<___; vaesdec $rndkey,@out[3],@out[3] lea (@ptr[$i],$offset),$offset cmovge %rsp,@ptr[$i] # cancel input vaesdec $rndkey,@out[4],@out[4] cmovg %rsp,$offset # sink output vaesdec $rndkey,@out[5],@out[5] sub @ptr[$i],$offset vaesdec $rndkey,@out[6],@out[6] vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input mov $offset,64+8*$i(%rsp) vaesdec $rndkey,@out[7],@out[7] vmovups `16*(3+$i)-0x78`($key),$rndkey lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output ___ $code.=<<___ if ($i<4); vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load ___ } $code.=<<___; vmovdqu 32(%rsp),$counters prefetcht0 15(@ptr[$i-2]) # prefetch output prefetcht0 15(@ptr[$i-1]) cmp \$11,$rounds jb .Ldec8x_tail vaesdec $rndkey1,@out[0],@out[0] vaesdec $rndkey1,@out[1],@out[1] vaesdec $rndkey1,@out[2],@out[2] vaesdec $rndkey1,@out[3],@out[3] vaesdec $rndkey1,@out[4],@out[4] vaesdec $rndkey1,@out[5],@out[5] vaesdec $rndkey1,@out[6],@out[6] vaesdec $rndkey1,@out[7],@out[7] vmovups 0xb0-0x78($key),$rndkey1 vaesdec $rndkey0,@out[0],@out[0] vaesdec $rndkey0,@out[1],@out[1] vaesdec $rndkey0,@out[2],@out[2] vaesdec $rndkey0,@out[3],@out[3] vaesdec $rndkey0,@out[4],@out[4] vaesdec $rndkey0,@out[5],@out[5] vaesdec $rndkey0,@out[6],@out[6] vaesdec $rndkey0,@out[7],@out[7] vmovups 0xc0-0x78($key),$rndkey0 je .Ldec8x_tail vaesdec $rndkey1,@out[0],@out[0] vaesdec $rndkey1,@out[1],@out[1] vaesdec $rndkey1,@out[2],@out[2] vaesdec $rndkey1,@out[3],@out[3] vaesdec $rndkey1,@out[4],@out[4] vaesdec $rndkey1,@out[5],@out[5] vaesdec $rndkey1,@out[6],@out[6] vaesdec $rndkey1,@out[7],@out[7] vmovups 0xd0-0x78($key),$rndkey1 vaesdec $rndkey0,@out[0],@out[0] vaesdec $rndkey0,@out[1],@out[1] vaesdec $rndkey0,@out[2],@out[2] vaesdec $rndkey0,@out[3],@out[3] vaesdec $rndkey0,@out[4],@out[4] vaesdec $rndkey0,@out[5],@out[5] vaesdec $rndkey0,@out[6],@out[6] vaesdec $rndkey0,@out[7],@out[7] vmovups 0xe0-0x78($key),$rndkey0 .Ldec8x_tail: vaesdec $rndkey1,@out[0],@out[0] vpxor 
$zero,$zero,$zero vaesdec $rndkey1,@out[1],@out[1] vaesdec $rndkey1,@out[2],@out[2] vpcmpgtd $zero,$counters,$zero vaesdec $rndkey1,@out[3],@out[3] vaesdec $rndkey1,@out[4],@out[4] vpaddd $counters,$zero,$zero # decrement counters vmovdqu 48(%rsp),$counters vaesdec $rndkey1,@out[5],@out[5] mov 64(%rsp),$offset # pre-load 1st offset vaesdec $rndkey1,@out[6],@out[6] vaesdec $rndkey1,@out[7],@out[7] vmovups 0x10-0x78($key),$rndkey1 vaesdeclast $rndkey0,@out[0],@out[0] vmovdqa $zero,32(%rsp) # update counters vpxor $zero,$zero,$zero vaesdeclast $rndkey0,@out[1],@out[1] vpxor 0x00($offload),@out[0],@out[0] # xor with IV vaesdeclast $rndkey0,@out[2],@out[2] vpxor 0x10($offload),@out[1],@out[1] vpcmpgtd $zero,$counters,$zero vaesdeclast $rndkey0,@out[3],@out[3] vpxor 0x20($offload),@out[2],@out[2] vaesdeclast $rndkey0,@out[4],@out[4] vpxor 0x30($offload),@out[3],@out[3] vpaddd $zero,$counters,$counters # decrement counters vmovdqu -0x78($key),$zero # 0-round vaesdeclast $rndkey0,@out[5],@out[5] vpxor 0x40($offload),@out[4],@out[4] vaesdeclast $rndkey0,@out[6],@out[6] vpxor 0x50($offload),@out[5],@out[5] vmovdqa $counters,48(%rsp) # update counters vaesdeclast $rndkey0,@out[7],@out[7] vpxor 0x60($offload),@out[6],@out[6] vmovups 0x20-0x78($key),$rndkey0 vmovups @out[0],-16(@ptr[0]) # write output sub $offset,@ptr[0] # switch to input vmovdqu 128+0(%rsp),@out[0] vpxor 0x70($offload),@out[7],@out[7] vmovups @out[1],-16(@ptr[1]) sub `64+1*8`(%rsp),@ptr[1] vmovdqu @out[0],0x00($offload) vpxor $zero,@out[0],@out[0] vmovdqu 128+16(%rsp),@out[1] vmovups @out[2],-16(@ptr[2]) sub `64+2*8`(%rsp),@ptr[2] vmovdqu @out[1],0x10($offload) vpxor $zero,@out[1],@out[1] vmovdqu 128+32(%rsp),@out[2] vmovups @out[3],-16(@ptr[3]) sub `64+3*8`(%rsp),@ptr[3] vmovdqu @out[2],0x20($offload) vpxor $zero,@out[2],@out[2] vmovdqu 128+48(%rsp),@out[3] vmovups @out[4],-16(@ptr[4]) sub `64+4*8`(%rsp),@ptr[4] vmovdqu @out[3],0x30($offload) vpxor $zero,@out[3],@out[3] vmovdqu @inp[0],0x40($offload) vpxor 
@inp[0],$zero,@out[4] vmovups @out[5],-16(@ptr[5]) sub `64+5*8`(%rsp),@ptr[5] vmovdqu @inp[1],0x50($offload) vpxor @inp[1],$zero,@out[5] vmovups @out[6],-16(@ptr[6]) sub `64+6*8`(%rsp),@ptr[6] vmovdqu @inp[2],0x60($offload) vpxor @inp[2],$zero,@out[6] vmovups @out[7],-16(@ptr[7]) sub `64+7*8`(%rsp),@ptr[7] vmovdqu @inp[3],0x70($offload) vpxor @inp[3],$zero,@out[7] xor \$128,$offload dec $num jnz .Loop_dec8x mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 #mov 24(%rsp),$num #lea `40*8`($inp),$inp #dec $num #jnz .Ldec8x_loop_grande .Ldec8x_done: vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Ldec8x_epilogue: ret .cfi_endproc .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx ___ }}} if ($win64) { # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->Rip<.Lprologue jb .Lin_prologue mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] 
lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lin_prologue mov 16(%rax),%rax # pull saved stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 lea -56-10*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler .section .pdata .align 4 .rva .LSEH_begin_aesni_multi_cbc_encrypt .rva .LSEH_end_aesni_multi_cbc_encrypt .rva .LSEH_info_aesni_multi_cbc_encrypt .rva .LSEH_begin_aesni_multi_cbc_decrypt .rva .LSEH_end_aesni_multi_cbc_decrypt .rva .LSEH_info_aesni_multi_cbc_decrypt ___ $code.=<<___ if ($avx); .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx .rva .LSEH_end_aesni_multi_cbc_encrypt_avx .rva 
.LSEH_info_aesni_multi_cbc_encrypt_avx
	.rva	.LSEH_begin_aesni_multi_cbc_decrypt_avx
	.rva	.LSEH_end_aesni_multi_cbc_decrypt_avx
	.rva	.LSEH_info_aesni_multi_cbc_decrypt_avx
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_multi_cbc_encrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc4x_body,.Lenc4x_epilogue		# HandlerData[]
.LSEH_info_aesni_multi_cbc_decrypt:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec4x_body,.Ldec4x_epilogue		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_multi_cbc_encrypt_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lenc8x_body,.Lenc8x_epilogue		# HandlerData[]
.LSEH_info_aesni_multi_cbc_decrypt_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Ldec8x_body,.Ldec8x_epilogue		# HandlerData[]
___
}
####################################################################

# rex(\@opcode, $dst, $src)
#
# Appends a REX prefix byte to the referenced opcode array when either
# register number is 8 or higher: bit 0x04 is set for $dst (the register
# placed in the ModR/M reg field by the callers) and bit 0x01 for $src
# (the ModR/M r/m field).  Nothing is appended when both are below 8.
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    push @$opcode,$rex|0x40	if ($rex);
}

# aesni($line)
#
# Hand-encodes one AES-NI instruction as a ".byte" directive, so the
# output does not depend on the assembler knowing the mnemonic.  Three
# operand forms are recognized:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} off(%rsp),%xmmD
#
# Returns the ".byte\t..." string for a recognized instruction.  Anything
# else, including a recognized operand form with a mnemonic that has no
# table entry, is returned unchanged.  The caller substitutes the return
# value into the generated code via s///e, so returning undef for an
# unknown mnemonic would silently erase the instruction; passing it
# through lets the assembler accept or reject it instead.
sub aesni {
    my $line=shift;
    my @opcode=(0x66);		# every encoding below starts with 0x66

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	# Copy the captures first: the /^0/ test below resets $1..$4.
	my ($imm,$src,$dst)=($2,$3,$4);

	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x3a,0xdf;
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	push @opcode,$imm=~/^0/?oct($imm):$imm;	# 0x.. or 0.. goes through oct()
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$src,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesimc" => 0xdb,
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);

	# Unknown mnemonic: leave the instruction for the assembler.
	return $line if (!defined($opcodelet{$mnemonic}));

	rex(\@opcode,$dst,$src);
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my ($mnemonic,$off,$dst)=($1,$2,$3);
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);

	# Unknown mnemonic: leave the instruction for the assembler.
	return $line if (!defined($opcodelet{$mnemonic}));

	push @opcode,0x44 if ($dst>=8);		# prefix needed for %xmm8..15
	push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
	push @opcode,0x44|(($dst&7)<<3),0x24;	# ModR/M
	# A single displacement byte follows; the offset is masked to 8 bits.
	push @opcode,($off=~/^0/?oct($off):$off)&0xff;
	return ".byte\t".join(',',@opcode);
    }

    # Not an operand form handled here.
    return $line;
}

# Evaluate every `...` expression embedded in the generated code, then
# replace each AES-NI instruction with its hand-encoded form.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
close STDOUT or die "error closing STDOUT: $!";
Index: head/crypto/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl
===================================================================
--- head/crypto/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl (revision 364821)
+++ head/crypto/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl (revision 364822)
@@ -1,2146 +1,2146 @@
#! /usr/bin/env perl
# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# June 2011
#
# This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
# in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibit *very* low instruction-level
# parallelism, interleaving it with another algorithm would allow to
# utilize processor resources better and achieve better performance.
# SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
# AESNI code is weaved into it.
Below are performance numbers in # cycles per processed byte, less is better, for standalone AESNI-CBC # encrypt, sum of the latter and standalone SHA1, and "stitched" # subroutine: # # AES-128-CBC +SHA1 stitch gain # Westmere 3.77[+5.3] 9.07 6.55 +38% # Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%) # Ivy Bridge 5.05[+4.6] 9.65 5.54 +74% # Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%) # Skylake 2.63[+3.5(4.1)] 6.17(6.69) 4.23(4.44) +46%(+51%) # Bulldozer 5.77[+6.0] 11.72 6.37 +84% # Ryzen(**) 2.71[+1.93] 4.64 2.74 +69% # Goldmont(**) 3.82[+1.70] 5.52 4.20 +31% # # AES-192-CBC # Westmere 4.51 9.81 6.80 +44% # Sandy Bridge 6.05 11.06(12.15) 6.11(7.19) +81%(+69%) # Ivy Bridge 6.05 10.65 6.07 +75% # Haswell 5.29 8.86(9.44) 5.32(5.32) +67%(+77%) # Bulldozer 6.89 12.84 6.96 +84% # # AES-256-CBC # Westmere 5.25 10.55 7.21 +46% # Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%) # Ivy Bridge 7.05 11.65 7.12 +64% # Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%) # Skylake 3.62 7.16(7.68) 4.56(4.76) +57%(+61%) # Bulldozer 8.00 13.95 8.25 +69% # Ryzen(**) 3.71 5.64 3.72 +52% # Goldmont(**) 5.35 7.05 5.76 +22% # # (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for # background information. Above numbers in parentheses are SSSE3 # results collected on AVX-capable CPU, i.e. apply on OSes that # don't support AVX. # (**) SHAEXT results. # # Needless to mention that it makes no sense to implement "stitched" # *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1 # fully utilize parallelism, so stitching would not give any gain # anyway. Well, there might be some, e.g. because of better cache # locality... 
For reference, here are performance results for # standalone AESNI-CBC decrypt: # # AES-128-CBC AES-192-CBC AES-256-CBC # Westmere 1.25 1.50 1.75 # Sandy Bridge 0.74 0.91 1.09 # Ivy Bridge 0.74 0.90 1.11 # Haswell 0.63 0.76 0.88 # Bulldozer 0.70 0.85 0.99 # And indeed: # # AES-256-CBC +SHA1 stitch gain # Westmere 1.75 7.20 6.68 +7.8% # Sandy Bridge 1.09 6.09(7.22) 5.82(6.95) +4.6%(+3.9%) # Ivy Bridge 1.11 5.70 5.45 +4.6% # Haswell 0.88 4.45(5.00) 4.39(4.69) +1.4%(*)(+6.6%) # Bulldozer 0.99 6.95 5.95 +17%(**) # # (*) Tiny improvement coefficient on Haswell is because we compare # AVX1 stitch to sum with AVX2 SHA1. # (**) Execution is fully dominated by integer code sequence and # SIMD still hardly shows [in single-process benchmark;-] $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/ && $1>=2.19); $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && $1>=2.09); $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./ && $1>=10); -$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/ && $2>=3.0); +$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/ && $2>=3.0); $shaext=1; ### set to zero if compiling for 1.0.1 $stitched_decrypt=0; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void aesni_cbc_sha1_enc(const void *inp, # void *out, # size_t length, # const AES_KEY *key, # unsigned char *iv, # 
SHA_CTX *ctx, # const void *in0); $code.=<<___; .text .extern OPENSSL_ia32cap_P .globl aesni_cbc_sha1_enc .type aesni_cbc_sha1_enc,\@abi-omnipotent .align 32 aesni_cbc_sha1_enc: .cfi_startproc # caller should check for SSSE3 and AES-NI bits mov OPENSSL_ia32cap_P+0(%rip),%r10d mov OPENSSL_ia32cap_P+4(%rip),%r11 ___ $code.=<<___ if ($shaext); bt \$61,%r11 # check SHA bit jc aesni_cbc_sha1_enc_shaext ___ $code.=<<___ if ($avx); and \$`1<<28`,%r11d # mask AVX bit and \$`1<<30`,%r10d # mask "Intel CPU" bit or %r11d,%r10d cmp \$`1<<28|1<<30`,%r10d je aesni_cbc_sha1_enc_avx ___ $code.=<<___; jmp aesni_cbc_sha1_enc_ssse3 ret .cfi_endproc .size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc ___ my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); my $Xi=4; my @X=map("%xmm$_",(4..7,0..3)); my @Tx=map("%xmm$_",(8..10)); my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization my @T=("%esi","%edi"); my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0; my $K_XX_XX="%r11"; my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13)); # for enc my @rndkey=("%xmm14","%xmm15"); # for enc my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec if (1) { # reassign for Atom Silvermont # The goal is to minimize amount of instructions with more than # 3 prefix bytes. Or in more practical terms to keep AES-NI *and* # SSSE3 instructions to upper half of the register bank. 
@X=map("%xmm$_",(8..11,4..7));
@Tx=map("%xmm$_",(12,13,3));
($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
@rndkey=("%xmm0","%xmm1");
}

# Thunk [simplified] for 32-bit style perlasm: any call to an undefined
# sub, e.g. &rol($a,5), lands here and appends one instruction line to
# $code.  The sub name becomes the mnemonic and the operands are emitted
# in the reverse of the call order.  If the final call argument is purely
# numeric it is written with a '$' prefix.
sub AUTOLOAD()
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;	# drop the package qualifier

    my @operands = reverse @_;
    my $leading  = shift @operands;		# last call argument goes first
    $leading = "\$$leading" if ($leading*1 eq $leading);

    $code .= "\t$mnemonic\t".join(',',$leading,@operands)."\n";
}

# Code references, so the body_* instruction strings can invoke the
# rotates as &$_rol / &$_ror.
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

$code.=<<___;
.type	aesni_cbc_sha1_enc_ssse3,\@function,6
.align	32
aesni_cbc_sha1_enc_ssse3:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_ssse3		# debugging artefact
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset	`104+($win64?10*16:0)`
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
	movaps	%xmm15,96+144(%rsp)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	$in0,%r12			# reassign arguments
	mov	$out,%r13
	mov	$len,%r14
	lea	112($key),%r15			# size optimization
	movdqu	($ivp),$iv			# load IV
	mov	$ivp,88(%rsp)			# save $ivp
___
($in0,$out,$len,$key)=map("%r$_",(12..15));	# reassign arguments
my $rounds="${ivp}d";
$code.=<<___;
	shl	\$6,$len
	sub	$in0,$out
	mov	240-112($key),$rounds
	add	$inp,$len		# end of input

	lea	K_XX_XX(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@Tx[2]	# pbswap mask
	movdqa	0($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu
48($inp),@X[-1&7] pshufb @Tx[2],@X[-4&7] # byte swap pshufb @Tx[2],@X[-3&7] pshufb @Tx[2],@X[-2&7] add \$64,$inp paddd @Tx[1],@X[-4&7] # add K_00_19 pshufb @Tx[2],@X[-1&7] paddd @Tx[1],@X[-3&7] paddd @Tx[1],@X[-2&7] movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU psubd @Tx[1],@X[-4&7] # restore X[] movdqa @X[-3&7],16(%rsp) psubd @Tx[1],@X[-3&7] movdqa @X[-2&7],32(%rsp) psubd @Tx[1],@X[-2&7] movups -112($key),$rndkey0 # $key[0] movups 16-112($key),$rndkey[0] # forward reference jmp .Loop_ssse3 ___ my $aesenc=sub { use integer; my ($n,$k)=($r/10,$r%10); if ($k==0) { $code.=<<___; movups `16*$n`($in0),$in # load input xorps $rndkey0,$in ___ $code.=<<___ if ($n); movups $iv,`16*($n-1)`($out,$in0) # write output ___ $code.=<<___; xorps $in,$iv movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv ___ } elsif ($k==9) { $sn++; $code.=<<___; cmp \$11,$rounds jb .Laesenclast$sn movups `32+16*($k+0)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv movups `32+16*($k+1)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv je .Laesenclast$sn movups `32+16*($k+2)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv movups `32+16*($k+3)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv .Laesenclast$sn: aesenclast $rndkey[0],$iv movups 16-112($key),$rndkey[1] # forward reference ___ } else { $code.=<<___; movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv ___ } $r++; unshift(@rndkey,pop(@rndkey)); }; sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); # ror &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); eval(shift(@insns)); &movdqa (@Tx[0],@X[-1&7]); &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@Tx[0],4); # "X[-3]", 3 dwords eval(shift(@insns)); 
eval(shift(@insns)); &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); # ror &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[2],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[0],31); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); &movdqa (@Tx[1],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[2],30); eval(shift(@insns)); eval(shift(@insns)); # ror &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pslld (@Tx[1],2); &pxor (@X[0],@Tx[2]); eval(shift(@insns)); &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79 foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] push(@Tx,shift(@Tx)); } sub Xupdate_ssse3_32_79() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)) if ($Xi==8); &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" eval(shift(@insns)) if ($Xi==8); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)) if (@insns[1] =~ /_ror/); eval(shift(@insns)) if (@insns[0] =~ /_ror/); &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8); eval(shift(@insns)); 
eval(shift(@insns)); # rol &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); eval(shift(@insns)); if ($Xi%5) { &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... } else { # ... or load next one &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); } eval(shift(@insns)); # ror &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)) if (@insns[0] =~ /_ror/); &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)); # body_20_39 &pslld (@X[0],2); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[0],30); eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &por (@X[0],@Tx[0]); # "X[0]"<<<=2 eval(shift(@insns)); eval(shift(@insns)); # body_20_39 eval(shift(@insns)) if (@insns[1] =~ /_rol/); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0]) eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] push(@Tx,shift(@Tx)); } sub Xuplast_ssse3_80() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &cmp ($inp,$len); &je (shift); unshift(@Tx,pop(@Tx)); &movdqa (@Tx[2],"64($K_XX_XX)"); # pbswap mask &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 
&movdqu (@X[-4&7],"0($inp)"); # load input &movdqu (@X[-3&7],"16($inp)"); &movdqu (@X[-2&7],"32($inp)"); &movdqu (@X[-1&7],"48($inp)"); &pshufb (@X[-4&7],@Tx[2]); # byte swap &add ($inp,64); $Xi=0; } sub Xloop_ssse3() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufb (@X[($Xi-3)&7],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@Tx[1]); foreach (@insns) { eval; } $Xi++; } sub Xtail_ssse3() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); foreach (@insns) { eval; } } my @body_00_19 = ( '($a,$b,$c,$d,$e)=@V;'. '&$_ror ($b,$j?7:2);', # $b>>>2 '&xor (@T[0],$d);', '&mov (@T[1],$a);', # $b for next round '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer '&xor ($b,$c);', # $c^$d for next round '&$_rol ($a,5);', '&add ($e,@T[0]);', '&and (@T[1],$b);', # ($b&($c^$d)) for next round '&xor ($b,$c);', # restore $b '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); sub body_00_19 () { # ((c^d)&b)^d # on start @T[0]=(c^d)&b return &body_20_39() if ($rx==19); $rx++; use integer; my ($k,$n); my @r=@body_00_19; $n = scalar(@r); $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); $jj++; return @r; } my @body_20_39 = ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer '&xor (@T[0],$d) if($j==19);'. 
'&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c) '&mov (@T[1],$a);', # $b for next round '&$_rol ($a,5);', '&add ($e,@T[0]);', '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round '&$_ror ($b,7);', # $b>>>2 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); sub body_20_39 () { # b^d^c # on entry @T[0]=b^d return &body_40_59() if ($rx==39); $rx++; use integer; my ($k,$n); my @r=@body_20_39; $n = scalar(@r); $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=20); $jj++; return @r; } my @body_40_59 = ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d) '&xor ($c,$d) if ($j>=40);', # restore $c '&$_ror ($b,7);', # $b>>>2 '&mov (@T[1],$a);', # $b for next round '&xor (@T[0],$c);', '&$_rol ($a,5);', '&add ($e,@T[0]);', '&xor (@T[1],$c) if ($j==59);'. '&xor (@T[1],$b) if ($j< 59);', # b^c for next round '&xor ($b,$c) if ($j< 59);', # c^d for next round '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); sub body_40_59 () { # ((b^c)&(c^d))^c # on entry @T[0]=(b^c), (c^=d) $rx++; use integer; my ($k,$n); my @r=@body_40_59; $n = scalar(@r); $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=40); $jj++; return @r; } $code.=<<___; .align 32 .Loop_ssse3: ___ &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_32_79(\&body_00_19); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_20_39); 
&Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done" $saved_j=$j; @saved_V=@V; $saved_r=$r; @saved_rndkey=@rndkey; &Xloop_ssse3(\&body_20_39); &Xloop_ssse3(\&body_20_39); &Xloop_ssse3(\&body_20_39); $code.=<<___; movups $iv,48($out,$in0) # write output lea 64($in0),$in0 add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C add 12($ctx),$D mov $A,0($ctx) add 16($ctx),$E mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) mov $C,@T[1] mov $D,12($ctx) xor $D,@T[1] mov $E,16($ctx) and @T[1],@T[0] jmp .Loop_ssse3 .Ldone_ssse3: ___ $jj=$j=$saved_j; @V=@saved_V; $r=$saved_r; @rndkey=@saved_rndkey; &Xtail_ssse3(\&body_20_39); &Xtail_ssse3(\&body_20_39); &Xtail_ssse3(\&body_20_39); $code.=<<___; movups $iv,48($out,$in0) # write output mov 88(%rsp),$ivp # restore $ivp add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C mov $A,0($ctx) add 12($ctx),$D mov @T[0],4($ctx) add 16($ctx),$E mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) movups $iv,($ivp) # write IV ___ $code.=<<___ if ($win64); movaps 96+0(%rsp),%xmm6 movaps 96+16(%rsp),%xmm7 movaps 96+32(%rsp),%xmm8 movaps 96+48(%rsp),%xmm9 movaps 96+64(%rsp),%xmm10 movaps 96+80(%rsp),%xmm11 movaps 96+96(%rsp),%xmm12 movaps 96+112(%rsp),%xmm13 movaps 96+128(%rsp),%xmm14 movaps 96+144(%rsp),%xmm15 ___ $code.=<<___; lea `104+($win64?10*16:0)`(%rsp),%rsi .cfi_def_cfa %rsi,56 mov 0(%rsi),%r15 .cfi_restore %r15 mov 8(%rsi),%r14 .cfi_restore %r14 mov 16(%rsi),%r13 .cfi_restore %r13 mov 24(%rsi),%r12 .cfi_restore %r12 mov 32(%rsi),%rbp .cfi_restore %rbp mov 40(%rsi),%rbx .cfi_restore %rbx lea 48(%rsi),%rsp .cfi_def_cfa %rsp,8 .Lepilogue_ssse3: ret .cfi_endproc .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 ___ if ($stitched_decrypt) {{{ # reset ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); $j=$jj=$r=$rx=0; $Xi=4; # reassign for Atom Silvermont (see above) ($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4)); 
@X=map("%xmm$_",(8..13,6,7)); @Tx=map("%xmm$_",(14,15,5)); my @aes256_dec = ( '&movdqu($inout0,"0x00($in0)");', '&movdqu($inout1,"0x10($in0)"); &pxor ($inout0,$rndkey0);', '&movdqu($inout2,"0x20($in0)"); &pxor ($inout1,$rndkey0);', '&movdqu($inout3,"0x30($in0)"); &pxor ($inout2,$rndkey0);', '&pxor ($inout3,$rndkey0); &movups ($rndkey0,"16-112($key)");', '&movaps("64(%rsp)",@X[2]);', # save IV, originally @X[3] undef,undef ); for ($i=0;$i<13;$i++) { push (@aes256_dec,( '&aesdec ($inout0,$rndkey0);', '&aesdec ($inout1,$rndkey0);', '&aesdec ($inout2,$rndkey0);', '&aesdec ($inout3,$rndkey0); &movups($rndkey0,"'.(16*($i+2)-112).'($key)");' )); push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11); push (@aes256_dec,(undef,undef)) if ($i==5); } push(@aes256_dec,( '&aesdeclast ($inout0,$rndkey0); &movups (@X[0],"0x00($in0)");', '&aesdeclast ($inout1,$rndkey0); &movups (@X[1],"0x10($in0)");', '&aesdeclast ($inout2,$rndkey0); &movups (@X[2],"0x20($in0)");', '&aesdeclast ($inout3,$rndkey0); &movups (@X[3],"0x30($in0)");', '&xorps ($inout0,"64(%rsp)"); &movdqu ($rndkey0,"-112($key)");', '&xorps ($inout1,@X[0]); &movups ("0x00($out,$in0)",$inout0);', '&xorps ($inout2,@X[1]); &movups ("0x10($out,$in0)",$inout1);', '&xorps ($inout3,@X[2]); &movups ("0x20($out,$in0)",$inout2);', '&movups ("0x30($out,$in0)",$inout3);' )); sub body_00_19_dec () { # ((c^d)&b)^d # on start @T[0]=(c^d)&b return &body_20_39_dec() if ($rx==19); my @r=@body_00_19; unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); $rx++; return @r; } sub body_20_39_dec () { # b^d^c # on entry @T[0]=b^d return &body_40_59_dec() if ($rx==39); my @r=@body_20_39; unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); $rx++; return @r; } sub body_40_59_dec () { # ((b^c)&(c^d))^c # on entry @T[0]=(b^c), (c^=d) my @r=@body_40_59; unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); $rx++; return @r; } $code.=<<___; .globl aesni256_cbc_sha1_dec .type aesni256_cbc_sha1_dec,\@abi-omnipotent .align 32 
aesni256_cbc_sha1_dec: .cfi_startproc # caller should check for SSSE3 and AES-NI bits mov OPENSSL_ia32cap_P+0(%rip),%r10d mov OPENSSL_ia32cap_P+4(%rip),%r11d ___ $code.=<<___ if ($avx); and \$`1<<28`,%r11d # mask AVX bit and \$`1<<30`,%r10d # mask "Intel CPU" bit or %r11d,%r10d cmp \$`1<<28|1<<30`,%r10d je aesni256_cbc_sha1_dec_avx ___ $code.=<<___; jmp aesni256_cbc_sha1_dec_ssse3 ret .cfi_endproc .size aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec .type aesni256_cbc_sha1_dec_ssse3,\@function,6 .align 32 aesni256_cbc_sha1_dec_ssse3: .cfi_startproc mov `($win64?56:8)`(%rsp),$inp # load 7th argument push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea `-104-($win64?10*16:0)`(%rsp),%rsp .cfi_adjust_cfa_offset `104+($win64?10*16:0)` ___ $code.=<<___ if ($win64); movaps %xmm6,96+0(%rsp) movaps %xmm7,96+16(%rsp) movaps %xmm8,96+32(%rsp) movaps %xmm9,96+48(%rsp) movaps %xmm10,96+64(%rsp) movaps %xmm11,96+80(%rsp) movaps %xmm12,96+96(%rsp) movaps %xmm13,96+112(%rsp) movaps %xmm14,96+128(%rsp) movaps %xmm15,96+144(%rsp) .Lprologue_dec_ssse3: ___ $code.=<<___; mov $in0,%r12 # reassign arguments mov $out,%r13 mov $len,%r14 lea 112($key),%r15 # size optimization movdqu ($ivp),@X[3] # load IV #mov $ivp,88(%rsp) # save $ivp ___ ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments $code.=<<___; shl \$6,$len sub $in0,$out add $inp,$len # end of input lea K_XX_XX(%rip),$K_XX_XX mov 0($ctx),$A # load context mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E mov $C,@T[1] xor $D,@T[1] and @T[1],@T[0] movdqa 64($K_XX_XX),@Tx[2] # pbswap mask movdqa 0($K_XX_XX),@Tx[1] # K_00_19 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] movdqu 16($inp),@X[-3&7] movdqu 32($inp),@X[-2&7] movdqu 48($inp),@X[-1&7] pshufb @Tx[2],@X[-4&7] # byte swap add \$64,$inp pshufb @Tx[2],@X[-3&7] pshufb @Tx[2],@X[-2&7] pshufb @Tx[2],@X[-1&7] paddd @Tx[1],@X[-4&7] 
# add K_00_19 paddd @Tx[1],@X[-3&7] paddd @Tx[1],@X[-2&7] movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU psubd @Tx[1],@X[-4&7] # restore X[] movdqa @X[-3&7],16(%rsp) psubd @Tx[1],@X[-3&7] movdqa @X[-2&7],32(%rsp) psubd @Tx[1],@X[-2&7] movdqu -112($key),$rndkey0 # $key[0] jmp .Loop_dec_ssse3 .align 32 .Loop_dec_ssse3: ___ &Xupdate_ssse3_16_31(\&body_00_19_dec); &Xupdate_ssse3_16_31(\&body_00_19_dec); &Xupdate_ssse3_16_31(\&body_00_19_dec); &Xupdate_ssse3_16_31(\&body_00_19_dec); &Xupdate_ssse3_32_79(\&body_00_19_dec); &Xupdate_ssse3_32_79(\&body_20_39_dec); &Xupdate_ssse3_32_79(\&body_20_39_dec); &Xupdate_ssse3_32_79(\&body_20_39_dec); &Xupdate_ssse3_32_79(\&body_20_39_dec); &Xupdate_ssse3_32_79(\&body_20_39_dec); &Xupdate_ssse3_32_79(\&body_40_59_dec); &Xupdate_ssse3_32_79(\&body_40_59_dec); &Xupdate_ssse3_32_79(\&body_40_59_dec); &Xupdate_ssse3_32_79(\&body_40_59_dec); &Xupdate_ssse3_32_79(\&body_40_59_dec); &Xupdate_ssse3_32_79(\&body_20_39_dec); &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done" $saved_j=$j; @saved_V=@V; $saved_rx=$rx; &Xloop_ssse3(\&body_20_39_dec); &Xloop_ssse3(\&body_20_39_dec); &Xloop_ssse3(\&body_20_39_dec); eval(@aes256_dec[-1]); # last store $code.=<<___; lea 64($in0),$in0 add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C add 12($ctx),$D mov $A,0($ctx) add 16($ctx),$E mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) mov $C,@T[1] mov $D,12($ctx) xor $D,@T[1] mov $E,16($ctx) and @T[1],@T[0] jmp .Loop_dec_ssse3 .Ldone_dec_ssse3: ___ $jj=$j=$saved_j; @V=@saved_V; $rx=$saved_rx; &Xtail_ssse3(\&body_20_39_dec); &Xtail_ssse3(\&body_20_39_dec); &Xtail_ssse3(\&body_20_39_dec); eval(@aes256_dec[-1]); # last store $code.=<<___; add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C mov $A,0($ctx) add 12($ctx),$D mov @T[0],4($ctx) add 16($ctx),$E mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) movups @X[3],($ivp) # write IV ___ $code.=<<___ if ($win64); movaps 96+0(%rsp),%xmm6 movaps 
96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
# Epilogue of aesni256_cbc_sha1_dec_ssse3: %rsi is pointed just above the
# local frame (104 bytes, plus 10*16 bytes of saved XMM registers on
# Win64), the six callee-saved GPRs are reloaded from there and %rsp is
# restored.  The CFA is re-based with .cfi_def_cfa, the same directive
# the other epilogues in this file use; the former ".cfi_cfa_def"
# spelling is not a CFI directive.
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_dec_ssse3:
	ret
.cfi_endproc
.size	aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
___
						}}}
# Reset the generator's round/instruction counters before the next
# code path is emitted.
$j=$jj=$r=$rx=0;

if ($avx) {
# Register assignment for the AVX code path.  Everything is declared
# with "my", so these names are lexical to the if ($avx) block.
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
my @rndkey=("%xmm14","%xmm15");
my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));	# for dec
my $Kx=@Tx[2];

# Rotates are emitted as shld/shrd with the register passed as both the
# first and the second operand.
my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

# Prologue of aesni_cbc_sha1_enc_avx: load the 7th argument from the
# stack, save the callee-saved GPRs and allocate the local frame.
$code.=<<___;
.type	aesni_cbc_sha1_enc_avx,\@function,6
.align	32
aesni_cbc_sha1_enc_avx:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
	#shr	\$6,$len			# debugging artefact
	#jz	.Lepilogue_avx			# debugging artefact
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	lea	`-104-($win64?10*16:0)`(%rsp),%rsp
.cfi_adjust_cfa_offset	`104+($win64?10*16:0)`
	#mov	$in0,$inp			# debugging artefact
	#lea	64(%rsp),$ctx			# debugging artefact
___
# Win64 only: spill the non-volatile XMM registers into the frame.
$code.=<<___ if ($win64);
	movaps	%xmm6,96+0(%rsp)
	movaps	%xmm7,96+16(%rsp)
	movaps	%xmm8,96+32(%rsp)
	movaps	%xmm9,96+48(%rsp)
	movaps	%xmm10,96+64(%rsp)
	movaps	%xmm11,96+80(%rsp)
	movaps	%xmm12,96+96(%rsp)
	movaps	%xmm13,96+112(%rsp)
	movaps	%xmm14,96+128(%rsp)
movaps %xmm15,96+144(%rsp) .Lprologue_avx: ___ $code.=<<___; vzeroall mov $in0,%r12 # reassign arguments mov $out,%r13 mov $len,%r14 lea 112($key),%r15 # size optimization vmovdqu ($ivp),$iv # load IV mov $ivp,88(%rsp) # save $ivp ___ ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments my $rounds="${ivp}d"; $code.=<<___; shl \$6,$len sub $in0,$out mov 240-112($key),$rounds add $inp,$len # end of input lea K_XX_XX(%rip),$K_XX_XX mov 0($ctx),$A # load context mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E mov $C,@T[1] xor $D,@T[1] and @T[1],@T[0] vmovdqa 64($K_XX_XX),@X[2] # pbswap mask vmovdqa 0($K_XX_XX),$Kx # K_00_19 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] vmovdqu 16($inp),@X[-3&7] vmovdqu 32($inp),@X[-2&7] vmovdqu 48($inp),@X[-1&7] vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap add \$64,$inp vpshufb @X[2],@X[-3&7],@X[-3&7] vpshufb @X[2],@X[-2&7],@X[-2&7] vpshufb @X[2],@X[-1&7],@X[-1&7] vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 vpaddd $Kx,@X[-3&7],@X[1] vpaddd $Kx,@X[-2&7],@X[2] vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU vmovdqa @X[1],16(%rsp) vmovdqa @X[2],32(%rsp) vmovups -112($key),$rndkey[1] # $key[0] vmovups 16-112($key),$rndkey[0] # forward reference jmp .Loop_avx ___ my $aesenc=sub { use integer; my ($n,$k)=($r/10,$r%10); if ($k==0) { $code.=<<___; vmovdqu `16*$n`($in0),$in # load input vpxor $rndkey[1],$in,$in ___ $code.=<<___ if ($n); vmovups $iv,`16*($n-1)`($out,$in0) # write output ___ $code.=<<___; vpxor $in,$iv,$iv vaesenc $rndkey[0],$iv,$iv vmovups `32+16*$k-112`($key),$rndkey[1] ___ } elsif ($k==9) { $sn++; $code.=<<___; cmp \$11,$rounds jb .Lvaesenclast$sn vaesenc $rndkey[0],$iv,$iv vmovups `32+16*($k+0)-112`($key),$rndkey[1] vaesenc $rndkey[1],$iv,$iv vmovups `32+16*($k+1)-112`($key),$rndkey[0] je .Lvaesenclast$sn vaesenc $rndkey[0],$iv,$iv vmovups `32+16*($k+2)-112`($key),$rndkey[1] vaesenc $rndkey[1],$iv,$iv vmovups `32+16*($k+3)-112`($key),$rndkey[0] .Lvaesenclast$sn: vaesenclast 
$rndkey[0],$iv,$iv vmovups -112($key),$rndkey[0] vmovups 16-112($key),$rndkey[1] # forward reference ___ } else { $code.=<<___; vaesenc $rndkey[0],$iv,$iv vmovups `32+16*$k-112`($key),$rndkey[1] ___ } $r++; unshift(@rndkey,pop(@rndkey)); }; sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@Tx[1],$Kx,@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &vpsrld (@Tx[0],@X[0],31); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslldq(@Tx[1],@X[0],12); # "X[0]"<<96, extract one dword &vpaddd (@X[0],@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 &vpsrld (@Tx[0],@Tx[1],30); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslld (@Tx[1],@Tx[1],2); &vpxor (@X[0],@X[0],@Tx[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 eval(shift(@insns)); eval(shift(@insns)); &vmovdqa ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); foreach (@insns) { eval; } # remaining 
instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xupdate_avx_32_79() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions my ($a,$b,$c,$d,$e); &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); &vpaddd (@Tx[1],$Kx,@X[-1&7]); &vmovdqa ($Kx,eval(16*($Xi/5))."($K_XX_XX)") if ($Xi%5==0); eval(shift(@insns)); # ror eval(shift(@insns)); &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &vpsrld (@Tx[0],@X[0],30); &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); &vpslld (@X[0],@X[0],2); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xuplast_avx_80() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); &vpaddd (@Tx[1],$Kx,@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &cmp ($inp,$len); &je (shift); 
&vmovdqa(@Tx[1],"64($K_XX_XX)"); # pbswap mask &vmovdqa($Kx,"0($K_XX_XX)"); # K_00_19 &vmovdqu(@X[-4&7],"0($inp)"); # load input &vmovdqu(@X[-3&7],"16($inp)"); &vmovdqu(@X[-2&7],"32($inp)"); &vmovdqu(@X[-1&7],"48($inp)"); &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]); # byte swap &add ($inp,64); $Xi=0; } sub Xloop_avx() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); foreach (@insns) { eval; } $Xi++; } sub Xtail_avx() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); foreach (@insns) { eval; } } $code.=<<___; .align 32 .Loop_avx: ___ &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_32_79(\&body_00_19); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_20_39); &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done" $saved_j=$j; @saved_V=@V; $saved_r=$r; @saved_rndkey=@rndkey; &Xloop_avx(\&body_20_39); &Xloop_avx(\&body_20_39); &Xloop_avx(\&body_20_39); $code.=<<___; vmovups $iv,48($out,$in0) # write output lea 64($in0),$in0 add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C add 12($ctx),$D mov $A,0($ctx) add 16($ctx),$E mov @T[0],4($ctx) mov @T[0],$B # magic seed mov 
$C,8($ctx) mov $C,@T[1] mov $D,12($ctx) xor $D,@T[1] mov $E,16($ctx) and @T[1],@T[0] jmp .Loop_avx .Ldone_avx: ___ $jj=$j=$saved_j; @V=@saved_V; $r=$saved_r; @rndkey=@saved_rndkey; &Xtail_avx(\&body_20_39); &Xtail_avx(\&body_20_39); &Xtail_avx(\&body_20_39); $code.=<<___; vmovups $iv,48($out,$in0) # write output mov 88(%rsp),$ivp # restore $ivp add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C mov $A,0($ctx) add 12($ctx),$D mov @T[0],4($ctx) add 16($ctx),$E mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) vmovups $iv,($ivp) # write IV vzeroall ___ $code.=<<___ if ($win64); movaps 96+0(%rsp),%xmm6 movaps 96+16(%rsp),%xmm7 movaps 96+32(%rsp),%xmm8 movaps 96+48(%rsp),%xmm9 movaps 96+64(%rsp),%xmm10 movaps 96+80(%rsp),%xmm11 movaps 96+96(%rsp),%xmm12 movaps 96+112(%rsp),%xmm13 movaps 96+128(%rsp),%xmm14 movaps 96+144(%rsp),%xmm15 ___ $code.=<<___; lea `104+($win64?10*16:0)`(%rsp),%rsi .cfi_def_cfa %rsi,56 mov 0(%rsi),%r15 .cfi_restore %r15 mov 8(%rsi),%r14 .cfi_restore %r14 mov 16(%rsi),%r13 .cfi_restore %r13 mov 24(%rsi),%r12 .cfi_restore %r12 mov 32(%rsi),%rbp .cfi_restore %rbp mov 40(%rsi),%rbx .cfi_restore %rbx lea 48(%rsi),%rsp .cfi_def_cfa %rsp,8 .Lepilogue_avx: ret .cfi_endproc .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx ___ if ($stitched_decrypt) {{{ # reset ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); $j=$jj=$r=$rx=0; $Xi=4; @aes256_dec = ( '&vpxor ($inout0,$rndkey0,"0x00($in0)");', '&vpxor ($inout1,$rndkey0,"0x10($in0)");', '&vpxor ($inout2,$rndkey0,"0x20($in0)");', '&vpxor ($inout3,$rndkey0,"0x30($in0)");', '&vmovups($rndkey0,"16-112($key)");', '&vmovups("64(%rsp)",@X[2]);', # save IV, originally @X[3] undef,undef ); for ($i=0;$i<13;$i++) { push (@aes256_dec,( '&vaesdec ($inout0,$inout0,$rndkey0);', '&vaesdec ($inout1,$inout1,$rndkey0);', '&vaesdec ($inout2,$inout2,$rndkey0);', '&vaesdec ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");' )); push 
(@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11); push (@aes256_dec,(undef,undef)) if ($i==5); } push(@aes256_dec,( '&vaesdeclast ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");', '&vaesdeclast ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");', '&vaesdeclast ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");', '&vaesdeclast ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");', '&vxorps ($inout0,$inout0,"64(%rsp)"); &vmovdqu($rndkey0,"-112($key)");', '&vxorps ($inout1,$inout1,@X[0]); &vmovups("0x00($out,$in0)",$inout0);', '&vxorps ($inout2,$inout2,@X[1]); &vmovups("0x10($out,$in0)",$inout1);', '&vxorps ($inout3,$inout3,@X[2]); &vmovups("0x20($out,$in0)",$inout2);', '&vmovups ("0x30($out,$in0)",$inout3);' )); $code.=<<___; .type aesni256_cbc_sha1_dec_avx,\@function,6 .align 32 aesni256_cbc_sha1_dec_avx: .cfi_startproc mov `($win64?56:8)`(%rsp),$inp # load 7th argument push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea `-104-($win64?10*16:0)`(%rsp),%rsp .cfi_adjust_cfa_offset `104+($win64?10*16:0)` ___ $code.=<<___ if ($win64); movaps %xmm6,96+0(%rsp) movaps %xmm7,96+16(%rsp) movaps %xmm8,96+32(%rsp) movaps %xmm9,96+48(%rsp) movaps %xmm10,96+64(%rsp) movaps %xmm11,96+80(%rsp) movaps %xmm12,96+96(%rsp) movaps %xmm13,96+112(%rsp) movaps %xmm14,96+128(%rsp) movaps %xmm15,96+144(%rsp) .Lprologue_dec_avx: ___ $code.=<<___; vzeroall mov $in0,%r12 # reassign arguments mov $out,%r13 mov $len,%r14 lea 112($key),%r15 # size optimization vmovdqu ($ivp),@X[3] # load IV ___ ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments $code.=<<___; shl \$6,$len sub $in0,$out add $inp,$len # end of input lea K_XX_XX(%rip),$K_XX_XX mov 0($ctx),$A # load context mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E mov $C,@T[1] xor $D,@T[1] and @T[1],@T[0] vmovdqa 64($K_XX_XX),@X[2] # pbswap mask vmovdqa 
0($K_XX_XX),$Kx # K_00_19 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] vmovdqu 16($inp),@X[-3&7] vmovdqu 32($inp),@X[-2&7] vmovdqu 48($inp),@X[-1&7] vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap add \$64,$inp vpshufb @X[2],@X[-3&7],@X[-3&7] vpshufb @X[2],@X[-2&7],@X[-2&7] vpshufb @X[2],@X[-1&7],@X[-1&7] vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 vpaddd $Kx,@X[-3&7],@X[1] vpaddd $Kx,@X[-2&7],@X[2] vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU vmovdqa @X[1],16(%rsp) vmovdqa @X[2],32(%rsp) vmovups -112($key),$rndkey0 # $key[0] jmp .Loop_dec_avx .align 32 .Loop_dec_avx: ___ &Xupdate_avx_16_31(\&body_00_19_dec); &Xupdate_avx_16_31(\&body_00_19_dec); &Xupdate_avx_16_31(\&body_00_19_dec); &Xupdate_avx_16_31(\&body_00_19_dec); &Xupdate_avx_32_79(\&body_00_19_dec); &Xupdate_avx_32_79(\&body_20_39_dec); &Xupdate_avx_32_79(\&body_20_39_dec); &Xupdate_avx_32_79(\&body_20_39_dec); &Xupdate_avx_32_79(\&body_20_39_dec); &Xupdate_avx_32_79(\&body_20_39_dec); &Xupdate_avx_32_79(\&body_40_59_dec); &Xupdate_avx_32_79(\&body_40_59_dec); &Xupdate_avx_32_79(\&body_40_59_dec); &Xupdate_avx_32_79(\&body_40_59_dec); &Xupdate_avx_32_79(\&body_40_59_dec); &Xupdate_avx_32_79(\&body_20_39_dec); &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx"); # can jump to "done" $saved_j=$j; @saved_V=@V; $saved_rx=$rx; &Xloop_avx(\&body_20_39_dec); &Xloop_avx(\&body_20_39_dec); &Xloop_avx(\&body_20_39_dec); eval(@aes256_dec[-1]); # last store $code.=<<___; lea 64($in0),$in0 add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C add 12($ctx),$D mov $A,0($ctx) add 16($ctx),$E mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) mov $C,@T[1] mov $D,12($ctx) xor $D,@T[1] mov $E,16($ctx) and @T[1],@T[0] jmp .Loop_dec_avx .Ldone_dec_avx: ___ $jj=$j=$saved_j; @V=@saved_V; $rx=$saved_rx; &Xtail_avx(\&body_20_39_dec); &Xtail_avx(\&body_20_39_dec); &Xtail_avx(\&body_20_39_dec); eval(@aes256_dec[-1]); # last store $code.=<<___; add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C mov 
$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
	vmovups	@X[3],($ivp)			# write IV
	vzeroall
___
# Win64 only: reload the non-volatile XMM registers from the frame.
$code.=<<___ if ($win64);
	movaps	96+0(%rsp),%xmm6
	movaps	96+16(%rsp),%xmm7
	movaps	96+32(%rsp),%xmm8
	movaps	96+48(%rsp),%xmm9
	movaps	96+64(%rsp),%xmm10
	movaps	96+80(%rsp),%xmm11
	movaps	96+96(%rsp),%xmm12
	movaps	96+112(%rsp),%xmm13
	movaps	96+128(%rsp),%xmm14
	movaps	96+144(%rsp),%xmm15
___
# Epilogue of aesni256_cbc_sha1_dec_avx: reload the callee-saved GPRs
# from above the local frame and restore %rsp.
$code.=<<___;
	lea	`104+($win64?10*16:0)`(%rsp),%rsi
.cfi_def_cfa	%rsi,56
	mov	0(%rsi),%r15
.cfi_restore	%r15
	mov	8(%rsi),%r14
.cfi_restore	%r14
	mov	16(%rsi),%r13
.cfi_restore	%r13
	mov	24(%rsi),%r12
.cfi_restore	%r12
	mov	32(%rsi),%rbp
.cfi_restore	%rbp
	mov	40(%rsi),%rbx
.cfi_restore	%rbx
	lea	48(%rsi),%rsp
.cfi_def_cfa	%rsp,8
.Lepilogue_dec_avx:
	ret
.cfi_endproc
.size	aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
___
						}}}
}
# Constant pool shared by all code paths: the four SHA-1 round constants
# (each replicated across four lanes), the byte-swap mask at offset 64,
# a second 16-byte shuffle mask at offset 0x50, and the module
# identification string.  The "@" in the string is escaped because the
# heredoc interpolates.
$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0

.asciz	"AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
						if ($shaext) {{{
# SHA-extension code path: arguments are used in their raw ABI
# registers, and the AES/SHA working registers are re-assigned.
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$rounds="%r11d";

($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
@rndkey=("%xmm0","%xmm1");
$r=0;	# presumably restarts the AES round counter used by $aesenc -- confirm

my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12));
my @MSG=map("%xmm$_",(3..6));

$code.=<<___;
.type	aesni_cbc_sha1_enc_shaext,\@function,6
.align	32
aesni_cbc_sha1_enc_shaext:
.cfi_startproc
	mov	`($win64?56:8)`(%rsp),$inp	# load 7th argument
___
# Win64 only: make room for, and save, the non-volatile XMM registers.
# The saves are addressed relative to %rax.
$code.=<<___ if ($win64);
	lea	`-8-10*16`(%rsp),%rsp
	movaps	%xmm6,-8-10*16(%rax)
	movaps	%xmm7,-8-9*16(%rax)
	movaps	%xmm8,-8-8*16(%rax)
	movaps	%xmm9,-8-7*16(%rax)
	movaps
%xmm10,-8-6*16(%rax) movaps %xmm11,-8-5*16(%rax) movaps %xmm12,-8-4*16(%rax) movaps %xmm13,-8-3*16(%rax) movaps %xmm14,-8-2*16(%rax) movaps %xmm15,-8-1*16(%rax) .Lprologue_shaext: ___ $code.=<<___; movdqu ($ctx),$ABCD movd 16($ctx),$E movdqa K_XX_XX+0x50(%rip),$BSWAP # byte-n-word swap mov 240($key),$rounds sub $in0,$out movups ($key),$rndkey0 # $key[0] movups ($ivp),$iv # load IV movups 16($key),$rndkey[0] # forward reference lea 112($key),$key # size optimization pshufd \$0b00011011,$ABCD,$ABCD # flip word order pshufd \$0b00011011,$E,$E # flip word order jmp .Loop_shaext .align 16 .Loop_shaext: ___ &$aesenc(); $code.=<<___; movdqu ($inp),@MSG[0] movdqa $E,$E_SAVE # offload $E pshufb $BSWAP,@MSG[0] movdqu 0x10($inp),@MSG[1] movdqa $ABCD,$ABCD_SAVE # offload $ABCD ___ &$aesenc(); $code.=<<___; pshufb $BSWAP,@MSG[1] paddd @MSG[0],$E movdqu 0x20($inp),@MSG[2] lea 0x40($inp),$inp pxor $E_SAVE,@MSG[0] # black magic ___ &$aesenc(); $code.=<<___; pxor $E_SAVE,@MSG[0] # black magic movdqa $ABCD,$E_ pshufb $BSWAP,@MSG[2] sha1rnds4 \$0,$E,$ABCD # 0-3 sha1nexte @MSG[1],$E_ ___ &$aesenc(); $code.=<<___; sha1msg1 @MSG[1],@MSG[0] movdqu -0x10($inp),@MSG[3] movdqa $ABCD,$E pshufb $BSWAP,@MSG[3] ___ &$aesenc(); $code.=<<___; sha1rnds4 \$0,$E_,$ABCD # 4-7 sha1nexte @MSG[2],$E pxor @MSG[2],@MSG[0] sha1msg1 @MSG[2],@MSG[1] ___ &$aesenc(); for($i=2;$i<20-4;$i++) { $code.=<<___; movdqa $ABCD,$E_ sha1rnds4 \$`int($i/5)`,$E,$ABCD # 8-11 sha1nexte @MSG[3],$E_ ___ &$aesenc(); $code.=<<___; sha1msg2 @MSG[3],@MSG[0] pxor @MSG[3],@MSG[1] sha1msg1 @MSG[3],@MSG[2] ___ ($E,$E_)=($E_,$E); push(@MSG,shift(@MSG)); &$aesenc(); } $code.=<<___; movdqa $ABCD,$E_ sha1rnds4 \$3,$E,$ABCD # 64-67 sha1nexte @MSG[3],$E_ sha1msg2 @MSG[3],@MSG[0] pxor @MSG[3],@MSG[1] ___ &$aesenc(); $code.=<<___; movdqa $ABCD,$E sha1rnds4 \$3,$E_,$ABCD # 68-71 sha1nexte @MSG[0],$E sha1msg2 @MSG[0],@MSG[1] ___ &$aesenc(); $code.=<<___; movdqa $E_SAVE,@MSG[0] movdqa $ABCD,$E_ sha1rnds4 \$3,$E,$ABCD # 72-75 sha1nexte 
@MSG[1],$E_
___
	&$aesenc();
$code.=<<___;
	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$MSG[0],$E
___
	while($r<40)	{ &$aesenc(); }		# remaining aesenc's
# End of one 64-byte iteration: fold in the saved state, store the last
# ciphertext block and loop while blocks remain; afterwards write back
# the IV and the hash state.
$code.=<<___;
	dec		$len

	paddd		$ABCD_SAVE,$ABCD
	movups		$iv,48($out,$in0)	# write output
	lea		64($in0),$in0
	jnz		.Loop_shaext

	pshufd		\$0b00011011,$ABCD,$ABCD
	pshufd		\$0b00011011,$E,$E
	movups		$iv,($ivp)		# write IV
	movdqu		$ABCD,($ctx)
	movd		$E,16($ctx)
___
# Win64 only: reload the non-volatile XMM registers and restore %rsp.
$code.=<<___ if ($win64);
	movaps	-8-10*16(%rax),%xmm6
	movaps	-8-9*16(%rax),%xmm7
	movaps	-8-8*16(%rax),%xmm8
	movaps	-8-7*16(%rax),%xmm9
	movaps	-8-6*16(%rax),%xmm10
	movaps	-8-5*16(%rax),%xmm11
	movaps	-8-4*16(%rax),%xmm12
	movaps	-8-3*16(%rax),%xmm13
	movaps	-8-2*16(%rax),%xmm14
	movaps	-8-1*16(%rax),%xmm15
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext
___
						}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 structured-exception handler shared by the functions listed in
# the .pdata/.xdata records.  HandlerData[] carries the RVAs of the
# function's prologue and epilogue labels:
#   - Rip below the prologue label: nothing has been saved yet, so the
#     handler goes straight to the common tail with %rax=context->Rax;
#   - Rip at or past the epilogue label: registers are already restored,
#     so the common tail runs with %rax=context->Rsp;
#   - otherwise the ten saved XMM registers (20 quadwords) and the six
#     callee-saved GPRs are copied from the frame into the CONTEXT.
# The common tail stores Rsp/Rsi/Rdi into the CONTEXT, copies it to
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail
___
# The SHAEXT function has a different frame layout (XMM save area at the
# stack pointer, 168 bytes in total, no pushed GPRs), so faults at or
# past its entry point are unwound separately.
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha1_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lseh_no_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lcommon_seh_tail
.Lseh_no_shaext:
___
$code.=<<___;
	lea	96(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	`104+10*16`(%rax),%rax	# adjust stack pointer

	mov	0(%rax),%r15
	mov	8(%rax),%r14
	mov	16(%rax),%r13
	mov	24(%rax),%r12
	mov	32(%rax),%rbp
	mov	40(%rax),%rbx
	lea	48(%rax),%rax
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_end_aesni_cbc_sha1_enc_ssse3
	.rva	.LSEH_info_aesni_cbc_sha1_enc_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_end_aesni_cbc_sha1_enc_avx
	.rva	.LSEH_info_aesni_cbc_sha1_enc_avx
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_end_aesni_cbc_sha1_enc_shaext
	.rva	.LSEH_info_aesni_cbc_sha1_enc_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_aesni_cbc_sha1_enc_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_aesni_cbc_sha1_enc_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_aesni_cbc_sha1_enc_shaext:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
# rex(\@opcode, $dst, $src)
#
# Prepends a REX prefix to the opcode array when either register number
# is 8 or higher: bit 0x04 for $dst, bit 0x01 for $src.  The array is
# left untouched when both registers are below 8.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @$opcode,$rex|0x40	if($rex);
}

# sha1rnds4("\$imm,%xmmS,%xmmD")
#
# Turns the operand string of a sha1rnds4 instruction into a ".byte"
# sequence (0f 3a cc, ModR/M, immediate), presumably so the output
# assembles with tools that lack the mnemonic.  An immediate with a
# leading 0 is converted with oct(), which also handles 0x/0b forms.
# Operands that do not match are passed through as plain text.
sub sha1rnds4 {
    if ($_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x3a,0xcc);
	rex(\@opcode,$3,$2);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$_[0];
    }
}

# sha1op38($instr, "%xmmS,%xmmD")
#
# Same idea for the 0f 38 group: sha1nexte, sha1msg1 and sha1msg2 with
# two XMM register operands become ".byte" sequences; anything else is
# returned as "$instr\t$operands".
sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
  		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}

# aesni($line)
#
# Encodes a two-XMM-register aesenc/aesenclast/aesdec/aesdeclast as a
# ".byte" sequence; the 0x66 prefix is added after rex() so it ends up
# in front of any REX byte.  Returns undef for an aes* mnemonic that is
# not in the table, and the line unchanged when the pattern does not
# match.
sub aesni {
  my $line=shift;
  my @opcode=(0x0f,0x38);

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %opcodelet = (
		"aesenc" => 0xdc,	"aesenclast" => 0xdd,
		"aesdec" => 0xde,	"aesdeclast" => 0xdf
	);
	return undef if (!defined($opcodelet{$1}));
	rex(\@opcode,$3,$2);
	push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	unshift @opcode,0x66;
	return ".byte\t".join(',',@opcode);
    }
    return $line;
}

# Post-process the generated text line by line: evaluate `...`
# arithmetic, then hand SHA and AES-NI instructions to the encoders
# above (at most one of the three substitutions applies per line).
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo		or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo		or
	s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo;

	print $_,"\n";
}
# NOTE: the die message below continues on the following source line.
close STDOUT or die "error closing
STDOUT: $!"; Index: head/crypto/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl =================================================================== --- head/crypto/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl (revision 364822) @@ -1,1802 +1,1802 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # January 2013 # # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled # in http://download.intel.com/design/intarch/papers/323686.pdf, is # that since AESNI-CBC encrypt exhibits *very* low instruction-level # parallelism, interleaving it with another algorithm would make # better use of processor resources and achieve better performance. # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and # AESNI code is weaved into it. As SHA256 dominates execution time, # stitch performance does not depend on AES key length.
Below are # performance numbers in cycles per processed byte, less is better, # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched # subroutine: # # AES-128/-192/-256+SHA256 this(**) gain # Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% # Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% # Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% # Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40% # Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% # Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54% # Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60% # # (*) there are XOP, AVX1 and AVX2 code paths, meaning that # Westmere is omitted from loop, this is because gain was not # estimated high enough to justify the effort; # (**) these are EVP-free results, results obtained with 'speed # -evp aes-256-cbc-hmac-sha256' will vary by percent or two; # (***) these are SHAEXT results; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=12); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } $shaext=$avx; ### set to zero if compiling for 1.0.1 $avx=1 if 
(!$shaext && $avx); open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $func="aesni_cbc_sha256_enc"; $TABLE="K256"; $SZ=4; @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", "%r8d","%r9d","%r10d","%r11d"); ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi"); @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @sigma1=(17,19,10); $rounds=64; ######################################################################## # void aesni_cbc_sha256_enc(const void *inp, # void *out, # size_t length, # const AES_KEY *key, # unsigned char *iv, # SHA256_CTX *ctx, # const void *in0); ($inp, $out, $len, $key, $ivp, $ctx, $in0) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); $Tbl="%rbp"; $_inp="16*$SZ+0*8(%rsp)"; $_out="16*$SZ+1*8(%rsp)"; $_end="16*$SZ+2*8(%rsp)"; $_key="16*$SZ+3*8(%rsp)"; $_ivp="16*$SZ+4*8(%rsp)"; $_ctx="16*$SZ+5*8(%rsp)"; $_in0="16*$SZ+6*8(%rsp)"; $_rsp="`16*$SZ+7*8`(%rsp)"; $framesz=16*$SZ+8*8; $code=<<___; .text .extern OPENSSL_ia32cap_P .globl $func .type $func,\@abi-omnipotent .align 16 $func: .cfi_startproc ___ if ($avx) { $code.=<<___; lea OPENSSL_ia32cap_P(%rip),%r11 mov \$1,%eax cmp \$0,`$win64?"%rcx":"%rdi"` je .Lprobe mov 0(%r11),%eax mov 4(%r11),%r10 ___ $code.=<<___ if ($shaext); bt \$61,%r10 # check for SHA jc ${func}_shaext ___ $code.=<<___; mov %r10,%r11 shr \$32,%r11 test \$`1<<11`,%r10d # check for XOP jnz ${func}_xop ___ $code.=<<___ if ($avx>1); and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 cmp \$`1<<8|1<<5|1<<3`,%r11d je ${func}_avx2 ___ $code.=<<___; and \$`1<<28`,%r10d # check for AVX jnz ${func}_avx ud2 ___ } $code.=<<___; xor %eax,%eax cmp \$0,`$win64?"%rcx":"%rdi"` je .Lprobe ud2 .Lprobe: ret .cfi_endproc .size $func,.-$func .align 64 .type $TABLE,\@object $TABLE: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 .long 0,0,0,0, 0,0,0,0 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by " .align 64 ___ ###################################################################### # SIMD code paths # {{{ ($iv,$inout,$roundkey,$temp, $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15)); $aesni_cbc_idx=0; @aesni_cbc_block = ( ## &vmovdqu ($roundkey,"0x00-0x80($inp)");' ## &vmovdqu ($inout,($inp)); ## &mov ($_inp,$inp); '&vpxor ($inout,$inout,$roundkey);'. 
' &vmovdqu ($roundkey,"0x10-0x80($inp)");', '&vpxor ($inout,$inout,$iv);', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x20-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x30-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x40-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x50-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x60-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x70-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x80-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x90-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");', '&vaesenclast ($temp,$inout,$roundkey);'. ' &vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");', '&vpand ($iv,$temp,$mask10);'. ' &vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");', '&vaesenclast ($temp,$inout,$roundkey);'. ' &vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0xd0-0x80($inp)");', '&vpand ($temp,$temp,$mask12);'. ' &vaesenc ($inout,$inout,$roundkey);'. '&vmovdqu ($roundkey,"0xe0-0x80($inp)");', '&vpor ($iv,$iv,$temp);'. ' &vaesenclast ($temp,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x00-0x80($inp)");' ## &mov ($inp,$_inp); ## &mov ($out,$_out); ## &vpand ($temp,$temp,$mask14); ## &vpor ($iv,$iv,$temp); ## &vmovdqu ($iv,($out,$inp); ## &lea (inp,16($inp)); ); my $a4=$T1; my ($a,$b,$c,$d,$e,$f,$g,$h); sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop; $arg = "\$$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 
'&ror ($a0,$Sigma1[2]-$Sigma1[1])', '&mov ($a,$a1)', '&mov ($a4,$f)', '&xor ($a0,$e)', '&ror ($a1,$Sigma0[2]-$Sigma0[1])', '&xor ($a4,$g)', # f^g '&ror ($a0,$Sigma1[1]-$Sigma1[0])', '&xor ($a1,$a)', '&and ($a4,$e)', # (f^g)&e @aesni_cbc_block[$aesni_cbc_idx++]. '&xor ($a0,$e)', '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] '&mov ($a2,$a)', '&ror ($a1,$Sigma0[1]-$Sigma0[0])', '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g '&xor ($a2,$b)', # a^b, b^c in next round '&ror ($a0,$Sigma1[0])', # Sigma1(e) '&add ($h,$a4)', # h+=Ch(e,f,g) '&and ($a3,$a2)', # (b^c)&(a^b) '&xor ($a1,$a)', '&add ($h,$a0)', # h+=Sigma1(e) '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) '&add ($d,$h)', # d+=h '&ror ($a1,$Sigma0[0])', # Sigma0(a) '&add ($h,$a3)', # h+=Maj(a,b,c) '&mov ($a0,$d)', '&add ($a1,$h);'. # h+=Sigma0(a) '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' ); } if ($avx) {{ ###################################################################### # XOP code path # $code.=<<___; .type ${func}_xop,\@function,6 .align 64 ${func}_xop: .cfi_startproc .Lxop_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame shl \$6,$len sub $inp,$out # re-bias sub $inp,$in0 add $inp,$len # end of input #mov $inp,$_inp # saved later mov $out,$_out mov $len,$_end #mov $key,$_key # remains resident in $inp register mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 mov %rax,$_rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) movaps %xmm7,`$framesz+16*1`(%rsp) movaps %xmm8,`$framesz+16*2`(%rsp) movaps %xmm9,`$framesz+16*3`(%rsp) movaps %xmm10,`$framesz+16*4`(%rsp) movaps %xmm11,`$framesz+16*5`(%rsp) movaps %xmm12,`$framesz+16*6`(%rsp) movaps %xmm13,`$framesz+16*7`(%rsp) movaps 
%xmm14,`$framesz+16*8`(%rsp) movaps %xmm15,`$framesz+16*9`(%rsp) ___ $code.=<<___; .Lprologue_xop: vzeroall mov $inp,%r12 # borrow $a4 lea 0x80($key),$inp # size optimization, reassign lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 mov $ctx,%r15 # borrow $a2 mov $in0,%rsi # borrow $a3 vmovdqu ($ivp),$iv # load IV sub \$9,%r14 mov $SZ*0(%r15),$A mov $SZ*1(%r15),$B mov $SZ*2(%r15),$C mov $SZ*3(%r15),$D mov $SZ*4(%r15),$E mov $SZ*5(%r15),$F mov $SZ*6(%r15),$G mov $SZ*7(%r15),$H vmovdqa 0x00(%r13,%r14,8),$mask14 vmovdqa 0x10(%r13,%r14,8),$mask12 vmovdqa 0x20(%r13,%r14,8),$mask10 vmovdqu 0x00-0x80($inp),$roundkey jmp .Lloop_xop ___ if ($SZ==4) { # SHA256 my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); $code.=<<___; .align 16 .Lloop_xop: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00(%rsi,%r12),@X[0] vmovdqu 0x10(%rsi,%r12),@X[1] vmovdqu 0x20(%rsi,%r12),@X[2] vmovdqu 0x30(%rsi,%r12),@X[3] vpshufb $t3,@X[0],@X[0] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[1],@X[1] vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) mov $B,$a3 vmovdqa $t2,0x20(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lxop_00_47 .align 16 .Lxop_00_47: sub \$-16*2*$SZ,$Tbl # size optimization vmovdqu (%r12),$inout # $a4 mov %r12,$_inp # $a4 ___ sub XOP_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t0,$t0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] 
eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t1); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t2,@X[3],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrldq ($t3,$t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t2,@X[0],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslldq ($t3,$t3,8); # 22 instructions eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions 
&vmovdqa (16*$j."(%rsp)",$t2); } $aesni_cbc_idx=0; for ($i=0,$j=0; $j<4; $j++) { &XOP_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &mov ("%r12",$_inp); # borrow $a4 &vpand ($temp,$temp,$mask14); &mov ("%r15",$_out); # borrow $a2 &vpor ($iv,$iv,$temp); &vmovdqu ("(%r15,%r12)",$iv); # write output &lea ("%r12","16(%r12)"); # inp++ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lxop_00_47"); &vmovdqu ($inout,"(%r12)"); &mov ($_inp,"%r12"); $aesni_cbc_idx=0; for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } $code.=<<___; mov $_inp,%r12 # borrow $a4 mov $_out,%r13 # borrow $a0 mov $_ctx,%r15 # borrow $a2 mov $_in0,%rsi # borrow $a3 vpand $mask14,$temp,$temp mov $a1,$A vpor $temp,$iv,$iv vmovdqu $iv,(%r13,%r12) # write output lea 16(%r12),%r12 # inp++ add $SZ*0(%r15),$A add $SZ*1(%r15),$B add $SZ*2(%r15),$C add $SZ*3(%r15),$D add $SZ*4(%r15),$E add $SZ*5(%r15),$F add $SZ*6(%r15),$G add $SZ*7(%r15),$H cmp $_end,%r12 mov $A,$SZ*0(%r15) mov $B,$SZ*1(%r15) mov $C,$SZ*2(%r15) mov $D,$SZ*3(%r15) mov $E,$SZ*4(%r15) mov $F,$SZ*5(%r15) mov $G,$SZ*6(%r15) mov $H,$SZ*7(%r15) jb .Lloop_xop mov $_ivp,$ivp mov $_rsp,%rsi .cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ $code.=<<___ if ($win64); movaps `$framesz+16*0`(%rsp),%xmm6 movaps `$framesz+16*1`(%rsp),%xmm7 movaps `$framesz+16*2`(%rsp),%xmm8 movaps `$framesz+16*3`(%rsp),%xmm9 movaps `$framesz+16*4`(%rsp),%xmm10 movaps `$framesz+16*5`(%rsp),%xmm11 movaps `$framesz+16*6`(%rsp),%xmm12 movaps `$framesz+16*7`(%rsp),%xmm13 movaps `$framesz+16*8`(%rsp),%xmm14 movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_xop: ret .cfi_endproc .size ${func}_xop,.-${func}_xop ___ 
###################################################################### # AVX+shrd code path # local *ror = sub { &shrd(@_[0],@_) }; $code.=<<___; .type ${func}_avx,\@function,6 .align 64 ${func}_avx: .cfi_startproc .Lavx_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame shl \$6,$len sub $inp,$out # re-bias sub $inp,$in0 add $inp,$len # end of input #mov $inp,$_inp # saved later mov $out,$_out mov $len,$_end #mov $key,$_key # remains resident in $inp register mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 mov %rax,$_rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) movaps %xmm7,`$framesz+16*1`(%rsp) movaps %xmm8,`$framesz+16*2`(%rsp) movaps %xmm9,`$framesz+16*3`(%rsp) movaps %xmm10,`$framesz+16*4`(%rsp) movaps %xmm11,`$framesz+16*5`(%rsp) movaps %xmm12,`$framesz+16*6`(%rsp) movaps %xmm13,`$framesz+16*7`(%rsp) movaps %xmm14,`$framesz+16*8`(%rsp) movaps %xmm15,`$framesz+16*9`(%rsp) ___ $code.=<<___; .Lprologue_avx: vzeroall mov $inp,%r12 # borrow $a4 lea 0x80($key),$inp # size optimization, reassign lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 mov $ctx,%r15 # borrow $a2 mov $in0,%rsi # borrow $a3 vmovdqu ($ivp),$iv # load IV sub \$9,%r14 mov $SZ*0(%r15),$A mov $SZ*1(%r15),$B mov $SZ*2(%r15),$C mov $SZ*3(%r15),$D mov $SZ*4(%r15),$E mov $SZ*5(%r15),$F mov $SZ*6(%r15),$G mov $SZ*7(%r15),$H vmovdqa 0x00(%r13,%r14,8),$mask14 vmovdqa 0x10(%r13,%r14,8),$mask12 vmovdqa 0x20(%r13,%r14,8),$mask10 vmovdqu 0x00-0x80($inp),$roundkey ___ if ($SZ==4) { # SHA256 my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); $code.=<<___; jmp .Lloop_avx .align 16 .Lloop_avx: vmovdqa 
$TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00(%rsi,%r12),@X[0] vmovdqu 0x10(%rsi,%r12),@X[1] vmovdqu 0x20(%rsi,%r12),@X[2] vmovdqu 0x30(%rsi,%r12),@X[3] vpshufb $t3,@X[0],@X[0] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[1],@X[1] vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) mov $B,$a3 vmovdqa $t2,0x20(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lavx_00_47 .align 16 .Lavx_00_47: sub \$-16*2*$SZ,$Tbl # size optimization vmovdqu (%r12),$inout # $a4 mov %r12,$_inp # $a4 ___ sub Xupdate_256_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] '&vpsrld ($t2,$t0,$sigma0[0]);', '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] '&vpsrld ($t3,$t0,$sigma0[2])', '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpxor ($t0,$t3,$t2)', '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t1)', '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t2)', '&vpsrld ($t2,$t3,$sigma1[2]);', '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) '&vpsrlq ($t3,$t3,$sigma1[0]);', '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) '&vpxor ($t2,$t2,$t3);', '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15]) '&vpshufd ($t2,$t2,0b10000100)', '&vpsrldq ($t2,$t2,8)', '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] '&vpsrld ($t2,$t3,$sigma1[2])', '&vpsrlq ($t3,$t3,$sigma1[0])', '&vpxor ($t2,$t2,$t3);', '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', '&vpxor ($t2,$t2,$t3)', '&vpshufd ($t2,$t2,0b11101000)', '&vpslldq ($t2,$t2,8)', '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) ); } sub AVX_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions foreach 
(Xupdate_256_AVX()) { # 29 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } $aesni_cbc_idx=0; for ($i=0,$j=0; $j<4; $j++) { &AVX_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &mov ("%r12",$_inp); # borrow $a4 &vpand ($temp,$temp,$mask14); &mov ("%r15",$_out); # borrow $a2 &vpor ($iv,$iv,$temp); &vmovdqu ("(%r15,%r12)",$iv); # write output &lea ("%r12","16(%r12)"); # inp++ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lavx_00_47"); &vmovdqu ($inout,"(%r12)"); &mov ($_inp,"%r12"); $aesni_cbc_idx=0; for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } $code.=<<___; mov $_inp,%r12 # borrow $a4 mov $_out,%r13 # borrow $a0 mov $_ctx,%r15 # borrow $a2 mov $_in0,%rsi # borrow $a3 vpand $mask14,$temp,$temp mov $a1,$A vpor $temp,$iv,$iv vmovdqu $iv,(%r13,%r12) # write output lea 16(%r12),%r12 # inp++ add $SZ*0(%r15),$A add $SZ*1(%r15),$B add $SZ*2(%r15),$C add $SZ*3(%r15),$D add $SZ*4(%r15),$E add $SZ*5(%r15),$F add $SZ*6(%r15),$G add $SZ*7(%r15),$H cmp $_end,%r12 mov $A,$SZ*0(%r15) mov $B,$SZ*1(%r15) mov $C,$SZ*2(%r15) mov $D,$SZ*3(%r15) mov $E,$SZ*4(%r15) mov $F,$SZ*5(%r15) mov $G,$SZ*6(%r15) mov $H,$SZ*7(%r15) jb .Lloop_avx mov $_ivp,$ivp mov $_rsp,%rsi .cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ $code.=<<___ if ($win64); movaps `$framesz+16*0`(%rsp),%xmm6 movaps `$framesz+16*1`(%rsp),%xmm7 movaps `$framesz+16*2`(%rsp),%xmm8 movaps `$framesz+16*3`(%rsp),%xmm9 movaps `$framesz+16*4`(%rsp),%xmm10 movaps `$framesz+16*5`(%rsp),%xmm11 movaps `$framesz+16*6`(%rsp),%xmm12 movaps `$framesz+16*7`(%rsp),%xmm13 movaps `$framesz+16*8`(%rsp),%xmm14 movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov 
-8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size ${func}_avx,.-${func}_avx ___ if ($avx>1) {{ ###################################################################### # AVX2+BMI code path # my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp my $PUSH8=8*2*$SZ; use integer; sub bodyx_00_15 () { # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f ( '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] '&and ($a4,$e)', # f&e '&rorx ($a0,$e,$Sigma1[2])', '&rorx ($a2,$e,$Sigma1[1])', '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past '&lea ($h,"($h,$a4)")', '&andn ($a4,$e,$g)', # ~e&g '&xor ($a0,$a2)', '&rorx ($a1,$e,$Sigma1[0])', '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) '&xor ($a0,$a1)', # Sigma1(e) '&mov ($a2,$a)', '&rorx ($a4,$a,$Sigma0[2])', '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) '&xor ($a2,$b)', # a^b, b^c in next round '&rorx ($a1,$a,$Sigma0[1])', '&rorx ($a0,$a,$Sigma0[0])', '&lea ($d,"($d,$h)")', # d+=h '&and ($a3,$a2)', # (b^c)&(a^b) @aesni_cbc_block[$aesni_cbc_idx++]. '&xor ($a1,$a4)', '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) '&xor ($a1,$a0)', # Sigma0(a) '&lea ($h,"($h,$a3)");'. 
# h+=Maj(a,b,c) '&mov ($a4,$e)', # copy of f in future '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' ); # and at the finish one has to $a+=$a1 } $code.=<<___; .type ${func}_avx2,\@function,6 .align 64 ${func}_avx2: .cfi_startproc .Lavx2_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp and \$-256*$SZ,%rsp # align stack frame add \$`2*$SZ*($rounds-8)`,%rsp shl \$6,$len sub $inp,$out # re-bias sub $inp,$in0 add $inp,$len # end of input #mov $inp,$_inp # saved later #mov $out,$_out # kept in $offload mov $len,$_end #mov $key,$_key # remains resident in $inp register mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 mov %rax,$_rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) movaps %xmm7,`$framesz+16*1`(%rsp) movaps %xmm8,`$framesz+16*2`(%rsp) movaps %xmm9,`$framesz+16*3`(%rsp) movaps %xmm10,`$framesz+16*4`(%rsp) movaps %xmm11,`$framesz+16*5`(%rsp) movaps %xmm12,`$framesz+16*6`(%rsp) movaps %xmm13,`$framesz+16*7`(%rsp) movaps %xmm14,`$framesz+16*8`(%rsp) movaps %xmm15,`$framesz+16*9`(%rsp) ___ $code.=<<___; .Lprologue_avx2: vzeroall mov $inp,%r13 # borrow $a0 vpinsrq \$1,$out,$offload,$offload lea 0x80($key),$inp # size optimization, reassign lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 mov $ctx,%r15 # borrow $a2 mov $in0,%rsi # borrow $a3 vmovdqu ($ivp),$iv # load IV lea -9(%r14),%r14 vmovdqa 0x00(%r12,%r14,8),$mask14 vmovdqa 0x10(%r12,%r14,8),$mask12 vmovdqa 0x20(%r12,%r14,8),$mask10 sub \$-16*$SZ,%r13 # inp++, size optimization mov $SZ*0(%r15),$A lea (%rsi,%r13),%r12 # borrow $a0 mov $SZ*1(%r15),$B cmp $len,%r13 # $_end mov $SZ*2(%r15),$C cmove %rsp,%r12 # next block or random data mov 
$SZ*3(%r15),$D mov $SZ*4(%r15),$E mov $SZ*5(%r15),$F mov $SZ*6(%r15),$G mov $SZ*7(%r15),$H vmovdqu 0x00-0x80($inp),$roundkey ___ if ($SZ==4) { # SHA256 my @X = map("%ymm$_",(0..3)); my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7)); $code.=<<___; jmp .Loop_avx2 .align 16 .Loop_avx2: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3 vinserti128 \$1,(%r12),@X[0],@X[0] vinserti128 \$1,16(%r12),@X[1],@X[1] vpshufb $t3,@X[0],@X[0] vinserti128 \$1,32(%r12),@X[2],@X[2] vpshufb $t3,@X[1],@X[1] vinserti128 \$1,48(%r12),@X[3],@X[3] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[2],@X[2] lea -16*$SZ(%r13),%r13 vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) xor $a1,$a1 vmovdqa $t1,0x20(%rsp) ___ $code.=<<___ if (!$win64); # temporarily use %rsi as frame pointer mov $_rsp,%rsi .cfi_def_cfa %rsi,8 ___ $code.=<<___; lea -$PUSH8(%rsp),%rsp ___ $code.=<<___ if (!$win64); # the frame info is at $_rsp, but the stack is moving... 
# so a second frame pointer is saved at -8(%rsp) # that is in the red zone mov %rsi,-8(%rsp) .cfi_cfa_expression %rsp-8,deref,+8 ___ $code.=<<___; mov $B,$a3 vmovdqa $t2,0x00(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x20(%rsp) mov $F,$a4 sub \$-16*2*$SZ,$Tbl # size optimization jmp .Lavx2_00_47 .align 16 .Lavx2_00_47: vmovdqu (%r13),$inout vpinsrq \$0,%r13,$offload,$offload ___ sub AVX2_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 96 instructions my $base = "+2*$PUSH8(%rsp)"; if (($j%2)==0) { &lea ("%rsp","-$PUSH8(%rsp)"); $code.=<<___ if (!$win64); .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 # copy secondary frame pointer to new location again at -8(%rsp) pushq $PUSH8-8(%rsp) .cfi_cfa_expression %rsp,deref,+8 lea 8(%rsp),%rsp .cfi_cfa_expression %rsp-8,deref,+8 ___ } foreach (Xupdate_256_AVX()) { # 29 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); } $aesni_cbc_idx=0; for ($i=0,$j=0; $j<4; $j++) { &AVX2_256_00_47($j,\&bodyx_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &vmovq ("%r13",$offload); # borrow $a0 &vpextrq ("%r15",$offload,1); # borrow $a2 &vpand ($temp,$temp,$mask14); &vpor ($iv,$iv,$temp); &vmovdqu ("(%r15,%r13)",$iv); # write output &lea ("%r13","16(%r13)"); # inp++ &lea ($Tbl,16*2*$SZ."($Tbl)"); &cmpb (($SZ-1)."($Tbl)",0); &jne (".Lavx2_00_47"); &vmovdqu ($inout,"(%r13)"); &vpinsrq ($offload,$offload,"%r13",0); $aesni_cbc_idx=0; for ($i=0; $i<16; ) { my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; foreach(bodyx_00_15()) { eval; } } } $code.=<<___; vpextrq \$1,$offload,%r12 # $_out, borrow $a4 vmovq $offload,%r13 # $_inp, borrow $a0 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 add $a1,$A lea `2*$SZ*($rounds-8)`(%rsp),$Tbl vpand $mask14,$temp,$temp vpor $temp,$iv,$iv vmovdqu $iv,(%r12,%r13) # write output lea 16(%r13),%r13 add 
$SZ*0(%r15),$A add $SZ*1(%r15),$B add $SZ*2(%r15),$C add $SZ*3(%r15),$D add $SZ*4(%r15),$E add $SZ*5(%r15),$F add $SZ*6(%r15),$G add $SZ*7(%r15),$H mov $A,$SZ*0(%r15) mov $B,$SZ*1(%r15) mov $C,$SZ*2(%r15) mov $D,$SZ*3(%r15) mov $E,$SZ*4(%r15) mov $F,$SZ*5(%r15) mov $G,$SZ*6(%r15) mov $H,$SZ*7(%r15) cmp `$PUSH8+2*8`($Tbl),%r13 # $_end je .Ldone_avx2 xor $a1,$a1 mov $B,$a3 mov $F,$a4 xor $C,$a3 # magic jmp .Lower_avx2 .align 16 .Lower_avx2: vmovdqu (%r13),$inout vpinsrq \$0,%r13,$offload,$offload ___ $aesni_cbc_idx=0; for ($i=0; $i<16; ) { my $base="+16($Tbl)"; foreach(bodyx_00_15()) { eval; } &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8); } $code.=<<___; vmovq $offload,%r13 # borrow $a0 vpextrq \$1,$offload,%r15 # borrow $a2 vpand $mask14,$temp,$temp vpor $temp,$iv,$iv lea -$PUSH8($Tbl),$Tbl vmovdqu $iv,(%r15,%r13) # write output lea 16(%r13),%r13 # inp++ cmp %rsp,$Tbl jae .Lower_avx2 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 lea 16*$SZ(%r13),%r13 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3 add $a1,$A lea `2*$SZ*($rounds-8)`(%rsp),%rsp add $SZ*0(%r15),$A add $SZ*1(%r15),$B add $SZ*2(%r15),$C add $SZ*3(%r15),$D add $SZ*4(%r15),$E add $SZ*5(%r15),$F add $SZ*6(%r15),$G lea (%rsi,%r13),%r12 add $SZ*7(%r15),$H cmp $_end,%r13 mov $A,$SZ*0(%r15) cmove %rsp,%r12 # next block or stale data mov $B,$SZ*1(%r15) mov $C,$SZ*2(%r15) mov $D,$SZ*3(%r15) mov $E,$SZ*4(%r15) mov $F,$SZ*5(%r15) mov $G,$SZ*6(%r15) mov $H,$SZ*7(%r15) jbe .Loop_avx2 lea (%rsp),$Tbl # temporarily use $Tbl as index to $_rsp # this avoids the need to save a secondary frame pointer at -8(%rsp) .cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8 .Ldone_avx2: mov 16*$SZ+4*8($Tbl),$ivp mov 16*$SZ+7*8($Tbl),%rsi .cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ $code.=<<___ if ($win64); movaps `$framesz+16*0`($Tbl),%xmm6 movaps `$framesz+16*1`($Tbl),%xmm7 movaps `$framesz+16*2`($Tbl),%xmm8 movaps `$framesz+16*3`($Tbl),%xmm9 movaps `$framesz+16*4`($Tbl),%xmm10 movaps 
`$framesz+16*5`($Tbl),%xmm11 movaps `$framesz+16*6`($Tbl),%xmm12 movaps `$framesz+16*7`($Tbl),%xmm13 movaps `$framesz+16*8`($Tbl),%xmm14 movaps `$framesz+16*9`($Tbl),%xmm15 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx2: ret .cfi_endproc .size ${func}_avx2,.-${func}_avx2 ___ }} }} {{ my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); my ($rounds,$Tbl)=("%r11d","%rbx"); my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15)); my @rndkey=("%xmm4","%xmm5"); my $r=0; my $sn=0; my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9)); my @MSG=map("%xmm$_",(10..13)); my $aesenc=sub { use integer; my ($n,$k)=($r/10,$r%10); if ($k==0) { $code.=<<___; movups `16*$n`($in0),$in # load input xorps $rndkey0,$in ___ $code.=<<___ if ($n); movups $iv,`16*($n-1)`($out,$in0) # write output ___ $code.=<<___; xorps $in,$iv movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv ___ } elsif ($k==9) { $sn++; $code.=<<___; cmp \$11,$rounds jb .Laesenclast$sn movups `32+16*($k+0)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv movups `32+16*($k+1)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv je .Laesenclast$sn movups `32+16*($k+2)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv movups `32+16*($k+3)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv .Laesenclast$sn: aesenclast $rndkey[0],$iv movups 16-112($key),$rndkey[1] # forward reference nop ___ } else { $code.=<<___; movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv ___ } $r++; unshift(@rndkey,pop(@rndkey)); }; if ($shaext) { my $Tbl="%rax"; $code.=<<___; .type ${func}_shaext,\@function,6 .align 32 ${func}_shaext: .cfi_startproc mov `($win64?56:8)`(%rsp),$inp # load 7th argument ___ $code.=<<___ if ($win64); lea `-8-10*16`(%rsp),%rsp 
movaps %xmm6,-8-10*16(%rax) movaps %xmm7,-8-9*16(%rax) movaps %xmm8,-8-8*16(%rax) movaps %xmm9,-8-7*16(%rax) movaps %xmm10,-8-6*16(%rax) movaps %xmm11,-8-5*16(%rax) movaps %xmm12,-8-4*16(%rax) movaps %xmm13,-8-3*16(%rax) movaps %xmm14,-8-2*16(%rax) movaps %xmm15,-8-1*16(%rax) .Lprologue_shaext: ___ $code.=<<___; lea K256+0x80(%rip),$Tbl movdqu ($ctx),$ABEF # DCBA movdqu 16($ctx),$CDGH # HGFE movdqa 0x200-0x80($Tbl),$TMP # byte swap mask mov 240($key),$rounds sub $in0,$out movups ($key),$rndkey0 # $key[0] movups ($ivp),$iv # load IV movups 16($key),$rndkey[0] # forward reference lea 112($key),$key # size optimization pshufd \$0x1b,$ABEF,$Wi # ABCD pshufd \$0xb1,$ABEF,$ABEF # CDAB pshufd \$0x1b,$CDGH,$CDGH # EFGH movdqa $TMP,$BSWAP # offload palignr \$8,$CDGH,$ABEF # ABEF punpcklqdq $Wi,$CDGH # CDGH jmp .Loop_shaext .align 16 .Loop_shaext: movdqu ($inp),@MSG[0] movdqu 0x10($inp),@MSG[1] movdqu 0x20($inp),@MSG[2] pshufb $TMP,@MSG[0] movdqu 0x30($inp),@MSG[3] movdqa 0*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi pshufb $TMP,@MSG[1] movdqa $CDGH,$CDGH_SAVE # offload movdqa $ABEF,$ABEF_SAVE # offload ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 0-3 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 1*32-0x80($Tbl),$Wi paddd @MSG[1],$Wi pshufb $TMP,@MSG[2] lea 0x40($inp),$inp ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 4-7 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 2*32-0x80($Tbl),$Wi paddd @MSG[2],$Wi pshufb $TMP,@MSG[3] sha256msg1 @MSG[1],@MSG[0] ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 8-11 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[3],$TMP palignr \$4,@MSG[2],$TMP paddd $TMP,@MSG[0] ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 3*32-0x80($Tbl),$Wi paddd @MSG[3],$Wi sha256msg2 @MSG[3],@MSG[0] sha256msg1 @MSG[2],@MSG[1] ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 12-15 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; movdqa @MSG[0],$TMP palignr 
\$4,@MSG[3],$TMP paddd $TMP,@MSG[1] sha256rnds2 $CDGH,$ABEF ___ for($i=4;$i<16-3;$i++) { &$aesenc() if (($r%10)==0); $code.=<<___; movdqa $i*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi sha256msg2 @MSG[0],@MSG[1] sha256msg1 @MSG[3],@MSG[2] ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 16-19... pshufd \$0x0e,$Wi,$Wi movdqa @MSG[1],$TMP palignr \$4,@MSG[0],$TMP paddd $TMP,@MSG[2] ___ &$aesenc(); &$aesenc() if ($r==19); $code.=<<___; sha256rnds2 $CDGH,$ABEF ___ push(@MSG,shift(@MSG)); } $code.=<<___; movdqa 13*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi sha256msg2 @MSG[0],@MSG[1] sha256msg1 @MSG[3],@MSG[2] ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 52-55 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[1],$TMP palignr \$4,@MSG[0],$TMP paddd $TMP,@MSG[2] ___ &$aesenc(); &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 14*32-0x80($Tbl),$Wi paddd @MSG[1],$Wi sha256msg2 @MSG[1],@MSG[2] movdqa $BSWAP,$TMP ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 56-59 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 15*32-0x80($Tbl),$Wi paddd @MSG[2],$Wi ___ &$aesenc(); &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 60-63 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF #pxor $CDGH,$rndkey0 # black magic ___ while ($r<40) { &$aesenc(); } # remaining aesenc's $code.=<<___; #xorps $CDGH,$rndkey0 # black magic paddd $CDGH_SAVE,$CDGH paddd $ABEF_SAVE,$ABEF dec $len movups $iv,48($out,$in0) # write output lea 64($in0),$in0 jnz .Loop_shaext pshufd \$0xb1,$CDGH,$CDGH # DCHG pshufd \$0x1b,$ABEF,$TMP # FEBA pshufd \$0xb1,$ABEF,$ABEF # BAFE punpckhqdq $CDGH,$ABEF # DCBA palignr \$8,$TMP,$CDGH # HGFE movups $iv,($ivp) # write IV movdqu $ABEF,($ctx) movdqu $CDGH,16($ctx) ___ $code.=<<___ if ($win64); movaps 0*16(%rsp),%xmm6 movaps 1*16(%rsp),%xmm7 movaps 2*16(%rsp),%xmm8 movaps 3*16(%rsp),%xmm9 movaps 4*16(%rsp),%xmm10 movaps 5*16(%rsp),%xmm11 movaps 6*16(%rsp),%xmm12 movaps 7*16(%rsp),%xmm13 movaps 8*16(%rsp),%xmm14 
movaps 9*16(%rsp),%xmm15 lea 8+10*16(%rsp),%rsp .Lepilogue_shaext: ___ $code.=<<___; ret .cfi_endproc .size ${func}_shaext,.-${func}_shaext ___ } }}}}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64 && $avx) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HanderlData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue ___ $code.=<<___ if ($shaext); lea aesni_cbc_sha256_enc_shaext(%rip),%r10 cmp %r10,%rbx jb .Lnot_in_shaext lea (%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq lea 168(%rax),%rax # adjust stack pointer jmp .Lin_prologue .Lnot_in_shaext: ___ $code.=<<___ if ($avx>1); lea .Lavx2_shortcut(%rip),%r10 cmp %r10,%rbx # context->RipRbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # 
cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
# rex(\@opcode, $dst, $src)
#
# Prepend a REX prefix byte to the opcode byte list when either register
# number is 8 or above: $dst>=8 sets bit 0x04, $src>=8 sets bit 0x01, and
# the prefix is emitted as 0x40 plus those bits.  Nothing is prepended when
# both numbers are below 8.  The list is modified in place through the
# reference; the return value is not meaningful.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @$opcode,$rex|0x40	if($rex);
}

{
  # Third opcode byte (after 0x0f,0x38) for each supported mnemonic.
  my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

  # sha256op38($mnemonic, $operands)
  #
  # Turn a register-to-register SHA256 instruction into an explicit
  # ".byte" sequence (presumably so the output still assembles with tools
  # that do not know these mnemonics).  When the mnemonic is not in
  # %opcodelet, or the operands are not of the form "%xmmN,%xmmM", the
  # instruction is returned as plain "mnemonic<TAB>operands" text.
  sub sha256op38 {
    my $instr = shift;
    # Copy the operand string first: this sub is called with the caller's
    # regex captures as arguments, so $_[0] aliases a capture variable
    # that should not be relied on once another match has run.
    my $args = $_[0];

    if (defined($opcodelet{$instr}) && $args =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      # Name the captures immediately; the first operand goes into the
      # ModR/M r/m field, the second into the reg field.
      my ($src,$dst)=($1,$2);
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$dst,$src);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($src&7)|(($dst&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$args;
    }
  }
}

# Evaluate every `...` expression embedded in the generated text, then
# replace each sha256* instruction with whatever sha256op38() returns.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT or die "error closing STDOUT: $!";
Index: head/crypto/openssl/crypto/bn/asm/rsaz-avx2.pl
===================================================================
--- head/crypto/openssl/crypto/bn/asm/rsaz-avx2.pl (revision 364821)
+++ head/crypto/openssl/crypto/bn/asm/rsaz-avx2.pl (revision 364822)
@@ -1,1982 +1,1982 @@
#! /usr/bin/env perl
# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
# Copyright (c) 2012, Intel Corporation. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1)
# (1) Intel Corporation, Israel Development Center, Haifa, Israel
# (2) University of Haifa, Israel
#
# References:
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular
#     Exponentiation, Using Advanced Vector Instructions Architectures",
#     F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369,
#     pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012
# [2] S. Gueron: "Efficient Software Implementations of Modular
#     Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012).
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE
#     Proceedings of 9th International Conference on Information Technology:
#     New Generations (ITNG 2012), pp.821-823 (2012)
# [4] S. Gueron, V.
Krasnov: "[PATCH] Efficient and side channel analysis # resistant 1024-bit modular exponentiation, for optimizing RSA2048 # on AVX2 capable x86_64 platforms", # http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest # # +13% improvement over original submission by # # rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this # 2.3GHz Haswell 621 765/+23% 1113/+79% # 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63% # # (*) if system doesn't support AVX2, for reference purposes; # (**) scaled to 2.3GHz to simplify comparison; # (***) scalar AD*X code is faster than AVX2 and is preferred code # path for Broadwell; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); $addx = ($1>=2.23); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); $addx = ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); $addx = ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $avx = ($ver>=3.0) + ($ver>=3.01); $addx = ($ver>=3.03); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT = *OUT; if ($avx>1) {{{ { # void AMS_WW( my $rp="%rdi"; # BN_ULONG *rp, my $ap="%rsi"; # const BN_ULONG *ap, my $np="%rdx"; # const 
BN_ULONG *np, my $n0="%ecx"; # const BN_ULONG n0, my $rep="%r8d"; # int repeat); # The registers that hold the accumulated redundant result # The AMM works on 1024 bit operands, and redundant word size is 29 # Therefore: ceil(1024/29)/4 = 9 my $ACC0="%ymm0"; my $ACC1="%ymm1"; my $ACC2="%ymm2"; my $ACC3="%ymm3"; my $ACC4="%ymm4"; my $ACC5="%ymm5"; my $ACC6="%ymm6"; my $ACC7="%ymm7"; my $ACC8="%ymm8"; my $ACC9="%ymm9"; # Registers that hold the broadcasted words of bp, currently used my $B1="%ymm10"; my $B2="%ymm11"; # Registers that hold the broadcasted words of Y, currently used my $Y1="%ymm12"; my $Y2="%ymm13"; # Helper registers my $TEMP1="%ymm14"; my $AND_MASK="%ymm15"; # alu registers that hold the first words of the ACC my $r0="%r9"; my $r1="%r10"; my $r2="%r11"; my $r3="%r12"; my $i="%r14d"; # loop counter my $tmp = "%r15"; my $FrameSize=32*18+32*8; # place for A^2 and 2*A my $aap=$r0; my $tp0="%rbx"; my $tp1=$r3; my $tpa=$tmp; $np="%r13"; # reassigned argument $code.=<<___; .text .globl rsaz_1024_sqr_avx2 .type rsaz_1024_sqr_avx2,\@function,5 .align 64 rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2 .cfi_startproc lea (%rsp), %rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 vzeroupper ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp vmovaps %xmm6,-0xd8(%rax) vmovaps %xmm7,-0xc8(%rax) vmovaps %xmm8,-0xb8(%rax) vmovaps %xmm9,-0xa8(%rax) vmovaps %xmm10,-0x98(%rax) vmovaps %xmm11,-0x88(%rax) vmovaps %xmm12,-0x78(%rax) vmovaps %xmm13,-0x68(%rax) vmovaps %xmm14,-0x58(%rax) vmovaps %xmm15,-0x48(%rax) .Lsqr_1024_body: ___ $code.=<<___; mov %rax,%rbp .cfi_def_cfa_register %rbp mov %rdx, $np # reassigned argument sub \$$FrameSize, %rsp mov $np, $tmp sub \$-128, $rp # size optimization sub \$-128, $ap sub \$-128, $np and \$4095, $tmp # see if $np crosses page add \$32*10, $tmp shr \$12, $tmp vpxor $ACC9,$ACC9,$ACC9 
jz .Lsqr_1024_no_n_copy # unaligned 256-bit load that crosses page boundary can # cause >2x performance degradation here, so if $np does # cross page boundary, copy it to stack and make sure stack # frame doesn't... sub \$32*10,%rsp vmovdqu 32*0-128($np), $ACC0 and \$-2048, %rsp vmovdqu 32*1-128($np), $ACC1 vmovdqu 32*2-128($np), $ACC2 vmovdqu 32*3-128($np), $ACC3 vmovdqu 32*4-128($np), $ACC4 vmovdqu 32*5-128($np), $ACC5 vmovdqu 32*6-128($np), $ACC6 vmovdqu 32*7-128($np), $ACC7 vmovdqu 32*8-128($np), $ACC8 lea $FrameSize+128(%rsp),$np vmovdqu $ACC0, 32*0-128($np) vmovdqu $ACC1, 32*1-128($np) vmovdqu $ACC2, 32*2-128($np) vmovdqu $ACC3, 32*3-128($np) vmovdqu $ACC4, 32*4-128($np) vmovdqu $ACC5, 32*5-128($np) vmovdqu $ACC6, 32*6-128($np) vmovdqu $ACC7, 32*7-128($np) vmovdqu $ACC8, 32*8-128($np) vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero .Lsqr_1024_no_n_copy: and \$-1024, %rsp vmovdqu 32*1-128($ap), $ACC1 vmovdqu 32*2-128($ap), $ACC2 vmovdqu 32*3-128($ap), $ACC3 vmovdqu 32*4-128($ap), $ACC4 vmovdqu 32*5-128($ap), $ACC5 vmovdqu 32*6-128($ap), $ACC6 vmovdqu 32*7-128($ap), $ACC7 vmovdqu 32*8-128($ap), $ACC8 lea 192(%rsp), $tp0 # 64+128=192 vmovdqu .Land_mask(%rip), $AND_MASK jmp .LOOP_GRANDE_SQR_1024 .align 32 .LOOP_GRANDE_SQR_1024: lea 32*18+128(%rsp), $aap # size optimization lea 448(%rsp), $tp1 # 64+128+256=448 # the squaring is performed as described in Variant B of # "Speeding up Big-Number Squaring", so start by calculating # the A*2=A+A vector vpaddq $ACC1, $ACC1, $ACC1 vpbroadcastq 32*0-128($ap), $B1 vpaddq $ACC2, $ACC2, $ACC2 vmovdqa $ACC1, 32*0-128($aap) vpaddq $ACC3, $ACC3, $ACC3 vmovdqa $ACC2, 32*1-128($aap) vpaddq $ACC4, $ACC4, $ACC4 vmovdqa $ACC3, 32*2-128($aap) vpaddq $ACC5, $ACC5, $ACC5 vmovdqa $ACC4, 32*3-128($aap) vpaddq $ACC6, $ACC6, $ACC6 vmovdqa $ACC5, 32*4-128($aap) vpaddq $ACC7, $ACC7, $ACC7 vmovdqa $ACC6, 32*5-128($aap) vpaddq $ACC8, $ACC8, $ACC8 vmovdqa $ACC7, 32*6-128($aap) vpxor $ACC9, $ACC9, $ACC9 vmovdqa $ACC8, 32*7-128($aap) vpmuludq 
32*0-128($ap), $B1, $ACC0 vpbroadcastq 32*1-128($ap), $B2 vmovdqu $ACC9, 32*9-192($tp0) # zero upper half vpmuludq $B1, $ACC1, $ACC1 vmovdqu $ACC9, 32*10-448($tp1) vpmuludq $B1, $ACC2, $ACC2 vmovdqu $ACC9, 32*11-448($tp1) vpmuludq $B1, $ACC3, $ACC3 vmovdqu $ACC9, 32*12-448($tp1) vpmuludq $B1, $ACC4, $ACC4 vmovdqu $ACC9, 32*13-448($tp1) vpmuludq $B1, $ACC5, $ACC5 vmovdqu $ACC9, 32*14-448($tp1) vpmuludq $B1, $ACC6, $ACC6 vmovdqu $ACC9, 32*15-448($tp1) vpmuludq $B1, $ACC7, $ACC7 vmovdqu $ACC9, 32*16-448($tp1) vpmuludq $B1, $ACC8, $ACC8 vpbroadcastq 32*2-128($ap), $B1 vmovdqu $ACC9, 32*17-448($tp1) mov $ap, $tpa mov \$4, $i jmp .Lsqr_entry_1024 ___ $TEMP0=$Y1; $TEMP2=$Y2; $code.=<<___; .align 32 .LOOP_SQR_1024: vpbroadcastq 32*1-128($tpa), $B2 vpmuludq 32*0-128($ap), $B1, $ACC0 vpaddq 32*0-192($tp0), $ACC0, $ACC0 vpmuludq 32*0-128($aap), $B1, $ACC1 vpaddq 32*1-192($tp0), $ACC1, $ACC1 vpmuludq 32*1-128($aap), $B1, $ACC2 vpaddq 32*2-192($tp0), $ACC2, $ACC2 vpmuludq 32*2-128($aap), $B1, $ACC3 vpaddq 32*3-192($tp0), $ACC3, $ACC3 vpmuludq 32*3-128($aap), $B1, $ACC4 vpaddq 32*4-192($tp0), $ACC4, $ACC4 vpmuludq 32*4-128($aap), $B1, $ACC5 vpaddq 32*5-192($tp0), $ACC5, $ACC5 vpmuludq 32*5-128($aap), $B1, $ACC6 vpaddq 32*6-192($tp0), $ACC6, $ACC6 vpmuludq 32*6-128($aap), $B1, $ACC7 vpaddq 32*7-192($tp0), $ACC7, $ACC7 vpmuludq 32*7-128($aap), $B1, $ACC8 vpbroadcastq 32*2-128($tpa), $B1 vpaddq 32*8-192($tp0), $ACC8, $ACC8 .Lsqr_entry_1024: vmovdqu $ACC0, 32*0-192($tp0) vmovdqu $ACC1, 32*1-192($tp0) vpmuludq 32*1-128($ap), $B2, $TEMP0 vpaddq $TEMP0, $ACC2, $ACC2 vpmuludq 32*1-128($aap), $B2, $TEMP1 vpaddq $TEMP1, $ACC3, $ACC3 vpmuludq 32*2-128($aap), $B2, $TEMP2 vpaddq $TEMP2, $ACC4, $ACC4 vpmuludq 32*3-128($aap), $B2, $TEMP0 vpaddq $TEMP0, $ACC5, $ACC5 vpmuludq 32*4-128($aap), $B2, $TEMP1 vpaddq $TEMP1, $ACC6, $ACC6 vpmuludq 32*5-128($aap), $B2, $TEMP2 vpaddq $TEMP2, $ACC7, $ACC7 vpmuludq 32*6-128($aap), $B2, $TEMP0 vpaddq $TEMP0, $ACC8, $ACC8 vpmuludq 32*7-128($aap), $B2, $ACC0 
vpbroadcastq 32*3-128($tpa), $B2 vpaddq 32*9-192($tp0), $ACC0, $ACC0 vmovdqu $ACC2, 32*2-192($tp0) vmovdqu $ACC3, 32*3-192($tp0) vpmuludq 32*2-128($ap), $B1, $TEMP2 vpaddq $TEMP2, $ACC4, $ACC4 vpmuludq 32*2-128($aap), $B1, $TEMP0 vpaddq $TEMP0, $ACC5, $ACC5 vpmuludq 32*3-128($aap), $B1, $TEMP1 vpaddq $TEMP1, $ACC6, $ACC6 vpmuludq 32*4-128($aap), $B1, $TEMP2 vpaddq $TEMP2, $ACC7, $ACC7 vpmuludq 32*5-128($aap), $B1, $TEMP0 vpaddq $TEMP0, $ACC8, $ACC8 vpmuludq 32*6-128($aap), $B1, $TEMP1 vpaddq $TEMP1, $ACC0, $ACC0 vpmuludq 32*7-128($aap), $B1, $ACC1 vpbroadcastq 32*4-128($tpa), $B1 vpaddq 32*10-448($tp1), $ACC1, $ACC1 vmovdqu $ACC4, 32*4-192($tp0) vmovdqu $ACC5, 32*5-192($tp0) vpmuludq 32*3-128($ap), $B2, $TEMP0 vpaddq $TEMP0, $ACC6, $ACC6 vpmuludq 32*3-128($aap), $B2, $TEMP1 vpaddq $TEMP1, $ACC7, $ACC7 vpmuludq 32*4-128($aap), $B2, $TEMP2 vpaddq $TEMP2, $ACC8, $ACC8 vpmuludq 32*5-128($aap), $B2, $TEMP0 vpaddq $TEMP0, $ACC0, $ACC0 vpmuludq 32*6-128($aap), $B2, $TEMP1 vpaddq $TEMP1, $ACC1, $ACC1 vpmuludq 32*7-128($aap), $B2, $ACC2 vpbroadcastq 32*5-128($tpa), $B2 vpaddq 32*11-448($tp1), $ACC2, $ACC2 vmovdqu $ACC6, 32*6-192($tp0) vmovdqu $ACC7, 32*7-192($tp0) vpmuludq 32*4-128($ap), $B1, $TEMP0 vpaddq $TEMP0, $ACC8, $ACC8 vpmuludq 32*4-128($aap), $B1, $TEMP1 vpaddq $TEMP1, $ACC0, $ACC0 vpmuludq 32*5-128($aap), $B1, $TEMP2 vpaddq $TEMP2, $ACC1, $ACC1 vpmuludq 32*6-128($aap), $B1, $TEMP0 vpaddq $TEMP0, $ACC2, $ACC2 vpmuludq 32*7-128($aap), $B1, $ACC3 vpbroadcastq 32*6-128($tpa), $B1 vpaddq 32*12-448($tp1), $ACC3, $ACC3 vmovdqu $ACC8, 32*8-192($tp0) vmovdqu $ACC0, 32*9-192($tp0) lea 8($tp0), $tp0 vpmuludq 32*5-128($ap), $B2, $TEMP2 vpaddq $TEMP2, $ACC1, $ACC1 vpmuludq 32*5-128($aap), $B2, $TEMP0 vpaddq $TEMP0, $ACC2, $ACC2 vpmuludq 32*6-128($aap), $B2, $TEMP1 vpaddq $TEMP1, $ACC3, $ACC3 vpmuludq 32*7-128($aap), $B2, $ACC4 vpbroadcastq 32*7-128($tpa), $B2 vpaddq 32*13-448($tp1), $ACC4, $ACC4 vmovdqu $ACC1, 32*10-448($tp1) vmovdqu $ACC2, 32*11-448($tp1) vpmuludq 
32*6-128($ap), $B1, $TEMP0 vpaddq $TEMP0, $ACC3, $ACC3 vpmuludq 32*6-128($aap), $B1, $TEMP1 vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1 vpaddq $TEMP1, $ACC4, $ACC4 vpmuludq 32*7-128($aap), $B1, $ACC5 vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration vpaddq 32*14-448($tp1), $ACC5, $ACC5 vmovdqu $ACC3, 32*12-448($tp1) vmovdqu $ACC4, 32*13-448($tp1) lea 8($tpa), $tpa vpmuludq 32*7-128($ap), $B2, $TEMP0 vpaddq $TEMP0, $ACC5, $ACC5 vpmuludq 32*7-128($aap), $B2, $ACC6 vpaddq 32*15-448($tp1), $ACC6, $ACC6 vpmuludq 32*8-128($ap), $ACC0, $ACC7 vmovdqu $ACC5, 32*14-448($tp1) vpaddq 32*16-448($tp1), $ACC7, $ACC7 vmovdqu $ACC6, 32*15-448($tp1) vmovdqu $ACC7, 32*16-448($tp1) lea 8($tp1), $tp1 dec $i jnz .LOOP_SQR_1024 ___ $ZERO = $ACC9; $TEMP0 = $B1; $TEMP2 = $B2; $TEMP3 = $Y1; $TEMP4 = $Y2; $code.=<<___; # we need to fix indices 32-39 to avoid overflow vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) lea 192(%rsp), $tp0 # 64+128=192 vpsrlq \$29, $ACC8, $TEMP1 vpand $AND_MASK, $ACC8, $ACC8 vpsrlq \$29, $ACC1, $TEMP2 vpand $AND_MASK, $ACC1, $ACC1 vpermq \$0x93, $TEMP1, $TEMP1 vpxor $ZERO, $ZERO, $ZERO vpermq \$0x93, $TEMP2, $TEMP2 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC8, $ACC8 vpblendd \$3, $TEMP2, $ZERO, $TEMP2 vpaddq $TEMP1, $ACC1, $ACC1 vpaddq $TEMP2, $ACC2, $ACC2 vmovdqu $ACC1, 32*9-192($tp0) vmovdqu $ACC2, 32*10-192($tp0) mov (%rsp), %rax mov 8(%rsp), $r1 mov 16(%rsp), $r2 mov 24(%rsp), $r3 vmovdqu 32*1(%rsp), $ACC1 vmovdqu 32*2-192($tp0), $ACC2 vmovdqu 32*3-192($tp0), $ACC3 vmovdqu 32*4-192($tp0), $ACC4 vmovdqu 32*5-192($tp0), $ACC5 vmovdqu 32*6-192($tp0), $ACC6 vmovdqu 32*7-192($tp0), $ACC7 mov %rax, $r0 imull $n0, %eax and \$0x1fffffff, %eax vmovd %eax, $Y1 mov %rax, %rdx imulq -128($np), %rax vpbroadcastq $Y1, $Y1 add %rax, $r0 mov %rdx, %rax imulq 8-128($np), %rax shr \$29, $r0 add %rax, $r1 mov %rdx, %rax imulq 
16-128($np), %rax add $r0, $r1 add %rax, $r2 imulq 24-128($np), %rdx add %rdx, $r3 mov $r1, %rax imull $n0, %eax and \$0x1fffffff, %eax mov \$9, $i jmp .LOOP_REDUCE_1024 .align 32 .LOOP_REDUCE_1024: vmovd %eax, $Y2 vpbroadcastq $Y2, $Y2 vpmuludq 32*1-128($np), $Y1, $TEMP0 mov %rax, %rdx imulq -128($np), %rax vpaddq $TEMP0, $ACC1, $ACC1 add %rax, $r1 vpmuludq 32*2-128($np), $Y1, $TEMP1 mov %rdx, %rax imulq 8-128($np), %rax vpaddq $TEMP1, $ACC2, $ACC2 vpmuludq 32*3-128($np), $Y1, $TEMP2 .byte 0x67 add %rax, $r2 .byte 0x67 mov %rdx, %rax imulq 16-128($np), %rax shr \$29, $r1 vpaddq $TEMP2, $ACC3, $ACC3 vpmuludq 32*4-128($np), $Y1, $TEMP0 add %rax, $r3 add $r1, $r2 vpaddq $TEMP0, $ACC4, $ACC4 vpmuludq 32*5-128($np), $Y1, $TEMP1 mov $r2, %rax imull $n0, %eax vpaddq $TEMP1, $ACC5, $ACC5 vpmuludq 32*6-128($np), $Y1, $TEMP2 and \$0x1fffffff, %eax vpaddq $TEMP2, $ACC6, $ACC6 vpmuludq 32*7-128($np), $Y1, $TEMP0 vpaddq $TEMP0, $ACC7, $ACC7 vpmuludq 32*8-128($np), $Y1, $TEMP1 vmovd %eax, $Y1 #vmovdqu 32*1-8-128($np), $TEMP2 # moved below vpaddq $TEMP1, $ACC8, $ACC8 #vmovdqu 32*2-8-128($np), $TEMP0 # moved below vpbroadcastq $Y1, $Y1 vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above vmovdqu 32*3-8-128($np), $TEMP1 mov %rax, %rdx imulq -128($np), %rax vpaddq $TEMP2, $ACC1, $ACC1 vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above vmovdqu 32*4-8-128($np), $TEMP2 add %rax, $r2 mov %rdx, %rax imulq 8-128($np), %rax vpaddq $TEMP0, $ACC2, $ACC2 add $r3, %rax shr \$29, $r2 vpmuludq $Y2, $TEMP1, $TEMP1 vmovdqu 32*5-8-128($np), $TEMP0 add $r2, %rax vpaddq $TEMP1, $ACC3, $ACC3 vpmuludq $Y2, $TEMP2, $TEMP2 vmovdqu 32*6-8-128($np), $TEMP1 .byte 0x67 mov %rax, $r3 imull $n0, %eax vpaddq $TEMP2, $ACC4, $ACC4 vpmuludq $Y2, $TEMP0, $TEMP0 .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2 and \$0x1fffffff, %eax vpaddq $TEMP0, $ACC5, $ACC5 vpmuludq $Y2, $TEMP1, $TEMP1 vmovdqu 32*8-8-128($np), $TEMP0 vpaddq $TEMP1, $ACC6, $ACC6 vpmuludq $Y2, $TEMP2, $TEMP2 vmovdqu 
32*9-8-128($np), $ACC9 vmovd %eax, $ACC0 # borrow ACC0 for Y2 imulq -128($np), %rax vpaddq $TEMP2, $ACC7, $ACC7 vpmuludq $Y2, $TEMP0, $TEMP0 vmovdqu 32*1-16-128($np), $TEMP1 vpbroadcastq $ACC0, $ACC0 vpaddq $TEMP0, $ACC8, $ACC8 vpmuludq $Y2, $ACC9, $ACC9 vmovdqu 32*2-16-128($np), $TEMP2 add %rax, $r3 ___ ($ACC0,$Y2)=($Y2,$ACC0); $code.=<<___; vmovdqu 32*1-24-128($np), $ACC0 vpmuludq $Y1, $TEMP1, $TEMP1 vmovdqu 32*3-16-128($np), $TEMP0 vpaddq $TEMP1, $ACC1, $ACC1 vpmuludq $Y2, $ACC0, $ACC0 vpmuludq $Y1, $TEMP2, $TEMP2 .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1 vpaddq $ACC1, $ACC0, $ACC0 vpaddq $TEMP2, $ACC2, $ACC2 vpmuludq $Y1, $TEMP0, $TEMP0 vmovdqu 32*5-16-128($np), $TEMP2 .byte 0x67 vmovq $ACC0, %rax vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 vpaddq $TEMP0, $ACC3, $ACC3 vpmuludq $Y1, $TEMP1, $TEMP1 vmovdqu 32*6-16-128($np), $TEMP0 vpaddq $TEMP1, $ACC4, $ACC4 vpmuludq $Y1, $TEMP2, $TEMP2 vmovdqu 32*7-16-128($np), $TEMP1 vpaddq $TEMP2, $ACC5, $ACC5 vpmuludq $Y1, $TEMP0, $TEMP0 vmovdqu 32*8-16-128($np), $TEMP2 vpaddq $TEMP0, $ACC6, $ACC6 vpmuludq $Y1, $TEMP1, $TEMP1 shr \$29, $r3 vmovdqu 32*9-16-128($np), $TEMP0 add $r3, %rax vpaddq $TEMP1, $ACC7, $ACC7 vpmuludq $Y1, $TEMP2, $TEMP2 #vmovdqu 32*2-24-128($np), $TEMP1 # moved below mov %rax, $r0 imull $n0, %eax vpaddq $TEMP2, $ACC8, $ACC8 vpmuludq $Y1, $TEMP0, $TEMP0 and \$0x1fffffff, %eax vmovd %eax, $Y1 vmovdqu 32*3-24-128($np), $TEMP2 .byte 0x67 vpaddq $TEMP0, $ACC9, $ACC9 vpbroadcastq $Y1, $Y1 vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above vmovdqu 32*4-24-128($np), $TEMP0 mov %rax, %rdx imulq -128($np), %rax mov 8(%rsp), $r1 vpaddq $TEMP1, $ACC2, $ACC1 vpmuludq $Y2, $TEMP2, $TEMP2 vmovdqu 32*5-24-128($np), $TEMP1 add %rax, $r0 mov %rdx, %rax imulq 8-128($np), %rax .byte 0x67 shr \$29, $r0 mov 16(%rsp), $r2 vpaddq $TEMP2, $ACC3, $ACC2 vpmuludq $Y2, $TEMP0, $TEMP0 vmovdqu 32*6-24-128($np), $TEMP2 add %rax, $r1 mov %rdx, %rax imulq 16-128($np), %rax vpaddq $TEMP0, $ACC4, 
$ACC3 vpmuludq $Y2, $TEMP1, $TEMP1 vmovdqu 32*7-24-128($np), $TEMP0 imulq 24-128($np), %rdx # future $r3 add %rax, $r2 lea ($r0,$r1), %rax vpaddq $TEMP1, $ACC5, $ACC4 vpmuludq $Y2, $TEMP2, $TEMP2 vmovdqu 32*8-24-128($np), $TEMP1 mov %rax, $r1 imull $n0, %eax vpmuludq $Y2, $TEMP0, $TEMP0 vpaddq $TEMP2, $ACC6, $ACC5 vmovdqu 32*9-24-128($np), $TEMP2 and \$0x1fffffff, %eax vpaddq $TEMP0, $ACC7, $ACC6 vpmuludq $Y2, $TEMP1, $TEMP1 add 24(%rsp), %rdx vpaddq $TEMP1, $ACC8, $ACC7 vpmuludq $Y2, $TEMP2, $TEMP2 vpaddq $TEMP2, $ACC9, $ACC8 vmovq $r3, $ACC9 mov %rdx, $r3 dec $i jnz .LOOP_REDUCE_1024 ___ ($ACC0,$Y2)=($Y2,$ACC0); $code.=<<___; lea 448(%rsp), $tp1 # size optimization vpaddq $ACC9, $Y2, $ACC0 vpxor $ZERO, $ZERO, $ZERO vpaddq 32*9-192($tp0), $ACC0, $ACC0 vpaddq 32*10-448($tp1), $ACC1, $ACC1 vpaddq 32*11-448($tp1), $ACC2, $ACC2 vpaddq 32*12-448($tp1), $ACC3, $ACC3 vpaddq 32*13-448($tp1), $ACC4, $ACC4 vpaddq 32*14-448($tp1), $ACC5, $ACC5 vpaddq 32*15-448($tp1), $ACC6, $ACC6 vpaddq 32*16-448($tp1), $ACC7, $ACC7 vpaddq 32*17-448($tp1), $ACC8, $ACC8 vpsrlq \$29, $ACC0, $TEMP1 vpand $AND_MASK, $ACC0, $ACC0 vpsrlq \$29, $ACC1, $TEMP2 vpand $AND_MASK, $ACC1, $ACC1 vpsrlq \$29, $ACC2, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC2, $ACC2 vpsrlq \$29, $ACC3, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC3, $ACC3 vpermq \$0x93, $TEMP3, $TEMP3 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC0, $ACC0 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC1, $ACC1 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC2, $ACC2 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 vpaddq $TEMP3, $ACC3, $ACC3 vpaddq $TEMP4, $ACC4, $ACC4 vpsrlq \$29, $ACC0, $TEMP1 vpand $AND_MASK, $ACC0, $ACC0 vpsrlq \$29, $ACC1, $TEMP2 vpand $AND_MASK, $ACC1, $ACC1 vpsrlq \$29, $ACC2, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC2, $ACC2 vpsrlq \$29, $ACC3, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand 
$AND_MASK, $ACC3, $ACC3 vpermq \$0x93, $TEMP3, $TEMP3 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC0, $ACC0 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC1, $ACC1 vmovdqu $ACC0, 32*0-128($rp) vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC2, $ACC2 vmovdqu $ACC1, 32*1-128($rp) vpblendd \$3, $TEMP4, $ZERO, $TEMP4 vpaddq $TEMP3, $ACC3, $ACC3 vmovdqu $ACC2, 32*2-128($rp) vpaddq $TEMP4, $ACC4, $ACC4 vmovdqu $ACC3, 32*3-128($rp) ___ $TEMP5=$ACC0; $code.=<<___; vpsrlq \$29, $ACC4, $TEMP1 vpand $AND_MASK, $ACC4, $ACC4 vpsrlq \$29, $ACC5, $TEMP2 vpand $AND_MASK, $ACC5, $ACC5 vpsrlq \$29, $ACC6, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC6, $ACC6 vpsrlq \$29, $ACC7, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC7, $ACC7 vpsrlq \$29, $ACC8, $TEMP5 vpermq \$0x93, $TEMP3, $TEMP3 vpand $AND_MASK, $ACC8, $ACC8 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP5, $TEMP5 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC4, $ACC4 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC5, $ACC5 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC6, $ACC6 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 vpaddq $TEMP3, $ACC7, $ACC7 vpaddq $TEMP4, $ACC8, $ACC8 vpsrlq \$29, $ACC4, $TEMP1 vpand $AND_MASK, $ACC4, $ACC4 vpsrlq \$29, $ACC5, $TEMP2 vpand $AND_MASK, $ACC5, $ACC5 vpsrlq \$29, $ACC6, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC6, $ACC6 vpsrlq \$29, $ACC7, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC7, $ACC7 vpsrlq \$29, $ACC8, $TEMP5 vpermq \$0x93, $TEMP3, $TEMP3 vpand $AND_MASK, $ACC8, $ACC8 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP5, $TEMP5 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC4, $ACC4 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC5, $ACC5 vmovdqu $ACC4, 32*4-128($rp) vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, 
$ACC6, $ACC6 vmovdqu $ACC5, 32*5-128($rp) vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 vpaddq $TEMP3, $ACC7, $ACC7 vmovdqu $ACC6, 32*6-128($rp) vpaddq $TEMP4, $ACC8, $ACC8 vmovdqu $ACC7, 32*7-128($rp) vmovdqu $ACC8, 32*8-128($rp) mov $rp, $ap dec $rep jne .LOOP_GRANDE_SQR_1024 vzeroall mov %rbp, %rax .cfi_def_cfa_register %rax ___ $code.=<<___ if ($win64); .Lsqr_1024_in_tail: movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp # restore %rsp .cfi_def_cfa_register %rsp .Lsqr_1024_epilogue: ret .cfi_endproc .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 ___ } { # void AMM_WW( my $rp="%rdi"; # BN_ULONG *rp, my $ap="%rsi"; # const BN_ULONG *ap, my $bp="%rdx"; # const BN_ULONG *bp, my $np="%rcx"; # const BN_ULONG *np, my $n0="%r8d"; # unsigned int n0); # The registers that hold the accumulated redundant result # The AMM works on 1024 bit operands, and redundant word size is 29 # Therefore: ceil(1024/29)/4 = 9 my $ACC0="%ymm0"; my $ACC1="%ymm1"; my $ACC2="%ymm2"; my $ACC3="%ymm3"; my $ACC4="%ymm4"; my $ACC5="%ymm5"; my $ACC6="%ymm6"; my $ACC7="%ymm7"; my $ACC8="%ymm8"; my $ACC9="%ymm9"; # Registers that hold the broadcasted words of multiplier, currently used my $Bi="%ymm10"; my $Yi="%ymm11"; # Helper registers my $TEMP0=$ACC0; my $TEMP1="%ymm12"; my $TEMP2="%ymm13"; my $ZERO="%ymm14"; my $AND_MASK="%ymm15"; # alu registers that hold the first words of the ACC my $r0="%r9"; my $r1="%r10"; my $r2="%r11"; my $r3="%r12"; my $i="%r14d"; my $tmp="%r15"; $bp="%r13"; # reassigned argument $code.=<<___; .globl rsaz_1024_mul_avx2 .type 
rsaz_1024_mul_avx2,\@function,5 .align 64 rsaz_1024_mul_avx2: .cfi_startproc lea (%rsp), %rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); vzeroupper lea -0xa8(%rsp),%rsp vmovaps %xmm6,-0xd8(%rax) vmovaps %xmm7,-0xc8(%rax) vmovaps %xmm8,-0xb8(%rax) vmovaps %xmm9,-0xa8(%rax) vmovaps %xmm10,-0x98(%rax) vmovaps %xmm11,-0x88(%rax) vmovaps %xmm12,-0x78(%rax) vmovaps %xmm13,-0x68(%rax) vmovaps %xmm14,-0x58(%rax) vmovaps %xmm15,-0x48(%rax) .Lmul_1024_body: ___ $code.=<<___; mov %rax,%rbp .cfi_def_cfa_register %rbp vzeroall mov %rdx, $bp # reassigned argument sub \$64,%rsp # unaligned 256-bit load that crosses page boundary can # cause severe performance degradation here, so if $ap does # cross page boundary, swap it with $bp [meaning that caller # is advised to lay down $ap and $bp next to each other, so # that only one can cross page boundary]. .byte 0x67,0x67 mov $ap, $tmp and \$4095, $tmp add \$32*10, $tmp shr \$12, $tmp mov $ap, $tmp cmovnz $bp, $ap cmovnz $tmp, $bp mov $np, $tmp sub \$-128,$ap # size optimization sub \$-128,$np sub \$-128,$rp and \$4095, $tmp # see if $np crosses page add \$32*10, $tmp .byte 0x67,0x67 shr \$12, $tmp jz .Lmul_1024_no_n_copy # unaligned 256-bit load that crosses page boundary can # cause severe performance degradation here, so if $np does # cross page boundary, copy it to stack and make sure stack # frame doesn't... 
sub \$32*10,%rsp vmovdqu 32*0-128($np), $ACC0 and \$-512, %rsp vmovdqu 32*1-128($np), $ACC1 vmovdqu 32*2-128($np), $ACC2 vmovdqu 32*3-128($np), $ACC3 vmovdqu 32*4-128($np), $ACC4 vmovdqu 32*5-128($np), $ACC5 vmovdqu 32*6-128($np), $ACC6 vmovdqu 32*7-128($np), $ACC7 vmovdqu 32*8-128($np), $ACC8 lea 64+128(%rsp),$np vmovdqu $ACC0, 32*0-128($np) vpxor $ACC0, $ACC0, $ACC0 vmovdqu $ACC1, 32*1-128($np) vpxor $ACC1, $ACC1, $ACC1 vmovdqu $ACC2, 32*2-128($np) vpxor $ACC2, $ACC2, $ACC2 vmovdqu $ACC3, 32*3-128($np) vpxor $ACC3, $ACC3, $ACC3 vmovdqu $ACC4, 32*4-128($np) vpxor $ACC4, $ACC4, $ACC4 vmovdqu $ACC5, 32*5-128($np) vpxor $ACC5, $ACC5, $ACC5 vmovdqu $ACC6, 32*6-128($np) vpxor $ACC6, $ACC6, $ACC6 vmovdqu $ACC7, 32*7-128($np) vpxor $ACC7, $ACC7, $ACC7 vmovdqu $ACC8, 32*8-128($np) vmovdqa $ACC0, $ACC8 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall .Lmul_1024_no_n_copy: and \$-64,%rsp mov ($bp), %rbx vpbroadcastq ($bp), $Bi vmovdqu $ACC0, (%rsp) # clear top of stack xor $r0, $r0 .byte 0x67 xor $r1, $r1 xor $r2, $r2 xor $r3, $r3 vmovdqu .Land_mask(%rip), $AND_MASK mov \$9, $i vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall jmp .Loop_mul_1024 .align 32 .Loop_mul_1024: vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*) mov %rbx, %rax imulq -128($ap), %rax add $r0, %rax mov %rbx, $r1 imulq 8-128($ap), $r1 add 8(%rsp), $r1 mov %rax, $r0 imull $n0, %eax and \$0x1fffffff, %eax mov %rbx, $r2 imulq 16-128($ap), $r2 add 16(%rsp), $r2 mov %rbx, $r3 imulq 24-128($ap), $r3 add 24(%rsp), $r3 vpmuludq 32*1-128($ap),$Bi,$TEMP0 vmovd %eax, $Yi vpaddq $TEMP0,$ACC1,$ACC1 vpmuludq 32*2-128($ap),$Bi,$TEMP1 vpbroadcastq $Yi, $Yi vpaddq $TEMP1,$ACC2,$ACC2 vpmuludq 32*3-128($ap),$Bi,$TEMP2 vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3 vpaddq $TEMP2,$ACC3,$ACC3 vpmuludq 32*4-128($ap),$Bi,$TEMP0 vpaddq $TEMP0,$ACC4,$ACC4 vpmuludq 32*5-128($ap),$Bi,$TEMP1 vpaddq $TEMP1,$ACC5,$ACC5 vpmuludq 32*6-128($ap),$Bi,$TEMP2 vpaddq $TEMP2,$ACC6,$ACC6 vpmuludq 32*7-128($ap),$Bi,$TEMP0 vpermq 
\$0x93, $ACC9, $ACC9 # correct $ACC3 vpaddq $TEMP0,$ACC7,$ACC7 vpmuludq 32*8-128($ap),$Bi,$TEMP1 vpbroadcastq 8($bp), $Bi vpaddq $TEMP1,$ACC8,$ACC8 mov %rax,%rdx imulq -128($np),%rax add %rax,$r0 mov %rdx,%rax imulq 8-128($np),%rax add %rax,$r1 mov %rdx,%rax imulq 16-128($np),%rax add %rax,$r2 shr \$29, $r0 imulq 24-128($np),%rdx add %rdx,$r3 add $r0, $r1 vpmuludq 32*1-128($np),$Yi,$TEMP2 vmovq $Bi, %rbx vpaddq $TEMP2,$ACC1,$ACC1 vpmuludq 32*2-128($np),$Yi,$TEMP0 vpaddq $TEMP0,$ACC2,$ACC2 vpmuludq 32*3-128($np),$Yi,$TEMP1 vpaddq $TEMP1,$ACC3,$ACC3 vpmuludq 32*4-128($np),$Yi,$TEMP2 vpaddq $TEMP2,$ACC4,$ACC4 vpmuludq 32*5-128($np),$Yi,$TEMP0 vpaddq $TEMP0,$ACC5,$ACC5 vpmuludq 32*6-128($np),$Yi,$TEMP1 vpaddq $TEMP1,$ACC6,$ACC6 vpmuludq 32*7-128($np),$Yi,$TEMP2 vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3 vpaddq $TEMP2,$ACC7,$ACC7 vpmuludq 32*8-128($np),$Yi,$TEMP0 vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3 vpaddq $TEMP0,$ACC8,$ACC8 mov %rbx, %rax imulq -128($ap),%rax add %rax,$r1 vmovdqu -8+32*1-128($ap),$TEMP1 mov %rbx, %rax imulq 8-128($ap),%rax add %rax,$r2 vmovdqu -8+32*2-128($ap),$TEMP2 mov $r1, %rax vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3 imull $n0, %eax vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3 and \$0x1fffffff, %eax imulq 16-128($ap),%rbx add %rbx,$r3 vpmuludq $Bi,$TEMP1,$TEMP1 vmovd %eax, $Yi vmovdqu -8+32*3-128($ap),$TEMP0 vpaddq $TEMP1,$ACC1,$ACC1 vpmuludq $Bi,$TEMP2,$TEMP2 vpbroadcastq $Yi, $Yi vmovdqu -8+32*4-128($ap),$TEMP1 vpaddq $TEMP2,$ACC2,$ACC2 vpmuludq $Bi,$TEMP0,$TEMP0 vmovdqu -8+32*5-128($ap),$TEMP2 vpaddq $TEMP0,$ACC3,$ACC3 vpmuludq $Bi,$TEMP1,$TEMP1 vmovdqu -8+32*6-128($ap),$TEMP0 vpaddq $TEMP1,$ACC4,$ACC4 vpmuludq $Bi,$TEMP2,$TEMP2 vmovdqu -8+32*7-128($ap),$TEMP1 vpaddq $TEMP2,$ACC5,$ACC5 vpmuludq $Bi,$TEMP0,$TEMP0 vmovdqu -8+32*8-128($ap),$TEMP2 vpaddq $TEMP0,$ACC6,$ACC6 vpmuludq $Bi,$TEMP1,$TEMP1 vmovdqu -8+32*9-128($ap),$ACC9 vpaddq $TEMP1,$ACC7,$ACC7 vpmuludq $Bi,$TEMP2,$TEMP2 vpaddq $TEMP2,$ACC8,$ACC8 vpmuludq 
$Bi,$ACC9,$ACC9 vpbroadcastq 16($bp), $Bi mov %rax,%rdx imulq -128($np),%rax add %rax,$r1 vmovdqu -8+32*1-128($np),$TEMP0 mov %rdx,%rax imulq 8-128($np),%rax add %rax,$r2 vmovdqu -8+32*2-128($np),$TEMP1 shr \$29, $r1 imulq 16-128($np),%rdx add %rdx,$r3 add $r1, $r2 vpmuludq $Yi,$TEMP0,$TEMP0 vmovq $Bi, %rbx vmovdqu -8+32*3-128($np),$TEMP2 vpaddq $TEMP0,$ACC1,$ACC1 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu -8+32*4-128($np),$TEMP0 vpaddq $TEMP1,$ACC2,$ACC2 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -8+32*5-128($np),$TEMP1 vpaddq $TEMP2,$ACC3,$ACC3 vpmuludq $Yi,$TEMP0,$TEMP0 vmovdqu -8+32*6-128($np),$TEMP2 vpaddq $TEMP0,$ACC4,$ACC4 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu -8+32*7-128($np),$TEMP0 vpaddq $TEMP1,$ACC5,$ACC5 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -8+32*8-128($np),$TEMP1 vpaddq $TEMP2,$ACC6,$ACC6 vpmuludq $Yi,$TEMP0,$TEMP0 vmovdqu -8+32*9-128($np),$TEMP2 vpaddq $TEMP0,$ACC7,$ACC7 vpmuludq $Yi,$TEMP1,$TEMP1 vpaddq $TEMP1,$ACC8,$ACC8 vpmuludq $Yi,$TEMP2,$TEMP2 vpaddq $TEMP2,$ACC9,$ACC9 vmovdqu -16+32*1-128($ap),$TEMP0 mov %rbx,%rax imulq -128($ap),%rax add $r2,%rax vmovdqu -16+32*2-128($ap),$TEMP1 mov %rax,$r2 imull $n0, %eax and \$0x1fffffff, %eax imulq 8-128($ap),%rbx add %rbx,$r3 vpmuludq $Bi,$TEMP0,$TEMP0 vmovd %eax, $Yi vmovdqu -16+32*3-128($ap),$TEMP2 vpaddq $TEMP0,$ACC1,$ACC1 vpmuludq $Bi,$TEMP1,$TEMP1 vpbroadcastq $Yi, $Yi vmovdqu -16+32*4-128($ap),$TEMP0 vpaddq $TEMP1,$ACC2,$ACC2 vpmuludq $Bi,$TEMP2,$TEMP2 vmovdqu -16+32*5-128($ap),$TEMP1 vpaddq $TEMP2,$ACC3,$ACC3 vpmuludq $Bi,$TEMP0,$TEMP0 vmovdqu -16+32*6-128($ap),$TEMP2 vpaddq $TEMP0,$ACC4,$ACC4 vpmuludq $Bi,$TEMP1,$TEMP1 vmovdqu -16+32*7-128($ap),$TEMP0 vpaddq $TEMP1,$ACC5,$ACC5 vpmuludq $Bi,$TEMP2,$TEMP2 vmovdqu -16+32*8-128($ap),$TEMP1 vpaddq $TEMP2,$ACC6,$ACC6 vpmuludq $Bi,$TEMP0,$TEMP0 vmovdqu -16+32*9-128($ap),$TEMP2 vpaddq $TEMP0,$ACC7,$ACC7 vpmuludq $Bi,$TEMP1,$TEMP1 vpaddq $TEMP1,$ACC8,$ACC8 vpmuludq $Bi,$TEMP2,$TEMP2 vpbroadcastq 24($bp), $Bi vpaddq $TEMP2,$ACC9,$ACC9 vmovdqu -16+32*1-128($np),$TEMP0 mov 
%rax,%rdx imulq -128($np),%rax add %rax,$r2 vmovdqu -16+32*2-128($np),$TEMP1 imulq 8-128($np),%rdx add %rdx,$r3 shr \$29, $r2 vpmuludq $Yi,$TEMP0,$TEMP0 vmovq $Bi, %rbx vmovdqu -16+32*3-128($np),$TEMP2 vpaddq $TEMP0,$ACC1,$ACC1 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu -16+32*4-128($np),$TEMP0 vpaddq $TEMP1,$ACC2,$ACC2 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -16+32*5-128($np),$TEMP1 vpaddq $TEMP2,$ACC3,$ACC3 vpmuludq $Yi,$TEMP0,$TEMP0 vmovdqu -16+32*6-128($np),$TEMP2 vpaddq $TEMP0,$ACC4,$ACC4 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu -16+32*7-128($np),$TEMP0 vpaddq $TEMP1,$ACC5,$ACC5 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -16+32*8-128($np),$TEMP1 vpaddq $TEMP2,$ACC6,$ACC6 vpmuludq $Yi,$TEMP0,$TEMP0 vmovdqu -16+32*9-128($np),$TEMP2 vpaddq $TEMP0,$ACC7,$ACC7 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu -24+32*1-128($ap),$TEMP0 vpaddq $TEMP1,$ACC8,$ACC8 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -24+32*2-128($ap),$TEMP1 vpaddq $TEMP2,$ACC9,$ACC9 add $r2, $r3 imulq -128($ap),%rbx add %rbx,$r3 mov $r3, %rax imull $n0, %eax and \$0x1fffffff, %eax vpmuludq $Bi,$TEMP0,$TEMP0 vmovd %eax, $Yi vmovdqu -24+32*3-128($ap),$TEMP2 vpaddq $TEMP0,$ACC1,$ACC1 vpmuludq $Bi,$TEMP1,$TEMP1 vpbroadcastq $Yi, $Yi vmovdqu -24+32*4-128($ap),$TEMP0 vpaddq $TEMP1,$ACC2,$ACC2 vpmuludq $Bi,$TEMP2,$TEMP2 vmovdqu -24+32*5-128($ap),$TEMP1 vpaddq $TEMP2,$ACC3,$ACC3 vpmuludq $Bi,$TEMP0,$TEMP0 vmovdqu -24+32*6-128($ap),$TEMP2 vpaddq $TEMP0,$ACC4,$ACC4 vpmuludq $Bi,$TEMP1,$TEMP1 vmovdqu -24+32*7-128($ap),$TEMP0 vpaddq $TEMP1,$ACC5,$ACC5 vpmuludq $Bi,$TEMP2,$TEMP2 vmovdqu -24+32*8-128($ap),$TEMP1 vpaddq $TEMP2,$ACC6,$ACC6 vpmuludq $Bi,$TEMP0,$TEMP0 vmovdqu -24+32*9-128($ap),$TEMP2 vpaddq $TEMP0,$ACC7,$ACC7 vpmuludq $Bi,$TEMP1,$TEMP1 vpaddq $TEMP1,$ACC8,$ACC8 vpmuludq $Bi,$TEMP2,$TEMP2 vpbroadcastq 32($bp), $Bi vpaddq $TEMP2,$ACC9,$ACC9 add \$32, $bp # $bp++ vmovdqu -24+32*1-128($np),$TEMP0 imulq -128($np),%rax add %rax,$r3 shr \$29, $r3 vmovdqu -24+32*2-128($np),$TEMP1 vpmuludq $Yi,$TEMP0,$TEMP0 vmovq $Bi, %rbx vmovdqu -24+32*3-128($np),$TEMP2 
vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 vpaddq $TEMP1,$ACC2,$ACC1 vmovdqu -24+32*4-128($np),$TEMP0 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -24+32*5-128($np),$TEMP1 vpaddq $TEMP2,$ACC3,$ACC2 vpmuludq $Yi,$TEMP0,$TEMP0 vmovdqu -24+32*6-128($np),$TEMP2 vpaddq $TEMP0,$ACC4,$ACC3 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu -24+32*7-128($np),$TEMP0 vpaddq $TEMP1,$ACC5,$ACC4 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -24+32*8-128($np),$TEMP1 vpaddq $TEMP2,$ACC6,$ACC5 vpmuludq $Yi,$TEMP0,$TEMP0 vmovdqu -24+32*9-128($np),$TEMP2 mov $r3, $r0 vpaddq $TEMP0,$ACC7,$ACC6 vpmuludq $Yi,$TEMP1,$TEMP1 add (%rsp), $r0 vpaddq $TEMP1,$ACC8,$ACC7 vpmuludq $Yi,$TEMP2,$TEMP2 vmovq $r3, $TEMP1 vpaddq $TEMP2,$ACC9,$ACC8 dec $i jnz .Loop_mul_1024 ___ # (*) Original implementation was correcting ACC1-ACC3 for overflow # after 7 loop runs, or after 28 iterations, or 56 additions. # But as we underutilize resources, it's possible to correct in # each iteration with marginal performance loss. But then, as # we do it in each iteration, we can correct less digits, and # avoid performance penalties completely. 
$TEMP0 = $ACC9; $TEMP3 = $Bi; $TEMP4 = $Yi; $code.=<<___; vpaddq (%rsp), $TEMP1, $ACC0 vpsrlq \$29, $ACC0, $TEMP1 vpand $AND_MASK, $ACC0, $ACC0 vpsrlq \$29, $ACC1, $TEMP2 vpand $AND_MASK, $ACC1, $ACC1 vpsrlq \$29, $ACC2, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC2, $ACC2 vpsrlq \$29, $ACC3, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC3, $ACC3 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP3, $TEMP3 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpermq \$0x93, $TEMP4, $TEMP4 vpaddq $TEMP0, $ACC0, $ACC0 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC1, $ACC1 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC2, $ACC2 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 vpaddq $TEMP3, $ACC3, $ACC3 vpaddq $TEMP4, $ACC4, $ACC4 vpsrlq \$29, $ACC0, $TEMP1 vpand $AND_MASK, $ACC0, $ACC0 vpsrlq \$29, $ACC1, $TEMP2 vpand $AND_MASK, $ACC1, $ACC1 vpsrlq \$29, $ACC2, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC2, $ACC2 vpsrlq \$29, $ACC3, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC3, $ACC3 vpermq \$0x93, $TEMP3, $TEMP3 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC0, $ACC0 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC1, $ACC1 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC2, $ACC2 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 vpaddq $TEMP3, $ACC3, $ACC3 vpaddq $TEMP4, $ACC4, $ACC4 vmovdqu $ACC0, 0-128($rp) vmovdqu $ACC1, 32-128($rp) vmovdqu $ACC2, 64-128($rp) vmovdqu $ACC3, 96-128($rp) ___ $TEMP5=$ACC0; $code.=<<___; vpsrlq \$29, $ACC4, $TEMP1 vpand $AND_MASK, $ACC4, $ACC4 vpsrlq \$29, $ACC5, $TEMP2 vpand $AND_MASK, $ACC5, $ACC5 vpsrlq \$29, $ACC6, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC6, $ACC6 vpsrlq \$29, $ACC7, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC7, $ACC7 vpsrlq \$29, $ACC8, $TEMP5 vpermq \$0x93, $TEMP3, $TEMP3 vpand $AND_MASK, $ACC8, $ACC8 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $ZERO, 
$TEMP1, $TEMP0 vpermq \$0x93, $TEMP5, $TEMP5 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC4, $ACC4 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC5, $ACC5 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC6, $ACC6 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 vpaddq $TEMP3, $ACC7, $ACC7 vpaddq $TEMP4, $ACC8, $ACC8 vpsrlq \$29, $ACC4, $TEMP1 vpand $AND_MASK, $ACC4, $ACC4 vpsrlq \$29, $ACC5, $TEMP2 vpand $AND_MASK, $ACC5, $ACC5 vpsrlq \$29, $ACC6, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC6, $ACC6 vpsrlq \$29, $ACC7, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC7, $ACC7 vpsrlq \$29, $ACC8, $TEMP5 vpermq \$0x93, $TEMP3, $TEMP3 vpand $AND_MASK, $ACC8, $ACC8 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP5, $TEMP5 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC4, $ACC4 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC5, $ACC5 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC6, $ACC6 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 vpaddq $TEMP3, $ACC7, $ACC7 vpaddq $TEMP4, $ACC8, $ACC8 vmovdqu $ACC4, 128-128($rp) vmovdqu $ACC5, 160-128($rp) vmovdqu $ACC6, 192-128($rp) vmovdqu $ACC7, 224-128($rp) vmovdqu $ACC8, 256-128($rp) vzeroupper mov %rbp, %rax .cfi_def_cfa_register %rax ___ $code.=<<___ if ($win64); .Lmul_1024_in_tail: movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp # restore %rsp .cfi_def_cfa_register %rsp .Lmul_1024_epilogue: ret .cfi_endproc .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 ___ } { my ($out,$inp) = 
$win64 ? ("%rcx","%rdx") : ("%rdi","%rsi"); my @T = map("%r$_",(8..11)); $code.=<<___; .globl rsaz_1024_red2norm_avx2 .type rsaz_1024_red2norm_avx2,\@abi-omnipotent .align 32 rsaz_1024_red2norm_avx2: .cfi_startproc sub \$-128,$inp # size optimization xor %rax,%rax ___ for ($j=0,$i=0; $i<16; $i++) { my $k=0; while (29*$j<64*($i+1)) { # load data till boundary $code.=" mov `8*$j-128`($inp), @T[0]\n"; $j++; $k++; push(@T,shift(@T)); } $l=$k; while ($k>1) { # shift loaded data but last value $code.=" shl \$`29*($j-$k)`,@T[-$k]\n"; $k--; } $code.=<<___; # shift last value mov @T[-1], @T[0] shl \$`29*($j-1)`, @T[-1] shr \$`-29*($j-1)`, @T[0] ___ while ($l) { # accumulate all values $code.=" add @T[-$l], %rax\n"; $l--; } $code.=<<___; adc \$0, @T[0] # consume eventual carry mov %rax, 8*$i($out) mov @T[0], %rax ___ push(@T,shift(@T)); } $code.=<<___; ret .cfi_endproc .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 .globl rsaz_1024_norm2red_avx2 .type rsaz_1024_norm2red_avx2,\@abi-omnipotent .align 32 rsaz_1024_norm2red_avx2: .cfi_startproc sub \$-128,$out # size optimization mov ($inp),@T[0] mov \$0x1fffffff,%eax ___ for ($j=0,$i=0; $i<16; $i++) { $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15); $code.=" xor @T[1],@T[1]\n" if ($i==15); my $k=1; while (29*($j+1)<64*($i+1)) { $code.=<<___; mov @T[0],@T[-$k] shr \$`29*$j`,@T[-$k] and %rax,@T[-$k] # &0x1fffffff mov @T[-$k],`8*$j-128`($out) ___ $j++; $k++; } $code.=<<___; shrd \$`29*$j`,@T[1],@T[0] and %rax,@T[0] mov @T[0],`8*$j-128`($out) ___ $j++; push(@T,shift(@T)); } $code.=<<___; mov @T[0],`8*$j-128`($out) # zero mov @T[0],`8*($j+1)-128`($out) mov @T[0],`8*($j+2)-128`($out) mov @T[0],`8*($j+3)-128`($out) ret .cfi_endproc .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 ___ } { my ($out,$inp,$power) = $win64 ? 
("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); $code.=<<___; .globl rsaz_1024_scatter5_avx2 .type rsaz_1024_scatter5_avx2,\@abi-omnipotent .align 32 rsaz_1024_scatter5_avx2: .cfi_startproc vzeroupper vmovdqu .Lscatter_permd(%rip),%ymm5 shl \$4,$power lea ($out,$power),$out mov \$9,%eax jmp .Loop_scatter_1024 .align 32 .Loop_scatter_1024: vmovdqu ($inp),%ymm0 lea 32($inp),$inp vpermd %ymm0,%ymm5,%ymm0 vmovdqu %xmm0,($out) lea 16*32($out),$out dec %eax jnz .Loop_scatter_1024 vzeroupper ret .cfi_endproc .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 .globl rsaz_1024_gather5_avx2 .type rsaz_1024_gather5_avx2,\@abi-omnipotent .align 32 rsaz_1024_gather5_avx2: .cfi_startproc vzeroupper mov %rsp,%r11 .cfi_def_cfa_register %r11 ___ $code.=<<___ if ($win64); lea -0x88(%rsp),%rax .LSEH_begin_rsaz_1024_gather5: # I can't trust assembler to use specific encoding:-( .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax) .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax) .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax) .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax) .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax) .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax) .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax) .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax) .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax) .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax) ___ $code.=<<___; lea -0x100(%rsp),%rsp and \$-32, %rsp lea .Linc(%rip), %r10 lea -128(%rsp),%rax # control u-op density vmovd $power, %xmm4 vmovdqa (%r10),%ymm0 vmovdqa 32(%r10),%ymm1 vmovdqa 64(%r10),%ymm5 vpbroadcastd %xmm4,%ymm4 vpaddd %ymm5, %ymm0, %ymm2 vpcmpeqd %ymm4, %ymm0, %ymm0 vpaddd %ymm5, %ymm1, %ymm3 vpcmpeqd %ymm4, %ymm1, %ymm1 vmovdqa %ymm0, 32*0+128(%rax) vpaddd %ymm5, %ymm2, %ymm0 vpcmpeqd %ymm4, %ymm2, %ymm2 vmovdqa %ymm1, 32*1+128(%rax) vpaddd %ymm5, %ymm3, %ymm1 
vpcmpeqd %ymm4, %ymm3, %ymm3 vmovdqa %ymm2, 32*2+128(%rax) vpaddd %ymm5, %ymm0, %ymm2 vpcmpeqd %ymm4, %ymm0, %ymm0 vmovdqa %ymm3, 32*3+128(%rax) vpaddd %ymm5, %ymm1, %ymm3 vpcmpeqd %ymm4, %ymm1, %ymm1 vmovdqa %ymm0, 32*4+128(%rax) vpaddd %ymm5, %ymm2, %ymm8 vpcmpeqd %ymm4, %ymm2, %ymm2 vmovdqa %ymm1, 32*5+128(%rax) vpaddd %ymm5, %ymm3, %ymm9 vpcmpeqd %ymm4, %ymm3, %ymm3 vmovdqa %ymm2, 32*6+128(%rax) vpaddd %ymm5, %ymm8, %ymm10 vpcmpeqd %ymm4, %ymm8, %ymm8 vmovdqa %ymm3, 32*7+128(%rax) vpaddd %ymm5, %ymm9, %ymm11 vpcmpeqd %ymm4, %ymm9, %ymm9 vpaddd %ymm5, %ymm10, %ymm12 vpcmpeqd %ymm4, %ymm10, %ymm10 vpaddd %ymm5, %ymm11, %ymm13 vpcmpeqd %ymm4, %ymm11, %ymm11 vpaddd %ymm5, %ymm12, %ymm14 vpcmpeqd %ymm4, %ymm12, %ymm12 vpaddd %ymm5, %ymm13, %ymm15 vpcmpeqd %ymm4, %ymm13, %ymm13 vpcmpeqd %ymm4, %ymm14, %ymm14 vpcmpeqd %ymm4, %ymm15, %ymm15 vmovdqa -32(%r10),%ymm7 # .Lgather_permd lea 128($inp), $inp mov \$9,$power .Loop_gather_1024: vmovdqa 32*0-128($inp), %ymm0 vmovdqa 32*1-128($inp), %ymm1 vmovdqa 32*2-128($inp), %ymm2 vmovdqa 32*3-128($inp), %ymm3 vpand 32*0+128(%rax), %ymm0, %ymm0 vpand 32*1+128(%rax), %ymm1, %ymm1 vpand 32*2+128(%rax), %ymm2, %ymm2 vpor %ymm0, %ymm1, %ymm4 vpand 32*3+128(%rax), %ymm3, %ymm3 vmovdqa 32*4-128($inp), %ymm0 vmovdqa 32*5-128($inp), %ymm1 vpor %ymm2, %ymm3, %ymm5 vmovdqa 32*6-128($inp), %ymm2 vmovdqa 32*7-128($inp), %ymm3 vpand 32*4+128(%rax), %ymm0, %ymm0 vpand 32*5+128(%rax), %ymm1, %ymm1 vpand 32*6+128(%rax), %ymm2, %ymm2 vpor %ymm0, %ymm4, %ymm4 vpand 32*7+128(%rax), %ymm3, %ymm3 vpand 32*8-128($inp), %ymm8, %ymm0 vpor %ymm1, %ymm5, %ymm5 vpand 32*9-128($inp), %ymm9, %ymm1 vpor %ymm2, %ymm4, %ymm4 vpand 32*10-128($inp),%ymm10, %ymm2 vpor %ymm3, %ymm5, %ymm5 vpand 32*11-128($inp),%ymm11, %ymm3 vpor %ymm0, %ymm4, %ymm4 vpand 32*12-128($inp),%ymm12, %ymm0 vpor %ymm1, %ymm5, %ymm5 vpand 32*13-128($inp),%ymm13, %ymm1 vpor %ymm2, %ymm4, %ymm4 vpand 32*14-128($inp),%ymm14, %ymm2 vpor %ymm3, %ymm5, %ymm5 vpand 32*15-128($inp),%ymm15, %ymm3 
lea 32*16($inp), $inp vpor %ymm0, %ymm4, %ymm4 vpor %ymm1, %ymm5, %ymm5 vpor %ymm2, %ymm4, %ymm4 vpor %ymm3, %ymm5, %ymm5 vpor %ymm5, %ymm4, %ymm4 vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared vpor %xmm4, %xmm5, %xmm5 vpermd %ymm5,%ymm7,%ymm5 vmovdqu %ymm5,($out) lea 32($out),$out dec $power jnz .Loop_gather_1024 vpxor %ymm0,%ymm0,%ymm0 vmovdqu %ymm0,($out) vzeroupper ___ $code.=<<___ if ($win64); movaps -0xa8(%r11),%xmm6 movaps -0x98(%r11),%xmm7 movaps -0x88(%r11),%xmm8 movaps -0x78(%r11),%xmm9 movaps -0x68(%r11),%xmm10 movaps -0x58(%r11),%xmm11 movaps -0x48(%r11),%xmm12 movaps -0x38(%r11),%xmm13 movaps -0x28(%r11),%xmm14 movaps -0x18(%r11),%xmm15 ___ $code.=<<___; lea (%r11),%rsp .cfi_def_cfa_register %rsp ret .cfi_endproc .LSEH_end_rsaz_1024_gather5: .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 ___ } $code.=<<___; .extern OPENSSL_ia32cap_P .globl rsaz_avx2_eligible .type rsaz_avx2_eligible,\@abi-omnipotent .align 32 rsaz_avx2_eligible: mov OPENSSL_ia32cap_P+8(%rip),%eax ___ $code.=<<___ if ($addx); mov \$`1<<8|1<<19`,%ecx mov \$0,%edx and %eax,%ecx cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X cmove %edx,%eax ___ $code.=<<___; and \$`1<<5`,%eax shr \$5,%eax ret .size rsaz_avx2_eligible,.-rsaz_avx2_eligible .align 64 .Land_mask: .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff .Lscatter_permd: .long 0,2,4,6,7,7,7,7 .Lgather_permd: .long 0,7,1,7,2,7,3,7 .Linc: .long 0,0,0,0, 1,1,1,1 .long 2,2,2,2, 3,3,3,3 .long 4,4,4,4, 4,4,4,4 .align 64 ___ if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___ .extern __imp_RtlVirtualUnwind .type rsaz_se_handler,\@abi-omnipotent .align 16 rsaz_se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue 
label cmp %r10,%rbx # context->Rip<prologue label jb .Lcommon_seh_tail mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 160($context),%rbp # pull context->Rbp mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # "in tail" label cmp %r10,%rbx # context->Rip>="in tail" label cmovc %rbp,%rax mov -48(%rax),%r15 mov -40(%rax),%r14 mov -32(%rax),%r13 mov -24(%rax),%r12 mov -16(%rax),%rbp mov -8(%rax),%rbx mov %r15,240($context) mov %r14,232($context) mov %r13,224($context) mov %r12,216($context) mov %rbp,160($context) mov %rbx,144($context) lea -0xd8(%rax),%rsi # %xmm save area lea 512($context),%rdi # & context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size rsaz_se_handler,.-rsaz_se_handler .section .pdata .align 4 .rva .LSEH_begin_rsaz_1024_sqr_avx2 .rva .LSEH_end_rsaz_1024_sqr_avx2 .rva .LSEH_info_rsaz_1024_sqr_avx2 .rva .LSEH_begin_rsaz_1024_mul_avx2 .rva .LSEH_end_rsaz_1024_mul_avx2 .rva .LSEH_info_rsaz_1024_mul_avx2 .rva .LSEH_begin_rsaz_1024_gather5 .rva .LSEH_end_rsaz_1024_gather5 .rva .LSEH_info_rsaz_1024_gather5 .section .xdata .align 8
.LSEH_info_rsaz_1024_sqr_avx2: .byte 9,0,0,0 .rva rsaz_se_handler .rva .Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail .long 0 .LSEH_info_rsaz_1024_mul_avx2: .byte 9,0,0,0 .rva rsaz_se_handler .rva .Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail .long 0 .LSEH_info_rsaz_1024_gather5: .byte 0x01,0x36,0x17,0x0b .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 ___ } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; print $_,"\n"; } }}} else {{{ print <<___; # assembler is too old .text .globl rsaz_avx2_eligible .type rsaz_avx2_eligible,\@abi-omnipotent rsaz_avx2_eligible: xor %eax,%eax ret .size rsaz_avx2_eligible,.-rsaz_avx2_eligible .globl rsaz_1024_sqr_avx2 .globl rsaz_1024_mul_avx2 .globl rsaz_1024_norm2red_avx2 .globl rsaz_1024_red2norm_avx2 .globl rsaz_1024_scatter5_avx2 .globl rsaz_1024_gather5_avx2 .type rsaz_1024_sqr_avx2,\@abi-omnipotent rsaz_1024_sqr_avx2: rsaz_1024_mul_avx2: rsaz_1024_norm2red_avx2: rsaz_1024_red2norm_avx2: rsaz_1024_scatter5_avx2: rsaz_1024_gather5_avx2: .byte 0x0f,0x0b # ud2 ret .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 ___ }}} close STDOUT or die "error closing STDOUT: $!"; 
Index: head/crypto/openssl/crypto/bn/asm/rsaz-x86_64.pl =================================================================== --- head/crypto/openssl/crypto/bn/asm/rsaz-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/bn/asm/rsaz-x86_64.pl (revision 364822) @@ -1,2431 +1,2431 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # Copyright (c) 2012, Intel Corporation. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) # (1) Intel Corporation, Israel Development Center, Haifa, Israel # (2) University of Haifa, Israel # # References: # [1] S. Gueron, "Efficient Software Implementations of Modular # Exponentiation", http://eprint.iacr.org/2011/239 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". # IEEE Proceedings of 9th International Conference on Information # Technology: New Generations (ITNG 2012), 821-823 (2012). # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation # Journal of Cryptographic Engineering 2:31-43 (2012). # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis # resistant 512-bit and 1024-bit modular exponentiation for optimizing # RSA1024 and RSA2048 on x86_64 platforms", # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest # # While original submission covers 512- and 1024-bit exponentiation, # this module is limited to 512-bit version only (and as such # accelerates RSA1024 sign). This is because improvement for longer # keys is not high enough to justify the effort, highest measured # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming # for the moment of this writing!] 
Nor does this module implement # "monolithic" complete exponentiation jumbo-subroutine, but adheres # to more modular mixture of C and assembly. And it's optimized even # for processors other than Intel Core family (see table below for # improvement coefficients). # # # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*) # ----------------+--------------------------- # Opteron +13% |+5% +20% # Bulldozer -0% |-1% +10% # P4 +11% |+7% +8% # Westmere +5% |+14% +17% # Sandy Bridge +2% |+12% +29% # Ivy Bridge +1% |+11% +35% # Haswell(**) -0% |+12% +39% # Atom +13% |+11% +4% # VIA Nano +70% |+9% +25% # # (*) rsax engine and fips numbers are presented for reference # purposes; # (**) MULX was attempted, but found to give only marginal improvement; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.23); } if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.10); } if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $addx = ($ver>=3.03); } ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API { my ($out,$inp,$mod,$n0,$times) = 
("%rdi","%rsi","%rdx","%rcx","%r8d"); $code.=<<___; .text .extern OPENSSL_ia32cap_P .globl rsaz_512_sqr .type rsaz_512_sqr,\@function,5 .align 32 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul .cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 subq \$128+24, %rsp .cfi_adjust_cfa_offset 128+24 .Lsqr_body: movq $mod, %xmm1 # common off-load movq ($inp), %rdx movq 8($inp), %rax movq $n0, 128(%rsp) ___ $code.=<<___ if ($addx); movl \$0x80100,%r11d andl OPENSSL_ia32cap_P+8(%rip),%r11d cmpl \$0x80100,%r11d # check for MULX and ADO/CX je .Loop_sqrx ___ $code.=<<___; jmp .Loop_sqr .align 32 .Loop_sqr: movl $times,128+8(%rsp) #first iteration movq %rdx, %rbx # 0($inp) mov %rax, %rbp # 8($inp) mulq %rdx movq %rax, %r8 movq 16($inp), %rax movq %rdx, %r9 mulq %rbx addq %rax, %r9 movq 24($inp), %rax movq %rdx, %r10 adcq \$0, %r10 mulq %rbx addq %rax, %r10 movq 32($inp), %rax movq %rdx, %r11 adcq \$0, %r11 mulq %rbx addq %rax, %r11 movq 40($inp), %rax movq %rdx, %r12 adcq \$0, %r12 mulq %rbx addq %rax, %r12 movq 48($inp), %rax movq %rdx, %r13 adcq \$0, %r13 mulq %rbx addq %rax, %r13 movq 56($inp), %rax movq %rdx, %r14 adcq \$0, %r14 mulq %rbx addq %rax, %r14 movq %rbx, %rax adcq \$0, %rdx xorq %rcx,%rcx # rcx:r8 = r8 << 1 addq %r8, %r8 movq %rdx, %r15 adcq \$0, %rcx mulq %rax addq %r8, %rdx adcq \$0, %rcx movq %rax, (%rsp) movq %rdx, 8(%rsp) #second iteration movq 16($inp), %rax mulq %rbp addq %rax, %r10 movq 24($inp), %rax movq %rdx, %rbx adcq \$0, %rbx mulq %rbp addq %rax, %r11 movq 32($inp), %rax adcq \$0, %rdx addq %rbx, %r11 movq %rdx, %rbx adcq \$0, %rbx mulq %rbp addq %rax, %r12 movq 40($inp), %rax adcq \$0, %rdx addq %rbx, %r12 movq %rdx, %rbx adcq \$0, %rbx mulq %rbp addq %rax, %r13 movq 48($inp), %rax adcq \$0, %rdx addq %rbx, %r13 movq %rdx, %rbx adcq \$0, %rbx mulq %rbp addq %rax, %r14 movq 56($inp), %rax adcq \$0, %rdx addq %rbx, %r14 movq %rdx, %rbx 
adcq \$0, %rbx mulq %rbp addq %rax, %r15 movq %rbp, %rax adcq \$0, %rdx addq %rbx, %r15 adcq \$0, %rdx xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1 addq %r9, %r9 movq %rdx, %r8 adcq %r10, %r10 adcq \$0, %rbx mulq %rax # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rcx, %rax movq 16($inp), %rbp addq %rax, %r9 movq 24($inp), %rax adcq %rdx, %r10 adcq \$0, %rbx movq %r9, 16(%rsp) movq %r10, 24(%rsp) #third iteration mulq %rbp addq %rax, %r12 movq 32($inp), %rax movq %rdx, %rcx adcq \$0, %rcx mulq %rbp addq %rax, %r13 movq 40($inp), %rax adcq \$0, %rdx addq %rcx, %r13 movq %rdx, %rcx adcq \$0, %rcx mulq %rbp addq %rax, %r14 movq 48($inp), %rax adcq \$0, %rdx addq %rcx, %r14 movq %rdx, %rcx adcq \$0, %rcx mulq %rbp addq %rax, %r15 movq 56($inp), %rax adcq \$0, %rdx addq %rcx, %r15 movq %rdx, %rcx adcq \$0, %rcx mulq %rbp addq %rax, %r8 movq %rbp, %rax adcq \$0, %rdx addq %rcx, %r8 adcq \$0, %rdx xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1 addq %r11, %r11 movq %rdx, %r9 adcq %r12, %r12 adcq \$0, %rcx mulq %rax # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rbx, %rax movq 24($inp), %r10 addq %rax, %r11 movq 32($inp), %rax adcq %rdx, %r12 adcq \$0, %rcx movq %r11, 32(%rsp) movq %r12, 40(%rsp) #fourth iteration mov %rax, %r11 # 32($inp) mulq %r10 addq %rax, %r14 movq 40($inp), %rax movq %rdx, %rbx adcq \$0, %rbx mov %rax, %r12 # 40($inp) mulq %r10 addq %rax, %r15 movq 48($inp), %rax adcq \$0, %rdx addq %rbx, %r15 movq %rdx, %rbx adcq \$0, %rbx mov %rax, %rbp # 48($inp) mulq %r10 addq %rax, %r8 movq 56($inp), %rax adcq \$0, %rdx addq %rbx, %r8 movq %rdx, %rbx adcq \$0, %rbx mulq %r10 addq %rax, %r9 movq %r10, %rax adcq \$0, %rdx addq %rbx, %r9 adcq \$0, %rdx xorq %rbx, %rbx # rbx:r13:r14 = r13:r14 << 1 addq %r13, %r13 movq %rdx, %r10 adcq %r14, %r14 adcq \$0, %rbx mulq %rax # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rcx, %rax addq %rax, %r13 movq %r12, %rax # 40($inp) adcq %rdx, %r14 adcq \$0, %rbx movq %r13, 
48(%rsp) movq %r14, 56(%rsp) #fifth iteration mulq %r11 addq %rax, %r8 movq %rbp, %rax # 48($inp) movq %rdx, %rcx adcq \$0, %rcx mulq %r11 addq %rax, %r9 movq 56($inp), %rax adcq \$0, %rdx addq %rcx, %r9 movq %rdx, %rcx adcq \$0, %rcx mov %rax, %r14 # 56($inp) mulq %r11 addq %rax, %r10 movq %r11, %rax adcq \$0, %rdx addq %rcx, %r10 adcq \$0, %rdx xorq %rcx, %rcx # rcx:r8:r15 = r8:r15 << 1 addq %r15, %r15 movq %rdx, %r11 adcq %r8, %r8 adcq \$0, %rcx mulq %rax # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rbx, %rax addq %rax, %r15 movq %rbp, %rax # 48($inp) adcq %rdx, %r8 adcq \$0, %rcx movq %r15, 64(%rsp) movq %r8, 72(%rsp) #sixth iteration mulq %r12 addq %rax, %r10 movq %r14, %rax # 56($inp) movq %rdx, %rbx adcq \$0, %rbx mulq %r12 addq %rax, %r11 movq %r12, %rax adcq \$0, %rdx addq %rbx, %r11 adcq \$0, %rdx xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1 addq %r9, %r9 movq %rdx, %r12 adcq %r10, %r10 adcq \$0, %rbx mulq %rax # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rcx, %rax addq %rax, %r9 movq %r14, %rax # 56($inp) adcq %rdx, %r10 adcq \$0, %rbx movq %r9, 80(%rsp) movq %r10, 88(%rsp) #seventh iteration mulq %rbp addq %rax, %r12 movq %rbp, %rax adcq \$0, %rdx xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1 addq %r11, %r11 movq %rdx, %r13 adcq %r12, %r12 adcq \$0, %rcx mulq %rax # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rbx, %rax addq %rax, %r11 movq %r14, %rax # 56($inp) adcq %rdx, %r12 adcq \$0, %rcx movq %r11, 96(%rsp) movq %r12, 104(%rsp) #eighth iteration xorq %rbx, %rbx # rbx:r13 = r13 << 1 addq %r13, %r13 adcq \$0, %rbx mulq %rax # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rcx, %rax addq %r13, %rax adcq %rbx, %rdx movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 movq %xmm1, %rbp movq %rax, 112(%rsp) movq %rdx, 120(%rsp) call __rsaz_512_reduce addq 64(%rsp), %r8 adcq 
72(%rsp), %r9 adcq 80(%rsp), %r10 adcq 88(%rsp), %r11 adcq 96(%rsp), %r12 adcq 104(%rsp), %r13 adcq 112(%rsp), %r14 adcq 120(%rsp), %r15 sbbq %rcx, %rcx call __rsaz_512_subtract movq %r8, %rdx movq %r9, %rax movl 128+8(%rsp), $times movq $out, $inp decl $times jnz .Loop_sqr ___ if ($addx) { $code.=<<___; jmp .Lsqr_tail .align 32 .Loop_sqrx: movl $times,128+8(%rsp) movq $out, %xmm0 # off-load #first iteration mulx %rax, %r8, %r9 mov %rax, %rbx mulx 16($inp), %rcx, %r10 xor %rbp, %rbp # cf=0, of=0 mulx 24($inp), %rax, %r11 adcx %rcx, %r9 .byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12 adcx %rax, %r10 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13 adcx %rcx, %r11 mulx 48($inp), %rcx, %r14 adcx %rax, %r12 adcx %rcx, %r13 mulx 56($inp), %rax, %r15 adcx %rax, %r14 adcx %rbp, %r15 # %rbp is 0 mulx %rdx, %rax, $out mov %rbx, %rdx # 8($inp) xor %rcx, %rcx adox %r8, %r8 adcx $out, %r8 adox %rbp, %rcx adcx %rbp, %rcx mov %rax, (%rsp) mov %r8, 8(%rsp) #second iteration .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx adox %rax, %r10 adcx %rbx, %r11 mulx 24($inp), $out, %r8 adox $out, %r11 .byte 0x66 adcx %r8, %r12 mulx 32($inp), %rax, %rbx adox %rax, %r12 adcx %rbx, %r13 mulx 40($inp), $out, %r8 adox $out, %r13 adcx %r8, %r14 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx adox %rax, %r14 adcx %rbx, %r15 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8 adox $out, %r15 adcx %rbp, %r8 mulx %rdx, %rax, $out adox %rbp, %r8 .byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx xor %rbx, %rbx adox %r9, %r9 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rcx, %rax adox %r10, %r10 adcx %rax, %r9 adox %rbp, %rbx adcx $out, %r10 adcx %rbp, %rbx mov %r9, 16(%rsp) .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp) #third iteration mulx 24($inp), $out, %r9 adox $out, %r12 adcx %r9, %r13 mulx 
32($inp), %rax, %rcx adox %rax, %r13 adcx %rcx, %r14 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9 adox $out, %r14 adcx %r9, %r15 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx adox %rax, %r15 adcx %rcx, %r8 mulx 56($inp), $out, %r9 adox $out, %r8 adcx %rbp, %r9 mulx %rdx, %rax, $out adox %rbp, %r9 mov 24($inp), %rdx xor %rcx, %rcx adox %r11, %r11 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rbx, %rax adox %r12, %r12 adcx %rax, %r11 adox %rbp, %rcx adcx $out, %r12 adcx %rbp, %rcx mov %r11, 32(%rsp) mov %r12, 40(%rsp) #fourth iteration mulx 32($inp), %rax, %rbx adox %rax, %r14 adcx %rbx, %r15 mulx 40($inp), $out, %r10 adox $out, %r15 adcx %r10, %r8 mulx 48($inp), %rax, %rbx adox %rax, %r8 adcx %rbx, %r9 mulx 56($inp), $out, %r10 adox $out, %r9 adcx %rbp, %r10 mulx %rdx, %rax, $out adox %rbp, %r10 mov 32($inp), %rdx xor %rbx, %rbx adox %r13, %r13 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rcx, %rax adox %r14, %r14 adcx %rax, %r13 adox %rbp, %rbx adcx $out, %r14 adcx %rbp, %rbx mov %r13, 48(%rsp) mov %r14, 56(%rsp) #fifth iteration mulx 40($inp), $out, %r11 adox $out, %r8 adcx %r11, %r9 mulx 48($inp), %rax, %rcx adox %rax, %r9 adcx %rcx, %r10 mulx 56($inp), $out, %r11 adox $out, %r10 adcx %rbp, %r11 mulx %rdx, %rax, $out mov 40($inp), %rdx adox %rbp, %r11 xor %rcx, %rcx adox %r15, %r15 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rbx, %rax adox %r8, %r8 adcx %rax, %r15 adox %rbp, %rcx adcx $out, %r8 adcx %rbp, %rcx mov %r15, 64(%rsp) mov %r8, 72(%rsp) #sixth iteration .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx adox %rax, %r10 adcx %rbx, %r11 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12 adox $out, %r11 adcx %rbp, %r12 mulx %rdx, %rax, $out adox %rbp, %r12 mov 48($inp), %rdx xor %rbx, %rbx adox %r9, %r9 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx 
%rcx, %rax adox %r10, %r10 adcx %rax, %r9 adcx $out, %r10 adox %rbp, %rbx adcx %rbp, %rbx mov %r9, 80(%rsp) mov %r10, 88(%rsp) #seventh iteration .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13 adox %rax, %r12 adox %rbp, %r13 mulx %rdx, %rax, $out xor %rcx, %rcx mov 56($inp), %rdx adox %r11, %r11 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rbx, %rax adox %r12, %r12 adcx %rax, %r11 adox %rbp, %rcx adcx $out, %r12 adcx %rbp, %rcx .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp) .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp) #eighth iteration mulx %rdx, %rax, %rdx xor %rbx, %rbx adox %r13, %r13 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rcx, %rax adox %rbp, %rbx adcx %r13, %rax adcx %rdx, %rbx movq %xmm0, $out movq %xmm1, %rbp movq 128(%rsp), %rdx # pull $n0 movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 movq %rax, 112(%rsp) movq %rbx, 120(%rsp) call __rsaz_512_reducex addq 64(%rsp), %r8 adcq 72(%rsp), %r9 adcq 80(%rsp), %r10 adcq 88(%rsp), %r11 adcq 96(%rsp), %r12 adcq 104(%rsp), %r13 adcq 112(%rsp), %r14 adcq 120(%rsp), %r15 sbbq %rcx, %rcx call __rsaz_512_subtract movq %r8, %rdx movq %r9, %rax movl 128+8(%rsp), $times movq $out, $inp decl $times jnz .Loop_sqrx .Lsqr_tail: ___ } $code.=<<___; leaq 128+24+48(%rsp), %rax .cfi_def_cfa %rax,8 movq -48(%rax), %r15 .cfi_restore %r15 movq -40(%rax), %r14 .cfi_restore %r14 movq -32(%rax), %r13 .cfi_restore %r13 movq -24(%rax), %r12 .cfi_restore %r12 movq -16(%rax), %rbp .cfi_restore %rbp movq -8(%rax), %rbx .cfi_restore %rbx leaq (%rax), %rsp .cfi_def_cfa_register %rsp .Lsqr_epilogue: ret .cfi_endproc .size rsaz_512_sqr,.-rsaz_512_sqr ___ } { my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); $code.=<<___; .globl rsaz_512_mul .type rsaz_512_mul,\@function,5 .align 32 rsaz_512_mul: 
.cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 subq \$128+24, %rsp .cfi_adjust_cfa_offset 128+24 .Lmul_body: movq $out, %xmm0 # off-load arguments movq $mod, %xmm1 movq $n0, 128(%rsp) ___ $code.=<<___ if ($addx); movl \$0x80100,%r11d andl OPENSSL_ia32cap_P+8(%rip),%r11d cmpl \$0x80100,%r11d # check for MULX and ADO/CX je .Lmulx ___ $code.=<<___; movq ($bp), %rbx # pass b[0] movq $bp, %rbp # pass argument call __rsaz_512_mul movq %xmm0, $out movq %xmm1, %rbp movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 call __rsaz_512_reduce ___ $code.=<<___ if ($addx); jmp .Lmul_tail .align 32 .Lmulx: movq $bp, %rbp # pass argument movq ($bp), %rdx # pass b[0] call __rsaz_512_mulx movq %xmm0, $out movq %xmm1, %rbp movq 128(%rsp), %rdx # pull $n0 movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 call __rsaz_512_reducex .Lmul_tail: ___ $code.=<<___; addq 64(%rsp), %r8 adcq 72(%rsp), %r9 adcq 80(%rsp), %r10 adcq 88(%rsp), %r11 adcq 96(%rsp), %r12 adcq 104(%rsp), %r13 adcq 112(%rsp), %r14 adcq 120(%rsp), %r15 sbbq %rcx, %rcx call __rsaz_512_subtract leaq 128+24+48(%rsp), %rax .cfi_def_cfa %rax,8 movq -48(%rax), %r15 .cfi_restore %r15 movq -40(%rax), %r14 .cfi_restore %r14 movq -32(%rax), %r13 .cfi_restore %r13 movq -24(%rax), %r12 .cfi_restore %r12 movq -16(%rax), %rbp .cfi_restore %rbp movq -8(%rax), %rbx .cfi_restore %rbx leaq (%rax), %rsp .cfi_def_cfa_register %rsp .Lmul_epilogue: ret .cfi_endproc .size rsaz_512_mul,.-rsaz_512_mul ___ } { my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); $code.=<<___; .globl rsaz_512_mul_gather4 .type rsaz_512_mul_gather4,\@function,6 .align 32 rsaz_512_mul_gather4: .cfi_startproc push %rbx 
.cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 subq \$`128+24+($win64?0xb0:0)`, %rsp .cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)` ___ $code.=<<___ if ($win64); movaps %xmm6,0xa0(%rsp) movaps %xmm7,0xb0(%rsp) movaps %xmm8,0xc0(%rsp) movaps %xmm9,0xd0(%rsp) movaps %xmm10,0xe0(%rsp) movaps %xmm11,0xf0(%rsp) movaps %xmm12,0x100(%rsp) movaps %xmm13,0x110(%rsp) movaps %xmm14,0x120(%rsp) movaps %xmm15,0x130(%rsp) ___ $code.=<<___; .Lmul_gather4_body: movd $pwr,%xmm8 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 pshufd \$0,%xmm8,%xmm8 # broadcast $power movdqa %xmm1,%xmm7 movdqa %xmm1,%xmm2 ___ ######################################################################## # calculate mask by comparing 0..15 to $power # for($i=0;$i<4;$i++) { $code.=<<___; paddd %xmm`$i`,%xmm`$i+1` pcmpeqd %xmm8,%xmm`$i` movdqa %xmm7,%xmm`$i+3` ___ } for(;$i<7;$i++) { $code.=<<___; paddd %xmm`$i`,%xmm`$i+1` pcmpeqd %xmm8,%xmm`$i` ___ } $code.=<<___; pcmpeqd %xmm8,%xmm7 movdqa 16*0($bp),%xmm8 movdqa 16*1($bp),%xmm9 movdqa 16*2($bp),%xmm10 movdqa 16*3($bp),%xmm11 pand %xmm0,%xmm8 movdqa 16*4($bp),%xmm12 pand %xmm1,%xmm9 movdqa 16*5($bp),%xmm13 pand %xmm2,%xmm10 movdqa 16*6($bp),%xmm14 pand %xmm3,%xmm11 movdqa 16*7($bp),%xmm15 leaq 128($bp), %rbp pand %xmm4,%xmm12 pand %xmm5,%xmm13 pand %xmm6,%xmm14 pand %xmm7,%xmm15 por %xmm10,%xmm8 por %xmm11,%xmm9 por %xmm12,%xmm8 por %xmm13,%xmm9 por %xmm14,%xmm8 por %xmm15,%xmm9 por %xmm9,%xmm8 pshufd \$0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 ___ $code.=<<___ if ($addx); movl \$0x80100,%r11d andl OPENSSL_ia32cap_P+8(%rip),%r11d cmpl \$0x80100,%r11d # check for MULX and ADO/CX je .Lmulx_gather ___ $code.=<<___; movq %xmm8,%rbx movq $n0, 128(%rsp) # off-load arguments movq $out, 128+8(%rsp) movq $mod, 128+16(%rsp) movq ($ap), %rax movq 8($ap), %rcx mulq %rbx # 0 iteration movq %rax, (%rsp) movq %rcx, 
%rax movq %rdx, %r8 mulq %rbx addq %rax, %r8 movq 16($ap), %rax movq %rdx, %r9 adcq \$0, %r9 mulq %rbx addq %rax, %r9 movq 24($ap), %rax movq %rdx, %r10 adcq \$0, %r10 mulq %rbx addq %rax, %r10 movq 32($ap), %rax movq %rdx, %r11 adcq \$0, %r11 mulq %rbx addq %rax, %r11 movq 40($ap), %rax movq %rdx, %r12 adcq \$0, %r12 mulq %rbx addq %rax, %r12 movq 48($ap), %rax movq %rdx, %r13 adcq \$0, %r13 mulq %rbx addq %rax, %r13 movq 56($ap), %rax movq %rdx, %r14 adcq \$0, %r14 mulq %rbx addq %rax, %r14 movq ($ap), %rax movq %rdx, %r15 adcq \$0, %r15 leaq 8(%rsp), %rdi movl \$7, %ecx jmp .Loop_mul_gather .align 32 .Loop_mul_gather: movdqa 16*0(%rbp),%xmm8 movdqa 16*1(%rbp),%xmm9 movdqa 16*2(%rbp),%xmm10 movdqa 16*3(%rbp),%xmm11 pand %xmm0,%xmm8 movdqa 16*4(%rbp),%xmm12 pand %xmm1,%xmm9 movdqa 16*5(%rbp),%xmm13 pand %xmm2,%xmm10 movdqa 16*6(%rbp),%xmm14 pand %xmm3,%xmm11 movdqa 16*7(%rbp),%xmm15 leaq 128(%rbp), %rbp pand %xmm4,%xmm12 pand %xmm5,%xmm13 pand %xmm6,%xmm14 pand %xmm7,%xmm15 por %xmm10,%xmm8 por %xmm11,%xmm9 por %xmm12,%xmm8 por %xmm13,%xmm9 por %xmm14,%xmm8 por %xmm15,%xmm9 por %xmm9,%xmm8 pshufd \$0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 movq %xmm8,%rbx mulq %rbx addq %rax, %r8 movq 8($ap), %rax movq %r8, (%rdi) movq %rdx, %r8 adcq \$0, %r8 mulq %rbx addq %rax, %r9 movq 16($ap), %rax adcq \$0, %rdx addq %r9, %r8 movq %rdx, %r9 adcq \$0, %r9 mulq %rbx addq %rax, %r10 movq 24($ap), %rax adcq \$0, %rdx addq %r10, %r9 movq %rdx, %r10 adcq \$0, %r10 mulq %rbx addq %rax, %r11 movq 32($ap), %rax adcq \$0, %rdx addq %r11, %r10 movq %rdx, %r11 adcq \$0, %r11 mulq %rbx addq %rax, %r12 movq 40($ap), %rax adcq \$0, %rdx addq %r12, %r11 movq %rdx, %r12 adcq \$0, %r12 mulq %rbx addq %rax, %r13 movq 48($ap), %rax adcq \$0, %rdx addq %r13, %r12 movq %rdx, %r13 adcq \$0, %r13 mulq %rbx addq %rax, %r14 movq 56($ap), %rax adcq \$0, %rdx addq %r14, %r13 movq %rdx, %r14 adcq \$0, %r14 mulq %rbx addq %rax, %r15 movq ($ap), %rax adcq \$0, %rdx addq %r15, %r14 movq %rdx, %r15 adcq \$0, %r15 
leaq 8(%rdi), %rdi decl %ecx jnz .Loop_mul_gather movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, 32(%rdi) movq %r13, 40(%rdi) movq %r14, 48(%rdi) movq %r15, 56(%rdi) movq 128+8(%rsp), $out movq 128+16(%rsp), %rbp movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 call __rsaz_512_reduce ___ $code.=<<___ if ($addx); jmp .Lmul_gather_tail .align 32 .Lmulx_gather: movq %xmm8,%rdx mov $n0, 128(%rsp) # off-load arguments mov $out, 128+8(%rsp) mov $mod, 128+16(%rsp) mulx ($ap), %rbx, %r8 # 0 iteration mov %rbx, (%rsp) xor %edi, %edi # cf=0, of=0 mulx 8($ap), %rax, %r9 mulx 16($ap), %rbx, %r10 adcx %rax, %r8 mulx 24($ap), %rax, %r11 adcx %rbx, %r9 mulx 32($ap), %rbx, %r12 adcx %rax, %r10 mulx 40($ap), %rax, %r13 adcx %rbx, %r11 mulx 48($ap), %rbx, %r14 adcx %rax, %r12 mulx 56($ap), %rax, %r15 adcx %rbx, %r13 adcx %rax, %r14 .byte 0x67 mov %r8, %rbx adcx %rdi, %r15 # %rdi is 0 mov \$-7, %rcx jmp .Loop_mulx_gather .align 32 .Loop_mulx_gather: movdqa 16*0(%rbp),%xmm8 movdqa 16*1(%rbp),%xmm9 movdqa 16*2(%rbp),%xmm10 movdqa 16*3(%rbp),%xmm11 pand %xmm0,%xmm8 movdqa 16*4(%rbp),%xmm12 pand %xmm1,%xmm9 movdqa 16*5(%rbp),%xmm13 pand %xmm2,%xmm10 movdqa 16*6(%rbp),%xmm14 pand %xmm3,%xmm11 movdqa 16*7(%rbp),%xmm15 leaq 128(%rbp), %rbp pand %xmm4,%xmm12 pand %xmm5,%xmm13 pand %xmm6,%xmm14 pand %xmm7,%xmm15 por %xmm10,%xmm8 por %xmm11,%xmm9 por %xmm12,%xmm8 por %xmm13,%xmm9 por %xmm14,%xmm8 por %xmm15,%xmm9 por %xmm9,%xmm8 pshufd \$0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 movq %xmm8,%rdx .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8 adcx %rax, %rbx adox %r9, %r8 mulx 8($ap), %rax, %r9 adcx %rax, %r8 adox %r10, %r9 mulx 16($ap), %rax, %r10 adcx %rax, %r9 adox %r11, %r10 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 adcx %rax, %r10 adox %r12, %r11 mulx 32($ap), %rax, %r12 adcx %rax, %r11 adox 
%r13, %r12 mulx 40($ap), %rax, %r13 adcx %rax, %r12 adox %r14, %r13 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 adcx %rax, %r13 .byte 0x67 adox %r15, %r14 mulx 56($ap), %rax, %r15 mov %rbx, 64(%rsp,%rcx,8) adcx %rax, %r14 adox %rdi, %r15 mov %r8, %rbx adcx %rdi, %r15 # cf=0 inc %rcx # of=0 jnz .Loop_mulx_gather mov %r8, 64(%rsp) mov %r9, 64+8(%rsp) mov %r10, 64+16(%rsp) mov %r11, 64+24(%rsp) mov %r12, 64+32(%rsp) mov %r13, 64+40(%rsp) mov %r14, 64+48(%rsp) mov %r15, 64+56(%rsp) mov 128(%rsp), %rdx # pull arguments mov 128+8(%rsp), $out mov 128+16(%rsp), %rbp mov (%rsp), %r8 mov 8(%rsp), %r9 mov 16(%rsp), %r10 mov 24(%rsp), %r11 mov 32(%rsp), %r12 mov 40(%rsp), %r13 mov 48(%rsp), %r14 mov 56(%rsp), %r15 call __rsaz_512_reducex .Lmul_gather_tail: ___ $code.=<<___; addq 64(%rsp), %r8 adcq 72(%rsp), %r9 adcq 80(%rsp), %r10 adcq 88(%rsp), %r11 adcq 96(%rsp), %r12 adcq 104(%rsp), %r13 adcq 112(%rsp), %r14 adcq 120(%rsp), %r15 sbbq %rcx, %rcx call __rsaz_512_subtract leaq 128+24+48(%rsp), %rax ___ $code.=<<___ if ($win64); movaps 0xa0-0xc8(%rax),%xmm6 movaps 0xb0-0xc8(%rax),%xmm7 movaps 0xc0-0xc8(%rax),%xmm8 movaps 0xd0-0xc8(%rax),%xmm9 movaps 0xe0-0xc8(%rax),%xmm10 movaps 0xf0-0xc8(%rax),%xmm11 movaps 0x100-0xc8(%rax),%xmm12 movaps 0x110-0xc8(%rax),%xmm13 movaps 0x120-0xc8(%rax),%xmm14 movaps 0x130-0xc8(%rax),%xmm15 lea 0xb0(%rax),%rax ___ $code.=<<___; .cfi_def_cfa %rax,8 movq -48(%rax), %r15 .cfi_restore %r15 movq -40(%rax), %r14 .cfi_restore %r14 movq -32(%rax), %r13 .cfi_restore %r13 movq -24(%rax), %r12 .cfi_restore %r12 movq -16(%rax), %rbp .cfi_restore %rbp movq -8(%rax), %rbx .cfi_restore %rbx leaq (%rax), %rsp .cfi_def_cfa_register %rsp .Lmul_gather4_epilogue: ret .cfi_endproc .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 ___ } { my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); $code.=<<___; .globl rsaz_512_mul_scatter4 .type rsaz_512_mul_scatter4,\@function,6 .align 32 rsaz_512_mul_scatter4: 
.cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 mov $pwr, $pwr subq \$128+24, %rsp .cfi_adjust_cfa_offset 128+24 .Lmul_scatter4_body: leaq ($tbl,$pwr,8), $tbl movq $out, %xmm0 # off-load arguments movq $mod, %xmm1 movq $tbl, %xmm2 movq $n0, 128(%rsp) movq $out, %rbp ___ $code.=<<___ if ($addx); movl \$0x80100,%r11d andl OPENSSL_ia32cap_P+8(%rip),%r11d cmpl \$0x80100,%r11d # check for MULX and ADO/CX je .Lmulx_scatter ___ $code.=<<___; movq ($out),%rbx # pass b[0] call __rsaz_512_mul movq %xmm0, $out movq %xmm1, %rbp movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 call __rsaz_512_reduce ___ $code.=<<___ if ($addx); jmp .Lmul_scatter_tail .align 32 .Lmulx_scatter: movq ($out), %rdx # pass b[0] call __rsaz_512_mulx movq %xmm0, $out movq %xmm1, %rbp movq 128(%rsp), %rdx # pull $n0 movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 call __rsaz_512_reducex .Lmul_scatter_tail: ___ $code.=<<___; addq 64(%rsp), %r8 adcq 72(%rsp), %r9 adcq 80(%rsp), %r10 adcq 88(%rsp), %r11 adcq 96(%rsp), %r12 adcq 104(%rsp), %r13 adcq 112(%rsp), %r14 adcq 120(%rsp), %r15 movq %xmm2, $inp sbbq %rcx, %rcx call __rsaz_512_subtract movq %r8, 128*0($inp) # scatter movq %r9, 128*1($inp) movq %r10, 128*2($inp) movq %r11, 128*3($inp) movq %r12, 128*4($inp) movq %r13, 128*5($inp) movq %r14, 128*6($inp) movq %r15, 128*7($inp) leaq 128+24+48(%rsp), %rax .cfi_def_cfa %rax,8 movq -48(%rax), %r15 .cfi_restore %r15 movq -40(%rax), %r14 .cfi_restore %r14 movq -32(%rax), %r13 .cfi_restore %r13 movq -24(%rax), %r12 .cfi_restore %r12 movq -16(%rax), %rbp .cfi_restore %rbp movq -8(%rax), %rbx .cfi_restore %rbx leaq (%rax), %rsp .cfi_def_cfa_register %rsp .Lmul_scatter4_epilogue: ret 
.cfi_endproc
.size	rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}

{
# rsaz_512_mul_by_one
#
# arguments, as bound just below:
#	$out (%rdi) - receives the eight resulting 64-bit limbs
#	$inp (%rsi) - eight 64-bit limbs, loaded into %r8-%r15
#	$mod (%rdx) - copied to %rbp, where the reduce routines expect it
#	$n0  (%rcx) - parked at 128(%rsp) for the reduce routines
#
# After saving %rbx, %rbp and %r12-%r15 and reserving 128+24 bytes, the
# routine loads the input limbs, zeroes (%rsp)..111(%rsp) and performs a
# single reduction: __rsaz_512_reducex when $addx was set at generation
# time and both 0x80100 bits of OPENSSL_ia32cap_P+8 are set at run time,
# __rsaz_512_reduce otherwise.  %r8-%r15 are then stored to $out; unlike
# the other entry points visible in this file, __rsaz_512_subtract is
# not called afterwards.
# NOTE(review): the name suggests a Montgomery multiplication by 1, i.e.
# conversion out of Montgomery form - confirm against the C caller.
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
$code.=<<___;
.globl	rsaz_512_mul_by_one
.type	rsaz_512_mul_by_one,\@function,4
.align	32
rsaz_512_mul_by_one:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	subq	\$128+24, %rsp
.cfi_adjust_cfa_offset	128+24
.Lmul_by_one_body:
___
# The capability word is fetched early; it is only tested after the loads.
$code.=<<___ if ($addx);
	movl	OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
	movq	$mod, %rbp	# reassign argument
	movq	$n0, 128(%rsp)
	movq	($inp), %r8
	pxor	%xmm0, %xmm0
	movq	8($inp), %r9
	movq	16($inp), %r10
	movq	24($inp), %r11
	movq	32($inp), %r12
	movq	40($inp), %r13
	movq	48($inp), %r14
	movq	56($inp), %r15
	movdqa	%xmm0, (%rsp)
	movdqa	%xmm0, 16(%rsp)
	movdqa	%xmm0, 32(%rsp)
	movdqa	%xmm0, 48(%rsp)
	movdqa	%xmm0, 64(%rsp)
	movdqa	%xmm0, 80(%rsp)
	movdqa	%xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
	andl	\$0x80100,%eax
	cmpl	\$0x80100,%eax	# check for MULX and ADO/CX
	je	.Lby_one_callx
___
$code.=<<___;
	call	__rsaz_512_reduce
___
# The MULX/ADX path must reload n0 into %rdx before calling reducex.
$code.=<<___ if ($addx);
	jmp	.Lby_one_tail
.align	32
.Lby_one_callx:
	movq	128(%rsp), %rdx	# pull $n0
	call	__rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)
	leaq	128+24+48(%rsp), %rax
.cfi_def_cfa	%rax,8
	movq	-48(%rax), %r15
.cfi_restore	%r15
	movq	-40(%rax), %r14
.cfi_restore	%r14
	movq	-32(%rax), %r13
.cfi_restore	%r13
	movq	-24(%rax), %r12
.cfi_restore	%r12
	movq	-16(%rax), %rbp
.cfi_restore	%rbp
	movq	-8(%rax), %rbx
.cfi_restore	%rbx
	leaq	(%rax), %rsp
.cfi_def_cfa_register	%rsp
.Lmul_by_one_epilogue:
	ret
.cfi_endproc
.size	rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{	# __rsaz_512_reduce
	#
	# input:	%r8-%r15, %rbp - mod, 128(%rsp) - n0
	#		(128(%rsp) is the caller's view; the routine itself
	#		reads n0 at 128+8(%rsp), past the return address)
	# output:	%r8-%r15
	# clobbers:	everything except %rbp and %rdi
$code.=<<___;
.type
__rsaz_512_reduce,\@abi-omnipotent .align 32 __rsaz_512_reduce: .cfi_startproc movq %r8, %rbx imulq 128+8(%rsp), %rbx movq 0(%rbp), %rax movl \$8, %ecx jmp .Lreduction_loop .align 32 .Lreduction_loop: mulq %rbx movq 8(%rbp), %rax negq %r8 movq %rdx, %r8 adcq \$0, %r8 mulq %rbx addq %rax, %r9 movq 16(%rbp), %rax adcq \$0, %rdx addq %r9, %r8 movq %rdx, %r9 adcq \$0, %r9 mulq %rbx addq %rax, %r10 movq 24(%rbp), %rax adcq \$0, %rdx addq %r10, %r9 movq %rdx, %r10 adcq \$0, %r10 mulq %rbx addq %rax, %r11 movq 32(%rbp), %rax adcq \$0, %rdx addq %r11, %r10 movq 128+8(%rsp), %rsi #movq %rdx, %r11 #adcq \$0, %r11 adcq \$0, %rdx movq %rdx, %r11 mulq %rbx addq %rax, %r12 movq 40(%rbp), %rax adcq \$0, %rdx imulq %r8, %rsi addq %r12, %r11 movq %rdx, %r12 adcq \$0, %r12 mulq %rbx addq %rax, %r13 movq 48(%rbp), %rax adcq \$0, %rdx addq %r13, %r12 movq %rdx, %r13 adcq \$0, %r13 mulq %rbx addq %rax, %r14 movq 56(%rbp), %rax adcq \$0, %rdx addq %r14, %r13 movq %rdx, %r14 adcq \$0, %r14 mulq %rbx movq %rsi, %rbx addq %rax, %r15 movq 0(%rbp), %rax adcq \$0, %rdx addq %r15, %r14 movq %rdx, %r15 adcq \$0, %r15 decl %ecx jne .Lreduction_loop ret .cfi_endproc .size __rsaz_512_reduce,.-__rsaz_512_reduce ___ } if ($addx) { # __rsaz_512_reducex # # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 # output: %r8-%r15 # clobbers: everything except %rbp and %rdi $code.=<<___; .type __rsaz_512_reducex,\@abi-omnipotent .align 32 __rsaz_512_reducex: .cfi_startproc #movq 128+8(%rsp), %rdx # pull $n0 imulq %r8, %rdx xorq %rsi, %rsi # cf=0,of=0 movl \$8, %ecx jmp .Lreduction_loopx .align 32 .Lreduction_loopx: mov %r8, %rbx mulx 0(%rbp), %rax, %r8 adcx %rbx, %rax adox %r9, %r8 mulx 8(%rbp), %rax, %r9 adcx %rax, %r8 adox %r10, %r9 mulx 16(%rbp), %rbx, %r10 adcx %rbx, %r9 adox %r11, %r10 mulx 24(%rbp), %rbx, %r11 adcx %rbx, %r10 adox %r12, %r11 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12 mov %rdx, %rax mov %r8, %rdx adcx %rbx, %r11 adox %r13, %r12 mulx 128+8(%rsp), %rbx, %rdx 
mov	%rax, %rdx
	mulx	40(%rbp), %rax, %r13
	adcx	%rax, %r12
	adox	%r14, %r13
	.byte	0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00	# mulx 48(%rbp), %rax, %r14
	adcx	%rax, %r13
	adox	%r15, %r14
	mulx	56(%rbp), %rax, %r15
	mov	%rbx, %rdx
	adcx	%rax, %r14
	adox	%rsi, %r15	# %rsi is 0
	adcx	%rsi, %r15	# cf=0
	decl	%ecx	# of=0
	jne	.Lreduction_loopx
	ret
.cfi_endproc
.size	__rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{	# __rsaz_512_subtract
	# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
	# output: eight limbs stored at ($out), also left in %r8-%r15
	# clobbers: everything but %rdi, %rsi and %rbp
	#
	# Stores %r8-%r15 at ($out), then adds (mask AND -mod) to that value
	# limb by limb with carry and writes the sum back to ($out).
	# -mod is built as negq on limb 0 and notq on limbs 1..7; that is the
	# two's complement only when limb 0 of mod is non-zero (presumably
	# guaranteed because the modulus is odd - confirm with the callers).
	# The callers visible in this part of the file derive the mask with
	# "sbbq %rcx, %rcx", so it is either 0 (value left unchanged) or all
	# ones (mod subtracted once, modulo 2^512).
$code.=<<___;
.type	__rsaz_512_subtract,\@abi-omnipotent
.align	32
__rsaz_512_subtract:
.cfi_startproc
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)
	movq	0($mod), %r8
	movq	8($mod), %r9
	negq	%r8
	notq	%r9
	andq	%rcx, %r8
	movq	16($mod), %r10
	andq	%rcx, %r9
	notq	%r10
	movq	24($mod), %r11
	andq	%rcx, %r10
	notq	%r11
	movq	32($mod), %r12
	andq	%rcx, %r11
	notq	%r12
	movq	40($mod), %r13
	andq	%rcx, %r12
	notq	%r13
	movq	48($mod), %r14
	andq	%rcx, %r13
	notq	%r14
	movq	56($mod), %r15
	andq	%rcx, %r14
	notq	%r15
	andq	%rcx, %r15
	addq	($out), %r8
	adcq	8($out), %r9
	adcq	16($out), %r10
	adcq	24($out), %r11
	adcq	32($out), %r12
	adcq	40($out), %r13
	adcq	48($out), %r14
	adcq	56($out), %r15
	movq	%r8, ($out)
	movq	%r9, 8($out)
	movq	%r10, 16($out)
	movq	%r11, 24($out)
	movq	%r12, 32($out)
	movq	%r13, 40($out)
	movq	%r14, 48($out)
	movq	%r15, 56($out)
	ret
.cfi_endproc
.size	__rsaz_512_subtract,.-__rsaz_512_subtract
___
}
{	# __rsaz_512_mul
	#
	# input: %rsi - ap, %rbp - bp
	#	 %rbx - b[0], preloaded by the caller (it is used as the
	#	 multiplier before this routine loads anything from bp)
	# output:
	# clobbers: everything
my ($ap,$bp) = ("%rsi","%rbp");
$code.=<<___;
.type	__rsaz_512_mul,\@abi-omnipotent
.align	32
__rsaz_512_mul:
.cfi_startproc
	leaq	8(%rsp), %rdi
	movq	($ap), %rax
	mulq	%rbx
	movq	%rax, (%rdi)
	movq	8($ap), %rax
	movq	%rdx, %r8
	mulq	%rbx
	addq	%rax, %r8
	movq	16($ap), %rax
	movq	%rdx, %r9
	adcq	\$0, %r9
	mulq	%rbx
	addq	%rax, %r9
	movq	24($ap), %rax
	movq	%rdx, %r10
	adcq	\$0, %r10
	mulq
%rbx addq %rax, %r10 movq 32($ap), %rax movq %rdx, %r11 adcq \$0, %r11 mulq %rbx addq %rax, %r11 movq 40($ap), %rax movq %rdx, %r12 adcq \$0, %r12 mulq %rbx addq %rax, %r12 movq 48($ap), %rax movq %rdx, %r13 adcq \$0, %r13 mulq %rbx addq %rax, %r13 movq 56($ap), %rax movq %rdx, %r14 adcq \$0, %r14 mulq %rbx addq %rax, %r14 movq ($ap), %rax movq %rdx, %r15 adcq \$0, %r15 leaq 8($bp), $bp leaq 8(%rdi), %rdi movl \$7, %ecx jmp .Loop_mul .align 32 .Loop_mul: movq ($bp), %rbx mulq %rbx addq %rax, %r8 movq 8($ap), %rax movq %r8, (%rdi) movq %rdx, %r8 adcq \$0, %r8 mulq %rbx addq %rax, %r9 movq 16($ap), %rax adcq \$0, %rdx addq %r9, %r8 movq %rdx, %r9 adcq \$0, %r9 mulq %rbx addq %rax, %r10 movq 24($ap), %rax adcq \$0, %rdx addq %r10, %r9 movq %rdx, %r10 adcq \$0, %r10 mulq %rbx addq %rax, %r11 movq 32($ap), %rax adcq \$0, %rdx addq %r11, %r10 movq %rdx, %r11 adcq \$0, %r11 mulq %rbx addq %rax, %r12 movq 40($ap), %rax adcq \$0, %rdx addq %r12, %r11 movq %rdx, %r12 adcq \$0, %r12 mulq %rbx addq %rax, %r13 movq 48($ap), %rax adcq \$0, %rdx addq %r13, %r12 movq %rdx, %r13 adcq \$0, %r13 mulq %rbx addq %rax, %r14 movq 56($ap), %rax adcq \$0, %rdx addq %r14, %r13 movq %rdx, %r14 leaq 8($bp), $bp adcq \$0, %r14 mulq %rbx addq %rax, %r15 movq ($ap), %rax adcq \$0, %rdx addq %r15, %r14 movq %rdx, %r15 adcq \$0, %r15 leaq 8(%rdi), %rdi decl %ecx jnz .Loop_mul movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, 32(%rdi) movq %r13, 40(%rdi) movq %r14, 48(%rdi) movq %r15, 56(%rdi) ret .cfi_endproc .size __rsaz_512_mul,.-__rsaz_512_mul ___ } if ($addx) { # __rsaz_512_mulx # # input: %rsi - ap, %rbp - bp # output: # clobbers: everything my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi"); $code.=<<___; .type __rsaz_512_mulx,\@abi-omnipotent .align 32 __rsaz_512_mulx: .cfi_startproc mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller mov \$-6, %rcx mulx 8($ap), %rax, %r9 movq %rbx, 8(%rsp) mulx 16($ap), %rbx, %r10 adc %rax, %r8 mulx 24($ap), %rax, %r11 adc 
%rbx, %r9 mulx 32($ap), %rbx, %r12 adc %rax, %r10 mulx 40($ap), %rax, %r13 adc %rbx, %r11 mulx 48($ap), %rbx, %r14 adc %rax, %r12 mulx 56($ap), %rax, %r15 mov 8($bp), %rdx adc %rbx, %r13 adc %rax, %r14 adc \$0, %r15 xor $zero, $zero # cf=0,of=0 jmp .Loop_mulx .align 32 .Loop_mulx: movq %r8, %rbx mulx ($ap), %rax, %r8 adcx %rax, %rbx adox %r9, %r8 mulx 8($ap), %rax, %r9 adcx %rax, %r8 adox %r10, %r9 mulx 16($ap), %rax, %r10 adcx %rax, %r9 adox %r11, %r10 mulx 24($ap), %rax, %r11 adcx %rax, %r10 adox %r12, %r11 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12 adcx %rax, %r11 adox %r13, %r12 mulx 40($ap), %rax, %r13 adcx %rax, %r12 adox %r14, %r13 mulx 48($ap), %rax, %r14 adcx %rax, %r13 adox %r15, %r14 mulx 56($ap), %rax, %r15 movq 64($bp,%rcx,8), %rdx movq %rbx, 8+64-8(%rsp,%rcx,8) adcx %rax, %r14 adox $zero, %r15 adcx $zero, %r15 # cf=0 inc %rcx # of=0 jnz .Loop_mulx movq %r8, %rbx mulx ($ap), %rax, %r8 adcx %rax, %rbx adox %r9, %r8 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9 adcx %rax, %r8 adox %r10, %r9 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10 adcx %rax, %r9 adox %r11, %r10 mulx 24($ap), %rax, %r11 adcx %rax, %r10 adox %r12, %r11 mulx 32($ap), %rax, %r12 adcx %rax, %r11 adox %r13, %r12 mulx 40($ap), %rax, %r13 adcx %rax, %r12 adox %r14, %r13 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 adcx %rax, %r13 adox %r15, %r14 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15 adcx %rax, %r14 adox $zero, %r15 adcx $zero, %r15 mov %rbx, 8+64-8(%rsp) mov %r8, 8+64(%rsp) mov %r9, 8+64+8(%rsp) mov %r10, 8+64+16(%rsp) mov %r11, 8+64+24(%rsp) mov %r12, 8+64+32(%rsp) mov %r13, 8+64+40(%rsp) mov %r14, 8+64+48(%rsp) mov %r15, 8+64+56(%rsp) ret .cfi_endproc .size __rsaz_512_mulx,.-__rsaz_512_mulx ___ } { my ($out,$inp,$power)= $win64 ? 
("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); $code.=<<___; .globl rsaz_512_scatter4 .type rsaz_512_scatter4,\@abi-omnipotent .align 16 rsaz_512_scatter4: .cfi_startproc leaq ($out,$power,8), $out movl \$8, %r9d jmp .Loop_scatter .align 16 .Loop_scatter: movq ($inp), %rax leaq 8($inp), $inp movq %rax, ($out) leaq 128($out), $out decl %r9d jnz .Loop_scatter ret .cfi_endproc .size rsaz_512_scatter4,.-rsaz_512_scatter4 .globl rsaz_512_gather4 .type rsaz_512_gather4,\@abi-omnipotent .align 16 rsaz_512_gather4: .cfi_startproc ___ $code.=<<___ if ($win64); .LSEH_begin_rsaz_512_gather4: .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp) .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp) .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp) .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp) .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp) .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp) .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp) .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp) .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp) .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp) ___ $code.=<<___; movd $power,%xmm8 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 pshufd \$0,%xmm8,%xmm8 # broadcast $power movdqa %xmm1,%xmm7 movdqa %xmm1,%xmm2 ___ ######################################################################## # calculate mask by comparing 0..15 to $power # for($i=0;$i<4;$i++) { $code.=<<___; paddd %xmm`$i`,%xmm`$i+1` pcmpeqd %xmm8,%xmm`$i` movdqa %xmm7,%xmm`$i+3` ___ } for(;$i<7;$i++) { $code.=<<___; paddd %xmm`$i`,%xmm`$i+1` pcmpeqd %xmm8,%xmm`$i` ___ } $code.=<<___; pcmpeqd %xmm8,%xmm7 movl \$8, %r9d jmp .Loop_gather .align 16 .Loop_gather: movdqa 16*0($inp),%xmm8 movdqa 16*1($inp),%xmm9 movdqa 16*2($inp),%xmm10 
movdqa	16*3($inp),%xmm11
	pand	%xmm0,%xmm8
	movdqa	16*4($inp),%xmm12
	pand	%xmm1,%xmm9
	movdqa	16*5($inp),%xmm13
	pand	%xmm2,%xmm10
	movdqa	16*6($inp),%xmm14
	pand	%xmm3,%xmm11
	movdqa	16*7($inp),%xmm15
	leaq	128($inp), $inp
	pand	%xmm4,%xmm12
	pand	%xmm5,%xmm13
	pand	%xmm6,%xmm14
	pand	%xmm7,%xmm15
	por	%xmm10,%xmm8
	por	%xmm11,%xmm9
	por	%xmm12,%xmm8
	por	%xmm13,%xmm9
	por	%xmm14,%xmm8
	por	%xmm15,%xmm9
	por	%xmm9,%xmm8
	pshufd	\$0x4e,%xmm8,%xmm9
	por	%xmm9,%xmm8
	movq	%xmm8,($out)
	leaq	8($out), $out
	decl	%r9d
	jnz	.Loop_gather
___
$code.=<<___ if ($win64);
	movaps	0x00(%rsp),%xmm6
	movaps	0x10(%rsp),%xmm7
	movaps	0x20(%rsp),%xmm8
	movaps	0x30(%rsp),%xmm9
	movaps	0x40(%rsp),%xmm10
	movaps	0x50(%rsp),%xmm11
	movaps	0x60(%rsp),%xmm12
	movaps	0x70(%rsp),%xmm13
	movaps	0x80(%rsp),%xmm14
	movaps	0x90(%rsp),%xmm15
	add	\$0xa8,%rsp
___
$code.=<<___;
	ret
.LSEH_end_rsaz_512_gather4:
.cfi_endproc
.size	rsaz_512_gather4,.-rsaz_512_gather4

.align	64
.Linc:
	.long	0,0, 1,1
	.long	2,2, 2,2
___
}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 language-specific handler shared by the rsaz_512_* entry points.
# HandlerData[0] is the RVA of the function's "body" label (end of
# prologue) and HandlerData[1] the RVA of its "epilogue" label.
#
#   Rip <  body label      frame not yet set up: go to the common tail
#                          with %rax = context->Rax
#   Rip >= epilogue label  frame already torn down: go to the common tail
#                          with %rax = context->Rsp
#   otherwise              %rax = context->Rsp + 128+24+48, i.e. just above
#                          the six pushed registers, which are then copied
#                          back into the context record
#
# When HandlerData[1] is .Lmul_gather4_epilogue the frame is 0xb0 bytes
# larger and additionally holds %xmm6-%xmm15; those twenty quadwords are
# copied into the context record as well.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	lea	128+24+48(%rax),%rax

	lea	.Lmul_gather4_epilogue(%rip),%rbx
	cmp	%r10,%rbx
	jne	.Lse_not_in_mul_gather4

	lea	0xb0(%rax),%rax

	lea	-48-0xa8(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx		# 10 xmm registers, 2 quadwords each
	.long	0xa548f3fc		# cld; rep movsq
.Lse_not_in_mul_gather4: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler .section .pdata .align 4 .rva .LSEH_begin_rsaz_512_sqr .rva .LSEH_end_rsaz_512_sqr .rva .LSEH_info_rsaz_512_sqr .rva .LSEH_begin_rsaz_512_mul .rva .LSEH_end_rsaz_512_mul .rva .LSEH_info_rsaz_512_mul .rva .LSEH_begin_rsaz_512_mul_gather4 .rva .LSEH_end_rsaz_512_mul_gather4 .rva .LSEH_info_rsaz_512_mul_gather4 .rva .LSEH_begin_rsaz_512_mul_scatter4 .rva .LSEH_end_rsaz_512_mul_scatter4 .rva .LSEH_info_rsaz_512_mul_scatter4 .rva .LSEH_begin_rsaz_512_mul_by_one .rva .LSEH_end_rsaz_512_mul_by_one .rva .LSEH_info_rsaz_512_mul_by_one .rva .LSEH_begin_rsaz_512_gather4 .rva .LSEH_end_rsaz_512_gather4 .rva .LSEH_info_rsaz_512_gather4 .section 
.xdata .align 8 .LSEH_info_rsaz_512_sqr: .byte 9,0,0,0 .rva se_handler .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] .LSEH_info_rsaz_512_mul: .byte 9,0,0,0 .rva se_handler .rva .Lmul_body,.Lmul_epilogue # HandlerData[] .LSEH_info_rsaz_512_mul_gather4: .byte 9,0,0,0 .rva se_handler .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[] .LSEH_info_rsaz_512_mul_scatter4: .byte 9,0,0,0 .rva se_handler .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[] .LSEH_info_rsaz_512_mul_by_one: .byte 9,0,0,0 .rva se_handler .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[] .LSEH_info_rsaz_512_gather4: .byte 0x01,0x46,0x16,0x00 .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8 ___ } $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/bn/asm/x86_64-mont.pl =================================================================== --- head/crypto/openssl/crypto/bn/asm/x86_64-mont.pl (revision 364821) +++ head/crypto/openssl/crypto/bn/asm/x86_64-mont.pl (revision 364822) @@ -1,1592 +1,1592 @@ #! /usr/bin/env perl # Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. 
You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # October 2005. # # Montgomery multiplication routine for x86_64. While it gives a modest # 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more # than twice as fast (>2x). The most common rsa1024 sign is improved by # a respectable 50%. It remains to be seen if loop unrolling and # a dedicated squaring routine can provide further improvement... # July 2011. # # Add dedicated squaring procedure. Performance improvement varies # from platform to platform, but on average it's ~5%/15%/25%/33% # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. # August 2011. # # Unroll and modulo-schedule inner loops in such a manner that they # are "fallen through" for input lengths of 8, which is critical for # 1024-bit RSA *sign*. Average performance improvement in comparison # to the *initial* version of this module from 2005 is ~0%/30%/40%/45% # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. # June 2013. # # Optimize reduction in squaring procedure and improve 1024+-bit RSA # sign performance by 10-16% on Intel Sandy Bridge and later # (virtually the same on non-Intel processors). # August 2013. # # Add MULX/ADOX/ADCX code path.
$flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.23); } if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.10); } if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $addx = ($ver>=3.03); } # int bn_mul_mont( $rp="%rdi"; # BN_ULONG *rp, $ap="%rsi"; # const BN_ULONG *ap, $bp="%rdx"; # const BN_ULONG *bp, $np="%rcx"; # const BN_ULONG *np, $n0="%r8"; # const BN_ULONG *n0, $num="%r9"; # int num); $lo0="%r10"; $hi0="%r11"; $hi1="%r13"; $i="%r14"; $j="%r15"; $m0="%rbx"; $m1="%rbp"; $code=<<___; .text .extern OPENSSL_ia32cap_P .globl bn_mul_mont .type bn_mul_mont,\@function,6 .align 16 bn_mul_mont: .cfi_startproc mov ${num}d,${num}d mov %rsp,%rax .cfi_def_cfa_register %rax test \$3,${num}d jnz .Lmul_enter cmp \$8,${num}d jb .Lmul_enter ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d ___ $code.=<<___; cmp $ap,$bp jne .Lmul4x_enter test \$7,${num}d jz .Lsqr8x_enter jmp .Lmul4x_enter .align 16 .Lmul_enter: push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 
.cfi_push %r15 neg $num mov %rsp,%r11 lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2)) neg $num # restore $num and \$-1024,%r10 # minimize TLB usage # An OS-agnostic version of __chkstk. # # Some OSes (Windows) insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... sub %r10,%r11 and \$-4096,%r11 lea (%r10,%r11),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul_page_walk jmp .Lmul_page_walk_done .align 16 .Lmul_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul_page_walk .Lmul_page_walk_done: mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul_body: mov $bp,%r12 # reassign $bp ___ $bp="%r12"; $code.=<<___; mov ($n0),$n0 # pull n0[0] value mov ($bp),$m0 # m0=bp[0] mov ($ap),%rax xor $i,$i # i=0 xor $j,$j # j=0 mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$lo0 mov ($np),%rax imulq $lo0,$m1 # "tp[0]"*n0 mov %rdx,$hi0 mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax adc \$0,%rdx mov %rdx,$hi1 lea 1($j),$j # j++ jmp .L1st_enter .align 16 .L1st: add %rax,$hi1 mov ($ap,$j,8),%rax adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] mov $lo0,$hi0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 .L1st_enter: mulq $m0 # ap[j]*bp[0] add %rax,$hi0 mov ($np,$j,8),%rax adc \$0,%rdx lea 1($j),$j # j++ mov %rdx,$lo0 mulq $m1 # np[j]*m1 cmp $num,$j jne .L1st add %rax,$hi1 mov ($ap),%rax # ap[0] adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 mov $lo0,$hi0 xor %rdx,%rdx add $hi0,$hi1 adc \$0,%rdx mov $hi1,-8(%rsp,$num,8) mov %rdx,(%rsp,$num,8) # store upmost overflow bit lea 1($i),$i # i++ jmp .Louter .align 16 .Louter: mov ($bp,$i,8),$m0 # m0=bp[i] xor 
$j,$j # j=0 mov $n0,$m1 mov (%rsp),$lo0 mulq $m0 # ap[0]*bp[i] add %rax,$lo0 # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx imulq $lo0,$m1 # tp[0]*n0 mov %rdx,$hi0 mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax adc \$0,%rdx mov 8(%rsp),$lo0 # tp[1] mov %rdx,$hi1 lea 1($j),$j # j++ jmp .Linner_enter .align 16 .Linner: add %rax,$hi1 mov ($ap,$j,8),%rax adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] mov (%rsp,$j,8),$lo0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 .Linner_enter: mulq $m0 # ap[j]*bp[i] add %rax,$hi0 mov ($np,$j,8),%rax adc \$0,%rdx add $hi0,$lo0 # ap[j]*bp[i]+tp[j] mov %rdx,$hi0 adc \$0,$hi0 lea 1($j),$j # j++ mulq $m1 # np[j]*m1 cmp $num,$j jne .Linner add %rax,$hi1 mov ($ap),%rax # ap[0] adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] mov (%rsp,$j,8),$lo0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 xor %rdx,%rdx add $hi0,$hi1 adc \$0,%rdx add $lo0,$hi1 # pull upmost overflow bit adc \$0,%rdx mov $hi1,-8(%rsp,$num,8) mov %rdx,(%rsp,$num,8) # store upmost overflow bit lea 1($i),$i # i++ cmp $num,$i jb .Louter xor $i,$i # i=0 and clear CF! mov (%rsp),%rax # tp[0] mov $num,$j # j=num .align 16 .Lsub: sbb ($np,$i,8),%rax mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] mov 8(%rsp,$i,8),%rax # tp[i+1] lea 1($i),$i # i++ dec $j # doesn't affect CF! 
jnz .Lsub sbb \$0,%rax # handle upmost overflow bit mov \$-1,%rbx xor %rax,%rbx # not %rax xor $i,$i mov $num,$j # j=num .Lcopy: # conditional copy mov ($rp,$i,8),%rcx mov (%rsp,$i,8),%rdx and %rbx,%rcx and %rax,%rdx mov $num,(%rsp,$i,8) # zap temporary vector or %rcx,%rdx mov %rdx,($rp,$i,8) # rp[i]=tp[i] lea 1($i),$i sub \$1,$j jnz .Lcopy mov 8(%rsp,$num,8),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul_epilogue: ret .cfi_endproc .size bn_mul_mont,.-bn_mul_mont ___ {{{ my @A=("%r10","%r11"); my @N=("%r13","%rdi"); $code.=<<___; .type bn_mul4x_mont,\@function,6 .align 16 bn_mul4x_mont: .cfi_startproc mov ${num}d,${num}d mov %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); and \$0x80100,%r11d cmp \$0x80100,%r11d je .Lmulx4x_enter ___ $code.=<<___; push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 neg $num mov %rsp,%r11 lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4)) neg $num # restore and \$-1024,%r10 # minimize TLB usage sub %r10,%r11 and \$-4096,%r11 lea (%r10,%r11),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul4x_page_walk jmp .Lmul4x_page_walk_done .Lmul4x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul4x_page_walk .Lmul4x_page_walk_done: mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul4x_body: mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp mov %rdx,%r12 # reassign $bp ___ $bp="%r12"; $code.=<<___; mov ($n0),$n0 # pull n0[0] value mov ($bp),$m0 # m0=bp[0] mov ($ap),%rax xor $i,$i # i=0 xor $j,$j # j=0 mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$A[0] mov ($np),%rax imulq $A[0],$m1 # "tp[0]"*n0 
mov %rdx,$A[1] mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded mov 8($ap),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 add %rax,$A[1] mov 8($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 add %rax,$N[1] mov 16($ap),%rax adc \$0,%rdx add $A[1],$N[1] lea 4($j),$j # j++ adc \$0,%rdx mov $N[1],(%rsp) mov %rdx,$N[0] jmp .L1st4x .align 16 .L1st4x: mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov -16($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov -8($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$j,8),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov ($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-8(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov 8($np,$j,8),%rax adc \$0,%rdx lea 4($j),$j # j++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov -16($ap,$j,8),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-32(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] cmp $num,$j jb .L1st4x mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov -16($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov -8($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] mov 
$N[0],-8(%rsp,$j,8) mov $N[1],(%rsp,$j,8) # store upmost overflow bit lea 1($i),$i # i++ .align 4 .Louter4x: mov ($bp,$i,8),$m0 # m0=bp[i] xor $j,$j # j=0 mov (%rsp),$A[0] mov $n0,$m1 mulq $m0 # ap[0]*bp[i] add %rax,$A[0] # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx imulq $A[0],$m1 # tp[0]*n0 mov %rdx,$A[1] mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded mov 8($ap),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov 8($np),%rax adc \$0,%rdx add 8(%rsp),$A[1] # +tp[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov 16($ap),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] lea 4($j),$j # j+=2 adc \$0,%rdx mov $N[1],(%rsp) # tp[j-1] mov %rdx,$N[0] jmp .Linner4x .align 16 .Linner4x: mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov -16($np,$j,8),%rax adc \$0,%rdx add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov -8($np,$j,8),%rax adc \$0,%rdx add -8(%rsp,$j,8),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$j,8),%rax adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov ($np,$j,8),%rax adc \$0,%rdx add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[0],-8(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov 8($np,$j,8),%rax adc \$0,%rdx add 8(%rsp,$j,8),$A[1] adc \$0,%rdx lea 4($j),$j # j++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov -16($ap,$j,8),%rax adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[1],-32(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] cmp $num,$j jb .Linner4x mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov -16($np,$j,8),%rax adc 
\$0,%rdx add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov -8($np,$j,8),%rax adc \$0,%rdx add -8(%rsp,$j,8),$A[1] adc \$0,%rdx lea 1($i),$i # i++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] add (%rsp,$num,8),$N[0] # pull upmost overflow bit adc \$0,$N[1] mov $N[0],-8(%rsp,$j,8) mov $N[1],(%rsp,$j,8) # store upmost overflow bit cmp $num,$i jb .Louter4x ___ { my @ri=("%rax","%rdx",$m0,$m1); $code.=<<___; mov 16(%rsp,$num,8),$rp # restore $rp lea -4($num),$j mov 0(%rsp),@ri[0] # tp[0] mov 8(%rsp),@ri[1] # tp[1] shr \$2,$j # j=num/4-1 lea (%rsp),$ap # borrow ap for tp xor $i,$i # i=0 and clear CF! sub 0($np),@ri[0] mov 16($ap),@ri[2] # tp[2] mov 24($ap),@ri[3] # tp[3] sbb 8($np),@ri[1] .Lsub4x: mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] sbb 16($np,$i,8),@ri[2] mov 32($ap,$i,8),@ri[0] # tp[i+1] mov 40($ap,$i,8),@ri[1] sbb 24($np,$i,8),@ri[3] mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] sbb 32($np,$i,8),@ri[0] mov 48($ap,$i,8),@ri[2] mov 56($ap,$i,8),@ri[3] sbb 40($np,$i,8),@ri[1] lea 4($i),$i # i++ dec $j # doesn't affect CF! 
jnz .Lsub4x mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] mov 32($ap,$i,8),@ri[0] # load overflow bit sbb 16($np,$i,8),@ri[2] mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] sbb 24($np,$i,8),@ri[3] mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] sbb \$0,@ri[0] # handle upmost overflow bit mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] pxor %xmm0,%xmm0 movq @ri[0],%xmm4 pcmpeqd %xmm5,%xmm5 pshufd \$0,%xmm4,%xmm4 mov $num,$j pxor %xmm4,%xmm5 shr \$2,$j # j=num/4 xor %eax,%eax # i=0 jmp .Lcopy4x .align 16 .Lcopy4x: # conditional copy movdqa (%rsp,%rax),%xmm1 movdqu ($rp,%rax),%xmm2 pand %xmm4,%xmm1 pand %xmm5,%xmm2 movdqa 16(%rsp,%rax),%xmm3 movdqa %xmm0,(%rsp,%rax) por %xmm2,%xmm1 movdqu 16($rp,%rax),%xmm2 movdqu %xmm1,($rp,%rax) pand %xmm4,%xmm3 pand %xmm5,%xmm2 movdqa %xmm0,16(%rsp,%rax) por %xmm2,%xmm3 movdqu %xmm3,16($rp,%rax) lea 32(%rax),%rax dec $j jnz .Lcopy4x ___ } $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp .cfi_def_cfa %rsi, 8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul4x_epilogue: ret .cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont ___ }}} {{{ ###################################################################### # void bn_sqr8x_mont( my $rptr="%rdi"; # const BN_ULONG *rptr, my $aptr="%rsi"; # const BN_ULONG *aptr, my $bptr="%rdx"; # not used my $nptr="%rcx"; # const BN_ULONG *nptr, my $n0 ="%r8"; # const BN_ULONG *n0); my $num ="%r9"; # int num, has to be divisible by 8 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); my @A0=("%r10","%r11"); my @A1=("%r12","%r13"); my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); $code.=<<___ if ($addx); .extern bn_sqrx8x_internal # see x86_64-mont5 module ___ $code.=<<___; .extern bn_sqr8x_internal # see x86_64-mont5 module .type bn_sqr8x_mont,\@function,6 .align 32 bn_sqr8x_mont: .cfi_startproc mov 
%rsp,%rax .cfi_def_cfa_register %rax .Lsqr8x_enter: push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lsqr8x_prologue: mov ${num}d,%r10d shl \$3,${num}d # convert $num to bytes shl \$3+2,%r10 # 4*$num neg $num ############################################################## # ensure that stack frame doesn't alias with $aptr modulo # 4096. this is done to allow memory disambiguation logic # do its job. # lea -64(%rsp,$num,2),%r11 mov %rsp,%rbp mov ($n0),$n0 # *n0 sub $aptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lsqr8x_sp_alt sub %r11,%rbp # align with $aptr lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lsqr8x_sp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lsqr8x_page_walk jmp .Lsqr8x_page_walk_done .align 16 .Lsqr8x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lsqr8x_page_walk .Lsqr8x_page_walk_done: mov $num,%r10 neg $num mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lsqr8x_body: movq $nptr, %xmm2 # save pointer to modulus pxor %xmm0,%xmm0 movq $rptr,%xmm1 # save $rptr movq %r10, %xmm3 # -$num ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%eax and \$0x80100,%eax cmp \$0x80100,%eax jne .Lsqr8x_nox call bn_sqrx8x_internal # see x86_64-mont5 module # %rax top-most carry # %rbp nptr # %rcx -8*num # %r8 end of tp[2*num] lea (%r8,%rcx),%rbx mov %rcx,$num mov %rcx,%rdx movq %xmm1,$rptr sar \$3+2,%rcx # %cf=0 jmp .Lsqr8x_sub .align 32 .Lsqr8x_nox: ___ $code.=<<___; call bn_sqr8x_internal # see x86_64-mont5 module # %rax top-most carry # %rbp nptr # %r8 -8*num # %rdi end of tp[2*num] lea (%rdi,$num),%rbx 
mov $num,%rcx mov $num,%rdx movq %xmm1,$rptr sar \$3+2,%rcx # %cf=0 jmp .Lsqr8x_sub .align 32 .Lsqr8x_sub: mov 8*0(%rbx),%r12 mov 8*1(%rbx),%r13 mov 8*2(%rbx),%r14 mov 8*3(%rbx),%r15 lea 8*4(%rbx),%rbx sbb 8*0(%rbp),%r12 sbb 8*1(%rbp),%r13 sbb 8*2(%rbp),%r14 sbb 8*3(%rbp),%r15 lea 8*4(%rbp),%rbp mov %r12,8*0($rptr) mov %r13,8*1($rptr) mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx # preserves %cf jnz .Lsqr8x_sub sbb \$0,%rax # top-most carry lea (%rbx,$num),%rbx # rewind lea ($rptr,$num),$rptr # rewind movq %rax,%xmm1 pxor %xmm0,%xmm0 pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 jmp .Lsqr8x_cond_copy .align 32 .Lsqr8x_cond_copy: movdqa 16*0(%rbx),%xmm2 movdqa 16*1(%rbx),%xmm3 lea 16*2(%rbx),%rbx movdqu 16*0($rptr),%xmm4 movdqu 16*1($rptr),%xmm5 lea 16*2($rptr),$rptr movdqa %xmm0,-16*2(%rbx) # zero tp movdqa %xmm0,-16*1(%rbx) movdqa %xmm0,-16*2(%rbx,%rdx) movdqa %xmm0,-16*1(%rbx,%rdx) pcmpeqd %xmm1,%xmm0 pand %xmm1,%xmm2 pand %xmm1,%xmm3 pand %xmm0,%xmm4 pand %xmm0,%xmm5 pxor %xmm0,%xmm0 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqu %xmm4,-16*2($rptr) movdqu %xmm5,-16*1($rptr) add \$32,$num jnz .Lsqr8x_cond_copy mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lsqr8x_epilogue: ret .cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont ___ }}} if ($addx) {{{ my $bp="%rdx"; # original value $code.=<<___; .type bn_mulx4x_mont,\@function,6 .align 32 bn_mulx4x_mont: .cfi_startproc mov %rsp,%rax .cfi_def_cfa_register %rax .Lmulx4x_enter: push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes xor %r10,%r10 sub $num,%r10 # -$num mov ($n0),$n0 # *n0 lea 
-72(%rsp,%r10),%rbp # future alloca(frame+$num+8) and \$-128,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmulx4x_page_walk jmp .Lmulx4x_page_walk_done .align 16 .Lmulx4x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmulx4x_page_walk .Lmulx4x_page_walk_done: lea ($bp,$num),%r10 ############################################################## # Stack layout # +0 num # +8 off-loaded &b[i] # +16 end of b[num] # +24 saved n0 # +32 saved rp # +40 saved %rsp # +48 inner counter # +56 # +64 tmp[num+1] # mov $num,0(%rsp) # save $num shr \$5,$num mov %r10,16(%rsp) # end of b[num] sub \$1,$num mov $n0, 24(%rsp) # save *n0 mov $rp, 32(%rsp) # save $rp mov %rax,40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 mov $num,48(%rsp) # inner counter jmp .Lmulx4x_body .align 32 .Lmulx4x_body: ___ my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); my $rptr=$bptr; $code.=<<___; lea 8($bp),$bptr mov ($bp),%rdx # b[0], $bp==%rdx actually lea 64+32(%rsp),$tptr mov %rdx,$bi mulx 0*8($aptr),$mi,%rax # a[0]*b[0] mulx 1*8($aptr),%r11,%r14 # a[1]*b[0] add %rax,%r11 mov $bptr,8(%rsp) # off-load &b[i] mulx 2*8($aptr),%r12,%r13 # ... 
adc %r14,%r12 adc \$0,%r13 mov $mi,$bptr # borrow $bptr imulq 24(%rsp),$mi # "t[0]"*n0 xor $zero,$zero # cf=0, of=0 mulx 3*8($aptr),%rax,%r14 mov $mi,%rdx lea 4*8($aptr),$aptr adcx %rax,%r13 adcx $zero,%r14 # cf=0 mulx 0*8($nptr),%rax,%r10 adcx %rax,$bptr # discarded adox %r11,%r10 mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12 mov 48(%rsp),$bptr # counter value mov %r10,-4*8($tptr) adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-3*8($tptr) adcx %rax,%r12 adox $zero,%r15 # of=0 lea 4*8($nptr),$nptr mov %r12,-2*8($tptr) jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: adcx $zero,%r15 # cf=0, modulo-scheduled mulx 0*8($aptr),%r10,%rax # a[4]*b[0] adcx %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] adcx %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... adcx %r14,%r12 mulx 3*8($aptr),%r13,%r14 .byte 0x67,0x67 mov $mi,%rdx adcx %rax,%r13 adcx $zero,%r14 # cf=0 lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adox %r15,%r10 mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 mov %r11,-4*8($tptr) adox %r15,%r13 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_1st mov 0(%rsp),$num # load num mov 8(%rsp),$bptr # re-load &b[i] adc $zero,%r15 # modulo-scheduled add %r15,%r14 sbb %r15,%r15 # top-most carry mov %r14,-1*8($tptr) jmp .Lmulx4x_outer .align 32 .Lmulx4x_outer: mov ($bptr),%rdx # b[i] lea 8($bptr),$bptr # b++ sub $num,$aptr # rewind $aptr mov %r15,($tptr) # save top-most carry lea 64+4*8(%rsp),$tptr sub $num,$nptr # rewind $nptr mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 mov %rdx,$bi mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] adox -4*8($tptr),$mi adcx %r14,%r11 mulx 2*8($aptr),%r15,%r13 # ... 
adox -3*8($tptr),%r11 adcx %r15,%r12 adox -2*8($tptr),%r12 adcx $zero,%r13 adox $zero,%r13 mov $bptr,8(%rsp) # off-load &b[i] mov $mi,%r15 imulq 24(%rsp),$mi # "t[0]"*n0 xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 mulx 3*8($aptr),%rax,%r14 mov $mi,%rdx adcx %rax,%r13 adox -1*8($tptr),%r13 adcx $zero,%r14 lea 4*8($aptr),$aptr adox $zero,%r14 mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 mov %r10,-4*8($tptr) adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-3*8($tptr) lea 4*8($nptr),$nptr adcx %rax,%r12 adox $zero,%r15 # of=0 mov 48(%rsp),$bptr # counter value mov %r12,-2*8($tptr) jmp .Lmulx4x_inner .align 32 .Lmulx4x_inner: mulx 0*8($aptr),%r10,%rax # a[4]*b[i] adcx $zero,%r15 # cf=0, modulo-scheduled adox %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] adcx 0*8($tptr),%r10 adox %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... adcx 1*8($tptr),%r11 adox %r14,%r12 mulx 3*8($aptr),%r13,%r14 mov $mi,%rdx adcx 2*8($tptr),%r12 adox %rax,%r13 adcx 3*8($tptr),%r13 adox $zero,%r14 # of=0 lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adcx $zero,%r14 # cf=0 adox %r15,%r10 mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 adox %r15,%r13 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-4*8($tptr) mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_inner mov 0(%rsp),$num # load num mov 8(%rsp),$bptr # re-load &b[i] adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$zero # pull top-most carry adc %r15,%r14 sbb %r15,%r15 # top-most carry mov %r14,-1*8($tptr) cmp 16(%rsp),$bptr jne .Lmulx4x_outer lea 64(%rsp),$tptr sub $num,$nptr # rewind $nptr neg %r15 mov $num,%rdx shr \$3+2,$num # %cf=0 mov 32(%rsp),$rptr # restore rp jmp .Lmulx4x_sub .align 32 .Lmulx4x_sub: 
mov 8*0($tptr),%r11 mov 8*1($tptr),%r12 mov 8*2($tptr),%r13 mov 8*3($tptr),%r14 lea 8*4($tptr),$tptr sbb 8*0($nptr),%r11 sbb 8*1($nptr),%r12 sbb 8*2($nptr),%r13 sbb 8*3($nptr),%r14 lea 8*4($nptr),$nptr mov %r11,8*0($rptr) mov %r12,8*1($rptr) mov %r13,8*2($rptr) mov %r14,8*3($rptr) lea 8*4($rptr),$rptr dec $num # preserves %cf jnz .Lmulx4x_sub sbb \$0,%r15 # top-most carry lea 64(%rsp),$tptr sub %rdx,$rptr # rewind movq %r15,%xmm1 pxor %xmm0,%xmm0 pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 jmp .Lmulx4x_cond_copy .align 32 .Lmulx4x_cond_copy: movdqa 16*0($tptr),%xmm2 movdqa 16*1($tptr),%xmm3 lea 16*2($tptr),$tptr movdqu 16*0($rptr),%xmm4 movdqu 16*1($rptr),%xmm5 lea 16*2($rptr),$rptr movdqa %xmm0,-16*2($tptr) # zero tp movdqa %xmm0,-16*1($tptr) pcmpeqd %xmm1,%xmm0 pand %xmm1,%xmm2 pand %xmm1,%xmm3 pand %xmm0,%xmm4 pand %xmm0,%xmm5 pxor %xmm0,%xmm0 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqu %xmm4,-16*2($rptr) movdqu %xmm5,-16*1($rptr) sub \$32,%rdx jnz .Lmulx4x_cond_copy mov %rdx,($tptr) mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmulx4x_epilogue: ret .cfi_endproc .size bn_mulx4x_mont,.-bn_mulx4x_mont ___ }}} $code.=<<___; .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by " .align 16 ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type mul_handler,\@abi-omnipotent .align 16 mul_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 
56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->Rip < end of prologue label jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer jmp .Lcommon_pop_regs .size mul_handler,.-mul_handler .type sqr_handler,\@abi-omnipotent .align 16 sqr_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->Rip<.Lsqr_prologue jb .Lcommon_seh_tail mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # body label cmp %r10,%rbx # context->Rip<.Lsqr_body jb .Lcommon_pop_regs mov 152($context),%rax # pull context->Rsp mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue jae .Lcommon_seh_tail mov 40(%rax),%rax # pull saved stack pointer .Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov
$disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size sqr_handler,.-sqr_handler .section .pdata .align 4 .rva .LSEH_begin_bn_mul_mont .rva .LSEH_end_bn_mul_mont .rva .LSEH_info_bn_mul_mont .rva .LSEH_begin_bn_mul4x_mont .rva .LSEH_end_bn_mul4x_mont .rva .LSEH_info_bn_mul4x_mont .rva .LSEH_begin_bn_sqr8x_mont .rva .LSEH_end_bn_sqr8x_mont .rva .LSEH_info_bn_sqr8x_mont ___ $code.=<<___ if ($addx); .rva .LSEH_begin_bn_mulx4x_mont .rva .LSEH_end_bn_mulx4x_mont .rva .LSEH_info_bn_mulx4x_mont ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_bn_mul_mont: .byte 9,0,0,0 .rva mul_handler .rva .Lmul_body,.Lmul_epilogue # HandlerData[] .LSEH_info_bn_mul4x_mont: .byte 9,0,0,0 .rva mul_handler .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .LSEH_info_bn_sqr8x_mont: .byte 9,0,0,0 .rva sqr_handler .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] .align 8 ___ $code.=<<___ if ($addx); .LSEH_info_bn_mulx4x_mont: .byte 9,0,0,0 .rva sqr_handler .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] .align 8 ___ } print $code; close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/bn/asm/x86_64-mont5.pl =================================================================== --- head/crypto/openssl/crypto/bn/asm/x86_64-mont5.pl (revision 364821) +++ head/crypto/openssl/crypto/bn/asm/x86_64-mont5.pl (revision 364822) @@ -1,3963 +1,3963 @@ #! 
/usr/bin/env perl
# Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# August 2011.
#
# Companion to x86_64-mont.pl that optimizes cache-timing attack
# countermeasures. The subroutines are produced by replacing bp[i]
# references in their x86_64-mont.pl counterparts with cache-neutral
# references to powers table computed in BN_mod_exp_mont_consttime.
# In addition subroutine that scatters elements of the powers table
# is implemented, so that scatter-/gathering can be tuned without
# bn_exp.c modifications.

# August 2013.
#
# Add MULX/AD*X code paths and additional interfaces to optimize for
# branch prediction unit. For input lengths that are multiples of 8
# the np argument is not just modulus value, but one interleaved
# with 0. This is to optimize post-condition...
$flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.23); } if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.10); } if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $addx = ($ver>=3.03); } # int bn_mul_mont_gather5( $rp="%rdi"; # BN_ULONG *rp, $ap="%rsi"; # const BN_ULONG *ap, $bp="%rdx"; # const BN_ULONG *bp, $np="%rcx"; # const BN_ULONG *np, $n0="%r8"; # const BN_ULONG *n0, $num="%r9"; # int num, # int idx); # 0 to 2^5-1, "index" in $bp holding # pre-computed powers of a', interlaced # in such manner that b[0] is $bp[idx], # b[1] is [2^5+idx], etc. 
$lo0="%r10"; $hi0="%r11"; $hi1="%r13"; $i="%r14"; $j="%r15"; $m0="%rbx"; $m1="%rbp"; $code=<<___; .text .extern OPENSSL_ia32cap_P .globl bn_mul_mont_gather5 .type bn_mul_mont_gather5,\@function,6 .align 64 bn_mul_mont_gather5: .cfi_startproc mov ${num}d,${num}d mov %rsp,%rax .cfi_def_cfa_register %rax test \$7,${num}d jnz .Lmul_enter ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d ___ $code.=<<___; jmp .Lmul4x_enter .align 16 .Lmul_enter: movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 neg $num mov %rsp,%r11 lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) neg $num # restore $num and \$-1024,%r10 # minimize TLB usage # An OS-agnostic version of __chkstk. # # Some OSes (Windows) insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... 
sub %r10,%r11 and \$-4096,%r11 lea (%r10,%r11),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul_page_walk jmp .Lmul_page_walk_done .Lmul_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul_page_walk .Lmul_page_walk_done: lea .Linc(%rip),%r10 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul_body: lea 128($bp),%r12 # reassign $bp (+size optimization) ___ $bp="%r12"; $STRIDE=2**5*8; # 5 is "window size" $N=$STRIDE/4; # should match cache line size $code.=<<___; movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) and \$-16,%r10 pshufd \$0,%xmm5,%xmm5 # broadcast index movdqa %xmm1,%xmm4 movdqa %xmm1,%xmm2 ___ ######################################################################## # calculate mask by comparing 0..31 to index and save result to stack # $code.=<<___; paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 # compare to 1,0 .byte 0x67 movdqa %xmm4,%xmm3 ___ for($k=0;$k<$STRIDE/16-4;$k+=4) { $code.=<<___; paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 # compare to 3,2 movdqa %xmm0,`16*($k+0)+112`(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 # compare to 5,4 movdqa %xmm1,`16*($k+1)+112`(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 # compare to 7,6 movdqa %xmm2,`16*($k+2)+112`(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,`16*($k+3)+112`(%r10) movdqa %xmm4,%xmm3 ___ } $code.=<<___; # last iteration can be optimized paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,`16*($k+0)+112`(%r10) paddd %xmm2,%xmm3 .byte 0x67 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,`16*($k+1)+112`(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,`16*($k+2)+112`(%r10) pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register pand `16*($k+1)-128`($bp),%xmm1 pand `16*($k+2)-128`($bp),%xmm2 movdqa %xmm3,`16*($k+3)+112`(%r10) pand `16*($k+3)-128`($bp),%xmm3 
por %xmm2,%xmm0 por %xmm3,%xmm1 ___ for($k=0;$k<$STRIDE/16-4;$k+=4) { $code.=<<___; movdqa `16*($k+0)-128`($bp),%xmm4 movdqa `16*($k+1)-128`($bp),%xmm5 movdqa `16*($k+2)-128`($bp),%xmm2 pand `16*($k+0)+112`(%r10),%xmm4 movdqa `16*($k+3)-128`($bp),%xmm3 pand `16*($k+1)+112`(%r10),%xmm5 por %xmm4,%xmm0 pand `16*($k+2)+112`(%r10),%xmm2 por %xmm5,%xmm1 pand `16*($k+3)+112`(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ } $code.=<<___; por %xmm1,%xmm0 pshufd \$0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 lea $STRIDE($bp),$bp movq %xmm0,$m0 # m0=bp[0] mov ($n0),$n0 # pull n0[0] value mov ($ap),%rax xor $i,$i # i=0 xor $j,$j # j=0 mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$lo0 mov ($np),%rax imulq $lo0,$m1 # "tp[0]"*n0 mov %rdx,$hi0 mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax adc \$0,%rdx mov %rdx,$hi1 lea 1($j),$j # j++ jmp .L1st_enter .align 16 .L1st: add %rax,$hi1 mov ($ap,$j,8),%rax adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] mov $lo0,$hi0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 .L1st_enter: mulq $m0 # ap[j]*bp[0] add %rax,$hi0 mov ($np,$j,8),%rax adc \$0,%rdx lea 1($j),$j # j++ mov %rdx,$lo0 mulq $m1 # np[j]*m1 cmp $num,$j jne .L1st # note that upon exit $j==$num, so # they can be used interchangeably add %rax,$hi1 adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $hi1,-16(%rsp,$num,8) # tp[num-1] mov %rdx,$hi1 mov $lo0,$hi0 xor %rdx,%rdx add $hi0,$hi1 adc \$0,%rdx mov $hi1,-8(%rsp,$num,8) mov %rdx,(%rsp,$num,8) # store upmost overflow bit lea 1($i),$i # i++ jmp .Louter .align 16 .Louter: lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) and \$-16,%rdx pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 ___ for($k=0;$k<$STRIDE/16;$k+=4) { $code.=<<___; movdqa `16*($k+0)-128`($bp),%xmm0 movdqa `16*($k+1)-128`($bp),%xmm1 movdqa `16*($k+2)-128`($bp),%xmm2 movdqa `16*($k+3)-128`($bp),%xmm3 pand `16*($k+0)-128`(%rdx),%xmm0 pand `16*($k+1)-128`(%rdx),%xmm1 por %xmm0,%xmm4 pand `16*($k+2)-128`(%rdx),%xmm2 por 
%xmm1,%xmm5 pand `16*($k+3)-128`(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 ___ } $code.=<<___; por %xmm5,%xmm4 pshufd \$0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 lea $STRIDE($bp),$bp mov ($ap),%rax # ap[0] movq %xmm0,$m0 # m0=bp[i] xor $j,$j # j=0 mov $n0,$m1 mov (%rsp),$lo0 mulq $m0 # ap[0]*bp[i] add %rax,$lo0 # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx imulq $lo0,$m1 # tp[0]*n0 mov %rdx,$hi0 mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax adc \$0,%rdx mov 8(%rsp),$lo0 # tp[1] mov %rdx,$hi1 lea 1($j),$j # j++ jmp .Linner_enter .align 16 .Linner: add %rax,$hi1 mov ($ap,$j,8),%rax adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] mov (%rsp,$j,8),$lo0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 .Linner_enter: mulq $m0 # ap[j]*bp[i] add %rax,$hi0 mov ($np,$j,8),%rax adc \$0,%rdx add $hi0,$lo0 # ap[j]*bp[i]+tp[j] mov %rdx,$hi0 adc \$0,$hi0 lea 1($j),$j # j++ mulq $m1 # np[j]*m1 cmp $num,$j jne .Linner # note that upon exit $j==$num, so # they can be used interchangeably add %rax,$hi1 adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] mov (%rsp,$num,8),$lo0 adc \$0,%rdx mov $hi1,-16(%rsp,$num,8) # tp[num-1] mov %rdx,$hi1 xor %rdx,%rdx add $hi0,$hi1 adc \$0,%rdx add $lo0,$hi1 # pull upmost overflow bit adc \$0,%rdx mov $hi1,-8(%rsp,$num,8) mov %rdx,(%rsp,$num,8) # store upmost overflow bit lea 1($i),$i # i++ cmp $num,$i jb .Louter xor $i,$i # i=0 and clear CF! mov (%rsp),%rax # tp[0] lea (%rsp),$ap # borrow ap for tp mov $num,$j # j=num jmp .Lsub .align 16 .Lsub: sbb ($np,$i,8),%rax mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] mov 8($ap,$i,8),%rax # tp[i+1] lea 1($i),$i # i++ dec $j # doesn't affect CF! 
jnz .Lsub sbb \$0,%rax # handle upmost overflow bit mov \$-1,%rbx xor %rax,%rbx xor $i,$i mov $num,$j # j=num .Lcopy: # conditional copy mov ($rp,$i,8),%rcx mov (%rsp,$i,8),%rdx and %rbx,%rcx and %rax,%rdx mov $i,(%rsp,$i,8) # zap temporary vector or %rcx,%rdx mov %rdx,($rp,$i,8) # rp[i]=tp[i] lea 1($i),$i sub \$1,$j jnz .Lcopy mov 8(%rsp,$num,8),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul_epilogue: ret .cfi_endproc .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 ___ {{{ my @A=("%r10","%r11"); my @N=("%r13","%rdi"); $code.=<<___; .type bn_mul4x_mont_gather5,\@function,6 .align 32 bn_mul4x_mont_gather5: .cfi_startproc .byte 0x67 mov %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); and \$0x80108,%r11d cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 je .Lmulx4x_enter ___ $code.=<<___; push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmul4x_prologue: .byte 0x67 shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). This is done to allow memory disambiguation # logic do its magic. [Extra [num] is allocated in order # to align with bn_power5's frame, which is cleansed after # completing exponentiation. Extra 256 bytes is for power mask # calculated from 7th argument, the index.] 
# lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmul4xsp_alt sub %r11,%rbp # align with $rp lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lmul4xsp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmul4x_page_walk jmp .Lmul4x_page_walk_done .Lmul4x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmul4x_page_walk .Lmul4x_page_walk_done: neg $num mov %rax,40(%rsp) .cfi_cfa_expression %rsp+40,deref,+8 .Lmul4x_body: call mul4x_internal mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul4x_epilogue: ret .cfi_endproc .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 .type mul4x_internal,\@abi-omnipotent .align 32 mul4x_internal: .cfi_startproc shl \$5,$num # $num was in bytes movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index lea .Linc(%rip),%rax lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) shr \$5,$num # restore $num ___ $bp="%r12"; $STRIDE=2**5*8; # 5 is "window size" $N=$STRIDE/4; # should match cache line size $tp=$i; $code.=<<___; movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) lea 128(%rdx),$bp # size optimization pshufd \$0,%xmm5,%xmm5 # broadcast index movdqa %xmm1,%xmm4 .byte 0x67,0x67 movdqa %xmm1,%xmm2 ___ 
######################################################################## # calculate mask by comparing 0..31 to index and save result to stack # $code.=<<___; paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 # compare to 1,0 .byte 0x67 movdqa %xmm4,%xmm3 ___ for($i=0;$i<$STRIDE/16-4;$i+=4) { $code.=<<___; paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 # compare to 3,2 movdqa %xmm0,`16*($i+0)+112`(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 # compare to 5,4 movdqa %xmm1,`16*($i+1)+112`(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 # compare to 7,6 movdqa %xmm2,`16*($i+2)+112`(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,`16*($i+3)+112`(%r10) movdqa %xmm4,%xmm3 ___ } $code.=<<___; # last iteration can be optimized paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,`16*($i+0)+112`(%r10) paddd %xmm2,%xmm3 .byte 0x67 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,`16*($i+1)+112`(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,`16*($i+2)+112`(%r10) pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register pand `16*($i+1)-128`($bp),%xmm1 pand `16*($i+2)-128`($bp),%xmm2 movdqa %xmm3,`16*($i+3)+112`(%r10) pand `16*($i+3)-128`($bp),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ for($i=0;$i<$STRIDE/16-4;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`($bp),%xmm4 movdqa `16*($i+1)-128`($bp),%xmm5 movdqa `16*($i+2)-128`($bp),%xmm2 pand `16*($i+0)+112`(%r10),%xmm4 movdqa `16*($i+3)-128`($bp),%xmm3 pand `16*($i+1)+112`(%r10),%xmm5 por %xmm4,%xmm0 pand `16*($i+2)+112`(%r10),%xmm2 por %xmm5,%xmm1 pand `16*($i+3)+112`(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ } $code.=<<___; por %xmm1,%xmm0 pshufd \$0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 lea $STRIDE($bp),$bp movq %xmm0,$m0 # m0=bp[0] mov %r13,16+8(%rsp) # save end of b[num] mov $rp, 56+8(%rsp) # save $rp mov ($n0),$n0 # pull n0[0] value mov ($ap),%rax lea ($ap,$num),$ap # end of a[num] neg $num mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$A[0] mov ($np),%rax imulq $A[0],$m1 # "tp[0]"*n0 lea 64+8(%rsp),$tp mov 
%rdx,$A[1] mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded mov 8($ap,$num),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 add %rax,$A[1] mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 add %rax,$N[1] mov 16($ap,$num),%rax adc \$0,%rdx add $A[1],$N[1] lea 4*8($num),$j # j=4 lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) mov %rdx,$N[0] jmp .L1st4x .align 32 .L1st4x: mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov 8*0($np),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) # tp[j-1] mov %rdx,$N[0] add \$32,$j # j+=4 jnz .L1st4x mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$num),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add 
$A[0],$N[0] adc \$0,$N[1] mov $N[0],-8($tp) jmp .Louter4x .align 32 .Louter4x: lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 ___ for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`($bp),%xmm0 movdqa `16*($i+1)-128`($bp),%xmm1 movdqa `16*($i+2)-128`($bp),%xmm2 movdqa `16*($i+3)-128`($bp),%xmm3 pand `16*($i+0)-128`(%rdx),%xmm0 pand `16*($i+1)-128`(%rdx),%xmm1 por %xmm0,%xmm4 pand `16*($i+2)-128`(%rdx),%xmm2 por %xmm1,%xmm5 pand `16*($i+3)-128`(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 ___ } $code.=<<___; por %xmm5,%xmm4 pshufd \$0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 lea $STRIDE($bp),$bp movq %xmm0,$m0 # m0=bp[i] mov ($tp,$num),$A[0] mov $n0,$m1 mulq $m0 # ap[0]*bp[i] add %rax,$A[0] # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx imulq $A[0],$m1 # tp[0]*n0 mov %rdx,$A[1] mov $N[1],($tp) # store upmost overflow bit lea ($tp,$num),$tp # rewind $tp mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded mov 8($ap,$num),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] # +tp[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov 16($ap,$num),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] lea 4*8($num),$j # j=4 lea 8*4($np),$np adc \$0,%rdx mov %rdx,$N[0] jmp .Linner4x .align 32 .Linner4x: mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[1],-32($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov -8*1($np),%rax adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov 8*0($np),%rax adc \$0,%rdx add 
($tp),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] lea 8*4($np),$np adc \$0,%rdx mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[0] add \$32,$j # j+=4 jnz .Linner4x mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[1],-32($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov $m1,%rax mov -8*1($np),$m1 adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$num),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] mov $N[1],-16($tp) # tp[j-1] lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] add ($tp),$N[0] # pull upmost overflow bit adc \$0,$N[1] # upmost overflow bit mov $N[0],-8($tp) cmp 16+8(%rsp),$bp jb .Louter4x ___ if (1) { $code.=<<___; xor %rax,%rax sub $N[0],$m1 # compare top-most words adc $j,$j # $j is zero or $j,$N[1] sub $N[1],%rax # %rax=-$N[1] lea ($tp,$num),%rbx # tptr in .sqr4x_sub mov ($np),%r12 lea ($np),%rbp # nptr in .sqr4x_sub mov %r9,%rcx sar \$3+2,%rcx mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub dec %r12 # so that after 'not' we get -n[0] xor %r10,%r10 mov 8*1(%rbp),%r13 mov 8*2(%rbp),%r14 mov 8*3(%rbp),%r15 jmp .Lsqr4x_sub_entry ___ } else { my @ri=("%rax",$bp,$m0,$m1); my $rp="%rdx"; $code.=<<___ xor \$1,$N[1] lea ($tp,$num),$tp # rewind $tp sar \$5,$num # cf=0 lea ($np,$N[1],8),$np mov 56+8(%rsp),$rp # restore $rp jmp .Lsub4x .align 32 .Lsub4x: .byte 0x66 mov 
8*0($tp),@ri[0] mov 8*1($tp),@ri[1] .byte 0x66 sbb 16*0($np),@ri[0] mov 8*2($tp),@ri[2] sbb 16*1($np),@ri[1] mov 3*8($tp),@ri[3] lea 4*8($tp),$tp sbb 16*2($np),@ri[2] mov @ri[0],8*0($rp) sbb 16*3($np),@ri[3] lea 16*4($np),$np mov @ri[1],8*1($rp) mov @ri[2],8*2($rp) mov @ri[3],8*3($rp) lea 8*4($rp),$rp inc $num jnz .Lsub4x ret ___ } $code.=<<___; .cfi_endproc .size mul4x_internal,.-mul4x_internal ___ }}} {{{ ###################################################################### # void bn_power5( my $rptr="%rdi"; # BN_ULONG *rptr, my $aptr="%rsi"; # const BN_ULONG *aptr, my $bptr="%rdx"; # const void *table, my $nptr="%rcx"; # const BN_ULONG *nptr, my $n0 ="%r8"; # const BN_ULONG *n0); my $num ="%r9"; # int num, has to be divisible by 8 # int pwr my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); my @A0=("%r10","%r11"); my @A1=("%r12","%r13"); my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); $code.=<<___; .globl bn_power5 .type bn_power5,\@function,6 .align 32 bn_power5: .cfi_startproc mov %rsp,%rax .cfi_def_cfa_register %rax ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d and \$0x80108,%r11d cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 je .Lpowerx5_enter ___ $code.=<<___; push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lpower5_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10d # 3*$num neg $num mov ($n0),$n0 # *n0 ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). This is done to allow memory disambiguation # logic do its magic. [Extra 256 bytes is for power mask # calculated from 7th argument, the index.] 
# lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwr_sp_alt sub %r11,%rbp # align with $aptr lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lpwr_sp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lpwr_page_walk jmp .Lpwr_page_walk_done .Lpwr_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lpwr_page_walk .Lpwr_page_walk_done: mov $num,%r10 neg $num ############################################################## # Stack layout # # +0 saved $num, used in reduction section # +8 &t[2*$num], used in reduction section # +32 saved *n0 # +40 saved %rsp # +48 t[2*$num] # mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lpower5_body: movq $rptr,%xmm1 # save $rptr, used in sqr8x movq $nptr,%xmm2 # save $nptr movq %r10, %xmm3 # -$num, used in sqr8x movq $bptr,%xmm4 call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal movq %xmm2,$nptr movq %xmm4,$bptr mov $aptr,$rptr mov 40(%rsp),%rax lea 32(%rsp),$n0 call mul4x_internal mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpower5_epilogue: ret .cfi_endproc .size bn_power5,.-bn_power5 .globl bn_sqr8x_internal .hidden bn_sqr8x_internal .type bn_sqr8x_internal,\@abi-omnipotent 
.align 32 bn_sqr8x_internal: __bn_sqr8x_internal: .cfi_startproc ############################################################## # Squaring part: # # a) multiply-n-add everything but a[i]*a[i]; # b) shift result of a) by 1 to the left and accumulate # a[i]*a[i] products; # ############################################################## # a[1]a[0] # a[2]a[0] # a[3]a[0] # a[2]a[1] # a[4]a[0] # a[3]a[1] # a[5]a[0] # a[4]a[1] # a[3]a[2] # a[6]a[0] # a[5]a[1] # a[4]a[2] # a[7]a[0] # a[6]a[1] # a[5]a[2] # a[4]a[3] # a[7]a[1] # a[6]a[2] # a[5]a[3] # a[7]a[2] # a[6]a[3] # a[5]a[4] # a[7]a[3] # a[6]a[4] # a[7]a[4] # a[6]a[5] # a[7]a[5] # a[7]a[6] # a[1]a[0] # a[2]a[0] # a[3]a[0] # a[4]a[0] # a[5]a[0] # a[6]a[0] # a[7]a[0] # a[2]a[1] # a[3]a[1] # a[4]a[1] # a[5]a[1] # a[6]a[1] # a[7]a[1] # a[3]a[2] # a[4]a[2] # a[5]a[2] # a[6]a[2] # a[7]a[2] # a[4]a[3] # a[5]a[3] # a[6]a[3] # a[7]a[3] # a[5]a[4] # a[6]a[4] # a[7]a[4] # a[6]a[5] # a[7]a[5] # a[7]a[6] # a[0]a[0] # a[1]a[1] # a[2]a[2] # a[3]a[3] # a[4]a[4] # a[5]a[5] # a[6]a[6] # a[7]a[7] lea 32(%r10),$i # $i=-($num-32) lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] mov $num,$j # $j=$num # comments apply to $num==8 case mov -32($aptr,$i),$a0 # a[0] lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] mov -24($aptr,$i),%rax # a[1] lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] mov -16($aptr,$i),$ai # a[2] mov %rax,$a1 mul $a0 # a[1]*a[0] mov %rax,$A0[0] # a[1]*a[0] mov $ai,%rax # a[2] mov %rdx,$A0[1] mov $A0[0],-24($tptr,$i) # t[1] mul $a0 # a[2]*a[0] add %rax,$A0[1] mov $ai,%rax adc \$0,%rdx mov $A0[1],-16($tptr,$i) # t[2] mov %rdx,$A0[0] mov -8($aptr,$i),$ai # a[3] mul $a1 # a[2]*a[1] mov %rax,$A1[0] # a[2]*a[1]+t[3] mov $ai,%rax mov %rdx,$A1[1] lea ($i),$j mul $a0 # a[3]*a[0] add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] mov $ai,%rax mov %rdx,$A0[1] adc \$0,$A0[1] add $A1[0],$A0[0] adc \$0,$A0[1] mov $A0[0],-8($tptr,$j) # t[3] jmp .Lsqr4x_1st .align 32 .Lsqr4x_1st: mov ($aptr,$j),$ai # a[4] 
mul $a1 # a[3]*a[1] add %rax,$A1[1] # a[3]*a[1]+t[4] mov $ai,%rax mov %rdx,$A1[0] adc \$0,$A1[0] mul $a0 # a[4]*a[0] add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] mov $ai,%rax # a[3] mov 8($aptr,$j),$ai # a[5] mov %rdx,$A0[0] adc \$0,$A0[0] add $A1[1],$A0[1] adc \$0,$A0[0] mul $a1 # a[4]*a[3] add %rax,$A1[0] # a[4]*a[3]+t[5] mov $ai,%rax mov $A0[1],($tptr,$j) # t[4] mov %rdx,$A1[1] adc \$0,$A1[1] mul $a0 # a[5]*a[2] add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] mov $ai,%rax mov 16($aptr,$j),$ai # a[6] mov %rdx,$A0[1] adc \$0,$A0[1] add $A1[0],$A0[0] adc \$0,$A0[1] mul $a1 # a[5]*a[3] add %rax,$A1[1] # a[5]*a[3]+t[6] mov $ai,%rax mov $A0[0],8($tptr,$j) # t[5] mov %rdx,$A1[0] adc \$0,$A1[0] mul $a0 # a[6]*a[2] add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] mov $ai,%rax # a[3] mov 24($aptr,$j),$ai # a[7] mov %rdx,$A0[0] adc \$0,$A0[0] add $A1[1],$A0[1] adc \$0,$A0[0] mul $a1 # a[6]*a[5] add %rax,$A1[0] # a[6]*a[5]+t[7] mov $ai,%rax mov $A0[1],16($tptr,$j) # t[6] mov %rdx,$A1[1] adc \$0,$A1[1] lea 32($j),$j mul $a0 # a[7]*a[4] add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] mov $ai,%rax mov %rdx,$A0[1] adc \$0,$A0[1] add $A1[0],$A0[0] adc \$0,$A0[1] mov $A0[0],-8($tptr,$j) # t[7] cmp \$0,$j jne .Lsqr4x_1st mul $a1 # a[7]*a[5] add %rax,$A1[1] lea 16($i),$i adc \$0,%rdx add $A0[1],$A1[1] adc \$0,%rdx mov $A1[1],($tptr) # t[8] mov %rdx,$A1[0] mov %rdx,8($tptr) # t[9] jmp .Lsqr4x_outer .align 32 .Lsqr4x_outer: # comments apply to $num==6 case mov -32($aptr,$i),$a0 # a[0] lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] mov -24($aptr,$i),%rax # a[1] lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] mov -16($aptr,$i),$ai # a[2] mov %rax,$a1 mul $a0 # a[1]*a[0] mov -24($tptr,$i),$A0[0] # t[1] add %rax,$A0[0] # a[1]*a[0]+t[1] mov $ai,%rax # a[2] adc \$0,%rdx mov $A0[0],-24($tptr,$i) # t[1] mov %rdx,$A0[1] mul $a0 # a[2]*a[0] add %rax,$A0[1] mov $ai,%rax adc \$0,%rdx add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] mov %rdx,$A0[0] adc \$0,$A0[0] mov $A0[1],-16($tptr,$i) # 
t[2] xor $A1[0],$A1[0] mov -8($aptr,$i),$ai # a[3] mul $a1 # a[2]*a[1] add %rax,$A1[0] # a[2]*a[1]+t[3] mov $ai,%rax adc \$0,%rdx add -8($tptr,$i),$A1[0] mov %rdx,$A1[1] adc \$0,$A1[1] mul $a0 # a[3]*a[0] add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] mov $ai,%rax adc \$0,%rdx add $A1[0],$A0[0] mov %rdx,$A0[1] adc \$0,$A0[1] mov $A0[0],-8($tptr,$i) # t[3] lea ($i),$j jmp .Lsqr4x_inner .align 32 .Lsqr4x_inner: mov ($aptr,$j),$ai # a[4] mul $a1 # a[3]*a[1] add %rax,$A1[1] # a[3]*a[1]+t[4] mov $ai,%rax mov %rdx,$A1[0] adc \$0,$A1[0] add ($tptr,$j),$A1[1] adc \$0,$A1[0] .byte 0x67 mul $a0 # a[4]*a[0] add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] mov $ai,%rax # a[3] mov 8($aptr,$j),$ai # a[5] mov %rdx,$A0[0] adc \$0,$A0[0] add $A1[1],$A0[1] adc \$0,$A0[0] mul $a1 # a[4]*a[3] add %rax,$A1[0] # a[4]*a[3]+t[5] mov $A0[1],($tptr,$j) # t[4] mov $ai,%rax mov %rdx,$A1[1] adc \$0,$A1[1] add 8($tptr,$j),$A1[0] lea 16($j),$j # j++ adc \$0,$A1[1] mul $a0 # a[5]*a[2] add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] mov $ai,%rax adc \$0,%rdx add $A1[0],$A0[0] mov %rdx,$A0[1] adc \$0,$A0[1] mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below cmp \$0,$j jne .Lsqr4x_inner .byte 0x67 mul $a1 # a[5]*a[3] add %rax,$A1[1] adc \$0,%rdx add $A0[1],$A1[1] adc \$0,%rdx mov $A1[1],($tptr) # t[6], "preloaded t[2]" below mov %rdx,$A1[0] mov %rdx,8($tptr) # t[7], "preloaded t[3]" below add \$16,$i jnz .Lsqr4x_outer # comments apply to $num==4 case mov -32($aptr),$a0 # a[0] lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] mov -24($aptr),%rax # a[1] lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] mov -16($aptr),$ai # a[2] mov %rax,$a1 mul $a0 # a[1]*a[0] add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] mov $ai,%rax # a[2] mov %rdx,$A0[1] adc \$0,$A0[1] mul $a0 # a[2]*a[0] add %rax,$A0[1] mov $ai,%rax mov $A0[0],-24($tptr) # t[1] mov %rdx,$A0[0] adc \$0,$A0[0] add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] mov -8($aptr),$ai # a[3] adc \$0,$A0[0] mul $a1 # a[2]*a[1] add 
%rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] mov $ai,%rax mov $A0[1],-16($tptr) # t[2] mov %rdx,$A1[1] adc \$0,$A1[1] mul $a0 # a[3]*a[0] add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] mov $ai,%rax mov %rdx,$A0[1] adc \$0,$A0[1] add $A1[0],$A0[0] adc \$0,$A0[1] mov $A0[0],-8($tptr) # t[3] mul $a1 # a[3]*a[1] add %rax,$A1[1] mov -16($aptr),%rax # a[2] adc \$0,%rdx add $A0[1],$A1[1] adc \$0,%rdx mov $A1[1],($tptr) # t[4] mov %rdx,$A1[0] mov %rdx,8($tptr) # t[5] mul $ai # a[2]*a[3] ___ { my ($shift,$carry)=($a0,$a1); my @S=(@A1,$ai,$n0); $code.=<<___; add \$16,$i xor $shift,$shift sub $num,$i # $i=16-$num xor $carry,$carry add $A1[0],%rax # t[5] adc \$0,%rdx mov %rax,8($tptr) # t[5] mov %rdx,16($tptr) # t[6] mov $carry,24($tptr) # t[7] mov -16($aptr,$i),%rax # a[0] lea 48+8(%rsp),$tptr xor $A0[0],$A0[0] # t[0] mov 8($tptr),$A0[1] # t[1] lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov -8($aptr,$i),%rax # a[i+1] # prefetch mov $S[0],($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift mov $S[1],8($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[2] mov 0($aptr,$i),%rax # a[i+1] # prefetch mov $S[2],16($tptr) adc %rdx,$S[3] lea 16($i),$i mov $S[3],24($tptr) sbb $carry,$carry # mov cf,$carry lea 64($tptr),$tptr jmp .Lsqr4x_shift_n_add .align 32 .Lsqr4x_shift_n_add: lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or 
$A0[0],$S[1] # | t[2*i]>>63 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov -8($aptr,$i),%rax # a[i+1] # prefetch mov $S[0],-32($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift mov $S[1],-24($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[2] mov 0($aptr,$i),%rax # a[i+1] # prefetch mov $S[2],-16($tptr) adc %rdx,$S[3] lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift mov $S[3],-8($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov 8($aptr,$i),%rax # a[i+1] # prefetch mov $S[0],0($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift mov $S[1],8($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[2] mov 16($aptr,$i),%rax # a[i+1] # prefetch mov $S[2],16($tptr) adc %rdx,$S[3] mov $S[3],24($tptr) sbb $carry,$carry # mov cf,$carry lea 64($tptr),$tptr add \$32,$i jnz .Lsqr4x_shift_n_add lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift .byte 0x67 shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 mov 
-16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov -8($aptr),%rax # a[i+1] # prefetch mov $S[0],-32($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift mov $S[1],-24($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf adc %rax,$S[2] adc %rdx,$S[3] mov $S[2],-16($tptr) mov $S[3],-8($tptr) ___ } ###################################################################### # Montgomery reduction part, "word-by-word" algorithm. # # This new path is inspired by multiple submissions from Intel, by # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, # Vinodh Gopal... { my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); $code.=<<___; movq %xmm2,$nptr __bn_sqr8x_reduction: xor %rax,%rax lea ($nptr,$num),%rcx # end of n[] lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer mov %rcx,0+8(%rsp) lea 48+8(%rsp,$num),$tptr # end of initial t[] window mov %rdx,8+8(%rsp) neg $num jmp .L8x_reduction_loop .align 32 .L8x_reduction_loop: lea ($tptr,$num),$tptr # start of current t[] window .byte 0x66 mov 8*0($tptr),$m0 mov 8*1($tptr),%r9 mov 8*2($tptr),%r10 mov 8*3($tptr),%r11 mov 8*4($tptr),%r12 mov 8*5($tptr),%r13 mov 8*6($tptr),%r14 mov 8*7($tptr),%r15 mov %rax,(%rdx) # store top-most carry bit lea 8*8($tptr),$tptr .byte 0x67 mov $m0,%r8 imulq 32+8(%rsp),$m0 # n0*a[0] mov 8*0($nptr),%rax # n[0] mov \$8,%ecx jmp .L8x_reduce .align 32 .L8x_reduce: mulq $m0 mov 8*1($nptr),%rax # n[1] neg %r8 mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] mov %rdx,%r9 adc \$0,%r9 mulq $m0 add %rax,%r10 mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov 32+8(%rsp),$carry # pull n0, borrow $carry mov %rdx,%r10 adc 
\$0,%r10 mulq $m0 add %rax,%r11 mov 8*4($nptr),%rax adc \$0,%rdx imulq %r8,$carry # modulo-scheduled add %r11,%r10 mov %rdx,%r11 adc \$0,%r11 mulq $m0 add %rax,%r12 mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 adc \$0,%r12 mulq $m0 add %rax,%r13 mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 adc \$0,%r13 mulq $m0 add %rax,%r14 mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 adc \$0,%r14 mulq $m0 mov $carry,$m0 # n0*a[i] add %rax,%r15 mov 8*0($nptr),%rax # n[0] adc \$0,%rdx add %r15,%r14 mov %rdx,%r15 adc \$0,%r15 dec %ecx jnz .L8x_reduce lea 8*8($nptr),$nptr xor %rax,%rax mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? jae .L8x_no_tail .byte 0x66 add 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 sbb $carry,$carry # top carry mov 48+56+8(%rsp),$m0 # pull n0*a[0] mov \$8,%ecx mov 8*0($nptr),%rax jmp .L8x_tail .align 32 .L8x_tail: mulq $m0 add %rax,%r8 mov 8*1($nptr),%rax mov %r8,($tptr) # save result mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 lea 8($tptr),$tptr # $tptr++ mov %rdx,%r9 adc \$0,%r9 mulq $m0 add %rax,%r10 mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov %rdx,%r10 adc \$0,%r10 mulq $m0 add %rax,%r11 mov 8*4($nptr),%rax adc \$0,%rdx add %r11,%r10 mov %rdx,%r11 adc \$0,%r11 mulq $m0 add %rax,%r12 mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 adc \$0,%r12 mulq $m0 add %rax,%r13 mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 adc \$0,%r13 mulq $m0 add %rax,%r14 mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 adc \$0,%r14 mulq $m0 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] add %rax,%r15 adc \$0,%rdx add %r15,%r14 mov 8*0($nptr),%rax # pull n[0] mov %rdx,%r15 adc \$0,%r15 dec %ecx jnz .L8x_tail lea 8*8($nptr),$nptr mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? 
jae .L8x_tail_done # break out of loop mov 48+56+8(%rsp),$m0 # pull n0*a[0] neg $carry mov 8*0($nptr),%rax # pull n[0] adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 sbb $carry,$carry # top carry mov \$8,%ecx jmp .L8x_tail .align 32 .L8x_tail_done: xor %rax,%rax add (%rdx),%r8 # can this overflow? adc \$0,%r9 adc \$0,%r10 adc \$0,%r11 adc \$0,%r12 adc \$0,%r13 adc \$0,%r14 adc \$0,%r15 adc \$0,%rax neg $carry .L8x_no_tail: adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 adc \$0,%rax # top-most carry mov -8($nptr),%rcx # np[num-1] xor $carry,$carry movq %xmm2,$nptr # restore $nptr mov %r8,8*0($tptr) # store top 512 bits mov %r9,8*1($tptr) movq %xmm3,$num # $num is %r9, can't be moved upwards mov %r10,8*2($tptr) mov %r11,8*3($tptr) mov %r12,8*4($tptr) mov %r13,8*5($tptr) mov %r14,8*6($tptr) mov %r15,8*7($tptr) lea 8*8($tptr),$tptr cmp %rdx,$tptr # end of t[]? 
jb .L8x_reduction_loop ret .cfi_endproc .size bn_sqr8x_internal,.-bn_sqr8x_internal ___ } ############################################################## # Post-condition, 4x unrolled # { my ($tptr,$nptr)=("%rbx","%rbp"); $code.=<<___; .type __bn_post4x_internal,\@abi-omnipotent .align 32 __bn_post4x_internal: .cfi_startproc mov 8*0($nptr),%r12 lea (%rdi,$num),$tptr # %rdi was $tptr above mov $num,%rcx movq %xmm1,$rptr # restore $rptr neg %rax movq %xmm1,$aptr # prepare for back-to-back call sar \$3+2,%rcx dec %r12 # so that after 'not' we get -n[0] xor %r10,%r10 mov 8*1($nptr),%r13 mov 8*2($nptr),%r14 mov 8*3($nptr),%r15 jmp .Lsqr4x_sub_entry .align 16 .Lsqr4x_sub: mov 8*0($nptr),%r12 mov 8*1($nptr),%r13 mov 8*2($nptr),%r14 mov 8*3($nptr),%r15 .Lsqr4x_sub_entry: lea 8*4($nptr),$nptr not %r12 not %r13 not %r14 not %r15 and %rax,%r12 and %rax,%r13 and %rax,%r14 and %rax,%r15 neg %r10 # mov %r10,%cf adc 8*0($tptr),%r12 adc 8*1($tptr),%r13 adc 8*2($tptr),%r14 adc 8*3($tptr),%r15 mov %r12,8*0($rptr) lea 8*4($tptr),$tptr mov %r13,8*1($rptr) sbb %r10,%r10 # mov %cf,%r10 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx # pass %cf jnz .Lsqr4x_sub mov $num,%r10 # prepare for back-to-back call neg $num # restore $num ret .cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal ___ } { $code.=<<___; .globl bn_from_montgomery .type bn_from_montgomery,\@abi-omnipotent .align 32 bn_from_montgomery: .cfi_startproc testl \$7,`($win64?"48(%rsp)":"%r9d")` jz bn_from_mont8x xor %eax,%eax ret .cfi_endproc .size bn_from_montgomery,.-bn_from_montgomery .type bn_from_mont8x,\@function,6 .align 32 bn_from_mont8x: .cfi_startproc .byte 0x67 mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lfrom_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 
############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). The stack is allocated to align with # bn_power5's frame, and as bn_from_montgomery happens to be # the last operation, we use the opportunity to cleanse it. # lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lfrom_sp_alt sub %r11,%rbp # align with $aptr lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lfrom_sp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lfrom_page_walk jmp .Lfrom_page_walk_done .Lfrom_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lfrom_page_walk .Lfrom_page_walk_done: mov $num,%r10 neg $num ############################################################## # Stack layout # # +0 saved $num, used in reduction section # +8 &t[2*$num], used in reduction section # +32 saved *n0 # +40 saved %rsp # +48 t[2*$num] # mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lfrom_body: mov $num,%r11 lea 48(%rsp),%rax pxor %xmm0,%xmm0 jmp .Lmul_by_1 .align 32 .Lmul_by_1: movdqu ($aptr),%xmm1 movdqu 16($aptr),%xmm2 movdqu 32($aptr),%xmm3 movdqa %xmm0,(%rax,$num) movdqu 48($aptr),%xmm4 movdqa %xmm0,16(%rax,$num) .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr movdqa %xmm1,(%rax) movdqa %xmm0,32(%rax,$num) movdqa %xmm2,16(%rax) movdqa %xmm0,48(%rax,$num) movdqa %xmm3,32(%rax) movdqa %xmm4,48(%rax) lea 64(%rax),%rax sub \$64,%r11 jnz .Lmul_by_1 movq $rptr,%xmm1 movq $nptr,%xmm2 .byte 0x67 mov $nptr,%rbp movq %r10, %xmm3 # -num ___ $code.=<<___ if ($addx); mov
OPENSSL_ia32cap_P+8(%rip),%r11d and \$0x80108,%r11d cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 jne .Lfrom_mont_nox lea (%rax,$num),$rptr call __bn_sqrx8x_reduction call __bn_postx4x_internal pxor %xmm0,%xmm0 lea 48(%rsp),%rax jmp .Lfrom_mont_zero .align 32 .Lfrom_mont_nox: ___ $code.=<<___; call __bn_sqr8x_reduction call __bn_post4x_internal pxor %xmm0,%xmm0 lea 48(%rsp),%rax jmp .Lfrom_mont_zero .align 32 .Lfrom_mont_zero: mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 movdqa %xmm0,16*0(%rax) movdqa %xmm0,16*1(%rax) movdqa %xmm0,16*2(%rax) movdqa %xmm0,16*3(%rax) lea 16*4(%rax),%rax sub \$32,$num jnz .Lfrom_mont_zero mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lfrom_epilogue: ret .cfi_endproc .size bn_from_mont8x,.-bn_from_mont8x ___ } }}} if ($addx) {{{ my $bp="%rdx"; # restore original value $code.=<<___; .type bn_mulx4x_mont_gather5,\@function,6 .align 32 bn_mulx4x_mont_gather5: .cfi_startproc mov %rsp,%rax .cfi_def_cfa_register %rax .Lmulx4x_enter: push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num mov ($n0),$n0 # *n0 ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). This is done to allow memory disambiguation # logic to do its magic. [Extra [num] is allocated in order # to align with bn_power5's frame, which is cleansed after # completing exponentiation. Extra 256 bytes is for power mask # calculated from 7th argument, the index.]
# lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmulx4xsp_alt sub %r11,%rbp # align with $aptr lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lmulx4xsp_done .Lmulx4xsp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lmulx4xsp_done: and \$-64,%rbp # ensure alignment mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmulx4x_page_walk jmp .Lmulx4x_page_walk_done .Lmulx4x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmulx4x_page_walk .Lmulx4x_page_walk_done: ############################################################## # Stack layout # +0 -num # +8 off-loaded &b[i] # +16 end of b[num] # +24 inner counter # +32 saved n0 # +40 saved %rsp # +48 # +56 saved rp # +64 tmp[num+1] # mov $n0, 32(%rsp) # save *n0 mov %rax,40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lmulx4x_body: call mulx4x_internal mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmulx4x_epilogue: ret .cfi_endproc .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 .type mulx4x_internal,\@abi-omnipotent .align 32 mulx4x_internal: .cfi_startproc mov $num,8(%rsp) # save -$num (it was in bytes) mov $num,%r10 neg $num # restore $num shl \$5,$num neg %r10 # restore $num lea 128($bp,$num),%r13 # end of powers table (+size optimization) shr \$5+5,$num movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument sub \$1,$num lea .Linc(%rip),%rax mov %r13,16+8(%rsp) # end of b[num] mov $num,24+8(%rsp) # inner counter mov $rp, 56+8(%rsp) # save $rp ___ my ($aptr, $bptr, $nptr, $tptr, $mi, 
$bi, $zero, $num)= ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); my $rptr=$bptr; my $STRIDE=2**5*8; # 5 is "window size" my $N=$STRIDE/4; # should match cache line size $code.=<<___; movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization) lea 128($bp),$bptr # size optimization pshufd \$0,%xmm5,%xmm5 # broadcast index movdqa %xmm1,%xmm4 .byte 0x67 movdqa %xmm1,%xmm2 ___ ######################################################################## # calculate mask by comparing 0..31 to index and save result to stack # $code.=<<___; .byte 0x67 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 # compare to 1,0 movdqa %xmm4,%xmm3 ___ for($i=0;$i<$STRIDE/16-4;$i+=4) { $code.=<<___; paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 # compare to 3,2 movdqa %xmm0,`16*($i+0)+112`(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 # compare to 5,4 movdqa %xmm1,`16*($i+1)+112`(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 # compare to 7,6 movdqa %xmm2,`16*($i+2)+112`(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,`16*($i+3)+112`(%r10) movdqa %xmm4,%xmm3 ___ } $code.=<<___; # last iteration can be optimized .byte 0x67 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,`16*($i+0)+112`(%r10) paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,`16*($i+1)+112`(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,`16*($i+2)+112`(%r10) pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register pand `16*($i+1)-128`($bptr),%xmm1 pand `16*($i+2)-128`($bptr),%xmm2 movdqa %xmm3,`16*($i+3)+112`(%r10) pand `16*($i+3)-128`($bptr),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ for($i=0;$i<$STRIDE/16-4;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`($bptr),%xmm4 movdqa `16*($i+1)-128`($bptr),%xmm5 movdqa `16*($i+2)-128`($bptr),%xmm2 pand `16*($i+0)+112`(%r10),%xmm4 movdqa `16*($i+3)-128`($bptr),%xmm3 pand `16*($i+1)+112`(%r10),%xmm5 
por %xmm4,%xmm0 pand `16*($i+2)+112`(%r10),%xmm2 por %xmm5,%xmm1 pand `16*($i+3)+112`(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ } $code.=<<___; pxor %xmm1,%xmm0 pshufd \$0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 lea $STRIDE($bptr),$bptr movq %xmm0,%rdx # bp[0] lea 64+8*4+8(%rsp),$tptr mov %rdx,$bi mulx 0*8($aptr),$mi,%rax # a[0]*b[0] mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] add %rax,%r11 mulx 2*8($aptr),%rax,%r13 # ... adc %rax,%r12 adc \$0,%r13 mulx 3*8($aptr),%rax,%r14 mov $mi,%r15 imulq 32+8(%rsp),$mi # "t[0]"*n0 xor $zero,$zero # cf=0, of=0 mov $mi,%rdx mov $bptr,8+8(%rsp) # off-load &b[i] lea 4*8($aptr),$aptr adcx %rax,%r13 adcx $zero,%r14 # cf=0 mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 mov 24+8(%rsp),$bptr # counter value mov %r10,-8*4($tptr) adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-8*3($tptr) adcx %rax,%r12 adox $zero,%r15 # of=0 lea 4*8($nptr),$nptr mov %r12,-8*2($tptr) jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: adcx $zero,%r15 # cf=0, modulo-scheduled mulx 0*8($aptr),%r10,%rax # a[4]*b[0] adcx %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] adcx %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... 
adcx %r14,%r12 mulx 3*8($aptr),%r13,%r14 .byte 0x67,0x67 mov $mi,%rdx adcx %rax,%r13 adcx $zero,%r14 # cf=0 lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adox %r15,%r10 mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 mov %r11,-4*8($tptr) adox %r15,%r13 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_1st mov 8(%rsp),$num # load -num adc $zero,%r15 # modulo-scheduled lea ($aptr,$num),$aptr # rewind $aptr add %r15,%r14 mov 8+8(%rsp),$bptr # re-load &b[i] adc $zero,$zero # top-most carry mov %r14,-1*8($tptr) jmp .Lmulx4x_outer .align 32 .Lmulx4x_outer: lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) pxor %xmm4,%xmm4 .byte 0x67,0x67 pxor %xmm5,%xmm5 ___ for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`($bptr),%xmm0 movdqa `16*($i+1)-128`($bptr),%xmm1 movdqa `16*($i+2)-128`($bptr),%xmm2 pand `16*($i+0)+256`(%r10),%xmm0 movdqa `16*($i+3)-128`($bptr),%xmm3 pand `16*($i+1)+256`(%r10),%xmm1 por %xmm0,%xmm4 pand `16*($i+2)+256`(%r10),%xmm2 por %xmm1,%xmm5 pand `16*($i+3)+256`(%r10),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 ___ } $code.=<<___; por %xmm5,%xmm4 pshufd \$0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 lea $STRIDE($bptr),$bptr movq %xmm0,%rdx # m0=bp[i] mov $zero,($tptr) # save top-most carry lea 4*8($tptr,$num),$tptr # rewind $tptr mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] xor $zero,$zero # cf=0, of=0 mov %rdx,$bi mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] adox -4*8($tptr),$mi # +t[0] adcx %r14,%r11 mulx 2*8($aptr),%r15,%r13 # ... 
adox -3*8($tptr),%r11 adcx %r15,%r12 mulx 3*8($aptr),%rdx,%r14 adox -2*8($tptr),%r12 adcx %rdx,%r13 lea ($nptr,$num),$nptr # rewind $nptr lea 4*8($aptr),$aptr adox -1*8($tptr),%r13 adcx $zero,%r14 adox $zero,%r14 mov $mi,%r15 imulq 32+8(%rsp),$mi # "t[0]"*n0 mov $mi,%rdx xor $zero,$zero # cf=0, of=0 mov $bptr,8+8(%rsp) # off-load &b[i] mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov 24+8(%rsp),$bptr # counter value mov %r10,-8*4($tptr) adcx %rax,%r12 mov %r11,-8*3($tptr) adox $zero,%r15 # of=0 mov %r12,-8*2($tptr) lea 4*8($nptr),$nptr jmp .Lmulx4x_inner .align 32 .Lmulx4x_inner: mulx 0*8($aptr),%r10,%rax # a[4]*b[i] adcx $zero,%r15 # cf=0, modulo-scheduled adox %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] adcx 0*8($tptr),%r10 adox %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... adcx 1*8($tptr),%r11 adox %r14,%r12 mulx 3*8($aptr),%r13,%r14 mov $mi,%rdx adcx 2*8($tptr),%r12 adox %rax,%r13 adcx 3*8($tptr),%r13 adox $zero,%r14 # of=0 lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adcx $zero,%r14 # cf=0 adox %r15,%r10 mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 adox %r15,%r13 mov %r11,-4*8($tptr) mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx lea 4*8($nptr),$nptr mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_inner mov 0+8(%rsp),$num # load -num adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$bptr # pull top-most carry to %cf mov 8+8(%rsp),$bptr # re-load &b[i] mov 16+8(%rsp),%r10 adc %r15,%r14 lea ($aptr,$num),$aptr # rewind $aptr adc $zero,$zero # top-most carry mov %r14,-1*8($tptr) cmp %r10,$bptr jb .Lmulx4x_outer mov -8($nptr),%r10 mov $zero,%r8 mov ($nptr,$num),%r12 lea ($nptr,$num),%rbp # rewind $nptr mov $num,%rcx 
lea ($tptr,$num),%rdi # rewind $tptr xor %eax,%eax xor %r15,%r15 sub %r14,%r10 # compare top-most words adc %r15,%r15 or %r15,%r8 sar \$3+2,%rcx sub %r8,%rax # %rax=-%r8 mov 56+8(%rsp),%rdx # restore rp dec %r12 # so that after 'not' we get -n[0] mov 8*1(%rbp),%r13 xor %r8,%r8 mov 8*2(%rbp),%r14 mov 8*3(%rbp),%r15 jmp .Lsqrx4x_sub_entry # common post-condition .cfi_endproc .size mulx4x_internal,.-mulx4x_internal ___ } { ###################################################################### # void bn_power5( my $rptr="%rdi"; # BN_ULONG *rptr, my $aptr="%rsi"; # const BN_ULONG *aptr, my $bptr="%rdx"; # const void *table, my $nptr="%rcx"; # const BN_ULONG *nptr, my $n0 ="%r8"; # const BN_ULONG *n0); my $num ="%r9"; # int num, has to be divisible by 8 # int pwr); my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); my @A0=("%r10","%r11"); my @A1=("%r12","%r13"); my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); $code.=<<___; .type bn_powerx5,\@function,6 .align 32 bn_powerx5: .cfi_startproc mov %rsp,%rax .cfi_def_cfa_register %rax .Lpowerx5_enter: push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lpowerx5_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). This is done to allow memory disambiguation # logic to do its magic. [Extra 256 bytes is for power mask # calculated from 7th argument, the index.]
# lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwrx_sp_alt sub %r11,%rbp # align with $aptr lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lpwrx_sp_done .align 32 .Lpwrx_sp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lpwrx_sp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lpwrx_page_walk jmp .Lpwrx_page_walk_done .Lpwrx_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lpwrx_page_walk .Lpwrx_page_walk_done: mov $num,%r10 neg $num ############################################################## # Stack layout # # +0 saved $num, used in reduction section # +8 &t[2*$num], used in reduction section # +16 intermediate carry bit # +24 top-most carry bit, used in reduction section # +32 saved *n0 # +40 saved %rsp # +48 t[2*$num] # pxor %xmm0,%xmm0 movq $rptr,%xmm1 # save $rptr movq $nptr,%xmm2 # save $nptr movq %r10, %xmm3 # -$num movq $bptr,%xmm4 mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lpowerx5_body: call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal mov %r10,$num # -num mov $aptr,$rptr movq %xmm2,$nptr movq %xmm4,$bptr mov 40(%rsp),%rax call mulx4x_internal mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpowerx5_epilogue: ret .cfi_endproc .size bn_powerx5,.-bn_powerx5 .globl 
bn_sqrx8x_internal .hidden bn_sqrx8x_internal .type bn_sqrx8x_internal,\@abi-omnipotent .align 32 bn_sqrx8x_internal: __bn_sqrx8x_internal: .cfi_startproc ################################################################## # Squaring part: # # a) multiply-n-add everything but a[i]*a[i]; # b) shift result of a) by 1 to the left and accumulate # a[i]*a[i] products; # ################################################################## # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] # a[1]a[0] # a[2]a[0] # a[3]a[0] # a[2]a[1] # a[3]a[1] # a[3]a[2] # # a[4]a[0] # a[5]a[0] # a[6]a[0] # a[7]a[0] # a[4]a[1] # a[5]a[1] # a[6]a[1] # a[7]a[1] # a[4]a[2] # a[5]a[2] # a[6]a[2] # a[7]a[2] # a[4]a[3] # a[5]a[3] # a[6]a[3] # a[7]a[3] # # a[5]a[4] # a[6]a[4] # a[7]a[4] # a[6]a[5] # a[7]a[5] # a[7]a[6] # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] ___ { my ($zero,$carry)=("%rbp","%rcx"); my $aaptr=$zero; $code.=<<___; lea 48+8(%rsp),$tptr lea ($aptr,$num),$aaptr mov $num,0+8(%rsp) # save $num mov $aaptr,8+8(%rsp) # save end of $aptr jmp .Lsqr8x_zero_start .align 32 .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 .Lsqrx8x_zero: .byte 0x3e movdqa %xmm0,0*8($tptr) movdqa %xmm0,2*8($tptr) movdqa %xmm0,4*8($tptr) movdqa %xmm0,6*8($tptr) .Lsqr8x_zero_start: # aligned at 32 movdqa %xmm0,8*8($tptr) movdqa %xmm0,10*8($tptr) movdqa %xmm0,12*8($tptr) movdqa %xmm0,14*8($tptr) lea 16*8($tptr),$tptr sub \$64,$num jnz .Lsqrx8x_zero mov 0*8($aptr),%rdx # a[0], modulo-scheduled #xor %r9,%r9 # t[1], ex-$num, zero already xor %r10,%r10 xor %r11,%r11 xor %r12,%r12 xor %r13,%r13 xor %r14,%r14 xor %r15,%r15 lea 48+8(%rsp),$tptr xor $zero,$zero # cf=0, of=0 jmp .Lsqrx8x_outer_loop .align 32 .Lsqrx8x_outer_loop: mulx 1*8($aptr),%r8,%rax # a[1]*a[0] adcx %r9,%r8 # a[1]*a[0]+=t[1] adox %rax,%r10 mulx 2*8($aptr),%r9,%rax # a[2]*a[0] adcx %r10,%r9 adox %rax,%r11 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ...
adcx %r11,%r10 adox %rax,%r12 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax adcx %r12,%r11 adox %rax,%r13 mulx 5*8($aptr),%r12,%rax adcx %r13,%r12 adox %rax,%r14 mulx 6*8($aptr),%r13,%rax adcx %r14,%r13 adox %r15,%rax mulx 7*8($aptr),%r14,%r15 mov 1*8($aptr),%rdx # a[1] adcx %rax,%r14 adox $zero,%r15 adc 8*8($tptr),%r15 mov %r8,1*8($tptr) # t[1] mov %r9,2*8($tptr) # t[2] sbb $carry,$carry # mov %cf,$carry xor $zero,$zero # cf=0, of=0 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] mulx 3*8($aptr),%r9,%rax # a[3]*a[1] adcx %r10,%r8 adox %rbx,%r9 mulx 4*8($aptr),%r10,%rbx # ... adcx %r11,%r9 adox %rax,%r10 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax adcx %r12,%r10 adox %rbx,%r11 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx adcx %r13,%r11 adox %r14,%r12 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 mov 2*8($aptr),%rdx # a[2] adcx %rax,%r12 adox %rbx,%r13 adcx %r15,%r13 adox $zero,%r14 # of=0 adcx $zero,%r14 # cf=0 mov %r8,3*8($tptr) # t[3] mov %r9,4*8($tptr) # t[4] mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] mulx 4*8($aptr),%r9,%rax # a[4]*a[2] adcx %r10,%r8 adox %rbx,%r9 mulx 5*8($aptr),%r10,%rbx # ... adcx %r11,%r9 adox %rax,%r10 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax adcx %r12,%r10 adox %r13,%r11 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 .byte 0x3e mov 3*8($aptr),%rdx # a[3] adcx %rbx,%r11 adox %rax,%r12 adcx %r14,%r12 mov %r8,5*8($tptr) # t[5] mov %r9,6*8($tptr) # t[6] mulx 4*8($aptr),%r8,%rax # a[4]*a[3] adox $zero,%r13 # of=0 adcx $zero,%r13 # cf=0 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] adcx %r10,%r8 adox %rax,%r9 mulx 6*8($aptr),%r10,%rax # ... 
adcx %r11,%r9 adox %r12,%r10 mulx 7*8($aptr),%r11,%r12 mov 4*8($aptr),%rdx # a[4] mov 5*8($aptr),%r14 # a[5] adcx %rbx,%r10 adox %rax,%r11 mov 6*8($aptr),%r15 # a[6] adcx %r13,%r11 adox $zero,%r12 # of=0 adcx $zero,%r12 # cf=0 mov %r8,7*8($tptr) # t[7] mov %r9,8*8($tptr) # t[8] mulx %r14,%r9,%rax # a[5]*a[4] mov 7*8($aptr),%r8 # a[7] adcx %r10,%r9 mulx %r15,%r10,%rbx # a[6]*a[4] adox %rax,%r10 adcx %r11,%r10 mulx %r8,%r11,%rax # a[7]*a[4] mov %r14,%rdx # a[5] adox %rbx,%r11 adcx %r12,%r11 #adox $zero,%rax # of=0 adcx $zero,%rax # cf=0 mulx %r15,%r14,%rbx # a[6]*a[5] mulx %r8,%r12,%r13 # a[7]*a[5] mov %r15,%rdx # a[6] lea 8*8($aptr),$aptr adcx %r14,%r11 adox %rbx,%r12 adcx %rax,%r12 adox $zero,%r13 .byte 0x67,0x67 mulx %r8,%r8,%r14 # a[7]*a[6] adcx %r8,%r13 adcx $zero,%r14 cmp 8+8(%rsp),$aptr je .Lsqrx8x_outer_break neg $carry # mov $carry,%cf mov \$-8,%rcx mov $zero,%r15 mov 8*8($tptr),%r8 adcx 9*8($tptr),%r9 # +=t[9] adcx 10*8($tptr),%r10 # ... adcx 11*8($tptr),%r11 adc 12*8($tptr),%r12 adc 13*8($tptr),%r13 adc 14*8($tptr),%r14 adc 15*8($tptr),%r15 lea ($aptr),$aaptr lea 2*64($tptr),$tptr sbb %rax,%rax # mov %cf,$carry mov -64($aptr),%rdx # a[0] mov %rax,16+8(%rsp) # offload $carry mov $tptr,24+8(%rsp) #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above xor %eax,%eax # cf=0, of=0 jmp .Lsqrx8x_loop .align 32 .Lsqrx8x_loop: mov %r8,%rbx mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] adcx %rax,%rbx # +=t[8] adox %r9,%r8 mulx 1*8($aaptr),%rax,%r9 # ... 
adcx %rax,%r8 adox %r10,%r9 mulx 2*8($aaptr),%rax,%r10 adcx %rax,%r9 adox %r11,%r10 mulx 3*8($aaptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 mulx 5*8($aaptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 mulx 6*8($aaptr),%rax,%r14 mov %rbx,($tptr,%rcx,8) # store t[8+i] mov \$0,%ebx adcx %rax,%r13 adox %r15,%r14 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 mov 8($aptr,%rcx,8),%rdx # a[i] adcx %rax,%r14 adox %rbx,%r15 # %rbx is 0, of=0 adcx %rbx,%r15 # cf=0 .byte 0x67 inc %rcx # of=0 jnz .Lsqrx8x_loop lea 8*8($aaptr),$aaptr mov \$-8,%rcx cmp 8+8(%rsp),$aaptr # done? je .Lsqrx8x_break sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf .byte 0x66 mov -64($aptr),%rdx adcx 0*8($tptr),%r8 adcx 1*8($tptr),%r9 adc 2*8($tptr),%r10 adc 3*8($tptr),%r11 adc 4*8($tptr),%r12 adc 5*8($tptr),%r13 adc 6*8($tptr),%r14 adc 7*8($tptr),%r15 lea 8*8($tptr),$tptr .byte 0x67 sbb %rax,%rax # mov %cf,%rax xor %ebx,%ebx # cf=0, of=0 mov %rax,16+8(%rsp) # offload carry jmp .Lsqrx8x_loop .align 32 .Lsqrx8x_break: xor $zero,$zero sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf adcx $zero,%r8 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry adcx $zero,%r9 mov 0*8($aptr),%rdx # a[8], modulo-scheduled adc \$0,%r10 mov %r8,0*8($tptr) adc \$0,%r11 adc \$0,%r12 adc \$0,%r13 adc \$0,%r14 adc \$0,%r15 cmp $carry,$tptr # cf=0, of=0 je .Lsqrx8x_outer_loop mov %r9,1*8($tptr) mov 1*8($carry),%r9 mov %r10,2*8($tptr) mov 2*8($carry),%r10 mov %r11,3*8($tptr) mov 3*8($carry),%r11 mov %r12,4*8($tptr) mov 4*8($carry),%r12 mov %r13,5*8($tptr) mov 5*8($carry),%r13 mov %r14,6*8($tptr) mov 6*8($carry),%r14 mov %r15,7*8($tptr) mov 7*8($carry),%r15 mov $carry,$tptr jmp .Lsqrx8x_outer_loop .align 32 .Lsqrx8x_outer_break: mov %r9,9*8($tptr) # t[9] movq %xmm3,%rcx # -$num mov %r10,10*8($tptr) # ... 
mov %r11,11*8($tptr) mov %r12,12*8($tptr) mov %r13,13*8($tptr) mov %r14,14*8($tptr) ___ } { my $i="%rcx"; $code.=<<___; lea 48+8(%rsp),$tptr mov ($aptr,$i),%rdx # a[0] mov 8($tptr),$A0[1] # t[1] xor $A0[0],$A0[0] # t[0], of=0, cf=0 mov 0+8(%rsp),$num # restore $num adox $A0[1],$A0[1] mov 16($tptr),$A1[0] # t[2] # prefetch mov 24($tptr),$A1[1] # t[3] # prefetch #jmp .Lsqrx4x_shift_n_add # happens to be aligned .align 32 .Lsqrx4x_shift_n_add: mulx %rdx,%rax,%rbx adox $A1[0],$A1[0] adcx $A0[0],%rax .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch adox $A1[1],$A1[1] adcx $A0[1],%rbx mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch mov %rax,0($tptr) mov %rbx,8($tptr) mulx %rdx,%rax,%rbx adox $A0[0],$A0[0] adcx $A1[0],%rax mov 16($aptr,$i),%rdx # a[i+2] # prefetch mov 48($tptr),$A1[0] # t[2*i+6] # prefetch adox $A0[1],$A0[1] adcx $A1[1],%rbx mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch mov %rax,16($tptr) mov %rbx,24($tptr) mulx %rdx,%rax,%rbx adox $A1[0],$A1[0] adcx $A0[0],%rax mov 24($aptr,$i),%rdx # a[i+3] # prefetch lea 32($i),$i mov 64($tptr),$A0[0] # t[2*i+8] # prefetch adox $A1[1],$A1[1] adcx $A0[1],%rbx mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch mov %rax,32($tptr) mov %rbx,40($tptr) mulx %rdx,%rax,%rbx adox $A0[0],$A0[0] adcx $A1[0],%rax jrcxz .Lsqrx4x_shift_n_add_break .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch adox $A0[1],$A0[1] adcx $A1[1],%rbx mov 80($tptr),$A1[0] # t[2*i+10] # prefetch mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch mov %rax,48($tptr) mov %rbx,56($tptr) lea 64($tptr),$tptr nop jmp .Lsqrx4x_shift_n_add .align 32 .Lsqrx4x_shift_n_add_break: adcx $A1[1],%rbx mov %rax,48($tptr) mov %rbx,56($tptr) lea 64($tptr),$tptr # end of t[] buffer ___ } ###################################################################### # Montgomery reduction part, "word-by-word" algorithm. 
# # This new path is inspired by multiple submissions from Intel, by # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, # Vinodh Gopal... { my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); $code.=<<___; movq %xmm2,$nptr __bn_sqrx8x_reduction: xor %eax,%eax # initial top-most carry bit mov 32+8(%rsp),%rbx # n0 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) lea -8*8($nptr,$num),%rcx # end of n[] #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer mov %rcx, 0+8(%rsp) # save end of n[] mov $tptr,8+8(%rsp) # save end of t[] lea 48+8(%rsp),$tptr # initial t[] window jmp .Lsqrx8x_reduction_loop .align 32 .Lsqrx8x_reduction_loop: mov 8*1($tptr),%r9 mov 8*2($tptr),%r10 mov 8*3($tptr),%r11 mov 8*4($tptr),%r12 mov %rdx,%r8 imulq %rbx,%rdx # n0*a[i] mov 8*5($tptr),%r13 mov 8*6($tptr),%r14 mov 8*7($tptr),%r15 mov %rax,24+8(%rsp) # store top-most carry bit lea 8*8($tptr),$tptr xor $carry,$carry # cf=0,of=0 mov \$-8,%rcx jmp .Lsqrx8x_reduce .align 32 .Lsqrx8x_reduce: mov %r8, %rbx mulx 8*0($nptr),%rax,%r8 # n[0] adcx %rbx,%rax # discarded adox %r9,%r8 mulx 8*1($nptr),%rbx,%r9 # n[1] adcx %rbx,%r8 adox %r10,%r9 mulx 8*2($nptr),%rbx,%r10 adcx %rbx,%r9 adox %r11,%r10 mulx 8*3($nptr),%rbx,%r11 adcx %rbx,%r10 adox %r12,%r11 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 mov %rdx,%rax mov %r8,%rdx adcx %rbx,%r11 adox %r13,%r12 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded mov %rax,%rdx mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 mulx 8*7($nptr),%rax,%r15 mov %rbx,%rdx adcx %rax,%r14 adox $carry,%r15 # $carry is 0 adcx $carry,%r15 # cf=0 .byte 0x67,0x67,0x67 inc %rcx # of=0 jnz .Lsqrx8x_reduce mov $carry,%rax # xor %rax,%rax cmp 0+8(%rsp),$nptr # end of n[]? 
jae .Lsqrx8x_no_tail mov 48+8(%rsp),%rdx # pull n0*a[0] add 8*0($tptr),%r8 lea 8*8($nptr),$nptr mov \$-8,%rcx adcx 8*1($tptr),%r9 adcx 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 lea 8*8($tptr),$tptr sbb %rax,%rax # top carry xor $carry,$carry # of=0, cf=0 mov %rax,16+8(%rsp) jmp .Lsqrx8x_tail .align 32 .Lsqrx8x_tail: mov %r8,%rbx mulx 8*0($nptr),%rax,%r8 adcx %rax,%rbx adox %r9,%r8 mulx 8*1($nptr),%rax,%r9 adcx %rax,%r8 adox %r10,%r9 mulx 8*2($nptr),%rax,%r10 adcx %rax,%r9 adox %r11,%r10 mulx 8*3($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 mulx 8*7($nptr),%rax,%r15 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] adcx %rax,%r14 adox $carry,%r15 mov %rbx,($tptr,%rcx,8) # save result mov %r8,%rbx adcx $carry,%r15 # cf=0 inc %rcx # of=0 jnz .Lsqrx8x_tail cmp 0+8(%rsp),$nptr # end of n[]? jae .Lsqrx8x_tail_done # break out of loop sub 16+8(%rsp),$carry # mov 16(%rsp),%cf mov 48+8(%rsp),%rdx # pull n0*a[0] lea 8*8($nptr),$nptr adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 lea 8*8($tptr),$tptr sbb %rax,%rax sub \$8,%rcx # mov \$-8,%rcx xor $carry,$carry # of=0, cf=0 mov %rax,16+8(%rsp) jmp .Lsqrx8x_tail .align 32 .Lsqrx8x_tail_done: xor %rax,%rax add 24+8(%rsp),%r8 # can this overflow? 
adc \$0,%r9 adc \$0,%r10 adc \$0,%r11 adc \$0,%r12 adc \$0,%r13 adc \$0,%r14 adc \$0,%r15 adc \$0,%rax sub 16+8(%rsp),$carry # mov 16(%rsp),%cf .Lsqrx8x_no_tail: # %cf is 0 if jumped here adc 8*0($tptr),%r8 movq %xmm3,%rcx adc 8*1($tptr),%r9 mov 8*7($nptr),$carry movq %xmm2,$nptr # restore $nptr adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 adc \$0,%rax # top-most carry mov 32+8(%rsp),%rbx # n0 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" mov %r8,8*0($tptr) # store top 512 bits lea 8*8($tptr),%r8 # borrow %r8 mov %r9,8*1($tptr) mov %r10,8*2($tptr) mov %r11,8*3($tptr) mov %r12,8*4($tptr) mov %r13,8*5($tptr) mov %r14,8*6($tptr) mov %r15,8*7($tptr) lea 8*8($tptr,%rcx),$tptr # start of current t[] window cmp 8+8(%rsp),%r8 # end of t[]? jb .Lsqrx8x_reduction_loop ret .cfi_endproc .size bn_sqrx8x_internal,.-bn_sqrx8x_internal ___ } ############################################################## # Post-condition, 4x unrolled # { my ($rptr,$nptr)=("%rdx","%rbp"); $code.=<<___; .align 32 __bn_postx4x_internal: .cfi_startproc mov 8*0($nptr),%r12 mov %rcx,%r10 # -$num mov %rcx,%r9 # -$num neg %rax sar \$3+2,%rcx #lea 48+8(%rsp,%r9),$tptr movq %xmm1,$rptr # restore $rptr movq %xmm1,$aptr # prepare for back-to-back call dec %r12 # so that after 'not' we get -n[0] mov 8*1($nptr),%r13 xor %r8,%r8 mov 8*2($nptr),%r14 mov 8*3($nptr),%r15 jmp .Lsqrx4x_sub_entry .align 16 .Lsqrx4x_sub: mov 8*0($nptr),%r12 mov 8*1($nptr),%r13 mov 8*2($nptr),%r14 mov 8*3($nptr),%r15 .Lsqrx4x_sub_entry: andn %rax,%r12,%r12 lea 8*4($nptr),$nptr andn %rax,%r13,%r13 andn %rax,%r14,%r14 andn %rax,%r15,%r15 neg %r8 # mov %r8,%cf adc 8*0($tptr),%r12 adc 8*1($tptr),%r13 adc 8*2($tptr),%r14 adc 8*3($tptr),%r15 mov %r12,8*0($rptr) lea 8*4($tptr),$tptr mov %r13,8*1($rptr) sbb %r8,%r8 # mov %cf,%r8 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx jnz .Lsqrx4x_sub neg %r9 # restore $num ret .cfi_endproc .size 
__bn_postx4x_internal,.-__bn_postx4x_internal ___ } }}} { my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order ("%rdi","%esi","%rdx","%ecx"); # Unix order my $out=$inp; my $STRIDE=2**5*8; my $N=$STRIDE/4; $code.=<<___; .globl bn_get_bits5 .type bn_get_bits5,\@abi-omnipotent .align 16 bn_get_bits5: .cfi_startproc lea 0($inp),%r10 lea 1($inp),%r11 mov $num,%ecx shr \$4,$num and \$15,%ecx lea -8(%ecx),%eax cmp \$11,%ecx cmova %r11,%r10 cmova %eax,%ecx movzw (%r10,$num,2),%eax shrl %cl,%eax and \$31,%eax ret .cfi_endproc .size bn_get_bits5,.-bn_get_bits5 .globl bn_scatter5 .type bn_scatter5,\@abi-omnipotent .align 16 bn_scatter5: .cfi_startproc cmp \$0, $num jz .Lscatter_epilogue lea ($tbl,$idx,8),$tbl .Lscatter: mov ($inp),%rax lea 8($inp),$inp mov %rax,($tbl) lea 32*8($tbl),$tbl sub \$1,$num jnz .Lscatter .Lscatter_epilogue: ret .cfi_endproc .size bn_scatter5,.-bn_scatter5 .globl bn_gather5 .type bn_gather5,\@abi-omnipotent .align 32 bn_gather5: .LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases .cfi_startproc # I can't trust assembler to use specific encoding:-( .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp lea .Linc(%rip),%rax and \$-16,%rsp # shouldn't be formally required movd $idx,%xmm5 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 lea 128($tbl),%r11 # size optimization lea 128(%rsp),%rax # size optimization pshufd \$0,%xmm5,%xmm5 # broadcast $idx movdqa %xmm1,%xmm4 movdqa %xmm1,%xmm2 ___ ######################################################################## # calculate mask by comparing 0..31 to $idx and save result to stack # for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 # compare to 1,0 ___ $code.=<<___ if ($i); movdqa %xmm3,`16*($i-1)-128`(%rax) ___ $code.=<<___; movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 # compare to 3,2 movdqa 
%xmm0,`16*($i+0)-128`(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 # compare to 5,4 movdqa %xmm1,`16*($i+1)-128`(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 # compare to 7,6 movdqa %xmm2,`16*($i+2)-128`(%rax) movdqa %xmm4,%xmm2 ___ } $code.=<<___; movdqa %xmm3,`16*($i-1)-128`(%rax) jmp .Lgather .align 32 .Lgather: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 ___ for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`(%r11),%xmm0 movdqa `16*($i+1)-128`(%r11),%xmm1 movdqa `16*($i+2)-128`(%r11),%xmm2 pand `16*($i+0)-128`(%rax),%xmm0 movdqa `16*($i+3)-128`(%r11),%xmm3 pand `16*($i+1)-128`(%rax),%xmm1 por %xmm0,%xmm4 pand `16*($i+2)-128`(%rax),%xmm2 por %xmm1,%xmm5 pand `16*($i+3)-128`(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 ___ } $code.=<<___; por %xmm5,%xmm4 lea $STRIDE(%r11),%r11 pshufd \$0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 movq %xmm0,($out) # m0=bp[0] lea 8($out),$out sub \$1,$num jnz .Lgather lea (%r10),%rsp ret .LSEH_end_bn_gather5: .cfi_endproc .size bn_gather5,.-bn_gather5 ___ } $code.=<<___; .align 64 .Linc: .long 0,0, 1,1 .long 2,2, 2,2 .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by " ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type mul_handler,\@abi-omnipotent .align 16 mul_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRipRsp mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea 
.Lmul_epilogue(%rip),%r10 cmp %r10,%rbx ja .Lbody_40 mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer jmp .Lcommon_pop_regs .Lbody_40: mov 40(%rax),%rax # pull saved stack pointer .Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size mul_handler,.-mul_handler .section .pdata .align 4 .rva .LSEH_begin_bn_mul_mont_gather5 .rva .LSEH_end_bn_mul_mont_gather5 .rva .LSEH_info_bn_mul_mont_gather5 .rva .LSEH_begin_bn_mul4x_mont_gather5 .rva .LSEH_end_bn_mul4x_mont_gather5 .rva .LSEH_info_bn_mul4x_mont_gather5 .rva .LSEH_begin_bn_power5 .rva .LSEH_end_bn_power5 .rva .LSEH_info_bn_power5 .rva .LSEH_begin_bn_from_mont8x .rva .LSEH_end_bn_from_mont8x .rva .LSEH_info_bn_from_mont8x ___ 
$code.=<<___ if ($addx); .rva .LSEH_begin_bn_mulx4x_mont_gather5 .rva .LSEH_end_bn_mulx4x_mont_gather5 .rva .LSEH_info_bn_mulx4x_mont_gather5 .rva .LSEH_begin_bn_powerx5 .rva .LSEH_end_bn_powerx5 .rva .LSEH_info_bn_powerx5 ___ $code.=<<___; .rva .LSEH_begin_bn_gather5 .rva .LSEH_end_bn_gather5 .rva .LSEH_info_bn_gather5 .section .xdata .align 8 .LSEH_info_bn_mul_mont_gather5: .byte 9,0,0,0 .rva mul_handler .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[] .align 8 .LSEH_info_bn_mul4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_power5: .byte 9,0,0,0 .rva mul_handler .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] .align 8 .LSEH_info_bn_from_mont8x: .byte 9,0,0,0 .rva mul_handler .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[] ___ $code.=<<___ if ($addx); .align 8 .LSEH_info_bn_mulx4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_powerx5: .byte 9,0,0,0 .rva mul_handler .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] ___ $code.=<<___; .align 8 .LSEH_info_bn_gather5: .byte 0x01,0x0b,0x03,0x0a .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp) .align 8 ___ } $code =~ s/\`([^\`]*)\`/eval($1)/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/chacha/asm/chacha-x86.pl =================================================================== --- head/crypto/openssl/crypto/chacha/asm/chacha-x86.pl (revision 364821) +++ head/crypto/openssl/crypto/chacha/asm/chacha-x86.pl (revision 364822) @@ -1,1155 +1,1155 @@ #! /usr/bin/env perl # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. 
You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # January 2015 # # ChaCha20 for x86. # # Performance in cycles per byte out of large buffer. # # 1xIALU/gcc 4xSSSE3 # Pentium 17.5/+80% # PIII 14.2/+60% # P4 18.6/+84% # Core2 9.56/+89% 4.83 # Westmere 9.50/+45% 3.35 # Sandy Bridge 10.5/+47% 3.20 # Haswell 8.15/+50% 2.83 # Skylake 7.53/+22% 2.75 # Silvermont 17.4/+36% 8.35 # Goldmont 13.4/+40% 4.36 # Sledgehammer 10.2/+54% # Bulldozer 13.4/+50% 4.38(*) # # (*) Bulldozer actually executes 4xXOP code path that delivers 3.55; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; $output=pop; open STDOUT,">$output"; &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $xmm=$ymm=0; for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } $ymm=1 if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/ && ($gasver=$1)>=2.19); # first version supporting AVX $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && $1>=2.03); # first version supporting AVX $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" && `ml 2>&1` =~ /Version ([0-9]+)\./ && $1>=10); # first version supporting AVX $ymm=1 if ($xmm && !$ymm && - `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ && + `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ && $2>=3.0); # first version supporting AVX $a="eax"; ($b,$b_)=("ebx","ebp"); ($c,$c_)=("ecx","esi"); 
$d ="edx";
$d_="edi";

# QUARTERROUND(ai,bi,ci,di,i)
#
# Emits the integer-only x86 instruction sequence for one ChaCha
# quarter-round over the state words with indices ai,bi,ci,di; i (0..7)
# is the position of this quarter-round inside the double round.
#
# The words being worked on live in the registers currently named by the
# globals $a,$b,$c,$d; $b_,$c_,$d_ name the alternate registers that are
# pre-loaded for the following quarter-round.  The remaining state is kept
# on the stack at 4*index(%esp).  On exit the three register pairs are
# swapped, so the next call picks up the pre-loaded values.
#
# The leading "add a,b" of each quarter-round is not emitted here; it is
# issued at the end of the previous quarter-round (or by the caller before
# the first one), hence the "elsewhere" remarks.
sub QUARTERROUND {
my ($ai,$bi,$ci,$di,$i)=@_;
my @cur=($ai,$bi,$ci,$di);

# move a state index by $step positions inside its own group of four
my $wrap=sub { my ($idx,$step)=@_; return ($idx&~3)+(($idx+$step)&3); };

my @next=map($wrap->($_, 1),@cur);	# indices used by the next quarter-round
my @prev=map($wrap->($_,-1),@cur);	# indices used by the previous one

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	# At the seams between even and odd rounds the neighbouring
	# quarter-round uses a different diagonal, so the a/b/c/d indices
	# need per-column corrections.
	if    ($i==0) { @prev=map($wrap->($prev[$_],$_-4),0..3); }
	elsif ($i==3) { @next=map($wrap->($next[$_],$_),  0..3); }
	elsif ($i==4) { @prev=map($wrap->($prev[$_],4-$_),0..3); }
	elsif ($i==7) { @next=map($wrap->($next[$_],-$_), 0..3); }

	my ($an,$bn,$cn,$dn)=@next;
	my ($ap,$bp,$cp,$dp)=@prev;

	my $c_swaps=($ai>0 && $ai<3);	# alternate c register is spilled/reloaded
	my $d_moves=($di!=$dn);		# next d word differs from the current one

	#&add	($a,$b);			# see elsewhere
	&xor	($d,$a);
	if ($c_swaps)	{ &mov	(&DWP(4*$cp,"esp"),$c_); }
	&rol	($d,16);
	if ($i!=0)	{ &mov	(&DWP(4*$bp,"esp"),$b_); }
	&add	($c,$d);
	if ($c_swaps)	{ &mov	($c_,&DWP(4*$cn,"esp")); }
	&xor	($b,$c);
	if ($d_moves)	{ &mov	($d_,&DWP(4*$dn,"esp")); }
	&rol	($b,12);
	if ($i<7)	{ &mov	($b_,&DWP(4*$bn,"esp")); }
	if ($i==7)	{ &mov	($b_,&DWP(128,"esp")); }	# loop counter
	&add	($a,$b);
	&xor	($d,$a);
	&mov	(&DWP(4*$ai,"esp"),$a);
	&rol	($d,8);
	&mov	($a,&DWP(4*$an,"esp"));
	&add	($c,$d);
	if ($d_moves)	{ &mov	(&DWP(4*$di,"esp"),$d); }
	else		{ &mov	($d_,$d); }
	&xor	($b,$c);
	if ($i<7)	{ &add	($a,$b_); }			# elsewhere
	&rol	($b,7);

	# hand the pre-loaded registers over to the next quarter-round
	($b,$b_)=($b_,$b);
	($c,$c_)=($c_,$c);
	($d,$d_)=($d_,$d);
}

for my $lbl ("ssse3_shortcut","xop_shortcut","ssse3_data","pic_point") {
	&static_label($lbl);
}

&function_begin("ChaCha20_ctr32");
	&xor	("eax","eax");
	&cmp	("eax",&wparam(2));		# len==0?
&je (&label("no_data")); if ($xmm) { &call (&label("pic_point")); &set_label("pic_point"); &blindpop("eax"); &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point")); &test (&DWP(0,"ebp"),1<<24); # test FXSR bit &jz (&label("x86")); &test (&DWP(4,"ebp"),1<<9); # test SSSE3 bit &jz (&label("x86")); &jmp (&label("ssse3_shortcut")); &set_label("x86"); } &mov ("esi",&wparam(3)); # key &mov ("edi",&wparam(4)); # counter and nonce &stack_push(33); &mov ("eax",&DWP(4*0,"esi")); # copy key &mov ("ebx",&DWP(4*1,"esi")); &mov ("ecx",&DWP(4*2,"esi")); &mov ("edx",&DWP(4*3,"esi")); &mov (&DWP(64+4*4,"esp"),"eax"); &mov (&DWP(64+4*5,"esp"),"ebx"); &mov (&DWP(64+4*6,"esp"),"ecx"); &mov (&DWP(64+4*7,"esp"),"edx"); &mov ("eax",&DWP(4*4,"esi")); &mov ("ebx",&DWP(4*5,"esi")); &mov ("ecx",&DWP(4*6,"esi")); &mov ("edx",&DWP(4*7,"esi")); &mov (&DWP(64+4*8,"esp"),"eax"); &mov (&DWP(64+4*9,"esp"),"ebx"); &mov (&DWP(64+4*10,"esp"),"ecx"); &mov (&DWP(64+4*11,"esp"),"edx"); &mov ("eax",&DWP(4*0,"edi")); # copy counter and nonce &mov ("ebx",&DWP(4*1,"edi")); &mov ("ecx",&DWP(4*2,"edi")); &mov ("edx",&DWP(4*3,"edi")); &sub ("eax",1); &mov (&DWP(64+4*12,"esp"),"eax"); &mov (&DWP(64+4*13,"esp"),"ebx"); &mov (&DWP(64+4*14,"esp"),"ecx"); &mov (&DWP(64+4*15,"esp"),"edx"); &jmp (&label("entry")); &set_label("outer_loop",16); &mov (&wparam(1),$b); # save input &mov (&wparam(0),$a); # save output &mov (&wparam(2),$c); # save len &set_label("entry"); &mov ($a,0x61707865); &mov (&DWP(4*1,"esp"),0x3320646e); &mov (&DWP(4*2,"esp"),0x79622d32); &mov (&DWP(4*3,"esp"),0x6b206574); &mov ($b, &DWP(64+4*5,"esp")); # copy key material &mov ($b_,&DWP(64+4*6,"esp")); &mov ($c, &DWP(64+4*10,"esp")); &mov ($c_,&DWP(64+4*11,"esp")); &mov ($d, &DWP(64+4*13,"esp")); &mov ($d_,&DWP(64+4*14,"esp")); &mov (&DWP(4*5,"esp"),$b); &mov (&DWP(4*6,"esp"),$b_); &mov (&DWP(4*10,"esp"),$c); &mov (&DWP(4*11,"esp"),$c_); &mov (&DWP(4*13,"esp"),$d); &mov (&DWP(4*14,"esp"),$d_); &mov ($b, &DWP(64+4*7,"esp")); &mov 
($d_,&DWP(64+4*15,"esp")); &mov ($d, &DWP(64+4*12,"esp")); &mov ($b_,&DWP(64+4*4,"esp")); &mov ($c, &DWP(64+4*8,"esp")); &mov ($c_,&DWP(64+4*9,"esp")); &add ($d,1); # counter value &mov (&DWP(4*7,"esp"),$b); &mov (&DWP(4*15,"esp"),$d_); &mov (&DWP(64+4*12,"esp"),$d); # save counter value &mov ($b,10); # loop counter &jmp (&label("loop")); &set_label("loop",16); &add ($a,$b_); # elsewhere &mov (&DWP(128,"esp"),$b); # save loop counter &mov ($b,$b_); &QUARTERROUND(0, 4, 8, 12, 0); &QUARTERROUND(1, 5, 9, 13, 1); &QUARTERROUND(2, 6,10, 14, 2); &QUARTERROUND(3, 7,11, 15, 3); &QUARTERROUND(0, 5,10, 15, 4); &QUARTERROUND(1, 6,11, 12, 5); &QUARTERROUND(2, 7, 8, 13, 6); &QUARTERROUND(3, 4, 9, 14, 7); &dec ($b); &jnz (&label("loop")); &mov ($b,&wparam(2)); # load len &add ($a,0x61707865); # accumulate key material &add ($b_,&DWP(64+4*4,"esp")); &add ($c, &DWP(64+4*8,"esp")); &add ($c_,&DWP(64+4*9,"esp")); &cmp ($b,64); &jb (&label("tail")); &mov ($b,&wparam(1)); # load input pointer &add ($d, &DWP(64+4*12,"esp")); &add ($d_,&DWP(64+4*14,"esp")); &xor ($a, &DWP(4*0,$b)); # xor with input &xor ($b_,&DWP(4*4,$b)); &mov (&DWP(4*0,"esp"),$a); &mov ($a,&wparam(0)); # load output pointer &xor ($c, &DWP(4*8,$b)); &xor ($c_,&DWP(4*9,$b)); &xor ($d, &DWP(4*12,$b)); &xor ($d_,&DWP(4*14,$b)); &mov (&DWP(4*4,$a),$b_); # write output &mov (&DWP(4*8,$a),$c); &mov (&DWP(4*9,$a),$c_); &mov (&DWP(4*12,$a),$d); &mov (&DWP(4*14,$a),$d_); &mov ($b_,&DWP(4*1,"esp")); &mov ($c, &DWP(4*2,"esp")); &mov ($c_,&DWP(4*3,"esp")); &mov ($d, &DWP(4*5,"esp")); &mov ($d_,&DWP(4*6,"esp")); &add ($b_,0x3320646e); # accumulate key material &add ($c, 0x79622d32); &add ($c_,0x6b206574); &add ($d, &DWP(64+4*5,"esp")); &add ($d_,&DWP(64+4*6,"esp")); &xor ($b_,&DWP(4*1,$b)); &xor ($c, &DWP(4*2,$b)); &xor ($c_,&DWP(4*3,$b)); &xor ($d, &DWP(4*5,$b)); &xor ($d_,&DWP(4*6,$b)); &mov (&DWP(4*1,$a),$b_); &mov (&DWP(4*2,$a),$c); &mov (&DWP(4*3,$a),$c_); &mov (&DWP(4*5,$a),$d); &mov (&DWP(4*6,$a),$d_); &mov 
($b_,&DWP(4*7,"esp")); &mov ($c, &DWP(4*10,"esp")); &mov ($c_,&DWP(4*11,"esp")); &mov ($d, &DWP(4*13,"esp")); &mov ($d_,&DWP(4*15,"esp")); &add ($b_,&DWP(64+4*7,"esp")); &add ($c, &DWP(64+4*10,"esp")); &add ($c_,&DWP(64+4*11,"esp")); &add ($d, &DWP(64+4*13,"esp")); &add ($d_,&DWP(64+4*15,"esp")); &xor ($b_,&DWP(4*7,$b)); &xor ($c, &DWP(4*10,$b)); &xor ($c_,&DWP(4*11,$b)); &xor ($d, &DWP(4*13,$b)); &xor ($d_,&DWP(4*15,$b)); &lea ($b,&DWP(4*16,$b)); &mov (&DWP(4*7,$a),$b_); &mov ($b_,&DWP(4*0,"esp")); &mov (&DWP(4*10,$a),$c); &mov ($c,&wparam(2)); # len &mov (&DWP(4*11,$a),$c_); &mov (&DWP(4*13,$a),$d); &mov (&DWP(4*15,$a),$d_); &mov (&DWP(4*0,$a),$b_); &lea ($a,&DWP(4*16,$a)); &sub ($c,64); &jnz (&label("outer_loop")); &jmp (&label("done")); &set_label("tail"); &add ($d, &DWP(64+4*12,"esp")); &add ($d_,&DWP(64+4*14,"esp")); &mov (&DWP(4*0,"esp"),$a); &mov (&DWP(4*4,"esp"),$b_); &mov (&DWP(4*8,"esp"),$c); &mov (&DWP(4*9,"esp"),$c_); &mov (&DWP(4*12,"esp"),$d); &mov (&DWP(4*14,"esp"),$d_); &mov ($b_,&DWP(4*1,"esp")); &mov ($c, &DWP(4*2,"esp")); &mov ($c_,&DWP(4*3,"esp")); &mov ($d, &DWP(4*5,"esp")); &mov ($d_,&DWP(4*6,"esp")); &add ($b_,0x3320646e); # accumulate key material &add ($c, 0x79622d32); &add ($c_,0x6b206574); &add ($d, &DWP(64+4*5,"esp")); &add ($d_,&DWP(64+4*6,"esp")); &mov (&DWP(4*1,"esp"),$b_); &mov (&DWP(4*2,"esp"),$c); &mov (&DWP(4*3,"esp"),$c_); &mov (&DWP(4*5,"esp"),$d); &mov (&DWP(4*6,"esp"),$d_); &mov ($b_,&DWP(4*7,"esp")); &mov ($c, &DWP(4*10,"esp")); &mov ($c_,&DWP(4*11,"esp")); &mov ($d, &DWP(4*13,"esp")); &mov ($d_,&DWP(4*15,"esp")); &add ($b_,&DWP(64+4*7,"esp")); &add ($c, &DWP(64+4*10,"esp")); &add ($c_,&DWP(64+4*11,"esp")); &add ($d, &DWP(64+4*13,"esp")); &add ($d_,&DWP(64+4*15,"esp")); &mov (&DWP(4*7,"esp"),$b_); &mov ($b_,&wparam(1)); # load input &mov (&DWP(4*10,"esp"),$c); &mov ($c,&wparam(0)); # load output &mov (&DWP(4*11,"esp"),$c_); &xor ($c_,$c_); &mov (&DWP(4*13,"esp"),$d); &mov (&DWP(4*15,"esp"),$d_); &xor ("eax","eax"); &xor 
("edx","edx"); &set_label("tail_loop"); &movb ("al",&BP(0,$c_,$b_)); &movb ("dl",&BP(0,"esp",$c_)); &lea ($c_,&DWP(1,$c_)); &xor ("al","dl"); &mov (&BP(-1,$c,$c_),"al"); &dec ($b); &jnz (&label("tail_loop")); &set_label("done"); &stack_pop(33); &set_label("no_data"); &function_end("ChaCha20_ctr32"); if ($xmm) { my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7)); my ($out,$inp,$len)=("edi","esi","ecx"); sub QUARTERROUND_SSSE3 { my ($ai,$bi,$ci,$di,$i)=@_; my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 if ($i==0) { my $j=4; ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); } elsif ($i==3) { my $j=0; ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); } elsif ($i==4) { my $j=4; ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); } elsif ($i==7) { my $j=0; ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); } #&paddd ($xa,$xb); # see elsewhere #&pxor ($xd,$xa); # see elsewhere &movdqa(&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3); &pshufb ($xd,&QWP(0,"eax")); # rot16 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0); &paddd ($xc,$xd); &movdqa($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3); &pxor ($xb,$xc); &movdqa($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7); &movdqa ($xa_,$xb); # borrow as temporary &pslld ($xb,12); &psrld ($xa_,20); &por ($xb,$xa_); &movdqa($xa_,&QWP(16*$an-128,"ebx")); &paddd ($xa,$xb); &movdqa($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn); &pxor ($xd,$xa); &movdqa (&QWP(16*$ai-128,"ebx"),$xa); &pshufb ($xd,&QWP(16,"eax")); # rot8 &paddd ($xc,$xd); &movdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn); &movdqa ($xd_,$xd) if ($di==$dn); &pxor ($xb,$xc); &paddd ($xa_,$xb_) if ($i<7); # elsewhere &movdqa ($xa,$xb); # borrow as temporary &pslld ($xb,7); &psrld ($xa,25); &pxor 
($xd_,$xa_) if ($i<7); # elsewhere &por ($xb,$xa); ($xa,$xa_)=($xa_,$xa); ($xb,$xb_)=($xb_,$xb); ($xc,$xc_)=($xc_,$xc); ($xd,$xd_)=($xd_,$xd); } &function_begin("ChaCha20_ssse3"); &set_label("ssse3_shortcut"); if ($ymm) { &test (&DWP(4,"ebp"),1<<11); # test XOP bit &jnz (&label("xop_shortcut")); } &mov ($out,&wparam(0)); &mov ($inp,&wparam(1)); &mov ($len,&wparam(2)); &mov ("edx",&wparam(3)); # key &mov ("ebx",&wparam(4)); # counter and nonce &mov ("ebp","esp"); &stack_push (131); &and ("esp",-64); &mov (&DWP(512,"esp"),"ebp"); &lea ("eax",&DWP(&label("ssse3_data")."-". &label("pic_point"),"eax")); &movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce if (defined($gasver) && $gasver>=2.17) { # even though we encode # pshufb manually, we # handle only register # operands, while this # segment uses memory # operand... &cmp ($len,64*4); &jb (&label("1x")); &mov (&DWP(512+4,"esp"),"edx"); # offload pointers &mov (&DWP(512+8,"esp"),"ebx"); &sub ($len,64*4); # bias len &lea ("ebp",&DWP(256+128,"esp")); # size optimization &movdqu ("xmm7",&QWP(0,"edx")); # key &pshufd ("xmm0","xmm3",0x00); &pshufd ("xmm1","xmm3",0x55); &pshufd ("xmm2","xmm3",0xaa); &pshufd ("xmm3","xmm3",0xff); &paddd ("xmm0",&QWP(16*3,"eax")); # fix counters &pshufd ("xmm4","xmm7",0x00); &pshufd ("xmm5","xmm7",0x55); &psubd ("xmm0",&QWP(16*4,"eax")); &pshufd ("xmm6","xmm7",0xaa); &pshufd ("xmm7","xmm7",0xff); &movdqa (&QWP(16*12-128,"ebp"),"xmm0"); &movdqa (&QWP(16*13-128,"ebp"),"xmm1"); &movdqa (&QWP(16*14-128,"ebp"),"xmm2"); &movdqa (&QWP(16*15-128,"ebp"),"xmm3"); &movdqu ("xmm3",&QWP(16,"edx")); # key &movdqa (&QWP(16*4-128,"ebp"),"xmm4"); &movdqa (&QWP(16*5-128,"ebp"),"xmm5"); &movdqa (&QWP(16*6-128,"ebp"),"xmm6"); &movdqa (&QWP(16*7-128,"ebp"),"xmm7"); &movdqa ("xmm7",&QWP(16*2,"eax")); # sigma &lea ("ebx",&DWP(128,"esp")); # size optimization &pshufd ("xmm0","xmm3",0x00); &pshufd ("xmm1","xmm3",0x55); &pshufd ("xmm2","xmm3",0xaa); &pshufd ("xmm3","xmm3",0xff); &pshufd ("xmm4","xmm7",0x00); &pshufd 
("xmm5","xmm7",0x55); &pshufd ("xmm6","xmm7",0xaa); &pshufd ("xmm7","xmm7",0xff); &movdqa (&QWP(16*8-128,"ebp"),"xmm0"); &movdqa (&QWP(16*9-128,"ebp"),"xmm1"); &movdqa (&QWP(16*10-128,"ebp"),"xmm2"); &movdqa (&QWP(16*11-128,"ebp"),"xmm3"); &movdqa (&QWP(16*0-128,"ebp"),"xmm4"); &movdqa (&QWP(16*1-128,"ebp"),"xmm5"); &movdqa (&QWP(16*2-128,"ebp"),"xmm6"); &movdqa (&QWP(16*3-128,"ebp"),"xmm7"); &lea ($inp,&DWP(128,$inp)); # size optimization &lea ($out,&DWP(128,$out)); # size optimization &jmp (&label("outer_loop")); &set_label("outer_loop",16); #&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material &movdqa ("xmm1",&QWP(16*1-128,"ebp")); &movdqa ("xmm2",&QWP(16*2-128,"ebp")); &movdqa ("xmm3",&QWP(16*3-128,"ebp")); #&movdqa ("xmm4",&QWP(16*4-128,"ebp")); &movdqa ("xmm5",&QWP(16*5-128,"ebp")); &movdqa ("xmm6",&QWP(16*6-128,"ebp")); &movdqa ("xmm7",&QWP(16*7-128,"ebp")); #&movdqa (&QWP(16*0-128,"ebx"),"xmm0"); &movdqa (&QWP(16*1-128,"ebx"),"xmm1"); &movdqa (&QWP(16*2-128,"ebx"),"xmm2"); &movdqa (&QWP(16*3-128,"ebx"),"xmm3"); #&movdqa (&QWP(16*4-128,"ebx"),"xmm4"); &movdqa (&QWP(16*5-128,"ebx"),"xmm5"); &movdqa (&QWP(16*6-128,"ebx"),"xmm6"); &movdqa (&QWP(16*7-128,"ebx"),"xmm7"); #&movdqa ("xmm0",&QWP(16*8-128,"ebp")); #&movdqa ("xmm1",&QWP(16*9-128,"ebp")); &movdqa ("xmm2",&QWP(16*10-128,"ebp")); &movdqa ("xmm3",&QWP(16*11-128,"ebp")); &movdqa ("xmm4",&QWP(16*12-128,"ebp")); &movdqa ("xmm5",&QWP(16*13-128,"ebp")); &movdqa ("xmm6",&QWP(16*14-128,"ebp")); &movdqa ("xmm7",&QWP(16*15-128,"ebp")); &paddd ("xmm4",&QWP(16*4,"eax")); # counter value #&movdqa (&QWP(16*8-128,"ebx"),"xmm0"); #&movdqa (&QWP(16*9-128,"ebx"),"xmm1"); &movdqa (&QWP(16*10-128,"ebx"),"xmm2"); &movdqa (&QWP(16*11-128,"ebx"),"xmm3"); &movdqa (&QWP(16*12-128,"ebx"),"xmm4"); &movdqa (&QWP(16*13-128,"ebx"),"xmm5"); &movdqa (&QWP(16*14-128,"ebx"),"xmm6"); &movdqa (&QWP(16*15-128,"ebx"),"xmm7"); &movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value &movdqa ($xa, &QWP(16*0-128,"ebp")); &movdqa ($xd, 
"xmm4"); &movdqa ($xb_,&QWP(16*4-128,"ebp")); &movdqa ($xc, &QWP(16*8-128,"ebp")); &movdqa ($xc_,&QWP(16*9-128,"ebp")); &mov ("edx",10); # loop counter &nop (); &set_label("loop",16); &paddd ($xa,$xb_); # elsewhere &movdqa ($xb,$xb_); &pxor ($xd,$xa); # elsewhere &QUARTERROUND_SSSE3(0, 4, 8, 12, 0); &QUARTERROUND_SSSE3(1, 5, 9, 13, 1); &QUARTERROUND_SSSE3(2, 6,10, 14, 2); &QUARTERROUND_SSSE3(3, 7,11, 15, 3); &QUARTERROUND_SSSE3(0, 5,10, 15, 4); &QUARTERROUND_SSSE3(1, 6,11, 12, 5); &QUARTERROUND_SSSE3(2, 7, 8, 13, 6); &QUARTERROUND_SSSE3(3, 4, 9, 14, 7); &dec ("edx"); &jnz (&label("loop")); &movdqa (&QWP(16*4-128,"ebx"),$xb_); &movdqa (&QWP(16*8-128,"ebx"),$xc); &movdqa (&QWP(16*9-128,"ebx"),$xc_); &movdqa (&QWP(16*12-128,"ebx"),$xd); &movdqa (&QWP(16*14-128,"ebx"),$xd_); my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7)); #&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there &movdqa ($xa1,&QWP(16*1-128,"ebx")); &movdqa ($xa2,&QWP(16*2-128,"ebx")); &movdqa ($xa3,&QWP(16*3-128,"ebx")); for($i=0;$i<256;$i+=64) { &paddd ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material &paddd ($xa1,&QWP($i+16*1-128,"ebp")); &paddd ($xa2,&QWP($i+16*2-128,"ebp")); &paddd ($xa3,&QWP($i+16*3-128,"ebp")); &movdqa ($xt2,$xa0); # "de-interlace" data &punpckldq ($xa0,$xa1); &movdqa ($xt3,$xa2); &punpckldq ($xa2,$xa3); &punpckhdq ($xt2,$xa1); &punpckhdq ($xt3,$xa3); &movdqa ($xa1,$xa0); &punpcklqdq ($xa0,$xa2); # "a0" &movdqa ($xa3,$xt2); &punpcklqdq ($xt2,$xt3); # "a2" &punpckhqdq ($xa1,$xa2); # "a1" &punpckhqdq ($xa3,$xt3); # "a3" #($xa2,$xt2)=($xt2,$xa2); &movdqu ($xt0,&QWP(64*0-128,$inp)); # load input &movdqu ($xt1,&QWP(64*1-128,$inp)); &movdqu ($xa2,&QWP(64*2-128,$inp)); &movdqu ($xt3,&QWP(64*3-128,$inp)); &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp)); &pxor ($xt0,$xa0); &movdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192); &pxor ($xt1,$xa1); &movdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192); &pxor ($xt2,$xa2); &movdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192); &pxor 
($xt3,$xa3); &movdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192); &movdqu (&QWP(64*0-128,$out),$xt0); # store output &movdqu (&QWP(64*1-128,$out),$xt1); &movdqu (&QWP(64*2-128,$out),$xt2); &movdqu (&QWP(64*3-128,$out),$xt3); &lea ($out,&QWP($i<192?16:(64*4-16*3),$out)); } &sub ($len,64*4); &jnc (&label("outer_loop")); &add ($len,64*4); &jz (&label("done")); &mov ("ebx",&DWP(512+8,"esp")); # restore pointers &lea ($inp,&DWP(-128,$inp)); &mov ("edx",&DWP(512+4,"esp")); &lea ($out,&DWP(-128,$out)); &movd ("xmm2",&DWP(16*12-128,"ebp")); # counter value &movdqu ("xmm3",&QWP(0,"ebx")); &paddd ("xmm2",&QWP(16*6,"eax")); # +four &pand ("xmm3",&QWP(16*7,"eax")); &por ("xmm3","xmm2"); # counter value } { my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7)); sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round &paddd ($a,$b); &pxor ($d,$a); &pshufb ($d,$rot16); &paddd ($c,$d); &pxor ($b,$c); &movdqa ($t,$b); &psrld ($b,20); &pslld ($t,12); &por ($b,$t); &paddd ($a,$b); &pxor ($d,$a); &pshufb ($d,$rot24); &paddd ($c,$d); &pxor ($b,$c); &movdqa ($t,$b); &psrld ($b,25); &pslld ($t,7); &por ($b,$t); } &set_label("1x"); &movdqa ($a,&QWP(16*2,"eax")); # sigma &movdqu ($b,&QWP(0,"edx")); &movdqu ($c,&QWP(16,"edx")); #&movdqu ($d,&QWP(0,"ebx")); # already loaded &movdqa ($rot16,&QWP(0,"eax")); &movdqa ($rot24,&QWP(16,"eax")); &mov (&DWP(16*3,"esp"),"ebp"); &movdqa (&QWP(16*0,"esp"),$a); &movdqa (&QWP(16*1,"esp"),$b); &movdqa (&QWP(16*2,"esp"),$c); &movdqa (&QWP(16*3,"esp"),$d); &mov ("edx",10); &jmp (&label("loop1x")); &set_label("outer1x",16); &movdqa ($d,&QWP(16*5,"eax")); # one &movdqa ($a,&QWP(16*0,"esp")); &movdqa ($b,&QWP(16*1,"esp")); &movdqa ($c,&QWP(16*2,"esp")); &paddd ($d,&QWP(16*3,"esp")); &mov ("edx",10); &movdqa (&QWP(16*3,"esp"),$d); &jmp (&label("loop1x")); &set_label("loop1x",16); &SSSE3ROUND(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b00111001); &pshufd ($d,$d,0b10010011); &nop (); &SSSE3ROUND(); &pshufd ($c,$c,0b01001110); &pshufd 
($b,$b,0b10010011);
	&pshufd	($d,$d,0b00111001);

	&dec	("edx");
	&jnz	(&label("loop1x"));

	# add back the state rows that were parked at 0..63(%esp)
	&paddd	($a,&QWP(16*0,"esp"));
	&paddd	($b,&QWP(16*1,"esp"));
	&paddd	($c,&QWP(16*2,"esp"));
	&paddd	($d,&QWP(16*3,"esp"));

	&cmp	($len,64);
	&jb	(&label("tail"));		# less than one full block left

	&movdqu	($t,&QWP(16*0,$inp));
	&movdqu	($t1,&QWP(16*1,$inp));
	&pxor	($a,$t);			# xor with input
	&movdqu	($t,&QWP(16*2,$inp));
	&pxor	($b,$t1);
	&movdqu	($t1,&QWP(16*3,$inp));
	&pxor	($c,$t);
	&pxor	($d,$t1);
	&lea	($inp,&DWP(16*4,$inp));		# inp+=64

	&movdqu	(&QWP(16*0,$out),$a);		# write output
	&movdqu	(&QWP(16*1,$out),$b);
	&movdqu	(&QWP(16*2,$out),$c);
	&movdqu	(&QWP(16*3,$out),$d);
	&lea	($out,&DWP(16*4,$out));		# out+=64

	&sub	($len,64);
	&jnz	(&label("outer1x"));

	&jmp	(&label("done"));

&set_label("tail");
	# spill the four result rows to the stack, then xor the remaining
	# $len bytes one at a time
	&movdqa	(&QWP(16*0,"esp"),$a);
	&movdqa	(&QWP(16*1,"esp"),$b);
	&movdqa	(&QWP(16*2,"esp"),$c);
	&movdqa	(&QWP(16*3,"esp"),$d);

	&xor	("eax","eax");
	&xor	("edx","edx");
	&xor	("ebp","ebp");			# byte index

&set_label("tail_loop");
	&movb	("al",&BP(0,"esp","ebp"));
	&movb	("dl",&BP(0,$inp,"ebp"));
	&lea	("ebp",&DWP(1,"ebp"));
	&xor	("al","dl");
	&movb	(&BP(-1,$out,"ebp"),"al");	# -1: index was already bumped
	&dec	($len);
	&jnz	(&label("tail_loop"));
}
&set_label("done");
	&mov	("esp",&DWP(512,"esp"));	# stack pointer saved by the prologue
&function_end("ChaCha20_ssse3");

&align	(64);
&set_label("ssse3_data");
# Constant table, addressed as 16*N off "eax" by the SIMD code paths.
&data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd);	# 16*0: loaded as $rot16
&data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe);	# 16*1: loaded as $rot24
&data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574);	# 16*2: sigma
&data_word(0,1,2,3);		# 16*3: per-lane counter offsets ("fix counters")
&data_word(4,4,4,4);		# 16*4: counter step of the 4x outer loop
&data_word(1,0,0,0);		# 16*5: "one", counter step of the 1x loop
&data_word(4,0,0,0);		# 16*6: "+four"
&data_word(0,-1,-1,-1);		# 16*7: mask and-ed with the counter/nonce row
&align	(64);
}
&asciz	("ChaCha20 for x86, CRYPTOGAMS by ");

if ($ymm) {
my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7));
my ($out,$inp,$len)=("edi","esi","ecx");

# Emit one XOP quarter-round of the 4x code path.
#
# ($ai,$bi,$ci,$di) are the state indices this quarter-round works on and
# $i (0..7) is its position inside the double round.  Instructions of the
# current quarter-round are interleaved with the stores and loads that
# hand rows over to the following one, and the register names are swapped
# at the end, so the eight calls must be made in order.
sub QUARTERROUND_XOP {
my ($ai,$bi,$ci,$di,$i)=@_;

	# move an index by $step inside its aligned group of four
my $wrap = sub { my ($idx,$step)=@_; return ($idx&~3)+(($idx+$step)&3); };

my @next = map { $wrap->($_, 1) } ($ai,$bi,$ci,$di);	# next
my @prev = map { $wrap->($_,-1) } ($ai,$bi,$ci,$di);	# previous

	#       a   b   c   d
	#
	#       0   4   8  12 < even round
	#       1   5   9  13
	#       2   6  10  14
	#       3   7  11  15
	#       0   5  10  15 < odd round
	#       1   6  11  12
	#       2   7   8  13
	#       3   4   9  14

	# At the first and last quarter-round of each half the neighbour
	# indices get an extra per-element adjustment: which list is
	# touched, and by how much for the a, b, c and d entries.
my %extra = (
	0 => [\@prev, -4,-3,-2,-1],
	3 => [\@next,  0, 1, 2, 3],
	4 => [\@prev,  4, 3, 2, 1],
	7 => [\@next,  0,-1,-2,-3],
);
	if (exists $extra{$i}) {
	    my ($set,@step) = @{$extra{$i}};
	    $set->[$_] = $wrap->($set->[$_],$step[$_]) foreach (0..3);
	}

my ($an,$bn,$cn,$dn) = @next;
my ($bp,$cp) = @prev[1,2];		# only b and c of "previous" are used

my $middle  = ($ai>0 && $ai<3);	# 'c' pair has to be exchanged through memory
my $first   = ($i==0);
my $notlast = ($i<7);
my $d_moves = ($di!=$dn);

	#&vpaddd	($xa,$xa,$xb);			# see elsewhere
	#&vpxor		($xd,$xd,$xa);			# see elsewhere
	if ($middle) {
	    &vmovdqa	(&QWP(16*$cp-128,"ebx"),$xc_);
	}
	&vprotd		($xd,$xd,16);
	if (!$first) {
	    &vmovdqa	(&QWP(16*$bp-128,"ebx"),$xb_);
	}
	&vpaddd		($xc,$xc,$xd);
	if ($middle) {
	    &vmovdqa	($xc_,&QWP(16*$cn-128,"ebx"));
	}
	&vpxor		($xb,($first ? $xb_ : $xb),$xc);
	&vmovdqa	($xa_,&QWP(16*$an-128,"ebx"));
	&vprotd		($xb,$xb,12);
	if ($notlast) {
	    &vmovdqa	($xb_,&QWP(16*$bn-128,"ebx"));
	}
	&vpaddd		($xa,$xa,$xb);
	if ($d_moves) {
	    &vmovdqa	($xd_,&QWP(16*$dn-128,"ebx"));
	}
	&vpxor		($xd,$xd,$xa);
	if ($notlast) {
	    &vpaddd	($xa_,$xa_,$xb_);		# elsewhere
	}
	&vprotd		($xd,$xd,8);
	&vmovdqa	(&QWP(16*$ai-128,"ebx"),$xa);
	&vpaddd		($xc,$xc,$xd);
	if ($d_moves) {
	    &vmovdqa	(&QWP(16*$di-128,"ebx"),$xd);
	}
	&vpxor		($xb,$xb,$xc);
	if ($notlast) {
	    &vpxor	($xd_,($d_moves ? $xd_ : $xd),$xa_);	# elsewhere
	}
	&vprotd		($xb,$xb,7);

	# the "next" registers become the current ones for the following call
	($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=
	($xa_,$xa,$xb_,$xb,$xc_,$xc,$xd_,$xd);
}

&function_begin("ChaCha20_xop");
&set_label("xop_shortcut");
	&mov		($out,&wparam(0));
	&mov		($inp,&wparam(1));
	&mov		($len,&wparam(2));
	&mov		("edx",&wparam(3));		# key
	&mov		("ebx",&wparam(4));		# counter and nonce
	&vzeroupper	();

	# carve out a 64-byte aligned frame; the caller's stack pointer is
	# kept at 512(%esp) and restored from there on exit
	&mov		("ebp","esp");
	&stack_push	(131);
	&and		("esp",-64);
	&mov		(&DWP(512,"esp"),"ebp");

	&lea		("eax",&DWP(&label("ssse3_data")."-".
&label("pic_point"),"eax"));
	&vmovdqu	("xmm3",&QWP(0,"ebx"));		# counter and nonce

	&cmp		($len,64*4);
	&jb		(&label("1x"));			# under 256 bytes: one block at a time

	&mov		(&DWP(512+4,"esp"),"edx");	# offload pointers
	&mov		(&DWP(512+8,"esp"),"ebx");
	&sub		($len,64*4);			# bias len
	&lea		("ebp",&DWP(256+128,"esp"));	# size optimization

	# Spread every 32-bit word of counter/nonce and key over a whole
	# register and park the result in the 16 rows addressed off "ebp".
	&vmovdqu	("xmm7",&QWP(0,"edx"));		# key
	&vpshufd	("xmm0","xmm3",0x00);
	&vpshufd	("xmm1","xmm3",0x55);
	&vpshufd	("xmm2","xmm3",0xaa);
	&vpshufd	("xmm3","xmm3",0xff);
	&vpaddd		("xmm0","xmm0",&QWP(16*3,"eax"));	# fix counters
	&vpshufd	("xmm4","xmm7",0x00);
	&vpshufd	("xmm5","xmm7",0x55);
	&vpsubd		("xmm0","xmm0",&QWP(16*4,"eax"));
	&vpshufd	("xmm6","xmm7",0xaa);
	&vpshufd	("xmm7","xmm7",0xff);
	foreach my $n (0..3) {			# rows 12..15 <- xmm0..xmm3
	    &vmovdqa	(&QWP(16*(12+$n)-128,"ebp"),"xmm$n");
	}
	&vmovdqu	("xmm3",&QWP(16,"edx"));	# key
	foreach my $n (4..7) {			# rows 4..7 <- xmm4..xmm7
	    &vmovdqa	(&QWP(16*$n-128,"ebp"),"xmm$n");
	}
	&vmovdqa	("xmm7",&QWP(16*2,"eax"));	# sigma
	&lea		("ebx",&DWP(128,"esp"));	# size optimization

	&vpshufd	("xmm0","xmm3",0x00);
	&vpshufd	("xmm1","xmm3",0x55);
	&vpshufd	("xmm2","xmm3",0xaa);
	&vpshufd	("xmm3","xmm3",0xff);
	&vpshufd	("xmm4","xmm7",0x00);
	&vpshufd	("xmm5","xmm7",0x55);
	&vpshufd	("xmm6","xmm7",0xaa);
	&vpshufd	("xmm7","xmm7",0xff);
	foreach my $n (0..3) {			# rows 8..11 <- xmm0..xmm3
	    &vmovdqa	(&QWP(16*(8+$n)-128,"ebp"),"xmm$n");
	}
	foreach my $n (4..7) {			# rows 0..3 <- xmm4..xmm7
	    &vmovdqa	(&QWP(16*($n-4)-128,"ebp"),"xmm$n");
	}

	&lea		($inp,&DWP(128,$inp));		# size optimization
	&lea		($out,&DWP(128,$out));		# size optimization
	&jmp		(&label("outer_loop"));

&set_label("outer_loop",32);
	# Copy key material from the "ebp" rows to the working rows at "ebx".
	# Rows 0, 4, 8 and 9 are not copied: they are loaded straight into
	# $xa, $xb_, $xc and $xc_ further down.
	foreach my $row (1..3,5..7) {
	    &vmovdqa	("xmm$row",&QWP(16*$row-128,"ebp"));
	}
	foreach my $row (1..3,5..7) {
	    &vmovdqa	(&QWP(16*$row-128,"ebx"),"xmm$row");
	}
	foreach my $row (10..15) {		# rows 10..15 -> xmm2..xmm7
	    &vmovdqa	("xmm".($row-8),&QWP(16*$row-128,"ebp"));
	}
	&vpaddd		("xmm4","xmm4",&QWP(16*4,"eax"));	# counter value
	foreach my $row (10..15) {		# xmm2..xmm7 -> rows 10..15
	    &vmovdqa	(&QWP(16*$row-128,"ebx"),"xmm".($row-8));
	}
	&vmovdqa	(&QWP(16*12-128,"ebp"),"xmm4");	# save counter value

	&vmovdqa	($xa, &QWP(16*0-128,"ebp"));
	&vmovdqa	($xd, "xmm4");
	&vmovdqa	($xb_,&QWP(16*4-128,"ebp"));
	&vmovdqa	($xc, &QWP(16*8-128,"ebp"));
	&vmovdqa	($xc_,&QWP(16*9-128,"ebp"));

	&mov		("edx",10);			# loop counter
	&nop		();

&set_label("loop",32);
	&vpaddd		($xa,$xa,$xb_);			# elsewhere
	&vpxor		($xd,$xd,$xa);			# elsewhere
	# eight quarter-rounds per pass: four "even" then four "odd"
	&QUARTERROUND_XOP(0, 4, 8, 12, 0);
	&QUARTERROUND_XOP(1, 5, 9, 13, 1);
	&QUARTERROUND_XOP(2, 6,10, 14, 2);
	&QUARTERROUND_XOP(3, 7,11, 15, 3);
	&QUARTERROUND_XOP(0, 5,10, 15, 4);
	&QUARTERROUND_XOP(1, 6,11, 12, 5);
	&QUARTERROUND_XOP(2, 7, 8, 13, 6);
	&QUARTERROUND_XOP(3, 4, 9, 14, 7);
	&dec		("edx");
	&jnz		(&label("loop"));

	# flush the rows that are still held in registers
	&vmovdqa	(&QWP(16*4-128,"ebx"),$xb_);
	&vmovdqa	(&QWP(16*8-128,"ebx"),$xc);
	&vmovdqa	(&QWP(16*9-128,"ebx"),$xc_);
	&vmovdqa	(&QWP(16*12-128,"ebx"),$xd);
	&vmovdqa	(&QWP(16*14-128,"ebx"),$xd_);

    my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7));

	#&vmovdqa	($xa0,&QWP(16*0-128,"ebx"));	# it's there
	&vmovdqa	($xa1,&QWP(16*1-128,"ebx"));
	&vmovdqa	($xa2,&QWP(16*2-128,"ebx"));
	&vmovdqa	($xa3,&QWP(16*3-128,"ebx"));

    # One pass per group of four rows ($i is the byte offset of the group).
    for($i=0;$i<256;$i+=64) {
	&vpaddd		($xa0,$xa0,&QWP($i+16*0-128,"ebp"));	# accumulate key material
	&vpaddd		($xa1,$xa1,&QWP($i+16*1-128,"ebp"));
	&vpaddd		($xa2,$xa2,&QWP($i+16*2-128,"ebp"));
	&vpaddd		($xa3,$xa3,&QWP($i+16*3-128,"ebp"));

	&vpunpckldq	($xt2,$xa0,$xa1);	# "de-interlace" data
	&vpunpckldq	($xt3,$xa2,$xa3);
	&vpunpckhdq	($xa0,$xa0,$xa1);
	&vpunpckhdq	($xa2,$xa2,$xa3);
	&vpunpcklqdq	($xa1,$xt2,$xt3);	# "a0"
	&vpunpckhqdq	($xt2,$xt2,$xt3);	# "a1"
	&vpunpcklqdq	($xt3,$xa0,$xa2);	# "a2"
	&vpunpckhqdq	($xa3,$xa0,$xa2);	# "a3"

	# xor with input, 64 bytes apart
	&vpxor		($xt0,$xa1,&QWP(64*0-128,$inp));
	&vpxor		($xt1,$xt2,&QWP(64*1-128,$inp));
	&vpxor		($xt2,$xt3,&QWP(64*2-128,$inp));
	&vpxor		($xt3,$xa3,&QWP(64*3-128,$inp));
	&lea		($inp,&QWP($i<192?16:(64*4-16*3),$inp));
	if ($i<192) {				# fetch the next group of rows
	    &vmovdqa	($xa0,&QWP($i+16*4-128,"ebx"));
	    &vmovdqa	($xa1,&QWP($i+16*5-128,"ebx"));
	    &vmovdqa	($xa2,&QWP($i+16*6-128,"ebx"));
	    &vmovdqa	($xa3,&QWP($i+16*7-128,"ebx"));
	}
	&vmovdqu	(&QWP(64*0-128,$out),$xt0);	# store output
	&vmovdqu	(&QWP(64*1-128,$out),$xt1);
	&vmovdqu	(&QWP(64*2-128,$out),$xt2);
	&vmovdqu	(&QWP(64*3-128,$out),$xt3);
	&lea		($out,&QWP($i<192?16:(64*4-16*3),$out));
    }
	&sub		($len,64*4);
	&jnc		(&label("outer_loop"));

	&add		($len,64*4);			# undo the bias applied on entry
	&jz		(&label("done"));

	# hand the remainder over to the 1x code
	&mov		("ebx",&DWP(512+8,"esp"));	# restore pointers
	&lea		($inp,&DWP(-128,$inp));
	&mov		("edx",&DWP(512+4,"esp"));
	&lea		($out,&DWP(-128,$out));

	&vmovd		("xmm2",&DWP(16*12-128,"ebp"));	# counter value
	&vmovdqu	("xmm3",&QWP(0,"ebx"));
	&vpaddd		("xmm2","xmm2",&QWP(16*6,"eax"));# +four
	&vpand		("xmm3","xmm3",&QWP(16*7,"eax"));
	&vpor		("xmm3","xmm3","xmm2");		# counter value
{
my
($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7));

# Emit one round of the 1x code path on the rows held in $a..$d.
# The second half repeats the first with rotate counts (8,7) in place
# of (16,12).
sub XOPROUND {
	foreach my $rot ([16,12],[8,7]) {
	    &vpaddd	($a,$a,$b);
	    &vpxor	($d,$d,$a);
	    &vprotd	($d,$d,$rot->[0]);

	    &vpaddd	($c,$c,$d);
	    &vpxor	($b,$b,$c);
	    &vprotd	($b,$b,$rot->[1]);
	}
}

&set_label("1x");
	&vmovdqa	($a,&QWP(16*2,"eax"));		# sigma
	&vmovdqu	($b,&QWP(0,"edx"));
	&vmovdqu	($c,&QWP(16,"edx"));
	#&vmovdqu	($d,&QWP(0,"ebx"));		# already loaded
	&vmovdqa	($rot16,&QWP(0,"eax"));
	&vmovdqa	($rot24,&QWP(16,"eax"));
	&mov		(&DWP(16*3,"esp"),"ebp");

	# keep a copy of the input state at 0..63(%esp)
	&vmovdqa	(&QWP(16*0,"esp"),$a);
	&vmovdqa	(&QWP(16*1,"esp"),$b);
	&vmovdqa	(&QWP(16*2,"esp"),$c);
	&vmovdqa	(&QWP(16*3,"esp"),$d);
	&mov		("edx",10);
	&jmp		(&label("loop1x"));

&set_label("outer1x",16);
	# reload the saved state and step its counter row by one
	&vmovdqa	($d,&QWP(16*5,"eax"));		# one
	&vmovdqa	($a,&QWP(16*0,"esp"));
	&vmovdqa	($b,&QWP(16*1,"esp"));
	&vmovdqa	($c,&QWP(16*2,"esp"));
	&vpaddd		($d,$d,&QWP(16*3,"esp"));
	&mov		("edx",10);
	&vmovdqa	(&QWP(16*3,"esp"),$d);
	&jmp		(&label("loop1x"));

&set_label("loop1x",16);
	&XOPROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&XOPROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&dec		("edx");
	&jnz		(&label("loop1x"));

	# add the saved input state back in
	&vpaddd		($a,$a,&QWP(16*0,"esp"));
	&vpaddd		($b,$b,&QWP(16*1,"esp"));
	&vpaddd		($c,$c,&QWP(16*2,"esp"));
	&vpaddd		($d,$d,&QWP(16*3,"esp"));

	&cmp		($len,64);
	&jb		(&label("tail"));		# less than one full block left

	&vpxor		($a,$a,&QWP(16*0,$inp));	# xor with input
	&vpxor		($b,$b,&QWP(16*1,$inp));
	&vpxor		($c,$c,&QWP(16*2,$inp));
	&vpxor		($d,$d,&QWP(16*3,$inp));
	&lea		($inp,&DWP(16*4,$inp));		# inp+=64

	&vmovdqu	(&QWP(16*0,$out),$a);		# write output
	&vmovdqu	(&QWP(16*1,$out),$b);
	&vmovdqu	(&QWP(16*2,$out),$c);
	&vmovdqu	(&QWP(16*3,$out),$d);
	&lea		($out,&DWP(16*4,$out));		# out+=64

	&sub		($len,64);
	&jnz		(&label("outer1x"));

	&jmp		(&label("done"));

&set_label("tail");
	# spill the result rows so the remaining bytes can be xor-ed one by one
	&vmovdqa	(&QWP(16*0,"esp"),$a);
	&vmovdqa	(&QWP(16*1,"esp"),$b);
	&vmovdqa	(&QWP(16*2,"esp"),$c);
	&vmovdqa	
(&QWP(16*3,"esp"),$d); &xor ("eax","eax"); &xor ("edx","edx"); &xor ("ebp","ebp"); &set_label("tail_loop"); &movb ("al",&BP(0,"esp","ebp")); &movb ("dl",&BP(0,$inp,"ebp")); &lea ("ebp",&DWP(1,"ebp")); &xor ("al","dl"); &movb (&BP(-1,$out,"ebp"),"al"); &dec ($len); &jnz (&label("tail_loop")); } &set_label("done"); &vzeroupper (); &mov ("esp",&DWP(512,"esp")); &function_end("ChaCha20_xop"); } &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/chacha/asm/chacha-x86_64.pl =================================================================== --- head/crypto/openssl/crypto/chacha/asm/chacha-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/chacha/asm/chacha-x86_64.pl (revision 364822) @@ -1,4005 +1,4005 @@ #! /usr/bin/env perl # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # November 2014 # # ChaCha20 for x86_64. # # December 2016 # # Add AVX512F code path. # # December 2017 # # Add AVX512VL code path. # # Performance in cycles per byte out of large buffer. # # IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v) # # P4 9.48/+99% - - # Core2 7.83/+55% 7.90/5.76 4.35 # Westmere 7.19/+50% 5.60/4.50 3.00 # Sandy Bridge 8.31/+42% 5.45/4.00 2.72 # Ivy Bridge 6.71/+46% 5.40/? 
2.41 # Haswell 5.92/+43% 5.20/3.45 2.42 1.23 # Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)] # Silvermont 12.0/+33% 7.75/6.90 7.03(iii) # Knights L 11.7/- ? 9.60(iii) 0.80 # Goldmont 10.6/+17% 5.10/3.52 3.28 # Sledgehammer 7.28/+52% - - # Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv) # Ryzen 5.96/+50% 5.19/3.00 2.40 2.09 # VIA Nano 10.5/+46% 6.72/6.88 6.05 # # (i) compared to older gcc 3.x one can observe >2x improvement on # most platforms; # (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used # by chacha20_poly1305_tls_cipher, results are EVP-free; # (iii) this is not optimal result for Atom because of MSROM # limitations, SSE2 can do better, but gain is considered too # low to justify the [maintenance] effort; # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20 # and 4.85 for 128-byte inputs; # (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable; # (vi) even though Skylake-X can execute AVX512F code and deliver 0.57 # cpb in single thread, the corresponding capability is suppressed; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); $avx += 1 if ($1==2.11 && $2>=8); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) 
version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # input parameter block ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8"); $code.=<<___; .text .extern OPENSSL_ia32cap_P .align 64 .Lzero: .long 0,0,0,0 .Lone: .long 1,0,0,0 .Linc: .long 0,1,2,3 .Lfour: .long 4,4,4,4 .Lincy: .long 0,2,4,6,1,3,5,7 .Leight: .long 8,8,8,8,8,8,8,8 .Lrot16: .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd .Lrot24: .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe .Ltwoy: .long 2,0,0,0, 2,0,0,0 .align 64 .Lzeroz: .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 .Lfourz: .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 .Lincz: .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .Lsixteen: .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .Lsigma: .asciz "expand 32-byte k" .asciz "ChaCha20 for x86_64, CRYPTOGAMS by " ___ sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop; $arg = "\$$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)), "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15))); @t=("%esi","%edi"); sub ROUND { # critical path is 24 cycles per round my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my ($xc,$xc_)=map("\"$_\"",@t); my @x=map("\"$_\"",@x); # Consider order in which variables are addressed by their # index: # # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 # # 'a', 'b' and 'd's are permanently allocated in registers, # @x[0..7,12..15], while 
'c's are maintained in memory. If # you observe 'c' column, you'll notice that pair of 'c's is # invariant between rounds. This means that we have to reload # them once per round, in the middle. This is why you'll see # bunch of 'c' stores and loads in the middle, but none in # the beginning or end. # Normally instructions would be interleaved to favour in-order # execution. Generally out-of-order cores manage it gracefully, # but not this time for some reason. As in-order execution # cores are dying breed, old Atom is the only one around, # instructions are left uninterleaved. Besides, Atom is better # off executing 1xSSSE3 code anyway... ( "&add (@x[$a0],@x[$b0])", # Q1 "&xor (@x[$d0],@x[$a0])", "&rol (@x[$d0],16)", "&add (@x[$a1],@x[$b1])", # Q2 "&xor (@x[$d1],@x[$a1])", "&rol (@x[$d1],16)", "&add ($xc,@x[$d0])", "&xor (@x[$b0],$xc)", "&rol (@x[$b0],12)", "&add ($xc_,@x[$d1])", "&xor (@x[$b1],$xc_)", "&rol (@x[$b1],12)", "&add (@x[$a0],@x[$b0])", "&xor (@x[$d0],@x[$a0])", "&rol (@x[$d0],8)", "&add (@x[$a1],@x[$b1])", "&xor (@x[$d1],@x[$a1])", "&rol (@x[$d1],8)", "&add ($xc,@x[$d0])", "&xor (@x[$b0],$xc)", "&rol (@x[$b0],7)", "&add ($xc_,@x[$d1])", "&xor (@x[$b1],$xc_)", "&rol (@x[$b1],7)", "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's "&mov (\"4*$c1(%rsp)\",$xc_)", "&mov ($xc,\"4*$c2(%rsp)\")", "&mov ($xc_,\"4*$c3(%rsp)\")", "&add (@x[$a2],@x[$b2])", # Q3 "&xor (@x[$d2],@x[$a2])", "&rol (@x[$d2],16)", "&add (@x[$a3],@x[$b3])", # Q4 "&xor (@x[$d3],@x[$a3])", "&rol (@x[$d3],16)", "&add ($xc,@x[$d2])", "&xor (@x[$b2],$xc)", "&rol (@x[$b2],12)", "&add ($xc_,@x[$d3])", "&xor (@x[$b3],$xc_)", "&rol (@x[$b3],12)", "&add (@x[$a2],@x[$b2])", "&xor (@x[$d2],@x[$a2])", "&rol (@x[$d2],8)", "&add (@x[$a3],@x[$b3])", "&xor (@x[$d3],@x[$a3])", "&rol (@x[$d3],8)", "&add ($xc,@x[$d2])", "&xor (@x[$b2],$xc)", "&rol (@x[$b2],7)", "&add ($xc_,@x[$d3])", "&xor (@x[$b3],$xc_)", "&rol (@x[$b3],7)" ); } ######################################################################## # 
Generic code path that handles all lengths on pre-SSSE3 processors. $code.=<<___; .globl ChaCha20_ctr32 .type ChaCha20_ctr32,\@function,5 .align 64 ChaCha20_ctr32: .cfi_startproc cmp \$0,$len je .Lno_data mov OPENSSL_ia32cap_P+4(%rip),%r10 ___ $code.=<<___ if ($avx>2); bt \$48,%r10 # check for AVX512F jc .LChaCha20_avx512 test %r10,%r10 # check for AVX512VL js .LChaCha20_avx512vl ___ $code.=<<___; test \$`1<<(41-32)`,%r10d jnz .LChaCha20_ssse3 push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$64+24,%rsp .cfi_adjust_cfa_offset 64+24 .Lctr32_body: #movdqa .Lsigma(%rip),%xmm0 movdqu ($key),%xmm1 movdqu 16($key),%xmm2 movdqu ($counter),%xmm3 movdqa .Lone(%rip),%xmm4 #movdqa %xmm0,4*0(%rsp) # key[0] movdqa %xmm1,4*4(%rsp) # key[1] movdqa %xmm2,4*8(%rsp) # key[2] movdqa %xmm3,4*12(%rsp) # key[3] mov $len,%rbp # reassign $len jmp .Loop_outer .align 32 .Loop_outer: mov \$0x61707865,@x[0] # 'expa' mov \$0x3320646e,@x[1] # 'nd 3' mov \$0x79622d32,@x[2] # '2-by' mov \$0x6b206574,@x[3] # 'te k' mov 4*4(%rsp),@x[4] mov 4*5(%rsp),@x[5] mov 4*6(%rsp),@x[6] mov 4*7(%rsp),@x[7] movd %xmm3,@x[12] mov 4*13(%rsp),@x[13] mov 4*14(%rsp),@x[14] mov 4*15(%rsp),@x[15] mov %rbp,64+0(%rsp) # save len mov \$10,%ebp mov $inp,64+8(%rsp) # save inp movq %xmm2,%rsi # "@x[8]" mov $out,64+16(%rsp) # save out mov %rsi,%rdi shr \$32,%rdi # "@x[9]" jmp .Loop .align 32 .Loop: ___ foreach (&ROUND (0, 4, 8,12)) { eval; } foreach (&ROUND (0, 5,10,15)) { eval; } &dec ("%ebp"); &jnz (".Loop"); $code.=<<___; mov @t[1],4*9(%rsp) # modulo-scheduled mov @t[0],4*8(%rsp) mov 64(%rsp),%rbp # load len movdqa %xmm2,%xmm1 mov 64+8(%rsp),$inp # load inp paddd %xmm4,%xmm3 # increment counter mov 64+16(%rsp),$out # load out add \$0x61707865,@x[0] # 'expa' add \$0x3320646e,@x[1] # 'nd 3' add \$0x79622d32,@x[2] # '2-by' add \$0x6b206574,@x[3] # 'te k' add 4*4(%rsp),@x[4] add 4*5(%rsp),@x[5] add 4*6(%rsp),@x[6] add 
4*7(%rsp),@x[7] add 4*12(%rsp),@x[12] add 4*13(%rsp),@x[13] add 4*14(%rsp),@x[14] add 4*15(%rsp),@x[15] paddd 4*8(%rsp),%xmm1 cmp \$64,%rbp jb .Ltail xor 4*0($inp),@x[0] # xor with input xor 4*1($inp),@x[1] xor 4*2($inp),@x[2] xor 4*3($inp),@x[3] xor 4*4($inp),@x[4] xor 4*5($inp),@x[5] xor 4*6($inp),@x[6] xor 4*7($inp),@x[7] movdqu 4*8($inp),%xmm0 xor 4*12($inp),@x[12] xor 4*13($inp),@x[13] xor 4*14($inp),@x[14] xor 4*15($inp),@x[15] lea 4*16($inp),$inp # inp+=64 pxor %xmm1,%xmm0 movdqa %xmm2,4*8(%rsp) movd %xmm3,4*12(%rsp) mov @x[0],4*0($out) # write output mov @x[1],4*1($out) mov @x[2],4*2($out) mov @x[3],4*3($out) mov @x[4],4*4($out) mov @x[5],4*5($out) mov @x[6],4*6($out) mov @x[7],4*7($out) movdqu %xmm0,4*8($out) mov @x[12],4*12($out) mov @x[13],4*13($out) mov @x[14],4*14($out) mov @x[15],4*15($out) lea 4*16($out),$out # out+=64 sub \$64,%rbp jnz .Loop_outer jmp .Ldone .align 16 .Ltail: mov @x[0],4*0(%rsp) mov @x[1],4*1(%rsp) xor %rbx,%rbx mov @x[2],4*2(%rsp) mov @x[3],4*3(%rsp) mov @x[4],4*4(%rsp) mov @x[5],4*5(%rsp) mov @x[6],4*6(%rsp) mov @x[7],4*7(%rsp) movdqa %xmm1,4*8(%rsp) mov @x[12],4*12(%rsp) mov @x[13],4*13(%rsp) mov @x[14],4*14(%rsp) mov @x[15],4*15(%rsp) .Loop_tail: movzb ($inp,%rbx),%eax movzb (%rsp,%rbx),%edx lea 1(%rbx),%rbx xor %edx,%eax mov %al,-1($out,%rbx) dec %rbp jnz .Loop_tail .Ldone: lea 64+24+48(%rsp),%rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lno_data: ret .cfi_endproc .size ChaCha20_ctr32,.-ChaCha20_ctr32 ___ ######################################################################## # SSSE3 code path that handles shorter lengths { my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7)); sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round &paddd ($a,$b); &pxor ($d,$a); &pshufb 
($d,$rot16); &paddd ($c,$d); &pxor ($b,$c); &movdqa ($t,$b); &psrld ($b,20); &pslld ($t,12); &por ($b,$t); &paddd ($a,$b); &pxor ($d,$a); &pshufb ($d,$rot24); &paddd ($c,$d); &pxor ($b,$c); &movdqa ($t,$b); &psrld ($b,25); &pslld ($t,7); &por ($b,$t); } my $xframe = $win64 ? 32+8 : 8; $code.=<<___; .type ChaCha20_ssse3,\@function,5 .align 32 ChaCha20_ssse3: .cfi_startproc .LChaCha20_ssse3: mov %rsp,%r9 # frame pointer .cfi_def_cfa_register %r9 ___ $code.=<<___ if ($avx); test \$`1<<(43-32)`,%r10d jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4 ___ $code.=<<___; cmp \$128,$len # we might throw away some data, je .LChaCha20_128 ja .LChaCha20_4x # but overall it won't be slower .Ldo_sse3_after_all: sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); movaps %xmm6,-0x28(%r9) movaps %xmm7,-0x18(%r9) .Lssse3_body: ___ $code.=<<___; movdqa .Lsigma(%rip),$a movdqu ($key),$b movdqu 16($key),$c movdqu ($counter),$d movdqa .Lrot16(%rip),$rot16 movdqa .Lrot24(%rip),$rot24 movdqa $a,0x00(%rsp) movdqa $b,0x10(%rsp) movdqa $c,0x20(%rsp) movdqa $d,0x30(%rsp) mov \$10,$counter # reuse $counter jmp .Loop_ssse3 .align 32 .Loop_outer_ssse3: movdqa .Lone(%rip),$d movdqa 0x00(%rsp),$a movdqa 0x10(%rsp),$b movdqa 0x20(%rsp),$c paddd 0x30(%rsp),$d mov \$10,$counter movdqa $d,0x30(%rsp) jmp .Loop_ssse3 .align 32 .Loop_ssse3: ___ &SSSE3ROUND(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b00111001); &pshufd ($d,$d,0b10010011); &nop (); &SSSE3ROUND(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b10010011); &pshufd ($d,$d,0b00111001); &dec ($counter); &jnz (".Loop_ssse3"); $code.=<<___; paddd 0x00(%rsp),$a paddd 0x10(%rsp),$b paddd 0x20(%rsp),$c paddd 0x30(%rsp),$d cmp \$64,$len jb .Ltail_ssse3 movdqu 0x00($inp),$t movdqu 0x10($inp),$t1 pxor $t,$a # xor with input movdqu 0x20($inp),$t pxor $t1,$b movdqu 0x30($inp),$t1 lea 0x40($inp),$inp # inp+=64 pxor $t,$c pxor $t1,$d movdqu $a,0x00($out) # write output movdqu $b,0x10($out) movdqu $c,0x20($out) movdqu $d,0x30($out) lea 
0x40($out),$out # out+=64 sub \$64,$len jnz .Loop_outer_ssse3 jmp .Ldone_ssse3 .align 16 .Ltail_ssse3: movdqa $a,0x00(%rsp) movdqa $b,0x10(%rsp) movdqa $c,0x20(%rsp) movdqa $d,0x30(%rsp) xor $counter,$counter .Loop_tail_ssse3: movzb ($inp,$counter),%eax movzb (%rsp,$counter),%ecx lea 1($counter),$counter xor %ecx,%eax mov %al,-1($out,$counter) dec $len jnz .Loop_tail_ssse3 .Ldone_ssse3: ___ $code.=<<___ if ($win64); movaps -0x28(%r9),%xmm6 movaps -0x18(%r9),%xmm7 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .Lssse3_epilogue: ret .cfi_endproc .size ChaCha20_ssse3,.-ChaCha20_ssse3 ___ } ######################################################################## # SSSE3 code path that handles 128-byte inputs { my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7)); my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1)); sub SSSE3ROUND_2x { &paddd ($a,$b); &pxor ($d,$a); &paddd ($a1,$b1); &pxor ($d1,$a1); &pshufb ($d,$rot16); &pshufb($d1,$rot16); &paddd ($c,$d); &paddd ($c1,$d1); &pxor ($b,$c); &pxor ($b1,$c1); &movdqa ($t,$b); &psrld ($b,20); &movdqa($t1,$b1); &pslld ($t,12); &psrld ($b1,20); &por ($b,$t); &pslld ($t1,12); &por ($b1,$t1); &paddd ($a,$b); &pxor ($d,$a); &paddd ($a1,$b1); &pxor ($d1,$a1); &pshufb ($d,$rot24); &pshufb($d1,$rot24); &paddd ($c,$d); &paddd ($c1,$d1); &pxor ($b,$c); &pxor ($b1,$c1); &movdqa ($t,$b); &psrld ($b,25); &movdqa($t1,$b1); &pslld ($t,7); &psrld ($b1,25); &por ($b,$t); &pslld ($t1,7); &por ($b1,$t1); } my $xframe = $win64 ? 
0x68 : 8; $code.=<<___; .type ChaCha20_128,\@function,5 .align 32 ChaCha20_128: .cfi_startproc .LChaCha20_128: mov %rsp,%r9 # frame pointer .cfi_def_cfa_register %r9 sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); movaps %xmm6,-0x68(%r9) movaps %xmm7,-0x58(%r9) movaps %xmm8,-0x48(%r9) movaps %xmm9,-0x38(%r9) movaps %xmm10,-0x28(%r9) movaps %xmm11,-0x18(%r9) .L128_body: ___ $code.=<<___; movdqa .Lsigma(%rip),$a movdqu ($key),$b movdqu 16($key),$c movdqu ($counter),$d movdqa .Lone(%rip),$d1 movdqa .Lrot16(%rip),$rot16 movdqa .Lrot24(%rip),$rot24 movdqa $a,$a1 movdqa $a,0x00(%rsp) movdqa $b,$b1 movdqa $b,0x10(%rsp) movdqa $c,$c1 movdqa $c,0x20(%rsp) paddd $d,$d1 movdqa $d,0x30(%rsp) mov \$10,$counter # reuse $counter jmp .Loop_128 .align 32 .Loop_128: ___ &SSSE3ROUND_2x(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b00111001); &pshufd ($d,$d,0b10010011); &pshufd ($c1,$c1,0b01001110); &pshufd ($b1,$b1,0b00111001); &pshufd ($d1,$d1,0b10010011); &SSSE3ROUND_2x(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b10010011); &pshufd ($d,$d,0b00111001); &pshufd ($c1,$c1,0b01001110); &pshufd ($b1,$b1,0b10010011); &pshufd ($d1,$d1,0b00111001); &dec ($counter); &jnz (".Loop_128"); $code.=<<___; paddd 0x00(%rsp),$a paddd 0x10(%rsp),$b paddd 0x20(%rsp),$c paddd 0x30(%rsp),$d paddd .Lone(%rip),$d1 paddd 0x00(%rsp),$a1 paddd 0x10(%rsp),$b1 paddd 0x20(%rsp),$c1 paddd 0x30(%rsp),$d1 movdqu 0x00($inp),$t movdqu 0x10($inp),$t1 pxor $t,$a # xor with input movdqu 0x20($inp),$t pxor $t1,$b movdqu 0x30($inp),$t1 pxor $t,$c movdqu 0x40($inp),$t pxor $t1,$d movdqu 0x50($inp),$t1 pxor $t,$a1 movdqu 0x60($inp),$t pxor $t1,$b1 movdqu 0x70($inp),$t1 pxor $t,$c1 pxor $t1,$d1 movdqu $a,0x00($out) # write output movdqu $b,0x10($out) movdqu $c,0x20($out) movdqu $d,0x30($out) movdqu $a1,0x40($out) movdqu $b1,0x50($out) movdqu $c1,0x60($out) movdqu $d1,0x70($out) ___ $code.=<<___ if ($win64); movaps -0x68(%r9),%xmm6 movaps -0x58(%r9),%xmm7 movaps -0x48(%r9),%xmm8 movaps -0x38(%r9),%xmm9 movaps 
-0x28(%r9),%xmm10
	movaps	-0x18(%r9),%xmm11
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L128_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_128,.-ChaCha20_128
___
}

########################################################################
# SSSE3 code path that handles longer messages.
{
# assign variables to favor Atom front-end
my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3,
    $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15));
# Register holding each state word, indexed 0..15.  Slots 8..11 (the 'c'
# rows) are "%nox" placeholders because those rows are kept on the stack.
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

# Build the instruction stream for one full round (four quarter-rounds,
# Q1..Q4) of the 4-lane SSSE3 path.
#
# Arguments: ($a0,$b0,$c0,$d0) - @xx indices of the first quarter-round.
# The indices of the other three are derived by stepping the low two bits
# of each index ((i & ~3) + ((i+1) & 3)).
#
# Returns a list of strings, each one a Perl call such as
# '&paddd ("%xmm8","%xmm12")'; the caller eval()s them in order.
#
# Register conventions, as visible in the strings below:
#   $xc,$xc_ ($xt0,$xt1)  the pair of 'c' rows currently in registers;
#   $t0,$t1  ($xt2,$xt3)  temporaries; $t1 must hold the mask at (%r10)
#                         on entry and holds it again on exit, $t0 is
#                         loaded with the mask at (%r11) mid-way.
# The `16*($cN-8)`(%rsp) operands address the stack slot of a 'c' row.
# NOTE(review): the backtick expressions are presumably evaluated by a
# later pass over $code that is outside this part of the file - confirm.
sub SSSE3_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	(
	"&paddd (@x[$a0],@x[$b0])",	# Q1
	"&paddd (@x[$a1],@x[$b1])",	# Q2
	"&pxor (@x[$d0],@x[$a0])",
	"&pxor (@x[$d1],@x[$a1])",
	"&pshufb (@x[$d0],$t1)",
	"&pshufb (@x[$d1],$t1)",

	"&paddd ($xc,@x[$d0])",
	"&paddd ($xc_,@x[$d1])",
	"&pxor (@x[$b0],$xc)",
	"&pxor (@x[$b1],$xc_)",
	"&movdqa ($t0,@x[$b0])",
	"&pslld (@x[$b0],12)",
	"&psrld ($t0,20)",
	"&movdqa ($t1,@x[$b1])",
	"&pslld (@x[$b1],12)",
	"&por (@x[$b0],$t0)",
	"&psrld ($t1,20)",
	"&movdqa ($t0,'(%r11)')",	# .Lrot24(%rip)
	"&por (@x[$b1],$t1)",

	"&paddd (@x[$a0],@x[$b0])",
	"&paddd (@x[$a1],@x[$b1])",
	"&pxor (@x[$d0],@x[$a0])",
	"&pxor (@x[$d1],@x[$a1])",
	"&pshufb (@x[$d0],$t0)",
	"&pshufb (@x[$d1],$t0)",

	"&paddd ($xc,@x[$d0])",
	"&paddd ($xc_,@x[$d1])",
	"&pxor (@x[$b0],$xc)",
	"&pxor (@x[$b1],$xc_)",
	"&movdqa ($t1,@x[$b0])",
	"&pslld (@x[$b0],7)",
	"&psrld ($t1,25)",
	"&movdqa ($t0,@x[$b1])",
	"&pslld (@x[$b1],7)",
	"&por (@x[$b0],$t1)",
	"&psrld ($t0,25)",
	"&movdqa ($t1,'(%r10)')",	# .Lrot16(%rip)
	"&por (@x[$b1],$t0)",

	"&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	"&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)",
	"&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")",
	"&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")",

	"&paddd (@x[$a2],@x[$b2])",	# Q3
	"&paddd (@x[$a3],@x[$b3])",	# Q4
	"&pxor (@x[$d2],@x[$a2])",
	"&pxor (@x[$d3],@x[$a3])",
	"&pshufb (@x[$d2],$t1)",
	"&pshufb (@x[$d3],$t1)",

	"&paddd ($xc,@x[$d2])",
	"&paddd ($xc_,@x[$d3])",
	"&pxor (@x[$b2],$xc)",
	"&pxor (@x[$b3],$xc_)",
	"&movdqa ($t0,@x[$b2])",
	"&pslld (@x[$b2],12)",
	"&psrld ($t0,20)",
	"&movdqa ($t1,@x[$b3])",
	"&pslld (@x[$b3],12)",
	"&por (@x[$b2],$t0)",
	"&psrld ($t1,20)",
	"&movdqa ($t0,'(%r11)')",	# .Lrot24(%rip)
	"&por (@x[$b3],$t1)",

	"&paddd (@x[$a2],@x[$b2])",
	"&paddd (@x[$a3],@x[$b3])",
	"&pxor (@x[$d2],@x[$a2])",
	"&pxor (@x[$d3],@x[$a3])",
	"&pshufb (@x[$d2],$t0)",
	"&pshufb (@x[$d3],$t0)",

	"&paddd ($xc,@x[$d2])",
	"&paddd ($xc_,@x[$d3])",
	"&pxor (@x[$b2],$xc)",
	"&pxor (@x[$b3],$xc_)",
	"&movdqa ($t1,@x[$b2])",
	"&pslld (@x[$b2],7)",
	"&psrld ($t1,25)",
	"&movdqa ($t0,@x[$b3])",
	"&pslld (@x[$b3],7)",
	"&por (@x[$b2],$t1)",
	"&psrld ($t0,25)",
	"&movdqa ($t1,'(%r10)')",	# .Lrot16(%rip)
	"&por (@x[$b3],$t0)"
	);
}

# Extra frame bytes below %r9: on Win64 0xa8 covers the ten 16-byte save
# slots for %xmm6-%xmm15 (stored at -0xa8..-0x18(%r9) below); 8 otherwise.
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type	ChaCha20_4x,\@function,5
.align	32
ChaCha20_4x:
.cfi_startproc
.LChaCha20_4x:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	mov	%r10,%r11
___
# The jump to the 8x code is only emitted when the assembler was found to
# support AVX2 ($avx>1).
$code.=<<___	if ($avx>1);
	shr	\$32,%r10		# OPENSSL_ia32cap_P+8
	test	\$`1<<5`,%r10		# test AVX2
	jnz	.LChaCha20_8x
___
$code.=<<___;
	cmp	\$192,$len
	ja	.Lproceed4x
	and	\$`1<<26|1<<22`,%r11	# isolate XSAVE+MOVBE
	cmp	\$`1<<22`,%r11		# check for MOVBE without XSAVE
	je	.Ldo_sse3_after_all	# to detect Atom
.Lproceed4x:
	sub	\$0x140+$xframe,%rsp
___
	################ stack layout
	# +0x00		SIMD equivalent of @x[8..11] (the 'c' rows)
	# ...
	# +0x40		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x100	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x140
$code.=<<___	if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.L4x_body:
___
$code.=<<___;
	movdqa	.Lsigma(%rip),$xa3	# key[0]
	movdqu	($key),$xb3		# key[1]
	movdqu	16($key),$xt3		# key[2]
	movdqu	($counter),$xd3		# key[3]
	lea	0x100(%rsp),%rcx	# size optimization
	lea	.Lrot16(%rip),%r10
	lea	.Lrot24(%rip),%r11
	pshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	pshufd	\$0x55,$xa3,$xa1
	movdqa	$xa0,0x40(%rsp)		# ... and offload
	pshufd	\$0xaa,$xa3,$xa2
	movdqa	$xa1,0x50(%rsp)
	pshufd	\$0xff,$xa3,$xa3
	movdqa	$xa2,0x60(%rsp)
	movdqa	$xa3,0x70(%rsp)
	pshufd	\$0x00,$xb3,$xb0
	pshufd	\$0x55,$xb3,$xb1
	movdqa	$xb0,0x80-0x100(%rcx)
	pshufd	\$0xaa,$xb3,$xb2
	movdqa	$xb1,0x90-0x100(%rcx)
	pshufd	\$0xff,$xb3,$xb3
	movdqa	$xb2,0xa0-0x100(%rcx)
	movdqa	$xb3,0xb0-0x100(%rcx)
	pshufd	\$0x00,$xt3,$xt0	# "$xc0"
	pshufd	\$0x55,$xt3,$xt1	# "$xc1"
	movdqa	$xt0,0xc0-0x100(%rcx)
	pshufd	\$0xaa,$xt3,$xt2	# "$xc2"
	movdqa	$xt1,0xd0-0x100(%rcx)
	pshufd	\$0xff,$xt3,$xt3	# "$xc3"
	movdqa	$xt2,0xe0-0x100(%rcx)
	movdqa	$xt3,0xf0-0x100(%rcx)
	pshufd	\$0x00,$xd3,$xd0
	pshufd	\$0x55,$xd3,$xd1
	paddd	.Linc(%rip),$xd0	# don't save counters yet
	pshufd	\$0xaa,$xd3,$xd2
	movdqa	$xd1,0x110-0x100(%rcx)
	pshufd	\$0xff,$xd3,$xd3
	movdqa	$xd2,0x120-0x100(%rcx)
	movdqa	$xd3,0x130-0x100(%rcx)
	jmp	.Loop_enter4x
.align	32
.Loop_outer4x:
	movdqa	0x40(%rsp),$xa0		# re-load smashed key
	movdqa	0x50(%rsp),$xa1
	movdqa	0x60(%rsp),$xa2
	movdqa	0x70(%rsp),$xa3
	movdqa	0x80-0x100(%rcx),$xb0
	movdqa	0x90-0x100(%rcx),$xb1
	movdqa	0xa0-0x100(%rcx),$xb2
	movdqa	0xb0-0x100(%rcx),$xb3
	movdqa	0xc0-0x100(%rcx),$xt0	# "$xc0"
	movdqa	0xd0-0x100(%rcx),$xt1	# "$xc1"
	movdqa	0xe0-0x100(%rcx),$xt2	# "$xc2"
	movdqa	0xf0-0x100(%rcx),$xt3	# "$xc3"
	movdqa	0x100-0x100(%rcx),$xd0
	movdqa	0x110-0x100(%rcx),$xd1
	movdqa	0x120-0x100(%rcx),$xd2
	movdqa	0x130-0x100(%rcx),$xd3
	paddd	.Lfour(%rip),$xd0	# next SIMD counters
.Loop_enter4x:
	movdqa	$xt2,0x20(%rsp)		# SIMD equivalent of "@x[10]"
	movdqa	$xt3,0x30(%rsp)		# SIMD equivalent of "@x[11]"
	movdqa	(%r10),$xt3		# .Lrot16(%rip)
	mov	\$10,%eax
	movdqa	$xd0,0x100-0x100(%rcx)	# save SIMD counters
	jmp	.Loop4x
.align	32
.Loop4x:
___
	# Loop body (run 10 times, see "mov \$10,%eax" above): one round on
	# the index pattern starting 0,4,8,12 and one on 0,5,10,15.  Each
	# returned string is a Perl statement and is eval()ed in order.
	foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec	%eax
	jnz	.Loop4x
	paddd	0x40(%rsp),$xa0		# accumulate key material
	paddd	0x50(%rsp),$xa1
	paddd	0x60(%rsp),$xa2
	paddd	0x70(%rsp),$xa3
	movdqa	$xa0,$xt2		# "de-interlace" data
	punpckldq	$xa1,$xa0
	movdqa	$xa2,$xt3
	punpckldq	$xa3,$xa2
	punpckhdq	$xa1,$xt2
	punpckhdq	$xa3,$xt3
	movdqa	$xa0,$xa1
	punpcklqdq	$xa2,$xa0	# "a0"
	movdqa	$xt2,$xa3
	punpcklqdq	$xt3,$xt2	# "a2"
	punpckhqdq	$xa2,$xa1	# "a1"
	punpckhqdq	$xt3,$xa3	# "a3"
___
	# The code above left "a2" in the register named $xt2; swap the Perl
	# names so that $xa2 refers to it from here on.
	($xa2,$xt2)=($xt2,$xa2);
$code.=<<___;
	paddd	0x80-0x100(%rcx),$xb0
	paddd	0x90-0x100(%rcx),$xb1
	paddd	0xa0-0x100(%rcx),$xb2
	paddd	0xb0-0x100(%rcx),$xb3
	movdqa	$xa0,0x00(%rsp)		# offload $xaN
	movdqa	$xa1,0x10(%rsp)
	movdqa	0x20(%rsp),$xa0		# "xc2"
	movdqa	0x30(%rsp),$xa1		# "xc3"
	movdqa	$xb0,$xt2
	punpckldq	$xb1,$xb0
	movdqa	$xb2,$xt3
	punpckldq	$xb3,$xb2
	punpckhdq	$xb1,$xt2
	punpckhdq	$xb3,$xt3
	movdqa	$xb0,$xb1
	punpcklqdq	$xb2,$xb0	# "b0"
	movdqa	$xt2,$xb3
	punpcklqdq	$xt3,$xt2	# "b2"
	punpckhqdq	$xb2,$xb1	# "b1"
	punpckhqdq	$xt3,$xb3	# "b3"
___
	# Same renaming for "b2".  The 'c' rows are then named: two are still
	# in $xt0,$xt1 and two were just reloaded into $xa0,$xa1 (see the
	# "xc2"/"xc3" loads above).
	($xb2,$xt2)=($xt2,$xb2);
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
$code.=<<___;
	paddd	0xc0-0x100(%rcx),$xc0
	paddd	0xd0-0x100(%rcx),$xc1
	paddd	0xe0-0x100(%rcx),$xc2
	paddd	0xf0-0x100(%rcx),$xc3
	movdqa	$xa2,0x20(%rsp)		# keep offloading $xaN
	movdqa	$xa3,0x30(%rsp)
	movdqa	$xc0,$xt2
	punpckldq	$xc1,$xc0
	movdqa	$xc2,$xt3
	punpckldq	$xc3,$xc2
	punpckhdq	$xc1,$xt2
	punpckhdq	$xc3,$xt3
	movdqa	$xc0,$xc1
	punpcklqdq	$xc2,$xc0	# "c0"
	movdqa	$xt2,$xc3
	punpcklqdq	$xt3,$xt2	# "c2"
	punpckhqdq	$xc2,$xc1	# "c1"
	punpckhqdq	$xt3,$xc3	# "c3"
___
	($xc2,$xt2)=($xt2,$xc2);
	($xt0,$xt1)=($xa2,$xa3);		# use $xaN as temporary
$code.=<<___;
	paddd	0x100-0x100(%rcx),$xd0
	paddd	0x110-0x100(%rcx),$xd1
	paddd	0x120-0x100(%rcx),$xd2
	paddd	0x130-0x100(%rcx),$xd3
	movdqa	$xd0,$xt2
	punpckldq	$xd1,$xd0
	movdqa	$xd2,$xt3
	punpckldq	$xd3,$xd2
	punpckhdq	$xd1,$xt2
	punpckhdq	$xd3,$xt3
	movdqa	$xd0,$xd1
	punpcklqdq	$xd2,$xd0	# "d0"
	movdqa	$xt2,$xd3
	punpcklqdq	$xt3,$xt2	# "d2"
	punpckhqdq	$xd2,$xd1	# "d1"
	punpckhqdq	$xt3,$xd3	# "d3"
___
	($xd2,$xt2)=($xt2,$xd2);
$code.=<<___;
	cmp	\$64*4,$len
	jb	.Ltail4x
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3
	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# size optimization
	pxor	0x10(%rsp),$xt0
	pxor	$xb1,$xt1
	pxor	$xc1,$xt2
	pxor	$xd1,$xt3
	movdqu	$xt0,0x40($out)
	movdqu	0x00($inp),$xt0
	movdqu	$xt1,0x50($out)
	movdqu	0x10($inp),$xt1
	movdqu	$xt2,0x60($out)
	movdqu	0x20($inp),$xt2
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# size optimization
	movdqu	0x30($inp),$xt3
	pxor	0x20(%rsp),$xt0
	pxor	$xb2,$xt1
	pxor	$xc2,$xt2
	pxor	$xd2,$xt3
	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# inp+=64*4
	pxor	0x30(%rsp),$xt0
	pxor	$xb3,$xt1
	pxor	$xc3,$xt2
	pxor	$xd3,$xt3
	movdqu	$xt0,0x40($out)
	movdqu	$xt1,0x50($out)
	movdqu	$xt2,0x60($out)
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# out+=64*4
	sub	\$64*4,$len
	jnz	.Loop_outer4x
	jmp	.Ldone4x
.Ltail4x:
	cmp	\$192,$len
	jae	.L192_or_more4x
	cmp	\$128,$len
	jae	.L128_or_more4x
	cmp	\$64,$len
	jae	.L64_or_more4x
	#movdqa	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	xor	%r10,%r10
	#movdqa	$xt0,0x00(%rsp)
	movdqa	$xb0,0x10(%rsp)
	movdqa	$xc0,0x20(%rsp)
	movdqa	$xd0,0x30(%rsp)
	jmp	.Loop_tail4x
.align	32
.L64_or_more4x:
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaxN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3
	movdqu	$xt0,0x00($out)
	movdqu	$xt1,0x10($out)
	movdqu	$xt2,0x20($out)
	movdqu	$xt3,0x30($out)
	je	.Ldone4x
	movdqa	0x10(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x40($inp),$inp		# inp+=64*1
	xor	%r10,%r10
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb1,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*1
	movdqa	$xc1,0x20(%rsp)
	sub	\$64,$len		# len-=64*1
	movdqa	$xd1,0x30(%rsp)
	jmp	.Loop_tail4x
.align	32
.L128_or_more4x:
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3
	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	pxor	0x10(%rsp),$xt0
	pxor	$xb1,$xt1
	pxor	$xc1,$xt2
	pxor	$xd1,$xt3
	movdqu	$xt0,0x40($out)
	movdqu	$xt1,0x50($out)
	movdqu	$xt2,0x60($out)
	movdqu	$xt3,0x70($out)
	je	.Ldone4x
	movdqa	0x20(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x80($inp),$inp		# inp+=64*2
	xor	%r10,%r10
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb2,0x10(%rsp)
	lea	0x80($out),$out		# out+=64*2
	movdqa	$xc2,0x20(%rsp)
	sub	\$128,$len		# len-=64*2
	movdqa	$xd2,0x30(%rsp)
	jmp	.Loop_tail4x
.align	32
.L192_or_more4x:
	movdqu	0x00($inp),$xt0		# xor with input
	movdqu	0x10($inp),$xt1
	movdqu	0x20($inp),$xt2
	movdqu	0x30($inp),$xt3
	pxor	0x00(%rsp),$xt0		# $xaN is offloaded, remember?
	pxor	$xb0,$xt1
	pxor	$xc0,$xt2
	pxor	$xd0,$xt3
	movdqu	$xt0,0x00($out)
	movdqu	0x40($inp),$xt0
	movdqu	$xt1,0x10($out)
	movdqu	0x50($inp),$xt1
	movdqu	$xt2,0x20($out)
	movdqu	0x60($inp),$xt2
	movdqu	$xt3,0x30($out)
	movdqu	0x70($inp),$xt3
	lea	0x80($inp),$inp		# size optimization
	pxor	0x10(%rsp),$xt0
	pxor	$xb1,$xt1
	pxor	$xc1,$xt2
	pxor	$xd1,$xt3
	movdqu	$xt0,0x40($out)
	movdqu	0x00($inp),$xt0
	movdqu	$xt1,0x50($out)
	movdqu	0x10($inp),$xt1
	movdqu	$xt2,0x60($out)
	movdqu	0x20($inp),$xt2
	movdqu	$xt3,0x70($out)
	lea	0x80($out),$out		# size optimization
	movdqu	0x30($inp),$xt3
	pxor	0x20(%rsp),$xt0
	pxor	$xb2,$xt1
	pxor	$xc2,$xt2
	pxor	$xd2,$xt3
	movdqu	$xt0,0x00($out)
	movdqu	$xt1,0x10($out)
	movdqu	$xt2,0x20($out)
	movdqu	$xt3,0x30($out)
	je	.Ldone4x
	movdqa	0x30(%rsp),$xt0		# $xaN is offloaded, remember?
	lea	0x40($inp),$inp		# inp+=64*3
	xor	%r10,%r10
	movdqa	$xt0,0x00(%rsp)
	movdqa	$xb3,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*3
	movdqa	$xc3,0x20(%rsp)
	sub	\$192,$len		# len-=64*3
	movdqa	$xd3,0x30(%rsp)
.Loop_tail4x:
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
	dec	$len
	jnz	.Loop_tail4x
.Ldone4x:
___
$code.=<<___	if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4x_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_4x,.-ChaCha20_4x
___
}

########################################################################
# XOP code path that handles all lengths.
if ($avx) {
# There is some "anomaly" observed depending on instructions' size or
# alignment. If you look closely at below code you'll notice that
# sometimes argument order varies. The order affects instruction
# encoding by making it larger, and such fiddling gives 5% performance
# improvement. This is on FX-4100...
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15));
# Register holding each state word, indexed 0..15.  Unlike the SSSE3 and
# AVX2 paths, slots 8..11 (the 'c' rows) are real registers, $xt0..$xt3.
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	 $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3);

# Build the instruction stream for one full round (four quarter-rounds,
# Q1..Q4, interleaved step by step) of the XOP path.
#
# Arguments: ($a0,$b0,$c0,$d0) - @xx indices of the first quarter-round.
# The indices of the other three are derived by stepping the low two bits
# of each index ((i & ~3) + ((i+1) & 3)).
#
# Returns a list of strings, each one a Perl call such as
# '&vpaddd ("%xmm8","%xmm8","%xmm0")'; the caller eval()s them in order.
# The rotations are done with vprotd by 16, 12, 8 and 7.  Lines marked
# "flip" have their two source operands swapped relative to their
# neighbours (see the "anomaly" note at the top of this section).
sub XOP_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my @x=map("\"$_\"",@xx);

	(
	"&vpaddd (@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpaddd (@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpaddd (@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpaddd (@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor (@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor (@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor (@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor (@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd (@x[$d0],@x[$d0],16)",
	"&vprotd (@x[$d1],@x[$d1],16)",
	"&vprotd (@x[$d2],@x[$d2],16)",
	"&vprotd (@x[$d3],@x[$d3],16)",

	"&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor (@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor (@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor (@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor (@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd (@x[$b0],@x[$b0],12)",
	"&vprotd (@x[$b1],@x[$b1],12)",
	"&vprotd (@x[$b2],@x[$b2],12)",
	"&vprotd (@x[$b3],@x[$b3],12)",

	"&vpaddd (@x[$a0],@x[$b0],@x[$a0])",	# flip
	"&vpaddd (@x[$a1],@x[$b1],@x[$a1])",	# flip
	"&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
	"&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor (@x[$d0],@x[$a0],@x[$d0])",
	"&vpxor (@x[$d1],@x[$a1],@x[$d1])",
	"&vpxor (@x[$d2],@x[$a2],@x[$d2])",
	"&vpxor (@x[$d3],@x[$a3],@x[$d3])",
	"&vprotd (@x[$d0],@x[$d0],8)",
	"&vprotd (@x[$d1],@x[$d1],8)",
	"&vprotd (@x[$d2],@x[$d2],8)",
	"&vprotd (@x[$d3],@x[$d3],8)",

	"&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
	"&vpxor (@x[$b0],@x[$c0],@x[$b0])",
	"&vpxor (@x[$b1],@x[$c1],@x[$b1])",
	"&vpxor (@x[$b2],@x[$b2],@x[$c2])",	# flip
	"&vpxor (@x[$b3],@x[$b3],@x[$c3])",	# flip
	"&vprotd (@x[$b0],@x[$b0],7)",
	"&vprotd (@x[$b1],@x[$b1],7)",
	"&vprotd (@x[$b2],@x[$b2],7)",
	"&vprotd (@x[$b3],@x[$b3],7)"
	);
}

# Extra frame bytes below %r9: on Win64 0xa8 covers the ten 16-byte save
# slots for %xmm6-%xmm15 (stored at -0xa8..-0x18(%r9) below); 8 otherwise.
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type	ChaCha20_4xop,\@function,5
.align	32
ChaCha20_4xop:
.cfi_startproc
.LChaCha20_4xop:
	mov	%rsp,%r9		# frame pointer
.cfi_def_cfa_register	%r9
	sub	\$0x140+$xframe,%rsp
___
	################ stack layout
	# +0x00		scratch: $xc2,3 and $xa0,1 are parked here while
	#		the result is de-interlaced, then the tail block
	# ...
	# +0x40		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x100	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x140
$code.=<<___	if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.L4xop_body:
___
$code.=<<___;
	vzeroupper
	vmovdqa	.Lsigma(%rip),$xa3	# key[0]
	vmovdqu	($key),$xb3		# key[1]
	vmovdqu	16($key),$xt3		# key[2]
	vmovdqu	($counter),$xd3		# key[3]
	lea	0x100(%rsp),%rcx	# size optimization
	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd	\$0x55,$xa3,$xa1
	vmovdqa	$xa0,0x40(%rsp)		# ... and offload
	vpshufd	\$0xaa,$xa3,$xa2
	vmovdqa	$xa1,0x50(%rsp)
	vpshufd	\$0xff,$xa3,$xa3
	vmovdqa	$xa2,0x60(%rsp)
	vmovdqa	$xa3,0x70(%rsp)
	vpshufd	\$0x00,$xb3,$xb0
	vpshufd	\$0x55,$xb3,$xb1
	vmovdqa	$xb0,0x80-0x100(%rcx)
	vpshufd	\$0xaa,$xb3,$xb2
	vmovdqa	$xb1,0x90-0x100(%rcx)
	vpshufd	\$0xff,$xb3,$xb3
	vmovdqa	$xb2,0xa0-0x100(%rcx)
	vmovdqa	$xb3,0xb0-0x100(%rcx)
	vpshufd	\$0x00,$xt3,$xt0	# "$xc0"
	vpshufd	\$0x55,$xt3,$xt1	# "$xc1"
	vmovdqa	$xt0,0xc0-0x100(%rcx)
	vpshufd	\$0xaa,$xt3,$xt2	# "$xc2"
	vmovdqa	$xt1,0xd0-0x100(%rcx)
	vpshufd	\$0xff,$xt3,$xt3	# "$xc3"
	vmovdqa	$xt2,0xe0-0x100(%rcx)
	vmovdqa	$xt3,0xf0-0x100(%rcx)
	vpshufd	\$0x00,$xd3,$xd0
	vpshufd	\$0x55,$xd3,$xd1
	vpaddd	.Linc(%rip),$xd0,$xd0	# don't save counters yet
	vpshufd	\$0xaa,$xd3,$xd2
	vmovdqa	$xd1,0x110-0x100(%rcx)
	vpshufd	\$0xff,$xd3,$xd3
	vmovdqa	$xd2,0x120-0x100(%rcx)
	vmovdqa	$xd3,0x130-0x100(%rcx)
	jmp	.Loop_enter4xop
.align	32
.Loop_outer4xop:
	vmovdqa	0x40(%rsp),$xa0		# re-load smashed key
	vmovdqa	0x50(%rsp),$xa1
	vmovdqa	0x60(%rsp),$xa2
	vmovdqa	0x70(%rsp),$xa3
	vmovdqa	0x80-0x100(%rcx),$xb0
	vmovdqa	0x90-0x100(%rcx),$xb1
	vmovdqa	0xa0-0x100(%rcx),$xb2
	vmovdqa	0xb0-0x100(%rcx),$xb3
	vmovdqa	0xc0-0x100(%rcx),$xt0	# "$xc0"
	vmovdqa	0xd0-0x100(%rcx),$xt1	# "$xc1"
	vmovdqa	0xe0-0x100(%rcx),$xt2	# "$xc2"
	vmovdqa	0xf0-0x100(%rcx),$xt3	# "$xc3"
	vmovdqa	0x100-0x100(%rcx),$xd0
	vmovdqa	0x110-0x100(%rcx),$xd1
	vmovdqa	0x120-0x100(%rcx),$xd2
	vmovdqa	0x130-0x100(%rcx),$xd3
	vpaddd	.Lfour(%rip),$xd0,$xd0	# next SIMD counters
.Loop_enter4xop:
	mov	\$10,%eax
	vmovdqa	$xd0,0x100-0x100(%rcx)	# save SIMD counters
	jmp	.Loop4xop
.align	32
.Loop4xop:
___
	# Loop body (run 10 times, see "mov \$10,%eax" above): one round on
	# the index pattern starting 0,4,8,12 and one on 0,5,10,15.  Each
	# returned string is a Perl statement and is eval()ed in order.
	foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; }
	foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; }
$code.=<<___;
	dec	%eax
	jnz	.Loop4xop
	vpaddd	0x40(%rsp),$xa0,$xa0	# accumulate key material
	vpaddd	0x50(%rsp),$xa1,$xa1
	vpaddd	0x60(%rsp),$xa2,$xa2
	vpaddd	0x70(%rsp),$xa3,$xa3
	vmovdqa	$xt2,0x20(%rsp)		# offload $xc2,3
	vmovdqa	$xt3,0x30(%rsp)
	vpunpckldq	$xa1,$xa0,$xt2	# "de-interlace" data
	vpunpckldq	$xa3,$xa2,$xt3
	vpunpckhdq	$xa1,$xa0,$xa0
	vpunpckhdq	$xa3,$xa2,$xa2
	vpunpcklqdq	$xt3,$xt2,$xa1	# "a0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "a1"
	vpunpcklqdq	$xa2,$xa0,$xa3	# "a2"
	vpunpckhqdq	$xa2,$xa0,$xa0	# "a3"
___
	# Re-bind the Perl names to the registers that now hold "a0".."a3"
	# (see the quoted names in the comments above); $xt2 takes over the
	# register that became free.
	($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2);
$code.=<<___;
	vpaddd	0x80-0x100(%rcx),$xb0,$xb0
	vpaddd	0x90-0x100(%rcx),$xb1,$xb1
	vpaddd	0xa0-0x100(%rcx),$xb2,$xb2
	vpaddd	0xb0-0x100(%rcx),$xb3,$xb3
	vmovdqa	$xa0,0x00(%rsp)		# offload $xa0,1
	vmovdqa	$xa1,0x10(%rsp)
	vmovdqa	0x20(%rsp),$xa0		# "xc2"
	vmovdqa	0x30(%rsp),$xa1		# "xc3"
	vpunpckldq	$xb1,$xb0,$xt2
	vpunpckldq	$xb3,$xb2,$xt3
	vpunpckhdq	$xb1,$xb0,$xb0
	vpunpckhdq	$xb3,$xb2,$xb2
	vpunpcklqdq	$xt3,$xt2,$xb1	# "b0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "b1"
	vpunpcklqdq	$xb2,$xb0,$xb3	# "b2"
	vpunpckhqdq	$xb2,$xb0,$xb0	# "b3"
___
	# Same re-binding for the 'b' rows.  The 'c' rows are then named: two
	# are still in $xt0,$xt1 and two were just reloaded into $xa0,$xa1.
	($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2);
	my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1);
$code.=<<___;
	vpaddd	0xc0-0x100(%rcx),$xc0,$xc0
	vpaddd	0xd0-0x100(%rcx),$xc1,$xc1
	vpaddd	0xe0-0x100(%rcx),$xc2,$xc2
	vpaddd	0xf0-0x100(%rcx),$xc3,$xc3
	vpunpckldq	$xc1,$xc0,$xt2
	vpunpckldq	$xc3,$xc2,$xt3
	vpunpckhdq	$xc1,$xc0,$xc0
	vpunpckhdq	$xc3,$xc2,$xc2
	vpunpcklqdq	$xt3,$xt2,$xc1	# "c0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "c1"
	vpunpcklqdq	$xc2,$xc0,$xc3	# "c2"
	vpunpckhqdq	$xc2,$xc0,$xc0	# "c3"
___
	($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2);
$code.=<<___;
	vpaddd	0x100-0x100(%rcx),$xd0,$xd0
	vpaddd	0x110-0x100(%rcx),$xd1,$xd1
	vpaddd	0x120-0x100(%rcx),$xd2,$xd2
	vpaddd	0x130-0x100(%rcx),$xd3,$xd3
	vpunpckldq	$xd1,$xd0,$xt2
	vpunpckldq	$xd3,$xd2,$xt3
	vpunpckhdq	$xd1,$xd0,$xd0
	vpunpckhdq	$xd3,$xd2,$xd2
	vpunpcklqdq	$xt3,$xt2,$xd1	# "d0"
	vpunpckhqdq	$xt3,$xt2,$xt2	# "d1"
	vpunpcklqdq	$xd2,$xd0,$xd3	# "d2"
	vpunpckhqdq	$xd2,$xd0,$xd0	# "d3"
___
	# After this re-binding $xt2,$xt3 are free, so they become the new
	# homes of $xa0,$xa1, which are reloaded from the stack just below.
	($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2);
	($xa0,$xa1)=($xt2,$xt3);
$code.=<<___;
	vmovdqa	0x00(%rsp),$xa0		# restore $xa0,1
	vmovdqa	0x10(%rsp),$xa1
	cmp	\$64*4,$len
	jb	.Ltail4xop
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vpxor	0x40($inp),$xa1,$xa1
	vpxor	0x50($inp),$xb1,$xb1
	vpxor	0x60($inp),$xc1,$xc1
	vpxor	0x70($inp),$xd1,$xd1
	lea	0x80($inp),$inp		# size optimization
	vpxor	0x00($inp),$xa2,$xa2
	vpxor	0x10($inp),$xb2,$xb2
	vpxor	0x20($inp),$xc2,$xc2
	vpxor	0x30($inp),$xd2,$xd2
	vpxor	0x40($inp),$xa3,$xa3
	vpxor	0x50($inp),$xb3,$xb3
	vpxor	0x60($inp),$xc3,$xc3
	vpxor	0x70($inp),$xd3,$xd3
	lea	0x80($inp),$inp		# inp+=64*4
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	vmovdqu	$xa1,0x40($out)
	vmovdqu	$xb1,0x50($out)
	vmovdqu	$xc1,0x60($out)
	vmovdqu	$xd1,0x70($out)
	lea	0x80($out),$out		# size optimization
	vmovdqu	$xa2,0x00($out)
	vmovdqu	$xb2,0x10($out)
	vmovdqu	$xc2,0x20($out)
	vmovdqu	$xd2,0x30($out)
	vmovdqu	$xa3,0x40($out)
	vmovdqu	$xb3,0x50($out)
	vmovdqu	$xc3,0x60($out)
	vmovdqu	$xd3,0x70($out)
	lea	0x80($out),$out		# out+=64*4
	sub	\$64*4,$len
	jnz	.Loop_outer4xop
	jmp	.Ldone4xop
.align	32
.Ltail4xop:
	cmp	\$192,$len
	jae	.L192_or_more4xop
	cmp	\$128,$len
	jae	.L128_or_more4xop
	cmp	\$64,$len
	jae	.L64_or_more4xop
	xor	%r10,%r10
	vmovdqa	$xa0,0x00(%rsp)
	vmovdqa	$xb0,0x10(%rsp)
	vmovdqa	$xc0,0x20(%rsp)
	vmovdqa	$xd0,0x30(%rsp)
	jmp	.Loop_tail4xop
.align	32
.L64_or_more4xop:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	je	.Ldone4xop
	lea	0x40($inp),$inp		# inp+=64*1
	vmovdqa	$xa1,0x00(%rsp)
	xor	%r10,%r10
	vmovdqa	$xb1,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*1
	vmovdqa	$xc1,0x20(%rsp)
	sub	\$64,$len		# len-=64*1
	vmovdqa	$xd1,0x30(%rsp)
	jmp	.Loop_tail4xop
.align	32
.L128_or_more4xop:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vpxor	0x40($inp),$xa1,$xa1
	vpxor	0x50($inp),$xb1,$xb1
	vpxor	0x60($inp),$xc1,$xc1
	vpxor	0x70($inp),$xd1,$xd1
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	vmovdqu	$xa1,0x40($out)
	vmovdqu	$xb1,0x50($out)
	vmovdqu	$xc1,0x60($out)
	vmovdqu	$xd1,0x70($out)
	je	.Ldone4xop
	lea	0x80($inp),$inp		# inp+=64*2
	vmovdqa	$xa2,0x00(%rsp)
	xor	%r10,%r10
	vmovdqa	$xb2,0x10(%rsp)
	lea	0x80($out),$out		# out+=64*2
	vmovdqa	$xc2,0x20(%rsp)
	sub	\$128,$len		# len-=64*2
	vmovdqa	$xd2,0x30(%rsp)
	jmp	.Loop_tail4xop
.align	32
.L192_or_more4xop:
	vpxor	0x00($inp),$xa0,$xa0	# xor with input
	vpxor	0x10($inp),$xb0,$xb0
	vpxor	0x20($inp),$xc0,$xc0
	vpxor	0x30($inp),$xd0,$xd0
	vpxor	0x40($inp),$xa1,$xa1
	vpxor	0x50($inp),$xb1,$xb1
	vpxor	0x60($inp),$xc1,$xc1
	vpxor	0x70($inp),$xd1,$xd1
	lea	0x80($inp),$inp		# size optimization
	vpxor	0x00($inp),$xa2,$xa2
	vpxor	0x10($inp),$xb2,$xb2
	vpxor	0x20($inp),$xc2,$xc2
	vpxor	0x30($inp),$xd2,$xd2
	vmovdqu	$xa0,0x00($out)
	vmovdqu	$xb0,0x10($out)
	vmovdqu	$xc0,0x20($out)
	vmovdqu	$xd0,0x30($out)
	vmovdqu	$xa1,0x40($out)
	vmovdqu	$xb1,0x50($out)
	vmovdqu	$xc1,0x60($out)
	vmovdqu	$xd1,0x70($out)
	lea	0x80($out),$out		# size optimization
	vmovdqu	$xa2,0x00($out)
	vmovdqu	$xb2,0x10($out)
	vmovdqu	$xc2,0x20($out)
	vmovdqu	$xd2,0x30($out)
	je	.Ldone4xop
	lea	0x40($inp),$inp		# inp+=64*3
	vmovdqa	$xa3,0x00(%rsp)
	xor	%r10,%r10
	vmovdqa	$xb3,0x10(%rsp)
	lea	0x40($out),$out		# out+=64*3
	vmovdqa	$xc3,0x20(%rsp)
	sub	\$192,$len		# len-=64*3
	vmovdqa	$xd3,0x30(%rsp)
.Loop_tail4xop:
	movzb	($inp,%r10),%eax
	movzb	(%rsp,%r10),%ecx
	lea	1(%r10),%r10
	xor	%ecx,%eax
	mov	%al,-1($out,%r10)
	dec	$len
	jnz	.Loop_tail4xop
.Ldone4xop:
	vzeroupper
___
$code.=<<___	if ($win64);
	movaps	-0xa8(%r9),%xmm6
	movaps	-0x98(%r9),%xmm7
	movaps	-0x88(%r9),%xmm8
	movaps	-0x78(%r9),%xmm9
	movaps	-0x68(%r9),%xmm10
	movaps	-0x58(%r9),%xmm11
	movaps	-0x48(%r9),%xmm12
	movaps	-0x38(%r9),%xmm13
	movaps	-0x28(%r9),%xmm14
	movaps	-0x18(%r9),%xmm15
___
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.L4xop_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_4xop,.-ChaCha20_4xop
___
}

########################################################################
#
AVX2 code path
if ($avx>1) {
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3,
    $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15));
# Register holding each state word, indexed 0..15.  Slots 8..11 (the 'c'
# rows) are "%nox" placeholders because those rows are kept on the stack.
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	"%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3);

# Build the instruction stream for one full round (four quarter-rounds,
# Q1..Q4) of the 8-lane AVX2 path.
#
# Arguments: ($a0,$b0,$c0,$d0) - @xx indices of the first quarter-round.
# The indices of the other three are derived by stepping the low two bits
# of each index ((i & ~3) + ((i+1) & 3)).
#
# Returns a list of strings, each one a Perl call such as
# '&vpaddd ("%ymm8","%ymm8","%ymm0")'; the caller eval()s them in order.
#
# Register conventions, as visible in the strings below:
#   $xc,$xc_ ($xt0,$xt1)  the pair of 'c' rows currently in registers;
#   $t0,$t1  ($xt2,$xt3)  temporaries; $t1 must hold the broadcast mask
#                         from (%r10) on entry and holds it again on exit,
#                         $t0 is loaded with the mask at (%r11) mid-way.
# The `32*($cN-8)`(%rsp) operands address the stack slot of a 'c' row.
# NOTE(review): the backtick expressions are presumably evaluated by a
# later pass over $code that is outside this part of the file - confirm.
sub AVX2_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3);
my @x=map("\"$_\"",@xx);

	# Consider order in which variables are addressed by their
	# index:
	#
	#	a   b   c   d
	#
	#	0   4   8  12 < even round
	#	1   5   9  13
	#	2   6  10  14
	#	3   7  11  15
	#	0   5  10  15 < odd round
	#	1   6  11  12
	#	2   7   8  13
	#	3   4   9  14
	#
	# 'a', 'b' and 'd's are permanently allocated in registers,
	# @x[0..7,12..15], while 'c's are maintained in memory. If
	# you observe 'c' column, you'll notice that pair of 'c's is
	# invariant between rounds. This means that we have to reload
	# them once per round, in the middle. This is why you'll see
	# bunch of 'c' stores and loads in the middle, but none in
	# the beginning or end.

	(
	"&vpaddd (@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpxor (@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb (@x[$d0],@x[$d0],$t1)",
	"&vpaddd (@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpxor (@x[$d1],@x[$a1],@x[$d1])",
	"&vpshufb (@x[$d1],@x[$d1],$t1)",

	"&vpaddd ($xc,$xc,@x[$d0])",
	"&vpxor (@x[$b0],$xc,@x[$b0])",
	"&vpslld ($t0,@x[$b0],12)",
	"&vpsrld (@x[$b0],@x[$b0],20)",
	"&vpor (@x[$b0],$t0,@x[$b0])",
	"&vbroadcasti128($t0,'(%r11)')",	# .Lrot24(%rip)
	"&vpaddd ($xc_,$xc_,@x[$d1])",
	"&vpxor (@x[$b1],$xc_,@x[$b1])",
	"&vpslld ($t1,@x[$b1],12)",
	"&vpsrld (@x[$b1],@x[$b1],20)",
	"&vpor (@x[$b1],$t1,@x[$b1])",

	"&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
	"&vpxor (@x[$d0],@x[$a0],@x[$d0])",
	"&vpshufb (@x[$d0],@x[$d0],$t0)",
	"&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
	"&vpxor (@x[$d1],@x[$a1],@x[$d1])",
	"&vpshufb (@x[$d1],@x[$d1],$t0)",

	"&vpaddd ($xc,$xc,@x[$d0])",
	"&vpxor (@x[$b0],$xc,@x[$b0])",
	"&vpslld ($t1,@x[$b0],7)",
	"&vpsrld (@x[$b0],@x[$b0],25)",
	"&vpor (@x[$b0],$t1,@x[$b0])",
	"&vbroadcasti128($t1,'(%r10)')",	# .Lrot16(%rip)
	"&vpaddd ($xc_,$xc_,@x[$d1])",
	"&vpxor (@x[$b1],$xc_,@x[$b1])",
	"&vpslld ($t0,@x[$b1],7)",
	"&vpsrld (@x[$b1],@x[$b1],25)",
	"&vpor (@x[$b1],$t0,@x[$b1])",

	"&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)",	# reload pair of 'c's
	"&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)",
	"&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")",
	"&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")",

	"&vpaddd (@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpxor (@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb (@x[$d2],@x[$d2],$t1)",
	"&vpaddd (@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxor (@x[$d3],@x[$a3],@x[$d3])",
	"&vpshufb (@x[$d3],@x[$d3],$t1)",

	"&vpaddd ($xc,$xc,@x[$d2])",
	"&vpxor (@x[$b2],$xc,@x[$b2])",
	"&vpslld ($t0,@x[$b2],12)",
	"&vpsrld (@x[$b2],@x[$b2],20)",
	"&vpor (@x[$b2],$t0,@x[$b2])",
	"&vbroadcasti128($t0,'(%r11)')",	# .Lrot24(%rip)
	"&vpaddd ($xc_,$xc_,@x[$d3])",
	"&vpxor (@x[$b3],$xc_,@x[$b3])",
	"&vpslld ($t1,@x[$b3],12)",
	"&vpsrld (@x[$b3],@x[$b3],20)",
	"&vpor (@x[$b3],$t1,@x[$b3])",

	"&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
	"&vpxor (@x[$d2],@x[$a2],@x[$d2])",
	"&vpshufb (@x[$d2],@x[$d2],$t0)",
	"&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
	"&vpxor (@x[$d3],@x[$a3],@x[$d3])",
	"&vpshufb (@x[$d3],@x[$d3],$t0)",

	"&vpaddd ($xc,$xc,@x[$d2])",
	"&vpxor (@x[$b2],$xc,@x[$b2])",
	"&vpslld ($t1,@x[$b2],7)",
	"&vpsrld (@x[$b2],@x[$b2],25)",
	"&vpor (@x[$b2],$t1,@x[$b2])",
	"&vbroadcasti128($t1,'(%r10)')",	# .Lrot16(%rip)
	"&vpaddd ($xc_,$xc_,@x[$d3])",
	"&vpxor (@x[$b3],$xc_,@x[$b3])",
	"&vpslld ($t0,@x[$b3],7)",
	"&vpsrld (@x[$b3],@x[$b3],25)",
	"&vpor (@x[$b3],$t0,@x[$b3])"
	);
}

# Extra frame bytes below %r9: on Win64 0xa8 covers the ten 16-byte save
# slots for %xmm6-%xmm15 (stored at -0xa8..-0x18(%r9) below); 8 otherwise.
my $xframe = $win64 ? 0xa8 : 8;

$code.=<<___;
.type	ChaCha20_8x,\@function,5
.align	32
ChaCha20_8x:
.cfi_startproc
.LChaCha20_8x:
	mov	%rsp,%r9		# frame register
.cfi_def_cfa_register	%r9
	sub	\$0x280+$xframe,%rsp
	and	\$-32,%rsp
___
$code.=<<___	if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.L8x_body:
___
$code.=<<___;
	vzeroupper
	################ stack layout
	# +0x00		SIMD equivalent of @x[8-12]
	# ...
	# +0x80		constant copy of key[0-2] smashed by lanes
	# ...
	# +0x200	SIMD counters (with nonce smashed by lanes)
	# ...
	# +0x280
	vbroadcasti128	.Lsigma(%rip),$xa3	# key[0]
	vbroadcasti128	($key),$xb3		# key[1]
	vbroadcasti128	16($key),$xt3		# key[2]
	vbroadcasti128	($counter),$xd3		# key[3]
	lea	0x100(%rsp),%rcx	# size optimization
	lea	0x200(%rsp),%rax	# size optimization
	lea	.Lrot16(%rip),%r10
	lea	.Lrot24(%rip),%r11
	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
	vpshufd	\$0x55,$xa3,$xa1
	vmovdqa	$xa0,0x80-0x100(%rcx)	# ...
and offload vpshufd \$0xaa,$xa3,$xa2 vmovdqa $xa1,0xa0-0x100(%rcx) vpshufd \$0xff,$xa3,$xa3 vmovdqa $xa2,0xc0-0x100(%rcx) vmovdqa $xa3,0xe0-0x100(%rcx) vpshufd \$0x00,$xb3,$xb0 vpshufd \$0x55,$xb3,$xb1 vmovdqa $xb0,0x100-0x100(%rcx) vpshufd \$0xaa,$xb3,$xb2 vmovdqa $xb1,0x120-0x100(%rcx) vpshufd \$0xff,$xb3,$xb3 vmovdqa $xb2,0x140-0x100(%rcx) vmovdqa $xb3,0x160-0x100(%rcx) vpshufd \$0x00,$xt3,$xt0 # "xc0" vpshufd \$0x55,$xt3,$xt1 # "xc1" vmovdqa $xt0,0x180-0x200(%rax) vpshufd \$0xaa,$xt3,$xt2 # "xc2" vmovdqa $xt1,0x1a0-0x200(%rax) vpshufd \$0xff,$xt3,$xt3 # "xc3" vmovdqa $xt2,0x1c0-0x200(%rax) vmovdqa $xt3,0x1e0-0x200(%rax) vpshufd \$0x00,$xd3,$xd0 vpshufd \$0x55,$xd3,$xd1 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet vpshufd \$0xaa,$xd3,$xd2 vmovdqa $xd1,0x220-0x200(%rax) vpshufd \$0xff,$xd3,$xd3 vmovdqa $xd2,0x240-0x200(%rax) vmovdqa $xd3,0x260-0x200(%rax) jmp .Loop_enter8x .align 32 .Loop_outer8x: vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key vmovdqa 0xa0-0x100(%rcx),$xa1 vmovdqa 0xc0-0x100(%rcx),$xa2 vmovdqa 0xe0-0x100(%rcx),$xa3 vmovdqa 0x100-0x100(%rcx),$xb0 vmovdqa 0x120-0x100(%rcx),$xb1 vmovdqa 0x140-0x100(%rcx),$xb2 vmovdqa 0x160-0x100(%rcx),$xb3 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" vmovdqa 0x200-0x200(%rax),$xd0 vmovdqa 0x220-0x200(%rax),$xd1 vmovdqa 0x240-0x200(%rax),$xd2 vmovdqa 0x260-0x200(%rax),$xd3 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters .Loop_enter8x: vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" vbroadcasti128 (%r10),$xt3 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters mov \$10,%eax jmp .Loop8x .align 32 .Loop8x: ___ foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } $code.=<<___; dec %eax jnz .Loop8x lea 0x200(%rsp),%rax # size optimization vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key 
vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data vpunpckldq $xa3,$xa2,$xt3 vpunpckhdq $xa1,$xa0,$xa0 vpunpckhdq $xa3,$xa2,$xa2 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" vpunpckhqdq $xt3,$xt2,$xt2 # "a1" vpunpcklqdq $xa2,$xa0,$xa3 # "a2" vpunpckhqdq $xa2,$xa0,$xa0 # "a3" ___ ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); $code.=<<___; vpaddd 0x100-0x100(%rcx),$xb0,$xb0 vpaddd 0x120-0x100(%rcx),$xb1,$xb1 vpaddd 0x140-0x100(%rcx),$xb2,$xb2 vpaddd 0x160-0x100(%rcx),$xb3,$xb3 vpunpckldq $xb1,$xb0,$xt2 vpunpckldq $xb3,$xb2,$xt3 vpunpckhdq $xb1,$xb0,$xb0 vpunpckhdq $xb3,$xb2,$xb2 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" vpunpckhqdq $xt3,$xt2,$xt2 # "b1" vpunpcklqdq $xb2,$xb0,$xb3 # "b2" vpunpckhqdq $xb2,$xb0,$xb0 # "b3" ___ ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); $code.=<<___; vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further vperm2i128 \$0x31,$xb0,$xa0,$xb0 vperm2i128 \$0x20,$xb1,$xa1,$xa0 vperm2i128 \$0x31,$xb1,$xa1,$xb1 vperm2i128 \$0x20,$xb2,$xa2,$xa1 vperm2i128 \$0x31,$xb2,$xa2,$xb2 vperm2i128 \$0x20,$xb3,$xa3,$xa2 vperm2i128 \$0x31,$xb3,$xa3,$xb3 ___ ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); $code.=<<___; vmovdqa $xa0,0x00(%rsp) # offload $xaN vmovdqa $xa1,0x20(%rsp) vmovdqa 0x40(%rsp),$xc2 # $xa0 vmovdqa 0x60(%rsp),$xc3 # $xa1 vpaddd 0x180-0x200(%rax),$xc0,$xc0 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 vpunpckldq $xc1,$xc0,$xt2 vpunpckldq $xc3,$xc2,$xt3 vpunpckhdq $xc1,$xc0,$xc0 vpunpckhdq $xc3,$xc2,$xc2 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" vpunpckhqdq $xt3,$xt2,$xt2 # "c1" vpunpcklqdq $xc2,$xc0,$xc3 # "c2" vpunpckhqdq $xc2,$xc0,$xc0 # "c3" ___ ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); $code.=<<___; vpaddd 0x200-0x200(%rax),$xd0,$xd0 vpaddd 0x220-0x200(%rax),$xd1,$xd1 vpaddd 0x240-0x200(%rax),$xd2,$xd2 vpaddd 
0x260-0x200(%rax),$xd3,$xd3 vpunpckldq $xd1,$xd0,$xt2 vpunpckldq $xd3,$xd2,$xt3 vpunpckhdq $xd1,$xd0,$xd0 vpunpckhdq $xd3,$xd2,$xd2 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" vpunpckhqdq $xt3,$xt2,$xt2 # "d1" vpunpcklqdq $xd2,$xd0,$xd3 # "d2" vpunpckhqdq $xd2,$xd0,$xd0 # "d3" ___ ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); $code.=<<___; vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further vperm2i128 \$0x31,$xd0,$xc0,$xd0 vperm2i128 \$0x20,$xd1,$xc1,$xc0 vperm2i128 \$0x31,$xd1,$xc1,$xd1 vperm2i128 \$0x20,$xd2,$xc2,$xc1 vperm2i128 \$0x31,$xd2,$xc2,$xd2 vperm2i128 \$0x20,$xd3,$xc3,$xc2 vperm2i128 \$0x31,$xd3,$xc3,$xd3 ___ ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); ($xa0,$xa1)=($xt2,$xt3); $code.=<<___; vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? vmovdqa 0x20(%rsp),$xa1 cmp \$64*8,$len jb .Ltail8x vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 lea 0x80($inp),$inp # size optimization vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) lea 0x80($out),$out # size optimization vpxor 0x00($inp),$xa1,$xa1 vpxor 0x20($inp),$xb1,$xb1 vpxor 0x40($inp),$xc1,$xc1 vpxor 0x60($inp),$xd1,$xd1 lea 0x80($inp),$inp # size optimization vmovdqu $xa1,0x00($out) vmovdqu $xb1,0x20($out) vmovdqu $xc1,0x40($out) vmovdqu $xd1,0x60($out) lea 0x80($out),$out # size optimization vpxor 0x00($inp),$xa2,$xa2 vpxor 0x20($inp),$xb2,$xb2 vpxor 0x40($inp),$xc2,$xc2 vpxor 0x60($inp),$xd2,$xd2 lea 0x80($inp),$inp # size optimization vmovdqu $xa2,0x00($out) vmovdqu $xb2,0x20($out) vmovdqu $xc2,0x40($out) vmovdqu $xd2,0x60($out) lea 0x80($out),$out # size optimization vpxor 0x00($inp),$xa3,$xa3 vpxor 0x20($inp),$xb3,$xb3 vpxor 0x40($inp),$xc3,$xc3 vpxor 0x60($inp),$xd3,$xd3 lea 0x80($inp),$inp # size optimization vmovdqu $xa3,0x00($out) vmovdqu $xb3,0x20($out) vmovdqu 
$xc3,0x40($out) vmovdqu $xd3,0x60($out) lea 0x80($out),$out # size optimization sub \$64*8,$len jnz .Loop_outer8x jmp .Ldone8x .Ltail8x: cmp \$448,$len jae .L448_or_more8x cmp \$384,$len jae .L384_or_more8x cmp \$320,$len jae .L320_or_more8x cmp \$256,$len jae .L256_or_more8x cmp \$192,$len jae .L192_or_more8x cmp \$128,$len jae .L128_or_more8x cmp \$64,$len jae .L64_or_more8x xor %r10,%r10 vmovdqa $xa0,0x00(%rsp) vmovdqa $xb0,0x20(%rsp) jmp .Loop_tail8x .align 32 .L64_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) je .Ldone8x lea 0x40($inp),$inp # inp+=64*1 xor %r10,%r10 vmovdqa $xc0,0x00(%rsp) lea 0x40($out),$out # out+=64*1 sub \$64,$len # len-=64*1 vmovdqa $xd0,0x20(%rsp) jmp .Loop_tail8x .align 32 .L128_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) je .Ldone8x lea 0x80($inp),$inp # inp+=64*2 xor %r10,%r10 vmovdqa $xa1,0x00(%rsp) lea 0x80($out),$out # out+=64*2 sub \$128,$len # len-=64*2 vmovdqa $xb1,0x20(%rsp) jmp .Loop_tail8x .align 32 .L192_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) je .Ldone8x lea 0xc0($inp),$inp # inp+=64*3 xor %r10,%r10 vmovdqa $xc1,0x00(%rsp) lea 0xc0($out),$out # out+=64*3 sub \$192,$len # len-=64*3 vmovdqa $xd1,0x20(%rsp) jmp .Loop_tail8x .align 32 .L256_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vpxor 0xc0($inp),$xc1,$xc1 vpxor 
0xe0($inp),$xd1,$xd1 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) vmovdqu $xc1,0xc0($out) vmovdqu $xd1,0xe0($out) je .Ldone8x lea 0x100($inp),$inp # inp+=64*4 xor %r10,%r10 vmovdqa $xa2,0x00(%rsp) lea 0x100($out),$out # out+=64*4 sub \$256,$len # len-=64*4 vmovdqa $xb2,0x20(%rsp) jmp .Loop_tail8x .align 32 .L320_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vpxor 0xc0($inp),$xc1,$xc1 vpxor 0xe0($inp),$xd1,$xd1 vpxor 0x100($inp),$xa2,$xa2 vpxor 0x120($inp),$xb2,$xb2 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) vmovdqu $xc1,0xc0($out) vmovdqu $xd1,0xe0($out) vmovdqu $xa2,0x100($out) vmovdqu $xb2,0x120($out) je .Ldone8x lea 0x140($inp),$inp # inp+=64*5 xor %r10,%r10 vmovdqa $xc2,0x00(%rsp) lea 0x140($out),$out # out+=64*5 sub \$320,$len # len-=64*5 vmovdqa $xd2,0x20(%rsp) jmp .Loop_tail8x .align 32 .L384_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vpxor 0xc0($inp),$xc1,$xc1 vpxor 0xe0($inp),$xd1,$xd1 vpxor 0x100($inp),$xa2,$xa2 vpxor 0x120($inp),$xb2,$xb2 vpxor 0x140($inp),$xc2,$xc2 vpxor 0x160($inp),$xd2,$xd2 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) vmovdqu $xc1,0xc0($out) vmovdqu $xd1,0xe0($out) vmovdqu $xa2,0x100($out) vmovdqu $xb2,0x120($out) vmovdqu $xc2,0x140($out) vmovdqu $xd2,0x160($out) je .Ldone8x lea 0x180($inp),$inp # inp+=64*6 xor %r10,%r10 vmovdqa $xa3,0x00(%rsp) lea 0x180($out),$out # out+=64*6 sub \$384,$len # len-=64*6 vmovdqa $xb3,0x20(%rsp) jmp .Loop_tail8x .align 32 
.L448_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vpxor 0xc0($inp),$xc1,$xc1 vpxor 0xe0($inp),$xd1,$xd1 vpxor 0x100($inp),$xa2,$xa2 vpxor 0x120($inp),$xb2,$xb2 vpxor 0x140($inp),$xc2,$xc2 vpxor 0x160($inp),$xd2,$xd2 vpxor 0x180($inp),$xa3,$xa3 vpxor 0x1a0($inp),$xb3,$xb3 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) vmovdqu $xc1,0xc0($out) vmovdqu $xd1,0xe0($out) vmovdqu $xa2,0x100($out) vmovdqu $xb2,0x120($out) vmovdqu $xc2,0x140($out) vmovdqu $xd2,0x160($out) vmovdqu $xa3,0x180($out) vmovdqu $xb3,0x1a0($out) je .Ldone8x lea 0x1c0($inp),$inp # inp+=64*7 xor %r10,%r10 vmovdqa $xc3,0x00(%rsp) lea 0x1c0($out),$out # out+=64*7 sub \$448,$len # len-=64*7 vmovdqa $xd3,0x20(%rsp) .Loop_tail8x: movzb ($inp,%r10),%eax movzb (%rsp,%r10),%ecx lea 1(%r10),%r10 xor %ecx,%eax mov %al,-1($out,%r10) dec $len jnz .Loop_tail8x .Ldone8x: vzeroall ___ $code.=<<___ if ($win64); movaps -0xa8(%r9),%xmm6 movaps -0x98(%r9),%xmm7 movaps -0x88(%r9),%xmm8 movaps -0x78(%r9),%xmm9 movaps -0x68(%r9),%xmm10 movaps -0x58(%r9),%xmm11 movaps -0x48(%r9),%xmm12 movaps -0x38(%r9),%xmm13 movaps -0x28(%r9),%xmm14 movaps -0x18(%r9),%xmm15 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .L8x_epilogue: ret .cfi_endproc .size ChaCha20_8x,.-ChaCha20_8x ___ } ######################################################################## # AVX512 code paths if ($avx>2) { # This one handles shorter inputs... 
# Register aliases for the short-input AVX512 path.  $a..$d are the four
# working state rows (%zmm0-3); $a_..$d_ (%zmm16-19) keep the copies that
# are added back after the rounds; $fourz (%zmm20) is loaded from .Lfourz
# and added to the saved $d row on every outer-loop pass.
my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
# 128-bit temporaries (%xmm4-7) holding one 64-byte chunk while it is
# XORed with the input.
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

# Append one XOR instruction to $code.  Arguments are passed destination
# first and are reversed on output.  The mnemonic is "vpxor" unless some
# operand names a %zmm register, or a %ymm register numbered 16 or higher,
# in which case "vpxord" is emitted instead.
# NOTE(review): the sub is declared with an empty prototype yet receives
# arguments; the calls visible in this section use the &name(...) form,
# which does not apply prototypes.
sub vpxord()		# size optimization
{ my $opcode = "vpxor";	# adhere to vpxor when possible

    foreach (@_) {
	if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) {
	    $opcode = "vpxord";
	    last;
	}
    }

    $code .= "\t$opcode\t".join(',',reverse @_)."\n";
}

# Emit one quarter-round on whole registers $a,$b,$c,$d: four
# add / xor / rotate-left steps with rotate counts 16, 12, 8 and 7.
# vpaddd and vprold (like vpshufd, dec and jnz used further down) are not
# defined in this part of the file -- presumably supplied by an
# AUTOLOAD-style thunk elsewhere; confirm.
sub AVX512ROUND {	# critical path is 14 "SIMD ticks" per round
	&vpaddd	($a,$a,$b);
	&vpxord	($d,$d,$a);
	&vprold	($d,$d,16);

	&vpaddd	($c,$c,$d);
	&vpxord	($b,$b,$c);
	&vprold	($b,$b,12);

	&vpaddd	($a,$a,$b);
	&vpxord	($d,$d,$a);
	&vprold	($d,$d,8);

	&vpaddd	($c,$c,$d);
	&vpxord	($b,$b,$c);
	&vprold	($b,$b,7);
}

# Bytes reserved on the stack in addition to the 64-byte scratch area.
# On Win64 this includes 32 bytes for the %xmm6/%xmm7 save slots addressed
# as -0x28(%r9) and -0x18(%r9) below.
# NOTE(review): the extra 8 presumably keeps %rsp 16-byte aligned for the
# vmovdqa stores in the tail code -- confirm.
my $xframe = $win64 ? 32+8 : 8;

# Entry point: inputs longer than 512 bytes are handed to .LChaCha20_16x,
# everything else is processed here.
$code.=<<___;
.type	ChaCha20_avx512,\@function,5
.align	32
ChaCha20_avx512:
.cfi_startproc
.LChaCha20_avx512:
	mov	%rsp,%r9	# frame pointer
.cfi_def_cfa_register	%r9
	cmp	\$512,$len
	ja	.LChaCha20_16x

	sub	\$64+$xframe,%rsp
___
# Win64 only: spill %xmm6/%xmm7 (aliased by $t2/$t3) below the frame
# pointer; the epilogue reloads them from the same offsets.
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lavx512_body:
___
# Broadcast the four 16-byte input rows to every 128-bit lane, keep
# copies in $a_..$d_, and set $counter to 10 loop iterations.  The outer
# loop reloads the saved rows and adds $fourz to the saved $d row.
$code.=<<___;
	vbroadcasti32x4	.Lsigma(%rip),$a
	vbroadcasti32x4	($key),$b
	vbroadcasti32x4	16($key),$c
	vbroadcasti32x4	($counter),$d

	vmovdqa32	$a,$a_
	vmovdqa32	$b,$b_
	vmovdqa32	$c,$c_
	vpaddd	.Lzeroz(%rip),$d,$d
	vmovdqa32	.Lfourz(%rip),$fourz
	mov	\$10,$counter	# reuse $counter
	vmovdqa32	$d,$d_
	jmp	.Loop_avx512

.align	16
.Loop_outer_avx512:
	vmovdqa32	$a_,$a
	vmovdqa32	$b_,$b
	vmovdqa32	$c_,$c
	vpaddd	$fourz,$d_,$d
	mov	\$10,$counter
	vmovdqa32	$d,$d_
	jmp	.Loop_avx512

.align	32
.Loop_avx512:
___
	# Loop body: one AVX512ROUND, shuffle the dwords of $c/$b/$d within
	# each lane, a second AVX512ROUND, then the inverse shuffles for
	# $b and $d (the $c shuffle is its own inverse).  Ten iterations
	# give twenty rounds.
	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_avx512");

# Add the saved rows back, then consume the result 64 bytes at a time:
# lane 0 directly, lanes 1-3 via vextracti32x4 into $t0..$t3.  When fewer
# than 64 bytes remain, the pending block is stored to the stack and the
# byte-wise tail loop XORs it with the input; $counter, left at zero by
# the round loop, serves as the byte index there.
$code.=<<___;
	vpaddd	$a_,$a,$a
	vpaddd	$b_,$b,$b
	vpaddd	$c_,$c,$c
	vpaddd	$d_,$d,$d

	sub	\$64,$len
	jb	.Ltail64_avx512

	vpxor	0x00($inp),%x#$a,$t0	# xor with input
	vpxor	0x10($inp),%x#$b,$t1
	vpxor	0x20($inp),%x#$c,$t2
	vpxor	0x30($inp),%x#$d,$t3
	lea	0x40($inp),$inp	# inp+=64

	vmovdqu	$t0,0x00($out)	# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out	# out+=64

	jz	.Ldone_avx512

	vextracti32x4	\$1,$a,$t0
	vextracti32x4	\$1,$b,$t1
	vextracti32x4	\$1,$c,$t2
	vextracti32x4	\$1,$d,$t3

	sub	\$64,$len
	jb	.Ltail_avx512

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp	# inp+=64

	vmovdqu	$t0,0x00($out)	# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out	# out+=64

	jz	.Ldone_avx512

	vextracti32x4	\$2,$a,$t0
	vextracti32x4	\$2,$b,$t1
	vextracti32x4	\$2,$c,$t2
	vextracti32x4	\$2,$d,$t3

	sub	\$64,$len
	jb	.Ltail_avx512

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp	# inp+=64

	vmovdqu	$t0,0x00($out)	# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out	# out+=64

	jz	.Ldone_avx512

	vextracti32x4	\$3,$a,$t0
	vextracti32x4	\$3,$b,$t1
	vextracti32x4	\$3,$c,$t2
	vextracti32x4	\$3,$d,$t3

	sub	\$64,$len
	jb	.Ltail_avx512

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp	# inp+=64

	vmovdqu	$t0,0x00($out)	# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out	# out+=64

	jnz	.Loop_outer_avx512

	jmp	.Ldone_avx512

.align	16
.Ltail64_avx512:
	vmovdqa	%x#$a,0x00(%rsp)
	vmovdqa	%x#$b,0x10(%rsp)
	vmovdqa	%x#$c,0x20(%rsp)
	vmovdqa	%x#$d,0x30(%rsp)
	add	\$64,$len
	jmp	.Loop_tail_avx512

.align	16
.Ltail_avx512:
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	add	\$64,$len

.Loop_tail_avx512:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_avx512

	vmovdqu32	$a_,0x00(%rsp)
.Ldone_avx512:
	vzeroall
___
# Win64 only: reload the %xmm6/%xmm7 values that the prologue stored at
# -0x28(%r9) and -0x18(%r9).
$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
# Common epilogue: restore %rsp from the frame pointer and return.
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lavx512_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_avx512,.-ChaCha20_avx512
___

# Switch the register aliases from %zmmN to %ymmN in place (map aliases
# $_ to each listed variable, so s/// edits the variables themselves).
# From here on AVX512ROUND and the template below emit 256-bit code;
# $fourz keeps its name but is loaded from .Ltwoy.
map(s/%z/%y/, $a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz);

# 256-bit variant of the routine above: inputs longer than 128 bytes are
# handed to .LChaCha20_8xvl.  $xframe is reused unchanged.
$code.=<<___;
.type	ChaCha20_avx512vl,\@function,5
.align	32
ChaCha20_avx512vl:
.cfi_startproc
.LChaCha20_avx512vl:
	mov	%rsp,%r9	# frame pointer
.cfi_def_cfa_register	%r9
	cmp	\$128,$len
	ja	.LChaCha20_8xvl

	sub	\$64+$xframe,%rsp
___
# Win64 only: spill %xmm6/%xmm7 (aliased by $t2/$t3).
$code.=<<___	if ($win64);
	movaps	%xmm6,-0x28(%r9)
	movaps	%xmm7,-0x18(%r9)
.Lavx512vl_body:
___
# Same set-up as the 512-bit routine, but .Loop_outer_avx512vl reloads
# only $c and $d; $a and $b are restored from $a_/$b_ just before the
# jnz that branches back to it (see the output section below).
$code.=<<___;
	vbroadcasti128	.Lsigma(%rip),$a
	vbroadcasti128	($key),$b
	vbroadcasti128	16($key),$c
	vbroadcasti128	($counter),$d

	vmovdqa32	$a,$a_
	vmovdqa32	$b,$b_
	vmovdqa32	$c,$c_
	vpaddd	.Lzeroz(%rip),$d,$d
	vmovdqa32	.Ltwoy(%rip),$fourz
	mov	\$10,$counter	# reuse $counter
	vmovdqa32	$d,$d_
	jmp	.Loop_avx512vl

.align	16
.Loop_outer_avx512vl:
	vmovdqa32	$c_,$c
	vpaddd	$fourz,$d_,$d
	mov	\$10,$counter
	vmovdqa32	$d,$d_
	jmp	.Loop_avx512vl

.align	32
.Loop_avx512vl:
___
	# Same two-round loop body as in ChaCha20_avx512, now emitted with
	# the %ymm aliases; ten iterations give twenty rounds.
	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b00111001);
	&vpshufd	($d,$d,0b10010011);

	&AVX512ROUND();
	&vpshufd	($c,$c,0b01001110);
	&vpshufd	($b,$b,0b10010011);
	&vpshufd	($d,$d,0b00111001);

	&dec	($counter);
	&jnz	(".Loop_avx512vl");

# Add the saved rows back and consume two 64-byte blocks per pass: the
# low 128-bit halves directly, the high halves via vextracti128.  The
# tail code mirrors the 512-bit routine; afterwards the 64-byte stack
# scratch area is overwritten with two 32-byte stores of $a_.
$code.=<<___;
	vpaddd	$a_,$a,$a
	vpaddd	$b_,$b,$b
	vpaddd	$c_,$c,$c
	vpaddd	$d_,$d,$d

	sub	\$64,$len
	jb	.Ltail64_avx512vl

	vpxor	0x00($inp),%x#$a,$t0	# xor with input
	vpxor	0x10($inp),%x#$b,$t1
	vpxor	0x20($inp),%x#$c,$t2
	vpxor	0x30($inp),%x#$d,$t3
	lea	0x40($inp),$inp	# inp+=64

	vmovdqu	$t0,0x00($out)	# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out	# out+=64

	jz	.Ldone_avx512vl

	vextracti128	\$1,$a,$t0
	vextracti128	\$1,$b,$t1
	vextracti128	\$1,$c,$t2
	vextracti128	\$1,$d,$t3

	sub	\$64,$len
	jb	.Ltail_avx512vl

	vpxor	0x00($inp),$t0,$t0	# xor with input
	vpxor	0x10($inp),$t1,$t1
	vpxor	0x20($inp),$t2,$t2
	vpxor	0x30($inp),$t3,$t3
	lea	0x40($inp),$inp	# inp+=64

	vmovdqu	$t0,0x00($out)	# write output
	vmovdqu	$t1,0x10($out)
	vmovdqu	$t2,0x20($out)
	vmovdqu	$t3,0x30($out)
	lea	0x40($out),$out	# out+=64

	vmovdqa32	$a_,$a
	vmovdqa32	$b_,$b
	jnz	.Loop_outer_avx512vl

	jmp	.Ldone_avx512vl

.align	16
.Ltail64_avx512vl:
	vmovdqa	%x#$a,0x00(%rsp)
	vmovdqa	%x#$b,0x10(%rsp)
	vmovdqa	%x#$c,0x20(%rsp)
	vmovdqa	%x#$d,0x30(%rsp)
	add	\$64,$len
	jmp	.Loop_tail_avx512vl

.align	16
.Ltail_avx512vl:
	vmovdqa	$t0,0x00(%rsp)
	vmovdqa	$t1,0x10(%rsp)
	vmovdqa	$t2,0x20(%rsp)
	vmovdqa	$t3,0x30(%rsp)
	add	\$64,$len

.Loop_tail_avx512vl:
	movzb	($inp,$counter),%eax
	movzb	(%rsp,$counter),%ecx
	lea	1($counter),$counter
	xor	%ecx,%eax
	mov	%al,-1($out,$counter)
	dec	$len
	jnz	.Loop_tail_avx512vl

	vmovdqu32	$a_,0x00(%rsp)
	vmovdqu32	$a_,0x20(%rsp)

.Ldone_avx512vl:
	vzeroall
___
# Win64 only: reload %xmm6/%xmm7 saved by the prologue.
$code.=<<___	if ($win64);
	movaps	-0x28(%r9),%xmm6
	movaps	-0x18(%r9),%xmm7
___
# Common epilogue: restore %rsp from the frame pointer and return.
$code.=<<___;
	lea	(%r9),%rsp
.cfi_def_cfa_register	%rsp
.Lavx512vl_epilogue:
	ret
.cfi_endproc
.size	ChaCha20_avx512vl,.-ChaCha20_avx512vl
___
}
if ($avx>2) {
# This one handles longer inputs...
# Sixteen working registers, one per 32-bit state word: %zmm0-15 named
# a0..a3, b0..b3, c0..c3, d0..d3.
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
# The same registers as an indexable list; AVX512_lane_ROUND addresses
# them by position.
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
	$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3);
# %zmm16-31; the templates further down store the lane-smashed input
# state here.
my @key=map("%zmm$_",(16..31));
# Temporaries share registers with @key[0..3].
my ($xt0,$xt1,$xt2,$xt3)=@key[0..3];

# Build the instruction sequence for four interleaved quarter-rounds.
#
# Arguments: indices into @xx of the a, b, c and d words of the first
# quarter-round.  The other three are derived by stepping the low two
# bits of each index by one (modulo 4) while keeping the upper bits, so
# (0,4,8,12) yields (1,5,9,13), (2,6,10,14), (3,7,11,15) and (0,5,10,15)
# yields (1,6,11,12), (2,7,8,13), (3,4,9,14).
#
# Returns a list of Perl source strings, each a "&mnemonic (dst,src,src)"
# call with the register names embedded as quoted literals.  The strings
# are meant to be eval'ed one by one by the caller; each step is listed
# for all four quarter-rounds (Q1..Q4) before the next step begins.
sub AVX512_lane_ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
# Wrap every register name in double quotes so that it is a string
# literal once interpolated into the generated source text.
my @x=map("\"$_\"",@xx);

	(
	"&vpaddd (@x[$a0],@x[$a0],@x[$b0])",	# Q1
	"&vpaddd (@x[$a1],@x[$a1],@x[$b1])",	# Q2
	"&vpaddd (@x[$a2],@x[$a2],@x[$b2])",	# Q3
	"&vpaddd (@x[$a3],@x[$a3],@x[$b3])",	# Q4
	"&vpxord (@x[$d0],@x[$d0],@x[$a0])",
	"&vpxord (@x[$d1],@x[$d1],@x[$a1])",
	"&vpxord (@x[$d2],@x[$d2],@x[$a2])",
	"&vpxord (@x[$d3],@x[$d3],@x[$a3])",
	"&vprold (@x[$d0],@x[$d0],16)",
	"&vprold (@x[$d1],@x[$d1],16)",
	"&vprold (@x[$d2],@x[$d2],16)",
	"&vprold (@x[$d3],@x[$d3],16)",

	"&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
	"&vpxord (@x[$b0],@x[$b0],@x[$c0])",
	"&vpxord (@x[$b1],@x[$b1],@x[$c1])",
	"&vpxord (@x[$b2],@x[$b2],@x[$c2])",
	"&vpxord (@x[$b3],@x[$b3],@x[$c3])",
	"&vprold (@x[$b0],@x[$b0],12)",
	"&vprold (@x[$b1],@x[$b1],12)",
	"&vprold (@x[$b2],@x[$b2],12)",
	"&vprold (@x[$b3],@x[$b3],12)",

	"&vpaddd (@x[$a0],@x[$a0],@x[$b0])",
	"&vpaddd (@x[$a1],@x[$a1],@x[$b1])",
	"&vpaddd (@x[$a2],@x[$a2],@x[$b2])",
	"&vpaddd (@x[$a3],@x[$a3],@x[$b3])",
	"&vpxord (@x[$d0],@x[$d0],@x[$a0])",
	"&vpxord (@x[$d1],@x[$d1],@x[$a1])",
	"&vpxord (@x[$d2],@x[$d2],@x[$a2])",
	"&vpxord (@x[$d3],@x[$d3],@x[$a3])",
	"&vprold (@x[$d0],@x[$d0],8)",
	"&vprold (@x[$d1],@x[$d1],8)",
	"&vprold (@x[$d2],@x[$d2],8)",
	"&vprold (@x[$d3],@x[$d3],8)",

	"&vpaddd (@x[$c0],@x[$c0],@x[$d0])",
	"&vpaddd (@x[$c1],@x[$c1],@x[$d1])",
	"&vpaddd (@x[$c2],@x[$c2],@x[$d2])",
	"&vpaddd (@x[$c3],@x[$c3],@x[$d3])",
	"&vpxord (@x[$b0],@x[$b0],@x[$c0])",
	"&vpxord (@x[$b1],@x[$b1],@x[$c1])",
	"&vpxord (@x[$b2],@x[$b2],@x[$c2])",
	"&vpxord (@x[$b3],@x[$b3],@x[$c3])",
	"&vprold (@x[$b0],@x[$b0],7)",
	"&vprold (@x[$b1],@x[$b1],7)",
	"&vprold (@x[$b2],@x[$b2],7)",
	"&vprold (@x[$b3],@x[$b3],7)"
	);
}

# Bytes reserved on the stack in addition to the 64-byte scratch area.
# On Win64 the value 0xa8 covers the ten %xmm6-%xmm15 save slots addressed
# as -0xa8(%r9) .. -0x18(%r9) below.
my $xframe = $win64 ? 0xa8 : 8;

# Entry point for the 16-lane routine: set up %r9 as the frame register,
# reserve the frame and round %rsp down to a 64-byte boundary.
$code.=<<___;
.type	ChaCha20_16x,\@function,5
.align	32
ChaCha20_16x:
.cfi_startproc
.LChaCha20_16x:
	mov	%rsp,%r9	# frame register
.cfi_def_cfa_register	%r9
	sub	\$64+$xframe,%rsp
	and	\$-64,%rsp
___
# Win64 only: spill %xmm6-%xmm15 below the frame register.
$code.=<<___	if ($win64);
	movaps	%xmm6,-0xa8(%r9)
	movaps	%xmm7,-0x98(%r9)
	movaps	%xmm8,-0x88(%r9)
	movaps	%xmm9,-0x78(%r9)
	movaps	%xmm10,-0x68(%r9)
	movaps	%xmm11,-0x58(%r9)
	movaps	%xmm12,-0x48(%r9)
	movaps	%xmm13,-0x38(%r9)
	movaps	%xmm14,-0x28(%r9)
	movaps	%xmm15,-0x18(%r9)
.L16x_body:
___
# Broadcast the four 16-byte input rows, then start spreading each
# 32-bit word across a full register of its own.
$code.=<<___;
	vzeroupper

	lea	.Lsigma(%rip),%r10
	vbroadcasti32x4	(%r10),$xa3	# key[0]
	vbroadcasti32x4	($key),$xb3	# key[1]
	vbroadcasti32x4	16($key),$xc3	# key[2]
	vbroadcasti32x4	($counter),$xd3	# key[3]

	vpshufd	\$0x00,$xa3,$xa0	# smash key by lanes...
vpshufd \$0x55,$xa3,$xa1 vpshufd \$0xaa,$xa3,$xa2 vpshufd \$0xff,$xa3,$xa3 vmovdqa64 $xa0,@key[0] vmovdqa64 $xa1,@key[1] vmovdqa64 $xa2,@key[2] vmovdqa64 $xa3,@key[3] vpshufd \$0x00,$xb3,$xb0 vpshufd \$0x55,$xb3,$xb1 vpshufd \$0xaa,$xb3,$xb2 vpshufd \$0xff,$xb3,$xb3 vmovdqa64 $xb0,@key[4] vmovdqa64 $xb1,@key[5] vmovdqa64 $xb2,@key[6] vmovdqa64 $xb3,@key[7] vpshufd \$0x00,$xc3,$xc0 vpshufd \$0x55,$xc3,$xc1 vpshufd \$0xaa,$xc3,$xc2 vpshufd \$0xff,$xc3,$xc3 vmovdqa64 $xc0,@key[8] vmovdqa64 $xc1,@key[9] vmovdqa64 $xc2,@key[10] vmovdqa64 $xc3,@key[11] vpshufd \$0x00,$xd3,$xd0 vpshufd \$0x55,$xd3,$xd1 vpshufd \$0xaa,$xd3,$xd2 vpshufd \$0xff,$xd3,$xd3 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet vmovdqa64 $xd0,@key[12] vmovdqa64 $xd1,@key[13] vmovdqa64 $xd2,@key[14] vmovdqa64 $xd3,@key[15] mov \$10,%eax jmp .Loop16x .align 32 .Loop_outer16x: vpbroadcastd 0(%r10),$xa0 # reload key vpbroadcastd 4(%r10),$xa1 vpbroadcastd 8(%r10),$xa2 vpbroadcastd 12(%r10),$xa3 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters vmovdqa64 @key[4],$xb0 vmovdqa64 @key[5],$xb1 vmovdqa64 @key[6],$xb2 vmovdqa64 @key[7],$xb3 vmovdqa64 @key[8],$xc0 vmovdqa64 @key[9],$xc1 vmovdqa64 @key[10],$xc2 vmovdqa64 @key[11],$xc3 vmovdqa64 @key[12],$xd0 vmovdqa64 @key[13],$xd1 vmovdqa64 @key[14],$xd2 vmovdqa64 @key[15],$xd3 vmovdqa64 $xa0,@key[0] vmovdqa64 $xa1,@key[1] vmovdqa64 $xa2,@key[2] vmovdqa64 $xa3,@key[3] mov \$10,%eax jmp .Loop16x .align 32 .Loop16x: ___ foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } $code.=<<___; dec %eax jnz .Loop16x vpaddd @key[0],$xa0,$xa0 # accumulate key vpaddd @key[1],$xa1,$xa1 vpaddd @key[2],$xa2,$xa2 vpaddd @key[3],$xa3,$xa3 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data vpunpckldq $xa3,$xa2,$xt3 vpunpckhdq $xa1,$xa0,$xa0 vpunpckhdq $xa3,$xa2,$xa2 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" vpunpckhqdq $xt3,$xt2,$xt2 # "a1" vpunpcklqdq $xa2,$xa0,$xa3 # "a2" vpunpckhqdq $xa2,$xa0,$xa0 # "a3" ___ 
($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); $code.=<<___; vpaddd @key[4],$xb0,$xb0 vpaddd @key[5],$xb1,$xb1 vpaddd @key[6],$xb2,$xb2 vpaddd @key[7],$xb3,$xb3 vpunpckldq $xb1,$xb0,$xt2 vpunpckldq $xb3,$xb2,$xt3 vpunpckhdq $xb1,$xb0,$xb0 vpunpckhdq $xb3,$xb2,$xb2 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" vpunpckhqdq $xt3,$xt2,$xt2 # "b1" vpunpcklqdq $xb2,$xb0,$xb3 # "b2" vpunpckhqdq $xb2,$xb0,$xb0 # "b3" ___ ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); $code.=<<___; vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further vshufi32x4 \$0xee,$xb0,$xa0,$xb0 vshufi32x4 \$0x44,$xb1,$xa1,$xa0 vshufi32x4 \$0xee,$xb1,$xa1,$xb1 vshufi32x4 \$0x44,$xb2,$xa2,$xa1 vshufi32x4 \$0xee,$xb2,$xa2,$xb2 vshufi32x4 \$0x44,$xb3,$xa3,$xa2 vshufi32x4 \$0xee,$xb3,$xa3,$xb3 ___ ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); $code.=<<___; vpaddd @key[8],$xc0,$xc0 vpaddd @key[9],$xc1,$xc1 vpaddd @key[10],$xc2,$xc2 vpaddd @key[11],$xc3,$xc3 vpunpckldq $xc1,$xc0,$xt2 vpunpckldq $xc3,$xc2,$xt3 vpunpckhdq $xc1,$xc0,$xc0 vpunpckhdq $xc3,$xc2,$xc2 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" vpunpckhqdq $xt3,$xt2,$xt2 # "c1" vpunpcklqdq $xc2,$xc0,$xc3 # "c2" vpunpckhqdq $xc2,$xc0,$xc0 # "c3" ___ ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); $code.=<<___; vpaddd @key[12],$xd0,$xd0 vpaddd @key[13],$xd1,$xd1 vpaddd @key[14],$xd2,$xd2 vpaddd @key[15],$xd3,$xd3 vpunpckldq $xd1,$xd0,$xt2 vpunpckldq $xd3,$xd2,$xt3 vpunpckhdq $xd1,$xd0,$xd0 vpunpckhdq $xd3,$xd2,$xd2 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" vpunpckhqdq $xt3,$xt2,$xt2 # "d1" vpunpcklqdq $xd2,$xd0,$xd3 # "d2" vpunpckhqdq $xd2,$xd0,$xd0 # "d3" ___ ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); $code.=<<___; vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further vshufi32x4 \$0xee,$xd0,$xc0,$xd0 vshufi32x4 \$0x44,$xd1,$xc1,$xc0 vshufi32x4 \$0xee,$xd1,$xc1,$xd1 vshufi32x4 \$0x44,$xd2,$xc2,$xc1 vshufi32x4 \$0xee,$xd2,$xc2,$xd2 vshufi32x4 \$0x44,$xd3,$xc3,$xc2 vshufi32x4 \$0xee,$xd3,$xc3,$xd3 ___ 
($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); $code.=<<___; vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 vshufi32x4 \$0x88,$xd0,$xb0,$xc0 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 vshufi32x4 \$0x88,$xc1,$xa1,$xt1 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 vshufi32x4 \$0x88,$xd1,$xb1,$xc1 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 vshufi32x4 \$0x88,$xc2,$xa2,$xt2 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 vshufi32x4 \$0x88,$xd2,$xb2,$xc2 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 vshufi32x4 \$0x88,$xc3,$xa3,$xt3 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 vshufi32x4 \$0x88,$xd3,$xb3,$xc3 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 ___ ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); $code.=<<___; cmp \$64*16,$len jb .Ltail16x vpxord 0x00($inp),$xa0,$xa0 # xor with input vpxord 0x40($inp),$xb0,$xb0 vpxord 0x80($inp),$xc0,$xc0 vpxord 0xc0($inp),$xd0,$xd0 vmovdqu32 $xa0,0x00($out) vmovdqu32 $xb0,0x40($out) vmovdqu32 $xc0,0x80($out) vmovdqu32 $xd0,0xc0($out) vpxord 0x100($inp),$xa1,$xa1 vpxord 0x140($inp),$xb1,$xb1 vpxord 0x180($inp),$xc1,$xc1 vpxord 0x1c0($inp),$xd1,$xd1 vmovdqu32 $xa1,0x100($out) vmovdqu32 $xb1,0x140($out) vmovdqu32 $xc1,0x180($out) vmovdqu32 $xd1,0x1c0($out) vpxord 0x200($inp),$xa2,$xa2 vpxord 0x240($inp),$xb2,$xb2 vpxord 0x280($inp),$xc2,$xc2 vpxord 0x2c0($inp),$xd2,$xd2 vmovdqu32 $xa2,0x200($out) vmovdqu32 $xb2,0x240($out) vmovdqu32 $xc2,0x280($out) vmovdqu32 $xd2,0x2c0($out) vpxord 0x300($inp),$xa3,$xa3 vpxord 0x340($inp),$xb3,$xb3 vpxord 0x380($inp),$xc3,$xc3 vpxord 0x3c0($inp),$xd3,$xd3 lea 0x400($inp),$inp vmovdqu32 $xa3,0x300($out) vmovdqu32 $xb3,0x340($out) vmovdqu32 $xc3,0x380($out) vmovdqu32 $xd3,0x3c0($out) lea 0x400($out),$out sub \$64*16,$len jnz .Loop_outer16x jmp .Ldone16x .align 32 .Ltail16x: xor %r10,%r10 sub $inp,$out cmp \$64*1,$len jb 
.Less_than_64_16x vpxord ($inp),$xa0,$xa0 # xor with input vmovdqu32 $xa0,($out,$inp) je .Ldone16x vmovdqa32 $xb0,$xa0 lea 64($inp),$inp cmp \$64*2,$len jb .Less_than_64_16x vpxord ($inp),$xb0,$xb0 vmovdqu32 $xb0,($out,$inp) je .Ldone16x vmovdqa32 $xc0,$xa0 lea 64($inp),$inp cmp \$64*3,$len jb .Less_than_64_16x vpxord ($inp),$xc0,$xc0 vmovdqu32 $xc0,($out,$inp) je .Ldone16x vmovdqa32 $xd0,$xa0 lea 64($inp),$inp cmp \$64*4,$len jb .Less_than_64_16x vpxord ($inp),$xd0,$xd0 vmovdqu32 $xd0,($out,$inp) je .Ldone16x vmovdqa32 $xa1,$xa0 lea 64($inp),$inp cmp \$64*5,$len jb .Less_than_64_16x vpxord ($inp),$xa1,$xa1 vmovdqu32 $xa1,($out,$inp) je .Ldone16x vmovdqa32 $xb1,$xa0 lea 64($inp),$inp cmp \$64*6,$len jb .Less_than_64_16x vpxord ($inp),$xb1,$xb1 vmovdqu32 $xb1,($out,$inp) je .Ldone16x vmovdqa32 $xc1,$xa0 lea 64($inp),$inp cmp \$64*7,$len jb .Less_than_64_16x vpxord ($inp),$xc1,$xc1 vmovdqu32 $xc1,($out,$inp) je .Ldone16x vmovdqa32 $xd1,$xa0 lea 64($inp),$inp cmp \$64*8,$len jb .Less_than_64_16x vpxord ($inp),$xd1,$xd1 vmovdqu32 $xd1,($out,$inp) je .Ldone16x vmovdqa32 $xa2,$xa0 lea 64($inp),$inp cmp \$64*9,$len jb .Less_than_64_16x vpxord ($inp),$xa2,$xa2 vmovdqu32 $xa2,($out,$inp) je .Ldone16x vmovdqa32 $xb2,$xa0 lea 64($inp),$inp cmp \$64*10,$len jb .Less_than_64_16x vpxord ($inp),$xb2,$xb2 vmovdqu32 $xb2,($out,$inp) je .Ldone16x vmovdqa32 $xc2,$xa0 lea 64($inp),$inp cmp \$64*11,$len jb .Less_than_64_16x vpxord ($inp),$xc2,$xc2 vmovdqu32 $xc2,($out,$inp) je .Ldone16x vmovdqa32 $xd2,$xa0 lea 64($inp),$inp cmp \$64*12,$len jb .Less_than_64_16x vpxord ($inp),$xd2,$xd2 vmovdqu32 $xd2,($out,$inp) je .Ldone16x vmovdqa32 $xa3,$xa0 lea 64($inp),$inp cmp \$64*13,$len jb .Less_than_64_16x vpxord ($inp),$xa3,$xa3 vmovdqu32 $xa3,($out,$inp) je .Ldone16x vmovdqa32 $xb3,$xa0 lea 64($inp),$inp cmp \$64*14,$len jb .Less_than_64_16x vpxord ($inp),$xb3,$xb3 vmovdqu32 $xb3,($out,$inp) je .Ldone16x vmovdqa32 $xc3,$xa0 lea 64($inp),$inp cmp \$64*15,$len jb .Less_than_64_16x vpxord 
($inp),$xc3,$xc3 vmovdqu32 $xc3,($out,$inp) je .Ldone16x vmovdqa32 $xd3,$xa0 lea 64($inp),$inp .Less_than_64_16x: vmovdqa32 $xa0,0x00(%rsp) lea ($out,$inp),$out and \$63,$len .Loop_tail16x: movzb ($inp,%r10),%eax movzb (%rsp,%r10),%ecx lea 1(%r10),%r10 xor %ecx,%eax mov %al,-1($out,%r10) dec $len jnz .Loop_tail16x vpxord $xa0,$xa0,$xa0 vmovdqa32 $xa0,0(%rsp) .Ldone16x: vzeroall ___ $code.=<<___ if ($win64); movaps -0xa8(%r9),%xmm6 movaps -0x98(%r9),%xmm7 movaps -0x88(%r9),%xmm8 movaps -0x78(%r9),%xmm9 movaps -0x68(%r9),%xmm10 movaps -0x58(%r9),%xmm11 movaps -0x48(%r9),%xmm12 movaps -0x38(%r9),%xmm13 movaps -0x28(%r9),%xmm14 movaps -0x18(%r9),%xmm15 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .L16x_epilogue: ret .cfi_endproc .size ChaCha20_16x,.-ChaCha20_16x ___ # switch to %ymm domain ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15)); @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); @key=map("%ymm$_",(16..31)); ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; $code.=<<___; .type ChaCha20_8xvl,\@function,5 .align 32 ChaCha20_8xvl: .cfi_startproc .LChaCha20_8xvl: mov %rsp,%r9 # frame register .cfi_def_cfa_register %r9 sub \$64+$xframe,%rsp and \$-64,%rsp ___ $code.=<<___ if ($win64); movaps %xmm6,-0xa8(%r9) movaps %xmm7,-0x98(%r9) movaps %xmm8,-0x88(%r9) movaps %xmm9,-0x78(%r9) movaps %xmm10,-0x68(%r9) movaps %xmm11,-0x58(%r9) movaps %xmm12,-0x48(%r9) movaps %xmm13,-0x38(%r9) movaps %xmm14,-0x28(%r9) movaps %xmm15,-0x18(%r9) .L8xvl_body: ___ $code.=<<___; vzeroupper lea .Lsigma(%rip),%r10 vbroadcasti128 (%r10),$xa3 # key[0] vbroadcasti128 ($key),$xb3 # key[1] vbroadcasti128 16($key),$xc3 # key[2] vbroadcasti128 ($counter),$xd3 # key[3] vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 
vpshufd \$0x55,$xa3,$xa1 vpshufd \$0xaa,$xa3,$xa2 vpshufd \$0xff,$xa3,$xa3 vmovdqa64 $xa0,@key[0] vmovdqa64 $xa1,@key[1] vmovdqa64 $xa2,@key[2] vmovdqa64 $xa3,@key[3] vpshufd \$0x00,$xb3,$xb0 vpshufd \$0x55,$xb3,$xb1 vpshufd \$0xaa,$xb3,$xb2 vpshufd \$0xff,$xb3,$xb3 vmovdqa64 $xb0,@key[4] vmovdqa64 $xb1,@key[5] vmovdqa64 $xb2,@key[6] vmovdqa64 $xb3,@key[7] vpshufd \$0x00,$xc3,$xc0 vpshufd \$0x55,$xc3,$xc1 vpshufd \$0xaa,$xc3,$xc2 vpshufd \$0xff,$xc3,$xc3 vmovdqa64 $xc0,@key[8] vmovdqa64 $xc1,@key[9] vmovdqa64 $xc2,@key[10] vmovdqa64 $xc3,@key[11] vpshufd \$0x00,$xd3,$xd0 vpshufd \$0x55,$xd3,$xd1 vpshufd \$0xaa,$xd3,$xd2 vpshufd \$0xff,$xd3,$xd3 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet vmovdqa64 $xd0,@key[12] vmovdqa64 $xd1,@key[13] vmovdqa64 $xd2,@key[14] vmovdqa64 $xd3,@key[15] mov \$10,%eax jmp .Loop8xvl .align 32 .Loop_outer8xvl: #vpbroadcastd 0(%r10),$xa0 # reload key #vpbroadcastd 4(%r10),$xa1 vpbroadcastd 8(%r10),$xa2 vpbroadcastd 12(%r10),$xa3 vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters vmovdqa64 @key[4],$xb0 vmovdqa64 @key[5],$xb1 vmovdqa64 @key[6],$xb2 vmovdqa64 @key[7],$xb3 vmovdqa64 @key[8],$xc0 vmovdqa64 @key[9],$xc1 vmovdqa64 @key[10],$xc2 vmovdqa64 @key[11],$xc3 vmovdqa64 @key[12],$xd0 vmovdqa64 @key[13],$xd1 vmovdqa64 @key[14],$xd2 vmovdqa64 @key[15],$xd3 vmovdqa64 $xa0,@key[0] vmovdqa64 $xa1,@key[1] vmovdqa64 $xa2,@key[2] vmovdqa64 $xa3,@key[3] mov \$10,%eax jmp .Loop8xvl .align 32 .Loop8xvl: ___ foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } $code.=<<___; dec %eax jnz .Loop8xvl vpaddd @key[0],$xa0,$xa0 # accumulate key vpaddd @key[1],$xa1,$xa1 vpaddd @key[2],$xa2,$xa2 vpaddd @key[3],$xa3,$xa3 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data vpunpckldq $xa3,$xa2,$xt3 vpunpckhdq $xa1,$xa0,$xa0 vpunpckhdq $xa3,$xa2,$xa2 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" vpunpckhqdq $xt3,$xt2,$xt2 # "a1" vpunpcklqdq $xa2,$xa0,$xa3 # "a2" vpunpckhqdq $xa2,$xa0,$xa0 # "a3" ___ 
($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); $code.=<<___; vpaddd @key[4],$xb0,$xb0 vpaddd @key[5],$xb1,$xb1 vpaddd @key[6],$xb2,$xb2 vpaddd @key[7],$xb3,$xb3 vpunpckldq $xb1,$xb0,$xt2 vpunpckldq $xb3,$xb2,$xt3 vpunpckhdq $xb1,$xb0,$xb0 vpunpckhdq $xb3,$xb2,$xb2 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" vpunpckhqdq $xt3,$xt2,$xt2 # "b1" vpunpcklqdq $xb2,$xb0,$xb3 # "b2" vpunpckhqdq $xb2,$xb0,$xb0 # "b3" ___ ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); $code.=<<___; vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further vshufi32x4 \$3,$xb0,$xa0,$xb0 vshufi32x4 \$0,$xb1,$xa1,$xa0 vshufi32x4 \$3,$xb1,$xa1,$xb1 vshufi32x4 \$0,$xb2,$xa2,$xa1 vshufi32x4 \$3,$xb2,$xa2,$xb2 vshufi32x4 \$0,$xb3,$xa3,$xa2 vshufi32x4 \$3,$xb3,$xa3,$xb3 ___ ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); $code.=<<___; vpaddd @key[8],$xc0,$xc0 vpaddd @key[9],$xc1,$xc1 vpaddd @key[10],$xc2,$xc2 vpaddd @key[11],$xc3,$xc3 vpunpckldq $xc1,$xc0,$xt2 vpunpckldq $xc3,$xc2,$xt3 vpunpckhdq $xc1,$xc0,$xc0 vpunpckhdq $xc3,$xc2,$xc2 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" vpunpckhqdq $xt3,$xt2,$xt2 # "c1" vpunpcklqdq $xc2,$xc0,$xc3 # "c2" vpunpckhqdq $xc2,$xc0,$xc0 # "c3" ___ ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); $code.=<<___; vpaddd @key[12],$xd0,$xd0 vpaddd @key[13],$xd1,$xd1 vpaddd @key[14],$xd2,$xd2 vpaddd @key[15],$xd3,$xd3 vpunpckldq $xd1,$xd0,$xt2 vpunpckldq $xd3,$xd2,$xt3 vpunpckhdq $xd1,$xd0,$xd0 vpunpckhdq $xd3,$xd2,$xd2 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" vpunpckhqdq $xt3,$xt2,$xt2 # "d1" vpunpcklqdq $xd2,$xd0,$xd3 # "d2" vpunpckhqdq $xd2,$xd0,$xd0 # "d3" ___ ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); $code.=<<___; vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further vperm2i128 \$0x31,$xd0,$xc0,$xd0 vperm2i128 \$0x20,$xd1,$xc1,$xc0 vperm2i128 \$0x31,$xd1,$xc1,$xd1 vperm2i128 \$0x20,$xd2,$xc2,$xc1 vperm2i128 \$0x31,$xd2,$xc2,$xd2 vperm2i128 \$0x20,$xd3,$xc3,$xc2 vperm2i128 \$0x31,$xd3,$xc3,$xd3 ___ ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); 
($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); $code.=<<___; cmp \$64*8,$len jb .Ltail8xvl mov \$0x80,%eax # size optimization vpxord 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 lea ($inp,%rax),$inp # size optimization vmovdqu32 $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) lea ($out,%rax),$out # size optimization vpxor 0x00($inp),$xa1,$xa1 vpxor 0x20($inp),$xb1,$xb1 vpxor 0x40($inp),$xc1,$xc1 vpxor 0x60($inp),$xd1,$xd1 lea ($inp,%rax),$inp # size optimization vmovdqu $xa1,0x00($out) vmovdqu $xb1,0x20($out) vmovdqu $xc1,0x40($out) vmovdqu $xd1,0x60($out) lea ($out,%rax),$out # size optimization vpxord 0x00($inp),$xa2,$xa2 vpxor 0x20($inp),$xb2,$xb2 vpxor 0x40($inp),$xc2,$xc2 vpxor 0x60($inp),$xd2,$xd2 lea ($inp,%rax),$inp # size optimization vmovdqu32 $xa2,0x00($out) vmovdqu $xb2,0x20($out) vmovdqu $xc2,0x40($out) vmovdqu $xd2,0x60($out) lea ($out,%rax),$out # size optimization vpxor 0x00($inp),$xa3,$xa3 vpxor 0x20($inp),$xb3,$xb3 vpxor 0x40($inp),$xc3,$xc3 vpxor 0x60($inp),$xd3,$xd3 lea ($inp,%rax),$inp # size optimization vmovdqu $xa3,0x00($out) vmovdqu $xb3,0x20($out) vmovdqu $xc3,0x40($out) vmovdqu $xd3,0x60($out) lea ($out,%rax),$out # size optimization vpbroadcastd 0(%r10),%ymm0 # reload key vpbroadcastd 4(%r10),%ymm1 sub \$64*8,$len jnz .Loop_outer8xvl jmp .Ldone8xvl .align 32 .Ltail8xvl: vmovdqa64 $xa0,%ymm8 # size optimization ___ $xa0 = "%ymm8"; $code.=<<___; xor %r10,%r10 sub $inp,$out cmp \$64*1,$len jb .Less_than_64_8xvl vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vmovdqu $xa0,0x00($out,$inp) vmovdqu $xb0,0x20($out,$inp) je .Ldone8xvl vmovdqa $xc0,$xa0 vmovdqa $xd0,$xb0 lea 64($inp),$inp cmp \$64*2,$len jb .Less_than_64_8xvl vpxor 0x00($inp),$xc0,$xc0 vpxor 0x20($inp),$xd0,$xd0 vmovdqu $xc0,0x00($out,$inp) vmovdqu $xd0,0x20($out,$inp) je .Ldone8xvl vmovdqa $xa1,$xa0 
vmovdqa $xb1,$xb0 lea 64($inp),$inp cmp \$64*3,$len jb .Less_than_64_8xvl vpxor 0x00($inp),$xa1,$xa1 vpxor 0x20($inp),$xb1,$xb1 vmovdqu $xa1,0x00($out,$inp) vmovdqu $xb1,0x20($out,$inp) je .Ldone8xvl vmovdqa $xc1,$xa0 vmovdqa $xd1,$xb0 lea 64($inp),$inp cmp \$64*4,$len jb .Less_than_64_8xvl vpxor 0x00($inp),$xc1,$xc1 vpxor 0x20($inp),$xd1,$xd1 vmovdqu $xc1,0x00($out,$inp) vmovdqu $xd1,0x20($out,$inp) je .Ldone8xvl vmovdqa32 $xa2,$xa0 vmovdqa $xb2,$xb0 lea 64($inp),$inp cmp \$64*5,$len jb .Less_than_64_8xvl vpxord 0x00($inp),$xa2,$xa2 vpxor 0x20($inp),$xb2,$xb2 vmovdqu32 $xa2,0x00($out,$inp) vmovdqu $xb2,0x20($out,$inp) je .Ldone8xvl vmovdqa $xc2,$xa0 vmovdqa $xd2,$xb0 lea 64($inp),$inp cmp \$64*6,$len jb .Less_than_64_8xvl vpxor 0x00($inp),$xc2,$xc2 vpxor 0x20($inp),$xd2,$xd2 vmovdqu $xc2,0x00($out,$inp) vmovdqu $xd2,0x20($out,$inp) je .Ldone8xvl vmovdqa $xa3,$xa0 vmovdqa $xb3,$xb0 lea 64($inp),$inp cmp \$64*7,$len jb .Less_than_64_8xvl vpxor 0x00($inp),$xa3,$xa3 vpxor 0x20($inp),$xb3,$xb3 vmovdqu $xa3,0x00($out,$inp) vmovdqu $xb3,0x20($out,$inp) je .Ldone8xvl vmovdqa $xc3,$xa0 vmovdqa $xd3,$xb0 lea 64($inp),$inp .Less_than_64_8xvl: vmovdqa $xa0,0x00(%rsp) vmovdqa $xb0,0x20(%rsp) lea ($out,$inp),$out and \$63,$len .Loop_tail8xvl: movzb ($inp,%r10),%eax movzb (%rsp,%r10),%ecx lea 1(%r10),%r10 xor %ecx,%eax mov %al,-1($out,%r10) dec $len jnz .Loop_tail8xvl vpxor $xa0,$xa0,$xa0 vmovdqa $xa0,0x00(%rsp) vmovdqa $xa0,0x20(%rsp) .Ldone8xvl: vzeroall ___ $code.=<<___ if ($win64); movaps -0xa8(%r9),%xmm6 movaps -0x98(%r9),%xmm7 movaps -0x88(%r9),%xmm8 movaps -0x78(%r9),%xmm9 movaps -0x68(%r9),%xmm10 movaps -0x58(%r9),%xmm11 movaps -0x48(%r9),%xmm12 movaps -0x38(%r9),%xmm13 movaps -0x28(%r9),%xmm14 movaps -0x18(%r9),%xmm15 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .L8xvl_epilogue: ret .cfi_endproc .size ChaCha20_8xvl,.-ChaCha20_8xvl ___ } # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if 
($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData lea .Lctr32_body(%rip),%r10 cmp %r10,%rbx # context->Rip<.Lprologue jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp lea .Lno_data(%rip),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lcommon_seh_tail lea 64+24+48(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size
se_handler,.-se_handler .type simd_handler,\@abi-omnipotent .align 16 simd_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->Rip < prologue label jb .Lcommon_seh_tail mov 192($context),%rax # pull context->R9 mov 4(%r11),%r10d # HandlerData[1] mov 8(%r11),%ecx # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail neg %rcx lea -8(%rax,%rcx),%rsi lea 512($context),%rdi # &context.Xmm6 neg %ecx shr \$3,%ecx .long 0xa548f3fc # cld; rep movsq jmp .Lcommon_seh_tail .size simd_handler,.-simd_handler .section .pdata .align 4 .rva .LSEH_begin_ChaCha20_ctr32 .rva .LSEH_end_ChaCha20_ctr32 .rva .LSEH_info_ChaCha20_ctr32 .rva .LSEH_begin_ChaCha20_ssse3 .rva .LSEH_end_ChaCha20_ssse3 .rva .LSEH_info_ChaCha20_ssse3 .rva .LSEH_begin_ChaCha20_128 .rva .LSEH_end_ChaCha20_128 .rva .LSEH_info_ChaCha20_128 .rva .LSEH_begin_ChaCha20_4x .rva .LSEH_end_ChaCha20_4x .rva .LSEH_info_ChaCha20_4x ___ $code.=<<___ if ($avx); .rva .LSEH_begin_ChaCha20_4xop .rva .LSEH_end_ChaCha20_4xop .rva .LSEH_info_ChaCha20_4xop ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_ChaCha20_8x .rva .LSEH_end_ChaCha20_8x .rva .LSEH_info_ChaCha20_8x ___ $code.=<<___ if ($avx>2); .rva .LSEH_begin_ChaCha20_avx512 .rva .LSEH_end_ChaCha20_avx512 .rva .LSEH_info_ChaCha20_avx512 .rva .LSEH_begin_ChaCha20_avx512vl .rva .LSEH_end_ChaCha20_avx512vl .rva .LSEH_info_ChaCha20_avx512vl .rva .LSEH_begin_ChaCha20_16x .rva .LSEH_end_ChaCha20_16x .rva .LSEH_info_ChaCha20_16x .rva .LSEH_begin_ChaCha20_8xvl .rva .LSEH_end_ChaCha20_8xvl .rva .LSEH_info_ChaCha20_8xvl ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_ChaCha20_ctr32: .byte 9,0,0,0 .rva se_handler .LSEH_info_ChaCha20_ssse3: .byte 9,0,0,0 .rva simd_handler .rva
.Lssse3_body,.Lssse3_epilogue .long 0x20,0 .LSEH_info_ChaCha20_128: .byte 9,0,0,0 .rva simd_handler .rva .L128_body,.L128_epilogue .long 0x60,0 .LSEH_info_ChaCha20_4x: .byte 9,0,0,0 .rva simd_handler .rva .L4x_body,.L4x_epilogue .long 0xa0,0 ___ $code.=<<___ if ($avx); .LSEH_info_ChaCha20_4xop: .byte 9,0,0,0 .rva simd_handler .rva .L4xop_body,.L4xop_epilogue # HandlerData[] .long 0xa0,0 ___ $code.=<<___ if ($avx>1); .LSEH_info_ChaCha20_8x: .byte 9,0,0,0 .rva simd_handler .rva .L8x_body,.L8x_epilogue # HandlerData[] .long 0xa0,0 ___ $code.=<<___ if ($avx>2); .LSEH_info_ChaCha20_avx512: .byte 9,0,0,0 .rva simd_handler .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] .long 0x20,0 .LSEH_info_ChaCha20_avx512vl: .byte 9,0,0,0 .rva simd_handler .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[] .long 0x20,0 .LSEH_info_ChaCha20_16x: .byte 9,0,0,0 .rva simd_handler .rva .L16x_body,.L16x_epilogue # HandlerData[] .long 0xa0,0 .LSEH_info_ChaCha20_8xvl: .byte 9,0,0,0 .rva simd_handler .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[] .long 0xa0,0 ___ } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; s/%x#%[yz]/%x/g; # "down-shift" print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/ec/asm/ecp_nistz256-avx2.pl =================================================================== --- head/crypto/openssl/crypto/ec/asm/ecp_nistz256-avx2.pl (revision 364821) +++ head/crypto/openssl/crypto/ec/asm/ecp_nistz256-avx2.pl (revision 364822) @@ -1,2080 +1,2080 @@ #! /usr/bin/env perl # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. # Copyright (c) 2014, Intel Corporation. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. 
You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) # (1) Intel Corporation, Israel Development Center, Haifa, Israel # (2) University of Haifa, Israel # # Reference: # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with # 256 Bit Primes" $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; # Pipe the generated code through x86_64-xlate.pl; the translator and output paths are quoted so that directories containing spaces survive the shell. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); $addx = ($1>=2.23); } if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); $addx = ($1>=2.10); } if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $avx = ($ver>=3.0) + ($ver>=3.01); $addx = ($ver>=3.03); } if ($avx>=2) {{ $digit_size = "\$29"; $n_digits = "\$9"; $code.=<<___; .text .align 64 .LAVX2_AND_MASK: .LAVX2_POLY: .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff .quad 0x00000000, 0x00000000,
0x00000000, 0x00000000 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x00040000, 0x00040000, 0x00040000, 0x00040000 .quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000 .quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff .LAVX2_POLY_x2: .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC .quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE .quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE .quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE .quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC .LAVX2_POLY_x8: .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8 .quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC .quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC .quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC .quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8 .LONE: .quad 0x00000020, 0x00000020, 0x00000020, 0x00000020 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff .quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 # RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL # Montgomery form (*2^256) to our format (*2^261) .LTO_MONT_AVX2: .quad 0x00000400, 0x00000400, 0x00000400, 0x00000400 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x00000000, 0x00000000, 0x00000000, 
0x00000000 .quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x00000003, 0x00000003, 0x00000003, 0x00000003 .LFROM_MONT_AVX2: .quad 0x00000001, 0x00000001, 0x00000001, 0x00000001 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff .quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .LIntOne: .long 1,1,1,1,1,1,1,1 ___ { # This function receives a pointer to an array of four affine points # (X, Y, <1>) and rearranges the data for AVX2 execution, while # converting it to 2^29 radix redundant form my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3, $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15)); $code.=<<___; .globl ecp_nistz256_avx2_transpose_convert .type ecp_nistz256_avx2_transpose_convert,\@function,2 .align 64 ecp_nistz256_avx2_transpose_convert: vzeroupper ___ $code.=<<___ if ($win64); lea -8-16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; # Load the data vmovdqa 32*0(%rsi), $X0 lea 112(%rsi), %rax # size optimization vmovdqa 32*1(%rsi), $Y0 lea .LAVX2_AND_MASK(%rip), %rdx vmovdqa 32*2(%rsi), $X1 vmovdqa 32*3(%rsi), $Y1 vmovdqa 32*4-112(%rax), $X2 vmovdqa 32*5-112(%rax), $Y2 vmovdqa 32*6-112(%rax), $X3 vmovdqa 32*7-112(%rax), $Y3 # Transpose X and Y 
independently vpunpcklqdq $X1, $X0, $T0 # T0 = [B2 A2 B0 A0] vpunpcklqdq $X3, $X2, $T1 # T1 = [D2 C2 D0 C0] vpunpckhqdq $X1, $X0, $T2 # T2 = [B3 A3 B1 A1] vpunpckhqdq $X3, $X2, $T3 # T3 = [D3 C3 D1 C1] vpunpcklqdq $Y1, $Y0, $T4 vpunpcklqdq $Y3, $Y2, $T5 vpunpckhqdq $Y1, $Y0, $T6 vpunpckhqdq $Y3, $Y2, $T7 vperm2i128 \$0x20, $T1, $T0, $X0 # X0 = [D0 C0 B0 A0] vperm2i128 \$0x20, $T3, $T2, $X1 # X1 = [D1 C1 B1 A1] vperm2i128 \$0x31, $T1, $T0, $X2 # X2 = [D2 C2 B2 A2] vperm2i128 \$0x31, $T3, $T2, $X3 # X3 = [D3 C3 B3 A3] vperm2i128 \$0x20, $T5, $T4, $Y0 vperm2i128 \$0x20, $T7, $T6, $Y1 vperm2i128 \$0x31, $T5, $T4, $Y2 vperm2i128 \$0x31, $T7, $T6, $Y3 vmovdqa (%rdx), $T7 vpand (%rdx), $X0, $T0 # out[0] = in[0] & mask; vpsrlq \$29, $X0, $X0 vpand $T7, $X0, $T1 # out[1] = (in[0] >> shift) & mask; vpsrlq \$29, $X0, $X0 vpsllq \$6, $X1, $T2 vpxor $X0, $T2, $T2 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask; vpsrlq \$23, $X1, $X1 vpand $T7, $X1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask; vpsrlq \$29, $X1, $X1 vpsllq \$12, $X2, $T4 vpxor $X1, $T4, $T4 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask; vpsrlq \$17, $X2, $X2 vpand $T7, $X2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask; vpsrlq \$29, $X2, $X2 vpsllq \$18, $X3, $T6 vpxor $X2, $T6, $T6 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask; vpsrlq \$11, $X3, $X3 vmovdqa $T0, 32*0(%rdi) lea 112(%rdi), %rax # size optimization vpand $T7, $X3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask; vpsrlq \$29, $X3, $X3 # out[8] = (in[3] >> ((shift*8)%64)) & mask; vmovdqa $T1, 32*1(%rdi) vmovdqa $T2, 32*2(%rdi) vmovdqa $T3, 32*3(%rdi) vmovdqa $T4, 32*4-112(%rax) vmovdqa $T5, 32*5-112(%rax) vmovdqa $T6, 32*6-112(%rax) vmovdqa $T0, 32*7-112(%rax) vmovdqa $X3, 32*8-112(%rax) lea 448(%rdi), %rax # size optimization vpand $T7, $Y0, $T0 # out[0] = in[0] & mask; vpsrlq \$29, $Y0, $Y0 vpand $T7, $Y0, $T1 
# out[1] = (in[0] >> shift) & mask; vpsrlq \$29, $Y0, $Y0 vpsllq \$6, $Y1, $T2 vpxor $Y0, $T2, $T2 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask; vpsrlq \$23, $Y1, $Y1 vpand $T7, $Y1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask; vpsrlq \$29, $Y1, $Y1 vpsllq \$12, $Y2, $T4 vpxor $Y1, $T4, $T4 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask; vpsrlq \$17, $Y2, $Y2 vpand $T7, $Y2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask; vpsrlq \$29, $Y2, $Y2 vpsllq \$18, $Y3, $T6 vpxor $Y2, $T6, $T6 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask; vpsrlq \$11, $Y3, $Y3 vmovdqa $T0, 32*9-448(%rax) vpand $T7, $Y3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask; vpsrlq \$29, $Y3, $Y3 # out[8] = (in[3] >> ((shift*8)%64)) & mask; vmovdqa $T1, 32*10-448(%rax) vmovdqa $T2, 32*11-448(%rax) vmovdqa $T3, 32*12-448(%rax) vmovdqa $T4, 32*13-448(%rax) vmovdqa $T5, 32*14-448(%rax) vmovdqa $T6, 32*15-448(%rax) vmovdqa $T0, 32*16-448(%rax) vmovdqa $Y3, 32*17-448(%rax) vzeroupper ___ $code.=<<___ if ($win64); movaps 16*0(%rsp), %xmm6 movaps 16*1(%rsp), %xmm7 movaps 16*2(%rsp), %xmm8 movaps 16*3(%rsp), %xmm9 movaps 16*4(%rsp), %xmm10 movaps 16*5(%rsp), %xmm11 movaps 16*6(%rsp), %xmm12 movaps 16*7(%rsp), %xmm13 movaps 16*8(%rsp), %xmm14 movaps 16*9(%rsp), %xmm15 lea 8+16*10(%rsp), %rsp ___ $code.=<<___; ret .size ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert ___ } { ################################################################################ # This function receives a pointer to an array of four AVX2 formatted points # (X, Y, Z) convert the data to normal representation, and rearranges the data my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8)); my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15)); $code.=<<___; .globl ecp_nistz256_avx2_convert_transpose_back .type 
ecp_nistz256_avx2_convert_transpose_back,\@function,2 .align 32 ecp_nistz256_avx2_convert_transpose_back: vzeroupper ___ $code.=<<___ if ($win64); lea -8-16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; mov \$3, %ecx .Lconv_loop: vmovdqa 32*0(%rsi), $D0 lea 160(%rsi), %rax # size optimization vmovdqa 32*1(%rsi), $D1 vmovdqa 32*2(%rsi), $D2 vmovdqa 32*3(%rsi), $D3 vmovdqa 32*4-160(%rax), $D4 vmovdqa 32*5-160(%rax), $D5 vmovdqa 32*6-160(%rax), $D6 vmovdqa 32*7-160(%rax), $D7 vmovdqa 32*8-160(%rax), $D8 vpsllq \$29, $D1, $D1 vpsllq \$58, $D2, $T0 vpaddq $D1, $D0, $D0 vpaddq $T0, $D0, $D0 # out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2); vpsrlq \$6, $D2, $D2 vpsllq \$23, $D3, $D3 vpsllq \$52, $D4, $T1 vpaddq $D2, $D3, $D3 vpaddq $D3, $T1, $D1 # out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64); vpsrlq \$12, $D4, $D4 vpsllq \$17, $D5, $D5 vpsllq \$46, $D6, $T2 vpaddq $D4, $D5, $D5 vpaddq $D5, $T2, $D2 # out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64); vpsrlq \$18, $D6, $D6 vpsllq \$11, $D7, $D7 vpsllq \$40, $D8, $T3 vpaddq $D6, $D7, $D7 vpaddq $D7, $T3, $D3 # out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64); vpunpcklqdq $D1, $D0, $T0 # T0 = [B2 A2 B0 A0] vpunpcklqdq $D3, $D2, $T1 # T1 = [D2 C2 D0 C0] vpunpckhqdq $D1, $D0, $T2 # T2 = [B3 A3 B1 A1] vpunpckhqdq $D3, $D2, $T3 # T3 = [D3 C3 D1 C1] vperm2i128 \$0x20, $T1, $T0, $D0 # X0 = [D0 C0 B0 A0] vperm2i128 \$0x20, $T3, $T2, $D1 # X1 = [D1 C1 B1 A1] vperm2i128 \$0x31, $T1, $T0, $D2 # X2 = [D2 C2 B2 A2] vperm2i128 \$0x31, $T3, $T2, $D3 # X3 = [D3 C3 B3 A3] vmovdqa $D0, 32*0(%rdi) vmovdqa $D1, 32*3(%rdi) vmovdqa $D2, 32*6(%rdi) 
vmovdqa $D3, 32*9(%rdi) lea 32*9(%rsi), %rsi lea 32*1(%rdi), %rdi dec %ecx jnz .Lconv_loop vzeroupper ___ $code.=<<___ if ($win64); movaps 16*0(%rsp), %xmm6 movaps 16*1(%rsp), %xmm7 movaps 16*2(%rsp), %xmm8 movaps 16*3(%rsp), %xmm9 movaps 16*4(%rsp), %xmm10 movaps 16*5(%rsp), %xmm11 movaps 16*6(%rsp), %xmm12 movaps 16*7(%rsp), %xmm13 movaps 16*8(%rsp), %xmm14 movaps 16*9(%rsp), %xmm15 lea 8+16*10(%rsp), %rsp ___ $code.=<<___; ret .size ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back ___ } { my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx"); my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8)); my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13)); sub NORMALIZE { my $ret=<<___; vpsrlq $digit_size, $ACC0, $T0 vpand $AND_MASK, $ACC0, $ACC0 vpaddq $T0, $ACC1, $ACC1 vpsrlq $digit_size, $ACC1, $T0 vpand $AND_MASK, $ACC1, $ACC1 vpaddq $T0, $ACC2, $ACC2 vpsrlq $digit_size, $ACC2, $T0 vpand $AND_MASK, $ACC2, $ACC2 vpaddq $T0, $ACC3, $ACC3 vpsrlq $digit_size, $ACC3, $T0 vpand $AND_MASK, $ACC3, $ACC3 vpaddq $T0, $ACC4, $ACC4 vpsrlq $digit_size, $ACC4, $T0 vpand $AND_MASK, $ACC4, $ACC4 vpaddq $T0, $ACC5, $ACC5 vpsrlq $digit_size, $ACC5, $T0 vpand $AND_MASK, $ACC5, $ACC5 vpaddq $T0, $ACC6, $ACC6 vpsrlq $digit_size, $ACC6, $T0 vpand $AND_MASK, $ACC6, $ACC6 vpaddq $T0, $ACC7, $ACC7 vpsrlq $digit_size, $ACC7, $T0 vpand $AND_MASK, $ACC7, $ACC7 vpaddq $T0, $ACC8, $ACC8 #vpand $AND_MASK, $ACC8, $ACC8 ___ $ret; } sub STORE { my $ret=<<___; vmovdqa $ACC0, 32*0(%rdi) lea 160(%rdi), %rax # size optimization vmovdqa $ACC1, 32*1(%rdi) vmovdqa $ACC2, 32*2(%rdi) vmovdqa $ACC3, 32*3(%rdi) vmovdqa $ACC4, 32*4-160(%rax) vmovdqa $ACC5, 32*5-160(%rax) vmovdqa $ACC6, 32*6-160(%rax) vmovdqa $ACC7, 32*7-160(%rax) vmovdqa $ACC8, 32*8-160(%rax) ___ $ret; } $code.=<<___; .type avx2_normalize,\@abi-omnipotent .align 32 avx2_normalize: vpsrlq $digit_size, $ACC0, $T0 vpand $AND_MASK, $ACC0, $ACC0 vpaddq $T0, $ACC1, $ACC1 vpsrlq 
$digit_size, $ACC1, $T0 vpand $AND_MASK, $ACC1, $ACC1 vpaddq $T0, $ACC2, $ACC2 vpsrlq $digit_size, $ACC2, $T0 vpand $AND_MASK, $ACC2, $ACC2 vpaddq $T0, $ACC3, $ACC3 vpsrlq $digit_size, $ACC3, $T0 vpand $AND_MASK, $ACC3, $ACC3 vpaddq $T0, $ACC4, $ACC4 vpsrlq $digit_size, $ACC4, $T0 vpand $AND_MASK, $ACC4, $ACC4 vpaddq $T0, $ACC5, $ACC5 vpsrlq $digit_size, $ACC5, $T0 vpand $AND_MASK, $ACC5, $ACC5 vpaddq $T0, $ACC6, $ACC6 vpsrlq $digit_size, $ACC6, $T0 vpand $AND_MASK, $ACC6, $ACC6 vpaddq $T0, $ACC7, $ACC7 vpsrlq $digit_size, $ACC7, $T0 vpand $AND_MASK, $ACC7, $ACC7 vpaddq $T0, $ACC8, $ACC8 #vpand $AND_MASK, $ACC8, $ACC8 ret .size avx2_normalize,.-avx2_normalize .type avx2_normalize_n_store,\@abi-omnipotent .align 32 avx2_normalize_n_store: vpsrlq $digit_size, $ACC0, $T0 vpand $AND_MASK, $ACC0, $ACC0 vpaddq $T0, $ACC1, $ACC1 vpsrlq $digit_size, $ACC1, $T0 vpand $AND_MASK, $ACC1, $ACC1 vmovdqa $ACC0, 32*0(%rdi) lea 160(%rdi), %rax # size optimization vpaddq $T0, $ACC2, $ACC2 vpsrlq $digit_size, $ACC2, $T0 vpand $AND_MASK, $ACC2, $ACC2 vmovdqa $ACC1, 32*1(%rdi) vpaddq $T0, $ACC3, $ACC3 vpsrlq $digit_size, $ACC3, $T0 vpand $AND_MASK, $ACC3, $ACC3 vmovdqa $ACC2, 32*2(%rdi) vpaddq $T0, $ACC4, $ACC4 vpsrlq $digit_size, $ACC4, $T0 vpand $AND_MASK, $ACC4, $ACC4 vmovdqa $ACC3, 32*3(%rdi) vpaddq $T0, $ACC5, $ACC5 vpsrlq $digit_size, $ACC5, $T0 vpand $AND_MASK, $ACC5, $ACC5 vmovdqa $ACC4, 32*4-160(%rax) vpaddq $T0, $ACC6, $ACC6 vpsrlq $digit_size, $ACC6, $T0 vpand $AND_MASK, $ACC6, $ACC6 vmovdqa $ACC5, 32*5-160(%rax) vpaddq $T0, $ACC7, $ACC7 vpsrlq $digit_size, $ACC7, $T0 vpand $AND_MASK, $ACC7, $ACC7 vmovdqa $ACC6, 32*6-160(%rax) vpaddq $T0, $ACC8, $ACC8 #vpand $AND_MASK, $ACC8, $ACC8 vmovdqa $ACC7, 32*7-160(%rax) vmovdqa $ACC8, 32*8-160(%rax) ret .size avx2_normalize_n_store,.-avx2_normalize_n_store ################################################################################ # void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4); .type avx2_mul_x4,\@abi-omnipotent .align 
32 avx2_mul_x4: lea .LAVX2_POLY(%rip), %rax vpxor $ACC0, $ACC0, $ACC0 vpxor $ACC1, $ACC1, $ACC1 vpxor $ACC2, $ACC2, $ACC2 vpxor $ACC3, $ACC3, $ACC3 vpxor $ACC4, $ACC4, $ACC4 vpxor $ACC5, $ACC5, $ACC5 vpxor $ACC6, $ACC6, $ACC6 vpxor $ACC7, $ACC7, $ACC7 vmovdqa 32*7(%rax), %ymm14 vmovdqa 32*8(%rax), %ymm15 mov $n_digits, $itr lea -512($a_ptr), $a_ptr # strategic bias to control u-op density jmp .Lavx2_mul_x4_loop .align 32 .Lavx2_mul_x4_loop: vmovdqa 32*0($b_ptr), $B lea 32*1($b_ptr), $b_ptr vpmuludq 32*0+512($a_ptr), $B, $T0 vpmuludq 32*1+512($a_ptr), $B, $OVERFLOW # borrow $OVERFLOW vpaddq $T0, $ACC0, $ACC0 vpmuludq 32*2+512($a_ptr), $B, $T0 vpaddq $OVERFLOW, $ACC1, $ACC1 vpand $AND_MASK, $ACC0, $Y vpmuludq 32*3+512($a_ptr), $B, $OVERFLOW vpaddq $T0, $ACC2, $ACC2 vpmuludq 32*4+512($a_ptr), $B, $T0 vpaddq $OVERFLOW, $ACC3, $ACC3 vpmuludq 32*5+512($a_ptr), $B, $OVERFLOW vpaddq $T0, $ACC4, $ACC4 vpmuludq 32*6+512($a_ptr), $B, $T0 vpaddq $OVERFLOW, $ACC5, $ACC5 vpmuludq 32*7+512($a_ptr), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 # Skip some multiplications, optimizing for the constant poly vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*8+512($a_ptr), $B, $ACC8 vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 .byte 0x67 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $OVERFLOW .byte 0x67 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $T0 vpaddq $OVERFLOW, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $OVERFLOW vpaddq $T0, $ACC7, $ACC6 vpaddq $OVERFLOW, $ACC8, $ACC7 dec $itr jnz .Lavx2_mul_x4_loop vpxor $ACC8, $ACC8, $ACC8 ret .size avx2_mul_x4,.-avx2_mul_x4 # Function optimized for the constant 1 ################################################################################ # void avx2_mul_by1_x4(void* RESULTx4, void *Ax4); .type avx2_mul_by1_x4,\@abi-omnipotent .align 32 avx2_mul_by1_x4: lea .LAVX2_POLY(%rip), %rax 
vpxor $ACC0, $ACC0, $ACC0 vpxor $ACC1, $ACC1, $ACC1 vpxor $ACC2, $ACC2, $ACC2 vpxor $ACC3, $ACC3, $ACC3 vpxor $ACC4, $ACC4, $ACC4 vpxor $ACC5, $ACC5, $ACC5 vpxor $ACC6, $ACC6, $ACC6 vpxor $ACC7, $ACC7, $ACC7 vpxor $ACC8, $ACC8, $ACC8 vmovdqa 32*3+.LONE(%rip), %ymm14 vmovdqa 32*7+.LONE(%rip), %ymm15 mov $n_digits, $itr jmp .Lavx2_mul_by1_x4_loop .align 32 .Lavx2_mul_by1_x4_loop: vmovdqa 32*0($a_ptr), $B .byte 0x48,0x8d,0xb6,0x20,0,0,0 # lea 32*1($a_ptr), $a_ptr vpsllq \$5, $B, $OVERFLOW vpmuludq %ymm14, $B, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC3 .byte 0x67 vpmuludq $AND_MASK, $B, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $T0, $ACC4, $ACC4 vpaddq $T0, $ACC5, $ACC5 vpaddq $T0, $ACC6, $ACC6 vpsllq \$23, $B, $T0 .byte 0x67,0x67 vpmuludq %ymm15, $B, $OVERFLOW vpsubq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 .byte 0x67,0x67 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $OVERFLOW vmovdqa $ACC5, $ACC4 vpmuludq 32*7(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC6, $ACC5 vpaddq $T0, $ACC7, $ACC6 vpmuludq 32*8(%rax), $Y, $ACC7 dec $itr jnz .Lavx2_mul_by1_x4_loop ret .size avx2_mul_by1_x4,.-avx2_mul_by1_x4 ################################################################################ # void avx2_sqr_x4(void* RESULTx4, void *Ax4, void *Bx4); .type avx2_sqr_x4,\@abi-omnipotent .align 32 avx2_sqr_x4: lea .LAVX2_POLY(%rip), %rax vmovdqa 32*7(%rax), %ymm14 vmovdqa 32*8(%rax), %ymm15 vmovdqa 32*0($a_ptr), $B vmovdqa 32*1($a_ptr), $ACC1 vmovdqa 32*2($a_ptr), $ACC2 vmovdqa 32*3($a_ptr), $ACC3 vmovdqa 32*4($a_ptr), $ACC4 vmovdqa 32*5($a_ptr), $ACC5 vmovdqa 32*6($a_ptr), $ACC6 vmovdqa 32*7($a_ptr), $ACC7 vpaddq $ACC1, $ACC1, $ACC1 # 2*$ACC0..7 vmovdqa 32*8($a_ptr), $ACC8 vpaddq $ACC2, $ACC2, $ACC2 vmovdqa $ACC1, 32*0(%rcx) vpaddq $ACC3, $ACC3, $ACC3 
vmovdqa $ACC2, 32*1(%rcx) vpaddq $ACC4, $ACC4, $ACC4 vmovdqa $ACC3, 32*2(%rcx) vpaddq $ACC5, $ACC5, $ACC5 vmovdqa $ACC4, 32*3(%rcx) vpaddq $ACC6, $ACC6, $ACC6 vmovdqa $ACC5, 32*4(%rcx) vpaddq $ACC7, $ACC7, $ACC7 vmovdqa $ACC6, 32*5(%rcx) vpaddq $ACC8, $ACC8, $ACC8 vmovdqa $ACC7, 32*6(%rcx) vmovdqa $ACC8, 32*7(%rcx) #itr 1 vpmuludq $B, $B, $ACC0 vpmuludq $B, $ACC1, $ACC1 vpand $AND_MASK, $ACC0, $Y vpmuludq $B, $ACC2, $ACC2 vpmuludq $B, $ACC3, $ACC3 vpmuludq $B, $ACC4, $ACC4 vpmuludq $B, $ACC5, $ACC5 vpmuludq $B, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpmuludq $B, $ACC7, $ACC7 vpmuludq $B, $ACC8, $ACC8 vmovdqa 32*1($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 2 vpmuludq $B, $B, $OVERFLOW vpand $AND_MASK, $ACC0, $Y vpmuludq 32*1(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC1, $ACC1 vpmuludq 32*2(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC2, $ACC2 vpmuludq 32*3(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC3, $ACC3 vpmuludq 32*4(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC4, $ACC4 vpmuludq 32*5(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC5, $ACC5 vpmuludq 32*6(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*2($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 3 vpmuludq 
$B, $B, $T0 vpand $AND_MASK, $ACC0, $Y vpmuludq 32*2(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC2, $ACC2 vpmuludq 32*3(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC3, $ACC3 vpmuludq 32*4(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC4, $ACC4 vpmuludq 32*5(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC5, $ACC5 vpmuludq 32*6(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*3($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 4 vpmuludq $B, $B, $OVERFLOW vpmuludq 32*3(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC3, $ACC3 vpmuludq 32*4(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC4, $ACC4 vpmuludq 32*5(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC5, $ACC5 vpmuludq 32*6(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*4($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 5 vpmuludq $B, $B, $T0 vpmuludq 32*4(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC4, $ACC4 vpmuludq 32*5(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC5, $ACC5 vpmuludq 32*6(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, 
$ACC8 vmovdqa 32*5($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3+.LAVX2_POLY(%rip), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 6 vpmuludq $B, $B, $OVERFLOW vpmuludq 32*5(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC5, $ACC5 vpmuludq 32*6(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*6($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 7 vpmuludq $B, $B, $T0 vpmuludq 32*6(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*7($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 8 vpmuludq $B, $B, $OVERFLOW vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*8($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpsrlq 
$digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 9 vpmuludq $B, $B, $ACC8 vpmuludq $AND_MASK, $Y, $T0 vpaddq $T0, $ACC0, $OVERFLOW vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 vpxor $ACC8, $ACC8, $ACC8 ret .size avx2_sqr_x4,.-avx2_sqr_x4 ################################################################################ # void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4); .type avx2_sub_x4,\@abi-omnipotent .align 32 avx2_sub_x4: vmovdqa 32*0($a_ptr), $ACC0 lea 160($a_ptr), $a_ptr lea .LAVX2_POLY_x8+128(%rip), %rax lea 128($b_ptr), $b_ptr vmovdqa 32*1-160($a_ptr), $ACC1 vmovdqa 32*2-160($a_ptr), $ACC2 vmovdqa 32*3-160($a_ptr), $ACC3 vmovdqa 32*4-160($a_ptr), $ACC4 vmovdqa 32*5-160($a_ptr), $ACC5 vmovdqa 32*6-160($a_ptr), $ACC6 vmovdqa 32*7-160($a_ptr), $ACC7 vmovdqa 32*8-160($a_ptr), $ACC8 vpaddq 32*0-128(%rax), $ACC0, $ACC0 vpaddq 32*1-128(%rax), $ACC1, $ACC1 vpaddq 32*2-128(%rax), $ACC2, $ACC2 vpaddq 32*3-128(%rax), $ACC3, $ACC3 vpaddq 32*4-128(%rax), $ACC4, $ACC4 vpaddq 32*5-128(%rax), $ACC5, $ACC5 vpaddq 32*6-128(%rax), $ACC6, $ACC6 vpaddq 32*7-128(%rax), $ACC7, $ACC7 vpaddq 32*8-128(%rax), $ACC8, $ACC8 vpsubq 32*0-128($b_ptr), $ACC0, $ACC0 vpsubq 32*1-128($b_ptr), $ACC1, $ACC1 vpsubq 32*2-128($b_ptr), $ACC2, $ACC2 vpsubq 32*3-128($b_ptr), $ACC3, $ACC3 vpsubq 32*4-128($b_ptr), $ACC4, $ACC4 
vpsubq 32*5-128($b_ptr), $ACC5, $ACC5 vpsubq 32*6-128($b_ptr), $ACC6, $ACC6 vpsubq 32*7-128($b_ptr), $ACC7, $ACC7 vpsubq 32*8-128($b_ptr), $ACC8, $ACC8 ret .size avx2_sub_x4,.-avx2_sub_x4 .type avx2_select_n_store,\@abi-omnipotent .align 32 avx2_select_n_store: vmovdqa `8+32*9*8`(%rsp), $Y vpor `8+32*9*8+32`(%rsp), $Y, $Y vpandn $ACC0, $Y, $ACC0 vpandn $ACC1, $Y, $ACC1 vpandn $ACC2, $Y, $ACC2 vpandn $ACC3, $Y, $ACC3 vpandn $ACC4, $Y, $ACC4 vpandn $ACC5, $Y, $ACC5 vpandn $ACC6, $Y, $ACC6 vmovdqa `8+32*9*8+32`(%rsp), $B vpandn $ACC7, $Y, $ACC7 vpandn `8+32*9*8`(%rsp), $B, $B vpandn $ACC8, $Y, $ACC8 vpand 32*0(%rsi), $B, $T0 lea 160(%rsi), %rax vpand 32*1(%rsi), $B, $Y vpxor $T0, $ACC0, $ACC0 vpand 32*2(%rsi), $B, $T0 vpxor $Y, $ACC1, $ACC1 vpand 32*3(%rsi), $B, $Y vpxor $T0, $ACC2, $ACC2 vpand 32*4-160(%rax), $B, $T0 vpxor $Y, $ACC3, $ACC3 vpand 32*5-160(%rax), $B, $Y vpxor $T0, $ACC4, $ACC4 vpand 32*6-160(%rax), $B, $T0 vpxor $Y, $ACC5, $ACC5 vpand 32*7-160(%rax), $B, $Y vpxor $T0, $ACC6, $ACC6 vpand 32*8-160(%rax), $B, $T0 vmovdqa `8+32*9*8+32`(%rsp), $B vpxor $Y, $ACC7, $ACC7 vpand 32*0(%rdx), $B, $Y lea 160(%rdx), %rax vpxor $T0, $ACC8, $ACC8 vpand 32*1(%rdx), $B, $T0 vpxor $Y, $ACC0, $ACC0 vpand 32*2(%rdx), $B, $Y vpxor $T0, $ACC1, $ACC1 vpand 32*3(%rdx), $B, $T0 vpxor $Y, $ACC2, $ACC2 vpand 32*4-160(%rax), $B, $Y vpxor $T0, $ACC3, $ACC3 vpand 32*5-160(%rax), $B, $T0 vpxor $Y, $ACC4, $ACC4 vpand 32*6-160(%rax), $B, $Y vpxor $T0, $ACC5, $ACC5 vpand 32*7-160(%rax), $B, $T0 vpxor $Y, $ACC6, $ACC6 vpand 32*8-160(%rax), $B, $Y vpxor $T0, $ACC7, $ACC7 vpxor $Y, $ACC8, $ACC8 `&STORE` ret .size avx2_select_n_store,.-avx2_select_n_store ___ $code.=<<___ if (0); # inlined ################################################################################ # void avx2_mul_by2_x4(void* RESULTx4, void *Ax4); .type avx2_mul_by2_x4,\@abi-omnipotent .align 32 avx2_mul_by2_x4: vmovdqa 32*0($a_ptr), $ACC0 lea 160($a_ptr), %rax vmovdqa 32*1($a_ptr), $ACC1 vmovdqa 32*2($a_ptr), $ACC2 
vmovdqa 32*3($a_ptr), $ACC3 vmovdqa 32*4-160(%rax), $ACC4 vmovdqa 32*5-160(%rax), $ACC5 vmovdqa 32*6-160(%rax), $ACC6 vmovdqa 32*7-160(%rax), $ACC7 vmovdqa 32*8-160(%rax), $ACC8 vpaddq $ACC0, $ACC0, $ACC0 vpaddq $ACC1, $ACC1, $ACC1 vpaddq $ACC2, $ACC2, $ACC2 vpaddq $ACC3, $ACC3, $ACC3 vpaddq $ACC4, $ACC4, $ACC4 vpaddq $ACC5, $ACC5, $ACC5 vpaddq $ACC6, $ACC6, $ACC6 vpaddq $ACC7, $ACC7, $ACC7 vpaddq $ACC8, $ACC8, $ACC8 ret .size avx2_mul_by2_x4,.-avx2_mul_by2_x4 ___ my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx"); my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10"); $code.=<<___; ################################################################################ # void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4); .globl ecp_nistz256_avx2_point_add_affine_x4 .type ecp_nistz256_avx2_point_add_affine_x4,\@function,3 .align 32 ecp_nistz256_avx2_point_add_affine_x4: mov %rsp, %rax push %rbp vzeroupper ___ $code.=<<___ if ($win64); lea -16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; lea -8(%rax), %rbp # Result + 32*0 = Result.X # Result + 32*9 = Result.Y # Result + 32*18 = Result.Z # A + 32*0 = A.X # A + 32*9 = A.Y # A + 32*18 = A.Z # B + 32*0 = B.X # B + 32*9 = B.Y sub \$`32*9*8+32*2+32*8`, %rsp and \$-64, %rsp mov $r_ptr_in, $r_ptr mov $a_ptr_in, $a_ptr mov $b_ptr_in, $b_ptr vmovdqa 32*0($a_ptr_in), %ymm0 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK vpxor %ymm1, %ymm1, %ymm1 lea 256($a_ptr_in), %rax # size optimization vpor 32*1($a_ptr_in), %ymm0, %ymm0 vpor 32*2($a_ptr_in), %ymm0, %ymm0 vpor 32*3($a_ptr_in), %ymm0, %ymm0 vpor 32*4-256(%rax), %ymm0, %ymm0 lea 256(%rax), %rcx # size optimization vpor 32*5-256(%rax), %ymm0, %ymm0 vpor 32*6-256(%rax), %ymm0, %ymm0 
vpor 32*7-256(%rax), %ymm0, %ymm0 vpor 32*8-256(%rax), %ymm0, %ymm0 vpor 32*9-256(%rax), %ymm0, %ymm0 vpor 32*10-256(%rax), %ymm0, %ymm0 vpor 32*11-256(%rax), %ymm0, %ymm0 vpor 32*12-512(%rcx), %ymm0, %ymm0 vpor 32*13-512(%rcx), %ymm0, %ymm0 vpor 32*14-512(%rcx), %ymm0, %ymm0 vpor 32*15-512(%rcx), %ymm0, %ymm0 vpor 32*16-512(%rcx), %ymm0, %ymm0 vpor 32*17-512(%rcx), %ymm0, %ymm0 vpcmpeqq %ymm1, %ymm0, %ymm0 vmovdqa %ymm0, `32*9*8`(%rsp) vpxor %ymm1, %ymm1, %ymm1 vmovdqa 32*0($b_ptr), %ymm0 lea 256($b_ptr), %rax # size optimization vpor 32*1($b_ptr), %ymm0, %ymm0 vpor 32*2($b_ptr), %ymm0, %ymm0 vpor 32*3($b_ptr), %ymm0, %ymm0 vpor 32*4-256(%rax), %ymm0, %ymm0 lea 256(%rax), %rcx # size optimization vpor 32*5-256(%rax), %ymm0, %ymm0 vpor 32*6-256(%rax), %ymm0, %ymm0 vpor 32*7-256(%rax), %ymm0, %ymm0 vpor 32*8-256(%rax), %ymm0, %ymm0 vpor 32*9-256(%rax), %ymm0, %ymm0 vpor 32*10-256(%rax), %ymm0, %ymm0 vpor 32*11-256(%rax), %ymm0, %ymm0 vpor 32*12-512(%rcx), %ymm0, %ymm0 vpor 32*13-512(%rcx), %ymm0, %ymm0 vpor 32*14-512(%rcx), %ymm0, %ymm0 vpor 32*15-512(%rcx), %ymm0, %ymm0 vpor 32*16-512(%rcx), %ymm0, %ymm0 vpor 32*17-512(%rcx), %ymm0, %ymm0 vpcmpeqq %ymm1, %ymm0, %ymm0 vmovdqa %ymm0, `32*9*8+32`(%rsp) # Z1^2 = Z1*Z1 lea `32*9*2`($a_ptr), %rsi lea `32*9*2`(%rsp), %rdi lea `32*9*8+32*2`(%rsp), %rcx # temporary vector call avx2_sqr_x4 call avx2_normalize_n_store # U2 = X2*Z1^2 lea `32*9*0`($b_ptr), %rsi lea `32*9*2`(%rsp), %rdx lea `32*9*0`(%rsp), %rdi call avx2_mul_x4 #call avx2_normalize `&STORE` # S2 = Z1*Z1^2 = Z1^3 lea `32*9*2`($a_ptr), %rsi lea `32*9*2`(%rsp), %rdx lea `32*9*1`(%rsp), %rdi call avx2_mul_x4 call avx2_normalize_n_store # S2 = S2*Y2 = Y2*Z1^3 lea `32*9*1`($b_ptr), %rsi lea `32*9*1`(%rsp), %rdx lea `32*9*1`(%rsp), %rdi call avx2_mul_x4 call avx2_normalize_n_store # H = U2 - U1 = U2 - X1 lea `32*9*0`(%rsp), %rsi lea `32*9*0`($a_ptr), %rdx lea `32*9*3`(%rsp), %rdi call avx2_sub_x4 call avx2_normalize_n_store # R = S2 - S1 = S2 - Y1 lea `32*9*1`(%rsp), 
%rsi lea `32*9*1`($a_ptr), %rdx lea `32*9*4`(%rsp), %rdi call avx2_sub_x4 call avx2_normalize_n_store # Z3 = H*Z1*Z2 lea `32*9*3`(%rsp), %rsi lea `32*9*2`($a_ptr), %rdx lea `32*9*2`($r_ptr), %rdi call avx2_mul_x4 call avx2_normalize lea .LONE(%rip), %rsi lea `32*9*2`($a_ptr), %rdx call avx2_select_n_store # R^2 = R^2 lea `32*9*4`(%rsp), %rsi lea `32*9*6`(%rsp), %rdi lea `32*9*8+32*2`(%rsp), %rcx # temporary vector call avx2_sqr_x4 call avx2_normalize_n_store # H^2 = H^2 lea `32*9*3`(%rsp), %rsi lea `32*9*5`(%rsp), %rdi call avx2_sqr_x4 call avx2_normalize_n_store # H^3 = H^2*H lea `32*9*3`(%rsp), %rsi lea `32*9*5`(%rsp), %rdx lea `32*9*7`(%rsp), %rdi call avx2_mul_x4 call avx2_normalize_n_store # U2 = U1*H^2 lea `32*9*0`($a_ptr), %rsi lea `32*9*5`(%rsp), %rdx lea `32*9*0`(%rsp), %rdi call avx2_mul_x4 #call avx2_normalize `&STORE` # Hsqr = U2*2 #lea 32*9*0(%rsp), %rsi #lea 32*9*5(%rsp), %rdi #call avx2_mul_by2_x4 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4 lea `32*9*5`(%rsp), %rdi vpaddq $ACC1, $ACC1, $ACC1 vpaddq $ACC2, $ACC2, $ACC2 vpaddq $ACC3, $ACC3, $ACC3 vpaddq $ACC4, $ACC4, $ACC4 vpaddq $ACC5, $ACC5, $ACC5 vpaddq $ACC6, $ACC6, $ACC6 vpaddq $ACC7, $ACC7, $ACC7 vpaddq $ACC8, $ACC8, $ACC8 call avx2_normalize_n_store # X3 = R^2 - H^3 #lea 32*9*6(%rsp), %rsi #lea 32*9*7(%rsp), %rdx #lea 32*9*5(%rsp), %rcx #lea 32*9*0($r_ptr), %rdi #call avx2_sub_x4 #NORMALIZE #STORE # X3 = X3 - U2*2 #lea 32*9*0($r_ptr), %rsi #lea 32*9*0($r_ptr), %rdi #call avx2_sub_x4 #NORMALIZE #STORE lea `32*9*6+128`(%rsp), %rsi lea .LAVX2_POLY_x2+128(%rip), %rax lea `32*9*7+128`(%rsp), %rdx lea `32*9*5+128`(%rsp), %rcx lea `32*9*0`($r_ptr), %rdi vmovdqa 32*0-128(%rsi), $ACC0 vmovdqa 32*1-128(%rsi), $ACC1 vmovdqa 32*2-128(%rsi), $ACC2 vmovdqa 32*3-128(%rsi), $ACC3 vmovdqa 32*4-128(%rsi), $ACC4 vmovdqa 32*5-128(%rsi), $ACC5 vmovdqa 32*6-128(%rsi), $ACC6 vmovdqa 32*7-128(%rsi), $ACC7 vmovdqa 32*8-128(%rsi), $ACC8 vpaddq 32*0-128(%rax), $ACC0, $ACC0 vpaddq 32*1-128(%rax), $ACC1, $ACC1 
vpaddq	32*2-128(%rax), $ACC2, $ACC2
	vpaddq	32*3-128(%rax), $ACC3, $ACC3
	vpaddq	32*4-128(%rax), $ACC4, $ACC4
	vpaddq	32*5-128(%rax), $ACC5, $ACC5
	vpaddq	32*6-128(%rax), $ACC6, $ACC6
	vpaddq	32*7-128(%rax), $ACC7, $ACC7
	vpaddq	32*8-128(%rax), $ACC8, $ACC8

	# subtract the nine vectors addressed through %rdx (the H^3 slot set up above)
	vpsubq	32*0-128(%rdx), $ACC0, $ACC0
	vpsubq	32*1-128(%rdx), $ACC1, $ACC1
	vpsubq	32*2-128(%rdx), $ACC2, $ACC2
	vpsubq	32*3-128(%rdx), $ACC3, $ACC3
	vpsubq	32*4-128(%rdx), $ACC4, $ACC4
	vpsubq	32*5-128(%rdx), $ACC5, $ACC5
	vpsubq	32*6-128(%rdx), $ACC6, $ACC6
	vpsubq	32*7-128(%rdx), $ACC7, $ACC7
	vpsubq	32*8-128(%rdx), $ACC8, $ACC8

	# subtract the nine vectors addressed through %rcx (the U2*2 slot set up above)
	vpsubq	32*0-128(%rcx), $ACC0, $ACC0
	vpsubq	32*1-128(%rcx), $ACC1, $ACC1
	vpsubq	32*2-128(%rcx), $ACC2, $ACC2
	vpsubq	32*3-128(%rcx), $ACC3, $ACC3
	vpsubq	32*4-128(%rcx), $ACC4, $ACC4
	vpsubq	32*5-128(%rcx), $ACC5, $ACC5
	vpsubq	32*6-128(%rcx), $ACC6, $ACC6
	vpsubq	32*7-128(%rcx), $ACC7, $ACC7
	vpsubq	32*8-128(%rcx), $ACC8, $ACC8
	call	avx2_normalize

	# X3 is in the accumulators; select against B.X / A.X and store to Result.X
	lea	32*0($b_ptr), %rsi
	lea	32*0($a_ptr), %rdx
	call	avx2_select_n_store

	# H = U2 - X3
	lea	`32*9*0`(%rsp), %rsi
	lea	`32*9*0`($r_ptr), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# H = H*R
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*4`(%rsp), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# S2 = S1 * H^3
	lea	`32*9*7`(%rsp), %rsi
	lea	`32*9*1`($a_ptr), %rdx
	lea	`32*9*1`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# Y3 = H - S2
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*1`(%rsp), %rdx
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_sub_x4
	call	avx2_normalize

	# select against B.Y / A.Y and store to Result.Y
	lea	32*9($b_ptr), %rsi
	lea	32*9($a_ptr), %rdx
	call	avx2_select_n_store

	#lea	32*9*0($r_ptr), %rsi
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_mul_by1_x4
	#NORMALIZE
	#STORE

	lea	`32*9*1`($r_ptr), %rsi
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_mul_by1_x4
	call	avx2_normalize_n_store

	vzeroupper
___
# Win64 only: reload the callee-saved %xmm6-%xmm15 from the slots the prologue
# filled.  The prologue stored them at -8-16*k(%rax) with %rax = entry %rsp and
# then set %rbp = %rax-8, so the same slots are -16*k(%rbp) here.  The memory
# operand must come first (AT&T order: source, destination); with the operands
# the other way round the instruction overwrites the save area with the
# clobbered registers and nothing is restored.
$code.=<<___ if ($win64);
	movaps	-16*10(%rbp), %xmm6
	movaps	-16*9(%rbp), %xmm7
	movaps	-16*8(%rbp), %xmm8
	movaps	-16*7(%rbp), %xmm9
	movaps	-16*6(%rbp), %xmm10
	movaps	-16*5(%rbp), %xmm11
	movaps	-16*4(%rbp), %xmm12
	movaps	-16*3(%rbp), %xmm13
	movaps	-16*2(%rbp), %xmm14
	movaps	-16*1(%rbp), %xmm15
___
$code.=<<___;
	mov	%rbp, %rsp
	pop	%rbp
	ret
.size	ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4

################################################################################
# void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4);
.globl	ecp_nistz256_avx2_point_add_affines_x4
.type	ecp_nistz256_avx2_point_add_affines_x4,\@function,3
.align	32
ecp_nistz256_avx2_point_add_affines_x4:
	mov	%rsp, %rax		# keep the entry stack pointer
	push	%rbp
	vzeroupper
___
# Win64 only: reserve ten 16-byte slots below the pushed %rbp and save the
# callee-saved %xmm6-%xmm15 there, addressed from the entry %rsp held in %rax.
$code.=<<___ if ($win64);
	lea	-16*10(%rsp), %rsp
	vmovaps	%xmm6, -8-16*10(%rax)
	vmovaps	%xmm7, -8-16*9(%rax)
	vmovaps	%xmm8, -8-16*8(%rax)
	vmovaps	%xmm9, -8-16*7(%rax)
	vmovaps	%xmm10, -8-16*6(%rax)
	vmovaps	%xmm11, -8-16*5(%rax)
	vmovaps	%xmm12, -8-16*4(%rax)
	vmovaps	%xmm13, -8-16*3(%rax)
	vmovaps	%xmm14, -8-16*2(%rax)
	vmovaps	%xmm15, -8-16*1(%rax)
___
$code.=<<___;
	lea	-8(%rax), %rbp		# frame anchor: address of the pushed %rbp

# Result + 32*0 = Result.X
# Result + 32*9 = Result.Y
# Result + 32*18 = Result.Z

# A + 32*0 = A.X
# A + 32*9 = A.Y

# B + 32*0 = B.X
# B + 32*9 = B.Y

	# carve out the scratch area and align it to 64 bytes
	sub	\$`32*9*8+32*2+32*8`, %rsp
	and	\$-64, %rsp

	mov	$r_ptr_in, $r_ptr
	mov	$a_ptr_in, $a_ptr
	mov	$b_ptr_in, $b_ptr

	# OR together all 18 vectors of A (A.X and A.Y) so that one compare
	# against the zeroed %ymm1 flags the lanes in which A is entirely zero
	vmovdqa	32*0($a_ptr_in), %ymm0
	vmovdqa	.LAVX2_AND_MASK(%rip), $AND_MASK
	vpxor	%ymm1, %ymm1, %ymm1
	lea	256($a_ptr_in), %rax		# size optimization
	vpor	32*1($a_ptr_in), %ymm0, %ymm0
	vpor	32*2($a_ptr_in), %ymm0, %ymm0
	vpor	32*3($a_ptr_in), %ymm0, %ymm0
	vpor	32*4-256(%rax), %ymm0, %ymm0
	lea	256(%rax), %rcx			# size optimization
	vpor	32*5-256(%rax), %ymm0, %ymm0
	vpor	32*6-256(%rax), %ymm0, %ymm0
	vpor	32*7-256(%rax), %ymm0, %ymm0
	vpor	32*8-256(%rax), %ymm0, %ymm0
	vpor	32*9-256(%rax), %ymm0, %ymm0
	vpor	32*10-256(%rax), %ymm0, %ymm0
	vpor	32*11-256(%rax), %ymm0, %ymm0
	vpor	32*12-512(%rcx), %ymm0, %ymm0
	vpor	32*13-512(%rcx), %ymm0, %ymm0
	vpor	32*14-512(%rcx), %ymm0, %ymm0
	vpor	32*15-512(%rcx), %ymm0, %ymm0
	vpor	32*16-512(%rcx), %ymm0, %ymm0
	vpor	32*17-512(%rcx), %ymm0, %ymm0
	vpcmpeqq	%ymm1, %ymm0,
%ymm0 vmovdqa %ymm0, `32*9*8`(%rsp) vpxor %ymm1, %ymm1, %ymm1 vmovdqa 32*0($b_ptr), %ymm0 lea 256($b_ptr), %rax # size optimization vpor 32*1($b_ptr), %ymm0, %ymm0 vpor 32*2($b_ptr), %ymm0, %ymm0 vpor 32*3($b_ptr), %ymm0, %ymm0 vpor 32*4-256(%rax), %ymm0, %ymm0 lea 256(%rax), %rcx # size optimization vpor 32*5-256(%rax), %ymm0, %ymm0 vpor 32*6-256(%rax), %ymm0, %ymm0 vpor 32*7-256(%rax), %ymm0, %ymm0 vpor 32*8-256(%rax), %ymm0, %ymm0 vpor 32*9-256(%rax), %ymm0, %ymm0 vpor 32*10-256(%rax), %ymm0, %ymm0 vpor 32*11-256(%rax), %ymm0, %ymm0 vpor 32*12-512(%rcx), %ymm0, %ymm0 vpor 32*13-512(%rcx), %ymm0, %ymm0 vpor 32*14-512(%rcx), %ymm0, %ymm0 vpor 32*15-512(%rcx), %ymm0, %ymm0 vpor 32*16-512(%rcx), %ymm0, %ymm0 vpor 32*17-512(%rcx), %ymm0, %ymm0 vpcmpeqq %ymm1, %ymm0, %ymm0 vmovdqa %ymm0, `32*9*8+32`(%rsp) # H = U2 - U1 = X2 - X1 lea `32*9*0`($b_ptr), %rsi lea `32*9*0`($a_ptr), %rdx lea `32*9*3`(%rsp), %rdi call avx2_sub_x4 call avx2_normalize_n_store # R = S2 - S1 = Y2 - Y1 lea `32*9*1`($b_ptr), %rsi lea `32*9*1`($a_ptr), %rdx lea `32*9*4`(%rsp), %rdi call avx2_sub_x4 call avx2_normalize_n_store # Z3 = H*Z1*Z2 = H lea `32*9*3`(%rsp), %rsi lea `32*9*2`($r_ptr), %rdi call avx2_mul_by1_x4 call avx2_normalize vmovdqa `32*9*8`(%rsp), $B vpor `32*9*8+32`(%rsp), $B, $B vpandn $ACC0, $B, $ACC0 lea .LONE+128(%rip), %rax vpandn $ACC1, $B, $ACC1 vpandn $ACC2, $B, $ACC2 vpandn $ACC3, $B, $ACC3 vpandn $ACC4, $B, $ACC4 vpandn $ACC5, $B, $ACC5 vpandn $ACC6, $B, $ACC6 vpandn $ACC7, $B, $ACC7 vpand 32*0-128(%rax), $B, $T0 vpandn $ACC8, $B, $ACC8 vpand 32*1-128(%rax), $B, $Y vpxor $T0, $ACC0, $ACC0 vpand 32*2-128(%rax), $B, $T0 vpxor $Y, $ACC1, $ACC1 vpand 32*3-128(%rax), $B, $Y vpxor $T0, $ACC2, $ACC2 vpand 32*4-128(%rax), $B, $T0 vpxor $Y, $ACC3, $ACC3 vpand 32*5-128(%rax), $B, $Y vpxor $T0, $ACC4, $ACC4 vpand 32*6-128(%rax), $B, $T0 vpxor $Y, $ACC5, $ACC5 vpand 32*7-128(%rax), $B, $Y vpxor $T0, $ACC6, $ACC6 vpand 32*8-128(%rax), $B, $T0 vpxor $Y, $ACC7, $ACC7 vpxor $T0, $ACC8, $ACC8 
`&STORE` # R^2 = R^2 lea `32*9*4`(%rsp), %rsi lea `32*9*6`(%rsp), %rdi lea `32*9*8+32*2`(%rsp), %rcx # temporary vector call avx2_sqr_x4 call avx2_normalize_n_store # H^2 = H^2 lea `32*9*3`(%rsp), %rsi lea `32*9*5`(%rsp), %rdi call avx2_sqr_x4 call avx2_normalize_n_store # H^3 = H^2*H lea `32*9*3`(%rsp), %rsi lea `32*9*5`(%rsp), %rdx lea `32*9*7`(%rsp), %rdi call avx2_mul_x4 call avx2_normalize_n_store # U2 = U1*H^2 lea `32*9*0`($a_ptr), %rsi lea `32*9*5`(%rsp), %rdx lea `32*9*0`(%rsp), %rdi call avx2_mul_x4 #call avx2_normalize `&STORE` # Hsqr = U2*2 #lea 32*9*0(%rsp), %rsi #lea 32*9*5(%rsp), %rdi #call avx2_mul_by2_x4 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4 lea `32*9*5`(%rsp), %rdi vpaddq $ACC1, $ACC1, $ACC1 vpaddq $ACC2, $ACC2, $ACC2 vpaddq $ACC3, $ACC3, $ACC3 vpaddq $ACC4, $ACC4, $ACC4 vpaddq $ACC5, $ACC5, $ACC5 vpaddq $ACC6, $ACC6, $ACC6 vpaddq $ACC7, $ACC7, $ACC7 vpaddq $ACC8, $ACC8, $ACC8 call avx2_normalize_n_store # X3 = R^2 - H^3 #lea 32*9*6(%rsp), %rsi #lea 32*9*7(%rsp), %rdx #lea 32*9*5(%rsp), %rcx #lea 32*9*0($r_ptr), %rdi #call avx2_sub_x4 #NORMALIZE #STORE # X3 = X3 - U2*2 #lea 32*9*0($r_ptr), %rsi #lea 32*9*0($r_ptr), %rdi #call avx2_sub_x4 #NORMALIZE #STORE lea `32*9*6+128`(%rsp), %rsi lea .LAVX2_POLY_x2+128(%rip), %rax lea `32*9*7+128`(%rsp), %rdx lea `32*9*5+128`(%rsp), %rcx lea `32*9*0`($r_ptr), %rdi vmovdqa 32*0-128(%rsi), $ACC0 vmovdqa 32*1-128(%rsi), $ACC1 vmovdqa 32*2-128(%rsi), $ACC2 vmovdqa 32*3-128(%rsi), $ACC3 vmovdqa 32*4-128(%rsi), $ACC4 vmovdqa 32*5-128(%rsi), $ACC5 vmovdqa 32*6-128(%rsi), $ACC6 vmovdqa 32*7-128(%rsi), $ACC7 vmovdqa 32*8-128(%rsi), $ACC8 vpaddq 32*0-128(%rax), $ACC0, $ACC0 vpaddq 32*1-128(%rax), $ACC1, $ACC1 vpaddq 32*2-128(%rax), $ACC2, $ACC2 vpaddq 32*3-128(%rax), $ACC3, $ACC3 vpaddq 32*4-128(%rax), $ACC4, $ACC4 vpaddq 32*5-128(%rax), $ACC5, $ACC5 vpaddq 32*6-128(%rax), $ACC6, $ACC6 vpaddq 32*7-128(%rax), $ACC7, $ACC7 vpaddq 32*8-128(%rax), $ACC8, $ACC8 vpsubq 32*0-128(%rdx), $ACC0, $ACC0 vpsubq 
32*1-128(%rdx), $ACC1, $ACC1
	vpsubq	32*2-128(%rdx), $ACC2, $ACC2
	vpsubq	32*3-128(%rdx), $ACC3, $ACC3
	vpsubq	32*4-128(%rdx), $ACC4, $ACC4
	vpsubq	32*5-128(%rdx), $ACC5, $ACC5
	vpsubq	32*6-128(%rdx), $ACC6, $ACC6
	vpsubq	32*7-128(%rdx), $ACC7, $ACC7
	vpsubq	32*8-128(%rdx), $ACC8, $ACC8

	# subtract the nine vectors addressed through %rcx (the U2*2 slot set up above)
	vpsubq	32*0-128(%rcx), $ACC0, $ACC0
	vpsubq	32*1-128(%rcx), $ACC1, $ACC1
	vpsubq	32*2-128(%rcx), $ACC2, $ACC2
	vpsubq	32*3-128(%rcx), $ACC3, $ACC3
	vpsubq	32*4-128(%rcx), $ACC4, $ACC4
	vpsubq	32*5-128(%rcx), $ACC5, $ACC5
	vpsubq	32*6-128(%rcx), $ACC6, $ACC6
	vpsubq	32*7-128(%rcx), $ACC7, $ACC7
	vpsubq	32*8-128(%rcx), $ACC8, $ACC8
	call	avx2_normalize

	# X3 is in the accumulators; select against B.X / A.X and store to Result.X
	lea	32*0($b_ptr), %rsi
	lea	32*0($a_ptr), %rdx
	call	avx2_select_n_store

	# H = U2 - X3
	lea	`32*9*0`(%rsp), %rsi
	lea	`32*9*0`($r_ptr), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_sub_x4
	call	avx2_normalize_n_store

	# H = H*R
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*4`(%rsp), %rdx
	lea	`32*9*3`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# S2 = S1 * H^3
	lea	`32*9*7`(%rsp), %rsi
	lea	`32*9*1`($a_ptr), %rdx
	lea	`32*9*1`(%rsp), %rdi
	call	avx2_mul_x4
	call	avx2_normalize_n_store

	# Y3 = H - S2
	lea	`32*9*3`(%rsp), %rsi
	lea	`32*9*1`(%rsp), %rdx
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_sub_x4
	call	avx2_normalize

	# select against B.Y / A.Y and store to Result.Y
	lea	32*9($b_ptr), %rsi
	lea	32*9($a_ptr), %rdx
	call	avx2_select_n_store

	#lea	32*9*0($r_ptr), %rsi
	#lea	32*9*0($r_ptr), %rdi
	#call	avx2_mul_by1_x4
	#NORMALIZE
	#STORE

	lea	`32*9*1`($r_ptr), %rsi
	lea	`32*9*1`($r_ptr), %rdi
	call	avx2_mul_by1_x4
	call	avx2_normalize_n_store

	vzeroupper
___
# Win64 only: reload the callee-saved %xmm6-%xmm15 from the slots the prologue
# filled.  The prologue stored them at -8-16*k(%rax) with %rax = entry %rsp and
# then set %rbp = %rax-8, so the same slots are -16*k(%rbp) here.  The memory
# operand must come first (AT&T order: source, destination); with the operands
# the other way round the instruction overwrites the save area with the
# clobbered registers and nothing is restored.
$code.=<<___ if ($win64);
	movaps	-16*10(%rbp), %xmm6
	movaps	-16*9(%rbp), %xmm7
	movaps	-16*8(%rbp), %xmm8
	movaps	-16*7(%rbp), %xmm9
	movaps	-16*6(%rbp), %xmm10
	movaps	-16*5(%rbp), %xmm11
	movaps	-16*4(%rbp), %xmm12
	movaps	-16*3(%rbp), %xmm13
	movaps	-16*2(%rbp), %xmm14
	movaps	-16*1(%rbp), %xmm15
___
$code.=<<___;
	mov	%rbp, %rsp
	pop	%rbp
	ret
.size	ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4
################################################################################ # void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4); .globl ecp_nistz256_avx2_to_mont .type ecp_nistz256_avx2_to_mont,\@function,2 .align 32 ecp_nistz256_avx2_to_mont: vzeroupper ___ $code.=<<___ if ($win64); lea -8-16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK lea .LTO_MONT_AVX2(%rip), %rdx call avx2_mul_x4 call avx2_normalize_n_store vzeroupper ___ $code.=<<___ if ($win64); movaps 16*0(%rsp), %xmm6 movaps 16*1(%rsp), %xmm7 movaps 16*2(%rsp), %xmm8 movaps 16*3(%rsp), %xmm9 movaps 16*4(%rsp), %xmm10 movaps 16*5(%rsp), %xmm11 movaps 16*6(%rsp), %xmm12 movaps 16*7(%rsp), %xmm13 movaps 16*8(%rsp), %xmm14 movaps 16*9(%rsp), %xmm15 lea 8+16*10(%rsp), %rsp ___ $code.=<<___; ret .size ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont ################################################################################ # void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4); .globl ecp_nistz256_avx2_from_mont .type ecp_nistz256_avx2_from_mont,\@function,2 .align 32 ecp_nistz256_avx2_from_mont: vzeroupper ___ $code.=<<___ if ($win64); lea -8-16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK lea .LFROM_MONT_AVX2(%rip), %rdx call avx2_mul_x4 call avx2_normalize_n_store vzeroupper ___ $code.=<<___ if ($win64); movaps 16*0(%rsp), %xmm6 movaps 
16*1(%rsp), %xmm7 movaps 16*2(%rsp), %xmm8 movaps 16*3(%rsp), %xmm9 movaps 16*4(%rsp), %xmm10 movaps 16*5(%rsp), %xmm11 movaps 16*6(%rsp), %xmm12 movaps 16*7(%rsp), %xmm13 movaps 16*8(%rsp), %xmm14 movaps 16*9(%rsp), %xmm15 lea 8+16*10(%rsp), %rsp ___ $code.=<<___; ret .size ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont ################################################################################ # void ecp_nistz256_avx2_set1(void* RESULTx4); .globl ecp_nistz256_avx2_set1 .type ecp_nistz256_avx2_set1,\@function,1 .align 32 ecp_nistz256_avx2_set1: lea .LONE+128(%rip), %rax lea 128(%rdi), %rdi vzeroupper vmovdqa 32*0-128(%rax), %ymm0 vmovdqa 32*1-128(%rax), %ymm1 vmovdqa 32*2-128(%rax), %ymm2 vmovdqa 32*3-128(%rax), %ymm3 vmovdqa 32*4-128(%rax), %ymm4 vmovdqa 32*5-128(%rax), %ymm5 vmovdqa %ymm0, 32*0-128(%rdi) vmovdqa 32*6-128(%rax), %ymm0 vmovdqa %ymm1, 32*1-128(%rdi) vmovdqa 32*7-128(%rax), %ymm1 vmovdqa %ymm2, 32*2-128(%rdi) vmovdqa 32*8-128(%rax), %ymm2 vmovdqa %ymm3, 32*3-128(%rdi) vmovdqa %ymm4, 32*4-128(%rdi) vmovdqa %ymm5, 32*5-128(%rdi) vmovdqa %ymm0, 32*6-128(%rdi) vmovdqa %ymm1, 32*7-128(%rdi) vmovdqa %ymm2, 32*8-128(%rdi) vzeroupper ret .size ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1 ___ } { ################################################################################ # void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in, # int index0, int index1, int index2, int index3); ################################################################################ my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d"); my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3)); my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11)); my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15)); $code.=<<___; .globl ecp_nistz256_avx2_multi_gather_w7 .type ecp_nistz256_avx2_multi_gather_w7,\@function,6 .align 32 ecp_nistz256_avx2_multi_gather_w7: vzeroupper ___ $code.=<<___ if ($win64); lea 
-8-16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; lea .LIntOne(%rip), %rax vmovd $index0, %xmm0 vmovd $index1, %xmm1 vmovd $index2, %xmm2 vmovd $index3, %xmm3 vpxor $R0a, $R0a, $R0a vpxor $R0b, $R0b, $R0b vpxor $R1a, $R1a, $R1a vpxor $R1b, $R1b, $R1b vpxor $R2a, $R2a, $R2a vpxor $R2b, $R2b, $R2b vpxor $R3a, $R3a, $R3a vpxor $R3b, $R3b, $R3b vmovdqa (%rax), $M0 vpermd $INDEX0, $R0a, $INDEX0 vpermd $INDEX1, $R0a, $INDEX1 vpermd $INDEX2, $R0a, $INDEX2 vpermd $INDEX3, $R0a, $INDEX3 mov \$64, %ecx lea 112($val), $val # size optimization jmp .Lmulti_select_loop_avx2 # INDEX=0, corresponds to the point at infty (0,0) .align 32 .Lmulti_select_loop_avx2: vpcmpeqd $INDEX0, $M0, $TMP0 vmovdqa `32*0+32*64*2*0`($in_t), $T0 vmovdqa `32*1+32*64*2*0`($in_t), $T1 vpand $TMP0, $T0, $T0 vpand $TMP0, $T1, $T1 vpxor $T0, $R0a, $R0a vpxor $T1, $R0b, $R0b vpcmpeqd $INDEX1, $M0, $TMP0 vmovdqa `32*0+32*64*2*1`($in_t), $T0 vmovdqa `32*1+32*64*2*1`($in_t), $T1 vpand $TMP0, $T0, $T0 vpand $TMP0, $T1, $T1 vpxor $T0, $R1a, $R1a vpxor $T1, $R1b, $R1b vpcmpeqd $INDEX2, $M0, $TMP0 vmovdqa `32*0+32*64*2*2`($in_t), $T0 vmovdqa `32*1+32*64*2*2`($in_t), $T1 vpand $TMP0, $T0, $T0 vpand $TMP0, $T1, $T1 vpxor $T0, $R2a, $R2a vpxor $T1, $R2b, $R2b vpcmpeqd $INDEX3, $M0, $TMP0 vmovdqa `32*0+32*64*2*3`($in_t), $T0 vmovdqa `32*1+32*64*2*3`($in_t), $T1 vpand $TMP0, $T0, $T0 vpand $TMP0, $T1, $T1 vpxor $T0, $R3a, $R3a vpxor $T1, $R3b, $R3b vpaddd (%rax), $M0, $M0 # increment lea 32*2($in_t), $in_t dec %ecx jnz .Lmulti_select_loop_avx2 vmovdqu $R0a, 32*0-112($val) vmovdqu $R0b, 32*1-112($val) vmovdqu $R1a, 32*2-112($val) vmovdqu $R1b, 32*3-112($val) vmovdqu $R2a, 32*4-112($val) vmovdqu $R2b, 32*5-112($val) vmovdqu $R3a, 
32*6-112($val) vmovdqu $R3b, 32*7-112($val) vzeroupper ___ $code.=<<___ if ($win64); movaps 16*0(%rsp), %xmm6 movaps 16*1(%rsp), %xmm7 movaps 16*2(%rsp), %xmm8 movaps 16*3(%rsp), %xmm9 movaps 16*4(%rsp), %xmm10 movaps 16*5(%rsp), %xmm11 movaps 16*6(%rsp), %xmm12 movaps 16*7(%rsp), %xmm13 movaps 16*8(%rsp), %xmm14 movaps 16*9(%rsp), %xmm15 lea 8+16*10(%rsp), %rsp ___ $code.=<<___; ret .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7 .extern OPENSSL_ia32cap_P .globl ecp_nistz_avx2_eligible .type ecp_nistz_avx2_eligible,\@abi-omnipotent .align 32 ecp_nistz_avx2_eligible: mov OPENSSL_ia32cap_P+8(%rip),%eax shr \$5,%eax and \$1,%eax ret .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible ___ } }} else {{ # assembler is too old $code.=<<___; .text .globl ecp_nistz256_avx2_transpose_convert .globl ecp_nistz256_avx2_convert_transpose_back .globl ecp_nistz256_avx2_point_add_affine_x4 .globl ecp_nistz256_avx2_point_add_affines_x4 .globl ecp_nistz256_avx2_to_mont .globl ecp_nistz256_avx2_from_mont .globl ecp_nistz256_avx2_set1 .globl ecp_nistz256_avx2_multi_gather_w7 .type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent ecp_nistz256_avx2_transpose_convert: ecp_nistz256_avx2_convert_transpose_back: ecp_nistz256_avx2_point_add_affine_x4: ecp_nistz256_avx2_point_add_affines_x4: ecp_nistz256_avx2_to_mont: ecp_nistz256_avx2_from_mont: ecp_nistz256_avx2_set1: ecp_nistz256_avx2_multi_gather_w7: .byte 0x0f,0x0b # ud2 ret .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7 .globl ecp_nistz_avx2_eligible .type ecp_nistz_avx2_eligible,\@abi-omnipotent ecp_nistz_avx2_eligible: xor %eax,%eax ret .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible ___ }} foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/geo; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl =================================================================== --- 
head/crypto/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl (revision 364822) @@ -1,4739 +1,4739 @@ #! /usr/bin/env perl # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. # Copyright (c) 2014, Intel Corporation. All Rights Reserved. # Copyright (c) 2015 CloudFlare, Inc. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) # (1) Intel Corporation, Israel Development Center, Haifa, Israel # (2) University of Haifa, Israel # (3) CloudFlare, Inc. # # Reference: # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with # 256 Bit Primes" # Further optimization by : # # this/original with/without -DECP_NISTZ256_ASM(*) # Opteron +15-49% +150-195% # Bulldozer +18-45% +175-240% # P4 +24-46% +100-150% # Westmere +18-34% +87-160% # Sandy Bridge +14-35% +120-185% # Ivy Bridge +11-35% +125-180% # Haswell +10-37% +160-200% # Broadwell +24-58% +210-270% # Atom +20-50% +180-240% # VIA Nano +50-160% +480-480% # # (*) "without -DECP_NISTZ256_ASM" refers to build with # "enable-ec_nistp_64_gcc_128"; # # Ranges denote minimum and maximum improvement coefficients depending # on benchmark. In "this/original" column lower coefficient is for # ECDSA sign, while in "with/without" - for ECDH key agreement, and # higher - for ECDSA sign, relatively fastest server-side operation. # Keep in mind that +100% means 2x improvement. 
$flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); $addx = ($1>=2.23); } if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); $addx = ($1>=2.10); } if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $avx = ($ver>=3.0) + ($ver>=3.01); $addx = ($ver>=3.03); } $code.=<<___; .text .extern OPENSSL_ia32cap_P # The polynomial .align 64 .Lpoly: .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 # 2^512 mod P precomputed for NIST P256 polynomial .LRR: .quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd .LOne: .long 1,1,1,1,1,1,1,1 .LTwo: .long 2,2,2,2,2,2,2,2 .LThree: .long 3,3,3,3,3,3,3,3 .LONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe # Constants for computations modulo ord(p256) .Lord: .quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 .LordK: .quad 0xccd1c8aaee00bc4f ___ { 
################################################################################ # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]); my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); $code.=<<___; .globl ecp_nistz256_mul_by_2 .type ecp_nistz256_mul_by_2,\@function,2 .align 64 ecp_nistz256_mul_by_2: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Lmul_by_2_body: mov 8*0($a_ptr), $a0 xor $t4,$t4 mov 8*1($a_ptr), $a1 add $a0, $a0 # a0:a3+a0:a3 mov 8*2($a_ptr), $a2 adc $a1, $a1 mov 8*3($a_ptr), $a3 lea .Lpoly(%rip), $a_ptr mov $a0, $t0 adc $a2, $a2 adc $a3, $a3 mov $a1, $t1 adc \$0, $t4 sub 8*0($a_ptr), $a0 mov $a2, $t2 sbb 8*1($a_ptr), $a1 sbb 8*2($a_ptr), $a2 mov $a3, $t3 sbb 8*3($a_ptr), $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lmul_by_2_epilogue: ret .cfi_endproc .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 ################################################################################ # void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]); .globl ecp_nistz256_div_by_2 .type ecp_nistz256_div_by_2,\@function,2 .align 32 ecp_nistz256_div_by_2: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Ldiv_by_2_body: mov 8*0($a_ptr), $a0 mov 8*1($a_ptr), $a1 mov 8*2($a_ptr), $a2 mov $a0, $t0 mov 8*3($a_ptr), $a3 lea .Lpoly(%rip), $a_ptr mov $a1, $t1 xor $t4, $t4 add 8*0($a_ptr), $a0 mov $a2, $t2 adc 8*1($a_ptr), $a1 adc 8*2($a_ptr), $a2 mov $a3, $t3 adc 8*3($a_ptr), $a3 adc \$0, $t4 xor $a_ptr, $a_ptr # borrow $a_ptr test \$1, $t0 cmovz $t0, $a0 cmovz $t1, $a1 cmovz $t2, $a2 cmovz $t3, $a3 cmovz $a_ptr, $t4 mov $a1, $t0 # a0:a3>>1 shr \$1, $a0 shl \$63, $t0 mov $a2, $t1 shr \$1, $a1 
or $t0, $a0 shl \$63, $t1 mov $a3, $t2 shr \$1, $a2 or $t1, $a1 shl \$63, $t2 shr \$1, $a3 shl \$63, $t4 or $t2, $a2 or $t4, $a3 mov $a0, 8*0($r_ptr) mov $a1, 8*1($r_ptr) mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Ldiv_by_2_epilogue: ret .cfi_endproc .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 ################################################################################ # void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]); .globl ecp_nistz256_mul_by_3 .type ecp_nistz256_mul_by_3,\@function,2 .align 32 ecp_nistz256_mul_by_3: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Lmul_by_3_body: mov 8*0($a_ptr), $a0 xor $t4, $t4 mov 8*1($a_ptr), $a1 add $a0, $a0 # a0:a3+a0:a3 mov 8*2($a_ptr), $a2 adc $a1, $a1 mov 8*3($a_ptr), $a3 mov $a0, $t0 adc $a2, $a2 adc $a3, $a3 mov $a1, $t1 adc \$0, $t4 sub \$-1, $a0 mov $a2, $t2 sbb .Lpoly+8*1(%rip), $a1 sbb \$0, $a2 mov $a3, $t3 sbb .Lpoly+8*3(%rip), $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 cmovc $t2, $a2 cmovc $t3, $a3 xor $t4, $t4 add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3] adc 8*1($a_ptr), $a1 mov $a0, $t0 adc 8*2($a_ptr), $a2 adc 8*3($a_ptr), $a3 mov $a1, $t1 adc \$0, $t4 sub \$-1, $a0 mov $a2, $t2 sbb .Lpoly+8*1(%rip), $a1 sbb \$0, $a2 mov $a3, $t3 sbb .Lpoly+8*3(%rip), $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lmul_by_3_epilogue: ret .cfi_endproc .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 ################################################################################ # void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]); .globl ecp_nistz256_add .type ecp_nistz256_add,\@function,3 .align 32 ecp_nistz256_add: 
.cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Ladd_body: mov 8*0($a_ptr), $a0 xor $t4, $t4 mov 8*1($a_ptr), $a1 mov 8*2($a_ptr), $a2 mov 8*3($a_ptr), $a3 lea .Lpoly(%rip), $a_ptr add 8*0($b_ptr), $a0 adc 8*1($b_ptr), $a1 mov $a0, $t0 adc 8*2($b_ptr), $a2 adc 8*3($b_ptr), $a3 mov $a1, $t1 adc \$0, $t4 sub 8*0($a_ptr), $a0 mov $a2, $t2 sbb 8*1($a_ptr), $a1 sbb 8*2($a_ptr), $a2 mov $a3, $t3 sbb 8*3($a_ptr), $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Ladd_epilogue: ret .cfi_endproc .size ecp_nistz256_add,.-ecp_nistz256_add ################################################################################ # void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]); .globl ecp_nistz256_sub .type ecp_nistz256_sub,\@function,3 .align 32 ecp_nistz256_sub: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Lsub_body: mov 8*0($a_ptr), $a0 xor $t4, $t4 mov 8*1($a_ptr), $a1 mov 8*2($a_ptr), $a2 mov 8*3($a_ptr), $a3 lea .Lpoly(%rip), $a_ptr sub 8*0($b_ptr), $a0 sbb 8*1($b_ptr), $a1 mov $a0, $t0 sbb 8*2($b_ptr), $a2 sbb 8*3($b_ptr), $a3 mov $a1, $t1 sbb \$0, $t4 add 8*0($a_ptr), $a0 mov $a2, $t2 adc 8*1($a_ptr), $a1 adc 8*2($a_ptr), $a2 mov $a3, $t3 adc 8*3($a_ptr), $a3 test $t4, $t4 cmovz $t0, $a0 cmovz $t1, $a1 mov $a0, 8*0($r_ptr) cmovz $t2, $a2 mov $a1, 8*1($r_ptr) cmovz $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lsub_epilogue: ret .cfi_endproc .size ecp_nistz256_sub,.-ecp_nistz256_sub ################################################################################ # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); .globl ecp_nistz256_neg .type ecp_nistz256_neg,\@function,2 
.align 32 ecp_nistz256_neg: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Lneg_body: xor $a0, $a0 xor $a1, $a1 xor $a2, $a2 xor $a3, $a3 xor $t4, $t4 sub 8*0($a_ptr), $a0 sbb 8*1($a_ptr), $a1 sbb 8*2($a_ptr), $a2 mov $a0, $t0 sbb 8*3($a_ptr), $a3 lea .Lpoly(%rip), $a_ptr mov $a1, $t1 sbb \$0, $t4 add 8*0($a_ptr), $a0 mov $a2, $t2 adc 8*1($a_ptr), $a1 adc 8*2($a_ptr), $a2 mov $a3, $t3 adc 8*3($a_ptr), $a3 test $t4, $t4 cmovz $t0, $a0 cmovz $t1, $a1 mov $a0, 8*0($r_ptr) cmovz $t2, $a2 mov $a1, 8*1($r_ptr) cmovz $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lneg_epilogue: ret .cfi_endproc .size ecp_nistz256_neg,.-ecp_nistz256_neg ___ } { my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); my ($poly1,$poly3)=($acc6,$acc7); $code.=<<___; ################################################################################ # void ecp_nistz256_ord_mul_mont( # uint64_t res[4], # uint64_t a[4], # uint64_t b[4]); .globl ecp_nistz256_ord_mul_mont .type ecp_nistz256_ord_mul_mont,\@function,3 .align 32 ecp_nistz256_ord_mul_mont: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx cmp \$0x80100, %ecx je .Lecp_nistz256_ord_mul_montx ___ $code.=<<___; push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lord_mul_body: mov 8*0($b_org), %rax mov $b_org, $b_ptr lea .Lord(%rip), %r14 mov .LordK(%rip), %r15 ################################# * b[0] mov %rax, $t0 mulq 8*0($a_ptr) mov %rax, $acc0 mov $t0, %rax mov %rdx, $acc1 mulq 8*1($a_ptr) add %rax, $acc1 mov $t0, %rax adc \$0, %rdx mov %rdx, $acc2 mulq 8*2($a_ptr) add %rax, $acc2 mov $t0, %rax adc \$0, %rdx mov 
$acc0, $acc5 imulq %r15,$acc0 mov %rdx, $acc3 mulq 8*3($a_ptr) add %rax, $acc3 mov $acc0, %rax adc \$0, %rdx mov %rdx, $acc4 ################################# First reduction step mulq 8*0(%r14) mov $acc0, $t1 add %rax, $acc5 # guaranteed to be zero mov $acc0, %rax adc \$0, %rdx mov %rdx, $t0 sub $acc0, $acc2 sbb \$0, $acc0 # can't borrow mulq 8*1(%r14) add $t0, $acc1 adc \$0, %rdx add %rax, $acc1 mov $t1, %rax adc %rdx, $acc2 mov $t1, %rdx adc \$0, $acc0 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc3 mov 8*1($b_ptr), %rax sbb %rdx, $t1 # can't borrow add $acc0, $acc3 adc $t1, $acc4 adc \$0, $acc5 ################################# * b[1] mov %rax, $t0 mulq 8*0($a_ptr) add %rax, $acc1 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*1($a_ptr) add $t1, $acc2 adc \$0, %rdx add %rax, $acc2 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*2($a_ptr) add $t1, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t0, %rax adc \$0, %rdx mov $acc1, $t0 imulq %r15, $acc1 mov %rdx, $t1 mulq 8*3($a_ptr) add $t1, $acc4 adc \$0, %rdx xor $acc0, $acc0 add %rax, $acc4 mov $acc1, %rax adc %rdx, $acc5 adc \$0, $acc0 ################################# Second reduction step mulq 8*0(%r14) mov $acc1, $t1 add %rax, $t0 # guaranteed to be zero mov $acc1, %rax adc %rdx, $t0 sub $acc1, $acc3 sbb \$0, $acc1 # can't borrow mulq 8*1(%r14) add $t0, $acc2 adc \$0, %rdx add %rax, $acc2 mov $t1, %rax adc %rdx, $acc3 mov $t1, %rdx adc \$0, $acc1 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc4 mov 8*2($b_ptr), %rax sbb %rdx, $t1 # can't borrow add $acc1, $acc4 adc $t1, $acc5 adc \$0, $acc0 ################################## * b[2] mov %rax, $t0 mulq 8*0($a_ptr) add %rax, $acc2 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*1($a_ptr) add $t1, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*2($a_ptr) add $t1, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t0, %rax adc \$0, %rdx mov $acc2, $t0 imulq %r15, $acc2 mov %rdx, $t1 mulq 8*3($a_ptr) add $t1, 
$acc5 adc \$0, %rdx xor $acc1, $acc1 add %rax, $acc5 mov $acc2, %rax adc %rdx, $acc0 adc \$0, $acc1 ################################# Third reduction step mulq 8*0(%r14) mov $acc2, $t1 add %rax, $t0 # guaranteed to be zero mov $acc2, %rax adc %rdx, $t0 sub $acc2, $acc4 sbb \$0, $acc2 # can't borrow mulq 8*1(%r14) add $t0, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t1, %rax adc %rdx, $acc4 mov $t1, %rdx adc \$0, $acc2 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc5 mov 8*3($b_ptr), %rax sbb %rdx, $t1 # can't borrow add $acc2, $acc5 adc $t1, $acc0 adc \$0, $acc1 ################################# * b[3] mov %rax, $t0 mulq 8*0($a_ptr) add %rax, $acc3 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*1($a_ptr) add $t1, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*2($a_ptr) add $t1, $acc5 adc \$0, %rdx add %rax, $acc5 mov $t0, %rax adc \$0, %rdx mov $acc3, $t0 imulq %r15, $acc3 mov %rdx, $t1 mulq 8*3($a_ptr) add $t1, $acc0 adc \$0, %rdx xor $acc2, $acc2 add %rax, $acc0 mov $acc3, %rax adc %rdx, $acc1 adc \$0, $acc2 ################################# Last reduction step mulq 8*0(%r14) mov $acc3, $t1 add %rax, $t0 # guaranteed to be zero mov $acc3, %rax adc %rdx, $t0 sub $acc3, $acc5 sbb \$0, $acc3 # can't borrow mulq 8*1(%r14) add $t0, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t1, %rax adc %rdx, $acc5 mov $t1, %rdx adc \$0, $acc3 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc0 sbb %rdx, $t1 # can't borrow add $acc3, $acc0 adc $t1, $acc1 adc \$0, $acc2 ################################# Subtract ord mov $acc4, $a_ptr sub 8*0(%r14), $acc4 mov $acc5, $acc3 sbb 8*1(%r14), $acc5 mov $acc0, $t0 sbb 8*2(%r14), $acc0 mov $acc1, $t1 sbb 8*3(%r14), $acc1 sbb \$0, $acc2 cmovc $a_ptr, $acc4 cmovc $acc3, $acc5 cmovc $t0, $acc0 cmovc $t1, $acc1 mov $acc4, 8*0($r_ptr) mov $acc5, 8*1($r_ptr) mov $acc0, 8*2($r_ptr) mov $acc1, 8*3($r_ptr) mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 
.cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_mul_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont ################################################################################ # void ecp_nistz256_ord_sqr_mont( # uint64_t res[4], # uint64_t a[4], # int rep); .globl ecp_nistz256_ord_sqr_mont .type ecp_nistz256_ord_sqr_mont,\@function,3 .align 32 ecp_nistz256_ord_sqr_mont: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx cmp \$0x80100, %ecx je .Lecp_nistz256_ord_sqr_montx ___ $code.=<<___; push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lord_sqr_body: mov 8*0($a_ptr), $acc0 mov 8*1($a_ptr), %rax mov 8*2($a_ptr), $acc6 mov 8*3($a_ptr), $acc7 lea .Lord(%rip), $a_ptr # pointer to modulus mov $b_org, $b_ptr jmp .Loop_ord_sqr .align 32 .Loop_ord_sqr: ################################# a[1:] * a[0] mov %rax, $t1 # put aside a[1] mul $acc0 # a[1] * a[0] mov %rax, $acc1 movq $t1, %xmm1 # offload a[1] mov $acc6, %rax mov %rdx, $acc2 mul $acc0 # a[2] * a[0] add %rax, $acc2 mov $acc7, %rax movq $acc6, %xmm2 # offload a[2] adc \$0, %rdx mov %rdx, $acc3 mul $acc0 # a[3] * a[0] add %rax, $acc3 mov $acc7, %rax movq $acc7, %xmm3 # offload a[3] adc \$0, %rdx mov %rdx, $acc4 ################################# a[3] * a[2] mul $acc6 # a[3] * a[2] mov %rax, $acc5 mov $acc6, %rax mov %rdx, $acc6 ################################# a[2:] * a[1] mul $t1 # a[2] * a[1] add %rax, $acc3 mov $acc7, %rax adc \$0, %rdx mov %rdx, $acc7 mul $t1 # a[3] * a[1] add %rax, $acc4 adc \$0, %rdx add $acc7, $acc4 adc %rdx, $acc5 adc \$0, $acc6 # can't overflow ################################# *2 xor $acc7, $acc7 mov $acc0, %rax add $acc1, $acc1 adc $acc2, $acc2 adc $acc3, $acc3 adc $acc4, 
$acc4 adc $acc5, $acc5 adc $acc6, $acc6 adc \$0, $acc7 ################################# Missing products mul %rax # a[0] * a[0] mov %rax, $acc0 movq %xmm1, %rax mov %rdx, $t1 mul %rax # a[1] * a[1] add $t1, $acc1 adc %rax, $acc2 movq %xmm2, %rax adc \$0, %rdx mov %rdx, $t1 mul %rax # a[2] * a[2] add $t1, $acc3 adc %rax, $acc4 movq %xmm3, %rax adc \$0, %rdx mov %rdx, $t1 mov $acc0, $t0 imulq 8*4($a_ptr), $acc0 # *= .LordK mul %rax # a[3] * a[3] add $t1, $acc5 adc %rax, $acc6 mov 8*0($a_ptr), %rax # modulus[0] adc %rdx, $acc7 # can't overflow ################################# First reduction step mul $acc0 mov $acc0, $t1 add %rax, $t0 # guaranteed to be zero mov 8*1($a_ptr), %rax # modulus[1] adc %rdx, $t0 sub $acc0, $acc2 sbb \$0, $t1 # can't borrow mul $acc0 add $t0, $acc1 adc \$0, %rdx add %rax, $acc1 mov $acc0, %rax adc %rdx, $acc2 mov $acc0, %rdx adc \$0, $t1 # can't overflow mov $acc1, $t0 imulq 8*4($a_ptr), $acc1 # *= .LordK shl \$32, %rax shr \$32, %rdx sub %rax, $acc3 mov 8*0($a_ptr), %rax sbb %rdx, $acc0 # can't borrow add $t1, $acc3 adc \$0, $acc0 # can't overflow ################################# Second reduction step mul $acc1 mov $acc1, $t1 add %rax, $t0 # guaranteed to be zero mov 8*1($a_ptr), %rax adc %rdx, $t0 sub $acc1, $acc3 sbb \$0, $t1 # can't borrow mul $acc1 add $t0, $acc2 adc \$0, %rdx add %rax, $acc2 mov $acc1, %rax adc %rdx, $acc3 mov $acc1, %rdx adc \$0, $t1 # can't overflow mov $acc2, $t0 imulq 8*4($a_ptr), $acc2 # *= .LordK shl \$32, %rax shr \$32, %rdx sub %rax, $acc0 mov 8*0($a_ptr), %rax sbb %rdx, $acc1 # can't borrow add $t1, $acc0 adc \$0, $acc1 # can't overflow ################################# Third reduction step mul $acc2 mov $acc2, $t1 add %rax, $t0 # guaranteed to be zero mov 8*1($a_ptr), %rax adc %rdx, $t0 sub $acc2, $acc0 sbb \$0, $t1 # can't borrow mul $acc2 add $t0, $acc3 adc \$0, %rdx add %rax, $acc3 mov $acc2, %rax adc %rdx, $acc0 mov $acc2, %rdx adc \$0, $t1 # can't overflow mov $acc3, $t0 imulq 8*4($a_ptr), $acc3 # *= 
.LordK shl \$32, %rax shr \$32, %rdx sub %rax, $acc1 mov 8*0($a_ptr), %rax sbb %rdx, $acc2 # can't borrow add $t1, $acc1 adc \$0, $acc2 # can't overflow ################################# Last reduction step mul $acc3 mov $acc3, $t1 add %rax, $t0 # guaranteed to be zero mov 8*1($a_ptr), %rax adc %rdx, $t0 sub $acc3, $acc1 sbb \$0, $t1 # can't borrow mul $acc3 add $t0, $acc0 adc \$0, %rdx add %rax, $acc0 mov $acc3, %rax adc %rdx, $acc1 mov $acc3, %rdx adc \$0, $t1 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc2 sbb %rdx, $acc3 # can't borrow add $t1, $acc2 adc \$0, $acc3 # can't overflow ################################# Add bits [511:256] of the sqr result xor %rdx, %rdx add $acc4, $acc0 adc $acc5, $acc1 mov $acc0, $acc4 adc $acc6, $acc2 adc $acc7, $acc3 mov $acc1, %rax adc \$0, %rdx ################################# Compare to modulus sub 8*0($a_ptr), $acc0 mov $acc2, $acc6 sbb 8*1($a_ptr), $acc1 sbb 8*2($a_ptr), $acc2 mov $acc3, $acc7 sbb 8*3($a_ptr), $acc3 sbb \$0, %rdx cmovc $acc4, $acc0 cmovnc $acc1, %rax cmovnc $acc2, $acc6 cmovnc $acc3, $acc7 dec $b_ptr jnz .Loop_ord_sqr mov $acc0, 8*0($r_ptr) mov %rax, 8*1($r_ptr) pxor %xmm1, %xmm1 mov $acc6, 8*2($r_ptr) pxor %xmm2, %xmm2 mov $acc7, 8*3($r_ptr) pxor %xmm3, %xmm3 mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_sqr_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont ___ $code.=<<___ if ($addx); ################################################################################ .type ecp_nistz256_ord_mul_montx,\@function,3 .align 32 ecp_nistz256_ord_mul_montx: .cfi_startproc .Lecp_nistz256_ord_mul_montx: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 
.Lord_mulx_body: mov $b_org, $b_ptr mov 8*0($b_org), %rdx mov 8*0($a_ptr), $acc1 mov 8*1($a_ptr), $acc2 mov 8*2($a_ptr), $acc3 mov 8*3($a_ptr), $acc4 lea -128($a_ptr), $a_ptr # control u-op density lea .Lord-128(%rip), %r14 mov .LordK(%rip), %r15 ################################# Multiply by b[0] mulx $acc1, $acc0, $acc1 mulx $acc2, $t0, $acc2 mulx $acc3, $t1, $acc3 add $t0, $acc1 mulx $acc4, $t0, $acc4 mov $acc0, %rdx mulx %r15, %rdx, %rax adc $t1, $acc2 adc $t0, $acc3 adc \$0, $acc4 ################################# reduction xor $acc5, $acc5 # $acc5=0, cf=0, of=0 mulx 8*0+128(%r14), $t0, $t1 adcx $t0, $acc0 # guaranteed to be zero adox $t1, $acc1 mulx 8*1+128(%r14), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 mulx 8*2+128(%r14), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*3+128(%r14), $t0, $t1 mov 8*1($b_ptr), %rdx adcx $t0, $acc3 adox $t1, $acc4 adcx $acc0, $acc4 adox $acc0, $acc5 adc \$0, $acc5 # cf=0, of=0 ################################# Multiply by b[1] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc1, %rdx mulx %r15, %rdx, %rax adcx $t0, $acc4 adox $t1, $acc5 adcx $acc0, $acc5 adox $acc0, $acc0 adc \$0, $acc0 # cf=0, of=0 ################################# reduction mulx 8*0+128(%r14), $t0, $t1 adcx $t0, $acc1 # guaranteed to be zero adox $t1, $acc2 mulx 8*1+128(%r14), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*2+128(%r14), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*3+128(%r14), $t0, $t1 mov 8*2($b_ptr), %rdx adcx $t0, $acc4 adox $t1, $acc5 adcx $acc1, $acc5 adox $acc1, $acc0 adc \$0, $acc0 # cf=0, of=0 ################################# Multiply by b[2] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*3+128($a_ptr), $t0, 
$t1 mov $acc2, %rdx mulx %r15, %rdx, %rax adcx $t0, $acc5 adox $t1, $acc0 adcx $acc1, $acc0 adox $acc1, $acc1 adc \$0, $acc1 # cf=0, of=0 ################################# reduction mulx 8*0+128(%r14), $t0, $t1 adcx $t0, $acc2 # guaranteed to be zero adox $t1, $acc3 mulx 8*1+128(%r14), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*2+128(%r14), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*3+128(%r14), $t0, $t1 mov 8*3($b_ptr), %rdx adcx $t0, $acc5 adox $t1, $acc0 adcx $acc2, $acc0 adox $acc2, $acc1 adc \$0, $acc1 # cf=0, of=0 ################################# Multiply by b[3] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc5 adox $t1, $acc0 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc3, %rdx mulx %r15, %rdx, %rax adcx $t0, $acc0 adox $t1, $acc1 adcx $acc2, $acc1 adox $acc2, $acc2 adc \$0, $acc2 # cf=0, of=0 ################################# reduction mulx 8*0+128(%r14), $t0, $t1 adcx $t0, $acc3 # guaranteed to be zero adox $t1, $acc4 mulx 8*1+128(%r14), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*2+128(%r14), $t0, $t1 adcx $t0, $acc5 adox $t1, $acc0 mulx 8*3+128(%r14), $t0, $t1 lea 128(%r14),%r14 mov $acc4, $t2 adcx $t0, $acc0 adox $t1, $acc1 mov $acc5, $t3 adcx $acc3, $acc1 adox $acc3, $acc2 adc \$0, $acc2 ################################# # Branch-less conditional subtraction of P mov $acc0, $t0 sub 8*0(%r14), $acc4 sbb 8*1(%r14), $acc5 sbb 8*2(%r14), $acc0 mov $acc1, $t1 sbb 8*3(%r14), $acc1 sbb \$0, $acc2 cmovc $t2, $acc4 cmovc $t3, $acc5 cmovc $t0, $acc0 cmovc $t1, $acc1 mov $acc4, 8*0($r_ptr) mov $acc5, 8*1($r_ptr) mov $acc0, 8*2($r_ptr) mov $acc1, 8*3($r_ptr) mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_mulx_epilogue: 
ret .cfi_endproc .size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx .type ecp_nistz256_ord_sqr_montx,\@function,3 .align 32 ecp_nistz256_ord_sqr_montx: .cfi_startproc .Lecp_nistz256_ord_sqr_montx: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lord_sqrx_body: mov $b_org, $b_ptr mov 8*0($a_ptr), %rdx mov 8*1($a_ptr), $acc6 mov 8*2($a_ptr), $acc7 mov 8*3($a_ptr), $acc0 lea .Lord(%rip), $a_ptr jmp .Loop_ord_sqrx .align 32 .Loop_ord_sqrx: mulx $acc6, $acc1, $acc2 # a[0]*a[1] mulx $acc7, $t0, $acc3 # a[0]*a[2] mov %rdx, %rax # offload a[0] movq $acc6, %xmm1 # offload a[1] mulx $acc0, $t1, $acc4 # a[0]*a[3] mov $acc6, %rdx add $t0, $acc2 movq $acc7, %xmm2 # offload a[2] adc $t1, $acc3 adc \$0, $acc4 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 ################################# mulx $acc7, $t0, $t1 # a[1]*a[2] adcx $t0, $acc3 adox $t1, $acc4 mulx $acc0, $t0, $t1 # a[1]*a[3] mov $acc7, %rdx adcx $t0, $acc4 adox $t1, $acc5 adc \$0, $acc5 ################################# mulx $acc0, $t0, $acc6 # a[2]*a[3] mov %rax, %rdx movq $acc0, %xmm3 # offload a[3] xor $acc7, $acc7 # $acc7=0,cf=0,of=0 adcx $acc1, $acc1 # acc1:6<<1 adox $t0, $acc5 adcx $acc2, $acc2 adox $acc7, $acc6 # of=0 ################################# a[i]*a[i] mulx %rdx, $acc0, $t1 movq %xmm1, %rdx adcx $acc3, $acc3 adox $t1, $acc1 adcx $acc4, $acc4 mulx %rdx, $t0, $t4 movq %xmm2, %rdx adcx $acc5, $acc5 adox $t0, $acc2 adcx $acc6, $acc6 mulx %rdx, $t0, $t1 .byte 0x67 movq %xmm3, %rdx adox $t4, $acc3 adcx $acc7, $acc7 adox $t0, $acc4 adox $t1, $acc5 mulx %rdx, $t0, $t4 adox $t0, $acc6 adox $t4, $acc7 ################################# reduction mov $acc0, %rdx mulx 8*4($a_ptr), %rdx, $t0 xor %rax, %rax # cf=0, of=0 mulx 8*0($a_ptr), $t0, $t1 adcx $t0, $acc0 # guaranteed to be zero adox $t1, $acc1 mulx 8*1($a_ptr), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 mulx 8*2($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 
mulx 8*3($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc0 # of=0 adcx %rax, $acc0 # cf=0 ################################# mov $acc1, %rdx mulx 8*4($a_ptr), %rdx, $t0 mulx 8*0($a_ptr), $t0, $t1 adox $t0, $acc1 # guaranteed to be zero adcx $t1, $acc2 mulx 8*1($a_ptr), $t0, $t1 adox $t0, $acc2 adcx $t1, $acc3 mulx 8*2($a_ptr), $t0, $t1 adox $t0, $acc3 adcx $t1, $acc0 mulx 8*3($a_ptr), $t0, $t1 adox $t0, $acc0 adcx $t1, $acc1 # cf=0 adox %rax, $acc1 # of=0 ################################# mov $acc2, %rdx mulx 8*4($a_ptr), %rdx, $t0 mulx 8*0($a_ptr), $t0, $t1 adcx $t0, $acc2 # guaranteed to be zero adox $t1, $acc3 mulx 8*1($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc0 mulx 8*2($a_ptr), $t0, $t1 adcx $t0, $acc0 adox $t1, $acc1 mulx 8*3($a_ptr), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 # of=0 adcx %rax, $acc2 # cf=0 ################################# mov $acc3, %rdx mulx 8*4($a_ptr), %rdx, $t0 mulx 8*0($a_ptr), $t0, $t1 adox $t0, $acc3 # guaranteed to be zero adcx $t1, $acc0 mulx 8*1($a_ptr), $t0, $t1 adox $t0, $acc0 adcx $t1, $acc1 mulx 8*2($a_ptr), $t0, $t1 adox $t0, $acc1 adcx $t1, $acc2 mulx 8*3($a_ptr), $t0, $t1 adox $t0, $acc2 adcx $t1, $acc3 adox %rax, $acc3 ################################# accumulate upper half add $acc0, $acc4 # add $acc4, $acc0 adc $acc5, $acc1 mov $acc4, %rdx adc $acc6, $acc2 adc $acc7, $acc3 mov $acc1, $acc6 adc \$0, %rax ################################# compare to modulus sub 8*0($a_ptr), $acc4 mov $acc2, $acc7 sbb 8*1($a_ptr), $acc1 sbb 8*2($a_ptr), $acc2 mov $acc3, $acc0 sbb 8*3($a_ptr), $acc3 sbb \$0, %rax cmovnc $acc4, %rdx cmovnc $acc1, $acc6 cmovnc $acc2, $acc7 cmovnc $acc3, $acc0 dec $b_ptr jnz .Loop_ord_sqrx mov %rdx, 8*0($r_ptr) mov $acc6, 8*1($r_ptr) pxor %xmm1, %xmm1 mov $acc7, 8*2($r_ptr) pxor %xmm2, %xmm2 mov $acc0, 8*3($r_ptr) pxor %xmm3, %xmm3 mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx 
mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_sqrx_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx ___ $code.=<<___; ################################################################################ # void ecp_nistz256_to_mont( # uint64_t res[4], # uint64_t in[4]); .globl ecp_nistz256_to_mont .type ecp_nistz256_to_mont,\@function,2 .align 32 ecp_nistz256_to_mont: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx ___ $code.=<<___; lea .LRR(%rip), $b_org jmp .Lmul_mont .cfi_endproc .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont ################################################################################ # void ecp_nistz256_mul_mont( # uint64_t res[4], # uint64_t a[4], # uint64_t b[4]); .globl ecp_nistz256_mul_mont .type ecp_nistz256_mul_mont,\@function,3 .align 32 ecp_nistz256_mul_mont: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx ___ $code.=<<___; .Lmul_mont: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmul_body: ___ $code.=<<___ if ($addx); cmp \$0x80100, %ecx je .Lmul_montx ___ $code.=<<___; mov $b_org, $b_ptr mov 8*0($b_org), %rax mov 8*0($a_ptr), $acc1 mov 8*1($a_ptr), $acc2 mov 8*2($a_ptr), $acc3 mov 8*3($a_ptr), $acc4 call __ecp_nistz256_mul_montq ___ $code.=<<___ if ($addx); jmp .Lmul_mont_done .align 32 .Lmul_montx: mov $b_org, $b_ptr mov 8*0($b_org), %rdx mov 8*0($a_ptr), $acc1 mov 8*1($a_ptr), $acc2 mov 8*2($a_ptr), $acc3 mov 8*3($a_ptr), $acc4 lea -128($a_ptr), $a_ptr # control u-op density call __ecp_nistz256_mul_montx ___ $code.=<<___; .Lmul_mont_done: mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore 
%rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lmul_epilogue: ret .cfi_endproc .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont .type __ecp_nistz256_mul_montq,\@abi-omnipotent .align 32 __ecp_nistz256_mul_montq: .cfi_startproc ######################################################################## # Multiply a by b[0] mov %rax, $t1 mulq $acc1 mov .Lpoly+8*1(%rip),$poly1 mov %rax, $acc0 mov $t1, %rax mov %rdx, $acc1 mulq $acc2 mov .Lpoly+8*3(%rip),$poly3 add %rax, $acc1 mov $t1, %rax adc \$0, %rdx mov %rdx, $acc2 mulq $acc3 add %rax, $acc2 mov $t1, %rax adc \$0, %rdx mov %rdx, $acc3 mulq $acc4 add %rax, $acc3 mov $acc0, %rax adc \$0, %rdx xor $acc5, $acc5 mov %rdx, $acc4 ######################################################################## # First reduction step # Basically now we want to multiply acc[0] by p256, # and add the result to the acc. # Due to the special form of p256 we do some optimizations # # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] # then we add acc[0] and get acc[0] x 2^96 mov $acc0, $t1 shl \$32, $acc0 mulq $poly3 shr \$32, $t1 add $acc0, $acc1 # +=acc[0]<<96 adc $t1, $acc2 adc %rax, $acc3 mov 8*1($b_ptr), %rax adc %rdx, $acc4 adc \$0, $acc5 xor $acc0, $acc0 ######################################################################## # Multiply by b[1] mov %rax, $t1 mulq 8*0($a_ptr) add %rax, $acc1 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*1($a_ptr) add $t0, $acc2 adc \$0, %rdx add %rax, $acc2 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*2($a_ptr) add $t0, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*3($a_ptr) add $t0, $acc4 adc \$0, %rdx add %rax, $acc4 mov $acc1, %rax adc %rdx, $acc5 adc \$0, $acc0 ######################################################################## # Second reduction step mov $acc1, $t1 shl \$32, $acc1 mulq $poly3 shr \$32, $t1 add $acc1, $acc2 adc $t1, $acc3 adc %rax, $acc4 mov 8*2($b_ptr), %rax adc %rdx, $acc5 adc \$0, $acc0 xor $acc1, $acc1 
######################################################################## # Multiply by b[2] mov %rax, $t1 mulq 8*0($a_ptr) add %rax, $acc2 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*1($a_ptr) add $t0, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*2($a_ptr) add $t0, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*3($a_ptr) add $t0, $acc5 adc \$0, %rdx add %rax, $acc5 mov $acc2, %rax adc %rdx, $acc0 adc \$0, $acc1 ######################################################################## # Third reduction step mov $acc2, $t1 shl \$32, $acc2 mulq $poly3 shr \$32, $t1 add $acc2, $acc3 adc $t1, $acc4 adc %rax, $acc5 mov 8*3($b_ptr), %rax adc %rdx, $acc0 adc \$0, $acc1 xor $acc2, $acc2 ######################################################################## # Multiply by b[3] mov %rax, $t1 mulq 8*0($a_ptr) add %rax, $acc3 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*1($a_ptr) add $t0, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*2($a_ptr) add $t0, $acc5 adc \$0, %rdx add %rax, $acc5 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*3($a_ptr) add $t0, $acc0 adc \$0, %rdx add %rax, $acc0 mov $acc3, %rax adc %rdx, $acc1 adc \$0, $acc2 ######################################################################## # Final reduction step mov $acc3, $t1 shl \$32, $acc3 mulq $poly3 shr \$32, $t1 add $acc3, $acc4 adc $t1, $acc5 mov $acc4, $t0 adc %rax, $acc0 adc %rdx, $acc1 mov $acc5, $t1 adc \$0, $acc2 ######################################################################## # Branch-less conditional subtraction of P sub \$-1, $acc4 # .Lpoly[0] mov $acc0, $t2 sbb $poly1, $acc5 # .Lpoly[1] sbb \$0, $acc0 # .Lpoly[2] mov $acc1, $t3 sbb $poly3, $acc1 # .Lpoly[3] sbb \$0, $acc2 cmovc $t0, $acc4 cmovc $t1, $acc5 mov $acc4, 8*0($r_ptr) cmovc $t2, $acc0 mov $acc5, 8*1($r_ptr) cmovc $t3, $acc1 mov $acc0, 8*2($r_ptr) mov $acc1, 8*3($r_ptr) ret .cfi_endproc .size 
__ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq ################################################################################ # void ecp_nistz256_sqr_mont( # uint64_t res[4], # uint64_t a[4]); # we optimize the square according to S.Gueron and V.Krasnov, # "Speeding up Big-Number Squaring" .globl ecp_nistz256_sqr_mont .type ecp_nistz256_sqr_mont,\@function,2 .align 32 ecp_nistz256_sqr_mont: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx ___ $code.=<<___; push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lsqr_body: ___ $code.=<<___ if ($addx); cmp \$0x80100, %ecx je .Lsqr_montx ___ $code.=<<___; mov 8*0($a_ptr), %rax mov 8*1($a_ptr), $acc6 mov 8*2($a_ptr), $acc7 mov 8*3($a_ptr), $acc0 call __ecp_nistz256_sqr_montq ___ $code.=<<___ if ($addx); jmp .Lsqr_mont_done .align 32 .Lsqr_montx: mov 8*0($a_ptr), %rdx mov 8*1($a_ptr), $acc6 mov 8*2($a_ptr), $acc7 mov 8*3($a_ptr), $acc0 lea -128($a_ptr), $a_ptr # control u-op density call __ecp_nistz256_sqr_montx ___ $code.=<<___; .Lsqr_mont_done: mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lsqr_epilogue: ret .cfi_endproc .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont .type __ecp_nistz256_sqr_montq,\@abi-omnipotent .align 32 __ecp_nistz256_sqr_montq: .cfi_startproc mov %rax, $acc5 mulq $acc6 # a[1]*a[0] mov %rax, $acc1 mov $acc7, %rax mov %rdx, $acc2 mulq $acc5 # a[0]*a[2] add %rax, $acc2 mov $acc0, %rax adc \$0, %rdx mov %rdx, $acc3 mulq $acc5 # a[0]*a[3] add %rax, $acc3 mov $acc7, %rax adc \$0, %rdx mov %rdx, $acc4 ################################# mulq $acc6 # a[1]*a[2] add %rax, $acc3 mov $acc0, %rax adc \$0, %rdx mov %rdx, $t1 mulq $acc6 # a[1]*a[3] 
add %rax, $acc4 mov $acc0, %rax adc \$0, %rdx add $t1, $acc4 mov %rdx, $acc5 adc \$0, $acc5 ################################# mulq $acc7 # a[2]*a[3] xor $acc7, $acc7 add %rax, $acc5 mov 8*0($a_ptr), %rax mov %rdx, $acc6 adc \$0, $acc6 add $acc1, $acc1 # acc1:6<<1 adc $acc2, $acc2 adc $acc3, $acc3 adc $acc4, $acc4 adc $acc5, $acc5 adc $acc6, $acc6 adc \$0, $acc7 mulq %rax mov %rax, $acc0 mov 8*1($a_ptr), %rax mov %rdx, $t0 mulq %rax add $t0, $acc1 adc %rax, $acc2 mov 8*2($a_ptr), %rax adc \$0, %rdx mov %rdx, $t0 mulq %rax add $t0, $acc3 adc %rax, $acc4 mov 8*3($a_ptr), %rax adc \$0, %rdx mov %rdx, $t0 mulq %rax add $t0, $acc5 adc %rax, $acc6 mov $acc0, %rax adc %rdx, $acc7 mov .Lpoly+8*1(%rip), $a_ptr mov .Lpoly+8*3(%rip), $t1 ########################################## # Now the reduction # First iteration mov $acc0, $t0 shl \$32, $acc0 mulq $t1 shr \$32, $t0 add $acc0, $acc1 # +=acc[0]<<96 adc $t0, $acc2 adc %rax, $acc3 mov $acc1, %rax adc \$0, %rdx ########################################## # Second iteration mov $acc1, $t0 shl \$32, $acc1 mov %rdx, $acc0 mulq $t1 shr \$32, $t0 add $acc1, $acc2 adc $t0, $acc3 adc %rax, $acc0 mov $acc2, %rax adc \$0, %rdx ########################################## # Third iteration mov $acc2, $t0 shl \$32, $acc2 mov %rdx, $acc1 mulq $t1 shr \$32, $t0 add $acc2, $acc3 adc $t0, $acc0 adc %rax, $acc1 mov $acc3, %rax adc \$0, %rdx ########################################### # Last iteration mov $acc3, $t0 shl \$32, $acc3 mov %rdx, $acc2 mulq $t1 shr \$32, $t0 add $acc3, $acc0 adc $t0, $acc1 adc %rax, $acc2 adc \$0, %rdx xor $acc3, $acc3 ############################################ # Add the rest of the acc add $acc0, $acc4 adc $acc1, $acc5 mov $acc4, $acc0 adc $acc2, $acc6 adc %rdx, $acc7 mov $acc5, $acc1 adc \$0, $acc3 sub \$-1, $acc4 # .Lpoly[0] mov $acc6, $acc2 sbb $a_ptr, $acc5 # .Lpoly[1] sbb \$0, $acc6 # .Lpoly[2] mov $acc7, $t0 sbb $t1, $acc7 # .Lpoly[3] sbb \$0, $acc3 cmovc $acc0, $acc4 cmovc $acc1, $acc5 mov $acc4, 8*0($r_ptr) 
cmovc $acc2, $acc6 mov $acc5, 8*1($r_ptr) cmovc $t0, $acc7 mov $acc6, 8*2($r_ptr) mov $acc7, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq ___ if ($addx) { $code.=<<___; .type __ecp_nistz256_mul_montx,\@abi-omnipotent .align 32 __ecp_nistz256_mul_montx: .cfi_startproc ######################################################################## # Multiply by b[0] mulx $acc1, $acc0, $acc1 mulx $acc2, $t0, $acc2 mov \$32, $poly1 xor $acc5, $acc5 # cf=0 mulx $acc3, $t1, $acc3 mov .Lpoly+8*3(%rip), $poly3 adc $t0, $acc1 mulx $acc4, $t0, $acc4 mov $acc0, %rdx adc $t1, $acc2 shlx $poly1,$acc0,$t1 adc $t0, $acc3 shrx $poly1,$acc0,$t0 adc \$0, $acc4 ######################################################################## # First reduction step add $t1, $acc1 adc $t0, $acc2 mulx $poly3, $t0, $t1 mov 8*1($b_ptr), %rdx adc $t0, $acc3 adc $t1, $acc4 adc \$0, $acc5 xor $acc0, $acc0 # $acc0=0,cf=0,of=0 ######################################################################## # Multiply by b[1] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc1, %rdx adcx $t0, $acc4 shlx $poly1, $acc1, $t0 adox $t1, $acc5 shrx $poly1, $acc1, $t1 adcx $acc0, $acc5 adox $acc0, $acc0 adc \$0, $acc0 ######################################################################## # Second reduction step add $t0, $acc2 adc $t1, $acc3 mulx $poly3, $t0, $t1 mov 8*2($b_ptr), %rdx adc $t0, $acc4 adc $t1, $acc5 adc \$0, $acc0 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 ######################################################################## # Multiply by b[2] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc2, %rdx adcx $t0, $acc5 shlx 
$poly1, $acc2, $t0 adox $t1, $acc0 shrx $poly1, $acc2, $t1 adcx $acc1, $acc0 adox $acc1, $acc1 adc \$0, $acc1 ######################################################################## # Third reduction step add $t0, $acc3 adc $t1, $acc4 mulx $poly3, $t0, $t1 mov 8*3($b_ptr), %rdx adc $t0, $acc5 adc $t1, $acc0 adc \$0, $acc1 xor $acc2, $acc2 # $acc2=0,cf=0,of=0 ######################################################################## # Multiply by b[3] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc5 adox $t1, $acc0 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc3, %rdx adcx $t0, $acc0 shlx $poly1, $acc3, $t0 adox $t1, $acc1 shrx $poly1, $acc3, $t1 adcx $acc2, $acc1 adox $acc2, $acc2 adc \$0, $acc2 ######################################################################## # Fourth reduction step add $t0, $acc4 adc $t1, $acc5 mulx $poly3, $t0, $t1 mov $acc4, $t2 mov .Lpoly+8*1(%rip), $poly1 adc $t0, $acc0 mov $acc5, $t3 adc $t1, $acc1 adc \$0, $acc2 ######################################################################## # Branch-less conditional subtraction of P xor %eax, %eax mov $acc0, $t0 sbb \$-1, $acc4 # .Lpoly[0] sbb $poly1, $acc5 # .Lpoly[1] sbb \$0, $acc0 # .Lpoly[2] mov $acc1, $t1 sbb $poly3, $acc1 # .Lpoly[3] sbb \$0, $acc2 cmovc $t2, $acc4 cmovc $t3, $acc5 mov $acc4, 8*0($r_ptr) cmovc $t0, $acc0 mov $acc5, 8*1($r_ptr) cmovc $t1, $acc1 mov $acc0, 8*2($r_ptr) mov $acc1, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx .type __ecp_nistz256_sqr_montx,\@abi-omnipotent .align 32 __ecp_nistz256_sqr_montx: .cfi_startproc mulx $acc6, $acc1, $acc2 # a[0]*a[1] mulx $acc7, $t0, $acc3 # a[0]*a[2] xor %eax, %eax adc $t0, $acc2 mulx $acc0, $t1, $acc4 # a[0]*a[3] mov $acc6, %rdx adc $t1, $acc3 adc \$0, $acc4 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 ################################# mulx $acc7, $t0, $t1 # a[1]*a[2] adcx $t0, 
$acc3 adox $t1, $acc4 mulx $acc0, $t0, $t1 # a[1]*a[3] mov $acc7, %rdx adcx $t0, $acc4 adox $t1, $acc5 adc \$0, $acc5 ################################# mulx $acc0, $t0, $acc6 # a[2]*a[3] mov 8*0+128($a_ptr), %rdx xor $acc7, $acc7 # $acc7=0,cf=0,of=0 adcx $acc1, $acc1 # acc1:6<<1 adox $t0, $acc5 adcx $acc2, $acc2 adox $acc7, $acc6 # of=0 mulx %rdx, $acc0, $t1 mov 8*1+128($a_ptr), %rdx adcx $acc3, $acc3 adox $t1, $acc1 adcx $acc4, $acc4 mulx %rdx, $t0, $t4 mov 8*2+128($a_ptr), %rdx adcx $acc5, $acc5 adox $t0, $acc2 adcx $acc6, $acc6 .byte 0x67 mulx %rdx, $t0, $t1 mov 8*3+128($a_ptr), %rdx adox $t4, $acc3 adcx $acc7, $acc7 adox $t0, $acc4 mov \$32, $a_ptr adox $t1, $acc5 .byte 0x67,0x67 mulx %rdx, $t0, $t4 mov .Lpoly+8*3(%rip), %rdx adox $t0, $acc6 shlx $a_ptr, $acc0, $t0 adox $t4, $acc7 shrx $a_ptr, $acc0, $t4 mov %rdx,$t1 # reduction step 1 add $t0, $acc1 adc $t4, $acc2 mulx $acc0, $t0, $acc0 adc $t0, $acc3 shlx $a_ptr, $acc1, $t0 adc \$0, $acc0 shrx $a_ptr, $acc1, $t4 # reduction step 2 add $t0, $acc2 adc $t4, $acc3 mulx $acc1, $t0, $acc1 adc $t0, $acc0 shlx $a_ptr, $acc2, $t0 adc \$0, $acc1 shrx $a_ptr, $acc2, $t4 # reduction step 3 add $t0, $acc3 adc $t4, $acc0 mulx $acc2, $t0, $acc2 adc $t0, $acc1 shlx $a_ptr, $acc3, $t0 adc \$0, $acc2 shrx $a_ptr, $acc3, $t4 # reduction step 4 add $t0, $acc0 adc $t4, $acc1 mulx $acc3, $t0, $acc3 adc $t0, $acc2 adc \$0, $acc3 xor $t3, $t3 add $acc0, $acc4 # accumulate upper half mov .Lpoly+8*1(%rip), $a_ptr adc $acc1, $acc5 mov $acc4, $acc0 adc $acc2, $acc6 adc $acc3, $acc7 mov $acc5, $acc1 adc \$0, $t3 sub \$-1, $acc4 # .Lpoly[0] mov $acc6, $acc2 sbb $a_ptr, $acc5 # .Lpoly[1] sbb \$0, $acc6 # .Lpoly[2] mov $acc7, $acc3 sbb $t1, $acc7 # .Lpoly[3] sbb \$0, $t3 cmovc $acc0, $acc4 cmovc $acc1, $acc5 mov $acc4, 8*0($r_ptr) cmovc $acc2, $acc6 mov $acc5, 8*1($r_ptr) cmovc $acc3, $acc7 mov $acc6, 8*2($r_ptr) mov $acc7, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx ___ } } { my 
($r_ptr,$in_ptr)=("%rdi","%rsi"); my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11)); my ($t0,$t1,$t2)=("%rcx","%r12","%r13"); $code.=<<___; ################################################################################ # void ecp_nistz256_from_mont( # uint64_t res[4], # uint64_t in[4]); # This one performs Montgomery multiplication by 1, so we only need the reduction .globl ecp_nistz256_from_mont .type ecp_nistz256_from_mont,\@function,2 .align 32 ecp_nistz256_from_mont: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Lfrom_body: mov 8*0($in_ptr), %rax mov .Lpoly+8*3(%rip), $t2 mov 8*1($in_ptr), $acc1 mov 8*2($in_ptr), $acc2 mov 8*3($in_ptr), $acc3 mov %rax, $acc0 mov .Lpoly+8*1(%rip), $t1 ######################################### # First iteration mov %rax, $t0 shl \$32, $acc0 mulq $t2 shr \$32, $t0 add $acc0, $acc1 adc $t0, $acc2 adc %rax, $acc3 mov $acc1, %rax adc \$0, %rdx ######################################### # Second iteration mov $acc1, $t0 shl \$32, $acc1 mov %rdx, $acc0 mulq $t2 shr \$32, $t0 add $acc1, $acc2 adc $t0, $acc3 adc %rax, $acc0 mov $acc2, %rax adc \$0, %rdx ########################################## # Third iteration mov $acc2, $t0 shl \$32, $acc2 mov %rdx, $acc1 mulq $t2 shr \$32, $t0 add $acc2, $acc3 adc $t0, $acc0 adc %rax, $acc1 mov $acc3, %rax adc \$0, %rdx ########################################### # Last iteration mov $acc3, $t0 shl \$32, $acc3 mov %rdx, $acc2 mulq $t2 shr \$32, $t0 add $acc3, $acc0 adc $t0, $acc1 mov $acc0, $t0 adc %rax, $acc2 mov $acc1, $in_ptr adc \$0, %rdx ########################################### # Branch-less conditional subtraction sub \$-1, $acc0 mov $acc2, %rax sbb $t1, $acc1 sbb \$0, $acc2 mov %rdx, $acc3 sbb $t2, %rdx sbb $t2, $t2 cmovnz $t0, $acc0 cmovnz $in_ptr, $acc1 mov $acc0, 8*0($r_ptr) cmovnz %rax, $acc2 mov $acc1, 8*1($r_ptr) cmovz %rdx, $acc3 mov $acc2, 8*2($r_ptr) mov $acc3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp 
.cfi_adjust_cfa_offset	-16
.Lfrom_epilogue:
	ret
.cfi_endproc
.size	ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
___
}
{
# Generator block for the SSE2 scatter/gather table helpers.
#
# Registers that carry the three arguments (val, in_t, index); the set is
# selected by $win64.
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
# %xmm0-%xmm7: increment constant, broadcast index and result accumulators.
my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
# %xmm8-%xmm15: running entry counter, table-entry temporaries and the
# comparison mask.
my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));

$code.=<<___;
################################################################################
# void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_scatter_w5
.type	ecp_nistz256_scatter_w5,\@abi-omnipotent
.align	32
ecp_nistz256_scatter_w5:
.cfi_startproc
	lea	-3($index,$index,2), $index
	movdqa	0x00($in_t), %xmm0
	shl	\$5, $index
	movdqa	0x10($in_t), %xmm1
	movdqa	0x20($in_t), %xmm2
	movdqa	0x30($in_t), %xmm3
	movdqa	0x40($in_t), %xmm4
	movdqa	0x50($in_t), %xmm5
	movdqa	%xmm0, 0x00($val,$index)
	movdqa	%xmm1, 0x10($val,$index)
	movdqa	%xmm2, 0x20($val,$index)
	movdqa	%xmm3, 0x30($val,$index)
	movdqa	%xmm4, 0x40($val,$index)
	movdqa	%xmm5, 0x50($val,$index)

	ret
.cfi_endproc
.size	ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5

################################################################################
# void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_gather_w5
.type	ecp_nistz256_gather_w5,\@abi-omnipotent
.align	32
ecp_nistz256_gather_w5:
.cfi_startproc
___
# Only emitted when $avx>1: test bit 5 of the dword at OPENSSL_ia32cap_P+8
# at run time and branch to .Lavx2_gather_w5 when it is set.
$code.=<<___	if ($avx>1);
	mov	OPENSSL_ia32cap_P+8(%rip), %eax
	test	\$`1<<5`, %eax
	jnz	.Lavx2_gather_w5
___
# Win64 only: lower %rsp and store %xmm6-%xmm15 with hand-encoded
# instructions (the mnemonic for each byte sequence is given alongside it).
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_gather_w5:
	.byte	0x48,0x8d,0x60,0xe0		#lea	-0x20(%rax), %rsp
	.byte	0x0f,0x29,0x70,0xe0		#movaps	%xmm6, -0x20(%rax)
	.byte	0x0f,0x29,0x78,0xf0		#movaps	%xmm7, -0x10(%rax)
	.byte	0x44,0x0f,0x29,0x00		#movaps	%xmm8, 0(%rax)
	.byte	0x44,0x0f,0x29,0x48,0x10	#movaps	%xmm9, 0x10(%rax)
	.byte	0x44,0x0f,0x29,0x50,0x20	#movaps	%xmm10, 0x20(%rax)
	.byte	0x44,0x0f,0x29,0x58,0x30	#movaps
%xmm11, 0x30(%rax) .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) ___ $code.=<<___; movdqa .LOne(%rip), $ONE movd $index, $INDEX pxor $Ra, $Ra pxor $Rb, $Rb pxor $Rc, $Rc pxor $Rd, $Rd pxor $Re, $Re pxor $Rf, $Rf movdqa $ONE, $M0 pshufd \$0, $INDEX, $INDEX mov \$16, %rax .Lselect_loop_sse_w5: movdqa $M0, $TMP0 paddd $ONE, $M0 pcmpeqd $INDEX, $TMP0 movdqa 16*0($in_t), $T0a movdqa 16*1($in_t), $T0b movdqa 16*2($in_t), $T0c movdqa 16*3($in_t), $T0d movdqa 16*4($in_t), $T0e movdqa 16*5($in_t), $T0f lea 16*6($in_t), $in_t pand $TMP0, $T0a pand $TMP0, $T0b por $T0a, $Ra pand $TMP0, $T0c por $T0b, $Rb pand $TMP0, $T0d por $T0c, $Rc pand $TMP0, $T0e por $T0d, $Rd pand $TMP0, $T0f por $T0e, $Re por $T0f, $Rf dec %rax jnz .Lselect_loop_sse_w5 movdqu $Ra, 16*0($val) movdqu $Rb, 16*1($val) movdqu $Rc, 16*2($val) movdqu $Rd, 16*3($val) movdqu $Re, 16*4($val) movdqu $Rf, 16*5($val) ___ $code.=<<___ if ($win64); movaps (%rsp), %xmm6 movaps 0x10(%rsp), %xmm7 movaps 0x20(%rsp), %xmm8 movaps 0x30(%rsp), %xmm9 movaps 0x40(%rsp), %xmm10 movaps 0x50(%rsp), %xmm11 movaps 0x60(%rsp), %xmm12 movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp ___ $code.=<<___; ret .cfi_endproc .LSEH_end_ecp_nistz256_gather_w5: .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 ################################################################################ # void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_scatter_w7 .type ecp_nistz256_scatter_w7,\@abi-omnipotent .align 32 ecp_nistz256_scatter_w7: .cfi_startproc movdqu 0x00($in_t), %xmm0 shl \$6, $index movdqu 0x10($in_t), %xmm1 movdqu 0x20($in_t), %xmm2 movdqu 0x30($in_t), %xmm3 movdqa %xmm0, 0x00($val,$index) movdqa %xmm1, 0x10($val,$index) movdqa %xmm2, 0x20($val,$index) movdqa 
%xmm3, 0x30($val,$index) ret .cfi_endproc .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 ################################################################################ # void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_gather_w7 .type ecp_nistz256_gather_w7,\@abi-omnipotent .align 32 ecp_nistz256_gather_w7: .cfi_startproc ___ $code.=<<___ if ($avx>1); mov OPENSSL_ia32cap_P+8(%rip), %eax test \$`1<<5`, %eax jnz .Lavx2_gather_w7 ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax .LSEH_begin_ecp_nistz256_gather_w7: .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) ___ $code.=<<___; movdqa .LOne(%rip), $M0 movd $index, $INDEX pxor $Ra, $Ra pxor $Rb, $Rb pxor $Rc, $Rc pxor $Rd, $Rd movdqa $M0, $ONE pshufd \$0, $INDEX, $INDEX mov \$64, %rax .Lselect_loop_sse_w7: movdqa $M0, $TMP0 paddd $ONE, $M0 movdqa 16*0($in_t), $T0a movdqa 16*1($in_t), $T0b pcmpeqd $INDEX, $TMP0 movdqa 16*2($in_t), $T0c movdqa 16*3($in_t), $T0d lea 16*4($in_t), $in_t pand $TMP0, $T0a pand $TMP0, $T0b por $T0a, $Ra pand $TMP0, $T0c por $T0b, $Rb pand $TMP0, $T0d por $T0c, $Rc prefetcht0 255($in_t) por $T0d, $Rd dec %rax jnz .Lselect_loop_sse_w7 movdqu $Ra, 16*0($val) movdqu $Rb, 16*1($val) movdqu $Rc, 16*2($val) movdqu $Rd, 16*3($val) ___ $code.=<<___ if ($win64); movaps (%rsp), %xmm6 movaps 0x10(%rsp), %xmm7 movaps 0x20(%rsp), %xmm8 movaps 0x30(%rsp), %xmm9 movaps 0x40(%rsp), %xmm10 movaps 
0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	0xa8(%rsp), %rsp
___
$code.=<<___;
	ret
.cfi_endproc
.LSEH_end_ecp_nistz256_gather_w7:
.size	ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
___
}
if ($avx>1) {
# Generator block for the AVX2 flavour of gather_w5; emitted only when
# $avx>1.  It is entered through the local label .Lavx2_gather_w5 (no
# .globl directive is emitted for this symbol).
#
# Registers that carry the three arguments; the set is selected by $win64.
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
# %ymm0-%ymm4: counter step, broadcast index and the three accumulators.
my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
# Two table entries are examined per loop iteration: $M0/$M1 are the
# running counters, $T0x/$T1x the loaded entries, $TMP0/$TMP1 the masks
# produced by comparing the counters with $INDEX.
my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));

$code.=<<___;
################################################################################
# void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index);
.type	ecp_nistz256_avx2_gather_w5,\@abi-omnipotent
.align	32
ecp_nistz256_avx2_gather_w5:
.cfi_startproc
.Lavx2_gather_w5:
	vzeroupper
___
# Win64 only: remember the incoming %rsp in %r11 (restored in the epilogue
# below), then store %xmm6-%xmm15 with hand-encoded VEX instructions.
$code.=<<___	if ($win64);
	lea	-0x88(%rsp), %rax
	mov	%rsp,%r11
.LSEH_begin_ecp_nistz256_avx2_gather_w5:
	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10, 0x20(%rax)
	.byte	0xc5,0x78,0x29,0x58,0x30	# vmovaps %xmm11, 0x30(%rax)
	.byte	0xc5,0x78,0x29,0x60,0x40	# vmovaps %xmm12, 0x40(%rax)
	.byte	0xc5,0x78,0x29,0x68,0x50	# vmovaps %xmm13, 0x50(%rax)
	.byte	0xc5,0x78,0x29,0x70,0x60	# vmovaps %xmm14, 0x60(%rax)
	.byte	0xc5,0x78,0x29,0x78,0x70	# vmovaps %xmm15, 0x70(%rax)
___
# Selection loop: 8 iterations, two 96-byte entries each.  Every entry is
# loaded and masked, so the sequence of loads does not depend on the index.
$code.=<<___;
	vmovdqa	.LTwo(%rip), $TWO

	vpxor	$Ra, $Ra, $Ra
	vpxor	$Rb, $Rb, $Rb
	vpxor	$Rc, $Rc, $Rc

	vmovdqa	.LOne(%rip), $M0
	vmovdqa	.LTwo(%rip), $M1

	vmovd	$index, %xmm1
	vpermd	$INDEX, $Ra, $INDEX

	mov	\$8, %rax
.Lselect_loop_avx2_w5:

	vmovdqa	32*0($in_t), $T0a
	vmovdqa	32*1($in_t), $T0b
	vmovdqa	32*2($in_t), $T0c

	vmovdqa	32*3($in_t), $T1a
	vmovdqa	32*4($in_t), $T1b
	vmovdqa	32*5($in_t), $T1c

	vpcmpeqd	$INDEX, $M0, $TMP0
	vpcmpeqd	$INDEX, $M1, $TMP1

	vpaddd	$TWO, $M0, $M0
	vpaddd	$TWO, $M1, $M1
	lea	32*6($in_t), $in_t

	vpand	$TMP0, $T0a, $T0a
	vpand	$TMP0, $T0b, $T0b
	vpand	$TMP0, $T0c, $T0c
	vpand	$TMP1, $T1a, $T1a
	vpand	$TMP1, $T1b, $T1b
	vpand	$TMP1, $T1c, $T1c

	vpxor	$T0a, $Ra, $Ra
	vpxor	$T0b, $Rb, $Rb
	vpxor	$T0c, $Rc, $Rc
	vpxor	$T1a, $Ra, $Ra
	vpxor	$T1b, $Rb, $Rb
	vpxor	$T1c, $Rc, $Rc

	dec	%rax
	jnz	.Lselect_loop_avx2_w5

	vmovdqu	$Ra, 32*0($val)
	vmovdqu	$Rb, 32*1($val)
	vmovdqu	$Rc, 32*2($val)
	vzeroupper
___
# Win64 only: reload %xmm6-%xmm15 and restore %rsp from %r11.
$code.=<<___	if ($win64);
	movaps	(%rsp), %xmm6
	movaps	0x10(%rsp), %xmm7
	movaps	0x20(%rsp), %xmm8
	movaps	0x30(%rsp), %xmm9
	movaps	0x40(%rsp), %xmm10
	movaps	0x50(%rsp), %xmm11
	movaps	0x60(%rsp), %xmm12
	movaps	0x70(%rsp), %xmm13
	movaps	0x80(%rsp), %xmm14
	movaps	0x90(%rsp), %xmm15
	lea	(%r11), %rsp
___
$code.=<<___;
	ret
.cfi_endproc
.LSEH_end_ecp_nistz256_avx2_gather_w5:
.size	ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5
___
}
if ($avx>1) {
# Generator block for the AVX2 flavour of gather_w7; emitted only when
# $avx>1.
#
# Registers that carry the three arguments; the set is selected by $win64.
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
# %ymm0-%ymm3: counter step, broadcast index and the two accumulators.
my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
# Three counter/entry/mask register groups, one per table entry handled
# in a loop iteration.
my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));

$code.=<<___;

################################################################################
# void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index);
.globl	ecp_nistz256_avx2_gather_w7
.type	ecp_nistz256_avx2_gather_w7,\@abi-omnipotent
.align	32
ecp_nistz256_avx2_gather_w7:
.cfi_startproc
.Lavx2_gather_w7:
	vzeroupper
___
# Win64 only: remember the incoming %rsp in %r11, then store %xmm6-%xmm15
# with hand-encoded VEX instructions.
$code.=<<___	if ($win64);
	mov	%rsp,%r11
	lea	-0x88(%rsp), %rax
.LSEH_begin_ecp_nistz256_avx2_gather_w7:
	.byte	0x48,0x8d,0x60,0xe0		# lea	-0x20(%rax), %rsp
	.byte	0xc5,0xf8,0x29,0x70,0xe0	# vmovaps %xmm6, -0x20(%rax)
	.byte	0xc5,0xf8,0x29,0x78,0xf0	# vmovaps %xmm7, -0x10(%rax)
	.byte	0xc5,0x78,0x29,0x40,0x00	# vmovaps %xmm8, 0(%rax)
	.byte	0xc5,0x78,0x29,0x48,0x10	# vmovaps %xmm9, 0x10(%rax)
	.byte	0xc5,0x78,0x29,0x50,0x20	# vmovaps %xmm10,
0x20(%rax) .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) ___ $code.=<<___; vmovdqa .LThree(%rip), $THREE vpxor $Ra, $Ra, $Ra vpxor $Rb, $Rb, $Rb vmovdqa .LOne(%rip), $M0 vmovdqa .LTwo(%rip), $M1 vmovdqa .LThree(%rip), $M2 vmovd $index, %xmm1 vpermd $INDEX, $Ra, $INDEX # Skip index = 0, because it is implicitly the point at infinity mov \$21, %rax .Lselect_loop_avx2_w7: vmovdqa 32*0($in_t), $T0a vmovdqa 32*1($in_t), $T0b vmovdqa 32*2($in_t), $T1a vmovdqa 32*3($in_t), $T1b vmovdqa 32*4($in_t), $T2a vmovdqa 32*5($in_t), $T2b vpcmpeqd $INDEX, $M0, $TMP0 vpcmpeqd $INDEX, $M1, $TMP1 vpcmpeqd $INDEX, $M2, $TMP2 vpaddd $THREE, $M0, $M0 vpaddd $THREE, $M1, $M1 vpaddd $THREE, $M2, $M2 lea 32*6($in_t), $in_t vpand $TMP0, $T0a, $T0a vpand $TMP0, $T0b, $T0b vpand $TMP1, $T1a, $T1a vpand $TMP1, $T1b, $T1b vpand $TMP2, $T2a, $T2a vpand $TMP2, $T2b, $T2b vpxor $T0a, $Ra, $Ra vpxor $T0b, $Rb, $Rb vpxor $T1a, $Ra, $Ra vpxor $T1b, $Rb, $Rb vpxor $T2a, $Ra, $Ra vpxor $T2b, $Rb, $Rb dec %rax jnz .Lselect_loop_avx2_w7 vmovdqa 32*0($in_t), $T0a vmovdqa 32*1($in_t), $T0b vpcmpeqd $INDEX, $M0, $TMP0 vpand $TMP0, $T0a, $T0a vpand $TMP0, $T0b, $T0b vpxor $T0a, $Ra, $Ra vpxor $T0b, $Rb, $Rb vmovdqu $Ra, 32*0($val) vmovdqu $Rb, 32*1($val) vzeroupper ___ $code.=<<___ if ($win64); movaps (%rsp), %xmm6 movaps 0x10(%rsp), %xmm7 movaps 0x20(%rsp), %xmm8 movaps 0x30(%rsp), %xmm9 movaps 0x40(%rsp), %xmm10 movaps 0x50(%rsp), %xmm11 movaps 0x60(%rsp), %xmm12 movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea (%r11), %rsp ___ $code.=<<___; ret .cfi_endproc .LSEH_end_ecp_nistz256_avx2_gather_w7: .size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 ___ } else { $code.=<<___; .globl ecp_nistz256_avx2_gather_w7 .type 
ecp_nistz256_avx2_gather_w7,\@function,3
.align	32
ecp_nistz256_avx2_gather_w7:
.cfi_startproc
	.byte	0x0f,0x0b	# ud2
	ret
.cfi_endproc
.size	ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7
___
}
{{{
########################################################################
# This block implements higher level point_double, point_add and
# point_add_affine. The key to performance in this case is to allow
# out-of-order execution logic to overlap computations from next step
# with tail processing from current step. By using tailored calling
# sequence we minimize inter-step overhead to give processor better
# shot at overlapping operations...
#
# You will notice that input data is copied to stack. The trouble is
# that there are no registers to spare for holding the original
# pointers, and reloading the pointers would create undesired
# dependencies on the effective-address calculation paths. In other
# words, this too is done to favour out-of-order execution logic.
#
my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
# Note: $t3 and $t4 both name the same register as $acc4 here.
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
my ($poly1,$poly3)=($acc6,$acc7);

# load_for_mul($src_a, $src_b, $src0)
#
# Returns the assembly text that sets up registers ahead of a
# "call __ecp_nistz256_mul_mont*":
#   - the first 64-bit word at $src_b is loaded into $src0 and the
#     address of $src_b into $b_ptr;
#   - the four 64-bit words at $src_a are loaded into $acc1..$acc4;
#   - $a_ptr receives the address of $src_a, offset by -128 unless
#     $src0 is "%rax" (the mulx routines address the operand as
#     8*i+128($a_ptr)).
# $src_a and $src_b are memory operands such as "32(%rsp)".
# The sub is invoked as &load_for_mul(...), so no prototype is declared.
sub load_for_mul {
my ($src_a,$src_b,$src0) = @_;
my $bias = $src0 eq "%rax" ? 0 : -128;

"	mov	$src_b, $src0
	lea	$src_b, $b_ptr
	mov	8*0+$src_a, $acc1
	mov	8*1+$src_a, $acc2
	lea	$bias+$src_a, $a_ptr
	mov	8*2+$src_a, $acc3
	mov	8*3+$src_a, $acc4"
}

# load_for_sqr($a, $src0)
#
# Counterpart of load_for_mul for "call __ecp_nistz256_sqr_mont*":
# returns the assembly text that loads the four words at $a and sets
# $a_ptr, using the same 0 / -128 bias rule keyed on $src0.
sub load_for_sqr {
my ($a,$src0) = @_;
my $bias = $src0 eq "%rax" ?
0 : -128; " mov 8*0+$a, $src0 mov 8*1+$a, $acc6 lea $bias+$a, $a_ptr mov 8*2+$a, $acc7 mov 8*3+$a, $acc0" } { ######################################################################## # operate in 4-5-0-1 "name space" that matches multiplication output # my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); $code.=<<___; .type __ecp_nistz256_add_toq,\@abi-omnipotent .align 32 __ecp_nistz256_add_toq: .cfi_startproc xor $t4,$t4 add 8*0($b_ptr), $a0 adc 8*1($b_ptr), $a1 mov $a0, $t0 adc 8*2($b_ptr), $a2 adc 8*3($b_ptr), $a3 mov $a1, $t1 adc \$0, $t4 sub \$-1, $a0 mov $a2, $t2 sbb $poly1, $a1 sbb \$0, $a2 mov $a3, $t3 sbb $poly3, $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq .type __ecp_nistz256_sub_fromq,\@abi-omnipotent .align 32 __ecp_nistz256_sub_fromq: .cfi_startproc sub 8*0($b_ptr), $a0 sbb 8*1($b_ptr), $a1 mov $a0, $t0 sbb 8*2($b_ptr), $a2 sbb 8*3($b_ptr), $a3 mov $a1, $t1 sbb $t4, $t4 add \$-1, $a0 mov $a2, $t2 adc $poly1, $a1 adc \$0, $a2 mov $a3, $t3 adc $poly3, $a3 test $t4, $t4 cmovz $t0, $a0 cmovz $t1, $a1 mov $a0, 8*0($r_ptr) cmovz $t2, $a2 mov $a1, 8*1($r_ptr) cmovz $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq .type __ecp_nistz256_subq,\@abi-omnipotent .align 32 __ecp_nistz256_subq: .cfi_startproc sub $a0, $t0 sbb $a1, $t1 mov $t0, $a0 sbb $a2, $t2 sbb $a3, $t3 mov $t1, $a1 sbb $t4, $t4 add \$-1, $t0 mov $t2, $a2 adc $poly1, $t1 adc \$0, $t2 mov $t3, $a3 adc $poly3, $t3 test $t4, $t4 cmovnz $t0, $a0 cmovnz $t1, $a1 cmovnz $t2, $a2 cmovnz $t3, $a3 ret .cfi_endproc .size __ecp_nistz256_subq,.-__ecp_nistz256_subq .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent .align 32 __ecp_nistz256_mul_by_2q: .cfi_startproc xor $t4, $t4 add $a0, $a0 # a0:a3+a0:a3 adc $a1, $a1 mov $a0, $t0 adc 
$a2, $a2 adc $a3, $a3 mov $a1, $t1 adc \$0, $t4 sub \$-1, $a0 mov $a2, $t2 sbb $poly1, $a1 sbb \$0, $a2 mov $a3, $t3 sbb $poly3, $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q ___ } sub gen_double () { my $x = shift; my ($src0,$sfx,$bias); my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); if ($x ne "x") { $src0 = "%rax"; $sfx = ""; $bias = 0; $code.=<<___; .globl ecp_nistz256_point_double .type ecp_nistz256_point_double,\@function,2 .align 32 ecp_nistz256_point_double: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx cmp \$0x80100, %ecx je .Lpoint_doublex ___ } else { $src0 = "%rdx"; $sfx = "x"; $bias = 128; $code.=<<___; .type ecp_nistz256_point_doublex,\@function,2 .align 32 ecp_nistz256_point_doublex: .cfi_startproc .Lpoint_doublex: ___ } $code.=<<___; push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$32*5+8, %rsp .cfi_adjust_cfa_offset 32*5+8 .Lpoint_double${x}_body: .Lpoint_double_shortcut$x: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x mov $a_ptr, $b_ptr # backup copy movdqu 0x10($a_ptr), %xmm1 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order mov 0x20+8*1($a_ptr), $acc5 mov 0x20+8*2($a_ptr), $acc0 mov 0x20+8*3($a_ptr), $acc1 mov .Lpoly+8*1(%rip), $poly1 mov .Lpoly+8*3(%rip), $poly3 movdqa %xmm0, $in_x(%rsp) movdqa %xmm1, $in_x+0x10(%rsp) lea 0x20($r_ptr), $acc2 lea 0x40($r_ptr), $acc3 movq $r_ptr, %xmm0 movq $acc2, %xmm1 movq $acc3, %xmm2 lea $S(%rsp), $r_ptr call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); mov 0x40+8*0($a_ptr), $src0 mov 0x40+8*1($a_ptr), $acc6 mov 0x40+8*2($a_ptr), $acc7 mov 0x40+8*3($a_ptr), $acc0 lea 0x40-$bias($a_ptr), $a_ptr lea $Zsqr(%rsp), $r_ptr call 
__ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); `&load_for_sqr("$S(%rsp)", "$src0")` lea $S(%rsp), $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); mov 0x20($b_ptr), $src0 # $b_ptr is still valid mov 0x40+8*0($b_ptr), $acc1 mov 0x40+8*1($b_ptr), $acc2 mov 0x40+8*2($b_ptr), $acc3 mov 0x40+8*3($b_ptr), $acc4 lea 0x40-$bias($b_ptr), $a_ptr lea 0x20($b_ptr), $b_ptr movq %xmm2, $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order mov $in_x+8*1(%rsp), $acc5 lea $Zsqr(%rsp), $b_ptr mov $in_x+8*2(%rsp), $acc0 mov $in_x+8*3(%rsp), $acc1 lea $M(%rsp), $r_ptr call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order mov $in_x+8*1(%rsp), $acc5 lea $Zsqr(%rsp), $b_ptr mov $in_x+8*2(%rsp), $acc0 mov $in_x+8*3(%rsp), $acc1 lea $Zsqr(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); `&load_for_sqr("$S(%rsp)", "$src0")` movq %xmm1, $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); ___ { ######## ecp_nistz256_div_by_2(res_y, res_y); ########################## # operate in 4-5-6-7 "name space" that matches squaring output # my ($poly1,$poly3)=($a_ptr,$t1); my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); $code.=<<___; xor $t4, $t4 mov $a0, $t0 add \$-1, $a0 mov $a1, $t1 adc $poly1, $a1 mov $a2, $t2 adc \$0, $a2 mov $a3, $t3 adc $poly3, $a3 adc \$0, $t4 xor $a_ptr, $a_ptr # borrow $a_ptr test \$1, $t0 cmovz $t0, $a0 cmovz $t1, $a1 cmovz $t2, $a2 cmovz $t3, $a3 cmovz $a_ptr, $t4 mov $a1, $t0 # a0:a3>>1 shr \$1, $a0 shl \$63, $t0 mov $a2, $t1 shr \$1, $a1 or $t0, $a0 shl \$63, $t1 mov $a3, $t2 shr \$1, $a2 or $t1, $a1 shl \$63, $t2 mov $a0, 8*0($r_ptr) shr \$1, $a3 mov $a1, 8*1($r_ptr) shl \$63, $t4 or $t2, $a2 or $t4, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ___ } $code.=<<___; `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", 
"$src0")` lea $M(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); lea $tmp0(%rsp), $r_ptr call __ecp_nistz256_mul_by_2$x lea $M(%rsp), $b_ptr lea $M(%rsp), $r_ptr call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` lea $S(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); lea $tmp0(%rsp), $r_ptr call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); `&load_for_sqr("$M(%rsp)", "$src0")` movq %xmm0, $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); lea $tmp0(%rsp), $b_ptr mov $acc6, $acc0 # harmonize sqr output and sub input mov $acc7, $acc1 mov $a_ptr, $poly1 mov $t1, $poly3 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); mov $S+8*0(%rsp), $t0 mov $S+8*1(%rsp), $t1 mov $S+8*2(%rsp), $t2 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order lea $S(%rsp), $r_ptr call __ecp_nistz256_sub$x # p256_sub(S, S, res_x); mov $M(%rsp), $src0 lea $M(%rsp), $b_ptr mov $acc4, $acc6 # harmonize sub output and mul input xor %ecx, %ecx mov $acc4, $S+8*0(%rsp) # have to save:-( mov $acc5, $acc2 mov $acc5, $S+8*1(%rsp) cmovz $acc0, $acc3 mov $acc0, $S+8*2(%rsp) lea $S-$bias(%rsp), $a_ptr cmovz $acc1, $acc4 mov $acc1, $S+8*3(%rsp) mov $acc6, $acc1 lea $S(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); movq %xmm1, $b_ptr movq %xmm1, $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); lea 32*5+56(%rsp), %rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbx .cfi_restore %rbx mov -8(%rsi),%rbp .cfi_restore %rbp lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpoint_double${x}_epilogue: ret .cfi_endproc .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx ___ } &gen_double("q"); sub gen_add () { my $x = shift; my ($src0,$sfx,$bias); my ($H,$Hsqr,$R,$Rsqr,$Hcub, $U1,$U2,$S1,$S2, 
$res_x,$res_y,$res_z, $in1_x,$in1_y,$in1_z, $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); if ($x ne "x") { $src0 = "%rax"; $sfx = ""; $bias = 0; $code.=<<___; .globl ecp_nistz256_point_add .type ecp_nistz256_point_add,\@function,3 .align 32 ecp_nistz256_point_add: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx cmp \$0x80100, %ecx je .Lpoint_addx ___ } else { $src0 = "%rdx"; $sfx = "x"; $bias = 128; $code.=<<___; .type ecp_nistz256_point_addx,\@function,3 .align 32 ecp_nistz256_point_addx: .cfi_startproc .Lpoint_addx: ___ } $code.=<<___; push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$32*18+8, %rsp .cfi_adjust_cfa_offset 32*18+8 .Lpoint_add${x}_body: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr movdqu 0x10($a_ptr), %xmm1 movdqu 0x20($a_ptr), %xmm2 movdqu 0x30($a_ptr), %xmm3 movdqu 0x40($a_ptr), %xmm4 movdqu 0x50($a_ptr), %xmm5 mov $a_ptr, $b_ptr # reassign mov $b_org, $a_ptr # reassign movdqa %xmm0, $in1_x(%rsp) movdqa %xmm1, $in1_x+0x10(%rsp) movdqa %xmm2, $in1_y(%rsp) movdqa %xmm3, $in1_y+0x10(%rsp) movdqa %xmm4, $in1_z(%rsp) movdqa %xmm5, $in1_z+0x10(%rsp) por %xmm4, %xmm5 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr pshufd \$0xb1, %xmm5, %xmm3 movdqu 0x10($a_ptr), %xmm1 movdqu 0x20($a_ptr), %xmm2 por %xmm3, %xmm5 movdqu 0x30($a_ptr), %xmm3 mov 0x40+8*0($a_ptr), $src0 # load original in2_z mov 0x40+8*1($a_ptr), $acc6 mov 0x40+8*2($a_ptr), $acc7 mov 0x40+8*3($a_ptr), $acc0 movdqa %xmm0, $in2_x(%rsp) pshufd \$0x1e, %xmm5, %xmm4 movdqa %xmm1, $in2_x+0x10(%rsp) movdqu 0x40($a_ptr),%xmm0 # in2_z again movdqu 0x50($a_ptr),%xmm1 movdqa %xmm2, $in2_y(%rsp) movdqa %xmm3, $in2_y+0x10(%rsp) por %xmm4, %xmm5 pxor %xmm4, %xmm4 por %xmm0, %xmm1 movq $r_ptr, %xmm0 # save $r_ptr lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid mov $src0, $in2_z+8*0(%rsp) # 
make in2_z copy mov $acc6, $in2_z+8*1(%rsp) mov $acc7, $in2_z+8*2(%rsp) mov $acc0, $in2_z+8*3(%rsp) lea $Z2sqr(%rsp), $r_ptr # Z2^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); pcmpeqd %xmm4, %xmm5 pshufd \$0xb1, %xmm1, %xmm4 por %xmm1, %xmm4 pshufd \$0, %xmm5, %xmm5 # in1infty pshufd \$0x1e, %xmm4, %xmm3 por %xmm3, %xmm4 pxor %xmm3, %xmm3 pcmpeqd %xmm3, %xmm4 pshufd \$0, %xmm4, %xmm4 # in2infty mov 0x40+8*0($b_ptr), $src0 # load original in1_z mov 0x40+8*1($b_ptr), $acc6 mov 0x40+8*2($b_ptr), $acc7 mov 0x40+8*3($b_ptr), $acc0 movq $b_ptr, %xmm1 lea 0x40-$bias($b_ptr), $a_ptr lea $Z1sqr(%rsp), $r_ptr # Z1^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` lea $S1(%rsp), $r_ptr # S1 = Z2^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr # S2 = Z1^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); lea $S1(%rsp), $b_ptr lea $R(%rsp), $r_ptr # R = S2 - S1 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); or $acc5, $acc4 # see if result is zero movdqa %xmm4, %xmm2 or $acc0, $acc4 or $acc1, $acc4 por %xmm5, %xmm2 # in1infty || in2infty movq $acc4, %xmm3 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); lea $U1(%rsp), $b_ptr lea $H(%rsp), $r_ptr # H = U2 - U1 call __ecp_nistz256_sub_from$x # p256_sub(H, 
U2, U1); or $acc5, $acc4 # see if result is zero or $acc0, $acc4 or $acc1, $acc4 # !is_equal(U1, U2) movq %xmm2, $acc0 # in1infty | in2infty movq %xmm3, $acc1 # !is_equal(S1, S2) or $acc0, $acc4 or $acc1, $acc4 # if (!is_equal(U1, U2) | in1infty | in2infty | !is_equal(S1, S2)) .byte 0x3e # predict taken jnz .Ladd_proceed$x .Ladd_double$x: movq %xmm1, $a_ptr # restore $a_ptr movq %xmm0, $r_ptr # restore $r_ptr add \$`32*(18-5)`, %rsp # difference in frame sizes .cfi_adjust_cfa_offset `-32*(18-5)` jmp .Lpoint_double_shortcut$x .cfi_adjust_cfa_offset `32*(18-5)` .align 32 .Ladd_proceed$x: `&load_for_sqr("$R(%rsp)", "$src0")` lea $Rsqr(%rsp), $r_ptr # R^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); `&load_for_sqr("$H(%rsp)", "$src0")` lea $Hsqr(%rsp), $r_ptr # H^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` lea $Hcub(%rsp), $r_ptr # H^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` lea $U2(%rsp), $r_ptr # U1*H^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); ___ { ####################################################################### # operate in 4-5-0-1 "name space" that matches multiplication output # my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); my ($poly1, $poly3)=($acc6,$acc7); $code.=<<___; #lea $U2(%rsp), $a_ptr #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); xor $t4, $t4 add $acc0, $acc0 # a0:a3+a0:a3 lea $Rsqr(%rsp), $a_ptr adc $acc1, $acc1 mov $acc0, $t0 adc $acc2, $acc2 adc $acc3, $acc3 mov $acc1, 
$t1 adc \$0, $t4 sub \$-1, $acc0 mov $acc2, $t2 sbb $poly1, $acc1 sbb \$0, $acc2 mov $acc3, $t3 sbb $poly3, $acc3 sbb \$0, $t4 cmovc $t0, $acc0 mov 8*0($a_ptr), $t0 cmovc $t1, $acc1 mov 8*1($a_ptr), $t1 cmovc $t2, $acc2 mov 8*2($a_ptr), $t2 cmovc $t3, $acc3 mov 8*3($a_ptr), $t3 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); lea $Hcub(%rsp), $b_ptr lea $res_x(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); mov $U2+8*0(%rsp), $t0 mov $U2+8*1(%rsp), $t1 mov $U2+8*2(%rsp), $t2 mov $U2+8*3(%rsp), $t3 lea $res_y(%rsp), $r_ptr call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); mov $acc0, 8*0($r_ptr) # save the result, as mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't mov $acc2, 8*2($r_ptr) mov $acc3, 8*3($r_ptr) ___ } $code.=<<___; `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` lea $res_y(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); lea $S2(%rsp), $b_ptr lea $res_y(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); movq %xmm0, $r_ptr # restore $r_ptr movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); movdqa %xmm5, %xmm1 pandn $res_z(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_z+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_z(%rsp), %xmm2 pand $in2_z+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_z(%rsp), %xmm2 pand $in1_z+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x40($r_ptr) movdqu %xmm3, 0x50($r_ptr) movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); movdqa %xmm5, %xmm1 pandn $res_x(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_x+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_x(%rsp), %xmm2 pand 
$in2_x+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_x(%rsp), %xmm2 pand $in1_x+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x00($r_ptr) movdqu %xmm3, 0x10($r_ptr) movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); movdqa %xmm5, %xmm1 pandn $res_y(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_y+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_y(%rsp), %xmm2 pand $in2_y+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_y(%rsp), %xmm2 pand $in1_y+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x20($r_ptr) movdqu %xmm3, 0x30($r_ptr) .Ladd_done$x: lea 32*18+56(%rsp), %rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbx .cfi_restore %rbx mov -8(%rsi),%rbp .cfi_restore %rbp lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpoint_add${x}_epilogue: ret .cfi_endproc .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx ___ } &gen_add("q"); sub gen_add_affine () { my $x = shift; my ($src0,$sfx,$bias); my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, $res_x,$res_y,$res_z, $in1_x,$in1_y,$in1_z, $in2_x,$in2_y)=map(32*$_,(0..14)); my $Z1sqr = $S2; if ($x ne "x") { $src0 = "%rax"; $sfx = ""; $bias = 0; $code.=<<___; .globl ecp_nistz256_point_add_affine .type ecp_nistz256_point_add_affine,\@function,3 .align 32 ecp_nistz256_point_add_affine: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx cmp \$0x80100, %ecx je .Lpoint_add_affinex ___ } else { $src0 = "%rdx"; $sfx = "x"; $bias = 128; $code.=<<___; .type 
ecp_nistz256_point_add_affinex,\@function,3 .align 32 ecp_nistz256_point_add_affinex: .cfi_startproc .Lpoint_add_affinex: ___ } $code.=<<___; push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$32*15+8, %rsp .cfi_adjust_cfa_offset 32*15+8 .Ladd_affine${x}_body: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr mov $b_org, $b_ptr # reassign movdqu 0x10($a_ptr), %xmm1 movdqu 0x20($a_ptr), %xmm2 movdqu 0x30($a_ptr), %xmm3 movdqu 0x40($a_ptr), %xmm4 movdqu 0x50($a_ptr), %xmm5 mov 0x40+8*0($a_ptr), $src0 # load original in1_z mov 0x40+8*1($a_ptr), $acc6 mov 0x40+8*2($a_ptr), $acc7 mov 0x40+8*3($a_ptr), $acc0 movdqa %xmm0, $in1_x(%rsp) movdqa %xmm1, $in1_x+0x10(%rsp) movdqa %xmm2, $in1_y(%rsp) movdqa %xmm3, $in1_y+0x10(%rsp) movdqa %xmm4, $in1_z(%rsp) movdqa %xmm5, $in1_z+0x10(%rsp) por %xmm4, %xmm5 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr pshufd \$0xb1, %xmm5, %xmm3 movdqu 0x10($b_ptr), %xmm1 movdqu 0x20($b_ptr), %xmm2 por %xmm3, %xmm5 movdqu 0x30($b_ptr), %xmm3 movdqa %xmm0, $in2_x(%rsp) pshufd \$0x1e, %xmm5, %xmm4 movdqa %xmm1, $in2_x+0x10(%rsp) por %xmm0, %xmm1 movq $r_ptr, %xmm0 # save $r_ptr movdqa %xmm2, $in2_y(%rsp) movdqa %xmm3, $in2_y+0x10(%rsp) por %xmm2, %xmm3 por %xmm4, %xmm5 pxor %xmm4, %xmm4 por %xmm1, %xmm3 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid lea $Z1sqr(%rsp), $r_ptr # Z1^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); pcmpeqd %xmm4, %xmm5 pshufd \$0xb1, %xmm3, %xmm4 mov 0x00($b_ptr), $src0 # $b_ptr is still valid #lea 0x00($b_ptr), $b_ptr mov $acc4, $acc1 # harmonize sqr output and mul input por %xmm3, %xmm4 pshufd \$0, %xmm5, %xmm5 # in1infty pshufd \$0x1e, %xmm4, %xmm3 mov $acc5, $acc2 por %xmm3, %xmm4 pxor %xmm3, %xmm3 mov $acc6, $acc3 pcmpeqd %xmm3, %xmm4 pshufd \$0, %xmm4, %xmm4 # in2infty lea $Z1sqr-$bias(%rsp), $a_ptr mov $acc7, $acc4 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 call 
__ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); lea $in1_x(%rsp), $b_ptr lea $H(%rsp), $r_ptr # H = U2 - U1 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr # S2 = Z1^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); lea $in1_y(%rsp), $b_ptr lea $R(%rsp), $r_ptr # R = S2 - S1 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); `&load_for_sqr("$H(%rsp)", "$src0")` lea $Hsqr(%rsp), $r_ptr # H^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); `&load_for_sqr("$R(%rsp)", "$src0")` lea $Rsqr(%rsp), $r_ptr # R^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` lea $Hcub(%rsp), $r_ptr # H^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); `&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` lea $U2(%rsp), $r_ptr # U1*H^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); ___ { ####################################################################### # operate in 4-5-0-1 "name space" that matches multiplication output # my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); my ($poly1, $poly3)=($acc6,$acc7); $code.=<<___; #lea $U2(%rsp), $a_ptr #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); xor $t4, $t4 add $acc0, $acc0 # a0:a3+a0:a3 lea $Rsqr(%rsp), $a_ptr adc $acc1, $acc1 mov $acc0, $t0 adc $acc2, $acc2 adc $acc3, $acc3 mov $acc1, $t1 adc \$0, $t4 sub \$-1, $acc0 mov $acc2, $t2 sbb $poly1, $acc1 sbb \$0, $acc2 mov $acc3, $t3 sbb $poly3, $acc3 sbb \$0, $t4 cmovc $t0, $acc0 mov 
8*0($a_ptr), $t0 cmovc $t1, $acc1 mov 8*1($a_ptr), $t1 cmovc $t2, $acc2 mov 8*2($a_ptr), $t2 cmovc $t3, $acc3 mov 8*3($a_ptr), $t3 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); lea $Hcub(%rsp), $b_ptr lea $res_x(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); mov $U2+8*0(%rsp), $t0 mov $U2+8*1(%rsp), $t1 mov $U2+8*2(%rsp), $t2 mov $U2+8*3(%rsp), $t3 lea $H(%rsp), $r_ptr call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); mov $acc0, 8*0($r_ptr) # save the result, as mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't mov $acc2, 8*2($r_ptr) mov $acc3, 8*3($r_ptr) ___ } $code.=<<___; `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` lea $H(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); lea $S2(%rsp), $b_ptr lea $res_y(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); movq %xmm0, $r_ptr # restore $r_ptr movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); movdqa %xmm5, %xmm1 pandn $res_z(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_z+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand .LONE_mont(%rip), %xmm2 pand .LONE_mont+0x10(%rip), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_z(%rsp), %xmm2 pand $in1_z+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x40($r_ptr) movdqu %xmm3, 0x50($r_ptr) movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); movdqa %xmm5, %xmm1 pandn $res_x(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_x+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_x(%rsp), %xmm2 pand $in2_x+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa 
%xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_x(%rsp), %xmm2 pand $in1_x+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x00($r_ptr) movdqu %xmm3, 0x10($r_ptr) movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); movdqa %xmm5, %xmm1 pandn $res_y(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_y+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_y(%rsp), %xmm2 pand $in2_y+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_y(%rsp), %xmm2 pand $in1_y+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x20($r_ptr) movdqu %xmm3, 0x30($r_ptr) lea 32*15+56(%rsp), %rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbx .cfi_restore %rbx mov -8(%rsi),%rbp .cfi_restore %rbp lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Ladd_affine${x}_epilogue: ret .cfi_endproc .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx ___ } &gen_add_affine("q"); ######################################################################## # AD*X magic # if ($addx) { { ######################################################################## # operate in 4-5-0-1 "name space" that matches multiplication output # my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); $code.=<<___; .type __ecp_nistz256_add_tox,\@abi-omnipotent .align 32 __ecp_nistz256_add_tox: .cfi_startproc xor $t4, $t4 adc 8*0($b_ptr), $a0 adc 8*1($b_ptr), $a1 mov $a0, $t0 adc 8*2($b_ptr), $a2 adc 8*3($b_ptr), $a3 mov $a1, $t1 adc \$0, $t4 xor $t3, $t3 sbb \$-1, $a0 mov $a2, $t2 sbb $poly1, $a1 sbb \$0, $a2 mov $a3, $t3 sbb $poly3, $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 
mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox .type __ecp_nistz256_sub_fromx,\@abi-omnipotent .align 32 __ecp_nistz256_sub_fromx: .cfi_startproc xor $t4, $t4 sbb 8*0($b_ptr), $a0 sbb 8*1($b_ptr), $a1 mov $a0, $t0 sbb 8*2($b_ptr), $a2 sbb 8*3($b_ptr), $a3 mov $a1, $t1 sbb \$0, $t4 xor $t3, $t3 adc \$-1, $a0 mov $a2, $t2 adc $poly1, $a1 adc \$0, $a2 mov $a3, $t3 adc $poly3, $a3 bt \$0, $t4 cmovnc $t0, $a0 cmovnc $t1, $a1 mov $a0, 8*0($r_ptr) cmovnc $t2, $a2 mov $a1, 8*1($r_ptr) cmovnc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx .type __ecp_nistz256_subx,\@abi-omnipotent .align 32 __ecp_nistz256_subx: .cfi_startproc xor $t4, $t4 sbb $a0, $t0 sbb $a1, $t1 mov $t0, $a0 sbb $a2, $t2 sbb $a3, $t3 mov $t1, $a1 sbb \$0, $t4 xor $a3 ,$a3 adc \$-1, $t0 mov $t2, $a2 adc $poly1, $t1 adc \$0, $t2 mov $t3, $a3 adc $poly3, $t3 bt \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 cmovc $t2, $a2 cmovc $t3, $a3 ret .cfi_endproc .size __ecp_nistz256_subx,.-__ecp_nistz256_subx .type __ecp_nistz256_mul_by_2x,\@abi-omnipotent .align 32 __ecp_nistz256_mul_by_2x: .cfi_startproc xor $t4, $t4 adc $a0, $a0 # a0:a3+a0:a3 adc $a1, $a1 mov $a0, $t0 adc $a2, $a2 adc $a3, $a3 mov $a1, $t1 adc \$0, $t4 xor $t3, $t3 sbb \$-1, $a0 mov $a2, $t2 sbb $poly1, $a1 sbb \$0, $a2 mov $a3, $t3 sbb $poly3, $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x ___ } &gen_double("x"); &gen_add("x"); &gen_add_affine("x"); } }}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type short_handler,\@abi-omnipotent .align 
16 short_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->Rip<end of prologue label jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea 16(%rax),%rax mov -8(%rax),%r12 mov -16(%rax),%r13 mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 jmp .Lcommon_seh_tail .size short_handler,.-short_handler .type full_handler,\@abi-omnipotent .align 16 full_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->Rip<end of prologue label jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 8(%r11),%r10d # HandlerData[2] lea (%rax,%r10),%rax mov -8(%rax),%rbp mov -16(%rax),%rbx mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov 
$context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size full_handler,.-full_handler .section .pdata .align 4 .rva .LSEH_begin_ecp_nistz256_mul_by_2 .rva .LSEH_end_ecp_nistz256_mul_by_2 .rva .LSEH_info_ecp_nistz256_mul_by_2 .rva .LSEH_begin_ecp_nistz256_div_by_2 .rva .LSEH_end_ecp_nistz256_div_by_2 .rva .LSEH_info_ecp_nistz256_div_by_2 .rva .LSEH_begin_ecp_nistz256_mul_by_3 .rva .LSEH_end_ecp_nistz256_mul_by_3 .rva .LSEH_info_ecp_nistz256_mul_by_3 .rva .LSEH_begin_ecp_nistz256_add .rva .LSEH_end_ecp_nistz256_add .rva .LSEH_info_ecp_nistz256_add .rva .LSEH_begin_ecp_nistz256_sub .rva .LSEH_end_ecp_nistz256_sub .rva .LSEH_info_ecp_nistz256_sub .rva .LSEH_begin_ecp_nistz256_neg .rva .LSEH_end_ecp_nistz256_neg .rva .LSEH_info_ecp_nistz256_neg .rva .LSEH_begin_ecp_nistz256_ord_mul_mont .rva .LSEH_end_ecp_nistz256_ord_mul_mont .rva .LSEH_info_ecp_nistz256_ord_mul_mont .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont .rva .LSEH_end_ecp_nistz256_ord_sqr_mont .rva .LSEH_info_ecp_nistz256_ord_sqr_mont ___ $code.=<<___ if ($addx); .rva .LSEH_begin_ecp_nistz256_ord_mul_montx .rva .LSEH_end_ecp_nistz256_ord_mul_montx .rva .LSEH_info_ecp_nistz256_ord_mul_montx .rva .LSEH_begin_ecp_nistz256_ord_sqr_montx .rva .LSEH_end_ecp_nistz256_ord_sqr_montx .rva .LSEH_info_ecp_nistz256_ord_sqr_montx ___ $code.=<<___; .rva .LSEH_begin_ecp_nistz256_to_mont .rva 
.LSEH_end_ecp_nistz256_to_mont .rva .LSEH_info_ecp_nistz256_to_mont .rva .LSEH_begin_ecp_nistz256_mul_mont .rva .LSEH_end_ecp_nistz256_mul_mont .rva .LSEH_info_ecp_nistz256_mul_mont .rva .LSEH_begin_ecp_nistz256_sqr_mont .rva .LSEH_end_ecp_nistz256_sqr_mont .rva .LSEH_info_ecp_nistz256_sqr_mont .rva .LSEH_begin_ecp_nistz256_from_mont .rva .LSEH_end_ecp_nistz256_from_mont .rva .LSEH_info_ecp_nistz256_from_mont .rva .LSEH_begin_ecp_nistz256_gather_w5 .rva .LSEH_end_ecp_nistz256_gather_w5 .rva .LSEH_info_ecp_nistz256_gather_wX .rva .LSEH_begin_ecp_nistz256_gather_w7 .rva .LSEH_end_ecp_nistz256_gather_w7 .rva .LSEH_info_ecp_nistz256_gather_wX ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_ecp_nistz256_avx2_gather_w5 .rva .LSEH_end_ecp_nistz256_avx2_gather_w5 .rva .LSEH_info_ecp_nistz256_avx2_gather_wX .rva .LSEH_begin_ecp_nistz256_avx2_gather_w7 .rva .LSEH_end_ecp_nistz256_avx2_gather_w7 .rva .LSEH_info_ecp_nistz256_avx2_gather_wX ___ $code.=<<___; .rva .LSEH_begin_ecp_nistz256_point_double .rva .LSEH_end_ecp_nistz256_point_double .rva .LSEH_info_ecp_nistz256_point_double .rva .LSEH_begin_ecp_nistz256_point_add .rva .LSEH_end_ecp_nistz256_point_add .rva .LSEH_info_ecp_nistz256_point_add .rva .LSEH_begin_ecp_nistz256_point_add_affine .rva .LSEH_end_ecp_nistz256_point_add_affine .rva .LSEH_info_ecp_nistz256_point_add_affine ___ $code.=<<___ if ($addx); .rva .LSEH_begin_ecp_nistz256_point_doublex .rva .LSEH_end_ecp_nistz256_point_doublex .rva .LSEH_info_ecp_nistz256_point_doublex .rva .LSEH_begin_ecp_nistz256_point_addx .rva .LSEH_end_ecp_nistz256_point_addx .rva .LSEH_info_ecp_nistz256_point_addx .rva .LSEH_begin_ecp_nistz256_point_add_affinex .rva .LSEH_end_ecp_nistz256_point_add_affinex .rva .LSEH_info_ecp_nistz256_point_add_affinex ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_ecp_nistz256_mul_by_2: .byte 9,0,0,0 .rva short_handler .rva .Lmul_by_2_body,.Lmul_by_2_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_div_by_2: .byte 9,0,0,0 .rva short_handler .rva 
.Ldiv_by_2_body,.Ldiv_by_2_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_mul_by_3: .byte 9,0,0,0 .rva short_handler .rva .Lmul_by_3_body,.Lmul_by_3_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_add: .byte 9,0,0,0 .rva short_handler .rva .Ladd_body,.Ladd_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_sub: .byte 9,0,0,0 .rva short_handler .rva .Lsub_body,.Lsub_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_neg: .byte 9,0,0,0 .rva short_handler .rva .Lneg_body,.Lneg_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_ord_mul_mont: .byte 9,0,0,0 .rva full_handler .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_ord_sqr_mont: .byte 9,0,0,0 .rva full_handler .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] .long 48,0 ___ $code.=<<___ if ($addx); .LSEH_info_ecp_nistz256_ord_mul_montx: .byte 9,0,0,0 .rva full_handler .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_ord_sqr_montx: .byte 9,0,0,0 .rva full_handler .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] .long 48,0 ___ $code.=<<___; .LSEH_info_ecp_nistz256_to_mont: .byte 9,0,0,0 .rva full_handler .rva .Lmul_body,.Lmul_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_mul_mont: .byte 9,0,0,0 .rva full_handler .rva .Lmul_body,.Lmul_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_sqr_mont: .byte 9,0,0,0 .rva full_handler .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_from_mont: .byte 9,0,0,0 .rva short_handler .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_gather_wX: .byte 0x01,0x33,0x16,0x00 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 
.byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 .align 8 ___ $code.=<<___ if ($avx>1); .LSEH_info_ecp_nistz256_avx2_gather_wX: .byte 0x01,0x36,0x17,0x0b .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 .align 8 ___ $code.=<<___; .LSEH_info_ecp_nistz256_point_double: .byte 9,0,0,0 .rva full_handler .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] .long 32*5+56,0 .LSEH_info_ecp_nistz256_point_add: .byte 9,0,0,0 .rva full_handler .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] .long 32*18+56,0 .LSEH_info_ecp_nistz256_point_add_affine: .byte 9,0,0,0 .rva full_handler .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] .long 32*15+56,0 ___ $code.=<<___ if ($addx); .align 8 .LSEH_info_ecp_nistz256_point_doublex: .byte 9,0,0,0 .rva full_handler .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] .long 32*5+56,0 .LSEH_info_ecp_nistz256_point_addx: .byte 9,0,0,0 .rva full_handler .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] .long 32*18+56,0 .LSEH_info_ecp_nistz256_point_add_affinex: .byte 9,0,0,0 .rva full_handler .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] .long 32*15+56,0 ___ } ######################################################################## # Convert ecp_nistz256_table.c to layout expected by 
ecp_nistz_gather_w7 # open TABLE,"<ecp_nistz256_table.c" or open TABLE,"<${dir}../ecp_nistz256_table.c" or die "failed to open ecp_nistz256_table.c:",$!; use integer; foreach(<TABLE>) { s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; } close TABLE; die "insane number of elements" if ($#arr != 64*16*37-1); print <<___; .text .globl ecp_nistz256_precomputed .type ecp_nistz256_precomputed,\@object .align 4096 ecp_nistz256_precomputed: ___ while (@line=splice(@arr,0,16)) { print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n"; } print <<___; .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/ec/asm/x25519-x86_64.pl =================================================================== --- head/crypto/openssl/crypto/ec/asm/x25519-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/ec/asm/x25519-x86_64.pl (revision 364822) @@ -1,1131 +1,1131 @@ #!/usr/bin/env perl # Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # X25519 lower-level primitives for x86_64. # # February 2018. # # This module implements radix 2^51 multiplication and squaring, and # radix 2^64 multiplication, squaring, addition, subtraction and final # reduction. Latter radix is used on ADCX/ADOX-capable processors such # as Broadwell. 
On related note one should mention that there are # vector implementations that provide significantly better performance # on some processors(*), but they are large and overly complex. Which # in combination with them being effectively processor-specific makes # the undertaking hard to justify. The goal for this implementation # is rather versatility and simplicity [and ultimately formal # verification]. # # (*) For example sandy2x should provide ~30% improvement on Sandy # Bridge, but only nominal ~5% on Haswell [and big loss on # Broadwell and successors]. # ###################################################################### # Improvement coefficients: # # amd64-51(*) gcc-5.x(**) # # P4 +22% +40% # Sandy Bridge -3% +11% # Haswell -1% +13% # Broadwell(***) +30% +35% # Skylake(***) +33% +47% # Silvermont +20% +26% # Goldmont +40% +50% # Bulldozer +20% +9% # Ryzen(***) +43% +40% # VIA +170% +120% # # (*) amd64-51 is popular assembly implementation with 2^51 radix, # only multiplication and squaring subroutines were linked # for comparison, but not complete ladder step; gain on most # processors is because this module refrains from shld, and # minor regression on others is because this does result in # higher instruction count; # (**) compiler is free to inline functions, in assembly one would # need to implement ladder step to do that, and it will improve # performance by several percent; # (***) ADCX/ADOX result for 2^64 radix, there is no corresponding # C implementation, so that comparison is always against # 2^51 radix; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if 
(`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.23); } if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.10); } if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $addx = ($ver>=3.03); } $code.=<<___; .text .globl x25519_fe51_mul .type x25519_fe51_mul,\@function,3 .align 32 x25519_fe51_mul: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea -8*5(%rsp),%rsp .cfi_adjust_cfa_offset 40 .Lfe51_mul_body: mov 8*0(%rsi),%rax # f[0] mov 8*0(%rdx),%r11 # load g[0-4] mov 8*1(%rdx),%r12 mov 8*2(%rdx),%r13 mov 8*3(%rdx),%rbp mov 8*4(%rdx),%r14 mov %rdi,8*4(%rsp) # offload 1st argument mov %rax,%rdi mulq %r11 # f[0]*g[0] mov %r11,8*0(%rsp) # offload g[0] mov %rax,%rbx # %rbx:%rcx = h0 mov %rdi,%rax mov %rdx,%rcx mulq %r12 # f[0]*g[1] mov %r12,8*1(%rsp) # offload g[1] mov %rax,%r8 # %r8:%r9 = h1 mov %rdi,%rax lea (%r14,%r14,8),%r15 mov %rdx,%r9 mulq %r13 # f[0]*g[2] mov %r13,8*2(%rsp) # offload g[2] mov %rax,%r10 # %r10:%r11 = h2 mov %rdi,%rax lea (%r14,%r15,2),%rdi # g[4]*19 mov %rdx,%r11 mulq %rbp # f[0]*g[3] mov %rax,%r12 # %r12:%r13 = h3 mov 8*0(%rsi),%rax # f[0] mov %rdx,%r13 mulq %r14 # f[0]*g[4] mov %rax,%r14 # %r14:%r15 = h4 mov 8*1(%rsi),%rax # f[1] mov %rdx,%r15 mulq %rdi # f[1]*g[4]*19 add %rax,%rbx mov 8*2(%rsi),%rax # f[2] adc %rdx,%rcx mulq %rdi # f[2]*g[4]*19 add %rax,%r8 mov 8*3(%rsi),%rax # f[3] adc %rdx,%r9 mulq %rdi # f[3]*g[4]*19 add %rax,%r10 mov 
8*4(%rsi),%rax # f[4] adc %rdx,%r11 mulq %rdi # f[4]*g[4]*19 imulq \$19,%rbp,%rdi # g[3]*19 add %rax,%r12 mov 8*1(%rsi),%rax # f[1] adc %rdx,%r13 mulq %rbp # f[1]*g[3] mov 8*2(%rsp),%rbp # g[2] add %rax,%r14 mov 8*2(%rsi),%rax # f[2] adc %rdx,%r15 mulq %rdi # f[2]*g[3]*19 add %rax,%rbx mov 8*3(%rsi),%rax # f[3] adc %rdx,%rcx mulq %rdi # f[3]*g[3]*19 add %rax,%r8 mov 8*4(%rsi),%rax # f[4] adc %rdx,%r9 mulq %rdi # f[4]*g[3]*19 imulq \$19,%rbp,%rdi # g[2]*19 add %rax,%r10 mov 8*1(%rsi),%rax # f[1] adc %rdx,%r11 mulq %rbp # f[1]*g[2] add %rax,%r12 mov 8*2(%rsi),%rax # f[2] adc %rdx,%r13 mulq %rbp # f[2]*g[2] mov 8*1(%rsp),%rbp # g[1] add %rax,%r14 mov 8*3(%rsi),%rax # f[3] adc %rdx,%r15 mulq %rdi # f[3]*g[2]*19 add %rax,%rbx mov 8*4(%rsi),%rax # f[3] adc %rdx,%rcx mulq %rdi # f[4]*g[2]*19 add %rax,%r8 mov 8*1(%rsi),%rax # f[1] adc %rdx,%r9 mulq %rbp # f[1]*g[1] imulq \$19,%rbp,%rdi add %rax,%r10 mov 8*2(%rsi),%rax # f[2] adc %rdx,%r11 mulq %rbp # f[2]*g[1] add %rax,%r12 mov 8*3(%rsi),%rax # f[3] adc %rdx,%r13 mulq %rbp # f[3]*g[1] mov 8*0(%rsp),%rbp # g[0] add %rax,%r14 mov 8*4(%rsi),%rax # f[4] adc %rdx,%r15 mulq %rdi # f[4]*g[1]*19 add %rax,%rbx mov 8*1(%rsi),%rax # f[1] adc %rdx,%rcx mul %rbp # f[1]*g[0] add %rax,%r8 mov 8*2(%rsi),%rax # f[2] adc %rdx,%r9 mul %rbp # f[2]*g[0] add %rax,%r10 mov 8*3(%rsi),%rax # f[3] adc %rdx,%r11 mul %rbp # f[3]*g[0] add %rax,%r12 mov 8*4(%rsi),%rax # f[4] adc %rdx,%r13 mulq %rbp # f[4]*g[0] add %rax,%r14 adc %rdx,%r15 mov 8*4(%rsp),%rdi # restore 1st argument jmp .Lreduce51 .Lfe51_mul_epilogue: .cfi_endproc .size x25519_fe51_mul,.-x25519_fe51_mul .globl x25519_fe51_sqr .type x25519_fe51_sqr,\@function,2 .align 32 x25519_fe51_sqr: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea -8*5(%rsp),%rsp .cfi_adjust_cfa_offset 40 .Lfe51_sqr_body: mov 8*0(%rsi),%rax # g[0] mov 8*2(%rsi),%r15 # g[2] mov 8*4(%rsi),%rbp # g[4] 
mov %rdi,8*4(%rsp) # offload 1st argument lea (%rax,%rax),%r14 mulq %rax # g[0]*g[0] mov %rax,%rbx mov 8*1(%rsi),%rax # g[1] mov %rdx,%rcx mulq %r14 # 2*g[0]*g[1] mov %rax,%r8 mov %r15,%rax mov %r15,8*0(%rsp) # offload g[2] mov %rdx,%r9 mulq %r14 # 2*g[0]*g[2] mov %rax,%r10 mov 8*3(%rsi),%rax # g[3] mov %rdx,%r11 imulq \$19,%rbp,%rdi # g[4]*19 mulq %r14 # 2*g[0]*g[3] mov %rax,%r12 mov %rbp,%rax mov %rdx,%r13 mulq %r14 # 2*g[0]*g[4] mov %rax,%r14 mov %rbp,%rax mov %rdx,%r15 mulq %rdi # g[4]*g[4]*19 add %rax,%r12 mov 8*1(%rsi),%rax # g[1] adc %rdx,%r13 mov 8*3(%rsi),%rsi # g[3] lea (%rax,%rax),%rbp mulq %rax # g[1]*g[1] add %rax,%r10 mov 8*0(%rsp),%rax # g[2] adc %rdx,%r11 mulq %rbp # 2*g[1]*g[2] add %rax,%r12 mov %rbp,%rax adc %rdx,%r13 mulq %rsi # 2*g[1]*g[3] add %rax,%r14 mov %rbp,%rax adc %rdx,%r15 imulq \$19,%rsi,%rbp # g[3]*19 mulq %rdi # 2*g[1]*g[4]*19 add %rax,%rbx lea (%rsi,%rsi),%rax adc %rdx,%rcx mulq %rdi # 2*g[3]*g[4]*19 add %rax,%r10 mov %rsi,%rax adc %rdx,%r11 mulq %rbp # g[3]*g[3]*19 add %rax,%r8 mov 8*0(%rsp),%rax # g[2] adc %rdx,%r9 lea (%rax,%rax),%rsi mulq %rax # g[2]*g[2] add %rax,%r14 mov %rbp,%rax adc %rdx,%r15 mulq %rsi # 2*g[2]*g[3]*19 add %rax,%rbx mov %rsi,%rax adc %rdx,%rcx mulq %rdi # 2*g[2]*g[4]*19 add %rax,%r8 adc %rdx,%r9 mov 8*4(%rsp),%rdi # restore 1st argument jmp .Lreduce51 .align 32 .Lreduce51: mov \$0x7ffffffffffff,%rbp mov %r10,%rdx shr \$51,%r10 shl \$13,%r11 and %rbp,%rdx # %rdx = g2 = h2 & mask or %r10,%r11 # h2>>51 add %r11,%r12 adc \$0,%r13 # h3 += h2>>51 mov %rbx,%rax shr \$51,%rbx shl \$13,%rcx and %rbp,%rax # %rax = g0 = h0 & mask or %rbx,%rcx # h0>>51 add %rcx,%r8 # h1 += h0>>51 adc \$0,%r9 mov %r12,%rbx shr \$51,%r12 shl \$13,%r13 and %rbp,%rbx # %rbx = g3 = h3 & mask or %r12,%r13 # h3>>51 add %r13,%r14 # h4 += h3>>51 adc \$0,%r15 mov %r8,%rcx shr \$51,%r8 shl \$13,%r9 and %rbp,%rcx # %rcx = g1 = h1 & mask or %r8,%r9 # h1>>51 add %r9,%rdx # g2 += h1>>51 mov %r14,%r10 shr \$51,%r14 shl \$13,%r15 and %rbp,%r10 # %r10 = g4 = h4 & mask or
%r14,%r15 # h4>>51 lea (%r15,%r15,8),%r14 lea (%r15,%r14,2),%r15 add %r15,%rax # g0 += (h4>>51)*19 mov %rdx,%r8 and %rbp,%rdx # g2 &= mask shr \$51,%r8 add %r8,%rbx # g3 += g2>>51 mov %rax,%r9 and %rbp,%rax # g0 &= mask shr \$51,%r9 add %r9,%rcx # g1 += g0>>51 mov %rax,8*0(%rdi) # save the result mov %rcx,8*1(%rdi) mov %rdx,8*2(%rdi) mov %rbx,8*3(%rdi) mov %r10,8*4(%rdi) mov 8*5(%rsp),%r15 .cfi_restore %r15 mov 8*6(%rsp),%r14 .cfi_restore %r14 mov 8*7(%rsp),%r13 .cfi_restore %r13 mov 8*8(%rsp),%r12 .cfi_restore %r12 mov 8*9(%rsp),%rbx .cfi_restore %rbx mov 8*10(%rsp),%rbp .cfi_restore %rbp lea 8*11(%rsp),%rsp # release the 88-byte frame (6 pushes + 40 bytes scratch) .cfi_adjust_cfa_offset -88 .Lfe51_sqr_epilogue: ret .cfi_endproc .size x25519_fe51_sqr,.-x25519_fe51_sqr .globl x25519_fe51_mul121666 .type x25519_fe51_mul121666,\@function,2 .align 32 x25519_fe51_mul121666: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea -8*5(%rsp),%rsp .cfi_adjust_cfa_offset 40 .Lfe51_mul121666_body: mov \$121666,%eax mulq 8*0(%rsi) mov %rax,%rbx # %rbx:%rcx = h0 mov \$121666,%eax mov %rdx,%rcx mulq 8*1(%rsi) mov %rax,%r8 # %r8:%r9 = h1 mov \$121666,%eax mov %rdx,%r9 mulq 8*2(%rsi) mov %rax,%r10 # %r10:%r11 = h2 mov \$121666,%eax mov %rdx,%r11 mulq 8*3(%rsi) mov %rax,%r12 # %r12:%r13 = h3 mov \$121666,%eax mov %rdx,%r13 mulq 8*4(%rsi) mov %rax,%r14 # %r14:%r15 = h4 mov %rdx,%r15 jmp .Lreduce51 .Lfe51_mul121666_epilogue: .cfi_endproc .size x25519_fe51_mul121666,.-x25519_fe51_mul121666 ___ ######################################################################## # Base 2^64 subroutines modulo 2*(2^255-19) # if ($addx) { my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15)); $code.=<<___; .extern OPENSSL_ia32cap_P .globl x25519_fe64_eligible .type x25519_fe64_eligible,\@abi-omnipotent .align 32 x25519_fe64_eligible: .cfi_startproc mov OPENSSL_ia32cap_P+8(%rip),%ecx xor %eax,%eax and \$0x80100,%ecx
cmp \$0x80100,%ecx cmove %ecx,%eax ret .cfi_endproc .size x25519_fe64_eligible,.-x25519_fe64_eligible .globl x25519_fe64_mul .type x25519_fe64_mul,\@function,3 .align 32 x25519_fe64_mul: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push %rdi # offload dst .cfi_push %rdi lea -8*2(%rsp),%rsp .cfi_adjust_cfa_offset 16 .Lfe64_mul_body: mov %rdx,%rax mov 8*0(%rdx),%rbp # b[0] mov 8*0(%rsi),%rdx # a[0] mov 8*1(%rax),%rcx # b[1] mov 8*2(%rax),$acc6 # b[2] mov 8*3(%rax),$acc7 # b[3] mulx %rbp,$acc0,%rax # a[0]*b[0] xor %edi,%edi # cf=0,of=0 mulx %rcx,$acc1,%rbx # a[0]*b[1] adcx %rax,$acc1 mulx $acc6,$acc2,%rax # a[0]*b[2] adcx %rbx,$acc2 mulx $acc7,$acc3,$acc4 # a[0]*b[3] mov 8*1(%rsi),%rdx # a[1] adcx %rax,$acc3 mov $acc6,(%rsp) # offload b[2] adcx %rdi,$acc4 # cf=0 mulx %rbp,%rax,%rbx # a[1]*b[0] adox %rax,$acc1 adcx %rbx,$acc2 mulx %rcx,%rax,%rbx # a[1]*b[1] adox %rax,$acc2 adcx %rbx,$acc3 mulx $acc6,%rax,%rbx # a[1]*b[2] adox %rax,$acc3 adcx %rbx,$acc4 mulx $acc7,%rax,$acc5 # a[1]*b[3] mov 8*2(%rsi),%rdx # a[2] adox %rax,$acc4 adcx %rdi,$acc5 # cf=0 adox %rdi,$acc5 # of=0 mulx %rbp,%rax,%rbx # a[2]*b[0] adcx %rax,$acc2 adox %rbx,$acc3 mulx %rcx,%rax,%rbx # a[2]*b[1] adcx %rax,$acc3 adox %rbx,$acc4 mulx $acc6,%rax,%rbx # a[2]*b[2] adcx %rax,$acc4 adox %rbx,$acc5 mulx $acc7,%rax,$acc6 # a[2]*b[3] mov 8*3(%rsi),%rdx # a[3] adcx %rax,$acc5 adox %rdi,$acc6 # of=0 adcx %rdi,$acc6 # cf=0 mulx %rbp,%rax,%rbx # a[3]*b[0] adox %rax,$acc3 adcx %rbx,$acc4 mulx %rcx,%rax,%rbx # a[3]*b[1] adox %rax,$acc4 adcx %rbx,$acc5 mulx (%rsp),%rax,%rbx # a[3]*b[2] adox %rax,$acc5 adcx %rbx,$acc6 mulx $acc7,%rax,$acc7 # a[3]*b[3] mov \$38,%edx adox %rax,$acc6 adcx %rdi,$acc7 # cf=0 adox %rdi,$acc7 # of=0 jmp .Lreduce64 .Lfe64_mul_epilogue: .cfi_endproc .size x25519_fe64_mul,.-x25519_fe64_mul .globl x25519_fe64_sqr .type x25519_fe64_sqr,\@function,2 .align 32 x25519_fe64_sqr: 
.cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push %rdi # offload dst .cfi_push %rdi lea -8*2(%rsp),%rsp .cfi_adjust_cfa_offset 16 .Lfe64_sqr_body: mov 8*0(%rsi),%rdx # a[0] mov 8*1(%rsi),%rcx # a[1] mov 8*2(%rsi),%rbp # a[2] mov 8*3(%rsi),%rsi # a[3] ################################################################ mulx %rdx,$acc0,$acc7 # a[0]*a[0] mulx %rcx,$acc1,%rax # a[0]*a[1] xor %edi,%edi # cf=0,of=0 mulx %rbp,$acc2,%rbx # a[0]*a[2] adcx %rax,$acc2 mulx %rsi,$acc3,$acc4 # a[0]*a[3] mov %rcx,%rdx # a[1] adcx %rbx,$acc3 adcx %rdi,$acc4 # cf=0 ################################################################ mulx %rbp,%rax,%rbx # a[1]*a[2] adox %rax,$acc3 adcx %rbx,$acc4 mulx %rsi,%rax,$acc5 # a[1]*a[3] mov %rbp,%rdx # a[2] adox %rax,$acc4 adcx %rdi,$acc5 ################################################################ mulx %rsi,%rax,$acc6 # a[2]*a[3] mov %rcx,%rdx # a[1] adox %rax,$acc5 adcx %rdi,$acc6 # cf=0 adox %rdi,$acc6 # of=0 adcx $acc1,$acc1 # acc1:6<<1 adox $acc7,$acc1 adcx $acc2,$acc2 mulx %rdx,%rax,%rbx # a[1]*a[1] mov %rbp,%rdx # a[2] adcx $acc3,$acc3 adox %rax,$acc2 adcx $acc4,$acc4 adox %rbx,$acc3 mulx %rdx,%rax,%rbx # a[2]*a[2] mov %rsi,%rdx # a[3] adcx $acc5,$acc5 adox %rax,$acc4 adcx $acc6,$acc6 adox %rbx,$acc5 mulx %rdx,%rax,$acc7 # a[3]*a[3] mov \$38,%edx adox %rax,$acc6 adcx %rdi,$acc7 # cf=0 adox %rdi,$acc7 # of=0 jmp .Lreduce64 .align 32 .Lreduce64: mulx $acc4,%rax,%rbx adcx %rax,$acc0 adox %rbx,$acc1 mulx $acc5,%rax,%rbx adcx %rax,$acc1 adox %rbx,$acc2 mulx $acc6,%rax,%rbx adcx %rax,$acc2 adox %rbx,$acc3 mulx $acc7,%rax,$acc4 adcx %rax,$acc3 adox %rdi,$acc4 adcx %rdi,$acc4 mov 8*2(%rsp),%rdi # restore dst imulq %rdx,$acc4 add $acc4,$acc0 adc \$0,$acc1 adc \$0,$acc2 adc \$0,$acc3 sbb %rax,%rax # cf -> mask and \$38,%rax add %rax,$acc0 mov $acc1,8*1(%rdi) mov $acc2,8*2(%rdi) mov $acc3,8*3(%rdi) mov $acc0,8*0(%rdi) mov 
8*3(%rsp),%r15 .cfi_restore %r15 mov 8*4(%rsp),%r14 .cfi_restore %r14 mov 8*5(%rsp),%r13 .cfi_restore %r13 mov 8*6(%rsp),%r12 .cfi_restore %r12 mov 8*7(%rsp),%rbx .cfi_restore %rbx mov 8*8(%rsp),%rbp .cfi_restore %rbp lea 8*9(%rsp),%rsp # release the 72-byte frame (7 pushes + 16 bytes scratch) .cfi_adjust_cfa_offset -72 .Lfe64_sqr_epilogue: ret .cfi_endproc .size x25519_fe64_sqr,.-x25519_fe64_sqr .globl x25519_fe64_mul121666 .type x25519_fe64_mul121666,\@function,2 .align 32 x25519_fe64_mul121666: .Lfe64_mul121666_body: .cfi_startproc mov \$121666,%edx mulx 8*0(%rsi),$acc0,%rcx mulx 8*1(%rsi),$acc1,%rax add %rcx,$acc1 mulx 8*2(%rsi),$acc2,%rcx adc %rax,$acc2 mulx 8*3(%rsi),$acc3,%rax adc %rcx,$acc3 adc \$0,%rax imulq \$38,%rax,%rax add %rax,$acc0 adc \$0,$acc1 adc \$0,$acc2 adc \$0,$acc3 sbb %rax,%rax # cf -> mask and \$38,%rax add %rax,$acc0 mov $acc1,8*1(%rdi) mov $acc2,8*2(%rdi) mov $acc3,8*3(%rdi) mov $acc0,8*0(%rdi) .Lfe64_mul121666_epilogue: ret .cfi_endproc .size x25519_fe64_mul121666,.-x25519_fe64_mul121666 .globl x25519_fe64_add .type x25519_fe64_add,\@function,3 .align 32 x25519_fe64_add: .Lfe64_add_body: .cfi_startproc mov 8*0(%rsi),$acc0 mov 8*1(%rsi),$acc1 mov 8*2(%rsi),$acc2 mov 8*3(%rsi),$acc3 add 8*0(%rdx),$acc0 adc 8*1(%rdx),$acc1 adc 8*2(%rdx),$acc2 adc 8*3(%rdx),$acc3 sbb %rax,%rax # cf -> mask and \$38,%rax add %rax,$acc0 adc \$0,$acc1 adc \$0,$acc2 mov $acc1,8*1(%rdi) adc \$0,$acc3 mov $acc2,8*2(%rdi) sbb %rax,%rax # cf -> mask mov $acc3,8*3(%rdi) and \$38,%rax add %rax,$acc0 mov $acc0,8*0(%rdi) .Lfe64_add_epilogue: ret .cfi_endproc .size x25519_fe64_add,.-x25519_fe64_add .globl x25519_fe64_sub .type x25519_fe64_sub,\@function,3 .align 32 x25519_fe64_sub: .Lfe64_sub_body: .cfi_startproc mov 8*0(%rsi),$acc0 mov 8*1(%rsi),$acc1 mov 8*2(%rsi),$acc2 mov 8*3(%rsi),$acc3 sub 8*0(%rdx),$acc0 sbb 8*1(%rdx),$acc1 sbb 8*2(%rdx),$acc2 sbb 8*3(%rdx),$acc3 sbb %rax,%rax # cf -> mask and \$38,%rax sub %rax,$acc0 sbb \$0,$acc1 sbb \$0,$acc2 mov $acc1,8*1(%rdi) sbb \$0,$acc3 mov $acc2,8*2(%rdi) sbb %rax,%rax # cf ->
mask mov $acc3,8*3(%rdi) and \$38,%rax sub %rax,$acc0 mov $acc0,8*0(%rdi) .Lfe64_sub_epilogue: ret .cfi_endproc .size x25519_fe64_sub,.-x25519_fe64_sub .globl x25519_fe64_tobytes .type x25519_fe64_tobytes,\@function,2 .align 32 x25519_fe64_tobytes: .Lfe64_to_body: .cfi_startproc mov 8*0(%rsi),$acc0 mov 8*1(%rsi),$acc1 mov 8*2(%rsi),$acc2 mov 8*3(%rsi),$acc3 ################################# reduction modulo 2^255-19 lea ($acc3,$acc3),%rax sar \$63,$acc3 # most significant bit -> mask shr \$1,%rax # most significant bit cleared and \$19,$acc3 add \$19,$acc3 # compare to modulus in the same go add $acc3,$acc0 adc \$0,$acc1 adc \$0,$acc2 adc \$0,%rax lea (%rax,%rax),$acc3 sar \$63,%rax # most significant bit -> mask shr \$1,$acc3 # most significant bit cleared not %rax and \$19,%rax sub %rax,$acc0 sbb \$0,$acc1 sbb \$0,$acc2 sbb \$0,$acc3 mov $acc0,8*0(%rdi) mov $acc1,8*1(%rdi) mov $acc2,8*2(%rdi) mov $acc3,8*3(%rdi) .Lfe64_to_epilogue: ret .cfi_endproc .size x25519_fe64_tobytes,.-x25519_fe64_tobytes ___ } else { $code.=<<___; .globl x25519_fe64_eligible .type x25519_fe64_eligible,\@abi-omnipotent .align 32 x25519_fe64_eligible: .cfi_startproc xor %eax,%eax ret .cfi_endproc .size x25519_fe64_eligible,.-x25519_fe64_eligible .globl x25519_fe64_mul .type x25519_fe64_mul,\@abi-omnipotent .globl x25519_fe64_sqr .globl x25519_fe64_mul121666 .globl x25519_fe64_add .globl x25519_fe64_sub .globl x25519_fe64_tobytes x25519_fe64_mul: x25519_fe64_sqr: x25519_fe64_mul121666: x25519_fe64_add: x25519_fe64_sub: x25519_fe64_tobytes: .cfi_startproc .byte 0x0f,0x0b # ud2 ret .cfi_endproc .size x25519_fe64_mul,.-x25519_fe64_mul ___ } $code.=<<___; .asciz "X25519 primitives for x86_64, CRYPTOGAMS by " ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type short_handler,\@abi-omnipotent .align 16 
short_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # is context->Rip before the end-of-prologue label? jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp jmp .Lcommon_seh_tail .size short_handler,.-short_handler .type full_handler,\@abi-omnipotent .align 16 full_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # is context->Rip before the end-of-prologue label? jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 8(%r11),%r10d # HandlerData[2] lea (%rax,%r10),%rax # context->Rsp + HandlerData[2] mov -8(%rax),%rbp mov -16(%rax),%rbx mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 #
disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size full_handler,.-full_handler .section .pdata .align 4 .rva .LSEH_begin_x25519_fe51_mul .rva .LSEH_end_x25519_fe51_mul .rva .LSEH_info_x25519_fe51_mul .rva .LSEH_begin_x25519_fe51_sqr .rva .LSEH_end_x25519_fe51_sqr .rva .LSEH_info_x25519_fe51_sqr .rva .LSEH_begin_x25519_fe51_mul121666 .rva .LSEH_end_x25519_fe51_mul121666 .rva .LSEH_info_x25519_fe51_mul121666 ___ $code.=<<___ if ($addx); .rva .LSEH_begin_x25519_fe64_mul .rva .LSEH_end_x25519_fe64_mul .rva .LSEH_info_x25519_fe64_mul .rva .LSEH_begin_x25519_fe64_sqr .rva .LSEH_end_x25519_fe64_sqr .rva .LSEH_info_x25519_fe64_sqr .rva .LSEH_begin_x25519_fe64_mul121666 .rva .LSEH_end_x25519_fe64_mul121666 .rva .LSEH_info_x25519_fe64_mul121666 .rva .LSEH_begin_x25519_fe64_add .rva .LSEH_end_x25519_fe64_add .rva .LSEH_info_x25519_fe64_add .rva .LSEH_begin_x25519_fe64_sub .rva .LSEH_end_x25519_fe64_sub .rva .LSEH_info_x25519_fe64_sub .rva .LSEH_begin_x25519_fe64_tobytes .rva .LSEH_end_x25519_fe64_tobytes .rva .LSEH_info_x25519_fe64_tobytes ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_x25519_fe51_mul: .byte 9,0,0,0 .rva full_handler .rva .Lfe51_mul_body,.Lfe51_mul_epilogue # HandlerData[] .long 88,0 .LSEH_info_x25519_fe51_sqr: .byte 9,0,0,0 .rva full_handler .rva .Lfe51_sqr_body,.Lfe51_sqr_epilogue # HandlerData[] .long 88,0 .LSEH_info_x25519_fe51_mul121666: .byte 9,0,0,0 .rva full_handler .rva .Lfe51_mul121666_body,.Lfe51_mul121666_epilogue # HandlerData[] .long 88,0 ___ $code.=<<___ if ($addx); .LSEH_info_x25519_fe64_mul: .byte 9,0,0,0 .rva full_handler .rva .Lfe64_mul_body,.Lfe64_mul_epilogue # HandlerData[] .long 72,0 
.LSEH_info_x25519_fe64_sqr: .byte 9,0,0,0 .rva full_handler .rva .Lfe64_sqr_body,.Lfe64_sqr_epilogue # HandlerData[] .long 72,0 .LSEH_info_x25519_fe64_mul121666: .byte 9,0,0,0 .rva short_handler .rva .Lfe64_mul121666_body,.Lfe64_mul121666_epilogue # HandlerData[] .LSEH_info_x25519_fe64_add: .byte 9,0,0,0 .rva short_handler .rva .Lfe64_add_body,.Lfe64_add_epilogue # HandlerData[] .LSEH_info_x25519_fe64_sub: .byte 9,0,0,0 .rva short_handler .rva .Lfe64_sub_body,.Lfe64_sub_epilogue # HandlerData[] .LSEH_info_x25519_fe64_tobytes: .byte 9,0,0,0 .rva short_handler .rva .Lfe64_to_body,.Lfe64_to_epilogue # HandlerData[] ___ } $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl =================================================================== --- head/crypto/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl (revision 364822) @@ -1,1107 +1,1107 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # # AES-NI-CTR+GHASH stitch. 
# # February 2013 # # OpenSSL GCM implementation is organized in such way that its # performance is rather close to the sum of its streamed components, # in the context parallelized AES-NI CTR and modulo-scheduled # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation # was observed to perform significantly better than the sum of the # components on contemporary CPUs, the effort was deemed impossible to # justify. This module is based on combination of Intel submissions, # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max # Locktyukhin of Intel Corp. who verified that it reduces shuffles # pressure with notable relative improvement, achieving 1.0 cycle per # byte processed with 128-bit key on Haswell processor, 0.74 - on # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled # measurements for favourable packet size, one divisible by 96. # Applications using the EVP interface will observe a few percent # worse performance.] # # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). 
# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.20) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if ($avx>1) {{{ ($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); ($Ii,$T1,$T2,$Hkey, $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8)); ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15)); ($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15"); $code=<<___; .text .type _aesni_ctr32_ghash_6x,\@abi-omnipotent .align 32 _aesni_ctr32_ghash_6x: .cfi_startproc vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb sub \$6,$len vpxor $Z0,$Z0,$Z0 # $Z0 = 0 vmovdqu 0x00-0x80($key),$rndkey vpaddb $T2,$T1,$inout1 vpaddb $T2,$inout1,$inout2 vpaddb $T2,$inout2,$inout3 vpaddb $T2,$inout3,$inout4 vpaddb $T2,$inout4,$inout5 vpxor 
$rndkey,$T1,$inout0 vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0 jmp .Loop6x .align 32 .Loop6x: add \$`6<<24`,$counter jc .Lhandle_ctr32 # discard $inout[1-5]? vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 vpaddb $T2,$inout5,$T1 # next counter value vpxor $rndkey,$inout1,$inout1 vpxor $rndkey,$inout2,$inout2 .Lresume_ctr32: vmovdqu $T1,($ivp) # save next counter value vpclmulqdq \$0x10,$Hkey,$Z3,$Z1 vpxor $rndkey,$inout3,$inout3 vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey vpclmulqdq \$0x01,$Hkey,$Z3,$Z2 xor %r12,%r12 cmp $in0,$end0 vaesenc $T2,$inout0,$inout0 vmovdqu 0x30+8(%rsp),$Ii # I[4] vpxor $rndkey,$inout4,$inout4 vpclmulqdq \$0x00,$Hkey,$Z3,$T1 vaesenc $T2,$inout1,$inout1 vpxor $rndkey,$inout5,$inout5 setnc %r12b vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 vaesenc $T2,$inout2,$inout2 vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2 neg %r12 vaesenc $T2,$inout3,$inout3 vpxor $Z1,$Z2,$Z2 vpclmulqdq \$0x00,$Hkey,$Ii,$Z1 vpxor $Z0,$Xi,$Xi # modulo-scheduled vaesenc $T2,$inout4,$inout4 vpxor $Z1,$T1,$Z0 and \$0x60,%r12 vmovups 0x20-0x80($key),$rndkey vpclmulqdq \$0x10,$Hkey,$Ii,$T1 vaesenc $T2,$inout5,$inout5 vpclmulqdq \$0x01,$Hkey,$Ii,$T2 lea ($in0,%r12),$in0 vaesenc $rndkey,$inout0,$inout0 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi] vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey vmovdqu 0x40+8(%rsp),$Ii # I[3] vaesenc $rndkey,$inout1,$inout1 movbe 0x58($in0),%r13 vaesenc $rndkey,$inout2,$inout2 movbe 0x50($in0),%r12 vaesenc $rndkey,$inout3,$inout3 mov %r13,0x20+8(%rsp) vaesenc $rndkey,$inout4,$inout4 mov %r12,0x28+8(%rsp) vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3 vaesenc $rndkey,$inout5,$inout5 vmovups 0x30-0x80($key),$rndkey vpxor $T1,$Z2,$Z2 vpclmulqdq \$0x00,$Z1,$Ii,$T1 vaesenc $rndkey,$inout0,$inout0 vpxor $T2,$Z2,$Z2 vpclmulqdq \$0x10,$Z1,$Ii,$T2 vaesenc $rndkey,$inout1,$inout1 vpxor $Hkey,$Z3,$Z3 vpclmulqdq \$0x01,$Z1,$Ii,$Hkey vaesenc $rndkey,$inout2,$inout2 vpclmulqdq \$0x11,$Z1,$Ii,$Z1 vmovdqu 0x50+8(%rsp),$Ii # I[2] vaesenc $rndkey,$inout3,$inout3 vaesenc 
$rndkey,$inout4,$inout4 vpxor $T1,$Z0,$Z0 vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4 vaesenc $rndkey,$inout5,$inout5 vmovups 0x40-0x80($key),$rndkey vpxor $T2,$Z2,$Z2 vpclmulqdq \$0x00,$T1,$Ii,$T2 vaesenc $rndkey,$inout0,$inout0 vpxor $Hkey,$Z2,$Z2 vpclmulqdq \$0x10,$T1,$Ii,$Hkey vaesenc $rndkey,$inout1,$inout1 movbe 0x48($in0),%r13 vpxor $Z1,$Z3,$Z3 vpclmulqdq \$0x01,$T1,$Ii,$Z1 vaesenc $rndkey,$inout2,$inout2 movbe 0x40($in0),%r12 vpclmulqdq \$0x11,$T1,$Ii,$T1 vmovdqu 0x60+8(%rsp),$Ii # I[1] vaesenc $rndkey,$inout3,$inout3 mov %r13,0x30+8(%rsp) vaesenc $rndkey,$inout4,$inout4 mov %r12,0x38+8(%rsp) vpxor $T2,$Z0,$Z0 vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5 vaesenc $rndkey,$inout5,$inout5 vmovups 0x50-0x80($key),$rndkey vpxor $Hkey,$Z2,$Z2 vpclmulqdq \$0x00,$T2,$Ii,$Hkey vaesenc $rndkey,$inout0,$inout0 vpxor $Z1,$Z2,$Z2 vpclmulqdq \$0x10,$T2,$Ii,$Z1 vaesenc $rndkey,$inout1,$inout1 movbe 0x38($in0),%r13 vpxor $T1,$Z3,$Z3 vpclmulqdq \$0x01,$T2,$Ii,$T1 vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0] vaesenc $rndkey,$inout2,$inout2 movbe 0x30($in0),%r12 vpclmulqdq \$0x11,$T2,$Ii,$T2 vaesenc $rndkey,$inout3,$inout3 mov %r13,0x40+8(%rsp) vaesenc $rndkey,$inout4,$inout4 mov %r12,0x48+8(%rsp) vpxor $Hkey,$Z0,$Z0 vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6 vaesenc $rndkey,$inout5,$inout5 vmovups 0x60-0x80($key),$rndkey vpxor $Z1,$Z2,$Z2 vpclmulqdq \$0x10,$Hkey,$Xi,$Z1 vaesenc $rndkey,$inout0,$inout0 vpxor $T1,$Z2,$Z2 vpclmulqdq \$0x01,$Hkey,$Xi,$T1 vaesenc $rndkey,$inout1,$inout1 movbe 0x28($in0),%r13 vpxor $T2,$Z3,$Z3 vpclmulqdq \$0x00,$Hkey,$Xi,$T2 vaesenc $rndkey,$inout2,$inout2 movbe 0x20($in0),%r12 vpclmulqdq \$0x11,$Hkey,$Xi,$Xi vaesenc $rndkey,$inout3,$inout3 mov %r13,0x50+8(%rsp) vaesenc $rndkey,$inout4,$inout4 mov %r12,0x58+8(%rsp) vpxor $Z1,$Z2,$Z2 vaesenc $rndkey,$inout5,$inout5 vpxor $T1,$Z2,$Z2 vmovups 0x70-0x80($key),$rndkey vpslldq \$8,$Z2,$Z1 vpxor $T2,$Z0,$Z0 vmovdqu 0x10($const),$Hkey # .Lpoly vaesenc $rndkey,$inout0,$inout0 vpxor 
$Xi,$Z3,$Z3 vaesenc $rndkey,$inout1,$inout1 vpxor $Z1,$Z0,$Z0 movbe 0x18($in0),%r13 vaesenc $rndkey,$inout2,$inout2 movbe 0x10($in0),%r12 vpalignr \$8,$Z0,$Z0,$Ii # 1st phase vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 mov %r13,0x60+8(%rsp) vaesenc $rndkey,$inout3,$inout3 mov %r12,0x68+8(%rsp) vaesenc $rndkey,$inout4,$inout4 vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey vaesenc $rndkey,$inout5,$inout5 vaesenc $T1,$inout0,$inout0 vmovups 0x90-0x80($key),$rndkey vaesenc $T1,$inout1,$inout1 vpsrldq \$8,$Z2,$Z2 vaesenc $T1,$inout2,$inout2 vpxor $Z2,$Z3,$Z3 vaesenc $T1,$inout3,$inout3 vpxor $Ii,$Z0,$Z0 movbe 0x08($in0),%r13 vaesenc $T1,$inout4,$inout4 movbe 0x00($in0),%r12 vaesenc $T1,$inout5,$inout5 vmovups 0xa0-0x80($key),$T1 cmp \$11,$rounds jb .Lenc_tail # 128-bit key vaesenc $rndkey,$inout0,$inout0 vaesenc $rndkey,$inout1,$inout1 vaesenc $rndkey,$inout2,$inout2 vaesenc $rndkey,$inout3,$inout3 vaesenc $rndkey,$inout4,$inout4 vaesenc $rndkey,$inout5,$inout5 vaesenc $T1,$inout0,$inout0 vaesenc $T1,$inout1,$inout1 vaesenc $T1,$inout2,$inout2 vaesenc $T1,$inout3,$inout3 vaesenc $T1,$inout4,$inout4 vmovups 0xb0-0x80($key),$rndkey vaesenc $T1,$inout5,$inout5 vmovups 0xc0-0x80($key),$T1 je .Lenc_tail # 192-bit key vaesenc $rndkey,$inout0,$inout0 vaesenc $rndkey,$inout1,$inout1 vaesenc $rndkey,$inout2,$inout2 vaesenc $rndkey,$inout3,$inout3 vaesenc $rndkey,$inout4,$inout4 vaesenc $rndkey,$inout5,$inout5 vaesenc $T1,$inout0,$inout0 vaesenc $T1,$inout1,$inout1 vaesenc $T1,$inout2,$inout2 vaesenc $T1,$inout3,$inout3 vaesenc $T1,$inout4,$inout4 vmovups 0xd0-0x80($key),$rndkey vaesenc $T1,$inout5,$inout5 vmovups 0xe0-0x80($key),$T1 jmp .Lenc_tail # 256-bit key .align 32 .Lhandle_ctr32: vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask vpshufb $Ii,$T1,$Z2 # byte-swap counter vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb vpaddd $Z1,$Z2,$inout2 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 vpaddd $Z1,$inout1,$inout3 vpshufb $Ii,$inout1,$inout1 
vpaddd $Z1,$inout2,$inout4 vpshufb $Ii,$inout2,$inout2 vpxor $rndkey,$inout1,$inout1 vpaddd $Z1,$inout3,$inout5 vpshufb $Ii,$inout3,$inout3 vpxor $rndkey,$inout2,$inout2 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value vpshufb $Ii,$inout4,$inout4 vpshufb $Ii,$inout5,$inout5 vpshufb $Ii,$T1,$T1 # next counter value jmp .Lresume_ctr32 .align 32 .Lenc_tail: vaesenc $rndkey,$inout0,$inout0 vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase vaesenc $rndkey,$inout1,$inout1 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 vpxor 0x00($inp),$T1,$T2 vaesenc $rndkey,$inout2,$inout2 vpxor 0x10($inp),$T1,$Ii vaesenc $rndkey,$inout3,$inout3 vpxor 0x20($inp),$T1,$Z1 vaesenc $rndkey,$inout4,$inout4 vpxor 0x30($inp),$T1,$Z2 vaesenc $rndkey,$inout5,$inout5 vpxor 0x40($inp),$T1,$Z3 vpxor 0x50($inp),$T1,$Hkey vmovdqu ($ivp),$T1 # load next counter value vaesenclast $T2,$inout0,$inout0 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb vaesenclast $Ii,$inout1,$inout1 vpaddb $T2,$T1,$Ii mov %r13,0x70+8(%rsp) lea 0x60($inp),$inp vaesenclast $Z1,$inout2,$inout2 vpaddb $T2,$Ii,$Z1 mov %r12,0x78+8(%rsp) lea 0x60($out),$out vmovdqu 0x00-0x80($key),$rndkey vaesenclast $Z2,$inout3,$inout3 vpaddb $T2,$Z1,$Z2 vaesenclast $Z3, $inout4,$inout4 vpaddb $T2,$Z2,$Z3 vaesenclast $Hkey,$inout5,$inout5 vpaddb $T2,$Z3,$Hkey add \$0x60,$ret sub \$0x6,$len jc .L6x_done vmovups $inout0,-0x60($out) # save output vpxor $rndkey,$T1,$inout0 vmovups $inout1,-0x50($out) vmovdqa $Ii,$inout1 # 0 latency vmovups $inout2,-0x40($out) vmovdqa $Z1,$inout2 # 0 latency vmovups $inout3,-0x30($out) vmovdqa $Z2,$inout3 # 0 latency vmovups $inout4,-0x20($out) vmovdqa $Z3,$inout4 # 0 latency vmovups $inout5,-0x10($out) vmovdqa $Hkey,$inout5 # 0 latency vmovdqu 0x20+8(%rsp),$Z3 # I[5] jmp .Loop6x .L6x_done: vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled vpxor $Z0,$Xi,$Xi # modulo-scheduled ret .cfi_endproc .size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x ___ 
###################################################################### # # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len, # const AES_KEY *key, unsigned char iv[16], # struct { u128 Xi,H,Htbl[9]; } *Xip); $code.=<<___; .globl aesni_gcm_decrypt .type aesni_gcm_decrypt,\@function,6 .align 32 aesni_gcm_decrypt: .cfi_startproc xor $ret,$ret cmp \$0x60,$len # minimal accepted length jb .Lgcm_dec_abort lea (%rsp),%rax # save stack pointer .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,-0xd8(%rax) movaps %xmm7,-0xc8(%rax) movaps %xmm8,-0xb8(%rax) movaps %xmm9,-0xa8(%rax) movaps %xmm10,-0x98(%rax) movaps %xmm11,-0x88(%rax) movaps %xmm12,-0x78(%rax) movaps %xmm13,-0x68(%rax) movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) .Lgcm_dec_body: ___ $code.=<<___; vzeroupper vmovdqu ($ivp),$T1 # input counter value add \$-128,%rsp mov 12($ivp),$counter lea .Lbswap_mask(%rip),$const lea -0x80($key),$in0 # borrow $in0 mov \$0xf80,$end0 # borrow $end0 vmovdqu ($Xip),$Xi # load Xi and \$-128,%rsp # ensure stack alignment vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask lea 0x80($key),$key # size optimization lea 0x20+0x20($Xip),$Xip # size optimization mov 0xf0-0x80($key),$rounds vpshufb $Ii,$Xi,$Xi and $end0,$in0 and %rsp,$end0 sub $in0,$end0 jc .Ldec_no_key_aliasing cmp \$768,$end0 jnc .Ldec_no_key_aliasing sub $end0,%rsp # avoid aliasing with key .Ldec_no_key_aliasing: vmovdqu 0x50($inp),$Z3 # I[5] lea ($inp),$in0 vmovdqu 0x40($inp),$Z0 lea -0xc0($inp,$len),$end0 vmovdqu 0x30($inp),$Z1 shr \$4,$len xor $ret,$ret vmovdqu 0x20($inp),$Z2 vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x vmovdqu 0x10($inp),$T2 vpshufb $Ii,$Z0,$Z0 vmovdqu ($inp),$Hkey vpshufb $Ii,$Z1,$Z1 vmovdqu $Z0,0x30(%rsp) vpshufb $Ii,$Z2,$Z2 vmovdqu $Z1,0x40(%rsp) vpshufb $Ii,$T2,$T2 vmovdqu 
$Z2,0x50(%rsp) vpshufb $Ii,$Hkey,$Hkey vmovdqu $T2,0x60(%rsp) vmovdqu $Hkey,0x70(%rsp) call _aesni_ctr32_ghash_6x vmovups $inout0,-0x60($out) # save output vmovups $inout1,-0x50($out) vmovups $inout2,-0x40($out) vmovups $inout3,-0x30($out) vmovups $inout4,-0x20($out) vmovups $inout5,-0x10($out) vpshufb ($const),$Xi,$Xi # .Lbswap_mask vmovdqu $Xi,-0x40($Xip) # output Xi vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp # restore %rsp .cfi_def_cfa_register %rsp .Lgcm_dec_abort: mov $ret,%rax # return value ret .cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt ___ $code.=<<___; .type _aesni_ctr32_6x,\@abi-omnipotent .align 32 _aesni_ctr32_6x: .cfi_startproc vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb lea -1($rounds),%r13 vmovups 0x10-0x80($key),$rndkey lea 0x20-0x80($key),%r12 vpxor $Z0,$T1,$inout0 add \$`6<<24`,$counter jc .Lhandle_ctr32_2 vpaddb $T2,$T1,$inout1 vpaddb $T2,$inout1,$inout2 vpxor $Z0,$inout1,$inout1 vpaddb $T2,$inout2,$inout3 vpxor $Z0,$inout2,$inout2 vpaddb $T2,$inout3,$inout4 vpxor $Z0,$inout3,$inout3 vpaddb $T2,$inout4,$inout5 vpxor $Z0,$inout4,$inout4 vpaddb $T2,$inout5,$T1 vpxor $Z0,$inout5,$inout5 jmp .Loop_ctr32 .align 16 .Loop_ctr32: vaesenc $rndkey,$inout0,$inout0 vaesenc $rndkey,$inout1,$inout1 vaesenc $rndkey,$inout2,$inout2 vaesenc $rndkey,$inout3,$inout3 vaesenc $rndkey,$inout4,$inout4 vaesenc $rndkey,$inout5,$inout5 vmovups (%r12),$rndkey lea 0x10(%r12),%r12 dec %r13d jnz 
.Loop_ctr32 vmovdqu (%r12),$Hkey # last round key vaesenc $rndkey,$inout0,$inout0 vpxor 0x00($inp),$Hkey,$Z0 vaesenc $rndkey,$inout1,$inout1 vpxor 0x10($inp),$Hkey,$Z1 vaesenc $rndkey,$inout2,$inout2 vpxor 0x20($inp),$Hkey,$Z2 vaesenc $rndkey,$inout3,$inout3 vpxor 0x30($inp),$Hkey,$Xi vaesenc $rndkey,$inout4,$inout4 vpxor 0x40($inp),$Hkey,$T2 vaesenc $rndkey,$inout5,$inout5 vpxor 0x50($inp),$Hkey,$Hkey lea 0x60($inp),$inp vaesenclast $Z0,$inout0,$inout0 vaesenclast $Z1,$inout1,$inout1 vaesenclast $Z2,$inout2,$inout2 vaesenclast $Xi,$inout3,$inout3 vaesenclast $T2,$inout4,$inout4 vaesenclast $Hkey,$inout5,$inout5 vmovups $inout0,0x00($out) vmovups $inout1,0x10($out) vmovups $inout2,0x20($out) vmovups $inout3,0x30($out) vmovups $inout4,0x40($out) vmovups $inout5,0x50($out) lea 0x60($out),$out ret .align 32 .Lhandle_ctr32_2: vpshufb $Ii,$T1,$Z2 # byte-swap counter vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb vpaddd $Z1,$Z2,$inout2 vpaddd $Z1,$inout1,$inout3 vpshufb $Ii,$inout1,$inout1 vpaddd $Z1,$inout2,$inout4 vpshufb $Ii,$inout2,$inout2 vpxor $Z0,$inout1,$inout1 vpaddd $Z1,$inout3,$inout5 vpshufb $Ii,$inout3,$inout3 vpxor $Z0,$inout2,$inout2 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value vpshufb $Ii,$inout4,$inout4 vpxor $Z0,$inout3,$inout3 vpshufb $Ii,$inout5,$inout5 vpxor $Z0,$inout4,$inout4 vpshufb $Ii,$T1,$T1 # next counter value vpxor $Z0,$inout5,$inout5 jmp .Loop_ctr32 .cfi_endproc .size _aesni_ctr32_6x,.-_aesni_ctr32_6x .globl aesni_gcm_encrypt .type aesni_gcm_encrypt,\@function,6 .align 32 aesni_gcm_encrypt: .cfi_startproc xor $ret,$ret cmp \$0x60*3,$len # minimal accepted length jb .Lgcm_enc_abort lea (%rsp),%rax # save stack pointer .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,-0xd8(%rax) movaps 
%xmm7,-0xc8(%rax) movaps %xmm8,-0xb8(%rax) movaps %xmm9,-0xa8(%rax) movaps %xmm10,-0x98(%rax) movaps %xmm11,-0x88(%rax) movaps %xmm12,-0x78(%rax) movaps %xmm13,-0x68(%rax) movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) .Lgcm_enc_body: ___ $code.=<<___; vzeroupper vmovdqu ($ivp),$T1 # input counter value add \$-128,%rsp mov 12($ivp),$counter lea .Lbswap_mask(%rip),$const lea -0x80($key),$in0 # borrow $in0 mov \$0xf80,$end0 # borrow $end0 lea 0x80($key),$key # size optimization vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask and \$-128,%rsp # ensure stack alignment mov 0xf0-0x80($key),$rounds and $end0,$in0 and %rsp,$end0 sub $in0,$end0 jc .Lenc_no_key_aliasing cmp \$768,$end0 jnc .Lenc_no_key_aliasing sub $end0,%rsp # avoid aliasing with key .Lenc_no_key_aliasing: lea ($out),$in0 lea -0xc0($out,$len),$end0 shr \$4,$len call _aesni_ctr32_6x vpshufb $Ii,$inout0,$Xi # save bswapped output on stack vpshufb $Ii,$inout1,$T2 vmovdqu $Xi,0x70(%rsp) vpshufb $Ii,$inout2,$Z0 vmovdqu $T2,0x60(%rsp) vpshufb $Ii,$inout3,$Z1 vmovdqu $Z0,0x50(%rsp) vpshufb $Ii,$inout4,$Z2 vmovdqu $Z1,0x40(%rsp) vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x vmovdqu $Z2,0x30(%rsp) call _aesni_ctr32_6x vmovdqu ($Xip),$Xi # load Xi lea 0x20+0x20($Xip),$Xip # size optimization sub \$12,$len mov \$0x60*2,$ret vpshufb $Ii,$Xi,$Xi call _aesni_ctr32_ghash_6x vmovdqu 0x20(%rsp),$Z3 # I[5] vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 vpunpckhqdq $Z3,$Z3,$T1 vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK vmovups $inout0,-0x60($out) # save output vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy vpxor $Z3,$T1,$T1 vmovups $inout1,-0x50($out) vpshufb $Ii,$inout1,$inout1 vmovups $inout2,-0x40($out) vpshufb $Ii,$inout2,$inout2 vmovups $inout3,-0x30($out) vpshufb $Ii,$inout3,$inout3 vmovups $inout4,-0x20($out) vpshufb $Ii,$inout4,$inout4 vmovups $inout5,-0x10($out) vpshufb $Ii,$inout5,$inout5 vmovdqu $inout0,0x10(%rsp) # free $inout0 ___ 
{ my ($HK,$T3)=($rndkey,$inout0); $code.=<<___; vmovdqu 0x30(%rsp),$Z2 # I[4] vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2 vpunpckhqdq $Z2,$Z2,$T2 vpclmulqdq \$0x00,$Hkey,$Z3,$Z1 vpxor $Z2,$T2,$T2 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 vpclmulqdq \$0x00,$HK,$T1,$T1 vmovdqu 0x40(%rsp),$T3 # I[3] vpclmulqdq \$0x00,$Ii,$Z2,$Z0 vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3 vpxor $Z1,$Z0,$Z0 vpunpckhqdq $T3,$T3,$Z1 vpclmulqdq \$0x11,$Ii,$Z2,$Z2 vpxor $T3,$Z1,$Z1 vpxor $Z3,$Z2,$Z2 vpclmulqdq \$0x10,$HK,$T2,$T2 vmovdqu 0x50-0x20($Xip),$HK vpxor $T1,$T2,$T2 vmovdqu 0x50(%rsp),$T1 # I[2] vpclmulqdq \$0x00,$Hkey,$T3,$Z3 vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4 vpxor $Z0,$Z3,$Z3 vpunpckhqdq $T1,$T1,$Z0 vpclmulqdq \$0x11,$Hkey,$T3,$T3 vpxor $T1,$Z0,$Z0 vpxor $Z2,$T3,$T3 vpclmulqdq \$0x00,$HK,$Z1,$Z1 vpxor $T2,$Z1,$Z1 vmovdqu 0x60(%rsp),$T2 # I[1] vpclmulqdq \$0x00,$Ii,$T1,$Z2 vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5 vpxor $Z3,$Z2,$Z2 vpunpckhqdq $T2,$T2,$Z3 vpclmulqdq \$0x11,$Ii,$T1,$T1 vpxor $T2,$Z3,$Z3 vpxor $T3,$T1,$T1 vpclmulqdq \$0x10,$HK,$Z0,$Z0 vmovdqu 0x80-0x20($Xip),$HK vpxor $Z1,$Z0,$Z0 vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0] vpclmulqdq \$0x00,$Hkey,$T2,$Z1 vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6 vpunpckhqdq $Xi,$Xi,$T3 vpxor $Z2,$Z1,$Z1 vpclmulqdq \$0x11,$Hkey,$T2,$T2 vpxor $Xi,$T3,$T3 vpxor $T1,$T2,$T2 vpclmulqdq \$0x00,$HK,$Z3,$Z3 vpxor $Z0,$Z3,$Z0 vpclmulqdq \$0x00,$Ii,$Xi,$Z2 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 vpunpckhqdq $inout5,$inout5,$T1 vpclmulqdq \$0x11,$Ii,$Xi,$Xi vpxor $inout5,$T1,$T1 vpxor $Z1,$Z2,$Z1 vpclmulqdq \$0x10,$HK,$T3,$T3 vmovdqu 0x20-0x20($Xip),$HK vpxor $T2,$Xi,$Z3 vpxor $Z0,$T3,$Z2 vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2 vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing vpclmulqdq \$0x00,$Hkey,$inout5,$Z0 vpxor $T3,$Z2,$Z2 vpunpckhqdq $inout4,$inout4,$T2 vpclmulqdq \$0x11,$Hkey,$inout5,$inout5 vpxor $inout4,$T2,$T2 vpslldq \$8,$Z2,$T3 vpclmulqdq \$0x00,$HK,$T1,$T1 vpxor $T3,$Z1,$Xi vpsrldq 
\$8,$Z2,$Z2 vpxor $Z2,$Z3,$Z3 vpclmulqdq \$0x00,$Ii,$inout4,$Z1 vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3 vpxor $Z0,$Z1,$Z1 vpunpckhqdq $inout3,$inout3,$T3 vpclmulqdq \$0x11,$Ii,$inout4,$inout4 vpxor $inout3,$T3,$T3 vpxor $inout5,$inout4,$inout4 vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase vpclmulqdq \$0x10,$HK,$T2,$T2 vmovdqu 0x50-0x20($Xip),$HK vpxor $T1,$T2,$T2 vpclmulqdq \$0x00,$Hkey,$inout3,$Z0 vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4 vpxor $Z1,$Z0,$Z0 vpunpckhqdq $inout2,$inout2,$T1 vpclmulqdq \$0x11,$Hkey,$inout3,$inout3 vpxor $inout2,$T1,$T1 vpxor $inout4,$inout3,$inout3 vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0 vpclmulqdq \$0x00,$HK,$T3,$T3 vpxor $T2,$T3,$T3 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi vxorps $inout5,$Xi,$Xi vpclmulqdq \$0x00,$Ii,$inout2,$Z1 vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5 vpxor $Z0,$Z1,$Z1 vpunpckhqdq $inout1,$inout1,$T2 vpclmulqdq \$0x11,$Ii,$inout2,$inout2 vpxor $inout1,$T2,$T2 vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase vpxor $inout3,$inout2,$inout2 vpclmulqdq \$0x10,$HK,$T1,$T1 vmovdqu 0x80-0x20($Xip),$HK vpxor $T3,$T1,$T1 vxorps $Z3,$inout5,$inout5 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi vxorps $inout5,$Xi,$Xi vpclmulqdq \$0x00,$Hkey,$inout1,$Z0 vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6 vpxor $Z1,$Z0,$Z0 vpunpckhqdq $Xi,$Xi,$T3 vpclmulqdq \$0x11,$Hkey,$inout1,$inout1 vpxor $Xi,$T3,$T3 vpxor $inout2,$inout1,$inout1 vpclmulqdq \$0x00,$HK,$T2,$T2 vpxor $T1,$T2,$T2 vpclmulqdq \$0x00,$Ii,$Xi,$Z1 vpclmulqdq \$0x11,$Ii,$Xi,$Z3 vpxor $Z0,$Z1,$Z1 vpclmulqdq \$0x10,$HK,$T3,$Z2 vpxor $inout1,$Z3,$Z3 vpxor $T2,$Z2,$Z2 vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing vpxor $Z0,$Z2,$Z2 vpslldq \$8,$Z2,$T1 vmovdqu 0x10($const),$Hkey # .Lpoly vpsrldq \$8,$Z2,$Z2 vpxor $T1,$Z1,$Xi vpxor $Z2,$Z3,$Z3 vpalignr \$8,$Xi,$Xi,$T2 # 1st phase vpclmulqdq \$0x10,$Hkey,$Xi,$Xi vpxor $T2,$Xi,$Xi vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase vpclmulqdq \$0x10,$Hkey,$Xi,$Xi vpxor $Z3,$T2,$T2 vpxor $T2,$Xi,$Xi ___ } $code.=<<___; vpshufb 
($const),$Xi,$Xi		# .Lbswap_mask
	vmovdqu	$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
# Win64 only: reload the non-volatile %xmm6-%xmm15 that the prologue
# stored below the saved stack pointer kept in %rax.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Common epilogue: reload the six pushed GPRs relative to the saved
# stack pointer in %rax, then switch %rsp back to it.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lgcm_enc_abort:
	mov	$ret,%rax		# return value
	ret
.cfi_endproc
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
___

# Constant pool.  The code loads .Lbswap_mask into $const and reaches
# the other entries as 0x10/0x20/0x30/0x40($const), so the order and
# the 16-byte size of each entry must not change.
$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# Win64 structured-exception handler shared by aesni_gcm_encrypt and
# aesni_gcm_decrypt.  HandlerData[0]/[1] are the RVAs of the function's
# body label and abort (epilogue) label, see the .xdata entries below.
#
#   Rip <  body label   : nothing has been saved yet; %rax keeps
#                         context->Rax and we go straight to the tail.
#   Rip >= abort label  : frame already torn down; %rax = context->Rsp.
#   otherwise           : context->Rax holds the caller's %rsp saved by
#                         the prologue; copy %rbx,%rbp,%r12-%r15 and the
#                         ten saved %xmm registers back into CONTEXT.
#
# The tail then fixes up Rsp/Rsi/Rdi in CONTEXT, copies CONTEXT into
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___
.extern	__imp_RtlVirtualUnwind
.type	gcm_se_handler,\@abi-omnipotent
.align	16
gcm_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip < prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	120($context),%rax	# pull context->Rax

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	gcm_se_handler,.-gcm_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_gcm_decrypt
	.rva	.LSEH_end_aesni_gcm_decrypt
	.rva	.LSEH_gcm_dec_info

	.rva	.LSEH_begin_aesni_gcm_encrypt
	.rva	.LSEH_end_aesni_gcm_encrypt
	.rva	.LSEH_gcm_enc_info
.section	.xdata
.align	8
.LSEH_gcm_dec_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
.LSEH_gcm_enc_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
___
}
}}} else {{{
# Fallback stubs: both entry points just return 0 in %eax.
$code=<<___;	# assembler is too old
.text

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
.cfi_startproc
	xor	%eax,%eax
	ret
.cfi_endproc
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
.cfi_startproc
	xor	%eax,%eax
	ret
.cfi_endproc
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt ___ }}} $code =~ s/\`([^\`]*)\`/eval($1)/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/modes/asm/ghash-x86_64.pl =================================================================== --- head/crypto/openssl/crypto/modes/asm/ghash-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/modes/asm/ghash-x86_64.pl (revision 364822) @@ -1,1818 +1,1818 @@ #! /usr/bin/env perl # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # March, June 2010 # # The module implements "4-bit" GCM GHASH function and underlying # single multiplication operation in GF(2^128). "4-bit" means that # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH # function features so called "528B" variant utilizing additional # 256+16 bytes of per-key storage [+512 bytes shared table]. 
# Performance results are for this streamed GHASH subroutine and are
# expressed in cycles per processed byte, less is better:
#
#		gcc 3.4.x(*)	assembler
#
# P4		28.6		14.0		+100%
# Opteron	19.3		7.7		+150%
# Core2		17.8		8.1(**)		+120%
# Atom		31.6		16.8		+88%
# VIA Nano	21.8		10.1		+115%
#
# (*)	comparison is not completely fair, because C results are
#	for vanilla "256B" implementation, while assembler results
#	are for "528B";-)
# (**)	it's a mystery [to me] why Core2 result is not the same as for
#	Opteron;

# May 2010
#
# Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
# See ghash-x86.pl for background information and details about coding
# techniques.
#
# Special thanks to David Woodhouse for providing access to a
# Westmere-based system on behalf of Intel Open Source Technology Centre.

# December 2012
#
# Overhaul: aggregate Karatsuba post-processing, improve ILP in
# reduction_alg9, increase reduction aggregate factor to 4x. As for
# the latter, ghash-x86.pl discusses that it makes less sense to
# increase the aggregate factor. Then why increase it here? Critical path
# consists of 3 independent pclmulqdq instructions, Karatsuba post-
# processing and reduction. "On top" of this we lay down aggregated
# multiplication operations, triplets of independent pclmulqdq's. As
# issue rate for pclmulqdq is limited, it makes less sense to
# aggregate more multiplications than it takes to perform remaining
# non-multiplication operations. 2x is near-optimal coefficient for
# contemporary Intel CPUs (therefore modest improvement coefficient),
# but not for Bulldozer. The latter is because logical SIMD operations
# are twice as slow in comparison to Intel, so that critical path is
# longer. A CPU with higher pclmulqdq issue rate would also benefit
# from higher aggregate factor...
# # Westmere 1.78(+13%) # Sandy Bridge 1.80(+8%) # Ivy Bridge 1.80(+7%) # Haswell 0.55(+93%) (if system doesn't support AVX) # Broadwell 0.45(+110%)(if system doesn't support AVX) # Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) # Knights L 2.12(-) (if system doesn't support AVX) # Goldmont 1.08(+24%) # March 2013 # # ... 8x aggregate factor AVX code path is using reduction algorithm # suggested by Shay Gueron[1]. Even though contemporary AVX-capable # CPUs such as Sandy and Ivy Bridge can execute it, the code performs # sub-optimally in comparison to above mentioned version. But thanks # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that # it performs in 0.41 cycles per byte on Haswell processor, in # 0.29 on Broadwell, and in 0.36 on Skylake. # # Knights Landing achieves 1.09 cpb. # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.20) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" 
\"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$do4xaggr=1;

# common register layout
$nlo="%rax";
$nhi="%rbx";
$Zlo="%r8";
$Zhi="%r9";
$tmp="%r10";
$rem_4bit = "%r11";

$Xi="%rdi";
$Htbl="%rsi";

# per-function register layout
$cnt="%rcx";
$rem="%rdx";

# LB(reg): map a 32/64-bit register name to the name of its low byte,
# e.g. %rax/%eax -> %al, %rsi -> %sil, %rbp -> %bpl, %r10/%r10d -> %r10b.
# The first substitution that matches wins; a name matching none of the
# patterns is returned unchanged.  Always invoked as &LB(...), which
# bypasses the empty prototype.
sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%${1}l/	or
			$r =~ s/%[er]([sd]i)/%${1}l/	or
			$r =~ s/%[er](bp)/%${1}l/	or
			$r =~ s/%(r[0-9]+)[d]?/%${1}b/;   $r; }

# Any call to an undefined sub, e.g. &mov($dst,$src), lands here and is
# emitted as one AT&T-syntax instruction: the sub name is the mnemonic,
# the arguments are written in reverse (source first), and a last
# argument that is purely numeric gets the '$' immediate prefix.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# loop(inp): append one copy of the 4-bit table-driven multiplication
# loop to $code, reading the remaining input bytes through register
# $inp.  $N is bumped per call so every expansion gets its own
# .Loop<N>/.Lbreak<N> labels.  The `&LB(...)` spans are expanded when
# the finished $code has its backtick expressions evaluated.
{ my $N;
  sub loop() {
  my $inp = shift;

	$N++;
$code.=<<___;
	xor	$nlo,$nlo
	xor	$nhi,$nhi
	mov	`&LB("$Zlo")`,`&LB("$nlo")`
	mov	`&LB("$Zlo")`,`&LB("$nhi")`
	shl	\$4,`&LB("$nlo")`
	mov	\$14,$cnt
	mov	8($Htbl,$nlo),$Zlo
	mov	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	mov	$Zlo,$rem
	jmp	.Loop$N

.align	16
.Loop$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	mov	($inp,$cnt),`&LB("$nlo")`
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	mov	`&LB("$nlo")`,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	shl	\$4,`&LB("$nlo")`
	xor	$tmp,$Zlo
	dec	$cnt
	js	.Lbreak$N

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo
	jmp	.Loop$N

.align	16
.Lbreak$N:
	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nlo),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nlo),$Zhi
	and	\$0xf0,`&LB("$nhi")`
	xor	($rem_4bit,$rem,8),$Zhi
	mov	$Zlo,$rem
	xor	$tmp,$Zlo

	shr	\$4,$Zlo
	and	\$0xf,$rem
	mov	$Zhi,$tmp
	shr	\$4,$Zhi
	xor	8($Htbl,$nhi),$Zlo
	shl	\$60,$tmp
	xor	($Htbl,$nhi),$Zhi
	xor	$tmp,$Zlo
	xor	($rem_4bit,$rem,8),$Zhi

	bswap	$Zlo
	bswap	$Zhi
___
}}

$code=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	gcm_gmult_4bit
.type	gcm_gmult_4bit,\@function,2
.align	16
gcm_gmult_4bit:
.cfi_startproc
	push	%rbx
.cfi_push	%rbx
	push	%rbp		# %rbp and others are
pushed exclusively in .cfi_push %rbp push %r12 # order to reuse Win64 exception handler... .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$280,%rsp .cfi_adjust_cfa_offset 280 .Lgmult_prologue: movzb 15($Xi),$Zlo lea .Lrem_4bit(%rip),$rem_4bit ___ &loop ($Xi); $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) lea 280+48(%rsp),%rsi .cfi_def_cfa %rsi,8 mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lgmult_epilogue: ret .cfi_endproc .size gcm_gmult_4bit,.-gcm_gmult_4bit ___ # per-function register layout $inp="%rdx"; $len="%rcx"; $rem_8bit=$rem_4bit; $code.=<<___; .globl gcm_ghash_4bit .type gcm_ghash_4bit,\@function,4 .align 16 gcm_ghash_4bit: .cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$280,%rsp .cfi_adjust_cfa_offset 280 .Lghash_prologue: mov $inp,%r14 # reassign couple of args mov $len,%r15 ___ { my $inp="%r14"; my $dat="%edx"; my $len="%r15"; my @nhi=("%ebx","%ecx"); my @rem=("%r12","%r13"); my $Hshr4="%rbp"; &sub ($Htbl,-128); # size optimization &lea ($Hshr4,"16+128(%rsp)"); { my @lo =($nlo,$nhi); my @hi =($Zlo,$Zhi); &xor ($dat,$dat); for ($i=0,$j=-2;$i<18;$i++,$j++) { &mov ("$j(%rsp)",&LB($dat)) if ($i>1); &or ($lo[0],$tmp) if ($i>1); &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17); &shr ($lo[1],4) if ($i>0 && $i<17); &mov ($tmp,$hi[1]) if ($i>0 && $i<17); &shr ($hi[1],4) if ($i>0 && $i<17); &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1); &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16); &shl (&LB($dat),4) if ($i>0 && $i<17); &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1); &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16); &shl ($tmp,60) if ($i>0 && $i<17); push (@lo,shift(@lo)); push (@hi,shift(@hi)); } } &add ($Htbl,-128); &mov ($Zlo,"8($Xi)"); &mov ($Zhi,"0($Xi)"); &add ($len,$inp); # pointer to the end of data &lea ($rem_8bit,".Lrem_8bit(%rip)"); &jmp (".Louter_loop"); 
$code.=".align 16\n.Louter_loop:\n"; &xor ($Zhi,"($inp)"); &mov ("%rdx","8($inp)"); &lea ($inp,"16($inp)"); &xor ("%rdx",$Zlo); &mov ("($Xi)",$Zhi); &mov ("8($Xi)","%rdx"); &shr ("%rdx",32); &xor ($nlo,$nlo); &rol ($dat,8); &mov (&LB($nlo),&LB($dat)); &movz ($nhi[0],&LB($dat)); &shl (&LB($nlo),4); &shr ($nhi[0],4); for ($j=11,$i=0;$i<15;$i++) { &rol ($dat,8); &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0); &xor ($Zhi,"($Htbl,$nlo)") if ($i>0); &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0); &mov ($Zhi,"($Htbl,$nlo)") if ($i==0); &mov (&LB($nlo),&LB($dat)); &xor ($Zlo,$tmp) if ($i>0); &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0); &movz ($nhi[1],&LB($dat)); &shl (&LB($nlo),4); &movzb ($rem[0],"(%rsp,$nhi[0])"); &shr ($nhi[1],4) if ($i<14); &and ($nhi[1],0xf0) if ($i==14); &shl ($rem[1],48) if ($i>0); &xor ($rem[0],$Zlo); &mov ($tmp,$Zhi); &xor ($Zhi,$rem[1]) if ($i>0); &shr ($Zlo,8); &movz ($rem[0],&LB($rem[0])); &mov ($dat,"$j($Xi)") if (--$j%4==0); &shr ($Zhi,8); &xor ($Zlo,"-128($Hshr4,$nhi[0],8)"); &shl ($tmp,56); &xor ($Zhi,"($Hshr4,$nhi[0],8)"); unshift (@nhi,pop(@nhi)); # "rotate" registers unshift (@rem,pop(@rem)); } &movzw ($rem[1],"($rem_8bit,$rem[1],2)"); &xor ($Zlo,"8($Htbl,$nlo)"); &xor ($Zhi,"($Htbl,$nlo)"); &shl ($rem[1],48); &xor ($Zlo,$tmp); &xor ($Zhi,$rem[1]); &movz ($rem[0],&LB($Zlo)); &shr ($Zlo,4); &mov ($tmp,$Zhi); &shl (&LB($rem[0]),4); &shr ($Zhi,4); &xor ($Zlo,"8($Htbl,$nhi[0])"); &movzw ($rem[0],"($rem_8bit,$rem[0],2)"); &shl ($tmp,60); &xor ($Zhi,"($Htbl,$nhi[0])"); &xor ($Zlo,$tmp); &shl ($rem[0],48); &bswap ($Zlo); &xor ($Zhi,$rem[0]); &bswap ($Zhi); &cmp ($inp,$len); &jb (".Louter_loop"); } $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) lea 280+48(%rsp),%rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea 0(%rsi),%rsp .cfi_def_cfa_register %rsp 
.Lghash_epilogue: ret .cfi_endproc .size gcm_ghash_4bit,.-gcm_ghash_4bit ___ ###################################################################### # PCLMULQDQ version. @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order ("%rdi","%rsi","%rdx","%rcx"); # Unix order ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); sub clmul64x64_T2 { # minimal register pressure my ($Xhi,$Xi,$Hkey,$HK)=@_; if (!defined($HK)) { $HK = $T2; $code.=<<___; movdqa $Xi,$Xhi # pshufd \$0b01001110,$Xi,$T1 pshufd \$0b01001110,$Hkey,$T2 pxor $Xi,$T1 # pxor $Hkey,$T2 ___ } else { $code.=<<___; movdqa $Xi,$Xhi # pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 # ___ } $code.=<<___; pclmulqdq \$0x00,$Hkey,$Xi ####### pclmulqdq \$0x11,$Hkey,$Xhi ####### pclmulqdq \$0x00,$HK,$T1 ####### pxor $Xi,$T1 # pxor $Xhi,$T1 # movdqa $T1,$T2 # psrldq \$8,$T1 pslldq \$8,$T2 # pxor $T1,$Xhi pxor $T2,$Xi # ___ } sub reduction_alg9 { # 17/11 times faster than Intel version my ($Xhi,$Xi) = @_; $code.=<<___; # 1st phase movdqa $Xi,$T2 # movdqa $Xi,$T1 psllq \$5,$Xi pxor $Xi,$T1 # psllq \$1,$Xi pxor $T1,$Xi # psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # # 2nd phase movdqa $Xi,$T2 psrlq \$1,$Xi pxor $T2,$Xhi # pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # psrlq \$1,$Xi # pxor $Xhi,$Xi # ___ } { my ($Htbl,$Xip)=@_4args; my $HK="%xmm6"; $code.=<<___; .globl gcm_init_clmul .type gcm_init_clmul,\@abi-omnipotent .align 16 gcm_init_clmul: .cfi_startproc .L_init_clmul: ___ $code.=<<___ if ($win64); .LSEH_begin_gcm_init_clmul: # I can't trust assembler to use specific encoding:-( .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) ___ $code.=<<___; movdqu ($Xip),$Hkey pshufd \$0b01001110,$Hkey,$Hkey # dword swap # <<1 twist pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword movdqa $Hkey,$T1 psllq \$1,$Hkey pxor $T3,$T3 # psrlq \$63,$T1 pcmpgtd $T2,$T3 # broadcast carry bit pslldq \$8,$T1 por 
$T1,$Hkey # H<<=1 # magic reduction pand .L0x1c2_polynomial(%rip),$T3 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial # calculate H^2 pshufd \$0b01001110,$Hkey,$HK movdqa $Hkey,$Xi pxor $Hkey,$HK ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); &reduction_alg9 ($Xhi,$Xi); $code.=<<___; pshufd \$0b01001110,$Hkey,$T1 pshufd \$0b01001110,$Xi,$T2 pxor $Hkey,$T1 # Karatsuba pre-processing movdqu $Hkey,0x00($Htbl) # save H pxor $Xi,$T2 # Karatsuba pre-processing movdqu $Xi,0x10($Htbl) # save H^2 palignr \$8,$T1,$T2 # low part is H.lo^H.hi... movdqu $T2,0x20($Htbl) # save Karatsuba "salt" ___ if ($do4xaggr) { &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3 &reduction_alg9 ($Xhi,$Xi); $code.=<<___; movdqa $Xi,$T3 ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4 &reduction_alg9 ($Xhi,$Xi); $code.=<<___; pshufd \$0b01001110,$T3,$T1 pshufd \$0b01001110,$Xi,$T2 pxor $T3,$T1 # Karatsuba pre-processing movdqu $T3,0x30($Htbl) # save H^3 pxor $Xi,$T2 # Karatsuba pre-processing movdqu $Xi,0x40($Htbl) # save H^4 palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi... movdqu $T2,0x50($Htbl) # save Karatsuba "salt" ___ } $code.=<<___ if ($win64); movaps (%rsp),%xmm6 lea 0x18(%rsp),%rsp .LSEH_end_gcm_init_clmul: ___ $code.=<<___; ret .cfi_endproc .size gcm_init_clmul,.-gcm_init_clmul ___ } { my ($Xip,$Htbl)=@_4args; $code.=<<___; .globl gcm_gmult_clmul .type gcm_gmult_clmul,\@abi-omnipotent .align 16 gcm_gmult_clmul: .cfi_startproc .L_gmult_clmul: movdqu ($Xip),$Xi movdqa .Lbswap_mask(%rip),$T3 movdqu ($Htbl),$Hkey movdqu 0x20($Htbl),$T2 pshufb $T3,$Xi ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); # experimental alternative. special thing about is that there # no dependency between the two multiplications... 
mov \$`0xE1<<1`,%eax mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff mov \$0x07,%r11d movq %rax,$T1 movq %r10,$T2 movq %r11,$T3 # borrow $T3 pand $Xi,$T3 pshufb $T3,$T2 # ($Xi&7)·0xE0 movq %rax,$T3 pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1) pxor $Xi,$T2 pslldq \$15,$T2 paddd $T2,$T2 # <<(64+56+1) pxor $T2,$Xi pclmulqdq \$0x01,$T3,$Xi movdqa .Lbswap_mask(%rip),$T3 # reload $T3 psrldq \$1,$T1 pxor $T1,$Xhi pslldq \$7,$Xi pxor $Xhi,$Xi ___ $code.=<<___; pshufb $T3,$Xi movdqu $Xi,($Xip) ret .cfi_endproc .size gcm_gmult_clmul,.-gcm_gmult_clmul ___ } { my ($Xip,$Htbl,$inp,$len)=@_4args; my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); $code.=<<___; .globl gcm_ghash_clmul .type gcm_ghash_clmul,\@abi-omnipotent .align 32 gcm_ghash_clmul: .cfi_startproc .L_ghash_clmul: ___ $code.=<<___ if ($win64); lea -0x88(%rsp),%rax .LSEH_begin_gcm_ghash_clmul: # I can't trust assembler to use specific encoding:-( .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) ___ $code.=<<___; movdqa .Lbswap_mask(%rip),$T3 movdqu ($Xip),$Xi movdqu ($Htbl),$Hkey movdqu 0x20($Htbl),$HK pshufb $T3,$Xi sub \$0x10,$len jz .Lodd_tail movdqu 0x10($Htbl),$Hkey2 ___ if ($do4xaggr) { my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); $code.=<<___; mov OPENSSL_ia32cap_P+4(%rip),%eax cmp \$0x30,$len jb .Lskip4x and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE cmp \$`1<<22`,%eax # check for MOVBE without XSAVE je .Lskip4x sub 
\$0x30,$len mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff movdqu 0x30($Htbl),$Hkey3 movdqu 0x40($Htbl),$Hkey4 ####### # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P # movdqu 0x30($inp),$Xln movdqu 0x20($inp),$Xl pshufb $T3,$Xln pshufb $T3,$Xl movdqa $Xln,$Xhn pshufd \$0b01001110,$Xln,$Xmn pxor $Xln,$Xmn pclmulqdq \$0x00,$Hkey,$Xln pclmulqdq \$0x11,$Hkey,$Xhn pclmulqdq \$0x00,$HK,$Xmn movdqa $Xl,$Xh pshufd \$0b01001110,$Xl,$Xm pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey2,$Xl pclmulqdq \$0x11,$Hkey2,$Xh pclmulqdq \$0x10,$HK,$Xm xorps $Xl,$Xln xorps $Xh,$Xhn movups 0x50($Htbl),$HK xorps $Xm,$Xmn movdqu 0x10($inp),$Xl movdqu 0($inp),$T1 pshufb $T3,$Xl pshufb $T3,$T1 movdqa $Xl,$Xh pshufd \$0b01001110,$Xl,$Xm pxor $T1,$Xi pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey3,$Xl movdqa $Xi,$Xhi pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 pclmulqdq \$0x11,$Hkey3,$Xh pclmulqdq \$0x00,$HK,$Xm xorps $Xl,$Xln xorps $Xh,$Xhn lea 0x40($inp),$inp sub \$0x40,$len jc .Ltail4x jmp .Lmod4_loop .align 32 .Lmod4_loop: pclmulqdq \$0x00,$Hkey4,$Xi xorps $Xm,$Xmn movdqu 0x30($inp),$Xl pshufb $T3,$Xl pclmulqdq \$0x11,$Hkey4,$Xhi xorps $Xln,$Xi movdqu 0x20($inp),$Xln movdqa $Xl,$Xh pclmulqdq \$0x10,$HK,$T1 pshufd \$0b01001110,$Xl,$Xm xorps $Xhn,$Xhi pxor $Xl,$Xm pshufb $T3,$Xln movups 0x20($Htbl),$HK xorps $Xmn,$T1 pclmulqdq \$0x00,$Hkey,$Xl pshufd \$0b01001110,$Xln,$Xmn pxor $Xi,$T1 # aggregated Karatsuba post-processing movdqa $Xln,$Xhn pxor $Xhi,$T1 # pxor $Xln,$Xmn movdqa $T1,$T2 # pclmulqdq \$0x11,$Hkey,$Xh pslldq \$8,$T1 psrldq \$8,$T2 # pxor $T1,$Xi movdqa .L7_mask(%rip),$T1 pxor $T2,$Xhi # movq %rax,$T2 pand $Xi,$T1 # 1st phase pshufb $T1,$T2 # pxor $Xi,$T2 # pclmulqdq \$0x00,$HK,$Xm psllq \$57,$T2 # movdqa $T2,$T1 # pslldq \$8,$T2 pclmulqdq \$0x00,$Hkey2,$Xln psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # movdqu 0($inp),$T1 movdqa $Xi,$T2 # 2nd phase psrlq \$1,$Xi pclmulqdq \$0x11,$Hkey2,$Xhn xorps $Xl,$Xln movdqu 0x10($inp),$Xl pshufb $T3,$Xl pclmulqdq \$0x10,$HK,$Xmn xorps $Xh,$Xhn 
movups 0x50($Htbl),$HK pshufb $T3,$T1 pxor $T2,$Xhi # pxor $Xi,$T2 psrlq \$5,$Xi movdqa $Xl,$Xh pxor $Xm,$Xmn pshufd \$0b01001110,$Xl,$Xm pxor $T2,$Xi # pxor $T1,$Xhi pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey3,$Xl psrlq \$1,$Xi # pxor $Xhi,$Xi # movdqa $Xi,$Xhi pclmulqdq \$0x11,$Hkey3,$Xh xorps $Xl,$Xln pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 pclmulqdq \$0x00,$HK,$Xm xorps $Xh,$Xhn lea 0x40($inp),$inp sub \$0x40,$len jnc .Lmod4_loop .Ltail4x: pclmulqdq \$0x00,$Hkey4,$Xi pclmulqdq \$0x11,$Hkey4,$Xhi pclmulqdq \$0x10,$HK,$T1 xorps $Xm,$Xmn xorps $Xln,$Xi xorps $Xhn,$Xhi pxor $Xi,$Xhi # aggregated Karatsuba post-processing pxor $Xmn,$T1 pxor $Xhi,$T1 # pxor $Xi,$Xhi movdqa $T1,$T2 # psrldq \$8,$T1 pslldq \$8,$T2 # pxor $T1,$Xhi pxor $T2,$Xi # ___ &reduction_alg9($Xhi,$Xi); $code.=<<___; add \$0x40,$len jz .Ldone movdqu 0x20($Htbl),$HK sub \$0x10,$len jz .Lodd_tail .Lskip4x: ___ } $code.=<<___; ####### # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = # [(H*Ii+1) + (H*Xi+1)] mod P = # [(H*Ii+1) + H^2*(Ii+Xi)] mod P # movdqu ($inp),$T1 # Ii movdqu 16($inp),$Xln # Ii+1 pshufb $T3,$T1 pshufb $T3,$Xln pxor $T1,$Xi # Ii+Xi movdqa $Xln,$Xhn pshufd \$0b01001110,$Xln,$Xmn pxor $Xln,$Xmn pclmulqdq \$0x00,$Hkey,$Xln pclmulqdq \$0x11,$Hkey,$Xhn pclmulqdq \$0x00,$HK,$Xmn lea 32($inp),$inp # i+=2 nop sub \$0x20,$len jbe .Leven_tail nop jmp .Lmod_loop .align 32 .Lmod_loop: movdqa $Xi,$Xhi movdqa $Xmn,$T1 pshufd \$0b01001110,$Xi,$Xmn # pxor $Xi,$Xmn # pclmulqdq \$0x00,$Hkey2,$Xi pclmulqdq \$0x11,$Hkey2,$Xhi pclmulqdq \$0x10,$HK,$Xmn pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi movdqu ($inp),$T2 # Ii pxor $Xi,$T1 # aggregated Karatsuba post-processing pshufb $T3,$T2 movdqu 16($inp),$Xln # Ii+1 pxor $Xhi,$T1 pxor $T2,$Xhi # "Ii+Xi", consume early pxor $T1,$Xmn pshufb $T3,$Xln movdqa $Xmn,$T1 # psrldq \$8,$T1 pslldq \$8,$Xmn # pxor $T1,$Xhi pxor $Xmn,$Xi # movdqa $Xln,$Xhn # movdqa $Xi,$T2 # 1st phase movdqa $Xi,$T1 psllq \$5,$Xi pxor $Xi,$T1 # pclmulqdq \$0x00,$Hkey,$Xln ####### psllq \$1,$Xi pxor 
$T1,$Xi # psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi psrldq \$8,$T1 # pxor $T2,$Xi pshufd \$0b01001110,$Xhn,$Xmn pxor $T1,$Xhi # pxor $Xhn,$Xmn # movdqa $Xi,$T2 # 2nd phase psrlq \$1,$Xi pclmulqdq \$0x11,$Hkey,$Xhn ####### pxor $T2,$Xhi # pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # lea 32($inp),$inp psrlq \$1,$Xi # pclmulqdq \$0x00,$HK,$Xmn ####### pxor $Xhi,$Xi # sub \$0x20,$len ja .Lmod_loop .Leven_tail: movdqa $Xi,$Xhi movdqa $Xmn,$T1 pshufd \$0b01001110,$Xi,$Xmn # pxor $Xi,$Xmn # pclmulqdq \$0x00,$Hkey2,$Xi pclmulqdq \$0x11,$Hkey2,$Xhi pclmulqdq \$0x10,$HK,$Xmn pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi pxor $Xi,$T1 pxor $Xhi,$T1 pxor $T1,$Xmn movdqa $Xmn,$T1 # psrldq \$8,$T1 pslldq \$8,$Xmn # pxor $T1,$Xhi pxor $Xmn,$Xi # ___ &reduction_alg9 ($Xhi,$Xi); $code.=<<___; test $len,$len jnz .Ldone .Lodd_tail: movdqu ($inp),$T1 # Ii pshufb $T3,$T1 pxor $T1,$Xi # Ii+Xi ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) &reduction_alg9 ($Xhi,$Xi); $code.=<<___; .Ldone: pshufb $T3,$Xi movdqu $Xi,($Xip) ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 movaps 0x10(%rsp),%xmm7 movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 0x40(%rsp),%xmm10 movaps 0x50(%rsp),%xmm11 movaps 0x60(%rsp),%xmm12 movaps 0x70(%rsp),%xmm13 movaps 0x80(%rsp),%xmm14 movaps 0x90(%rsp),%xmm15 lea 0xa8(%rsp),%rsp .LSEH_end_gcm_ghash_clmul: ___ $code.=<<___; ret .cfi_endproc .size gcm_ghash_clmul,.-gcm_ghash_clmul ___ } $code.=<<___; .globl gcm_init_avx .type gcm_init_avx,\@abi-omnipotent .align 32 gcm_init_avx: .cfi_startproc ___ if ($avx) { my ($Htbl,$Xip)=@_4args; my $HK="%xmm6"; $code.=<<___ if ($win64); .LSEH_begin_gcm_init_avx: # I can't trust assembler to use specific encoding:-( .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) ___ $code.=<<___; vzeroupper vmovdqu ($Xip),$Hkey vpshufd \$0b01001110,$Hkey,$Hkey # dword swap # <<1 twist vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword vpsrlq \$63,$Hkey,$T1 vpsllq 
\$1,$Hkey,$Hkey vpxor $T3,$T3,$T3 # vpcmpgtd $T2,$T3,$T3 # broadcast carry bit vpslldq \$8,$T1,$T1 vpor $T1,$Hkey,$Hkey # H<<=1 # magic reduction vpand .L0x1c2_polynomial(%rip),$T3,$T3 vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial vpunpckhqdq $Hkey,$Hkey,$HK vmovdqa $Hkey,$Xi vpxor $Hkey,$HK,$HK mov \$4,%r10 # up to H^8 jmp .Linit_start_avx ___ sub clmul64x64_avx { my ($Xhi,$Xi,$Hkey,$HK)=@_; if (!defined($HK)) { $HK = $T2; $code.=<<___; vpunpckhqdq $Xi,$Xi,$T1 vpunpckhqdq $Hkey,$Hkey,$T2 vpxor $Xi,$T1,$T1 # vpxor $Hkey,$T2,$T2 ___ } else { $code.=<<___; vpunpckhqdq $Xi,$Xi,$T1 vpxor $Xi,$T1,$T1 # ___ } $code.=<<___; vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi ####### vpclmulqdq \$0x00,$Hkey,$Xi,$Xi ####### vpclmulqdq \$0x00,$HK,$T1,$T1 ####### vpxor $Xi,$Xhi,$T2 # vpxor $T2,$T1,$T1 # vpslldq \$8,$T1,$T2 # vpsrldq \$8,$T1,$T1 vpxor $T2,$Xi,$Xi # vpxor $T1,$Xhi,$Xhi ___ } sub reduction_avx { my ($Xhi,$Xi) = @_; $code.=<<___; vpsllq \$57,$Xi,$T1 # 1st phase vpsllq \$62,$Xi,$T2 vpxor $T1,$T2,$T2 # vpsllq \$63,$Xi,$T1 vpxor $T1,$T2,$T2 # vpslldq \$8,$T2,$T1 # vpsrldq \$8,$T2,$T2 vpxor $T1,$Xi,$Xi # vpxor $T2,$Xhi,$Xhi vpsrlq \$1,$Xi,$T2 # 2nd phase vpxor $Xi,$Xhi,$Xhi vpxor $T2,$Xi,$Xi # vpsrlq \$5,$T2,$T2 vpxor $T2,$Xi,$Xi # vpsrlq \$1,$Xi,$Xi # vpxor $Xhi,$Xi,$Xi # ___ } $code.=<<___; .align 32 .Linit_loop_avx: vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi... 
vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt" ___ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7 &reduction_avx ($Xhi,$Xi); $code.=<<___; .Linit_start_avx: vmovdqa $Xi,$T3 ___ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8 &reduction_avx ($Xhi,$Xi); $code.=<<___; vpshufd \$0b01001110,$T3,$T1 vpshufd \$0b01001110,$Xi,$T2 vpxor $T3,$T1,$T1 # Karatsuba pre-processing vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7 vpxor $Xi,$T2,$T2 # Karatsuba pre-processing vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8 lea 0x30($Htbl),$Htbl sub \$1,%r10 jnz .Linit_loop_avx vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped vmovdqu $T3,-0x10($Htbl) vzeroupper ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 lea 0x18(%rsp),%rsp .LSEH_end_gcm_init_avx: ___ $code.=<<___; ret .cfi_endproc .size gcm_init_avx,.-gcm_init_avx ___ } else { $code.=<<___; jmp .L_init_clmul .cfi_endproc .size gcm_init_avx,.-gcm_init_avx ___ } $code.=<<___; .globl gcm_gmult_avx .type gcm_gmult_avx,\@abi-omnipotent .align 32 gcm_gmult_avx: .cfi_startproc jmp .L_gmult_clmul .cfi_endproc .size gcm_gmult_avx,.-gcm_gmult_avx ___ $code.=<<___; .globl gcm_ghash_avx .type gcm_ghash_avx,\@abi-omnipotent .align 32 gcm_ghash_avx: .cfi_startproc ___ if ($avx) { my ($Xip,$Htbl,$inp,$len)=@_4args; my ($Xlo,$Xhi,$Xmi, $Zlo,$Zhi,$Zmi, $Hkey,$HK,$T1,$T2, $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); $code.=<<___ if ($win64); lea -0x88(%rsp),%rax .LSEH_begin_gcm_ghash_avx: # I can't trust assembler to use specific encoding:-( .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) .byte 
0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) ___ $code.=<<___; vzeroupper vmovdqu ($Xip),$Xi # load $Xi lea .L0x1c2_polynomial(%rip),%r10 lea 0x40($Htbl),$Htbl # size optimization vmovdqu .Lbswap_mask(%rip),$bswap vpshufb $bswap,$Xi,$Xi cmp \$0x80,$len jb .Lshort_avx sub \$0x80,$len vmovdqu 0x70($inp),$Ii # I[7] vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 vpshufb $bswap,$Ii,$Ii vmovdqu 0x20-0x40($Htbl),$HK vpunpckhqdq $Ii,$Ii,$T2 vmovdqu 0x60($inp),$Ij # I[6] vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Ii,$T2,$T2 vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 vpunpckhqdq $Ij,$Ij,$T1 vmovdqu 0x50($inp),$Ii # I[5] vpclmulqdq \$0x00,$HK,$T2,$Xmi vpxor $Ij,$T1,$T1 vpshufb $bswap,$Ii,$Ii vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpunpckhqdq $Ii,$Ii,$T2 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 vpxor $Ii,$T2,$T2 vmovdqu 0x40($inp),$Ij # I[4] vpclmulqdq \$0x10,$HK,$T1,$Zmi vmovdqu 0x50-0x40($Htbl),$HK vpshufb $bswap,$Ij,$Ij vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Xhi,$Zhi,$Zhi vpunpckhqdq $Ij,$Ij,$T1 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T2,$Xmi vpxor $Ij,$T1,$T1 vmovdqu 0x30($inp),$Ii # I[3] vpxor $Zlo,$Xlo,$Xlo vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpxor $Zhi,$Xhi,$Xhi vpshufb $bswap,$Ii,$Ii vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 vpxor $Zmi,$Xmi,$Xmi vpunpckhqdq $Ii,$Ii,$T2 vpclmulqdq \$0x10,$HK,$T1,$Zmi vmovdqu 0x80-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vmovdqu 0x20($inp),$Ij # I[2] vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Xhi,$Zhi,$Zhi vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 vpxor $Xmi,$Zmi,$Zmi vpunpckhqdq $Ij,$Ij,$T1 vpclmulqdq \$0x00,$HK,$T2,$Xmi vpxor $Ij,$T1,$T1 vmovdqu 0x10($inp),$Ii # I[1] vpxor $Zlo,$Xlo,$Xlo vpclmulqdq 
\$0x00,$Hkey,$Ij,$Zlo vpxor $Zhi,$Xhi,$Xhi vpshufb $bswap,$Ii,$Ii vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 vpxor $Zmi,$Xmi,$Xmi vpunpckhqdq $Ii,$Ii,$T2 vpclmulqdq \$0x10,$HK,$T1,$Zmi vmovdqu 0xb0-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vmovdqu ($inp),$Ij # I[0] vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Xhi,$Zhi,$Zhi vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x10,$HK,$T2,$Xmi lea 0x80($inp),$inp cmp \$0x80,$len jb .Ltail_avx vpxor $Xi,$Ij,$Ij # accumulate $Xi sub \$0x80,$len jmp .Loop8x_avx .align 32 .Loop8x_avx: vpunpckhqdq $Ij,$Ij,$T1 vmovdqu 0x70($inp),$Ii # I[7] vpxor $Xlo,$Zlo,$Zlo vpxor $Ij,$T1,$T1 vpclmulqdq \$0x00,$Hkey,$Ij,$Xi vpshufb $bswap,$Ii,$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xo vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 vpunpckhqdq $Ii,$Ii,$T2 vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Tred vmovdqu 0x20-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vmovdqu 0x60($inp),$Ij # I[6] vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Zlo,$Xi,$Xi # collect result vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vxorps $Zhi,$Xo,$Xo vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 vpunpckhqdq $Ij,$Ij,$T1 vpclmulqdq \$0x00,$HK, $T2,$Xmi vpxor $Zmi,$Tred,$Tred vxorps $Ij,$T1,$T1 vmovdqu 0x50($inp),$Ii # I[5] vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpxor $Xo,$Tred,$Tred vpslldq \$8,$Tred,$T2 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vpsrldq \$8,$Tred,$Tred vpxor $T2, $Xi, $Xi vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 vpshufb $bswap,$Ii,$Ii vxorps $Tred,$Xo, $Xo vpxor $Xhi,$Zhi,$Zhi vpunpckhqdq $Ii,$Ii,$T2 vpclmulqdq \$0x10,$HK, $T1,$Zmi vmovdqu 0x50-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vpxor $Xmi,$Zmi,$Zmi vmovdqu 0x40($inp),$Ij # I[4] vpalignr \$8,$Xi,$Xi,$Tred # 1st phase vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpshufb $bswap,$Ij,$Ij vpxor $Zlo,$Xlo,$Xlo vpclmulqdq 
\$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 vpunpckhqdq $Ij,$Ij,$T1 vpxor $Zhi,$Xhi,$Xhi vpclmulqdq \$0x00,$HK, $T2,$Xmi vxorps $Ij,$T1,$T1 vpxor $Zmi,$Xmi,$Xmi vmovdqu 0x30($inp),$Ii # I[3] vpclmulqdq \$0x10,(%r10),$Xi,$Xi vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpshufb $bswap,$Ii,$Ii vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 vpunpckhqdq $Ii,$Ii,$T2 vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x10,$HK, $T1,$Zmi vmovdqu 0x80-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vpxor $Xmi,$Zmi,$Zmi vmovdqu 0x20($inp),$Ij # I[2] vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpshufb $bswap,$Ij,$Ij vpxor $Zlo,$Xlo,$Xlo vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 vpunpckhqdq $Ij,$Ij,$T1 vpxor $Zhi,$Xhi,$Xhi vpclmulqdq \$0x00,$HK, $T2,$Xmi vpxor $Ij,$T1,$T1 vpxor $Zmi,$Xmi,$Xmi vxorps $Tred,$Xi,$Xi vmovdqu 0x10($inp),$Ii # I[1] vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpshufb $bswap,$Ii,$Ii vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 vpclmulqdq \$0x10,(%r10),$Xi,$Xi vxorps $Xo,$Tred,$Tred vpunpckhqdq $Ii,$Ii,$T2 vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x10,$HK, $T1,$Zmi vmovdqu 0xb0-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vpxor $Xmi,$Zmi,$Zmi vmovdqu ($inp),$Ij # I[0] vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 vpxor $Tred,$Ij,$Ij vpclmulqdq \$0x10,$HK, $T2,$Xmi vpxor $Xi,$Ij,$Ij # accumulate $Xi lea 0x80($inp),$inp sub \$0x80,$len jnc .Loop8x_avx add \$0x80,$len jmp .Ltail_no_xor_avx .align 32 .Lshort_avx: vmovdqu -0x10($inp,$len),$Ii # very last word lea ($inp,$len),$inp vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 vmovdqu 0x20-0x40($Htbl),$HK vpshufb $bswap,$Ii,$Ij vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, vmovdqa $Xhi,$Zhi # $Zhi and vmovdqa $Xmi,$Zmi # $Zmi sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq 
\$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x20($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vpsrldq \$8,$HK,$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x30($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vmovdqu 0x50-0x40($Htbl),$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x40($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vpsrldq \$8,$HK,$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x50($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vmovdqu 0x80-0x40($Htbl),$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x60($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vpsrldq \$8,$HK,$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x70($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vmovq 0xb8-0x40($Htbl),$HK sub \$0x10,$len jmp 
.Ltail_avx .align 32 .Ltail_avx: vpxor $Xi,$Ij,$Ij # accumulate $Xi .Ltail_no_xor_avx: vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vmovdqu (%r10),$Tred vpxor $Xlo,$Zlo,$Xi vpxor $Xhi,$Zhi,$Xo vpxor $Xmi,$Zmi,$Zmi vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing vpxor $Xo, $Zmi,$Zmi vpslldq \$8, $Zmi,$T2 vpsrldq \$8, $Zmi,$Zmi vpxor $T2, $Xi, $Xi vpxor $Zmi,$Xo, $Xo vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase vpalignr \$8,$Xi,$Xi,$Xi vpxor $T2,$Xi,$Xi vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase vpalignr \$8,$Xi,$Xi,$Xi vpxor $Xo,$Xi,$Xi vpxor $T2,$Xi,$Xi cmp \$0,$len jne .Lshort_avx vpshufb $bswap,$Xi,$Xi vmovdqu $Xi,($Xip) vzeroupper ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 movaps 0x10(%rsp),%xmm7 movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 0x40(%rsp),%xmm10 movaps 0x50(%rsp),%xmm11 movaps 0x60(%rsp),%xmm12 movaps 0x70(%rsp),%xmm13 movaps 0x80(%rsp),%xmm14 movaps 0x90(%rsp),%xmm15 lea 0xa8(%rsp),%rsp .LSEH_end_gcm_ghash_avx: ___ $code.=<<___; ret .cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx ___ } else { $code.=<<___; jmp .L_ghash_clmul .cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx ___ } $code.=<<___; .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .L7_mask: .long 7,0,7,0 .L7_mask_poly: .long 7,0,`0xE1<<1`,0 .align 64 .type .Lrem_4bit,\@object .Lrem_4bit: .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16` .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16` .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16` .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16` .type .Lrem_8bit,\@object .Lrem_8bit: .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E .value 
0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE .asciz "GHASH for x86_64, CRYPTOGAMS by " .align 64 ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD 
#		*rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 only: emit se_handler plus the .pdata/.xdata unwind tables.
#
# se_handler is referenced from the .xdata records of gcm_gmult_4bit and
# gcm_ghash_4bit; HandlerData[0] and HandlerData[1] are the RVAs of that
# function's prologue and epilogue labels.  Depending on where the faulting
# Rip lies the handler picks the stack pointer to report:
#   - Rip <  prologue label: the value in context->Rax is used;
#   - Rip >= epilogue label: context->Rsp is used unchanged;
#   - otherwise: context->Rsp+48+280 is used and rbx, rbp, r12-r15 are
#     reloaded from the slots just below it.
# Rsi/Rdi are then reloaded from 8/16 bytes above that pointer, the CONTEXT
# is copied to disp->ContextRecord and RtlVirtualUnwind is called; the
# handler returns ExceptionContinueSearch.
#
# The gcm_init_clmul/gcm_ghash_clmul records (shared by the AVX entry points)
# carry plain unwind codes, annotated byte by byte below, and do not
# reference se_handler.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	lea	48+280(%rax),%rax	# adjust "rsp"

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$`1232/8`,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_gcm_gmult_4bit
	.rva	.LSEH_end_gcm_gmult_4bit
	.rva	.LSEH_info_gcm_gmult_4bit

	.rva	.LSEH_begin_gcm_ghash_4bit
	.rva	.LSEH_end_gcm_ghash_4bit
	.rva	.LSEH_info_gcm_ghash_4bit

	.rva	.LSEH_begin_gcm_init_clmul
	.rva	.LSEH_end_gcm_init_clmul
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_clmul
	.rva	.LSEH_end_gcm_ghash_clmul
	.rva	.LSEH_info_gcm_ghash_clmul
___
# The AVX entry points reuse the clmul unwind descriptions.
$code.=<<___	if ($avx);
	.rva	.LSEH_begin_gcm_init_avx
	.rva	.LSEH_end_gcm_init_avx
	.rva	.LSEH_info_gcm_init_clmul

	.rva	.LSEH_begin_gcm_ghash_avx
	.rva	.LSEH_end_gcm_ghash_avx
	.rva	.LSEH_info_gcm_ghash_clmul
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_gcm_gmult_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lgmult_prologue,.Lgmult_epilogue	# HandlerData
.LSEH_info_gcm_ghash_4bit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lghash_prologue,.Lghash_epilogue	# HandlerData
.LSEH_info_gcm_init_clmul:
	.byte	0x01,0x08,0x03,0x00
	.byte	0x08,0x68,0x00,0x00	#movaps	0x00(rsp),xmm6
	.byte	0x04,0x22,0x00,0x00	#sub	rsp,0x18
.LSEH_info_gcm_ghash_clmul:
	.byte	0x01,0x33,0x16,0x00
	.byte	0x33,0xf8,0x09,0x00	#movaps 0x90(rsp),xmm15
	.byte	0x2e,0xe8,0x08,0x00	#movaps 0x80(rsp),xmm14
	.byte	0x29,0xd8,0x07,0x00	#movaps 0x70(rsp),xmm13
	.byte	0x24,0xc8,0x06,0x00	#movaps 0x60(rsp),xmm12
	.byte	0x1f,0xb8,0x05,0x00	#movaps 0x50(rsp),xmm11
	.byte	0x1a,0xa8,0x04,0x00	#movaps 0x40(rsp),xmm10
	.byte	0x15,0x98,0x03,0x00	#movaps 0x30(rsp),xmm9
	.byte	0x10,0x88,0x02,0x00	#movaps 0x20(rsp),xmm8
	.byte	0x0c,0x78,0x01,0x00	#movaps 0x10(rsp),xmm7
	.byte	0x08,0x68,0x00,0x00	#movaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	#sub	rsp,0xa8
___
}

# Replace every `expr` in the accumulated text with the value of expr.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT or die "error closing STDOUT: $!";
Index: head/crypto/openssl/crypto/poly1305/asm/poly1305-x86.pl
===================================================================
---
head/crypto/openssl/crypto/poly1305/asm/poly1305-x86.pl (revision 364821) +++ head/crypto/openssl/crypto/poly1305/asm/poly1305-x86.pl (revision 364822) @@ -1,1815 +1,1815 @@ #! /usr/bin/env perl # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # This module implements Poly1305 hash for x86. # # April 2015 # # Numbers are cycles per processed byte with poly1305_blocks alone, # measured with rdtsc at fixed clock frequency. 
# # IALU/gcc-3.4(*) SSE2(**) AVX2 # Pentium 15.7/+80% - # PIII 6.21/+90% - # P4 19.8/+40% 3.24 # Core 2 4.85/+90% 1.80 # Westmere 4.58/+100% 1.43 # Sandy Bridge 3.90/+100% 1.36 # Haswell 3.88/+70% 1.18 0.72 # Skylake 3.10/+60% 1.14 0.62 # Silvermont 11.0/+40% 4.80 # Goldmont 4.10/+200% 2.10 # VIA Nano 6.71/+90% 2.47 # Sledgehammer 3.51/+180% 4.27 # Bulldozer 4.53/+140% 1.31 # # (*) gcc 4.8 for some reason generated worse code; # (**) besides SSE2 there are floating-point and AVX options; FP # is deemed unnecessary, because pre-SSE2 processor are too # old to care about, while it's not the fastest option on # SSE2-capable ones; AVX is omitted, because it doesn't give # a lot of improvement, 5-10% depending on processor; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; $output=pop; open STDOUT,">$output"; &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $sse2=$avx=0; for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } if ($sse2) { &static_label("const_sse2"); &static_label("enter_blocks"); &static_label("enter_emit"); &external_label("OPENSSL_ia32cap_P"); if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if (!$avx && $ARGV[0] eq "win32n" && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } - if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) { + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } } ######################################################################## # Layout of opaque area is following. 
#
#	unsigned __int32 h[5];		# current hash value base 2^32
#	unsigned __int32 pad;		# is_base2_26 in vector context
#	unsigned __int32 r[4];		# key value base 2^32

# poly1305_init(ctx, key, func_table)
#
# Zeroes h[0..4] and the pad/is_base2_26 word.  With a NULL key it stops
# there, leaving eax at 0.  Otherwise the four key words are masked
# (0x0fffffff for the first, 0x0ffffffc for the other three) and stored at
# ctx+4*6..4*9, and eax is set to $sse2.  In SSE2 builds the two-slot
# function table is filled first: slot 0 gets the blocks routine, slot 1 the
# emit routine, upgraded to the _sse2 (and, when $avx>1, _avx2 blocks)
# variants according to the OPENSSL_ia32cap_P bits tested below.
&align(64);
&function_begin("poly1305_init");
	&mov	("edi",&wparam(0));		# context
	&mov	("esi",&wparam(1));		# key
	&mov	("ebp",&wparam(2));		# function table

	&xor	("eax","eax");
	&mov	(&DWP(4*0,"edi"),"eax");	# zero hash value
	&mov	(&DWP(4*1,"edi"),"eax");
	&mov	(&DWP(4*2,"edi"),"eax");
	&mov	(&DWP(4*3,"edi"),"eax");
	&mov	(&DWP(4*4,"edi"),"eax");
	&mov	(&DWP(4*5,"edi"),"eax");	# is_base2_26

	&cmp	("esi",0);
	&je	(&label("nokey"));

    if ($sse2) {
	&call	(&label("pic_point"));
    &set_label("pic_point");
	&blindpop("ebx");

	&lea	("eax",&DWP("poly1305_blocks-".&label("pic_point"),"ebx"));
	&lea	("edx",&DWP("poly1305_emit-".&label("pic_point"),"ebx"));

	&picmeup("edi","OPENSSL_ia32cap_P","ebx",&label("pic_point"));
	&mov	("ecx",&DWP(0,"edi"));
	&and	("ecx",1<<26|1<<24);
	&cmp	("ecx",1<<26|1<<24);		# SSE2 and XMM?
	&jne	(&label("no_sse2"));

	&lea	("eax",&DWP("_poly1305_blocks_sse2-".&label("pic_point"),"ebx"));
	&lea	("edx",&DWP("_poly1305_emit_sse2-".&label("pic_point"),"ebx"));

      if ($avx>1) {
	&mov	("ecx",&DWP(8,"edi"));
	&test	("ecx",1<<5);			# AVX2?
	&jz	(&label("no_sse2"));

	&lea	("eax",&DWP("_poly1305_blocks_avx2-".&label("pic_point"),"ebx"));
      }
    &set_label("no_sse2");
	&mov	("edi",&wparam(0));		# reload context
	&mov	(&DWP(0,"ebp"),"eax");		# fill function table
	&mov	(&DWP(4,"ebp"),"edx");
    }

	&mov	("eax",&DWP(4*0,"esi"));	# load input key
	&mov	("ebx",&DWP(4*1,"esi"));
	&mov	("ecx",&DWP(4*2,"esi"));
	&mov	("edx",&DWP(4*3,"esi"));
	&and	("eax",0x0fffffff);
	&and	("ebx",0x0ffffffc);
	&and	("ecx",0x0ffffffc);
	&and	("edx",0x0ffffffc);
	&mov	(&DWP(4*6,"edi"),"eax");
	&mov	(&DWP(4*7,"edi"),"ebx");
	&mov	(&DWP(4*8,"edi"),"ecx");
	&mov	(&DWP(4*9,"edi"),"edx");

	&mov	("eax",$sse2);
&set_label("nokey");
&function_end("poly1305_init");

# Byte offsets of the 16 dword scratch slots reserved by stack_push(16).
($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3,
 $r0,$r1,$r2,$r3, $s1,$s2,$s3)=map(4*$_,(0..15));

# poly1305_blocks(ctx, inp, len, padbit)
#
# Consumes the input 16 bytes per iteration.  Each pass adds four input
# words plus padbit (into the fifth limb) to h, multiplies by r using the
# precomputed s[i] = r[i] + (r[i]>>2), and folds the bits above the low two
# of the top limb back in multiplied by 5.  The five limbs are written back
# to the context when the end-of-input pointer is reached.
&function_begin("poly1305_blocks");
	&mov	("edi",&wparam(0));		# ctx
	&mov	("esi",&wparam(1));		# inp
	&mov	("ecx",&wparam(2));		# len
&set_label("enter_blocks");
	# Whole 16-byte blocks only.  The mask has to clear all four low bits:
	# the loop below exits on pointer equality with inp+len, so a stray
	# low bit left in len would make it run past the end of input.
	&and	("ecx",-16);
	&jz	(&label("nodata"));

	&stack_push(16);
	&mov	("eax",&DWP(4*6,"edi"));	# r0
	&mov	("ebx",&DWP(4*7,"edi"));	# r1
	&lea	("ebp",&DWP(0,"esi","ecx"));	# end of input
	&mov	("ecx",&DWP(4*8,"edi"));	# r2
	&mov	("edx",&DWP(4*9,"edi"));	# r3

	&mov	(&wparam(2),"ebp");
	&mov	("ebp","esi");

	&mov	(&DWP($r0,"esp"),"eax");	# r0
	&mov	("eax","ebx");
	&shr	("eax",2);
	&mov	(&DWP($r1,"esp"),"ebx");	# r1
	&add	("eax","ebx");			# s1
	&mov	("ebx","ecx");
	&shr	("ebx",2);
	&mov	(&DWP($r2,"esp"),"ecx");	# r2
	&add	("ebx","ecx");			# s2
	&mov	("ecx","edx");
	&shr	("ecx",2);
	&mov	(&DWP($r3,"esp"),"edx");	# r3
	&add	("ecx","edx");			# s3
	&mov	(&DWP($s1,"esp"),"eax");	# s1
	&mov	(&DWP($s2,"esp"),"ebx");	# s2
	&mov	(&DWP($s3,"esp"),"ecx");	# s3

	&mov	("eax",&DWP(4*0,"edi"));	# load hash value
	&mov	("ebx",&DWP(4*1,"edi"));
	&mov	("ecx",&DWP(4*2,"edi"));
	&mov	("esi",&DWP(4*3,"edi"));
	&mov	("edi",&DWP(4*4,"edi"));
	&jmp	(&label("loop"));

&set_label("loop",32);
	&add	("eax",&DWP(4*0,"ebp"));	# accumulate input
	&adc	("ebx",&DWP(4*1,"ebp"));
	&adc	("ecx",&DWP(4*2,"ebp"));
	&adc	("esi",&DWP(4*3,"ebp"));
	&lea	("ebp",&DWP(4*4,"ebp"));
	&adc	("edi",&wparam(3));		# padbit

	&mov	(&DWP($h0,"esp"),"eax");	# put aside hash[+inp]
	&mov	(&DWP($h3,"esp"),"esi");

	&mul	(&DWP($r0,"esp"));		# h0*r0
	&mov	(&DWP($h4,"esp"),"edi");
	&mov	("edi","eax");
	&mov	("eax","ebx");			# h1
	&mov	("esi","edx");
	&mul	(&DWP($s3,"esp"));		# h1*s3
	&add	("edi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("esi","edx");
	&mul	(&DWP($s2,"esp"));		# h2*s2
	&add	("edi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("esi","edx");
	&mul	(&DWP($s1,"esp"));		# h3*s1
	&add	("edi","eax");
	&mov	("eax",&DWP($h0,"esp"));
	&adc	("esi","edx");

	&mul	(&DWP($r1,"esp"));		# h0*r1
	&mov	(&DWP($d0,"esp"),"edi");
	&xor	("edi","edi");
	&add	("esi","eax");
	&mov	("eax","ebx");			# h1
	&adc	("edi","edx");
	&mul	(&DWP($r0,"esp"));		# h1*r0
	&add	("esi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("edi","edx");
	&mul	(&DWP($s3,"esp"));		# h2*s3
	&add	("esi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("edi","edx");
	&mul	(&DWP($s2,"esp"));		# h3*s2
	&add	("esi","eax");
	&mov	("eax",&DWP($h4,"esp"));
	&adc	("edi","edx");
	&imul	("eax",&DWP($s1,"esp"));	# h4*s1
	&add	("esi","eax");
	&mov	("eax",&DWP($h0,"esp"));
	&adc	("edi",0);

	&mul	(&DWP($r2,"esp"));		# h0*r2
	&mov	(&DWP($d1,"esp"),"esi");
	&xor	("esi","esi");
	&add	("edi","eax");
	&mov	("eax","ebx");			# h1
	&adc	("esi","edx");
	&mul	(&DWP($r1,"esp"));		# h1*r1
	&add	("edi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("esi","edx");
	&mul	(&DWP($r0,"esp"));		# h2*r0
	&add	("edi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("esi","edx");
	&mul	(&DWP($s3,"esp"));		# h3*s3
	&add	("edi","eax");
	&mov	("eax",&DWP($h4,"esp"));
	&adc	("esi","edx");
	&imul	("eax",&DWP($s2,"esp"));	# h4*s2
	&add	("edi","eax");
	&mov	("eax",&DWP($h0,"esp"));
	&adc	("esi",0);

	&mul	(&DWP($r3,"esp"));		# h0*r3
	&mov	(&DWP($d2,"esp"),"edi");
	&xor	("edi","edi");
	&add	("esi","eax");
	&mov	("eax","ebx");			# h1
	&adc	("edi","edx");
	&mul	(&DWP($r2,"esp"));		# h1*r2
	&add	("esi","eax");
	&mov	("eax","ecx");			# h2
	&adc	("edi","edx");
	&mul	(&DWP($r1,"esp"));		# h2*r1
	&add	("esi","eax");
	&mov	("eax",&DWP($h3,"esp"));
	&adc	("edi","edx");
	&mul	(&DWP($r0,"esp"));		# h3*r0
	&add	("esi","eax");
	&mov	("ecx",&DWP($h4,"esp"));
	&adc	("edi","edx");

	&mov	("edx","ecx");
	&imul	("ecx",&DWP($s3,"esp"));	# h4*s3
	&add	("esi","ecx");
	&mov	("eax",&DWP($d0,"esp"));
	&adc	("edi",0);

	&imul	("edx",&DWP($r0,"esp"));	# h4*r0
	&add	("edx","edi");

	&mov	("ebx",&DWP($d1,"esp"));
	&mov	("ecx",&DWP($d2,"esp"));

	&mov	("edi","edx");			# last reduction step
	&shr	("edx",2);
	&and	("edi",3);
	&lea	("edx",&DWP(0,"edx","edx",4));	# *5
	&add	("eax","edx");
	&adc	("ebx",0);
	&adc	("ecx",0);
	&adc	("esi",0);
	&adc	("edi",0);

	&cmp	("ebp",&wparam(2));		# done yet?
	&jne	(&label("loop"));

	&mov	("edx",&wparam(0));		# ctx
	&stack_pop(16);
	&mov	(&DWP(4*0,"edx"),"eax");	# store hash value
	&mov	(&DWP(4*1,"edx"),"ebx");
	&mov	(&DWP(4*2,"edx"),"ecx");
	&mov	(&DWP(4*3,"edx"),"esi");
	&mov	(&DWP(4*4,"edx"),"edi");
&set_label("nodata");
&function_end("poly1305_blocks");

# poly1305_emit(ctx, out, key)
#
# Computes h+5 across the five limbs; the top limb shifted right by two and
# negated gives an all-ones mask exactly when that addition carried past
# bit 130.  The mask selects the low four words of h+5, its complement the
# low four words of the original h.  The four words at the third argument
# are then added with carry and the 16-byte result is stored to out.
&function_begin("poly1305_emit");
	&mov	("ebp",&wparam(0));		# context
&set_label("enter_emit");
	&mov	("edi",&wparam(1));		# output
	&mov	("eax",&DWP(4*0,"ebp"));	# load hash value
	&mov	("ebx",&DWP(4*1,"ebp"));
	&mov	("ecx",&DWP(4*2,"ebp"));
	&mov	("edx",&DWP(4*3,"ebp"));
	&mov	("esi",&DWP(4*4,"ebp"));

	&add	("eax",5);			# compare to modulus
	&adc	("ebx",0);
	&adc	("ecx",0);
	&adc	("edx",0);
	&adc	("esi",0);
	&shr	("esi",2);			# did it carry/borrow?
	&neg	("esi");			# do we choose hash-modulus?

	&and	("eax","esi");
	&and	("ebx","esi");
	&and	("ecx","esi");
	&and	("edx","esi");
	&mov	(&DWP(4*0,"edi"),"eax");
	&mov	(&DWP(4*1,"edi"),"ebx");
	&mov	(&DWP(4*2,"edi"),"ecx");
	&mov	(&DWP(4*3,"edi"),"edx");

	&not	("esi");			# or original hash value?
	&mov	("eax",&DWP(4*0,"ebp"));
	&mov	("ebx",&DWP(4*1,"ebp"));
	&mov	("ecx",&DWP(4*2,"ebp"));
	&mov	("edx",&DWP(4*3,"ebp"));
	&mov	("ebp",&wparam(2));
	&and	("eax","esi");
	&and	("ebx","esi");
	&and	("ecx","esi");
	&and	("edx","esi");
	&or	("eax",&DWP(4*0,"edi"));
	&or	("ebx",&DWP(4*1,"edi"));
	&or	("ecx",&DWP(4*2,"edi"));
	&or	("edx",&DWP(4*3,"edi"));

	&add	("eax",&DWP(4*0,"ebp"));	# accumulate key
	&adc	("ebx",&DWP(4*1,"ebp"));
	&adc	("ecx",&DWP(4*2,"ebp"));
	&adc	("edx",&DWP(4*3,"ebp"));

	&mov	(&DWP(4*0,"edi"),"eax");
	&mov	(&DWP(4*1,"edi"),"ebx");
	&mov	(&DWP(4*2,"edi"),"ecx");
	&mov	(&DWP(4*3,"edi"),"edx");
&function_end("poly1305_emit");

if ($sse2) {
########################################################################
# Layout of opaque area is following.
#
#	unsigned __int32 h[5];		# current hash value base 2^26
#	unsigned __int32 is_base2_26;
#	unsigned __int32 r[4];		# key value base 2^32
#	unsigned __int32 pad[2];
#	struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9];
#
# where r^n are base 2^26 digits of degrees of multiplier key. There are
# 5 digits, but last four are interleaved with multiples of 5, totalling
# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7));
my $MASK=$T2;	# borrow and keep in mind
		# ($MASK and $T2 name the same register, xmm7, so any code
		# that uses $T2 as a temporary has to reload the mask)

########################################################################
# _poly1305_init_sse2 is emitted without the standard prologue
# (function_begin_B). It loads 16 bytes from 4*6(%edi) ("key base 2^32"),
# splits them into five base 2^26 digits in $D0-$D4 and then runs the
# "square" loop below with %ecx as a two-pass counter. %ebx must already
# address the constant table: the digit mask is read from 64(%ebx).
# %esp is saved in %ebp and a 16-byte aligned scratch frame of
# 16*(9+5) bytes is carved out; %edx addresses its upper five rows.
&align	(32);
&function_begin_B("_poly1305_init_sse2");
	&movdqu		($D4,&QWP(4*6,"edi"));		# key base 2^32
	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
	&mov		("ebp","esp");
	&sub		("esp",16*(9+5));
	&and		("esp",-16);

	#&pand		($D4,&QWP(96,"ebx"));		# magic mask
	&movq		($MASK,&QWP(64,"ebx"));

	&movdqa		($D0,$D4);
	&movdqa		($D1,$D4);
	&movdqa		($D2,$D4);

	&pand		($D0,$MASK);			# -> base 2^26
	&psrlq		($D1,26);
	&psrldq		($D2,6);
	&pand		($D1,$MASK);
	&movdqa		($D3,$D2);
	&psrlq		($D2,4);
	&psrlq		($D3,30);
	&pand		($D2,$MASK);
	&pand		($D3,$MASK);
	&psrldq		($D4,13);

	&lea		("edx",&DWP(16*9,"esp"));	# size optimization
	&mov		("ecx",2);
&set_label("square");
	# rows 0-4 of the frame: current digits
	&movdqa		(&QWP(16*0,"esp"),$D0);
	&movdqa		(&QWP(16*1,"esp"),$D1);
	&movdqa		(&QWP(16*2,"esp"),$D2);
	&movdqa		(&QWP(16*3,"esp"),$D3);
	&movdqa		(&QWP(16*4,"esp"),$D4);

	# rows 5-8 of the frame: digits 1-4 multiplied by 5 (x*4 + x)
	&movdqa		($T1,$D1);
	&movdqa		($T0,$D2);
	&pslld		($T1,2);
	&pslld		($T0,2);
	&paddd		($T1,$D1);			# *5
	&paddd		($T0,$D2);			# *5
	&movdqa		(&QWP(16*5,"esp"),$T1);
	&movdqa		(&QWP(16*6,"esp"),$T0);
	&movdqa		($T1,$D3);
	&movdqa		($T0,$D4);
	&pslld		($T1,2);
	&pslld		($T0,2);
	&paddd		($T1,$D3);			# *5
	&paddd		($T0,$D4);			# *5
	&movdqa		(&QWP(16*7,"esp"),$T1);
	&movdqa		(&QWP(16*8,"esp"),$T0);

	# duplicate the low two dwords of every digit into both qword
	# lanes and keep the copies at 0..4(%edx) as multiplicands
	&pshufd		($T1,$D0,0b01000100);
	&movdqa		($T0,$D1);
	&pshufd		($D1,$D1,0b01000100);
	&pshufd		($D2,$D2,0b01000100);
	&pshufd		($D3,$D3,0b01000100);
	&pshufd		($D4,$D4,0b01000100);
	&movdqa		(&QWP(16*0,"edx"),$T1);
	&movdqa		(&QWP(16*1,"edx"),$D1);
	&movdqa		(&QWP(16*2,"edx"),$D2);
	&movdqa		(&QWP(16*3,"edx"),$D3);
	&movdqa		(&QWP(16*4,"edx"),$D4);

	################################################################
	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	&pmuludq	($D4,$D0);			# h4*r0
	&pmuludq	($D3,$D0);			# h3*r0
	&pmuludq	($D2,$D0);			# h2*r0
	&pmuludq	($D1,$D0);			# h1*r0
&pmuludq ($D0,$T1); # h0*r0 sub pmuladd { my $load = shift; my $base = shift; $base = "esp" if (!defined($base)); ################################################################ # As for choice to "rotate" $T0-$T2 in order to move paddq # past next multiplication. While it makes code harder to read # and doesn't have significant effect on most processors, it # makes a lot of difference on Atom, up to 30% improvement. &movdqa ($T1,$T0); &pmuludq ($T0,&QWP(16*3,$base)); # r1*h3 &movdqa ($T2,$T1); &pmuludq ($T1,&QWP(16*2,$base)); # r1*h2 &paddq ($D4,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&QWP(16*1,$base)); # r1*h1 &paddq ($D3,$T1); &$load ($T1,5); # s1 &pmuludq ($T0,&QWP(16*0,$base)); # r1*h0 &paddq ($D2,$T2); &pmuludq ($T1,&QWP(16*4,$base)); # s1*h4 &$load ($T2,2); # r2^n &paddq ($D1,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&QWP(16*2,$base)); # r2*h2 &paddq ($D0,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&QWP(16*1,$base)); # r2*h1 &paddq ($D4,$T2); &$load ($T2,6); # s2^n &pmuludq ($T1,&QWP(16*0,$base)); # r2*h0 &paddq ($D3,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&QWP(16*4,$base)); # s2*h4 &paddq ($D2,$T1); &pmuludq ($T0,&QWP(16*3,$base)); # s2*h3 &$load ($T1,3); # r3^n &paddq ($D1,$T2); &movdqa ($T2,$T1); &pmuludq ($T1,&QWP(16*1,$base)); # r3*h1 &paddq ($D0,$T0); &$load ($T0,7); # s3^n &pmuludq ($T2,&QWP(16*0,$base)); # r3*h0 &paddq ($D4,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&QWP(16*4,$base)); # s3*h4 &paddq ($D3,$T2); &movdqa ($T2,$T1); &pmuludq ($T1,&QWP(16*3,$base)); # s3*h3 &paddq ($D2,$T0); &pmuludq ($T2,&QWP(16*2,$base)); # s3*h2 &$load ($T0,4); # r4^n &paddq ($D1,$T1); &$load ($T1,8); # s4^n &pmuludq ($T0,&QWP(16*0,$base)); # r4*h0 &paddq ($D0,$T2); &movdqa ($T2,$T1); &pmuludq ($T1,&QWP(16*4,$base)); # s4*h4 &paddq ($D4,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&QWP(16*1,$base)); # s4*h1 &paddq ($D3,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&QWP(16*2,$base)); # s4*h2 &paddq ($D0,$T2); &pmuludq ($T1,&QWP(16*3,$base)); # s4*h3 &movdqa ($MASK,&QWP(64,"ebx")); &paddq 
($D1,$T0); &paddq ($D2,$T1); } &pmuladd (sub { my ($reg,$i)=@_; &movdqa ($reg,&QWP(16*$i,"esp")); },"edx"); sub lazy_reduction { my $extra = shift; ################################################################ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein # and P. Schwabe # # [(*) see discussion in poly1305-armv4 module] &movdqa ($T0,$D3); &pand ($D3,$MASK); &psrlq ($T0,26); &$extra () if (defined($extra)); &paddq ($T0,$D4); # h3 -> h4 &movdqa ($T1,$D0); &pand ($D0,$MASK); &psrlq ($T1,26); &movdqa ($D4,$T0); &paddq ($T1,$D1); # h0 -> h1 &psrlq ($T0,26); &pand ($D4,$MASK); &movdqa ($D1,$T1); &psrlq ($T1,26); &paddd ($D0,$T0); # favour paddd when # possible, because # paddq is "broken" # on Atom &psllq ($T0,2); &paddq ($T1,$D2); # h1 -> h2 &paddq ($T0,$D0); # h4 -> h0 (*) &pand ($D1,$MASK); &movdqa ($D2,$T1); &psrlq ($T1,26); &pand ($D2,$MASK); &paddd ($T1,$D3); # h2 -> h3 &movdqa ($D0,$T0); &psrlq ($T0,26); &movdqa ($D3,$T1); &psrlq ($T1,26); &pand ($D0,$MASK); &paddd ($D1,$T0); # h0 -> h1 &pand ($D3,$MASK); &paddd ($D4,$T1); # h3 -> h4 } &lazy_reduction (); &dec ("ecx"); &jz (&label("square_break")); &punpcklqdq ($D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2 &punpcklqdq ($D1,&QWP(16*1,"esp")); &punpcklqdq ($D2,&QWP(16*2,"esp")); &punpcklqdq ($D3,&QWP(16*3,"esp")); &punpcklqdq ($D4,&QWP(16*4,"esp")); &jmp (&label("square")); &set_label("square_break"); &psllq ($D0,32); # -> r^3:0:r^4:0 &psllq ($D1,32); &psllq ($D2,32); &psllq ($D3,32); &psllq ($D4,32); &por ($D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2 &por ($D1,&QWP(16*1,"esp")); &por ($D2,&QWP(16*2,"esp")); &por ($D3,&QWP(16*3,"esp")); &por ($D4,&QWP(16*4,"esp")); &pshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4 &pshufd ($D1,$D1,0b10001101); &pshufd ($D2,$D2,0b10001101); &pshufd ($D3,$D3,0b10001101); &pshufd ($D4,$D4,0b10001101); &movdqu (&QWP(16*0,"edi"),$D0); # save the table &movdqu (&QWP(16*1,"edi"),$D1); &movdqu (&QWP(16*2,"edi"),$D2); &movdqu (&QWP(16*3,"edi"),$D3); &movdqu (&QWP(16*4,"edi"),$D4); 
&movdqa ($T1,$D1); &movdqa ($T0,$D2); &pslld ($T1,2); &pslld ($T0,2); &paddd ($T1,$D1); # *5 &paddd ($T0,$D2); # *5 &movdqu (&QWP(16*5,"edi"),$T1); &movdqu (&QWP(16*6,"edi"),$T0); &movdqa ($T1,$D3); &movdqa ($T0,$D4); &pslld ($T1,2); &pslld ($T0,2); &paddd ($T1,$D3); # *5 &paddd ($T0,$D4); # *5 &movdqu (&QWP(16*7,"edi"),$T1); &movdqu (&QWP(16*8,"edi"),$T0); &mov ("esp","ebp"); &lea ("edi",&DWP(-16*3,"edi")); # size de-optimization &ret (); &function_end_B("_poly1305_init_sse2"); &align (32); &function_begin("_poly1305_blocks_sse2"); &mov ("edi",&wparam(0)); # ctx &mov ("esi",&wparam(1)); # inp &mov ("ecx",&wparam(2)); # len &mov ("eax",&DWP(4*5,"edi")); # is_base2_26 &and ("ecx",-16); &jz (&label("nodata")); &cmp ("ecx",64); &jae (&label("enter_sse2")); &test ("eax","eax"); # is_base2_26? &jz (&label("enter_blocks")); &set_label("enter_sse2",16); &call (&label("pic_point")); &set_label("pic_point"); &blindpop("ebx"); &lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx")); &test ("eax","eax"); # is_base2_26? 
&jnz (&label("base2_26")); &call ("_poly1305_init_sse2"); ################################################# base 2^32 -> base 2^26 &mov ("eax",&DWP(0,"edi")); &mov ("ecx",&DWP(3,"edi")); &mov ("edx",&DWP(6,"edi")); &mov ("esi",&DWP(9,"edi")); &mov ("ebp",&DWP(13,"edi")); &mov (&DWP(4*5,"edi"),1); # is_base2_26 &shr ("ecx",2); &and ("eax",0x3ffffff); &shr ("edx",4); &and ("ecx",0x3ffffff); &shr ("esi",6); &and ("edx",0x3ffffff); &movd ($D0,"eax"); &movd ($D1,"ecx"); &movd ($D2,"edx"); &movd ($D3,"esi"); &movd ($D4,"ebp"); &mov ("esi",&wparam(1)); # [reload] inp &mov ("ecx",&wparam(2)); # [reload] len &jmp (&label("base2_32")); &set_label("base2_26",16); &movd ($D0,&DWP(4*0,"edi")); # load hash value &movd ($D1,&DWP(4*1,"edi")); &movd ($D2,&DWP(4*2,"edi")); &movd ($D3,&DWP(4*3,"edi")); &movd ($D4,&DWP(4*4,"edi")); &movdqa ($MASK,&QWP(64,"ebx")); &set_label("base2_32"); &mov ("eax",&wparam(3)); # padbit &mov ("ebp","esp"); &sub ("esp",16*(5+5+5+9+9)); &and ("esp",-16); &lea ("edi",&DWP(16*3,"edi")); # size optimization &shl ("eax",24); # padbit &test ("ecx",31); &jz (&label("even")); ################################################################ # process single block, with SSE2, because it's still faster # even though half of result is discarded &movdqu ($T1,&QWP(0,"esi")); # input &lea ("esi",&DWP(16,"esi")); &movdqa ($T0,$T1); # -> base 2^26 ... &pand ($T1,$MASK); &paddd ($D0,$T1); # ... 
and accumulate &movdqa ($T1,$T0); &psrlq ($T0,26); &psrldq ($T1,6); &pand ($T0,$MASK); &paddd ($D1,$T0); &movdqa ($T0,$T1); &psrlq ($T1,4); &pand ($T1,$MASK); &paddd ($D2,$T1); &movdqa ($T1,$T0); &psrlq ($T0,30); &pand ($T0,$MASK); &psrldq ($T1,7); &paddd ($D3,$T0); &movd ($T0,"eax"); # padbit &paddd ($D4,$T1); &movd ($T1,&DWP(16*0+12,"edi")); # r0 &paddd ($D4,$T0); &movdqa (&QWP(16*0,"esp"),$D0); &movdqa (&QWP(16*1,"esp"),$D1); &movdqa (&QWP(16*2,"esp"),$D2); &movdqa (&QWP(16*3,"esp"),$D3); &movdqa (&QWP(16*4,"esp"),$D4); ################################################################ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 &pmuludq ($D0,$T1); # h4*r0 &pmuludq ($D1,$T1); # h3*r0 &pmuludq ($D2,$T1); # h2*r0 &movd ($T0,&DWP(16*1+12,"edi")); # r1 &pmuludq ($D3,$T1); # h1*r0 &pmuludq ($D4,$T1); # h0*r0 &pmuladd (sub { my ($reg,$i)=@_; &movd ($reg,&DWP(16*$i+12,"edi")); }); &lazy_reduction (); &sub ("ecx",16); &jz (&label("done")); &set_label("even"); &lea ("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization &lea ("eax",&DWP(-16*2,"esi")); &sub ("ecx",64); ################################################################ # expand and copy pre-calculated table to stack &movdqu ($T0,&QWP(16*0,"edi")); # r^1:r^2:r^3:r^4 &pshufd ($T1,$T0,0b01000100); # duplicate r^3:r^4 &cmovb ("esi","eax"); &pshufd ($T0,$T0,0b11101110); # duplicate r^1:r^2 &movdqa (&QWP(16*0,"edx"),$T1); &lea ("eax",&DWP(16*10,"esp")); &movdqu ($T1,&QWP(16*1,"edi")); &movdqa (&QWP(16*(0-9),"edx"),$T0); &pshufd ($T0,$T1,0b01000100); &pshufd ($T1,$T1,0b11101110); &movdqa (&QWP(16*1,"edx"),$T0); &movdqu ($T0,&QWP(16*2,"edi")); &movdqa (&QWP(16*(1-9),"edx"),$T1); &pshufd ($T1,$T0,0b01000100); &pshufd ($T0,$T0,0b11101110); &movdqa (&QWP(16*2,"edx"),$T1); &movdqu ($T1,&QWP(16*3,"edi")); &movdqa 
(&QWP(16*(2-9),"edx"),$T0); &pshufd ($T0,$T1,0b01000100); &pshufd ($T1,$T1,0b11101110); &movdqa (&QWP(16*3,"edx"),$T0); &movdqu ($T0,&QWP(16*4,"edi")); &movdqa (&QWP(16*(3-9),"edx"),$T1); &pshufd ($T1,$T0,0b01000100); &pshufd ($T0,$T0,0b11101110); &movdqa (&QWP(16*4,"edx"),$T1); &movdqu ($T1,&QWP(16*5,"edi")); &movdqa (&QWP(16*(4-9),"edx"),$T0); &pshufd ($T0,$T1,0b01000100); &pshufd ($T1,$T1,0b11101110); &movdqa (&QWP(16*5,"edx"),$T0); &movdqu ($T0,&QWP(16*6,"edi")); &movdqa (&QWP(16*(5-9),"edx"),$T1); &pshufd ($T1,$T0,0b01000100); &pshufd ($T0,$T0,0b11101110); &movdqa (&QWP(16*6,"edx"),$T1); &movdqu ($T1,&QWP(16*7,"edi")); &movdqa (&QWP(16*(6-9),"edx"),$T0); &pshufd ($T0,$T1,0b01000100); &pshufd ($T1,$T1,0b11101110); &movdqa (&QWP(16*7,"edx"),$T0); &movdqu ($T0,&QWP(16*8,"edi")); &movdqa (&QWP(16*(7-9),"edx"),$T1); &pshufd ($T1,$T0,0b01000100); &pshufd ($T0,$T0,0b11101110); &movdqa (&QWP(16*8,"edx"),$T1); &movdqa (&QWP(16*(8-9),"edx"),$T0); sub load_input { my ($inpbase,$offbase)=@_; &movdqu ($T0,&QWP($inpbase+0,"esi")); # load input &movdqu ($T1,&QWP($inpbase+16,"esi")); &lea ("esi",&DWP(16*2,"esi")); &movdqa (&QWP($offbase+16*2,"esp"),$D2); &movdqa (&QWP($offbase+16*3,"esp"),$D3); &movdqa (&QWP($offbase+16*4,"esp"),$D4); &movdqa ($D2,$T0); # splat input &movdqa ($D3,$T1); &psrldq ($D2,6); &psrldq ($D3,6); &movdqa ($D4,$T0); &punpcklqdq ($D2,$D3); # 2:3 &punpckhqdq ($D4,$T1); # 4 &punpcklqdq ($T0,$T1); # 0:1 &movdqa ($D3,$D2); &psrlq ($D2,4); &psrlq ($D3,30); &movdqa ($T1,$T0); &psrlq ($D4,40); # 4 &psrlq ($T1,26); &pand ($T0,$MASK); # 0 &pand ($T1,$MASK); # 1 &pand ($D2,$MASK); # 2 &pand ($D3,$MASK); # 3 &por ($D4,&QWP(0,"ebx")); # padbit, yes, always &movdqa (&QWP($offbase+16*0,"esp"),$D0) if ($offbase); &movdqa (&QWP($offbase+16*1,"esp"),$D1) if ($offbase); } &load_input (16*2,16*5); &jbe (&label("skip_loop")); &jmp (&label("loop")); &set_label("loop",32); ################################################################ # 
((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r # \___________________/ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r # \___________________/ \____________________/ ################################################################ &movdqa ($T2,&QWP(16*(0-9),"edx")); # r0^2 &movdqa (&QWP(16*1,"eax"),$T1); &movdqa (&QWP(16*2,"eax"),$D2); &movdqa (&QWP(16*3,"eax"),$D3); &movdqa (&QWP(16*4,"eax"),$D4); ################################################################ # d4 = h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1 # d3 = h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 # d1 = h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2 # d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 &movdqa ($D1,$T0); &pmuludq ($T0,$T2); # h0*r0 &movdqa ($D0,$T1); &pmuludq ($T1,$T2); # h1*r0 &pmuludq ($D2,$T2); # h2*r0 &pmuludq ($D3,$T2); # h3*r0 &pmuludq ($D4,$T2); # h4*r0 sub pmuladd_alt { my $addr = shift; &pmuludq ($D0,&$addr(8)); # h1*s4 &movdqa ($T2,$D1); &pmuludq ($D1,&$addr(1)); # h0*r1 &paddq ($D0,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&$addr(2)); # h0*r2 &paddq ($D1,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&$addr(3)); # h0*r3 &paddq ($D2,$T2); &movdqa ($T2,&QWP(16*1,"eax")); # pull h1 &pmuludq ($T1,&$addr(4)); # h0*r4 &paddq ($D3,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&$addr(1)); # h1*r1 &paddq ($D4,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&$addr(2)); # h1*r2 &paddq ($D2,$T2); &movdqa ($T2,&QWP(16*2,"eax")); # pull h2 &pmuludq ($T1,&$addr(3)); # h1*r3 &paddq ($D3,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&$addr(7)); # h2*s3 &paddq ($D4,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&$addr(8)); # h2*s4 &paddq ($D0,$T2); &movdqa ($T2,$T1); &pmuludq ($T1,&$addr(1)); # h2*r1 &paddq ($D1,$T0); &movdqa ($T0,&QWP(16*3,"eax")); # pull h3 &pmuludq ($T2,&$addr(2)); # h2*r2 &paddq ($D3,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&$addr(6)); # h3*s2 &paddq ($D4,$T2); 
&movdqa ($T2,$T1); &pmuludq ($T1,&$addr(7)); # h3*s3 &paddq ($D0,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&$addr(8)); # h3*s4 &paddq ($D1,$T1); &movdqa ($T1,&QWP(16*4,"eax")); # pull h4 &pmuludq ($T0,&$addr(1)); # h3*r1 &paddq ($D2,$T2); &movdqa ($T2,$T1); &pmuludq ($T1,&$addr(8)); # h4*s4 &paddq ($D4,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&$addr(5)); # h4*s1 &paddq ($D3,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&$addr(6)); # h4*s2 &paddq ($D0,$T2); &movdqa ($MASK,&QWP(64,"ebx")); &pmuludq ($T1,&$addr(7)); # h4*s3 &paddq ($D1,$T0); &paddq ($D2,$T1); } &pmuladd_alt (sub { my $i=shift; &QWP(16*($i-9),"edx"); }); &load_input (-16*2,0); &lea ("eax",&DWP(-16*2,"esi")); &sub ("ecx",64); &paddd ($T0,&QWP(16*(5+0),"esp")); # add hash value &paddd ($T1,&QWP(16*(5+1),"esp")); &paddd ($D2,&QWP(16*(5+2),"esp")); &paddd ($D3,&QWP(16*(5+3),"esp")); &paddd ($D4,&QWP(16*(5+4),"esp")); &cmovb ("esi","eax"); &lea ("eax",&DWP(16*10,"esp")); &movdqa ($T2,&QWP(16*0,"edx")); # r0^4 &movdqa (&QWP(16*1,"esp"),$D1); &movdqa (&QWP(16*1,"eax"),$T1); &movdqa (&QWP(16*2,"eax"),$D2); &movdqa (&QWP(16*3,"eax"),$D3); &movdqa (&QWP(16*4,"eax"),$D4); ################################################################ # d4 += h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1 # d3 += h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4 # d2 += h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 # d1 += h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2 # d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 &movdqa ($D1,$T0); &pmuludq ($T0,$T2); # h0*r0 &paddq ($T0,$D0); &movdqa ($D0,$T1); &pmuludq ($T1,$T2); # h1*r0 &pmuludq ($D2,$T2); # h2*r0 &pmuludq ($D3,$T2); # h3*r0 &pmuludq ($D4,$T2); # h4*r0 &paddq ($T1,&QWP(16*1,"esp")); &paddq ($D2,&QWP(16*2,"esp")); &paddq ($D3,&QWP(16*3,"esp")); &paddq ($D4,&QWP(16*4,"esp")); &pmuladd_alt (sub { my $i=shift; &QWP(16*$i,"edx"); }); &lazy_reduction (); &load_input (16*2,16*5); &ja (&label("loop")); &set_label("skip_loop"); ################################################################ # multiply 
(inp[0:1]+hash) or inp[2:3] by r^2:r^1 &pshufd ($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n &add ("ecx",32); &jnz (&label("long_tail")); &paddd ($T0,$D0); # add hash value &paddd ($T1,$D1); &paddd ($D2,&QWP(16*7,"esp")); &paddd ($D3,&QWP(16*8,"esp")); &paddd ($D4,&QWP(16*9,"esp")); &set_label("long_tail"); &movdqa (&QWP(16*0,"eax"),$T0); &movdqa (&QWP(16*1,"eax"),$T1); &movdqa (&QWP(16*2,"eax"),$D2); &movdqa (&QWP(16*3,"eax"),$D3); &movdqa (&QWP(16*4,"eax"),$D4); ################################################################ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 &pmuludq ($T0,$T2); # h0*r0 &pmuludq ($T1,$T2); # h1*r0 &pmuludq ($D2,$T2); # h2*r0 &movdqa ($D0,$T0); &pshufd ($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n &pmuludq ($D3,$T2); # h3*r0 &movdqa ($D1,$T1); &pmuludq ($D4,$T2); # h4*r0 &pmuladd (sub { my ($reg,$i)=@_; &pshufd ($reg,&QWP(16*($i-9),"edx"),0x10); },"eax"); &jz (&label("short_tail")); &load_input (-16*2,0); &pshufd ($T2,&QWP(16*0,"edx"),0x10); # r0^n &paddd ($T0,&QWP(16*5,"esp")); # add hash value &paddd ($T1,&QWP(16*6,"esp")); &paddd ($D2,&QWP(16*7,"esp")); &paddd ($D3,&QWP(16*8,"esp")); &paddd ($D4,&QWP(16*9,"esp")); ################################################################ # multiply inp[0:1] by r^4:r^3 and accumulate &movdqa (&QWP(16*0,"esp"),$T0); &pmuludq ($T0,$T2); # h0*r0 &movdqa (&QWP(16*1,"esp"),$T1); &pmuludq ($T1,$T2); # h1*r0 &paddq ($D0,$T0); &movdqa ($T0,$D2); &pmuludq ($D2,$T2); # h2*r0 &paddq ($D1,$T1); &movdqa ($T1,$D3); &pmuludq ($D3,$T2); # h3*r0 &paddq ($D2,&QWP(16*2,"esp")); &movdqa (&QWP(16*2,"esp"),$T0); &pshufd ($T0,&QWP(16*1,"edx"),0x10); # r1^n &paddq ($D3,&QWP(16*3,"esp")); &movdqa (&QWP(16*3,"esp"),$T1); &movdqa ($T1,$D4); &pmuludq ($D4,$T2); # h4*r0 &paddq ($D4,&QWP(16*4,"esp")); &movdqa (&QWP(16*4,"esp"),$T1); &pmuladd 
(sub {	my ($reg,$i)=@_;			# argument list of the call left open on the preceding line
				&pshufd ($reg,&QWP(16*$i,"edx"),0x10);
			     });

&set_label("short_tail");

	################################################################
	# horizontal addition

	# swap the two qword lanes and add, so each lane holds the total
	&pshufd		($T1,$D4,0b01001110);
	&pshufd		($T0,$D3,0b01001110);
	&paddq		($D4,$T1);
	&paddq		($D3,$T0);
	&pshufd		($T1,$D0,0b01001110);
	&pshufd		($T0,$D1,0b01001110);
	&paddq		($D0,$T1);
	&paddq		($D1,$T0);
	&pshufd		($T1,$D2,0b01001110);
	#&paddq		($D2,$T1);

	# the fold of $D2 is handed to lazy_reduction as its $extra hook
	&lazy_reduction	(sub { &paddq ($D2,$T1) });

&set_label("done");
	&movd		(&DWP(-16*3+4*0,"edi"),$D0);	# store hash value
	&movd		(&DWP(-16*3+4*1,"edi"),$D1);
	&movd		(&DWP(-16*3+4*2,"edi"),$D2);
	&movd		(&DWP(-16*3+4*3,"edi"),$D3);
	&movd		(&DWP(-16*3+4*4,"edi"),$D4);
	&mov	("esp","ebp");				# drop the scratch frame
&set_label("nodata");
&function_end("_poly1305_blocks_sse2");

########################################################################
# _poly1305_emit_sse2 takes the same three stack arguments as
# poly1305_emit (context, output, and four 32-bit words that are added
# at the end). When the is_base2_26 word at 4*5 in the context is zero
# it jumps to the "enter_emit" label. Otherwise it converts the five
# base 2^26 digits to base 2^32, folds the excess above bit 130 back in
# (*5), and then performs the branch-free "add 5 and select" step. The
# unmodified hash words are parked in $D0-$D3 while the sum is tested.
&align	(32);
&function_begin("_poly1305_emit_sse2");
	&mov	("ebp",&wparam(0));		# context

	&cmp	(&DWP(4*5,"ebp"),0);		# is_base2_26?
	&je	(&label("enter_emit"));

	&mov	("eax",&DWP(4*0,"ebp"));	# load hash value
	&mov	("edi",&DWP(4*1,"ebp"));
	&mov	("ecx",&DWP(4*2,"ebp"));
	&mov	("edx",&DWP(4*3,"ebp"));
	&mov	("esi",&DWP(4*4,"ebp"));

	&mov	("ebx","edi");			# base 2^26 -> base 2^32
	&shl	("edi",26);
	&shr	("ebx",6);
	&add	("eax","edi");
	&mov	("edi","ecx");
	&adc	("ebx",0);

	&shl	("edi",20);
	&shr	("ecx",12);
	&add	("ebx","edi");
	&mov	("edi","edx");
	&adc	("ecx",0);

	&shl	("edi",14);
	&shr	("edx",18);
	&add	("ecx","edi");
	&mov	("edi","esi");
	&adc	("edx",0);

	&shl	("edi",8);
	&shr	("esi",24);
	&add	("edx","edi");
	&adc	("esi",0);			# can be partially reduced

	&mov	("edi","esi");			# final reduction
	&and	("esi",3);
	&shr	("edi",2);
	&lea	("ebp",&DWP(0,"edi","edi",4));	# *5
	 &mov	("edi",&wparam(1));		# output
	&add	("eax","ebp");
	 &mov	("ebp",&wparam(2));		# key
	&adc	("ebx",0);
	&adc	("ecx",0);
	&adc	("edx",0);
	&adc	("esi",0);

	&movd	($D0,"eax");			# offload original hash value
	&add	("eax",5);			# compare to modulus
	&movd	($D1,"ebx");
	&adc	("ebx",0);
	&movd	($D2,"ecx");
	&adc	("ecx",0);
	&movd	($D3,"edx");
	&adc	("edx",0);
	&adc	("esi",0);
	&shr	("esi",2);			# did it carry/borrow?

	&neg	("esi");			# do we choose (hash-modulus) ...
	&and	("eax","esi");
	&and	("ebx","esi");
	&and	("ecx","esi");
	&and	("edx","esi");
	&mov	(&DWP(4*0,"edi"),"eax");	# park masked sum in output ...
	&movd	("eax",$D0);			# ... and fetch the original back
	&mov	(&DWP(4*1,"edi"),"ebx");
	&movd	("ebx",$D1);
	&mov	(&DWP(4*2,"edi"),"ecx");
	&movd	("ecx",$D2);
	&mov	(&DWP(4*3,"edi"),"edx");
	&movd	("edx",$D3);

	&not	("esi");			# ... or original hash value?
	&and	("eax","esi");
	&and	("ebx","esi");
	&or	("eax",&DWP(4*0,"edi"));
	&and	("ecx","esi");
	&or	("ebx",&DWP(4*1,"edi"));
	&and	("edx","esi");
	&or	("ecx",&DWP(4*2,"edi"));
	&or	("edx",&DWP(4*3,"edi"));

	&add	("eax",&DWP(4*0,"ebp"));	# accumulate key
	&adc	("ebx",&DWP(4*1,"ebp"));
	&mov	(&DWP(4*0,"edi"),"eax");
	&adc	("ecx",&DWP(4*2,"ebp"));
	&mov	(&DWP(4*1,"edi"),"ebx");
	&adc	("edx",&DWP(4*3,"ebp"));
	&mov	(&DWP(4*2,"edi"),"ecx");
	&mov	(&DWP(4*3,"edi"),"edx");
&function_end("_poly1305_emit_sse2");

if ($avx>1) {
########################################################################
# Note that poly1305_init_avx2 operates on %xmm, I could have used
# poly1305_init_sse2...
&align	(32);
########################################################################
# _poly1305_init_avx2 is emitted without the standard prologue
# (function_begin_B) and mirrors the SSE2 initialisation using
# three-operand VEX forms. It loads 16 bytes from 4*6(%edi) ("key base
# 2^32"), splits them into five base 2^26 digits in $D0-$D4 and enters
# the two-pass "square" loop counted in %ecx. %ebx must already address
# the constant table (mask at 64(%ebx)). %esp is saved in %ebp and a
# 16-byte aligned frame of 16*(9+5) bytes is carved out, with %edx
# addressing its upper five rows.
&function_begin_B("_poly1305_init_avx2");
	&vmovdqu	($D4,&QWP(4*6,"edi"));		# key base 2^32
	&lea		("edi",&DWP(16*3,"edi"));	# size optimization
	&mov		("ebp","esp");
	&sub		("esp",16*(9+5));
	&and		("esp",-16);

	#&vpand		($D4,$D4,&QWP(96,"ebx"));	# magic mask
	&vmovdqa	($MASK,&QWP(64,"ebx"));

	&vpand		($D0,$D4,$MASK);		# -> base 2^26
	&vpsrlq		($D1,$D4,26);
	&vpsrldq	($D3,$D4,6);
	&vpand		($D1,$D1,$MASK);
	&vpsrlq		($D2,$D3,4);
	&vpsrlq		($D3,$D3,30);
	&vpand		($D2,$D2,$MASK);
	&vpand		($D3,$D3,$MASK);
	&vpsrldq	($D4,$D4,13);

	&lea		("edx",&DWP(16*9,"esp"));	# size optimization
	&mov		("ecx",2);
&set_label("square");
	# rows 0-4 of the frame: current digits
	&vmovdqa	(&QWP(16*0,"esp"),$D0);
	&vmovdqa	(&QWP(16*1,"esp"),$D1);
	&vmovdqa	(&QWP(16*2,"esp"),$D2);
	&vmovdqa	(&QWP(16*3,"esp"),$D3);
	&vmovdqa	(&QWP(16*4,"esp"),$D4);

	# rows 5-8 of the frame: digits 1-4 multiplied by 5 (x*4 + x)
	&vpslld		($T1,$D1,2);
	&vpslld		($T0,$D2,2);
	&vpaddd		($T1,$T1,$D1);			# *5
	&vpaddd		($T0,$T0,$D2);			# *5
	&vmovdqa	(&QWP(16*5,"esp"),$T1);
	&vmovdqa	(&QWP(16*6,"esp"),$T0);
	&vpslld		($T1,$D3,2);
	&vpslld		($T0,$D4,2);
	&vpaddd		($T1,$T1,$D3);			# *5
	&vpaddd		($T0,$T0,$D4);			# *5
	&vmovdqa	(&QWP(16*7,"esp"),$T1);
	&vmovdqa	(&QWP(16*8,"esp"),$T0);

	# duplicate the low two dwords of every digit into both qword
	# lanes and keep the copies at 0..4(%edx) as multiplicands
	&vpshufd	($T0,$D0,0b01000100);
	&vmovdqa	($T1,$D1);
	&vpshufd	($D1,$D1,0b01000100);
	&vpshufd	($D2,$D2,0b01000100);
	&vpshufd	($D3,$D3,0b01000100);
	&vpshufd	($D4,$D4,0b01000100);
	&vmovdqa	(&QWP(16*0,"edx"),$T0);
	&vmovdqa	(&QWP(16*1,"edx"),$D1);
	&vmovdqa	(&QWP(16*2,"edx"),$D2);
	&vmovdqa	(&QWP(16*3,"edx"),$D3);
	&vmovdqa	(&QWP(16*4,"edx"),$D4);

	################################################################
	# d4 = h4*r0 + h3*r1   + h2*r2   + h1*r3   + h0*r4
	# d3 = h3*r0 + h2*r1   + h1*r2   + h0*r3   + h4*5*r4
	# d2 = h2*r0 + h1*r1   + h0*r2   + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1   + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4

	&vpmuludq	($D4,$D4,$D0);			# h4*r0
	&vpmuludq	($D3,$D3,$D0);			# h3*r0
	&vpmuludq	($D2,$D2,$D0);			# h2*r0
	&vpmuludq	($D1,$D1,$D0);			# h1*r0
	&vpmuludq	($D0,$T0,$D0);			# h0*r0

	&vpmuludq	($T0,$T1,&QWP(16*3,"edx"));	# r1*h3
	&vpaddq		($D4,$D4,$T0);
	&vpmuludq
($T2,$T1,&QWP(16*2,"edx")); # r1*h2 &vpaddq ($D3,$D3,$T2); &vpmuludq ($T0,$T1,&QWP(16*1,"edx")); # r1*h1 &vpaddq ($D2,$D2,$T0); &vmovdqa ($T2,&QWP(16*5,"esp")); # s1 &vpmuludq ($T1,$T1,&QWP(16*0,"edx")); # r1*h0 &vpaddq ($D1,$D1,$T1); &vmovdqa ($T0,&QWP(16*2,"esp")); # r2 &vpmuludq ($T2,$T2,&QWP(16*4,"edx")); # s1*h4 &vpaddq ($D0,$D0,$T2); &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # r2*h2 &vpaddq ($D4,$D4,$T1); &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # r2*h1 &vpaddq ($D3,$D3,$T2); &vmovdqa ($T1,&QWP(16*6,"esp")); # s2 &vpmuludq ($T0,$T0,&QWP(16*0,"edx")); # r2*h0 &vpaddq ($D2,$D2,$T0); &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s2*h4 &vpaddq ($D1,$D1,$T2); &vmovdqa ($T0,&QWP(16*3,"esp")); # r3 &vpmuludq ($T1,$T1,&QWP(16*3,"edx")); # s2*h3 &vpaddq ($D0,$D0,$T1); &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # r3*h1 &vpaddq ($D4,$D4,$T2); &vmovdqa ($T1,&QWP(16*7,"esp")); # s3 &vpmuludq ($T0,$T0,&QWP(16*0,"edx")); # r3*h0 &vpaddq ($D3,$D3,$T0); &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s3*h4 &vpaddq ($D2,$D2,$T2); &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # s3*h3 &vpaddq ($D1,$D1,$T0); &vmovdqa ($T2,&QWP(16*4,"esp")); # r4 &vpmuludq ($T1,$T1,&QWP(16*2,"edx")); # s3*h2 &vpaddq ($D0,$D0,$T1); &vmovdqa ($T0,&QWP(16*8,"esp")); # s4 &vpmuludq ($T2,$T2,&QWP(16*0,"edx")); # r4*h0 &vpaddq ($D4,$D4,$T2); &vpmuludq ($T1,$T0,&QWP(16*4,"edx")); # s4*h4 &vpaddq ($D3,$D3,$T1); &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # s4*h1 &vpaddq ($D0,$D0,$T2); &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # s4*h2 &vpaddq ($D1,$D1,$T1); &vmovdqa ($MASK,&QWP(64,"ebx")); &vpmuludq ($T0,$T0,&QWP(16*3,"edx")); # s4*h3 &vpaddq ($D2,$D2,$T0); ################################################################ # lazy reduction &vpsrlq ($T0,$D3,26); &vpand ($D3,$D3,$MASK); &vpsrlq ($T1,$D0,26); &vpand ($D0,$D0,$MASK); &vpaddq ($D4,$D4,$T0); # h3 -> h4 &vpaddq ($D1,$D1,$T1); # h0 -> h1 &vpsrlq ($T0,$D4,26); &vpand ($D4,$D4,$MASK); &vpsrlq ($T1,$D1,26); &vpand ($D1,$D1,$MASK); &vpaddq ($D2,$D2,$T1); # h1 -> h2 &vpaddd ($D0,$D0,$T0); &vpsllq 
($T0,$T0,2); &vpsrlq ($T1,$D2,26); &vpand ($D2,$D2,$MASK); &vpaddd ($D0,$D0,$T0); # h4 -> h0 &vpaddd ($D3,$D3,$T1); # h2 -> h3 &vpsrlq ($T1,$D3,26); &vpsrlq ($T0,$D0,26); &vpand ($D0,$D0,$MASK); &vpand ($D3,$D3,$MASK); &vpaddd ($D1,$D1,$T0); # h0 -> h1 &vpaddd ($D4,$D4,$T1); # h3 -> h4 &dec ("ecx"); &jz (&label("square_break")); &vpunpcklqdq ($D0,$D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2 &vpunpcklqdq ($D1,$D1,&QWP(16*1,"esp")); &vpunpcklqdq ($D2,$D2,&QWP(16*2,"esp")); &vpunpcklqdq ($D3,$D3,&QWP(16*3,"esp")); &vpunpcklqdq ($D4,$D4,&QWP(16*4,"esp")); &jmp (&label("square")); &set_label("square_break"); &vpsllq ($D0,$D0,32); # -> r^3:0:r^4:0 &vpsllq ($D1,$D1,32); &vpsllq ($D2,$D2,32); &vpsllq ($D3,$D3,32); &vpsllq ($D4,$D4,32); &vpor ($D0,$D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2 &vpor ($D1,$D1,&QWP(16*1,"esp")); &vpor ($D2,$D2,&QWP(16*2,"esp")); &vpor ($D3,$D3,&QWP(16*3,"esp")); &vpor ($D4,$D4,&QWP(16*4,"esp")); &vpshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4 &vpshufd ($D1,$D1,0b10001101); &vpshufd ($D2,$D2,0b10001101); &vpshufd ($D3,$D3,0b10001101); &vpshufd ($D4,$D4,0b10001101); &vmovdqu (&QWP(16*0,"edi"),$D0); # save the table &vmovdqu (&QWP(16*1,"edi"),$D1); &vmovdqu (&QWP(16*2,"edi"),$D2); &vmovdqu (&QWP(16*3,"edi"),$D3); &vmovdqu (&QWP(16*4,"edi"),$D4); &vpslld ($T1,$D1,2); &vpslld ($T0,$D2,2); &vpaddd ($T1,$T1,$D1); # *5 &vpaddd ($T0,$T0,$D2); # *5 &vmovdqu (&QWP(16*5,"edi"),$T1); &vmovdqu (&QWP(16*6,"edi"),$T0); &vpslld ($T1,$D3,2); &vpslld ($T0,$D4,2); &vpaddd ($T1,$T1,$D3); # *5 &vpaddd ($T0,$T0,$D4); # *5 &vmovdqu (&QWP(16*7,"edi"),$T1); &vmovdqu (&QWP(16*8,"edi"),$T0); &mov ("esp","ebp"); &lea ("edi",&DWP(-16*3,"edi")); # size de-optimization &ret (); &function_end_B("_poly1305_init_avx2"); ######################################################################## # now it's time to switch to %ymm my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("ymm$_",(0..7)); my $MASK=$T2; sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; } &align (32); 
&function_begin("_poly1305_blocks_avx2"); &mov ("edi",&wparam(0)); # ctx &mov ("esi",&wparam(1)); # inp &mov ("ecx",&wparam(2)); # len &mov ("eax",&DWP(4*5,"edi")); # is_base2_26 &and ("ecx",-16); &jz (&label("nodata")); &cmp ("ecx",64); &jae (&label("enter_avx2")); &test ("eax","eax"); # is_base2_26? &jz (&label("enter_blocks")); &set_label("enter_avx2"); &vzeroupper (); &call (&label("pic_point")); &set_label("pic_point"); &blindpop("ebx"); &lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx")); &test ("eax","eax"); # is_base2_26? &jnz (&label("base2_26")); &call ("_poly1305_init_avx2"); ################################################# base 2^32 -> base 2^26 &mov ("eax",&DWP(0,"edi")); &mov ("ecx",&DWP(3,"edi")); &mov ("edx",&DWP(6,"edi")); &mov ("esi",&DWP(9,"edi")); &mov ("ebp",&DWP(13,"edi")); &shr ("ecx",2); &and ("eax",0x3ffffff); &shr ("edx",4); &and ("ecx",0x3ffffff); &shr ("esi",6); &and ("edx",0x3ffffff); &mov (&DWP(4*0,"edi"),"eax"); &mov (&DWP(4*1,"edi"),"ecx"); &mov (&DWP(4*2,"edi"),"edx"); &mov (&DWP(4*3,"edi"),"esi"); &mov (&DWP(4*4,"edi"),"ebp"); &mov (&DWP(4*5,"edi"),1); # is_base2_26 &mov ("esi",&wparam(1)); # [reload] inp &mov ("ecx",&wparam(2)); # [reload] len &set_label("base2_26"); &mov ("eax",&wparam(3)); # padbit &mov ("ebp","esp"); &sub ("esp",32*(5+9)); &and ("esp",-512); # ensure that frame # doesn't cross page # boundary, which is # essential for # misaligned 32-byte # loads ################################################################ # expand and copy pre-calculated table to stack &vmovdqu (&X($D0),&QWP(16*(3+0),"edi")); &lea ("edx",&DWP(32*5+128,"esp")); # +128 size optimization &vmovdqu (&X($D1),&QWP(16*(3+1),"edi")); &vmovdqu (&X($D2),&QWP(16*(3+2),"edi")); &vmovdqu (&X($D3),&QWP(16*(3+3),"edi")); &vmovdqu (&X($D4),&QWP(16*(3+4),"edi")); &lea ("edi",&DWP(16*3,"edi")); # size optimization &vpermq ($D0,$D0,0b01000000); # 00001234 -> 12343434 &vpermq ($D1,$D1,0b01000000); &vpermq ($D2,$D2,0b01000000); &vpermq 
($D3,$D3,0b01000000); &vpermq ($D4,$D4,0b01000000); &vpshufd ($D0,$D0,0b11001000); # 12343434 -> 14243444 &vpshufd ($D1,$D1,0b11001000); &vpshufd ($D2,$D2,0b11001000); &vpshufd ($D3,$D3,0b11001000); &vpshufd ($D4,$D4,0b11001000); &vmovdqa (&QWP(32*0-128,"edx"),$D0); &vmovdqu (&X($D0),&QWP(16*5,"edi")); &vmovdqa (&QWP(32*1-128,"edx"),$D1); &vmovdqu (&X($D1),&QWP(16*6,"edi")); &vmovdqa (&QWP(32*2-128,"edx"),$D2); &vmovdqu (&X($D2),&QWP(16*7,"edi")); &vmovdqa (&QWP(32*3-128,"edx"),$D3); &vmovdqu (&X($D3),&QWP(16*8,"edi")); &vmovdqa (&QWP(32*4-128,"edx"),$D4); &vpermq ($D0,$D0,0b01000000); &vpermq ($D1,$D1,0b01000000); &vpermq ($D2,$D2,0b01000000); &vpermq ($D3,$D3,0b01000000); &vpshufd ($D0,$D0,0b11001000); &vpshufd ($D1,$D1,0b11001000); &vpshufd ($D2,$D2,0b11001000); &vpshufd ($D3,$D3,0b11001000); &vmovdqa (&QWP(32*5-128,"edx"),$D0); &vmovd (&X($D0),&DWP(-16*3+4*0,"edi"));# load hash value &vmovdqa (&QWP(32*6-128,"edx"),$D1); &vmovd (&X($D1),&DWP(-16*3+4*1,"edi")); &vmovdqa (&QWP(32*7-128,"edx"),$D2); &vmovd (&X($D2),&DWP(-16*3+4*2,"edi")); &vmovdqa (&QWP(32*8-128,"edx"),$D3); &vmovd (&X($D3),&DWP(-16*3+4*3,"edi")); &vmovd (&X($D4),&DWP(-16*3+4*4,"edi")); &vmovdqa ($MASK,&QWP(64,"ebx")); &neg ("eax"); # padbit &test ("ecx",63); &jz (&label("even")); &mov ("edx","ecx"); &and ("ecx",-64); &and ("edx",63); &vmovdqu (&X($T0),&QWP(16*0,"esi")); &cmp ("edx",32); &jb (&label("one")); &vmovdqu (&X($T1),&QWP(16*1,"esi")); &je (&label("two")); &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1); &lea ("esi",&DWP(16*3,"esi")); &lea ("ebx",&DWP(8,"ebx")); # three padbits &lea ("edx",&DWP(32*5+128+8,"esp")); # --:r^1:r^2:r^3 (*) &jmp (&label("tail")); &set_label("two"); &lea ("esi",&DWP(16*2,"esi")); &lea ("ebx",&DWP(16,"ebx")); # two padbits &lea ("edx",&DWP(32*5+128+16,"esp"));# --:--:r^1:r^2 (*) &jmp (&label("tail")); &set_label("one"); &lea ("esi",&DWP(16*1,"esi")); &vpxor ($T1,$T1,$T1); &lea ("ebx",&DWP(32,"ebx","eax",8)); # one or no padbits &lea ("edx",&DWP(32*5+128+24,"esp"));# 
--:--:--:r^1 (*) &jmp (&label("tail")); # (*) spots marked with '--' are data from next table entry, but they # are multiplied by 0 and therefore rendered insignificant &set_label("even",32); &vmovdqu (&X($T0),&QWP(16*0,"esi")); # load input &vmovdqu (&X($T1),&QWP(16*1,"esi")); &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1); &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1); &lea ("esi",&DWP(16*4,"esi")); &sub ("ecx",64); &jz (&label("tail")); &set_label("loop"); ################################################################ # ((inp[0]*r^4+r[4])*r^4+r[8])*r^4 # ((inp[1]*r^4+r[5])*r^4+r[9])*r^3 # ((inp[2]*r^4+r[6])*r^4+r[10])*r^2 # ((inp[3]*r^4+r[7])*r^4+r[11])*r^1 # \________/ \_______/ ################################################################ sub vsplat_input { &vmovdqa (&QWP(32*2,"esp"),$D2); &vpsrldq ($D2,$T0,6); # splat input &vmovdqa (&QWP(32*0,"esp"),$D0); &vpsrldq ($D0,$T1,6); &vmovdqa (&QWP(32*1,"esp"),$D1); &vpunpckhqdq ($D1,$T0,$T1); # 4 &vpunpcklqdq ($T0,$T0,$T1); # 0:1 &vpunpcklqdq ($D2,$D2,$D0); # 2:3 &vpsrlq ($D0,$D2,30); &vpsrlq ($D2,$D2,4); &vpsrlq ($T1,$T0,26); &vpsrlq ($D1,$D1,40); # 4 &vpand ($D2,$D2,$MASK); # 2 &vpand ($T0,$T0,$MASK); # 0 &vpand ($T1,$T1,$MASK); # 1 &vpand ($D0,$D0,$MASK); # 3 (*) &vpor ($D1,$D1,&QWP(0,"ebx")); # padbit, yes, always # (*) note that output is counterintuitive, inp[3:4] is # returned in $D1-2, while $D3-4 are preserved; } &vsplat_input (); sub vpmuladd { my $addr = shift; &vpaddq ($D2,$D2,&QWP(32*2,"esp")); # add hash value &vpaddq ($T0,$T0,&QWP(32*0,"esp")); &vpaddq ($T1,$T1,&QWP(32*1,"esp")); &vpaddq ($D0,$D0,$D3); &vpaddq ($D1,$D1,$D4); ################################################################ # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 &vpmuludq ($D3,$D2,&$addr(1)); # d3 = h2*r1 &vmovdqa (QWP(32*1,"esp"),$T1); 
&vpmuludq ($D4,$D2,&$addr(2)); # d4 = h2*r2 &vmovdqa (QWP(32*3,"esp"),$D0); &vpmuludq ($D0,$D2,&$addr(7)); # d0 = h2*s3 &vmovdqa (QWP(32*4,"esp"),$D1); &vpmuludq ($D1,$D2,&$addr(8)); # d1 = h2*s4 &vpmuludq ($D2,$D2,&$addr(0)); # d2 = h2*r0 &vpmuludq ($T2,$T0,&$addr(3)); # h0*r3 &vpaddq ($D3,$D3,$T2); # d3 += h0*r3 &vpmuludq ($T1,$T0,&$addr(4)); # h0*r4 &vpaddq ($D4,$D4,$T1); # d4 + h0*r4 &vpmuludq ($T2,$T0,&$addr(0)); # h0*r0 &vpaddq ($D0,$D0,$T2); # d0 + h0*r0 &vmovdqa ($T2,&QWP(32*1,"esp")); # h1 &vpmuludq ($T1,$T0,&$addr(1)); # h0*r1 &vpaddq ($D1,$D1,$T1); # d1 += h0*r1 &vpmuludq ($T0,$T0,&$addr(2)); # h0*r2 &vpaddq ($D2,$D2,$T0); # d2 += h0*r2 &vpmuludq ($T1,$T2,&$addr(2)); # h1*r2 &vpaddq ($D3,$D3,$T1); # d3 += h1*r2 &vpmuludq ($T0,$T2,&$addr(3)); # h1*r3 &vpaddq ($D4,$D4,$T0); # d4 += h1*r3 &vpmuludq ($T1,$T2,&$addr(8)); # h1*s4 &vpaddq ($D0,$D0,$T1); # d0 += h1*s4 &vmovdqa ($T1,&QWP(32*3,"esp")); # h3 &vpmuludq ($T0,$T2,&$addr(0)); # h1*r0 &vpaddq ($D1,$D1,$T0); # d1 += h1*r0 &vpmuludq ($T2,$T2,&$addr(1)); # h1*r1 &vpaddq ($D2,$D2,$T2); # d2 += h1*r1 &vpmuludq ($T0,$T1,&$addr(0)); # h3*r0 &vpaddq ($D3,$D3,$T0); # d3 += h3*r0 &vpmuludq ($T2,$T1,&$addr(1)); # h3*r1 &vpaddq ($D4,$D4,$T2); # d4 += h3*r1 &vpmuludq ($T0,$T1,&$addr(6)); # h3*s2 &vpaddq ($D0,$D0,$T0); # d0 += h3*s2 &vmovdqa ($T0,&QWP(32*4,"esp")); # h4 &vpmuludq ($T2,$T1,&$addr(7)); # h3*s3 &vpaddq ($D1,$D1,$T2); # d1+= h3*s3 &vpmuludq ($T1,$T1,&$addr(8)); # h3*s4 &vpaddq ($D2,$D2,$T1); # d2 += h3*s4 &vpmuludq ($T2,$T0,&$addr(8)); # h4*s4 &vpaddq ($D3,$D3,$T2); # d3 += h4*s4 &vpmuludq ($T1,$T0,&$addr(5)); # h4*s1 &vpaddq ($D0,$D0,$T1); # d0 += h4*s1 &vpmuludq ($T2,$T0,&$addr(0)); # h4*r0 &vpaddq ($D4,$D4,$T2); # d4 += h4*r0 &vmovdqa ($MASK,&QWP(64,"ebx")); &vpmuludq ($T1,$T0,&$addr(6)); # h4*s2 &vpaddq ($D1,$D1,$T1); # d1 += h4*s2 &vpmuludq ($T0,$T0,&$addr(7)); # h4*s3 &vpaddq ($D2,$D2,$T0); # d2 += h4*s3 } &vpmuladd (sub { my $i=shift; &QWP(32*$i-128,"edx"); }); sub vlazy_reduction { 
################################################################ # lazy reduction &vpsrlq ($T0,$D3,26); &vpand ($D3,$D3,$MASK); &vpsrlq ($T1,$D0,26); &vpand ($D0,$D0,$MASK); &vpaddq ($D4,$D4,$T0); # h3 -> h4 &vpaddq ($D1,$D1,$T1); # h0 -> h1 &vpsrlq ($T0,$D4,26); &vpand ($D4,$D4,$MASK); &vpsrlq ($T1,$D1,26); &vpand ($D1,$D1,$MASK); &vpaddq ($D2,$D2,$T1); # h1 -> h2 &vpaddq ($D0,$D0,$T0); &vpsllq ($T0,$T0,2); &vpsrlq ($T1,$D2,26); &vpand ($D2,$D2,$MASK); &vpaddq ($D0,$D0,$T0); # h4 -> h0 &vpaddq ($D3,$D3,$T1); # h2 -> h3 &vpsrlq ($T1,$D3,26); &vpsrlq ($T0,$D0,26); &vpand ($D0,$D0,$MASK); &vpand ($D3,$D3,$MASK); &vpaddq ($D1,$D1,$T0); # h0 -> h1 &vpaddq ($D4,$D4,$T1); # h3 -> h4 } &vlazy_reduction(); &vmovdqu (&X($T0),&QWP(16*0,"esi")); # load input &vmovdqu (&X($T1),&QWP(16*1,"esi")); &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1); &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1); &lea ("esi",&DWP(16*4,"esi")); &sub ("ecx",64); &jnz (&label("loop")); &set_label("tail"); &vsplat_input (); &and ("ebx",-64); # restore pointer &vpmuladd (sub { my $i=shift; &QWP(4+32*$i-128,"edx"); }); ################################################################ # horizontal addition &vpsrldq ($T0,$D4,8); &vpsrldq ($T1,$D3,8); &vpaddq ($D4,$D4,$T0); &vpsrldq ($T0,$D0,8); &vpaddq ($D3,$D3,$T1); &vpsrldq ($T1,$D1,8); &vpaddq ($D0,$D0,$T0); &vpsrldq ($T0,$D2,8); &vpaddq ($D1,$D1,$T1); &vpermq ($T1,$D4,2); # keep folding &vpaddq ($D2,$D2,$T0); &vpermq ($T0,$D3,2); &vpaddq ($D4,$D4,$T1); &vpermq ($T1,$D0,2); &vpaddq ($D3,$D3,$T0); &vpermq ($T0,$D1,2); &vpaddq ($D0,$D0,$T1); &vpermq ($T1,$D2,2); &vpaddq ($D1,$D1,$T0); &vpaddq ($D2,$D2,$T1); &vlazy_reduction(); &cmp ("ecx",0); &je (&label("done")); ################################################################ # clear all but single word &vpshufd (&X($D0),&X($D0),0b11111100); &lea ("edx",&DWP(32*5+128,"esp")); # restore pointer &vpshufd (&X($D1),&X($D1),0b11111100); &vpshufd (&X($D2),&X($D2),0b11111100); &vpshufd (&X($D3),&X($D3),0b11111100); 
&vpshufd (&X($D4),&X($D4),0b11111100); &jmp (&label("even")); &set_label("done",16); &vmovd (&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value &vmovd (&DWP(-16*3+4*1,"edi"),&X($D1)); &vmovd (&DWP(-16*3+4*2,"edi"),&X($D2)); &vmovd (&DWP(-16*3+4*3,"edi"),&X($D3)); &vmovd (&DWP(-16*3+4*4,"edi"),&X($D4)); &vzeroupper (); &mov ("esp","ebp"); &set_label("nodata"); &function_end("_poly1305_blocks_avx2"); } &set_label("const_sse2",64); &data_word(1<<24,0, 1<<24,0, 1<<24,0, 1<<24,0); &data_word(0,0, 0,0, 0,0, 0,0); &data_word(0x03ffffff,0,0x03ffffff,0, 0x03ffffff,0, 0x03ffffff,0); &data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc); } &asciz ("Poly1305 for x86, CRYPTOGAMS by "); &align (4); &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/poly1305/asm/poly1305-x86_64.pl =================================================================== --- head/crypto/openssl/crypto/poly1305/asm/poly1305-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/poly1305/asm/poly1305-x86_64.pl (revision 364822) @@ -1,4183 +1,4183 @@ #! /usr/bin/env perl # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # This module implements Poly1305 hash for x86_64. # # March 2015 # # Initial release. # # December 2016 # # Add AVX512F+VL+BW code path. 
# # November 2017 # # Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be # executed even on Knights Landing. Trigger for modification was # observation that AVX512 code paths can negatively affect overall # Skylake-X system performance. Since we are likely to suppress # AVX512F capability flag [at least on Skylake-X], conversion serves # as a kind of "investment protection". Note that the next *lake # processor, Cannonlake, has an AVX512IFMA code path to execute... # # Numbers are cycles per processed byte with poly1305_blocks alone, # measured with rdtsc at fixed clock frequency. # # IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 # P4 4.46/+120% - # Core 2 2.41/+90% - # Westmere 1.88/+120% - # Sandy Bridge 1.39/+140% 1.10 # Haswell 1.14/+175% 1.11 0.65 # Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] # Silvermont 2.83/+95% - # Knights L 3.60/? 1.65 1.10 0.41(***) # Goldmont 1.70/+180% - # VIA Nano 1.82/+150% - # Sledgehammer 1.38/+160% - # Bulldozer 2.30/+130% 0.97 # Ryzen 1.15/+200% 1.08 1.18 # # (*) improvement coefficients relative to clang are more modest and # are ~50% on most processors; in both cases we are comparing to # __int128 code; # (**) SSE2 implementation was attempted, but among non-AVX processors # it was faster than integer-only code only on older Intel P4 and # Core processors, by 50-30% (the newer the processor, the smaller # the gain), but slower on contemporary ones, for example almost 2x # slower on Atom, and as the former are naturally disappearing, # SSE2 is deemed unnecessary; # (***) strangely enough performance seems to vary from core to core, # the listed result is the best case; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x
assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); $avx += 2 if ($1==2.11 && $2>=8); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=12); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; *STDOUT=*OUT; my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); my ($mac,$nonce)=($inp,$len); # *_emit arguments my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); sub poly1305_iteration { # Appends to $code the scalar instruction sequence for one # multiply-and-reduce step; takes no Perl arguments and returns # nothing useful, its only effect is the text added to $code. # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 (the emitted code # also reads $s1) # output: $h0-$h2 *= $r0-$r1, with the top word folded back into # $h0 by the "last reduction step" so that $h2 keeps only its low # two bits plus carry; %rax, %rdx and $d1-$d3 are used as scratch $code.=<<___; mulq $h0 # h0*r1 mov %rax,$d2 mov $r0,%rax mov %rdx,$d3 mulq $h0 # h0*r0 mov %rax,$h0 # future $h0 mov $r0,%rax mov %rdx,$d1 mulq $h1 # h1*r0 add %rax,$d2 mov $s1,%rax adc %rdx,$d3 mulq $h1 # h1*s1 mov $h2,$h1 # borrow $h1 add %rax,$h0 adc %rdx,$d1 imulq $s1,$h1 # h2*s1 add $h1,$d2 mov $d1,$h1 adc \$0,$d3 imulq $r0,$h2 # h2*r0 add $d2,$h1 mov \$-4,%rax # mask value adc $h2,$d3 and $d3,%rax # last reduction step mov $d3,$h2 shr \$2,$d3 and \$3,$h2 add $d3,%rax add %rax,$h0 adc \$0,$h1 adc \$0,$h2 ___ } ######################################################################## # Layout of opaque area is following.
# # unsigned __int64 h[3]; # current hash value base 2^64 # unsigned __int64 r[2]; # key value base 2^64 # # The code generated below keeps h at byte offsets 0, 8 and 16 of the # context and r at offsets 24 and 32; poly1305_init masks the key with # 0x0ffffffc0fffffff/0x0ffffffc0ffffffc before storing it and writes # the selected blocks/emit entry points through the pointer in %rdx. $code.=<<___; .text .extern OPENSSL_ia32cap_P .globl poly1305_init .hidden poly1305_init .globl poly1305_blocks .hidden poly1305_blocks .globl poly1305_emit .hidden poly1305_emit .type poly1305_init,\@function,3 .align 32 poly1305_init: .cfi_startproc xor %rax,%rax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) mov %rax,16($ctx) cmp \$0,$inp je .Lno_key lea poly1305_blocks(%rip),%r10 lea poly1305_emit(%rip),%r11 ___ $code.=<<___ if ($avx); mov OPENSSL_ia32cap_P+4(%rip),%r9 lea poly1305_blocks_avx(%rip),%rax lea poly1305_emit_avx(%rip),%rcx bt \$`60-32`,%r9 # AVX? cmovc %rax,%r10 cmovc %rcx,%r11 ___ $code.=<<___ if ($avx>1); lea poly1305_blocks_avx2(%rip),%rax bt \$`5+32`,%r9 # AVX2? cmovc %rax,%r10 ___ $code.=<<___ if ($avx>3); mov \$`(1<<31|1<<21|1<<16)`,%rax shr \$32,%r9 and %rax,%r9 cmp %rax,%r9 je .Linit_base2_44 ___ $code.=<<___; mov \$0x0ffffffc0fffffff,%rax mov \$0x0ffffffc0ffffffc,%rcx and 0($inp),%rax and 8($inp),%rcx mov %rax,24($ctx) mov %rcx,32($ctx) ___ $code.=<<___ if ($flavour !~ /elf32/); mov %r10,0(%rdx) mov %r11,8(%rdx) ___ $code.=<<___ if ($flavour =~ /elf32/); mov %r10d,0(%rdx) mov %r11d,4(%rdx) ___ $code.=<<___; mov \$1,%eax .Lno_key: ret .cfi_endproc .size poly1305_init,.-poly1305_init .type poly1305_blocks,\@function,4 .align 32 poly1305_blocks: .cfi_startproc .Lblocks: shr \$4,$len jz .Lno_data # too short push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lblocks_body: mov $len,%r15 # reassign $len mov 24($ctx),$r0 # load r mov 32($ctx),$s1 mov 0($ctx),$h0 # load hash value mov 8($ctx),$h1 mov 16($ctx),$h2 mov $s1,$r1 shr \$2,$s1 mov $r1,%rax add $r1,$s1 # s1 = r1 + (r1 >> 2) jmp .Loop .align 32 .Loop: add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp adc $padbit,$h2 ___ &poly1305_iteration(); $code.=<<___;
mov $r1,%rax dec %r15 # len-=16 jnz .Loop mov $h0,0($ctx) # store hash value mov $h1,8($ctx) mov $h2,16($ctx) mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbp .cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lno_data: .Lblocks_epilogue: ret .cfi_endproc .size poly1305_blocks,.-poly1305_blocks .type poly1305_emit,\@function,3 .align 32 poly1305_emit: .cfi_startproc .Lemit: mov 0($ctx),%r8 # load hash value mov 8($ctx),%r9 mov 16($ctx),%r10 mov %r8,%rax add \$5,%r8 # compare to modulus mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 shr \$2,%r10 # did 130-bit value overflow? cmovnz %r8,%rax cmovnz %r9,%rcx add 0($nonce),%rax # accumulate nonce adc 8($nonce),%rcx mov %rax,0($mac) # write result mov %rcx,8($mac) ret .cfi_endproc .size poly1305_emit,.-poly1305_emit ___ if ($avx) { ######################################################################## # Layout of opaque area is following. # # unsigned __int32 h[5]; # current hash value base 2^26 # unsigned __int32 is_base2_26; # unsigned __int64 r[2]; # key value base 2^64 # unsigned __int64 pad; # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; # # where r^n are base 2^26 digits of degrees of multiplier key. There are # 5 digits, but last four are interleaved with multiples of 5, totalling # in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. 
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = map("%xmm$_",(0..15)); $code.=<<___; .type __poly1305_block,\@abi-omnipotent .align 32 __poly1305_block: .cfi_startproc ___ &poly1305_iteration(); $code.=<<___; ret .cfi_endproc .size __poly1305_block,.-__poly1305_block .type __poly1305_init_avx,\@abi-omnipotent .align 32 __poly1305_init_avx: .cfi_startproc mov $r0,$h0 mov $r1,$h1 xor $h2,$h2 lea 48+64($ctx),$ctx # size optimization mov $r1,%rax call __poly1305_block # r^2 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 mov \$0x3ffffff,%edx mov $h0,$d1 and $h0#d,%eax mov $r0,$d2 and $r0#d,%edx mov %eax,`16*0+0-64`($ctx) shr \$26,$d1 mov %edx,`16*0+4-64`($ctx) shr \$26,$d2 mov \$0x3ffffff,%eax mov \$0x3ffffff,%edx and $d1#d,%eax and $d2#d,%edx mov %eax,`16*1+0-64`($ctx) lea (%rax,%rax,4),%eax # *5 mov %edx,`16*1+4-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 mov %eax,`16*2+0-64`($ctx) shr \$26,$d1 mov %edx,`16*2+4-64`($ctx) shr \$26,$d2 mov $h1,%rax mov $r1,%rdx shl \$12,%rax shl \$12,%rdx or $d1,%rax or $d2,%rdx and \$0x3ffffff,%eax and \$0x3ffffff,%edx mov %eax,`16*3+0-64`($ctx) lea (%rax,%rax,4),%eax # *5 mov %edx,`16*3+4-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 mov %eax,`16*4+0-64`($ctx) mov $h1,$d1 mov %edx,`16*4+4-64`($ctx) mov $r1,$d2 mov \$0x3ffffff,%eax mov \$0x3ffffff,%edx shr \$14,$d1 shr \$14,$d2 and $d1#d,%eax and $d2#d,%edx mov %eax,`16*5+0-64`($ctx) lea (%rax,%rax,4),%eax # *5 mov %edx,`16*5+4-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 mov %eax,`16*6+0-64`($ctx) shr \$26,$d1 mov %edx,`16*6+4-64`($ctx) shr \$26,$d2 mov $h2,%rax shl \$24,%rax or %rax,$d1 mov $d1#d,`16*7+0-64`($ctx) lea ($d1,$d1,4),$d1 # *5 mov $d2#d,`16*7+4-64`($ctx) lea ($d2,$d2,4),$d2 # *5 mov $d1#d,`16*8+0-64`($ctx) mov $d2#d,`16*8+4-64`($ctx) mov $r1,%rax call __poly1305_block # r^3 mov \$0x3ffffff,%eax # save r^3 base 2^26 mov $h0,$d1 and $h0#d,%eax shr \$26,$d1 mov %eax,`16*0+12-64`($ctx) mov \$0x3ffffff,%edx and $d1#d,%edx mov %edx,`16*1+12-64`($ctx) lea 
(%rdx,%rdx,4),%edx # *5 shr \$26,$d1 mov %edx,`16*2+12-64`($ctx) mov $h1,%rax shl \$12,%rax or $d1,%rax and \$0x3ffffff,%eax mov %eax,`16*3+12-64`($ctx) lea (%rax,%rax,4),%eax # *5 mov $h1,$d1 mov %eax,`16*4+12-64`($ctx) mov \$0x3ffffff,%edx shr \$14,$d1 and $d1#d,%edx mov %edx,`16*5+12-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 shr \$26,$d1 mov %edx,`16*6+12-64`($ctx) mov $h2,%rax shl \$24,%rax or %rax,$d1 mov $d1#d,`16*7+12-64`($ctx) lea ($d1,$d1,4),$d1 # *5 mov $d1#d,`16*8+12-64`($ctx) mov $r1,%rax call __poly1305_block # r^4 mov \$0x3ffffff,%eax # save r^4 base 2^26 mov $h0,$d1 and $h0#d,%eax shr \$26,$d1 mov %eax,`16*0+8-64`($ctx) mov \$0x3ffffff,%edx and $d1#d,%edx mov %edx,`16*1+8-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 shr \$26,$d1 mov %edx,`16*2+8-64`($ctx) mov $h1,%rax shl \$12,%rax or $d1,%rax and \$0x3ffffff,%eax mov %eax,`16*3+8-64`($ctx) lea (%rax,%rax,4),%eax # *5 mov $h1,$d1 mov %eax,`16*4+8-64`($ctx) mov \$0x3ffffff,%edx shr \$14,$d1 and $d1#d,%edx mov %edx,`16*5+8-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 shr \$26,$d1 mov %edx,`16*6+8-64`($ctx) mov $h2,%rax shl \$24,%rax or %rax,$d1 mov $d1#d,`16*7+8-64`($ctx) lea ($d1,$d1,4),$d1 # *5 mov $d1#d,`16*8+8-64`($ctx) lea -48-64($ctx),$ctx # size [de-]optimization ret .cfi_endproc .size __poly1305_init_avx,.-__poly1305_init_avx .type poly1305_blocks_avx,\@function,4 .align 32 poly1305_blocks_avx: .cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len jae .Lblocks_avx test %r8d,%r8d jz .Lblocks .Lblocks_avx: and \$-16,$len jz .Lno_data_avx vzeroupper test %r8d,%r8d jz .Lbase2_64_avx test \$31,$len jz .Leven_avx push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lblocks_avx_body: mov $len,%r15 # reassign $len mov 0($ctx),$d1 # load hash value mov 8($ctx),$d2 mov 16($ctx),$h2#d mov 24($ctx),$r0 # load r mov 32($ctx),$s1 ################################# base 2^26 -> base 2^64 mov $d1#d,$h0#d and 
\$`-1*(1<<31)`,$d1 mov $d2,$r1 # borrow $r1 mov $d2#d,$h1#d and \$`-1*(1<<31)`,$d2 shr \$6,$d1 shl \$52,$r1 add $d1,$h0 shr \$12,$h1 shr \$18,$d2 add $r1,$h0 adc $d2,$h1 mov $h2,$d1 shl \$40,$d1 shr \$24,$h2 add $d1,$h1 adc \$0,$h2 # can be partially reduced... mov \$-4,$d2 # ... so reduce mov $h2,$d1 and $h2,$d2 shr \$2,$d1 and \$3,$h2 add $d2,$d1 # =*5 add $d1,$h0 adc \$0,$h1 adc \$0,$h2 mov $s1,$r1 mov $s1,%rax shr \$2,$s1 add $r1,$s1 # s1 = r1 + (r1 >> 2) add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp adc $padbit,$h2 call __poly1305_block test $padbit,$padbit # if $padbit is zero, jz .Lstore_base2_64_avx # store hash in base 2^64 format ################################# base 2^64 -> base 2^26 mov $h0,%rax mov $h0,%rdx shr \$52,$h0 mov $h1,$r0 mov $h1,$r1 shr \$26,%rdx and \$0x3ffffff,%rax # h[0] shl \$12,$r0 and \$0x3ffffff,%rdx # h[1] shr \$14,$h1 or $r0,$h0 shl \$24,$h2 and \$0x3ffffff,$h0 # h[2] shr \$40,$r1 and \$0x3ffffff,$h1 # h[3] or $r1,$h2 # h[4] sub \$16,%r15 jz .Lstore_base2_26_avx vmovd %rax#d,$H0 vmovd %rdx#d,$H1 vmovd $h0#d,$H2 vmovd $h1#d,$H3 vmovd $h2#d,$H4 jmp .Lproceed_avx .align 32 .Lstore_base2_64_avx: mov $h0,0($ctx) mov $h1,8($ctx) mov $h2,16($ctx) # note that is_base2_26 is zeroed jmp .Ldone_avx .align 16 .Lstore_base2_26_avx: mov %rax#d,0($ctx) # store hash value base 2^26 mov %rdx#d,4($ctx) mov $h0#d,8($ctx) mov $h1#d,12($ctx) mov $h2#d,16($ctx) .align 16 .Ldone_avx: mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbp .cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lno_data_avx: .Lblocks_avx_epilogue: ret .cfi_endproc .align 32 .Lbase2_64_avx: .cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lbase2_64_avx_body: mov $len,%r15 # reassign $len 
mov 24($ctx),$r0 # load r mov 32($ctx),$s1 mov 0($ctx),$h0 # load hash value mov 8($ctx),$h1 mov 16($ctx),$h2#d mov $s1,$r1 mov $s1,%rax shr \$2,$s1 add $r1,$s1 # s1 = r1 + (r1 >> 2) test \$31,$len jz .Linit_avx add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp adc $padbit,$h2 sub \$16,%r15 call __poly1305_block .Linit_avx: ################################# base 2^64 -> base 2^26 mov $h0,%rax mov $h0,%rdx shr \$52,$h0 mov $h1,$d1 mov $h1,$d2 shr \$26,%rdx and \$0x3ffffff,%rax # h[0] shl \$12,$d1 and \$0x3ffffff,%rdx # h[1] shr \$14,$h1 or $d1,$h0 shl \$24,$h2 and \$0x3ffffff,$h0 # h[2] shr \$40,$d2 and \$0x3ffffff,$h1 # h[3] or $d2,$h2 # h[4] vmovd %rax#d,$H0 vmovd %rdx#d,$H1 vmovd $h0#d,$H2 vmovd $h1#d,$H3 vmovd $h2#d,$H4 movl \$1,20($ctx) # set is_base2_26 call __poly1305_init_avx .Lproceed_avx: mov %r15,$len mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbp .cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rax lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lbase2_64_avx_epilogue: jmp .Ldo_avx .cfi_endproc .align 32 .Leven_avx: .cfi_startproc vmovd 4*0($ctx),$H0 # load hash value vmovd 4*1($ctx),$H1 vmovd 4*2($ctx),$H2 vmovd 4*3($ctx),$H3 vmovd 4*4($ctx),$H4 .Ldo_avx: ___ $code.=<<___ if (!$win64); lea -0x58(%rsp),%r11 .cfi_def_cfa %r11,0x60 sub \$0x178,%rsp ___ $code.=<<___ if ($win64); lea -0xf8(%rsp),%r11 sub \$0x218,%rsp vmovdqa %xmm6,0x50(%r11) vmovdqa %xmm7,0x60(%r11) vmovdqa %xmm8,0x70(%r11) vmovdqa %xmm9,0x80(%r11) vmovdqa %xmm10,0x90(%r11) vmovdqa %xmm11,0xa0(%r11) vmovdqa %xmm12,0xb0(%r11) vmovdqa %xmm13,0xc0(%r11) vmovdqa %xmm14,0xd0(%r11) vmovdqa %xmm15,0xe0(%r11) .Ldo_avx_body: ___ $code.=<<___; sub \$64,$len lea -32($inp),%rax cmovc %rax,$inp vmovdqu `16*3`($ctx),$D4 # preload r0^2 lea `16*3+64`($ctx),$ctx # size optimization lea .Lconst(%rip),%rcx 
################################################################ # load input vmovdqu 16*2($inp),$T0 vmovdqu 16*3($inp),$T1 vmovdqa 64(%rcx),$MASK # .Lmask26 vpsrldq \$6,$T0,$T2 # splat input vpsrldq \$6,$T1,$T3 vpunpckhqdq $T1,$T0,$T4 # 4 vpunpcklqdq $T1,$T0,$T0 # 0:1 vpunpcklqdq $T3,$T2,$T3 # 2:3 vpsrlq \$40,$T4,$T4 # 4 vpsrlq \$26,$T0,$T1 vpand $MASK,$T0,$T0 # 0 vpsrlq \$4,$T3,$T2 vpand $MASK,$T1,$T1 # 1 vpsrlq \$30,$T3,$T3 vpand $MASK,$T2,$T2 # 2 vpand $MASK,$T3,$T3 # 3 vpor 32(%rcx),$T4,$T4 # padbit, yes, always jbe .Lskip_loop_avx # expand and copy pre-calculated table to stack vmovdqu `16*1-64`($ctx),$D1 vmovdqu `16*2-64`($ctx),$D2 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 vmovdqa $D3,-0x90(%r11) vmovdqa $D0,0x00(%rsp) vpshufd \$0xEE,$D1,$D4 vmovdqu `16*3-64`($ctx),$D0 vpshufd \$0x44,$D1,$D1 vmovdqa $D4,-0x80(%r11) vmovdqa $D1,0x10(%rsp) vpshufd \$0xEE,$D2,$D3 vmovdqu `16*4-64`($ctx),$D1 vpshufd \$0x44,$D2,$D2 vmovdqa $D3,-0x70(%r11) vmovdqa $D2,0x20(%rsp) vpshufd \$0xEE,$D0,$D4 vmovdqu `16*5-64`($ctx),$D2 vpshufd \$0x44,$D0,$D0 vmovdqa $D4,-0x60(%r11) vmovdqa $D0,0x30(%rsp) vpshufd \$0xEE,$D1,$D3 vmovdqu `16*6-64`($ctx),$D0 vpshufd \$0x44,$D1,$D1 vmovdqa $D3,-0x50(%r11) vmovdqa $D1,0x40(%rsp) vpshufd \$0xEE,$D2,$D4 vmovdqu `16*7-64`($ctx),$D1 vpshufd \$0x44,$D2,$D2 vmovdqa $D4,-0x40(%r11) vmovdqa $D2,0x50(%rsp) vpshufd \$0xEE,$D0,$D3 vmovdqu `16*8-64`($ctx),$D2 vpshufd \$0x44,$D0,$D0 vmovdqa $D3,-0x30(%r11) vmovdqa $D0,0x60(%rsp) vpshufd \$0xEE,$D1,$D4 vpshufd \$0x44,$D1,$D1 vmovdqa $D4,-0x20(%r11) vmovdqa $D1,0x70(%rsp) vpshufd \$0xEE,$D2,$D3 vmovdqa 0x00(%rsp),$D4 # preload r0^2 vpshufd \$0x44,$D2,$D2 vmovdqa $D3,-0x10(%r11) vmovdqa $D2,0x80(%rsp) jmp .Loop_avx .align 32 .Loop_avx: ################################################################ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r # \___________________/ # 
((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r # \___________________/ \____________________/ # # Note that we start with inp[2:3]*r^2. This is because it # doesn't depend on reduction in previous iteration. ################################################################ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 # # though note that $Tx and $Hx are "reversed" in this section, # and $D4 is preloaded with r0^2... vpmuludq $T0,$D4,$D0 # d0 = h0*r0 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 vmovdqa $H2,0x20(%r11) # offload hash vpmuludq $T2,$D4,$D2 # d3 = h2*r0 vmovdqa 0x10(%rsp),$H2 # r1^2 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 vmovdqa $H0,0x00(%r11) # vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 vmovdqa $H1,0x10(%r11) # vpmuludq $T3,$H2,$H1 # h3*r1 vpaddq $H0,$D0,$D0 # d0 += h4*s1 vpaddq $H1,$D4,$D4 # d4 += h3*r1 vmovdqa $H3,0x30(%r11) # vpmuludq $T2,$H2,$H0 # h2*r1 vpmuludq $T1,$H2,$H1 # h1*r1 vpaddq $H0,$D3,$D3 # d3 += h2*r1 vmovdqa 0x30(%rsp),$H3 # r2^2 vpaddq $H1,$D2,$D2 # d2 += h1*r1 vmovdqa $H4,0x40(%r11) # vpmuludq $T0,$H2,$H2 # h0*r1 vpmuludq $T2,$H3,$H0 # h2*r2 vpaddq $H2,$D1,$D1 # d1 += h0*r1 vmovdqa 0x40(%rsp),$H4 # s2^2 vpaddq $H0,$D4,$D4 # d4 += h2*r2 vpmuludq $T1,$H3,$H1 # h1*r2 vpmuludq $T0,$H3,$H3 # h0*r2 vpaddq $H1,$D3,$D3 # d3 += h1*r2 vmovdqa 0x50(%rsp),$H2 # r3^2 vpaddq $H3,$D2,$D2 # d2 += h0*r2 vpmuludq $T4,$H4,$H0 # h4*s2 vpmuludq $T3,$H4,$H4 # h3*s2 vpaddq $H0,$D1,$D1 # d1 += h4*s2 vmovdqa 0x60(%rsp),$H3 # s3^2 vpaddq $H4,$D0,$D0 # d0 += h3*s2 vmovdqa 0x80(%rsp),$H4 # s4^2 vpmuludq $T1,$H2,$H1 # h1*r3 vpmuludq $T0,$H2,$H2 # h0*r3 vpaddq $H1,$D4,$D4 # d4 += h1*r3 vpaddq $H2,$D3,$D3 # d3 += h0*r3 vpmuludq $T4,$H3,$H0 # h4*s3 vpmuludq $T3,$H3,$H1 # h3*s3 vpaddq 
$H0,$D2,$D2 # d2 += h4*s3 vmovdqu 16*0($inp),$H0 # load input vpaddq $H1,$D1,$D1 # d1 += h3*s3 vpmuludq $T2,$H3,$H3 # h2*s3 vpmuludq $T2,$H4,$T2 # h2*s4 vpaddq $H3,$D0,$D0 # d0 += h2*s3 vmovdqu 16*1($inp),$H1 # vpaddq $T2,$D1,$D1 # d1 += h2*s4 vpmuludq $T3,$H4,$T3 # h3*s4 vpmuludq $T4,$H4,$T4 # h4*s4 vpsrldq \$6,$H0,$H2 # splat input vpaddq $T3,$D2,$D2 # d2 += h3*s4 vpaddq $T4,$D3,$D3 # d3 += h4*s4 vpsrldq \$6,$H1,$H3 # vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 vpmuludq $T1,$H4,$T0 # h1*s4 vpunpckhqdq $H1,$H0,$H4 # 4 vpaddq $T4,$D4,$D4 # d4 += h0*r4 vmovdqa -0x90(%r11),$T4 # r0^4 vpaddq $T0,$D0,$D0 # d0 += h1*s4 vpunpcklqdq $H1,$H0,$H0 # 0:1 vpunpcklqdq $H3,$H2,$H3 # 2:3 #vpsrlq \$40,$H4,$H4 # 4 vpsrldq \$`40/8`,$H4,$H4 # 4 vpsrlq \$26,$H0,$H1 vpand $MASK,$H0,$H0 # 0 vpsrlq \$4,$H3,$H2 vpand $MASK,$H1,$H1 # 1 vpand 0(%rcx),$H4,$H4 # .Lmask24 vpsrlq \$30,$H3,$H3 vpand $MASK,$H2,$H2 # 2 vpand $MASK,$H3,$H3 # 3 vpor 32(%rcx),$H4,$H4 # padbit, yes, always vpaddq 0x00(%r11),$H0,$H0 # add hash value vpaddq 0x10(%r11),$H1,$H1 vpaddq 0x20(%r11),$H2,$H2 vpaddq 0x30(%r11),$H3,$H3 vpaddq 0x40(%r11),$H4,$H4 lea 16*2($inp),%rax lea 16*4($inp),$inp sub \$64,$len cmovc %rax,$inp ################################################################ # Now we accumulate (inp[0:1]+hash)*r^4 ################################################################ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 vpmuludq $H0,$T4,$T0 # h0*r0 vpmuludq $H1,$T4,$T1 # h1*r0 vpaddq $T0,$D0,$D0 vpaddq $T1,$D1,$D1 vmovdqa -0x80(%r11),$T2 # r1^4 vpmuludq $H2,$T4,$T0 # h2*r0 vpmuludq $H3,$T4,$T1 # h3*r0 vpaddq $T0,$D2,$D2 vpaddq $T1,$D3,$D3 vpmuludq $H4,$T4,$T4 # h4*r0 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 vpaddq $T4,$D4,$D4 vpaddq $T0,$D0,$D0 # d0 += h4*s1 vpmuludq $H2,$T2,$T1 # h2*r1 vpmuludq $H3,$T2,$T0 # h3*r1 vpaddq 
$T1,$D3,$D3 # d3 += h2*r1 vmovdqa -0x60(%r11),$T3 # r2^4 vpaddq $T0,$D4,$D4 # d4 += h3*r1 vpmuludq $H1,$T2,$T1 # h1*r1 vpmuludq $H0,$T2,$T2 # h0*r1 vpaddq $T1,$D2,$D2 # d2 += h1*r1 vpaddq $T2,$D1,$D1 # d1 += h0*r1 vmovdqa -0x50(%r11),$T4 # s2^4 vpmuludq $H2,$T3,$T0 # h2*r2 vpmuludq $H1,$T3,$T1 # h1*r2 vpaddq $T0,$D4,$D4 # d4 += h2*r2 vpaddq $T1,$D3,$D3 # d3 += h1*r2 vmovdqa -0x40(%r11),$T2 # r3^4 vpmuludq $H0,$T3,$T3 # h0*r2 vpmuludq $H4,$T4,$T0 # h4*s2 vpaddq $T3,$D2,$D2 # d2 += h0*r2 vpaddq $T0,$D1,$D1 # d1 += h4*s2 vmovdqa -0x30(%r11),$T3 # s3^4 vpmuludq $H3,$T4,$T4 # h3*s2 vpmuludq $H1,$T2,$T1 # h1*r3 vpaddq $T4,$D0,$D0 # d0 += h3*s2 vmovdqa -0x10(%r11),$T4 # s4^4 vpaddq $T1,$D4,$D4 # d4 += h1*r3 vpmuludq $H0,$T2,$T2 # h0*r3 vpmuludq $H4,$T3,$T0 # h4*s3 vpaddq $T2,$D3,$D3 # d3 += h0*r3 vpaddq $T0,$D2,$D2 # d2 += h4*s3 vmovdqu 16*2($inp),$T0 # load input vpmuludq $H3,$T3,$T2 # h3*s3 vpmuludq $H2,$T3,$T3 # h2*s3 vpaddq $T2,$D1,$D1 # d1 += h3*s3 vmovdqu 16*3($inp),$T1 # vpaddq $T3,$D0,$D0 # d0 += h2*s3 vpmuludq $H2,$T4,$H2 # h2*s4 vpmuludq $H3,$T4,$H3 # h3*s4 vpsrldq \$6,$T0,$T2 # splat input vpaddq $H2,$D1,$D1 # d1 += h2*s4 vpmuludq $H4,$T4,$H4 # h4*s4 vpsrldq \$6,$T1,$T3 # vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 vpmuludq $H1,$T4,$H0 vpunpckhqdq $T1,$T0,$T4 # 4 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 vpunpcklqdq $T1,$T0,$T0 # 0:1 vpunpcklqdq $T3,$T2,$T3 # 2:3 #vpsrlq \$40,$T4,$T4 # 4 vpsrldq \$`40/8`,$T4,$T4 # 4 vpsrlq \$26,$T0,$T1 vmovdqa 0x00(%rsp),$D4 # preload r0^2 vpand $MASK,$T0,$T0 # 0 vpsrlq \$4,$T3,$T2 vpand $MASK,$T1,$T1 # 1 vpand 0(%rcx),$T4,$T4 # .Lmask24 vpsrlq \$30,$T3,$T3 vpand $MASK,$T2,$T2 # 2 vpand $MASK,$T3,$T3 # 3 vpor 32(%rcx),$T4,$T4 # padbit, yes, always ################################################################ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein # and P. 
Schwabe vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $D0,$D1,$H1 # h0 -> h1 vpsrlq \$26,$H4,$D0 vpand $MASK,$H4,$H4 vpsrlq \$26,$H1,$D1 vpand $MASK,$H1,$H1 vpaddq $D1,$H2,$H2 # h1 -> h2 vpaddq $D0,$H0,$H0 vpsllq \$2,$D0,$D0 vpaddq $D0,$H0,$H0 # h4 -> h0 vpsrlq \$26,$H2,$D2 vpand $MASK,$H2,$H2 vpaddq $D2,$H3,$H3 # h2 -> h3 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 ja .Loop_avx .Lskip_loop_avx: ################################################################ # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 add \$32,$len jnz .Long_tail_avx vpaddq $H2,$T2,$T2 vpaddq $H0,$T0,$T0 vpaddq $H1,$T1,$T1 vpaddq $H3,$T3,$T3 vpaddq $H4,$T4,$T4 .Long_tail_avx: vmovdqa $H2,0x20(%r11) vmovdqa $H0,0x00(%r11) vmovdqa $H1,0x10(%r11) vmovdqa $H3,0x30(%r11) vmovdqa $H4,0x40(%r11) # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 vpmuludq $T2,$D4,$D2 # d2 = h2*r0 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n vpmuludq $T1,$D4,$D1 # d1 = h1*r0 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 vpmuludq $T3,$H2,$H0 # h3*r1 vpaddq $H0,$D4,$D4 # d4 += h3*r1 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n vpmuludq $T2,$H2,$H1 # h2*r1 vpaddq $H1,$D3,$D3 # d3 += h2*r1 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n vpmuludq $T1,$H2,$H0 # h1*r1 vpaddq $H0,$D2,$D2 # d2 += h1*r1 vpmuludq $T0,$H2,$H2 # h0*r1 vpaddq $H2,$D1,$D1 # d1 += h0*r1 vpmuludq $T4,$H3,$H3 # h4*s1 vpaddq $H3,$D0,$D0 # d0 += h4*s1 vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n vpmuludq $T2,$H4,$H1 # h2*r2 vpaddq $H1,$D4,$D4 # d4 += h2*r2 vpmuludq $T1,$H4,$H0 # h1*r2 vpaddq $H0,$D3,$D3 # d3 += 
h1*r2 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n vpmuludq $T0,$H4,$H4 # h0*r2 vpaddq $H4,$D2,$D2 # d2 += h0*r2 vpmuludq $T4,$H2,$H1 # h4*s2 vpaddq $H1,$D1,$D1 # d1 += h4*s2 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n vpmuludq $T3,$H2,$H2 # h3*s2 vpaddq $H2,$D0,$D0 # d0 += h3*s2 vpmuludq $T1,$H3,$H0 # h1*r3 vpaddq $H0,$D4,$D4 # d4 += h1*r3 vpmuludq $T0,$H3,$H3 # h0*r3 vpaddq $H3,$D3,$D3 # d3 += h0*r3 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n vpmuludq $T4,$H4,$H1 # h4*s3 vpaddq $H1,$D2,$D2 # d2 += h4*s3 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n vpmuludq $T3,$H4,$H0 # h3*s3 vpaddq $H0,$D1,$D1 # d1 += h3*s3 vpmuludq $T2,$H4,$H4 # h2*s3 vpaddq $H4,$D0,$D0 # d0 += h2*s3 vpmuludq $T0,$H2,$H2 # h0*r4 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 vpmuludq $T4,$H3,$H1 # h4*s4 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 vpmuludq $T3,$H3,$H0 # h3*s4 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 vpmuludq $T2,$H3,$H1 # h2*s4 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 vpmuludq $T1,$H3,$H3 # h1*s4 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 jz .Lshort_tail_avx vmovdqu 16*0($inp),$H0 # load input vmovdqu 16*1($inp),$H1 vpsrldq \$6,$H0,$H2 # splat input vpsrldq \$6,$H1,$H3 vpunpckhqdq $H1,$H0,$H4 # 4 vpunpcklqdq $H1,$H0,$H0 # 0:1 vpunpcklqdq $H3,$H2,$H3 # 2:3 vpsrlq \$40,$H4,$H4 # 4 vpsrlq \$26,$H0,$H1 vpand $MASK,$H0,$H0 # 0 vpsrlq \$4,$H3,$H2 vpand $MASK,$H1,$H1 # 1 vpsrlq \$30,$H3,$H3 vpand $MASK,$H2,$H2 # 2 vpand $MASK,$H3,$H3 # 3 vpor 32(%rcx),$H4,$H4 # padbit, yes, always vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 vpaddq 0x00(%r11),$H0,$H0 vpaddq 0x10(%r11),$H1,$H1 vpaddq 0x20(%r11),$H2,$H2 vpaddq 0x30(%r11),$H3,$H3 vpaddq 0x40(%r11),$H4,$H4 ################################################################ # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate vpmuludq $H0,$T4,$T0 # h0*r0 vpaddq $T0,$D0,$D0 # d0 += h0*r0 vpmuludq $H1,$T4,$T1 # h1*r0 vpaddq $T1,$D1,$D1 # d1 += h1*r0 vpmuludq $H2,$T4,$T0 # h2*r0 vpaddq $T0,$D2,$D2 # d2 += h2*r0 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n vpmuludq $H3,$T4,$T1 # 
h3*r0 vpaddq $T1,$D3,$D3 # d3 += h3*r0 vpmuludq $H4,$T4,$T4 # h4*r0 vpaddq $T4,$D4,$D4 # d4 += h4*r0 vpmuludq $H3,$T2,$T0 # h3*r1 vpaddq $T0,$D4,$D4 # d4 += h3*r1 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 vpmuludq $H2,$T2,$T1 # h2*r1 vpaddq $T1,$D3,$D3 # d3 += h2*r1 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 vpmuludq $H1,$T2,$T0 # h1*r1 vpaddq $T0,$D2,$D2 # d2 += h1*r1 vpmuludq $H0,$T2,$T2 # h0*r1 vpaddq $T2,$D1,$D1 # d1 += h0*r1 vpmuludq $H4,$T3,$T3 # h4*s1 vpaddq $T3,$D0,$D0 # d0 += h4*s1 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 vpmuludq $H2,$T4,$T1 # h2*r2 vpaddq $T1,$D4,$D4 # d4 += h2*r2 vpmuludq $H1,$T4,$T0 # h1*r2 vpaddq $T0,$D3,$D3 # d3 += h1*r2 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 vpmuludq $H0,$T4,$T4 # h0*r2 vpaddq $T4,$D2,$D2 # d2 += h0*r2 vpmuludq $H4,$T2,$T1 # h4*s2 vpaddq $T1,$D1,$D1 # d1 += h4*s2 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 vpmuludq $H3,$T2,$T2 # h3*s2 vpaddq $T2,$D0,$D0 # d0 += h3*s2 vpmuludq $H1,$T3,$T0 # h1*r3 vpaddq $T0,$D4,$D4 # d4 += h1*r3 vpmuludq $H0,$T3,$T3 # h0*r3 vpaddq $T3,$D3,$D3 # d3 += h0*r3 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 vpmuludq $H4,$T4,$T1 # h4*s3 vpaddq $T1,$D2,$D2 # d2 += h4*s3 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 vpmuludq $H3,$T4,$T0 # h3*s3 vpaddq $T0,$D1,$D1 # d1 += h3*s3 vpmuludq $H2,$T4,$T4 # h2*s3 vpaddq $T4,$D0,$D0 # d0 += h2*s3 vpmuludq $H0,$T2,$T2 # h0*r4 vpaddq $T2,$D4,$D4 # d4 += h0*r4 vpmuludq $H4,$T3,$T1 # h4*s4 vpaddq $T1,$D3,$D3 # d3 += h4*s4 vpmuludq $H3,$T3,$T0 # h3*s4 vpaddq $T0,$D2,$D2 # d2 += h3*s4 vpmuludq $H2,$T3,$T1 # h2*s4 vpaddq $T1,$D1,$D1 # d1 += h2*s4 vpmuludq $H1,$T3,$T3 # h1*s4 vpaddq $T3,$D0,$D0 # d0 += h1*s4 .Lshort_tail_avx: ################################################################ # horizontal addition vpsrldq \$8,$D4,$T4 vpsrldq \$8,$D3,$T3 vpsrldq \$8,$D1,$T1 vpsrldq \$8,$D0,$T0 vpsrldq \$8,$D2,$T2 vpaddq $T3,$D3,$D3 vpaddq $T4,$D4,$D4 vpaddq $T0,$D0,$D0 vpaddq $T1,$D1,$D1 vpaddq $T2,$D2,$D2 ################################################################ # lazy 
reduction vpsrlq \$26,$D3,$H3 vpand $MASK,$D3,$D3 vpaddq $H3,$D4,$D4 # h3 -> h4 vpsrlq \$26,$D0,$H0 vpand $MASK,$D0,$D0 vpaddq $H0,$D1,$D1 # h0 -> h1 vpsrlq \$26,$D4,$H4 vpand $MASK,$D4,$D4 vpsrlq \$26,$D1,$H1 vpand $MASK,$D1,$D1 vpaddq $H1,$D2,$D2 # h1 -> h2 vpaddq $H4,$D0,$D0 vpsllq \$2,$H4,$H4 vpaddq $H4,$D0,$D0 # h4 -> h0 vpsrlq \$26,$D2,$H2 vpand $MASK,$D2,$D2 vpaddq $H2,$D3,$D3 # h2 -> h3 vpsrlq \$26,$D0,$H0 vpand $MASK,$D0,$D0 vpaddq $H0,$D1,$D1 # h0 -> h1 vpsrlq \$26,$D3,$H3 vpand $MASK,$D3,$D3 vpaddq $H3,$D4,$D4 # h3 -> h4 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced vmovd $D1,`4*1-48-64`($ctx) vmovd $D2,`4*2-48-64`($ctx) vmovd $D3,`4*3-48-64`($ctx) vmovd $D4,`4*4-48-64`($ctx) ___ $code.=<<___ if ($win64); vmovdqa 0x50(%r11),%xmm6 vmovdqa 0x60(%r11),%xmm7 vmovdqa 0x70(%r11),%xmm8 vmovdqa 0x80(%r11),%xmm9 vmovdqa 0x90(%r11),%xmm10 vmovdqa 0xa0(%r11),%xmm11 vmovdqa 0xb0(%r11),%xmm12 vmovdqa 0xc0(%r11),%xmm13 vmovdqa 0xd0(%r11),%xmm14 vmovdqa 0xe0(%r11),%xmm15 lea 0xf8(%r11),%rsp .Ldo_avx_epilogue: ___ $code.=<<___ if (!$win64); lea 0x58(%r11),%rsp .cfi_def_cfa %rsp,8 ___ $code.=<<___; vzeroupper ret .cfi_endproc .size poly1305_blocks_avx,.-poly1305_blocks_avx .type poly1305_emit_avx,\@function,3 .align 32 poly1305_emit_avx: .cfi_startproc cmpl \$0,20($ctx) # is_base2_26? je .Lemit mov 0($ctx),%eax # load hash value base 2^26 mov 4($ctx),%ecx mov 8($ctx),%r8d mov 12($ctx),%r11d mov 16($ctx),%r10d shl \$26,%rcx # base 2^26 -> base 2^64 mov %r8,%r9 shl \$52,%r8 add %rcx,%rax shr \$12,%r9 add %rax,%r8 # h0 adc \$0,%r9 shl \$14,%r11 mov %r10,%rax shr \$24,%r10 add %r11,%r9 shl \$40,%rax add %rax,%r9 # h1 adc \$0,%r10 # h2 mov %r10,%rax # could be partially reduced, so reduce mov %r10,%rcx and \$3,%r10 shr \$2,%rax and \$-4,%rcx add %rcx,%rax add %rax,%r8 adc \$0,%r9 adc \$0,%r10 mov %r8,%rax add \$5,%r8 # compare to modulus mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 shr \$2,%r10 # did 130-bit value overflow? 
cmovnz %r8,%rax cmovnz %r9,%rcx add 0($nonce),%rax # accumulate nonce adc 8($nonce),%rcx mov %rax,0($mac) # write result mov %rcx,8($mac) ret .cfi_endproc .size poly1305_emit_avx,.-poly1305_emit_avx ___ if ($avx>1) { my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = map("%ymm$_",(0..15)); my $S4=$MASK; $code.=<<___; .type poly1305_blocks_avx2,\@function,4 .align 32 poly1305_blocks_avx2: .cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len jae .Lblocks_avx2 test %r8d,%r8d jz .Lblocks .Lblocks_avx2: and \$-16,$len jz .Lno_data_avx2 vzeroupper test %r8d,%r8d jz .Lbase2_64_avx2 test \$63,$len jz .Leven_avx2 push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lblocks_avx2_body: mov $len,%r15 # reassign $len mov 0($ctx),$d1 # load hash value mov 8($ctx),$d2 mov 16($ctx),$h2#d mov 24($ctx),$r0 # load r mov 32($ctx),$s1 ################################# base 2^26 -> base 2^64 mov $d1#d,$h0#d and \$`-1*(1<<31)`,$d1 mov $d2,$r1 # borrow $r1 mov $d2#d,$h1#d and \$`-1*(1<<31)`,$d2 shr \$6,$d1 shl \$52,$r1 add $d1,$h0 shr \$12,$h1 shr \$18,$d2 add $r1,$h0 adc $d2,$h1 mov $h2,$d1 shl \$40,$d1 shr \$24,$h2 add $d1,$h1 adc \$0,$h2 # can be partially reduced... mov \$-4,$d2 # ... 
so reduce mov $h2,$d1 and $h2,$d2 shr \$2,$d1 and \$3,$h2 add $d2,$d1 # =*5 add $d1,$h0 adc \$0,$h1 adc \$0,$h2 mov $s1,$r1 mov $s1,%rax shr \$2,$s1 add $r1,$s1 # s1 = r1 + (r1 >> 2) .Lbase2_26_pre_avx2: add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp adc $padbit,$h2 sub \$16,%r15 call __poly1305_block mov $r1,%rax test \$63,%r15 jnz .Lbase2_26_pre_avx2 test $padbit,$padbit # if $padbit is zero, jz .Lstore_base2_64_avx2 # store hash in base 2^64 format ################################# base 2^64 -> base 2^26 mov $h0,%rax mov $h0,%rdx shr \$52,$h0 mov $h1,$r0 mov $h1,$r1 shr \$26,%rdx and \$0x3ffffff,%rax # h[0] shl \$12,$r0 and \$0x3ffffff,%rdx # h[1] shr \$14,$h1 or $r0,$h0 shl \$24,$h2 and \$0x3ffffff,$h0 # h[2] shr \$40,$r1 and \$0x3ffffff,$h1 # h[3] or $r1,$h2 # h[4] test %r15,%r15 jz .Lstore_base2_26_avx2 vmovd %rax#d,%x#$H0 vmovd %rdx#d,%x#$H1 vmovd $h0#d,%x#$H2 vmovd $h1#d,%x#$H3 vmovd $h2#d,%x#$H4 jmp .Lproceed_avx2 .align 32 .Lstore_base2_64_avx2: mov $h0,0($ctx) mov $h1,8($ctx) mov $h2,16($ctx) # note that is_base2_26 is zeroed jmp .Ldone_avx2 .align 16 .Lstore_base2_26_avx2: mov %rax#d,0($ctx) # store hash value base 2^26 mov %rdx#d,4($ctx) mov $h0#d,8($ctx) mov $h1#d,12($ctx) mov $h2#d,16($ctx) .align 16 .Ldone_avx2: mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbp .cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lno_data_avx2: .Lblocks_avx2_epilogue: ret .cfi_endproc .align 32 .Lbase2_64_avx2: .cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lbase2_64_avx2_body: mov $len,%r15 # reassign $len mov 24($ctx),$r0 # load r mov 32($ctx),$s1 mov 0($ctx),$h0 # load hash value mov 8($ctx),$h1 mov 16($ctx),$h2#d mov $s1,$r1 mov $s1,%rax shr \$2,$s1 add $r1,$s1 # s1 = 
r1 + (r1 >> 2) test \$63,$len jz .Linit_avx2 .Lbase2_64_pre_avx2: add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp adc $padbit,$h2 sub \$16,%r15 call __poly1305_block mov $r1,%rax test \$63,%r15 jnz .Lbase2_64_pre_avx2 .Linit_avx2: ################################# base 2^64 -> base 2^26 mov $h0,%rax mov $h0,%rdx shr \$52,$h0 mov $h1,$d1 mov $h1,$d2 shr \$26,%rdx and \$0x3ffffff,%rax # h[0] shl \$12,$d1 and \$0x3ffffff,%rdx # h[1] shr \$14,$h1 or $d1,$h0 shl \$24,$h2 and \$0x3ffffff,$h0 # h[2] shr \$40,$d2 and \$0x3ffffff,$h1 # h[3] or $d2,$h2 # h[4] vmovd %rax#d,%x#$H0 vmovd %rdx#d,%x#$H1 vmovd $h0#d,%x#$H2 vmovd $h1#d,%x#$H3 vmovd $h2#d,%x#$H4 movl \$1,20($ctx) # set is_base2_26 call __poly1305_init_avx .Lproceed_avx2: mov %r15,$len # restore $len mov OPENSSL_ia32cap_P+8(%rip),%r10d mov \$`(1<<31|1<<30|1<<16)`,%r11d mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbp .cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rax lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lbase2_64_avx2_epilogue: jmp .Ldo_avx2 .cfi_endproc .align 32 .Leven_avx2: .cfi_startproc mov OPENSSL_ia32cap_P+8(%rip),%r10d vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 vmovd 4*1($ctx),%x#$H1 vmovd 4*2($ctx),%x#$H2 vmovd 4*3($ctx),%x#$H3 vmovd 4*4($ctx),%x#$H4 .Ldo_avx2: ___ $code.=<<___ if ($avx>2); cmp \$512,$len jb .Lskip_avx512 and %r11d,%r10d test \$`1<<16`,%r10d # check for AVX512F jnz .Lblocks_avx512 .Lskip_avx512: ___ $code.=<<___ if (!$win64); lea -8(%rsp),%r11 .cfi_def_cfa %r11,16 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); lea -0xf8(%rsp),%r11 sub \$0x1c8,%rsp vmovdqa %xmm6,0x50(%r11) vmovdqa %xmm7,0x60(%r11) vmovdqa %xmm8,0x70(%r11) vmovdqa %xmm9,0x80(%r11) vmovdqa %xmm10,0x90(%r11) vmovdqa %xmm11,0xa0(%r11) vmovdqa %xmm12,0xb0(%r11) vmovdqa %xmm13,0xc0(%r11) vmovdqa %xmm14,0xd0(%r11) vmovdqa %xmm15,0xe0(%r11) .Ldo_avx2_body: ___ 
$code.=<<___; lea .Lconst(%rip),%rcx lea 48+64($ctx),$ctx # size optimization vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 # expand and copy pre-calculated table to stack vmovdqu `16*0-64`($ctx),%x#$T2 and \$-512,%rsp vmovdqu `16*1-64`($ctx),%x#$T3 vmovdqu `16*2-64`($ctx),%x#$T4 vmovdqu `16*3-64`($ctx),%x#$D0 vmovdqu `16*4-64`($ctx),%x#$D1 vmovdqu `16*5-64`($ctx),%x#$D2 lea 0x90(%rsp),%rax # size optimization vmovdqu `16*6-64`($ctx),%x#$D3 vpermd $T2,$T0,$T2 # 00003412 -> 14243444 vmovdqu `16*7-64`($ctx),%x#$D4 vpermd $T3,$T0,$T3 vmovdqu `16*8-64`($ctx),%x#$MASK vpermd $T4,$T0,$T4 vmovdqa $T2,0x00(%rsp) vpermd $D0,$T0,$D0 vmovdqa $T3,0x20-0x90(%rax) vpermd $D1,$T0,$D1 vmovdqa $T4,0x40-0x90(%rax) vpermd $D2,$T0,$D2 vmovdqa $D0,0x60-0x90(%rax) vpermd $D3,$T0,$D3 vmovdqa $D1,0x80-0x90(%rax) vpermd $D4,$T0,$D4 vmovdqa $D2,0xa0-0x90(%rax) vpermd $MASK,$T0,$MASK vmovdqa $D3,0xc0-0x90(%rax) vmovdqa $D4,0xe0-0x90(%rax) vmovdqa $MASK,0x100-0x90(%rax) vmovdqa 64(%rcx),$MASK # .Lmask26 ################################################################ # load input vmovdqu 16*0($inp),%x#$T0 vmovdqu 16*1($inp),%x#$T1 vinserti128 \$1,16*2($inp),$T0,$T0 vinserti128 \$1,16*3($inp),$T1,$T1 lea 16*4($inp),$inp vpsrldq \$6,$T0,$T2 # splat input vpsrldq \$6,$T1,$T3 vpunpckhqdq $T1,$T0,$T4 # 4 vpunpcklqdq $T3,$T2,$T2 # 2:3 vpunpcklqdq $T1,$T0,$T0 # 0:1 vpsrlq \$30,$T2,$T3 vpsrlq \$4,$T2,$T2 vpsrlq \$26,$T0,$T1 vpsrlq \$40,$T4,$T4 # 4 vpand $MASK,$T2,$T2 # 2 vpand $MASK,$T0,$T0 # 0 vpand $MASK,$T1,$T1 # 1 vpand $MASK,$T3,$T3 # 3 vpor 32(%rcx),$T4,$T4 # padbit, yes, always vpaddq $H2,$T2,$H2 # accumulate input sub \$64,$len jz .Ltail_avx2 jmp .Loop_avx2 .align 32 .Loop_avx2: ################################################################ # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 # ((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 # \________/\__________/ ################################################################ #vpaddq 
$H2,$T2,$H2 # accumulate input vpaddq $H0,$T0,$H0 vmovdqa `32*0`(%rsp),$T0 # r0^4 vpaddq $H1,$T1,$H1 vmovdqa `32*1`(%rsp),$T1 # r1^4 vpaddq $H3,$T3,$H3 vmovdqa `32*3`(%rsp),$T2 # r2^4 vpaddq $H4,$T4,$H4 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 # # however, as h2 is "chronologically" first one available pull # corresponding operations up, so it's # # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 vpmuludq $H0,$T1,$T4 # h0*r1 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp vpaddq $T4,$D1,$D1 # d1 += h0*r1 vpaddq $H2,$D2,$D2 # d2 += h1*r1 vpmuludq $H3,$T1,$T4 # h3*r1 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 vpaddq $T4,$D4,$D4 # d4 += h3*r1 vpaddq $H2,$D0,$D0 # d0 += h4*s1 vmovdqa `32*4-0x90`(%rax),$T1 # s2 vpmuludq $H0,$T0,$T4 # h0*r0 vpmuludq $H1,$T0,$H2 # h1*r0 vpaddq $T4,$D0,$D0 # d0 += h0*r0 vpaddq $H2,$D1,$D1 # d1 += h1*r0 vpmuludq $H3,$T0,$T4 # h3*r0 vpmuludq $H4,$T0,$H2 # h4*r0 vmovdqu 16*0($inp),%x#$T0 # load input vpaddq $T4,$D3,$D3 # d3 += h3*r0 vpaddq $H2,$D4,$D4 # d4 += h4*r0 vinserti128 \$1,16*2($inp),$T0,$T0 vpmuludq $H3,$T1,$T4 # h3*s2 vpmuludq $H4,$T1,$H2 # h4*s2 vmovdqu 16*1($inp),%x#$T1 vpaddq $T4,$D0,$D0 # d0 += h3*s2 vpaddq $H2,$D1,$D1 # d1 += h4*s2 vmovdqa `32*5-0x90`(%rax),$H2 # r3 vpmuludq $H1,$T2,$T4 # h1*r2 vpmuludq $H0,$T2,$T2 # h0*r2 vpaddq $T4,$D3,$D3 # d3 += h1*r2 vpaddq $T2,$D2,$D2 # d2 += h0*r2 vinserti128 
\$1,16*3($inp),$T1,$T1 lea 16*4($inp),$inp vpmuludq $H1,$H2,$T4 # h1*r3 vpmuludq $H0,$H2,$H2 # h0*r3 vpsrldq \$6,$T0,$T2 # splat input vpaddq $T4,$D4,$D4 # d4 += h1*r3 vpaddq $H2,$D3,$D3 # d3 += h0*r3 vpmuludq $H3,$T3,$T4 # h3*s3 vpmuludq $H4,$T3,$H2 # h4*s3 vpsrldq \$6,$T1,$T3 vpaddq $T4,$D1,$D1 # d1 += h3*s3 vpaddq $H2,$D2,$D2 # d2 += h4*s3 vpunpckhqdq $T1,$T0,$T4 # 4 vpmuludq $H3,$S4,$H3 # h3*s4 vpmuludq $H4,$S4,$H4 # h4*s4 vpunpcklqdq $T1,$T0,$T0 # 0:1 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 vpunpcklqdq $T3,$T2,$T3 # 2:3 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 vpmuludq $H1,$S4,$H0 # h1*s4 vmovdqa 64(%rcx),$MASK # .Lmask26 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 ################################################################ # lazy reduction (interleaved with tail of input splat) vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $D0,$D1,$H1 # h0 -> h1 vpsrlq \$26,$H4,$D4 vpand $MASK,$H4,$H4 vpsrlq \$4,$T3,$T2 vpsrlq \$26,$H1,$D1 vpand $MASK,$H1,$H1 vpaddq $D1,$H2,$H2 # h1 -> h2 vpaddq $D4,$H0,$H0 vpsllq \$2,$D4,$D4 vpaddq $D4,$H0,$H0 # h4 -> h0 vpand $MASK,$T2,$T2 # 2 vpsrlq \$26,$T0,$T1 vpsrlq \$26,$H2,$D2 vpand $MASK,$H2,$H2 vpaddq $D2,$H3,$H3 # h2 -> h3 vpaddq $T2,$H2,$H2 # modulo-scheduled vpsrlq \$30,$T3,$T3 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$40,$T4,$T4 # 4 vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 vpand $MASK,$T0,$T0 # 0 vpand $MASK,$T1,$T1 # 1 vpand $MASK,$T3,$T3 # 3 vpor 32(%rcx),$T4,$T4 # padbit, yes, always sub \$64,$len jnz .Loop_avx2 .byte 0x66,0x90 .Ltail_avx2: ################################################################ # while above multiplications were by r^4 in all lanes, in last # iteration we multiply least significant lane by r^4 and most # significant one by r, so copy of above except that references # to the precomputed table are 
displaced by 4... #vpaddq $H2,$T2,$H2 # accumulate input vpaddq $H0,$T0,$H0 vmovdqu `32*0+4`(%rsp),$T0 # r0^4 vpaddq $H1,$T1,$H1 vmovdqu `32*1+4`(%rsp),$T1 # r1^4 vpaddq $H3,$T3,$H3 vmovdqu `32*3+4`(%rsp),$T2 # r2^4 vpaddq $H4,$T4,$H4 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 vpmuludq $H0,$T1,$T4 # h0*r1 vpmuludq $H1,$T1,$H2 # h1*r1 vpaddq $T4,$D1,$D1 # d1 += h0*r1 vpaddq $H2,$D2,$D2 # d2 += h1*r1 vpmuludq $H3,$T1,$T4 # h3*r1 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 vpaddq $T4,$D4,$D4 # d4 += h3*r1 vpaddq $H2,$D0,$D0 # d0 += h4*s1 vpmuludq $H0,$T0,$T4 # h0*r0 vpmuludq $H1,$T0,$H2 # h1*r0 vpaddq $T4,$D0,$D0 # d0 += h0*r0 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 vpaddq $H2,$D1,$D1 # d1 += h1*r0 vpmuludq $H3,$T0,$T4 # h3*r0 vpmuludq $H4,$T0,$H2 # h4*r0 vpaddq $T4,$D3,$D3 # d3 += h3*r0 vpaddq $H2,$D4,$D4 # d4 += h4*r0 vpmuludq $H3,$T1,$T4 # h3*s2 vpmuludq $H4,$T1,$H2 # h4*s2 vpaddq $T4,$D0,$D0 # d0 += h3*s2 vpaddq $H2,$D1,$D1 # d1 += h4*s2 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 vpmuludq $H1,$T2,$T4 # h1*r2 vpmuludq $H0,$T2,$T2 # h0*r2 vpaddq $T4,$D3,$D3 # d3 += h1*r2 vpaddq $T2,$D2,$D2 # d2 += h0*r2 vpmuludq $H1,$H2,$T4 # h1*r3 vpmuludq $H0,$H2,$H2 # h0*r3 vpaddq $T4,$D4,$D4 # d4 += h1*r3 vpaddq $H2,$D3,$D3 # d3 += h0*r3 vpmuludq $H3,$T3,$T4 # h3*s3 vpmuludq $H4,$T3,$H2 # h4*s3 vpaddq $T4,$D1,$D1 # d1 += h3*s3 vpaddq $H2,$D2,$D2 # d2 += h4*s3 vpmuludq $H3,$S4,$H3 # h3*s4 vpmuludq $H4,$S4,$H4 # h4*s4 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 vpmuludq $H1,$S4,$H0 # h1*s4 vmovdqa 64(%rcx),$MASK # .Lmask26 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 ################################################################ # horizontal addition vpsrldq \$8,$D1,$T1 vpsrldq 
\$8,$H2,$T2 vpsrldq \$8,$H3,$T3 vpsrldq \$8,$H4,$T4 vpsrldq \$8,$H0,$T0 vpaddq $T1,$D1,$D1 vpaddq $T2,$H2,$H2 vpaddq $T3,$H3,$H3 vpaddq $T4,$H4,$H4 vpaddq $T0,$H0,$H0 vpermq \$0x2,$H3,$T3 vpermq \$0x2,$H4,$T4 vpermq \$0x2,$H0,$T0 vpermq \$0x2,$D1,$T1 vpermq \$0x2,$H2,$T2 vpaddq $T3,$H3,$H3 vpaddq $T4,$H4,$H4 vpaddq $T0,$H0,$H0 vpaddq $T1,$D1,$D1 vpaddq $T2,$H2,$H2 ################################################################ # lazy reduction vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $D0,$D1,$H1 # h0 -> h1 vpsrlq \$26,$H4,$D4 vpand $MASK,$H4,$H4 vpsrlq \$26,$H1,$D1 vpand $MASK,$H1,$H1 vpaddq $D1,$H2,$H2 # h1 -> h2 vpaddq $D4,$H0,$H0 vpsllq \$2,$D4,$D4 vpaddq $D4,$H0,$H0 # h4 -> h0 vpsrlq \$26,$H2,$D2 vpand $MASK,$H2,$H2 vpaddq $D2,$H3,$H3 # h2 -> h3 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced vmovd %x#$H1,`4*1-48-64`($ctx) vmovd %x#$H2,`4*2-48-64`($ctx) vmovd %x#$H3,`4*3-48-64`($ctx) vmovd %x#$H4,`4*4-48-64`($ctx) ___ $code.=<<___ if ($win64); vmovdqa 0x50(%r11),%xmm6 vmovdqa 0x60(%r11),%xmm7 vmovdqa 0x70(%r11),%xmm8 vmovdqa 0x80(%r11),%xmm9 vmovdqa 0x90(%r11),%xmm10 vmovdqa 0xa0(%r11),%xmm11 vmovdqa 0xb0(%r11),%xmm12 vmovdqa 0xc0(%r11),%xmm13 vmovdqa 0xd0(%r11),%xmm14 vmovdqa 0xe0(%r11),%xmm15 lea 0xf8(%r11),%rsp .Ldo_avx2_epilogue: ___ $code.=<<___ if (!$win64); lea 8(%r11),%rsp .cfi_def_cfa %rsp,8 ___ $code.=<<___; vzeroupper ret .cfi_endproc .size poly1305_blocks_avx2,.-poly1305_blocks_avx2 ___ ####################################################################### if ($avx>2) { # On entry we have input length divisible by 64. But since inner loop # processes 128 bytes per iteration, cases when length is not divisible # by 128 are handled by passing tail 64 bytes to .Ltail_avx2. 
For this # reason stack layout is kept identical to poly1305_blocks_avx2. If not # for this tail, we wouldn't have to even allocate stack frame... my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); my $PADBIT="%zmm30"; map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); map(s/%y/%z/,($MASK)); $code.=<<___; .type poly1305_blocks_avx512,\@function,4 .align 32 poly1305_blocks_avx512: .cfi_startproc .Lblocks_avx512: mov \$15,%eax kmovw %eax,%k2 ___ $code.=<<___ if (!$win64); lea -8(%rsp),%r11 .cfi_def_cfa %r11,16 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); lea -0xf8(%rsp),%r11 sub \$0x1c8,%rsp vmovdqa %xmm6,0x50(%r11) vmovdqa %xmm7,0x60(%r11) vmovdqa %xmm8,0x70(%r11) vmovdqa %xmm9,0x80(%r11) vmovdqa %xmm10,0x90(%r11) vmovdqa %xmm11,0xa0(%r11) vmovdqa %xmm12,0xb0(%r11) vmovdqa %xmm13,0xc0(%r11) vmovdqa %xmm14,0xd0(%r11) vmovdqa %xmm15,0xe0(%r11) .Ldo_avx512_body: ___ $code.=<<___; lea .Lconst(%rip),%rcx lea 48+64($ctx),$ctx # size optimization vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 # expand pre-calculated table vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} and \$-512,%rsp vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} mov \$0x20,%rax vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} vpermd $D0,$T2,$R0 # 00003412 -> 14243444 vpbroadcastq 64(%rcx),$MASK # .Lmask26 vpermd $D1,$T2,$R1 vpermd $T0,$T2,$S1 vpermd $D2,$T2,$R2 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 vpermd $T1,$T2,$S2 vmovdqu64 $R1,0x00(%rsp,%rax){%k2} vpsrlq \$32,$R1,$T1 vpermd $D3,$T2,$R3 vmovdqa64 $S1,0x40(%rsp){%k2} vpermd $T3,$T2,$S3 vpermd $D4,$T2,$R4 vmovdqu64 $R2,0x40(%rsp,%rax){%k2} vpermd $T4,$T2,$S4 vmovdqa64 $S2,0x80(%rsp){%k2} vmovdqu64 $R3,0x80(%rsp,%rax){%k2} vmovdqa64 $S3,0xc0(%rsp){%k2} vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} vmovdqa64 $S4,0x100(%rsp){%k2} ################################################################ # calculate 5th through 8th powers of the key # # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 vpsrlq \$32,$R2,$T2 vpmuludq $T1,$S4,$M0 vpmuludq $T1,$R0,$M1 vpmuludq $T1,$R1,$M2 vpmuludq $T1,$R2,$M3 vpmuludq $T1,$R3,$M4 vpsrlq \$32,$R3,$T3 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 vpaddq $M1,$D1,$D1 # d1 += r1'*r0 vpaddq $M2,$D2,$D2 # d2 += r1'*r1 vpaddq $M3,$D3,$D3 # d3 += r1'*r2 vpaddq $M4,$D4,$D4 # d4 += r1'*r3 vpmuludq $T2,$S3,$M0 vpmuludq $T2,$S4,$M1 vpmuludq $T2,$R1,$M3 vpmuludq $T2,$R2,$M4 vpmuludq $T2,$R0,$M2 vpsrlq \$32,$R4,$T4 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 vpaddq $M3,$D3,$D3 # d3 += r2'*r1 vpaddq $M4,$D4,$D4 # d4 += r2'*r2 vpaddq $M2,$D2,$D2 # d2 += r2'*r0 vpmuludq $T3,$S2,$M0 vpmuludq $T3,$R0,$M3 vpmuludq $T3,$R1,$M4 vpmuludq $T3,$S3,$M1 vpmuludq $T3,$S4,$M2 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 vpaddq $M3,$D3,$D3 # d3 += r3'*r0 vpaddq $M4,$D4,$D4 # d4 += 
r3'*r1 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 vpmuludq $T4,$S4,$M3 vpmuludq $T4,$R0,$M4 vpmuludq $T4,$S1,$M0 vpmuludq $T4,$S2,$M1 vpmuludq $T4,$S3,$M2 vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 vpaddq $M4,$D4,$D4 # d4 += r2'*r0 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 ################################################################ # load input vmovdqu64 16*0($inp),%z#$T3 vmovdqu64 16*4($inp),%z#$T4 lea 16*8($inp),$inp ################################################################ # lazy reduction vpsrlq \$26,$D3,$M3 vpandq $MASK,$D3,$D3 vpaddq $M3,$D4,$D4 # d3 -> d4 vpsrlq \$26,$D0,$M0 vpandq $MASK,$D0,$D0 vpaddq $M0,$D1,$D1 # d0 -> d1 vpsrlq \$26,$D4,$M4 vpandq $MASK,$D4,$D4 vpsrlq \$26,$D1,$M1 vpandq $MASK,$D1,$D1 vpaddq $M1,$D2,$D2 # d1 -> d2 vpaddq $M4,$D0,$D0 vpsllq \$2,$M4,$M4 vpaddq $M4,$D0,$D0 # d4 -> d0 vpsrlq \$26,$D2,$M2 vpandq $MASK,$D2,$D2 vpaddq $M2,$D3,$D3 # d2 -> d3 vpsrlq \$26,$D0,$M0 vpandq $MASK,$D0,$D0 vpaddq $M0,$D1,$D1 # d0 -> d1 vpsrlq \$26,$D3,$M3 vpandq $MASK,$D3,$D3 vpaddq $M3,$D4,$D4 # d3 -> d4 ################################################################ # at this point we have 14243444 in $R0-$S4 and 05060708 in # $D0-$D4, ... vpunpcklqdq $T4,$T3,$T0 # transpose input vpunpckhqdq $T4,$T3,$T4 # ... since input 64-bit lanes are ordered as 73625140, we could # "vperm" it to 76543210 (here and in each loop iteration), *or* # we could just flow along, hence the goal for $R0-$S4 is # 1858286838784888 ... 
vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: mov \$0x7777,%eax kmovw %eax,%k1 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- vpermd $R1,$M0,$R1 vpermd $R2,$M0,$R2 vpermd $R3,$M0,$R3 vpermd $R4,$M0,$R4 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 vpermd $D1,$M0,${R1}{%k1} vpermd $D2,$M0,${R2}{%k1} vpermd $D3,$M0,${R3}{%k1} vpermd $D4,$M0,${R4}{%k1} vpslld \$2,$R1,$S1 # *5 vpslld \$2,$R2,$S2 vpslld \$2,$R3,$S3 vpslld \$2,$R4,$S4 vpaddd $R1,$S1,$S1 vpaddd $R2,$S2,$S2 vpaddd $R3,$S3,$S3 vpaddd $R4,$S4,$S4 vpbroadcastq 32(%rcx),$PADBIT # .L129 vpsrlq \$52,$T0,$T2 # splat input vpsllq \$12,$T4,$T3 vporq $T3,$T2,$T2 vpsrlq \$26,$T0,$T1 vpsrlq \$14,$T4,$T3 vpsrlq \$40,$T4,$T4 # 4 vpandq $MASK,$T2,$T2 # 2 vpandq $MASK,$T0,$T0 # 0 #vpandq $MASK,$T1,$T1 # 1 #vpandq $MASK,$T3,$T3 # 3 #vporq $PADBIT,$T4,$T4 # padbit, yes, always vpaddq $H2,$T2,$H2 # accumulate input sub \$192,$len jbe .Ltail_avx512 jmp .Loop_avx512 .align 32 .Loop_avx512: ################################################################ # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 # \________/\___________/ ################################################################ #vpaddq $H2,$T2,$H2 # accumulate input # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 # # however, as h2 is "chronologically" first one available pull # corresponding operations up, so it's # # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 # d1 = h2*5*r4 + h0*r1 + 
h1*r0 + h3*5*r3 + h4*5*r2 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 vpaddq $H0,$T0,$H0 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 vpandq $MASK,$T1,$T1 # 1 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 vpandq $MASK,$T3,$T3 # 3 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 vporq $PADBIT,$T4,$T4 # padbit, yes, always vpmuludq $H2,$R0,$D2 # d2 = h2*r0 vpaddq $H1,$T1,$H1 # accumulate input vpaddq $H3,$T3,$H3 vpaddq $H4,$T4,$H4 vmovdqu64 16*0($inp),$T3 # load input vmovdqu64 16*4($inp),$T4 lea 16*8($inp),$inp vpmuludq $H0,$R3,$M3 vpmuludq $H0,$R4,$M4 vpmuludq $H0,$R0,$M0 vpmuludq $H0,$R1,$M1 vpaddq $M3,$D3,$D3 # d3 += h0*r3 vpaddq $M4,$D4,$D4 # d4 += h0*r4 vpaddq $M0,$D0,$D0 # d0 += h0*r0 vpaddq $M1,$D1,$D1 # d1 += h0*r1 vpmuludq $H1,$R2,$M3 vpmuludq $H1,$R3,$M4 vpmuludq $H1,$S4,$M0 vpmuludq $H0,$R2,$M2 vpaddq $M3,$D3,$D3 # d3 += h1*r2 vpaddq $M4,$D4,$D4 # d4 += h1*r3 vpaddq $M0,$D0,$D0 # d0 += h1*s4 vpaddq $M2,$D2,$D2 # d2 += h0*r2 vpunpcklqdq $T4,$T3,$T0 # transpose input vpunpckhqdq $T4,$T3,$T4 vpmuludq $H3,$R0,$M3 vpmuludq $H3,$R1,$M4 vpmuludq $H1,$R0,$M1 vpmuludq $H1,$R1,$M2 vpaddq $M3,$D3,$D3 # d3 += h3*r0 vpaddq $M4,$D4,$D4 # d4 += h3*r1 vpaddq $M1,$D1,$D1 # d1 += h1*r0 vpaddq $M2,$D2,$D2 # d2 += h1*r1 vpmuludq $H4,$S4,$M3 vpmuludq $H4,$R0,$M4 vpmuludq $H3,$S2,$M0 vpmuludq $H3,$S3,$M1 vpaddq $M3,$D3,$D3 # d3 += h4*s4 vpmuludq $H3,$S4,$M2 vpaddq $M4,$D4,$D4 # d4 += h4*r0 vpaddq $M0,$D0,$D0 # d0 += h3*s2 vpaddq $M1,$D1,$D1 # d1 += h3*s3 vpaddq $M2,$D2,$D2 # d2 += h3*s4 vpmuludq $H4,$S1,$M0 vpmuludq $H4,$S2,$M1 vpmuludq $H4,$S3,$M2 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 ################################################################ # lazy reduction (interleaved with input splat) vpsrlq \$52,$T0,$T2 # splat input vpsllq \$12,$T4,$T3 vpsrlq \$26,$D3,$H3 vpandq $MASK,$D3,$D3 vpaddq $H3,$D4,$H4 # h3 -> h4 vporq $T3,$T2,$T2 vpsrlq \$26,$H0,$D0 vpandq $MASK,$H0,$H0 vpaddq $D0,$H1,$H1 # h0 -> 
h1 vpandq $MASK,$T2,$T2 # 2 vpsrlq \$26,$H4,$D4 vpandq $MASK,$H4,$H4 vpsrlq \$26,$H1,$D1 vpandq $MASK,$H1,$H1 vpaddq $D1,$H2,$H2 # h1 -> h2 vpaddq $D4,$H0,$H0 vpsllq \$2,$D4,$D4 vpaddq $D4,$H0,$H0 # h4 -> h0 vpaddq $T2,$H2,$H2 # modulo-scheduled vpsrlq \$26,$T0,$T1 vpsrlq \$26,$H2,$D2 vpandq $MASK,$H2,$H2 vpaddq $D2,$D3,$H3 # h2 -> h3 vpsrlq \$14,$T4,$T3 vpsrlq \$26,$H0,$D0 vpandq $MASK,$H0,$H0 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$40,$T4,$T4 # 4 vpsrlq \$26,$H3,$D3 vpandq $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 vpandq $MASK,$T0,$T0 # 0 #vpandq $MASK,$T1,$T1 # 1 #vpandq $MASK,$T3,$T3 # 3 #vporq $PADBIT,$T4,$T4 # padbit, yes, always sub \$128,$len ja .Loop_avx512 .Ltail_avx512: ################################################################ # while above multiplications were by r^8 in all lanes, in last # iteration we multiply least significant lane by r^8 and most # significant one by r, that's why table gets shifted... vpsrlq \$32,$R0,$R0 # 0105020603070408 vpsrlq \$32,$R1,$R1 vpsrlq \$32,$R2,$R2 vpsrlq \$32,$S3,$S3 vpsrlq \$32,$S4,$S4 vpsrlq \$32,$R3,$R3 vpsrlq \$32,$R4,$R4 vpsrlq \$32,$S1,$S1 vpsrlq \$32,$S2,$S2 ################################################################ # load either next or last 64 byte of input lea ($inp,$len),$inp #vpaddq $H2,$T2,$H2 # accumulate input vpaddq $H0,$T0,$H0 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 vpandq $MASK,$T1,$T1 # 1 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 vpandq $MASK,$T3,$T3 # 3 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 vporq $PADBIT,$T4,$T4 # padbit, yes, always vpaddq $H1,$T1,$H1 # accumulate input vpaddq $H3,$T3,$H3 vpaddq $H4,$T4,$H4 vmovdqu 16*0($inp),%x#$T0 vpmuludq $H0,$R3,$M3 vpmuludq $H0,$R4,$M4 vpmuludq $H0,$R0,$M0 vpmuludq $H0,$R1,$M1 vpaddq $M3,$D3,$D3 # d3 += h0*r3 vpaddq $M4,$D4,$D4 # d4 += h0*r4 vpaddq $M0,$D0,$D0 # d0 += h0*r0 vpaddq $M1,$D1,$D1 # d1 += h0*r1 vmovdqu 16*1($inp),%x#$T1 vpmuludq $H1,$R2,$M3 vpmuludq $H1,$R3,$M4 vpmuludq $H1,$S4,$M0 
vpmuludq $H0,$R2,$M2 vpaddq $M3,$D3,$D3 # d3 += h1*r2 vpaddq $M4,$D4,$D4 # d4 += h1*r3 vpaddq $M0,$D0,$D0 # d0 += h1*s4 vpaddq $M2,$D2,$D2 # d2 += h0*r2 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 vpmuludq $H3,$R0,$M3 vpmuludq $H3,$R1,$M4 vpmuludq $H1,$R0,$M1 vpmuludq $H1,$R1,$M2 vpaddq $M3,$D3,$D3 # d3 += h3*r0 vpaddq $M4,$D4,$D4 # d4 += h3*r1 vpaddq $M1,$D1,$D1 # d1 += h1*r0 vpaddq $M2,$D2,$D2 # d2 += h1*r1 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 vpmuludq $H4,$S4,$M3 vpmuludq $H4,$R0,$M4 vpmuludq $H3,$S2,$M0 vpmuludq $H3,$S3,$M1 vpmuludq $H3,$S4,$M2 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 vpaddq $M4,$D4,$D4 # d4 += h4*r0 vpaddq $M0,$D0,$D0 # d0 += h3*s2 vpaddq $M1,$D1,$D1 # d1 += h3*s3 vpaddq $M2,$D2,$D2 # d2 += h3*s4 vpmuludq $H4,$S1,$M0 vpmuludq $H4,$S2,$M1 vpmuludq $H4,$S3,$M2 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 ################################################################ # horizontal addition mov \$1,%eax vpermq \$0xb1,$H3,$D3 vpermq \$0xb1,$D4,$H4 vpermq \$0xb1,$H0,$D0 vpermq \$0xb1,$H1,$D1 vpermq \$0xb1,$H2,$D2 vpaddq $D3,$H3,$H3 vpaddq $D4,$H4,$H4 vpaddq $D0,$H0,$H0 vpaddq $D1,$H1,$H1 vpaddq $D2,$H2,$H2 kmovw %eax,%k3 vpermq \$0x2,$H3,$D3 vpermq \$0x2,$H4,$D4 vpermq \$0x2,$H0,$D0 vpermq \$0x2,$H1,$D1 vpermq \$0x2,$H2,$D2 vpaddq $D3,$H3,$H3 vpaddq $D4,$H4,$H4 vpaddq $D0,$H0,$H0 vpaddq $D1,$H1,$H1 vpaddq $D2,$H2,$H2 vextracti64x4 \$0x1,$H3,%y#$D3 vextracti64x4 \$0x1,$H4,%y#$D4 vextracti64x4 \$0x1,$H0,%y#$D0 vextracti64x4 \$0x1,$H1,%y#$D1 vextracti64x4 \$0x1,$H2,%y#$D2 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 vpaddq $D0,$H0,${H0}{%k3}{z} vpaddq $D1,$H1,${H1}{%k3}{z} vpaddq $D2,$H2,${H2}{%k3}{z} ___ map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); $code.=<<___; ################################################################ # lazy reduction (interleaved 
with input splat) vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpsrldq \$6,$T0,$T2 # splat input vpsrldq \$6,$T1,$T3 vpunpckhqdq $T1,$T0,$T4 # 4 vpaddq $D3,$H4,$H4 # h3 -> h4 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpunpcklqdq $T3,$T2,$T2 # 2:3 vpunpcklqdq $T1,$T0,$T0 # 0:1 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$26,$H4,$D4 vpand $MASK,$H4,$H4 vpsrlq \$26,$H1,$D1 vpand $MASK,$H1,$H1 vpsrlq \$30,$T2,$T3 vpsrlq \$4,$T2,$T2 vpaddq $D1,$H2,$H2 # h1 -> h2 vpaddq $D4,$H0,$H0 vpsllq \$2,$D4,$D4 vpsrlq \$26,$T0,$T1 vpsrlq \$40,$T4,$T4 # 4 vpaddq $D4,$H0,$H0 # h4 -> h0 vpsrlq \$26,$H2,$D2 vpand $MASK,$H2,$H2 vpand $MASK,$T2,$T2 # 2 vpand $MASK,$T0,$T0 # 0 vpaddq $D2,$H3,$H3 # h2 -> h3 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 vpand $MASK,$T1,$T1 # 1 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpand $MASK,$T3,$T3 # 3 vpor 32(%rcx),$T4,$T4 # padbit, yes, always vpaddq $D3,$H4,$H4 # h3 -> h4 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 add \$64,$len jnz .Ltail_avx2 vpsubq $T2,$H2,$H2 # undo input accumulation vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced vmovd %x#$H1,`4*1-48-64`($ctx) vmovd %x#$H2,`4*2-48-64`($ctx) vmovd %x#$H3,`4*3-48-64`($ctx) vmovd %x#$H4,`4*4-48-64`($ctx) vzeroall ___ $code.=<<___ if ($win64); movdqa 0x50(%r11),%xmm6 movdqa 0x60(%r11),%xmm7 movdqa 0x70(%r11),%xmm8 movdqa 0x80(%r11),%xmm9 movdqa 0x90(%r11),%xmm10 movdqa 0xa0(%r11),%xmm11 movdqa 0xb0(%r11),%xmm12 movdqa 0xc0(%r11),%xmm13 movdqa 0xd0(%r11),%xmm14 movdqa 0xe0(%r11),%xmm15 lea 0xf8(%r11),%rsp .Ldo_avx512_epilogue: ___ $code.=<<___ if (!$win64); lea 8(%r11),%rsp .cfi_def_cfa %rsp,8 ___ $code.=<<___; ret .cfi_endproc .size poly1305_blocks_avx512,.-poly1305_blocks_avx512 ___ if ($avx>3) { ######################################################################## # VPMADD52 version using 2^44 radix. # # One can argue that base 2^52 would be more natural. 
Well, even though # some operations would be more natural, one has to recognize couple of # things. Base 2^52 doesn't provide advantage over base 2^44 if you look # at amount of multiply-n-accumulate operations. Secondly, it makes it # impossible to pre-compute multiples of 5 [referred to as s[]/sN in # reference implementations], which means that more such operations # would have to be performed in inner loop, which in turn makes critical # path longer. In other words, even though base 2^44 reduction might # look less elegant, overall critical path is actually shorter... ######################################################################## # Layout of opaque area is following. # # unsigned __int64 h[3]; # current hash value base 2^44 # unsigned __int64 s[2]; # key value*20 base 2^44 # unsigned __int64 r[3]; # key value base 2^44 # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; # # r^n positions reflect # # placement in register, not # # memory, R[3] is R[1]*20 $code.=<<___; .type poly1305_init_base2_44,\@function,3 .align 32 poly1305_init_base2_44: .cfi_startproc xor %rax,%rax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) mov %rax,16($ctx) .Linit_base2_44: lea poly1305_blocks_vpmadd52(%rip),%r10 lea poly1305_emit_base2_44(%rip),%r11 mov \$0x0ffffffc0fffffff,%rax mov \$0x0ffffffc0ffffffc,%rcx and 0($inp),%rax mov \$0x00000fffffffffff,%r8 and 8($inp),%rcx mov \$0x00000fffffffffff,%r9 and %rax,%r8 shrd \$44,%rcx,%rax mov %r8,40($ctx) # r0 and %r9,%rax shr \$24,%rcx mov %rax,48($ctx) # r1 lea (%rax,%rax,4),%rax # *5 mov %rcx,56($ctx) # r2 shl \$2,%rax # magic <<2 lea (%rcx,%rcx,4),%rcx # *5 shl \$2,%rcx # magic <<2 mov %rax,24($ctx) # s1 mov %rcx,32($ctx) # s2 movq \$-1,64($ctx) # write impossible value ___ $code.=<<___ if ($flavour !~ /elf32/); mov %r10,0(%rdx) mov %r11,8(%rdx) ___ $code.=<<___ if ($flavour =~ /elf32/); mov %r10d,0(%rdx) mov %r11d,4(%rdx) ___ $code.=<<___; mov \$1,%eax ret .cfi_endproc .size 
poly1305_init_base2_44,.-poly1305_init_base2_44 ___ { my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); $code.=<<___; .type poly1305_blocks_vpmadd52,\@function,4 .align 32 poly1305_blocks_vpmadd52: .cfi_startproc shr \$4,$len jz .Lno_data_vpmadd52 # too short shl \$40,$padbit mov 64($ctx),%r8 # peek on power of the key # if powers of the key are not calculated yet, process up to 3 # blocks with this single-block subroutine, otherwise ensure that # length is divisible by 2 blocks and pass the rest down to next # subroutine... mov \$3,%rax mov \$1,%r10 cmp \$4,$len # is input long cmovae %r10,%rax test %r8,%r8 # is power value impossible? cmovns %r10,%rax and $len,%rax # is input of favourable length? jz .Lblocks_vpmadd52_4x sub %rax,$len mov \$7,%r10d mov \$1,%r11d kmovw %r10d,%k7 lea .L2_44_inp_permd(%rip),%r10 kmovw %r11d,%k1 vmovq $padbit,%x#$PAD vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift vpermq \$0xcf,$PAD,$PAD vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft jmp .Loop_vpmadd52 .align 32 .Loop_vpmadd52: vmovdqu32 0($inp),%x#$T0 # load input as ----3210 lea 16($inp),$inp vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 vpsrlvq $inp_shift,$T0,$T0 vpandq $reduc_mask,$T0,$T0 vporq $PAD,$T0,$T0 vpaddq $T0,$Dlo,$Dlo # accumulate input vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} vpxord $Dlo,$Dlo,$Dlo vpxord $Dhi,$Dhi,$Dhi vpmadd52luq $r2r1r0,$H0,$Dlo vpmadd52huq $r2r1r0,$H0,$Dhi vpmadd52luq $r1r0s2,$H1,$Dlo 
vpmadd52huq $r1r0s2,$H1,$Dhi vpmadd52luq $r0s2s1,$H2,$Dlo vpmadd52huq $r0s2s1,$H2,$Dhi vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword vpandq $reduc_mask,$Dlo,$Dlo vpaddq $T0,$Dhi,$Dhi vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word vpandq $reduc_mask,$Dlo,$Dlo vpermq \$0b10010011,$T0,$T0 vpaddq $T0,$Dlo,$Dlo vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} vpaddq $T0,$Dlo,$Dlo vpsllq \$2,$T0,$T0 vpaddq $T0,$Dlo,$Dlo dec %rax # len-=16 jnz .Loop_vpmadd52 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value test $len,$len jnz .Lblocks_vpmadd52_4x .Lno_data_vpmadd52: ret .cfi_endproc .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 ___ } { ######################################################################## # As implied by its name 4x subroutine processes 4 blocks in parallel # (but handles even 4*n+2 blocks lengths). It takes up to 4th key power # and is handled in 256-bit %ymm registers. my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); $code.=<<___; .type poly1305_blocks_vpmadd52_4x,\@function,4 .align 32 poly1305_blocks_vpmadd52_4x: .cfi_startproc shr \$4,$len jz .Lno_data_vpmadd52_4x # too short shl \$40,$padbit mov 64($ctx),%r8 # peek on power of the key .Lblocks_vpmadd52_4x: vpbroadcastq $padbit,$PAD vmovdqa64 .Lx_mask44(%rip),$mask44 mov \$5,%eax vmovdqa64 .Lx_mask42(%rip),$mask42 kmovw %eax,%k1 # used in 2x path test %r8,%r8 # is power value impossible? js .Linit_vpmadd52 # if it is, then init R[4] vmovq 0($ctx),%x#$H0 # load current hash value vmovq 8($ctx),%x#$H1 vmovq 16($ctx),%x#$H2 test \$3,$len # is length 4*n+2? 
jnz .Lblocks_vpmadd52_2x_do .Lblocks_vpmadd52_4x_do: vpbroadcastq 64($ctx),$R0 # load 4th power of the key vpbroadcastq 96($ctx),$R1 vpbroadcastq 128($ctx),$R2 vpbroadcastq 160($ctx),$S1 .Lblocks_vpmadd52_4x_key_loaded: vpsllq \$2,$R2,$S2 # S2 = R2*5*4 vpaddq $R2,$S2,$S2 vpsllq \$2,$S2,$S2 test \$7,$len # is len 8*n? jz .Lblocks_vpmadd52_8x vmovdqu64 16*0($inp),$T2 # load data vmovdqu64 16*2($inp),$T3 lea 16*4($inp),$inp vpunpcklqdq $T3,$T2,$T1 # transpose data vpunpckhqdq $T3,$T2,$T3 # at this point 64-bit lanes are ordered as 3-1-2-0 vpsrlq \$24,$T3,$T2 # splat the data vporq $PAD,$T2,$T2 vpaddq $T2,$H2,$H2 # accumulate input vpandq $mask44,$T1,$T0 vpsrlq \$44,$T1,$T1 vpsllq \$20,$T3,$T3 vporq $T3,$T1,$T1 vpandq $mask44,$T1,$T1 sub \$4,$len jz .Ltail_vpmadd52_4x jmp .Loop_vpmadd52_4x ud2 .align 32 .Linit_vpmadd52: vmovq 24($ctx),%x#$S1 # load key vmovq 56($ctx),%x#$H2 vmovq 32($ctx),%x#$S2 vmovq 40($ctx),%x#$R0 vmovq 48($ctx),%x#$R1 vmovdqa $R0,$H0 vmovdqa $R1,$H1 vmovdqa $H2,$R2 mov \$2,%eax .Lmul_init_vpmadd52: vpxorq $D0lo,$D0lo,$D0lo vpmadd52luq $H2,$S1,$D0lo vpxorq $D0hi,$D0hi,$D0hi vpmadd52huq $H2,$S1,$D0hi vpxorq $D1lo,$D1lo,$D1lo vpmadd52luq $H2,$S2,$D1lo vpxorq $D1hi,$D1hi,$D1hi vpmadd52huq $H2,$S2,$D1hi vpxorq $D2lo,$D2lo,$D2lo vpmadd52luq $H2,$R0,$D2lo vpxorq $D2hi,$D2hi,$D2hi vpmadd52huq $H2,$R0,$D2hi vpmadd52luq $H0,$R0,$D0lo vpmadd52huq $H0,$R0,$D0hi vpmadd52luq $H0,$R1,$D1lo vpmadd52huq $H0,$R1,$D1hi vpmadd52luq $H0,$R2,$D2lo vpmadd52huq $H0,$R2,$D2hi vpmadd52luq $H1,$S2,$D0lo vpmadd52huq $H1,$S2,$D0hi vpmadd52luq $H1,$R0,$D1lo vpmadd52huq $H1,$R0,$D1hi vpmadd52luq $H1,$R1,$D2lo vpmadd52huq $H1,$R1,$D2hi ################################################################ # partial reduction vpsrlq \$44,$D0lo,$tmp vpsllq \$8,$D0hi,$D0hi vpandq $mask44,$D0lo,$H0 vpaddq $tmp,$D0hi,$D0hi vpaddq $D0hi,$D1lo,$D1lo vpsrlq \$44,$D1lo,$tmp vpsllq \$8,$D1hi,$D1hi vpandq $mask44,$D1lo,$H1 vpaddq $tmp,$D1hi,$D1hi vpaddq $D1hi,$D2lo,$D2lo vpsrlq \$42,$D2lo,$tmp 
vpsllq \$10,$D2hi,$D2hi vpandq $mask42,$D2lo,$H2 vpaddq $tmp,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vpsllq \$2,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vpsrlq \$44,$H0,$tmp # additional step vpandq $mask44,$H0,$H0 vpaddq $tmp,$H1,$H1 dec %eax jz .Ldone_init_vpmadd52 vpunpcklqdq $R1,$H1,$R1 # 1,2 vpbroadcastq %x#$H1,%x#$H1 # 2,2 vpunpcklqdq $R2,$H2,$R2 vpbroadcastq %x#$H2,%x#$H2 vpunpcklqdq $R0,$H0,$R0 vpbroadcastq %x#$H0,%x#$H0 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 vpaddq $R1,$S1,$S1 vpaddq $R2,$S2,$S2 vpsllq \$2,$S1,$S1 vpsllq \$2,$S2,$S2 jmp .Lmul_init_vpmadd52 ud2 .align 32 .Ldone_init_vpmadd52: vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 vinserti128 \$1,%x#$R2,$H2,$R2 vinserti128 \$1,%x#$R0,$H0,$R0 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 vpermq \$0b11011000,$R2,$R2 vpermq \$0b11011000,$R0,$R0 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 vpaddq $R1,$S1,$S1 vpsllq \$2,$S1,$S1 vmovq 0($ctx),%x#$H0 # load current hash value vmovq 8($ctx),%x#$H1 vmovq 16($ctx),%x#$H2 test \$3,$len # is length 4*n+2? 
jnz .Ldone_init_vpmadd52_2x vmovdqu64 $R0,64($ctx) # save key powers vpbroadcastq %x#$R0,$R0 # broadcast 4th power vmovdqu64 $R1,96($ctx) vpbroadcastq %x#$R1,$R1 vmovdqu64 $R2,128($ctx) vpbroadcastq %x#$R2,$R2 vmovdqu64 $S1,160($ctx) vpbroadcastq %x#$S1,$S1 jmp .Lblocks_vpmadd52_4x_key_loaded ud2 .align 32 .Ldone_init_vpmadd52_2x: vmovdqu64 $R0,64($ctx) # save key powers vpsrldq \$8,$R0,$R0 # 0-1-0-2 vmovdqu64 $R1,96($ctx) vpsrldq \$8,$R1,$R1 vmovdqu64 $R2,128($ctx) vpsrldq \$8,$R2,$R2 vmovdqu64 $S1,160($ctx) vpsrldq \$8,$S1,$S1 jmp .Lblocks_vpmadd52_2x_key_loaded ud2 .align 32 .Lblocks_vpmadd52_2x_do: vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers vmovdqu64 160+8($ctx),${S1}{%k1}{z} vmovdqu64 64+8($ctx),${R0}{%k1}{z} vmovdqu64 96+8($ctx),${R1}{%k1}{z} .Lblocks_vpmadd52_2x_key_loaded: vmovdqu64 16*0($inp),$T2 # load data vpxorq $T3,$T3,$T3 lea 16*2($inp),$inp vpunpcklqdq $T3,$T2,$T1 # transpose data vpunpckhqdq $T3,$T2,$T3 # at this point 64-bit lanes are ordered as x-1-x-0 vpsrlq \$24,$T3,$T2 # splat the data vporq $PAD,$T2,$T2 vpaddq $T2,$H2,$H2 # accumulate input vpandq $mask44,$T1,$T0 vpsrlq \$44,$T1,$T1 vpsllq \$20,$T3,$T3 vporq $T3,$T1,$T1 vpandq $mask44,$T1,$T1 jmp .Ltail_vpmadd52_2x ud2 .align 32 .Loop_vpmadd52_4x: #vpaddq $T2,$H2,$H2 # accumulate input vpaddq $T0,$H0,$H0 vpaddq $T1,$H1,$H1 vpxorq $D0lo,$D0lo,$D0lo vpmadd52luq $H2,$S1,$D0lo vpxorq $D0hi,$D0hi,$D0hi vpmadd52huq $H2,$S1,$D0hi vpxorq $D1lo,$D1lo,$D1lo vpmadd52luq $H2,$S2,$D1lo vpxorq $D1hi,$D1hi,$D1hi vpmadd52huq $H2,$S2,$D1hi vpxorq $D2lo,$D2lo,$D2lo vpmadd52luq $H2,$R0,$D2lo vpxorq $D2hi,$D2hi,$D2hi vpmadd52huq $H2,$R0,$D2hi vmovdqu64 16*0($inp),$T2 # load data vmovdqu64 16*2($inp),$T3 lea 16*4($inp),$inp vpmadd52luq $H0,$R0,$D0lo vpmadd52huq $H0,$R0,$D0hi vpmadd52luq $H0,$R1,$D1lo vpmadd52huq $H0,$R1,$D1hi vpmadd52luq $H0,$R2,$D2lo vpmadd52huq $H0,$R2,$D2hi vpunpcklqdq $T3,$T2,$T1 # transpose data vpunpckhqdq $T3,$T2,$T3 vpmadd52luq $H1,$S2,$D0lo vpmadd52huq $H1,$S2,$D0hi 
vpmadd52luq $H1,$R0,$D1lo vpmadd52huq $H1,$R0,$D1hi vpmadd52luq $H1,$R1,$D2lo vpmadd52huq $H1,$R1,$D2hi ################################################################ # partial reduction (interleaved with data splat) vpsrlq \$44,$D0lo,$tmp vpsllq \$8,$D0hi,$D0hi vpandq $mask44,$D0lo,$H0 vpaddq $tmp,$D0hi,$D0hi vpsrlq \$24,$T3,$T2 vporq $PAD,$T2,$T2 vpaddq $D0hi,$D1lo,$D1lo vpsrlq \$44,$D1lo,$tmp vpsllq \$8,$D1hi,$D1hi vpandq $mask44,$D1lo,$H1 vpaddq $tmp,$D1hi,$D1hi vpandq $mask44,$T1,$T0 vpsrlq \$44,$T1,$T1 vpsllq \$20,$T3,$T3 vpaddq $D1hi,$D2lo,$D2lo vpsrlq \$42,$D2lo,$tmp vpsllq \$10,$D2hi,$D2hi vpandq $mask42,$D2lo,$H2 vpaddq $tmp,$D2hi,$D2hi vpaddq $T2,$H2,$H2 # accumulate input vpaddq $D2hi,$H0,$H0 vpsllq \$2,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vporq $T3,$T1,$T1 vpandq $mask44,$T1,$T1 vpsrlq \$44,$H0,$tmp # additional step vpandq $mask44,$H0,$H0 vpaddq $tmp,$H1,$H1 sub \$4,$len # len-=64 jnz .Loop_vpmadd52_4x .Ltail_vpmadd52_4x: vmovdqu64 128($ctx),$R2 # load all key powers vmovdqu64 160($ctx),$S1 vmovdqu64 64($ctx),$R0 vmovdqu64 96($ctx),$R1 .Ltail_vpmadd52_2x: vpsllq \$2,$R2,$S2 # S2 = R2*5*4 vpaddq $R2,$S2,$S2 vpsllq \$2,$S2,$S2 #vpaddq $T2,$H2,$H2 # accumulate input vpaddq $T0,$H0,$H0 vpaddq $T1,$H1,$H1 vpxorq $D0lo,$D0lo,$D0lo vpmadd52luq $H2,$S1,$D0lo vpxorq $D0hi,$D0hi,$D0hi vpmadd52huq $H2,$S1,$D0hi vpxorq $D1lo,$D1lo,$D1lo vpmadd52luq $H2,$S2,$D1lo vpxorq $D1hi,$D1hi,$D1hi vpmadd52huq $H2,$S2,$D1hi vpxorq $D2lo,$D2lo,$D2lo vpmadd52luq $H2,$R0,$D2lo vpxorq $D2hi,$D2hi,$D2hi vpmadd52huq $H2,$R0,$D2hi vpmadd52luq $H0,$R0,$D0lo vpmadd52huq $H0,$R0,$D0hi vpmadd52luq $H0,$R1,$D1lo vpmadd52huq $H0,$R1,$D1hi vpmadd52luq $H0,$R2,$D2lo vpmadd52huq $H0,$R2,$D2hi vpmadd52luq $H1,$S2,$D0lo vpmadd52huq $H1,$S2,$D0hi vpmadd52luq $H1,$R0,$D1lo vpmadd52huq $H1,$R0,$D1hi vpmadd52luq $H1,$R1,$D2lo vpmadd52huq $H1,$R1,$D2hi ################################################################ # horizontal addition mov \$1,%eax kmovw %eax,%k1 vpsrldq \$8,$D0lo,$T0 vpsrldq 
\$8,$D0hi,$H0 vpsrldq \$8,$D1lo,$T1 vpsrldq \$8,$D1hi,$H1 vpaddq $T0,$D0lo,$D0lo vpaddq $H0,$D0hi,$D0hi vpsrldq \$8,$D2lo,$T2 vpsrldq \$8,$D2hi,$H2 vpaddq $T1,$D1lo,$D1lo vpaddq $H1,$D1hi,$D1hi vpermq \$0x2,$D0lo,$T0 vpermq \$0x2,$D0hi,$H0 vpaddq $T2,$D2lo,$D2lo vpaddq $H2,$D2hi,$D2hi vpermq \$0x2,$D1lo,$T1 vpermq \$0x2,$D1hi,$H1 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} vpaddq $H0,$D0hi,${D0hi}{%k1}{z} vpermq \$0x2,$D2lo,$T2 vpermq \$0x2,$D2hi,$H2 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} vpaddq $H1,$D1hi,${D1hi}{%k1}{z} vpaddq $T2,$D2lo,${D2lo}{%k1}{z} vpaddq $H2,$D2hi,${D2hi}{%k1}{z} ################################################################ # partial reduction vpsrlq \$44,$D0lo,$tmp vpsllq \$8,$D0hi,$D0hi vpandq $mask44,$D0lo,$H0 vpaddq $tmp,$D0hi,$D0hi vpaddq $D0hi,$D1lo,$D1lo vpsrlq \$44,$D1lo,$tmp vpsllq \$8,$D1hi,$D1hi vpandq $mask44,$D1lo,$H1 vpaddq $tmp,$D1hi,$D1hi vpaddq $D1hi,$D2lo,$D2lo vpsrlq \$42,$D2lo,$tmp vpsllq \$10,$D2hi,$D2hi vpandq $mask42,$D2lo,$H2 vpaddq $tmp,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vpsllq \$2,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vpsrlq \$44,$H0,$tmp # additional step vpandq $mask44,$H0,$H0 vpaddq $tmp,$H1,$H1 # at this point $len is # either 4*n+2 or 0... sub \$2,$len # len-=32 ja .Lblocks_vpmadd52_4x_do vmovq %x#$H0,0($ctx) vmovq %x#$H1,8($ctx) vmovq %x#$H2,16($ctx) vzeroall .Lno_data_vpmadd52_4x: ret .cfi_endproc .size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x ___ } { ######################################################################## # As implied by its name 8x subroutine processes 8 blocks in parallel... # This is intermediate version, as it's used only in cases when input # length is either 8*n, 8*n+1 or 8*n+2... 
my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); $code.=<<___; .type poly1305_blocks_vpmadd52_8x,\@function,4 .align 32 poly1305_blocks_vpmadd52_8x: .cfi_startproc shr \$4,$len jz .Lno_data_vpmadd52_8x # too short shl \$40,$padbit mov 64($ctx),%r8 # peek on power of the key vmovdqa64 .Lx_mask44(%rip),$mask44 vmovdqa64 .Lx_mask42(%rip),$mask42 test %r8,%r8 # is power value impossible? js .Linit_vpmadd52 # if it is, then init R[4] vmovq 0($ctx),%x#$H0 # load current hash value vmovq 8($ctx),%x#$H1 vmovq 16($ctx),%x#$H2 .Lblocks_vpmadd52_8x: ################################################################ # fist we calculate more key powers vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers vmovdqu64 160($ctx),$S1 vmovdqu64 64($ctx),$R0 vmovdqu64 96($ctx),$R1 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 vpaddq $R2,$S2,$S2 vpsllq \$2,$S2,$S2 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power vpbroadcastq %x#$R0,$RR0 vpbroadcastq %x#$R1,$RR1 vpxorq $D0lo,$D0lo,$D0lo vpmadd52luq $RR2,$S1,$D0lo vpxorq $D0hi,$D0hi,$D0hi vpmadd52huq $RR2,$S1,$D0hi vpxorq $D1lo,$D1lo,$D1lo vpmadd52luq $RR2,$S2,$D1lo vpxorq $D1hi,$D1hi,$D1hi vpmadd52huq $RR2,$S2,$D1hi vpxorq $D2lo,$D2lo,$D2lo vpmadd52luq $RR2,$R0,$D2lo vpxorq $D2hi,$D2hi,$D2hi vpmadd52huq $RR2,$R0,$D2hi vpmadd52luq $RR0,$R0,$D0lo vpmadd52huq $RR0,$R0,$D0hi vpmadd52luq $RR0,$R1,$D1lo vpmadd52huq $RR0,$R1,$D1hi vpmadd52luq $RR0,$R2,$D2lo vpmadd52huq $RR0,$R2,$D2hi vpmadd52luq $RR1,$S2,$D0lo vpmadd52huq $RR1,$S2,$D0hi vpmadd52luq $RR1,$R0,$D1lo vpmadd52huq $RR1,$R0,$D1hi vpmadd52luq $RR1,$R1,$D2lo vpmadd52huq $RR1,$R1,$D2hi ################################################################ # partial reduction vpsrlq \$44,$D0lo,$tmp vpsllq \$8,$D0hi,$D0hi vpandq $mask44,$D0lo,$RR0 vpaddq $tmp,$D0hi,$D0hi vpaddq $D0hi,$D1lo,$D1lo vpsrlq 
\$44,$D1lo,$tmp vpsllq \$8,$D1hi,$D1hi vpandq $mask44,$D1lo,$RR1 vpaddq $tmp,$D1hi,$D1hi vpaddq $D1hi,$D2lo,$D2lo vpsrlq \$42,$D2lo,$tmp vpsllq \$10,$D2hi,$D2hi vpandq $mask42,$D2lo,$RR2 vpaddq $tmp,$D2hi,$D2hi vpaddq $D2hi,$RR0,$RR0 vpsllq \$2,$D2hi,$D2hi vpaddq $D2hi,$RR0,$RR0 vpsrlq \$44,$RR0,$tmp # additional step vpandq $mask44,$RR0,$RR0 vpaddq $tmp,$RR1,$RR1 ################################################################ # At this point Rx holds 1324 powers, RRx - 5768, and the goal # is 15263748, which reflects how data is loaded... vpunpcklqdq $R2,$RR2,$T2 # 3748 vpunpckhqdq $R2,$RR2,$R2 # 1526 vpunpcklqdq $R0,$RR0,$T0 vpunpckhqdq $R0,$RR0,$R0 vpunpcklqdq $R1,$RR1,$T1 vpunpckhqdq $R1,$RR1,$R1 ___ ######## switch to %zmm map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); $code.=<<___; vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 vshufi64x2 \$0x44,$R0,$T0,$RR0 vshufi64x2 \$0x44,$R1,$T1,$RR1 vmovdqu64 16*0($inp),$T2 # load data vmovdqu64 16*4($inp),$T3 lea 16*8($inp),$inp vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 vpaddq $RR2,$SS2,$SS2 vpaddq $RR1,$SS1,$SS1 vpsllq \$2,$SS2,$SS2 vpsllq \$2,$SS1,$SS1 vpbroadcastq $padbit,$PAD vpbroadcastq %x#$mask44,$mask44 vpbroadcastq %x#$mask42,$mask42 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power vpbroadcastq %x#$SS2,$S2 vpbroadcastq %x#$RR0,$R0 vpbroadcastq %x#$RR1,$R1 vpbroadcastq %x#$RR2,$R2 vpunpcklqdq $T3,$T2,$T1 # transpose data vpunpckhqdq $T3,$T2,$T3 # at this point 64-bit lanes are ordered as 73625140 vpsrlq \$24,$T3,$T2 # splat the data vporq $PAD,$T2,$T2 vpaddq $T2,$H2,$H2 # accumulate input vpandq $mask44,$T1,$T0 vpsrlq \$44,$T1,$T1 vpsllq \$20,$T3,$T3 vporq $T3,$T1,$T1 vpandq $mask44,$T1,$T1 sub \$8,$len jz .Ltail_vpmadd52_8x jmp .Loop_vpmadd52_8x .align 32 .Loop_vpmadd52_8x: #vpaddq $T2,$H2,$H2 # accumulate input vpaddq $T0,$H0,$H0 vpaddq 
$T1,$H1,$H1 vpxorq $D0lo,$D0lo,$D0lo vpmadd52luq $H2,$S1,$D0lo vpxorq $D0hi,$D0hi,$D0hi vpmadd52huq $H2,$S1,$D0hi vpxorq $D1lo,$D1lo,$D1lo vpmadd52luq $H2,$S2,$D1lo vpxorq $D1hi,$D1hi,$D1hi vpmadd52huq $H2,$S2,$D1hi vpxorq $D2lo,$D2lo,$D2lo vpmadd52luq $H2,$R0,$D2lo vpxorq $D2hi,$D2hi,$D2hi vpmadd52huq $H2,$R0,$D2hi vmovdqu64 16*0($inp),$T2 # load data vmovdqu64 16*4($inp),$T3 lea 16*8($inp),$inp vpmadd52luq $H0,$R0,$D0lo vpmadd52huq $H0,$R0,$D0hi vpmadd52luq $H0,$R1,$D1lo vpmadd52huq $H0,$R1,$D1hi vpmadd52luq $H0,$R2,$D2lo vpmadd52huq $H0,$R2,$D2hi vpunpcklqdq $T3,$T2,$T1 # transpose data vpunpckhqdq $T3,$T2,$T3 vpmadd52luq $H1,$S2,$D0lo vpmadd52huq $H1,$S2,$D0hi vpmadd52luq $H1,$R0,$D1lo vpmadd52huq $H1,$R0,$D1hi vpmadd52luq $H1,$R1,$D2lo vpmadd52huq $H1,$R1,$D2hi ################################################################ # partial reduction (interleaved with data splat) vpsrlq \$44,$D0lo,$tmp vpsllq \$8,$D0hi,$D0hi vpandq $mask44,$D0lo,$H0 vpaddq $tmp,$D0hi,$D0hi vpsrlq \$24,$T3,$T2 vporq $PAD,$T2,$T2 vpaddq $D0hi,$D1lo,$D1lo vpsrlq \$44,$D1lo,$tmp vpsllq \$8,$D1hi,$D1hi vpandq $mask44,$D1lo,$H1 vpaddq $tmp,$D1hi,$D1hi vpandq $mask44,$T1,$T0 vpsrlq \$44,$T1,$T1 vpsllq \$20,$T3,$T3 vpaddq $D1hi,$D2lo,$D2lo vpsrlq \$42,$D2lo,$tmp vpsllq \$10,$D2hi,$D2hi vpandq $mask42,$D2lo,$H2 vpaddq $tmp,$D2hi,$D2hi vpaddq $T2,$H2,$H2 # accumulate input vpaddq $D2hi,$H0,$H0 vpsllq \$2,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vporq $T3,$T1,$T1 vpandq $mask44,$T1,$T1 vpsrlq \$44,$H0,$tmp # additional step vpandq $mask44,$H0,$H0 vpaddq $tmp,$H1,$H1 sub \$8,$len # len-=128 jnz .Loop_vpmadd52_8x .Ltail_vpmadd52_8x: #vpaddq $T2,$H2,$H2 # accumulate input vpaddq $T0,$H0,$H0 vpaddq $T1,$H1,$H1 vpxorq $D0lo,$D0lo,$D0lo vpmadd52luq $H2,$SS1,$D0lo vpxorq $D0hi,$D0hi,$D0hi vpmadd52huq $H2,$SS1,$D0hi vpxorq $D1lo,$D1lo,$D1lo vpmadd52luq $H2,$SS2,$D1lo vpxorq $D1hi,$D1hi,$D1hi vpmadd52huq $H2,$SS2,$D1hi vpxorq $D2lo,$D2lo,$D2lo vpmadd52luq $H2,$RR0,$D2lo vpxorq $D2hi,$D2hi,$D2hi vpmadd52huq 
$H2,$RR0,$D2hi vpmadd52luq $H0,$RR0,$D0lo vpmadd52huq $H0,$RR0,$D0hi vpmadd52luq $H0,$RR1,$D1lo vpmadd52huq $H0,$RR1,$D1hi vpmadd52luq $H0,$RR2,$D2lo vpmadd52huq $H0,$RR2,$D2hi vpmadd52luq $H1,$SS2,$D0lo vpmadd52huq $H1,$SS2,$D0hi vpmadd52luq $H1,$RR0,$D1lo vpmadd52huq $H1,$RR0,$D1hi vpmadd52luq $H1,$RR1,$D2lo vpmadd52huq $H1,$RR1,$D2hi ################################################################ # horizontal addition mov \$1,%eax kmovw %eax,%k1 vpsrldq \$8,$D0lo,$T0 vpsrldq \$8,$D0hi,$H0 vpsrldq \$8,$D1lo,$T1 vpsrldq \$8,$D1hi,$H1 vpaddq $T0,$D0lo,$D0lo vpaddq $H0,$D0hi,$D0hi vpsrldq \$8,$D2lo,$T2 vpsrldq \$8,$D2hi,$H2 vpaddq $T1,$D1lo,$D1lo vpaddq $H1,$D1hi,$D1hi vpermq \$0x2,$D0lo,$T0 vpermq \$0x2,$D0hi,$H0 vpaddq $T2,$D2lo,$D2lo vpaddq $H2,$D2hi,$D2hi vpermq \$0x2,$D1lo,$T1 vpermq \$0x2,$D1hi,$H1 vpaddq $T0,$D0lo,$D0lo vpaddq $H0,$D0hi,$D0hi vpermq \$0x2,$D2lo,$T2 vpermq \$0x2,$D2hi,$H2 vpaddq $T1,$D1lo,$D1lo vpaddq $H1,$D1hi,$D1hi vextracti64x4 \$1,$D0lo,%y#$T0 vextracti64x4 \$1,$D0hi,%y#$H0 vpaddq $T2,$D2lo,$D2lo vpaddq $H2,$D2hi,$D2hi vextracti64x4 \$1,$D1lo,%y#$T1 vextracti64x4 \$1,$D1hi,%y#$H1 vextracti64x4 \$1,$D2lo,%y#$T2 vextracti64x4 \$1,$D2hi,%y#$H2 ___ ######## switch back to %ymm map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); $code.=<<___; vpaddq $T0,$D0lo,${D0lo}{%k1}{z} vpaddq $H0,$D0hi,${D0hi}{%k1}{z} vpaddq $T1,$D1lo,${D1lo}{%k1}{z} vpaddq $H1,$D1hi,${D1hi}{%k1}{z} vpaddq $T2,$D2lo,${D2lo}{%k1}{z} vpaddq $H2,$D2hi,${D2hi}{%k1}{z} ################################################################ # partial reduction vpsrlq \$44,$D0lo,$tmp vpsllq \$8,$D0hi,$D0hi vpandq $mask44,$D0lo,$H0 vpaddq $tmp,$D0hi,$D0hi vpaddq $D0hi,$D1lo,$D1lo vpsrlq \$44,$D1lo,$tmp vpsllq \$8,$D1hi,$D1hi vpandq $mask44,$D1lo,$H1 vpaddq $tmp,$D1hi,$D1hi vpaddq $D1hi,$D2lo,$D2lo vpsrlq \$42,$D2lo,$tmp vpsllq \$10,$D2hi,$D2hi vpandq $mask42,$D2lo,$H2 vpaddq 
$tmp,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vpsllq \$2,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vpsrlq \$44,$H0,$tmp # additional step vpandq $mask44,$H0,$H0 vpaddq $tmp,$H1,$H1 ################################################################ vmovq %x#$H0,0($ctx) vmovq %x#$H1,8($ctx) vmovq %x#$H2,16($ctx) vzeroall .Lno_data_vpmadd52_8x: ret .cfi_endproc .size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x ___ } $code.=<<___; .type poly1305_emit_base2_44,\@function,3 .align 32 poly1305_emit_base2_44: .cfi_startproc mov 0($ctx),%r8 # load hash value mov 8($ctx),%r9 mov 16($ctx),%r10 mov %r9,%rax shr \$20,%r9 shl \$44,%rax mov %r10,%rcx shr \$40,%r10 shl \$24,%rcx add %rax,%r8 adc %rcx,%r9 adc \$0,%r10 mov %r8,%rax add \$5,%r8 # compare to modulus mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 shr \$2,%r10 # did 130-bit value overflow? cmovnz %r8,%rax cmovnz %r9,%rcx add 0($nonce),%rax # accumulate nonce adc 8($nonce),%rcx mov %rax,0($mac) # write result mov %rcx,8($mac) ret .cfi_endproc .size poly1305_emit_base2_44,.-poly1305_emit_base2_44 ___ } } } $code.=<<___; .align 64 .Lconst: .Lmask24: .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 .L129: .long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 .Lmask26: .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 .Lpermd_avx2: .long 2,2,2,3,2,0,2,1 .Lpermd_avx512: .long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 .L2_44_inp_permd: .long 0,1,1,2,2,3,7,7 .L2_44_inp_shift: .quad 0,12,24,64 .L2_44_mask: .quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff .L2_44_shift_rgt: .quad 44,44,42,64 .L2_44_shift_lft: .quad 8,8,10,64 .align 64 .Lx_mask44: .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff .Lx_mask42: .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff ___ } $code.=<<___; .asciz "Poly1305 for x86_64, CRYPTOGAMS by " .align 16 ___ { # chacha20-poly1305 helpers my 
($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order ("%rdi","%rsi","%rdx","%rcx"); # Unix order $code.=<<___; .globl xor128_encrypt_n_pad .type xor128_encrypt_n_pad,\@abi-omnipotent .align 16 xor128_encrypt_n_pad: .cfi_startproc sub $otp,$inp sub $otp,$out mov $len,%r10 # put len aside shr \$4,$len # len / 16 jz .Ltail_enc nop .Loop_enc_xmm: movdqu ($inp,$otp),%xmm0 pxor ($otp),%xmm0 movdqu %xmm0,($out,$otp) movdqa %xmm0,($otp) lea 16($otp),$otp dec $len jnz .Loop_enc_xmm and \$15,%r10 # len % 16 jz .Ldone_enc .Ltail_enc: mov \$16,$len sub %r10,$len xor %eax,%eax .Loop_enc_byte: mov ($inp,$otp),%al xor ($otp),%al mov %al,($out,$otp) mov %al,($otp) lea 1($otp),$otp dec %r10 jnz .Loop_enc_byte xor %eax,%eax .Loop_enc_pad: mov %al,($otp) lea 1($otp),$otp dec $len jnz .Loop_enc_pad .Ldone_enc: mov $otp,%rax ret .cfi_endproc .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad .globl xor128_decrypt_n_pad .type xor128_decrypt_n_pad,\@abi-omnipotent .align 16 xor128_decrypt_n_pad: .cfi_startproc sub $otp,$inp sub $otp,$out mov $len,%r10 # put len aside shr \$4,$len # len / 16 jz .Ltail_dec nop .Loop_dec_xmm: movdqu ($inp,$otp),%xmm0 movdqa ($otp),%xmm1 pxor %xmm0,%xmm1 movdqu %xmm1,($out,$otp) movdqa %xmm0,($otp) lea 16($otp),$otp dec $len jnz .Loop_dec_xmm pxor %xmm1,%xmm1 and \$15,%r10 # len % 16 jz .Ldone_dec .Ltail_dec: mov \$16,$len sub %r10,$len xor %eax,%eax xor %r11,%r11 .Loop_dec_byte: mov ($inp,$otp),%r11b mov ($otp),%al xor %r11b,%al mov %al,($out,$otp) mov %r11b,($otp) lea 1($otp),$otp dec %r10 jnz .Loop_dec_byte xor %eax,%eax .Loop_dec_pad: mov %al,($otp) lea 1($otp),$otp dec $len jnz .Loop_dec_pad .Ldone_dec: mov $otp,%rax ret .cfi_endproc .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad ___ } # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type 
se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->Rip<.Lprologue jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lcommon_seh_tail lea 48(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 jmp .Lcommon_seh_tail .size se_handler,.-se_handler .type avx_handler,\@abi-omnipotent .align 16 avx_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->Rip<prologue label jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 208($context),%rax # pull context->R11 lea 0x50(%rax),%rsi lea 0xf8(%rax),%rax lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore
context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size avx_handler,.-avx_handler .section .pdata .align 4 .rva .LSEH_begin_poly1305_init .rva .LSEH_end_poly1305_init .rva .LSEH_info_poly1305_init .rva .LSEH_begin_poly1305_blocks .rva .LSEH_end_poly1305_blocks .rva .LSEH_info_poly1305_blocks .rva .LSEH_begin_poly1305_emit .rva .LSEH_end_poly1305_emit .rva .LSEH_info_poly1305_emit ___ $code.=<<___ if ($avx); .rva .LSEH_begin_poly1305_blocks_avx .rva .Lbase2_64_avx .rva .LSEH_info_poly1305_blocks_avx_1 .rva .Lbase2_64_avx .rva .Leven_avx .rva .LSEH_info_poly1305_blocks_avx_2 .rva .Leven_avx .rva .LSEH_end_poly1305_blocks_avx .rva .LSEH_info_poly1305_blocks_avx_3 .rva .LSEH_begin_poly1305_emit_avx .rva .LSEH_end_poly1305_emit_avx .rva .LSEH_info_poly1305_emit_avx ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_poly1305_blocks_avx2 .rva .Lbase2_64_avx2 .rva .LSEH_info_poly1305_blocks_avx2_1 .rva .Lbase2_64_avx2 .rva .Leven_avx2 .rva .LSEH_info_poly1305_blocks_avx2_2 .rva .Leven_avx2 .rva .LSEH_end_poly1305_blocks_avx2 .rva .LSEH_info_poly1305_blocks_avx2_3 ___ $code.=<<___ if ($avx>2); .rva .LSEH_begin_poly1305_blocks_avx512 .rva .LSEH_end_poly1305_blocks_avx512 .rva .LSEH_info_poly1305_blocks_avx512 ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_poly1305_init: .byte 9,0,0,0 .rva se_handler .rva 
.LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init .LSEH_info_poly1305_blocks: .byte 9,0,0,0 .rva se_handler .rva .Lblocks_body,.Lblocks_epilogue .LSEH_info_poly1305_emit: .byte 9,0,0,0 .rva se_handler .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit ___ $code.=<<___ if ($avx); .LSEH_info_poly1305_blocks_avx_1: .byte 9,0,0,0 .rva se_handler .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] .LSEH_info_poly1305_blocks_avx_2: .byte 9,0,0,0 .rva se_handler .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] .LSEH_info_poly1305_blocks_avx_3: .byte 9,0,0,0 .rva avx_handler .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] .LSEH_info_poly1305_emit_avx: .byte 9,0,0,0 .rva se_handler .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx ___ $code.=<<___ if ($avx>1); .LSEH_info_poly1305_blocks_avx2_1: .byte 9,0,0,0 .rva se_handler .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] .LSEH_info_poly1305_blocks_avx2_2: .byte 9,0,0,0 .rva se_handler .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] .LSEH_info_poly1305_blocks_avx2_3: .byte 9,0,0,0 .rva avx_handler .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] ___ $code.=<<___ if ($avx>2); .LSEH_info_poly1305_blocks_avx512: .byte 9,0,0,0 .rva avx_handler .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] ___ } foreach (split('\n',$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/%r([a-z]+)#d/%e$1/g; s/%r([0-9]+)#d/%r$1d/g; s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/sha/asm/sha1-586.pl =================================================================== --- head/crypto/openssl/crypto/sha/asm/sha1-586.pl (revision 364821) +++ head/crypto/openssl/crypto/sha/asm/sha1-586.pl (revision 364822) @@ -1,1491 +1,1491 @@ #! /usr/bin/env perl # Copyright 1998-2020 The OpenSSL Project Authors. All Rights Reserved. 
# # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # [Re]written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # "[Re]written" was achieved in two major overhauls. In 2004 BODY_* # functions were re-implemented to address P4 performance issue [see # commentary below], and in 2006 the rest was rewritten in order to # gain freedom to liberate licensing terms. # January, September 2004. # # It was noted that Intel IA-32 C compiler generates code which # performs ~30% *faster* on P4 CPU than original *hand-coded* # SHA1 assembler implementation. To address this problem (and # prove that humans are still better than machines:-), the # original code was overhauled, which resulted in following # performance changes: # # compared with original compared with Intel cc # assembler impl. generated code # Pentium -16% +48% # PIII/AMD +8% +16% # P4 +85%(!) +45% # # As you can see Pentium came out as loser:-( Yet I reckoned that # improvement on P4 outweighs the loss and incorporated this # re-tuned code into 0.9.7 and later. # ---------------------------------------------------------------- # August 2009. # # George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as # '(c&d) + (b&(c^d))', which allows to accumulate partial results # and lighten "pressure" on scratch registers. This resulted in # >12% performance improvement on contemporary AMD cores (with no # degradation on other CPUs:-).
Also, the code was revised to maximize # "distance" between instructions producing input to 'lea' instruction # and the 'lea' instruction itself, which is essential for Intel Atom # core and resulted in ~15% improvement. # October 2010. # # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it # is to offload message schedule denoted by Wt in NIST specification, # or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel, # and in SSE2 context was first explored by Dean Gaudet in 2004, see # http://arctic.org/~dean/crypto/sha1.html. Since then several things # have changed that made it interesting again: # # a) XMM units became faster and wider; # b) instruction set became more versatile; # c) an important observation was made by Max Locktyukhin, which made # it possible to reduce amount of instructions required to perform # the operation in question, for further details see # http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/. # April 2011. # # Add AVX code path, probably most controversial... The thing is that # switch to AVX alone improves performance by as little as 4% in # comparison to SSSE3 code path. But below result doesn't look like # 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as # pair of µ-ops, and it's the additional µ-ops, two per round, that # make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded # as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with # equivalent 'sh[rl]d' that is responsible for the impressive 5.1 # cycles per processed byte. But 'sh[rl]d' is not something that used # to be fast, nor does it appear to be fast in upcoming Bulldozer # [according to its optimization manual]. Which is why AVX code path # is guarded by *both* AVX and synthetic bit denoting Intel CPUs. # One can argue that it's unfair to AMD, but without 'sh[rl]d' it # makes no sense to keep the AVX code path.
If somebody feels that # strongly, it's probably more appropriate to discuss possibility of # using vector rotate XOP on AMD... # March 2014. # # Add support for Intel SHA Extensions. ###################################################################### # Current performance is summarized in following table. Numbers are # CPU clock cycles spent to process single byte (less is better). # # x86 SSSE3 AVX # Pentium 15.7 - # PIII 11.5 - # P4 10.6 - # AMD K8 7.1 - # Core2 7.3 6.0/+22% - # Westmere 7.3 5.5/+33% - # Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73% # Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% # Haswell 6.5 4.3/+51% 4.1(**)/+58% # Skylake 6.4 4.1/+55% 4.1(**)/+55% # Bulldozer 11.6 6.0/+92% # VIA Nano 10.6 7.5/+41% # Atom 12.5 9.3(*)/+35% # Silvermont 14.5 9.9(*)/+46% # Goldmont 8.8 6.7/+30% 1.7(***)/+415% # # (*) Loop is 1056 instructions long and expected result is ~8.25. # The discrepancy is because of front-end limitations, so # called MS-ROM penalties, and on Silvermont even rotate's # limited parallelism. # # (**) As per above comment, the result is for AVX *plus* sh[rl]d. 
# # (***) SHAEXT result $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; $output=pop; open STDOUT,">$output"; &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $xmm=$ymm=0; for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } $ymm=1 if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/ && $1>=2.19); # first version supporting AVX $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && $1>=2.03); # first version supporting AVX $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" && `ml 2>&1` =~ /Version ([0-9]+)\./ && $1>=10); # first version supporting AVX -$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ && +$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ && $2>=3.0); # first version supporting AVX $shaext=$xmm; ### set to zero if compiling for 1.0.1 &external_label("OPENSSL_ia32cap_P") if ($xmm); $A="eax"; $B="ebx"; $C="ecx"; $D="edx"; $E="edi"; $T="esi"; $tmp1="ebp"; @V=($A,$B,$C,$D,$E,$T); $alt=0; # 1 denotes alternative IALU implementation, which performs # 8% *worse* on P4, same on Westmere and Atom, 2% better on # Sandy Bridge... sub BODY_00_15 { local($n,$a,$b,$c,$d,$e,$f)=@_; &comment("00_15 $n"); &mov($f,$c); # f to hold F_00_19(b,c,d) if ($n==0) { &mov($tmp1,$a); } else { &mov($a,$tmp1); } &rotl($tmp1,5); # tmp1=ROTATE(a,5) &xor($f,$d); &add($tmp1,$e); # tmp1+=e; &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded # with xi, also note that e becomes # f in next round... 
&and($f,$b); &rotr($b,2); # b=ROTATE(b,30) &xor($f,$d); # f holds F_00_19(b,c,d) &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round &add($f,$tmp1); } # f+=tmp1 else { &add($tmp1,$f); } # f becomes a in next round &mov($tmp1,$a) if ($alt && $n==15); } sub BODY_16_19 { local($n,$a,$b,$c,$d,$e,$f)=@_; &comment("16_19 $n"); if ($alt) { &xor($c,$d); &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d &xor($f,&swtmp(($n+8)%16)); &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &add($e,$tmp1); # e+=F_00_19(b,c,d) &xor($c,$d); # restore $c &mov($tmp1,$a); # b in next round &rotr($b,$n==16?2:7); # b=ROTATE(b,30) &mov(&swtmp($n%16),$f); # xi=f &rotl($a,5); # ROTATE(a,5) &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round &add($f,$a); # f+=ROTATE(a,5) } else { &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d) &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$d); &xor($f,&swtmp(($n+8)%16)); &and($tmp1,$b); &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) &add($e,$tmp1); # e+=F_00_19(b,c,d) &mov($tmp1,$a); &rotr($b,2); # b=ROTATE(b,30) &mov(&swtmp($n%16),$f); # xi=f &rotl($tmp1,5); # ROTATE(a,5) &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round &add($f,$tmp1); # f+=ROTATE(a,5) } } sub BODY_20_39 { local($n,$a,$b,$c,$d,$e,$f)=@_; local $K=($n<40)?0x6ed9eba1:0xca62c1d6; &comment("20_39 $n"); if ($alt) { &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) &xor($f,&swtmp(($n+8)%16)); &add($e,$tmp1); # e+=F_20_39(b,c,d) &xor($f,&swtmp(($n+13)%16)); # f holds 
xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &mov($tmp1,$a); # b in next round &rotr($b,7); # b=ROTATE(b,30) &mov(&swtmp($n%16),$f) if($n<77);# xi=f &rotl($a,5); # ROTATE(a,5) &xor($b,$c) if($n==39);# warm up for BODY_40_59 &and($tmp1,$b) if($n==39); &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round &add($f,$a); # f+=ROTATE(a,5) &rotr($a,5) if ($n==79); } else { &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d) &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$c); &xor($f,&swtmp(($n+8)%16)); &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &add($e,$tmp1); # e+=F_20_39(b,c,d) &rotr($b,2); # b=ROTATE(b,30) &mov($tmp1,$a); &rotl($tmp1,5); # ROTATE(a,5) &mov(&swtmp($n%16),$f) if($n<77);# xi=f &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round &add($f,$tmp1); # f+=ROTATE(a,5) } } sub BODY_40_59 { local($n,$a,$b,$c,$d,$e,$f)=@_; &comment("40_59 $n"); if ($alt) { &add($e,$tmp1); # e+=b&(c^d) &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &mov($tmp1,$d); &xor($f,&swtmp(($n+8)%16)); &xor($c,$d); # restore $c &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &and($tmp1,$c); &rotr($b,7); # b=ROTATE(b,30) &add($e,$tmp1); # e+=c&d &mov($tmp1,$a); # b in next round &mov(&swtmp($n%16),$f); # xi=f &rotl($a,5); # ROTATE(a,5) &xor($b,$c) if ($n<59); &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d) &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d)) &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round &add($f,$a); # f+=ROTATE(a,5) } else { &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d) &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$d); &xor($f,&swtmp(($n+8)%16)); &and($tmp1,$b); &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &add($tmp1,$e); # b&(c^d)+=e 
&rotr($b,2); # b=ROTATE(b,30) &mov($e,$a); # e becomes volatile &rotl($e,5); # ROTATE(a,5) &mov(&swtmp($n%16),$f); # xi=f &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d)) &mov($tmp1,$c); &add($f,$e); # f+=ROTATE(a,5) &and($tmp1,$d); &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round &add($f,$tmp1); # f+=c&d } } &function_begin("sha1_block_data_order"); if ($xmm) { &static_label("shaext_shortcut") if ($shaext); &static_label("ssse3_shortcut"); &static_label("avx_shortcut") if ($ymm); &static_label("K_XX_XX"); &call (&label("pic_point")); # make it PIC! &set_label("pic_point"); &blindpop($tmp1); &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point")); &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); &mov ($A,&DWP(0,$T)); &mov ($D,&DWP(4,$T)); &test ($D,1<<9); # check SSSE3 bit &jz (&label("x86")); &mov ($C,&DWP(8,$T)); &test ($A,1<<24); # check FXSR bit &jz (&label("x86")); if ($shaext) { &test ($C,1<<29); # check SHA bit &jnz (&label("shaext_shortcut")); } if ($ymm) { &and ($D,1<<28); # mask AVX bit &and ($A,1<<30); # mask "Intel CPU" bit &or ($A,$D); &cmp ($A,1<<28|1<<30); &je (&label("avx_shortcut")); } &jmp (&label("ssse3_shortcut")); &set_label("x86",16); } &mov($tmp1,&wparam(0)); # SHA_CTX *c &mov($T,&wparam(1)); # const void *input &mov($A,&wparam(2)); # size_t num &stack_push(16+3); # allocate X[16] &shl($A,6); &add($A,$T); &mov(&wparam(2),$A); # pointer beyond the end of input &mov($E,&DWP(16,$tmp1));# pre-load E &jmp(&label("loop")); &set_label("loop",16); # copy input chunk to X, but reversing byte order! 
for ($i=0; $i<16; $i+=4) { &mov($A,&DWP(4*($i+0),$T)); &mov($B,&DWP(4*($i+1),$T)); &mov($C,&DWP(4*($i+2),$T)); &mov($D,&DWP(4*($i+3),$T)); &bswap($A); &bswap($B); &bswap($C); &bswap($D); &mov(&swtmp($i+0),$A); &mov(&swtmp($i+1),$B); &mov(&swtmp($i+2),$C); &mov(&swtmp($i+3),$D); } &mov(&wparam(1),$T); # redundant in 1st spin &mov($A,&DWP(0,$tmp1)); # load SHA_CTX &mov($B,&DWP(4,$tmp1)); &mov($C,&DWP(8,$tmp1)); &mov($D,&DWP(12,$tmp1)); # E is pre-loaded for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } for(;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); } for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check &mov($tmp1,&wparam(0)); # re-load SHA_CTX* &mov($D,&wparam(1)); # D is last "T" and is discarded &add($E,&DWP(0,$tmp1)); # E is last "A"... &add($T,&DWP(4,$tmp1)); &add($A,&DWP(8,$tmp1)); &add($B,&DWP(12,$tmp1)); &add($C,&DWP(16,$tmp1)); &mov(&DWP(0,$tmp1),$E); # update SHA_CTX &add($D,64); # advance input pointer &mov(&DWP(4,$tmp1),$T); &cmp($D,&wparam(2)); # have we reached the end yet? &mov(&DWP(8,$tmp1),$A); &mov($E,$C); # C is last "E" which needs to be "pre-loaded" &mov(&DWP(12,$tmp1),$B); &mov($T,$D); # input pointer &mov(&DWP(16,$tmp1),$C); &jb(&label("loop")); &stack_pop(16+3); &function_end("sha1_block_data_order"); if ($xmm) { if ($shaext) { ###################################################################### # Intel SHA Extensions implementation of SHA1 update function. 
#

# Register allocation for the SHA Extensions code path.  These are
# lexicals of the enclosing block, so $E below names an XMM register
# and shadows the integer-path $E for the remainder of this block.
my ($ctx,$inp,$num)=("edi","esi","ecx");
my ($ABCD,$E,$E_,$BSWAP)=map("xmm$_",(0..3));
my @MSG=map("xmm$_",(4..7));

# Emit "sha1rnds4 $dst,$src,$imm" as raw opcode bytes through
# &data_byte: 0x0f,0x3a,0xcc, a register-direct ModR/M byte built from
# the two register numbers, then the immediate.  Only xmm0-xmm7
# register pairs are recognised; any other operand combination emits
# nothing at all.
sub sha1rnds4 {
	my ($dst,$src,$imm)=@_;

	if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) {
		&data_byte(0x0f,0x3a,0xcc,0xc0|($1<<3)|$2,$imm);
	}
}

# Common encoder for the 0x0f,0x38-prefixed SHA1 instructions.
# $opcodelet is the third opcode byte; operands are restricted to
# xmm0-xmm7 register pairs exactly as in sha1rnds4 above.
sub sha1op38 {
	my ($opcodelet,$dst,$src)=@_;

	if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) {
		&data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);
	}
}

# Thin wrappers selecting the third opcode byte for sha1op38.
sub sha1nexte	{ sha1op38(0xc8,@_); }
sub sha1msg1	{ sha1op38(0xc9,@_); }
sub sha1msg2	{ sha1op38(0xca,@_); }

# _sha1_block_data_order_shaext(ctx, inp, num)
#
# When called through its own entry point the prologue computes the
# address of K_XX_XX into $tmp1 position-independently.  The
# "shaext_shortcut" label is the target sha1_block_data_order jumps to
# after it has already loaded $tmp1 the same way.
&function_begin("_sha1_block_data_order_shaext");
	&call	(&label("pic_point"));	# make it PIC!
	&set_label("pic_point");
	&blindpop($tmp1);
	&lea	($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1));
&set_label("shaext_shortcut");
	&mov	($ctx,&wparam(0));
	&mov	("ebx","esp");			# keep original %esp, restored on exit
	&mov	($inp,&wparam(1));
	&mov	($num,&wparam(2));
	&sub	("esp",32);			# two 16-byte slots: saved $E and $ABCD

	&movdqu	($ABCD,&QWP(0,$ctx));
	&movd	($E,&DWP(16,$ctx));
	&and	("esp",-32);			# align the scratch area for movdqa
	&movdqa	($BSWAP,&QWP(0x50,$tmp1));	# byte-n-word swap

	&movdqu	(@MSG[0],&QWP(0,$inp));
	&pshufd	($ABCD,$ABCD,0b00011011);	# flip word order
	&movdqu	(@MSG[1],&QWP(0x10,$inp));
	&pshufd	($E,$E,0b00011011);		# flip word order
	&movdqu	(@MSG[2],&QWP(0x20,$inp));
	&pshufb	(@MSG[0],$BSWAP);
	&movdqu	(@MSG[3],&QWP(0x30,$inp));
	&pshufb	(@MSG[1],$BSWAP);
	&pshufb	(@MSG[2],$BSWAP);
	&pshufb	(@MSG[3],$BSWAP);
	&jmp	(&label("loop_shaext"));

&set_label("loop_shaext",16);
	# The flags produced by this dec are consumed twice: by the cmovne
	# just below and by the jnz that closes the loop.
	# NOTE(review): this relies on none of the instructions emitted in
	# between modifying EFLAGS - confirm if new instructions are added.
	&dec		($num);
	&lea		("eax",&DWP(0x40,$inp));
	&movdqa		(&QWP(0,"esp"),$E);	# offload $E
	&paddd		($E,@MSG[0]);
	# Advance $inp only when more blocks remain; on the final pass the
	# look-ahead loads at the bottom re-read from the unchanged $inp.
	&cmovne		($inp,"eax");
	&movdqa		(&QWP(16,"esp"),$ABCD);	# offload $ABCD

# Generator-time loop: 8 iterations, each emitting two sha1rnds4 (four
# rounds apiece), i.e. rounds 0-63.  The immediate int($i/5) resp.
# int(($i+1)/5) steps through 0..3 as the round groups advance.
for($i=0;$i<20-4;$i+=2) {
	&sha1msg1	(@MSG[0],@MSG[1]);
	&movdqa		($E_,$ABCD);
	&sha1rnds4	($ABCD,$E,int($i/5));	# 0-3...
	&sha1nexte	($E_,@MSG[1]);
	&pxor		(@MSG[0],@MSG[2]);
	&sha1msg1	(@MSG[1],@MSG[2]);
	&sha1msg2	(@MSG[0],@MSG[3]);

	&movdqa		($E,$ABCD);
	&sha1rnds4	($ABCD,$E_,int(($i+1)/5));
	&sha1nexte	($E,@MSG[2]);
	&pxor		(@MSG[1],@MSG[3]);
	&sha1msg2	(@MSG[1],@MSG[0]);

	# rotate the message register ring by two positions
	push(@MSG,shift(@MSG));
	push(@MSG,shift(@MSG));
}
	# Rounds 64-79, interleaved with loading and byte-swapping the
	# 64 bytes at $inp for the next pass.
	&movdqu		(@MSG[0],&QWP(0,$inp));
	&movdqa		($E_,$ABCD);
	&sha1rnds4	($ABCD,$E,3);		# 64-67
	&sha1nexte	($E_,@MSG[1]);
	&movdqu		(@MSG[1],&QWP(0x10,$inp));
	&pshufb		(@MSG[0],$BSWAP);

	&movdqa		($E,$ABCD);
	&sha1rnds4	($ABCD,$E_,3);		# 68-71
	&sha1nexte	($E,@MSG[2]);
	&movdqu		(@MSG[2],&QWP(0x20,$inp));
	&pshufb		(@MSG[1],$BSWAP);

	&movdqa		($E_,$ABCD);
	&sha1rnds4	($ABCD,$E,3);		# 72-75
	&sha1nexte	($E_,@MSG[3]);
	&movdqu		(@MSG[3],&QWP(0x30,$inp));
	&pshufb		(@MSG[2],$BSWAP);

	&movdqa		($E,$ABCD);
	&sha1rnds4	($ABCD,$E_,3);		# 76-79
	&movdqa		($E_,&QWP(0,"esp"));	# reload offloaded $E
	&pshufb		(@MSG[3],$BSWAP);
	&sha1nexte	($E,$E_);
	&paddd		($ABCD,&QWP(16,"esp"));	# add offloaded $ABCD

	&jnz		(&label("loop_shaext"));

	&pshufd	($ABCD,$ABCD,0b00011011);	# flip word order back
	&pshufd	($E,$E,0b00011011);
	&movdqu	(&QWP(0,$ctx),$ABCD);
	&movd	(&DWP(16,$ctx),$E);
	&mov	("esp","ebx");
&function_end("_sha1_block_data_order_shaext");
}
######################################################################
# The SSSE3 implementation.
#
# %xmm[0-7] are used as ring @X[] buffer containing quadruples of last
# 32 elements of the message schedule or Xupdate outputs. First 4
# quadruples are simply byte-swapped input, next 4 are calculated
# according to method originally suggested by Dean Gaudet (modulo
# being implemented in SSSE3). Once 8 quadruples or 32 elements are
# collected, it switches to routine proposed by Max Locktyukhin.
#
# Calculations inevitably require temporary registers, and there are
# no %xmm registers left to spare. For this reason part of the ring
# buffer, X[2..4] to be specific, is offloaded to 3 quadruples ring
# buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] -
# X[-5], and X[4] - X[-4]...
# # Another notable optimization is aggressive stack frame compression # aiming to minimize amount of 9-byte instructions... # # Yet another notable optimization is "jumping" $B variable. It means # that there is no register permanently allocated for $B value. This # allowed to eliminate one instruction from body_20_39... # my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 my @V=($A,$B,$C,$D,$E); my $j=0; # hash round my $rx=0; my @T=($T,$tmp1); my $inp; my $_rol=sub { &rol(@_) }; my $_ror=sub { &ror(@_) }; &function_begin("_sha1_block_data_order_ssse3"); &call (&label("pic_point")); # make it PIC! &set_label("pic_point"); &blindpop($tmp1); &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); &set_label("ssse3_shortcut"); &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19 &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39 &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59 &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79 &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask &mov ($E,&wparam(0)); # load argument block &mov ($inp=@T[1],&wparam(1)); &mov ($D,&wparam(2)); &mov (@T[0],"esp"); # stack frame layout # # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area # X[4]+K X[5]+K X[6]+K X[7]+K # X[8]+K X[9]+K X[10]+K X[11]+K # X[12]+K X[13]+K X[14]+K X[15]+K # # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area # X[4] X[5] X[6] X[7] # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 # # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants # K_40_59 K_40_59 K_40_59 K_40_59 # K_60_79 K_60_79 K_60_79 K_60_79 # K_00_19 K_00_19 K_00_19 K_00_19 # pbswap mask # # +192 ctx # argument block # +196 inp # +200 end # +204 esp &sub ("esp",208); &and ("esp",-64); &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants &movdqa (&QWP(112+16,"esp"),@X[5]); &movdqa (&QWP(112+32,"esp"),@X[6]); &shl ($D,6); # len*64 &movdqa (&QWP(112+48,"esp"),@X[3]); &add ($D,$inp); # end of input &movdqa (&QWP(112+64,"esp"),@X[2]); &add ($inp,64); &mov (&DWP(192+0,"esp"),$E); # 
save argument block &mov (&DWP(192+4,"esp"),$inp); &mov (&DWP(192+8,"esp"),$D); &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp &mov ($A,&DWP(0,$E)); # load context &mov ($B,&DWP(4,$E)); &mov ($C,&DWP(8,$E)); &mov ($D,&DWP(12,$E)); &mov ($E,&DWP(16,$E)); &mov (@T[0],$B); # magic seed &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] &movdqu (@X[-3&7],&QWP(-48,$inp)); &movdqu (@X[-2&7],&QWP(-32,$inp)); &movdqu (@X[-1&7],&QWP(-16,$inp)); &pshufb (@X[-4&7],@X[2]); # byte swap &pshufb (@X[-3&7],@X[2]); &pshufb (@X[-2&7],@X[2]); &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot &pshufb (@X[-1&7],@X[2]); &paddd (@X[-4&7],@X[3]); # add K_00_19 &paddd (@X[-3&7],@X[3]); &paddd (@X[-2&7],@X[3]); &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU &psubd (@X[-4&7],@X[3]); # restore X[] &movdqa (&QWP(0+16,"esp"),@X[-3&7]); &psubd (@X[-3&7],@X[3]); &movdqa (&QWP(0+32,"esp"),@X[-2&7]); &mov (@T[1],$C); &psubd (@X[-2&7],@X[3]); &xor (@T[1],$D); &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); &and (@T[0],@T[1]); &jmp (&label("loop")); ###################################################################### # SSE instruction sequence is first broken to groups of independent # instructions, independent in respect to their inputs and shifter # (not all architectures have more than one). Then IALU instructions # are "knitted in" between the SSE groups. Distance is maintained for # SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer # [which allegedly also implements SSSE3]... # # Temporary registers usage. X[2] is volatile at the entry and at the # end is restored from backtrace ring buffer. X[3] is expected to # contain current K_XX_XX constant and is used to calculate X[-1]+K # from previous round, it becomes volatile the moment the value is # saved to stack for transfer to IALU. 
X[4] becomes volatile whenever # X[-4] is accumulated and offloaded to backtrace ring buffer, at the # end it is loaded with next K_XX_XX [which becomes X[3] in next # round]... # sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)); &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); &movdqa (@X[2],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[3],@X[-1&7]); &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@X[2],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); # ror &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@X[4],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &movdqa (@X[2],@X[0]); eval(shift(@insns)); &pslldq (@X[4],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@X[2],31); eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (@X[3],@X[4]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psrld (@X[4],30); eval(shift(@insns)); eval(shift(@insns)); # ror &por (@X[0],@X[2]); # "X[0]"<<<=1 eval(shift(@insns)); &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer eval(shift(@insns)); eval(shift(@insns)); &pslld (@X[3],2); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[4]); &movdqa 
(@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2 &pshufd (@X[1],@X[-3&7],0xee) if ($Xi<7); # was &movdqa (@X[1],@X[-2&7]) &pshufd (@X[3],@X[-1&7],0xee) if ($Xi==7); eval(shift(@insns)); eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xupdate_ssse3_32_79() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); # body_20_39 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" &punpcklqdq(@X[2],@X[-1&7]); # compose "X[-6]", was &palignr(@X[2],@X[-2&7],8) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] =~ /_rol/); if ($Xi%5) { &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... } else { # ... 
or load next one &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); } eval(shift(@insns)); # ror &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (@X[2],@X[0]); &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &pslld (@X[0],2); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); &psrld (@X[2],30); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)) if (@insns[1] =~ /_rol/); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &por (@X[0],@X[2]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 eval(shift(@insns)); &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &pshufd (@X[3],@X[-1],0xee) if ($Xi<19); # was &movdqa (@X[3],@X[0]) eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xuplast_ssse3_80() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &mov ($inp=@T[1],&DWP(192+4,"esp")); &cmp ($inp,&DWP(192+8,"esp")); &je (&label("done")); &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19 &movdqa (@X[2],&QWP(112+64,"esp")); # 
pbswap mask &movdqu (@X[-4&7],&QWP(0,$inp)); # load input &movdqu (@X[-3&7],&QWP(16,$inp)); &movdqu (@X[-2&7],&QWP(32,$inp)); &movdqu (@X[-1&7],&QWP(48,$inp)); &add ($inp,64); &pshufb (@X[-4&7],@X[2]); # byte swap &mov (&DWP(192+4,"esp"),$inp); &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot $Xi=0; } sub Xloop_ssse3() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufb (@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@X[3]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@X[3]); foreach (@insns) { eval; } $Xi++; } sub Xtail_ssse3() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); foreach (@insns) { eval; } } sub body_00_19 () { # ((c^d)&b)^d # on start @T[0]=(c^d)&b return &body_20_39() if ($rx==19); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&$_ror ($b,$j?7:2);', # $b>>>2 '&xor (@T[0],$d);', '&mov (@T[1],$a);', # $b in next round '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer '&xor ($b,$c);', # $c^$d for next round '&$_rol ($a,5);', '&add ($e,@T[0]);', '&and (@T[1],$b);', # ($b&($c^$d)) for next round '&xor ($b,$c);', # restore $b '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub body_20_39 () { # b^d^c # on entry @T[0]=b^d return &body_40_59() if ($rx==39); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer '&xor (@T[0],$d) if($j==19);'. 
'&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c) '&mov (@T[1],$a);', # $b in next round '&$_rol ($a,5);', '&add ($e,@T[0]);', '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round '&$_ror ($b,7);', # $b>>>2 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub body_40_59 () { # ((b^c)&(c^d))^c # on entry @T[0]=(b^c), (c^=d) $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d) '&xor ($c,$d) if ($j>=40);', # restore $c '&$_ror ($b,7);', # $b>>>2 '&mov (@T[1],$a);', # $b for next round '&xor (@T[0],$c);', '&$_rol ($a,5);', '&add ($e,@T[0]);', '&xor (@T[1],$c) if ($j==59);'. '&xor (@T[1],$b) if ($j< 59);', # b^c for next round '&xor ($b,$c) if ($j< 59);', # c^d for next round '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } ###### sub bodyx_00_19 () { # ((c^d)&b)^d # on start @T[0]=(b&c)^(~b&d), $e+=X[]+K return &bodyx_20_39() if ($rx==19); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&rorx ($b,$b,2) if ($j==0);'. # $b>>>2 '&rorx ($b,@T[1],7) if ($j!=0);', # $b>>>2 '&lea ($e,&DWP(0,$e,@T[0]));', '&rorx (@T[0],$a,5);', '&andn (@T[1],$a,$c);', '&and ($a,$b)', '&add ($d,&DWP(4*(($j+1)&15),"esp"));', # X[]+K xfer '&xor (@T[1],$a)', '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub bodyx_20_39 () { # b^d^c # on start $b=b^c^d return &bodyx_40_59() if ($rx==39); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,($j==19?@T[0]:$b))', '&rorx ($b,@T[1],7);', # $b>>>2 '&rorx (@T[0],$a,5);', '&xor ($a,$b) if ($j<79);', '&add ($d,&DWP(4*(($j+1)&15),"esp")) if ($j<79);', # X[]+K xfer '&xor ($a,$c) if ($j<79);', '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub bodyx_40_59 () { # ((b^c)&(c^d))^c # on start $b=((b^c)&(c^d))^c return &bodyx_20_39() if ($rx==59); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. 
'&rorx (@T[0],$a,5)', '&lea ($e,&DWP(0,$e,$b))', '&rorx ($b,@T[1],7)', # $b>>>2 '&add ($d,&DWP(4*(($j+1)&15),"esp"))', # X[]+K xfer '&mov (@T[1],$c)', '&xor ($a,$b)', # b^c for next round '&xor (@T[1],$b)', # c^d for next round '&and ($a,@T[1])', '&add ($e,@T[0])', '&xor ($a,$b)' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } &set_label("loop",16); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_32_79(\&body_00_19); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_20_39); &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" $saved_j=$j; @saved_V=@V; &Xloop_ssse3(\&body_20_39); &Xloop_ssse3(\&body_20_39); &Xloop_ssse3(\&body_20_39); &mov (@T[1],&DWP(192,"esp")); # update context &add ($A,&DWP(0,@T[1])); &add (@T[0],&DWP(4,@T[1])); # $b &add ($C,&DWP(8,@T[1])); &mov (&DWP(0,@T[1]),$A); &add ($D,&DWP(12,@T[1])); &mov (&DWP(4,@T[1]),@T[0]); &add ($E,&DWP(16,@T[1])); &mov (&DWP(8,@T[1]),$C); &mov ($B,$C); &mov (&DWP(12,@T[1]),$D); &xor ($B,$D); &mov (&DWP(16,@T[1]),$E); &mov (@T[1],@T[0]); &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); &and (@T[0],$B); &mov ($B,$T[1]); &jmp (&label("loop")); &set_label("done",16); $j=$saved_j; @V=@saved_V; &Xtail_ssse3(\&body_20_39); &Xtail_ssse3(\&body_20_39); &Xtail_ssse3(\&body_20_39); &mov (@T[1],&DWP(192,"esp")); # update context &add ($A,&DWP(0,@T[1])); &mov ("esp",&DWP(192+12,"esp")); # restore %esp &add (@T[0],&DWP(4,@T[1])); # $b &add ($C,&DWP(8,@T[1])); &mov (&DWP(0,@T[1]),$A); &add ($D,&DWP(12,@T[1])); &mov (&DWP(4,@T[1]),@T[0]); &add ($E,&DWP(16,@T[1])); 
&mov (&DWP(8,@T[1]),$C); &mov (&DWP(12,@T[1]),$D); &mov (&DWP(16,@T[1]),$E); &function_end("_sha1_block_data_order_ssse3"); $rx=0; # reset if ($ymm) { my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 my @V=($A,$B,$C,$D,$E); my $j=0; # hash round my @T=($T,$tmp1); my $inp; my $_rol=sub { &shld(@_[0],@_) }; my $_ror=sub { &shrd(@_[0],@_) }; &function_begin("_sha1_block_data_order_avx"); &call (&label("pic_point")); # make it PIC! &set_label("pic_point"); &blindpop($tmp1); &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); &set_label("avx_shortcut"); &vzeroall(); &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19 &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39 &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59 &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79 &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask &mov ($E,&wparam(0)); # load argument block &mov ($inp=@T[1],&wparam(1)); &mov ($D,&wparam(2)); &mov (@T[0],"esp"); # stack frame layout # # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area # X[4]+K X[5]+K X[6]+K X[7]+K # X[8]+K X[9]+K X[10]+K X[11]+K # X[12]+K X[13]+K X[14]+K X[15]+K # # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area # X[4] X[5] X[6] X[7] # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 # # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants # K_40_59 K_40_59 K_40_59 K_40_59 # K_60_79 K_60_79 K_60_79 K_60_79 # K_00_19 K_00_19 K_00_19 K_00_19 # pbswap mask # # +192 ctx # argument block # +196 inp # +200 end # +204 esp &sub ("esp",208); &and ("esp",-64); &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants &vmovdqa(&QWP(112+16,"esp"),@X[5]); &vmovdqa(&QWP(112+32,"esp"),@X[6]); &shl ($D,6); # len*64 &vmovdqa(&QWP(112+48,"esp"),@X[3]); &add ($D,$inp); # end of input &vmovdqa(&QWP(112+64,"esp"),@X[2]); &add ($inp,64); &mov (&DWP(192+0,"esp"),$E); # save argument block &mov (&DWP(192+4,"esp"),$inp); &mov (&DWP(192+8,"esp"),$D); &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp &mov ($A,&DWP(0,$E)); # load 
context &mov ($B,&DWP(4,$E)); &mov ($C,&DWP(8,$E)); &mov ($D,&DWP(12,$E)); &mov ($E,&DWP(16,$E)); &mov (@T[0],$B); # magic seed &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] &vmovdqu(@X[-3&7],&QWP(-48,$inp)); &vmovdqu(@X[-2&7],&QWP(-32,$inp)); &vmovdqu(@X[-1&7],&QWP(-16,$inp)); &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap &vpshufb(@X[-3&7],@X[-3&7],@X[2]); &vpshufb(@X[-2&7],@X[-2&7],@X[2]); &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot &vpshufb(@X[-1&7],@X[-1&7],@X[2]); &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19 &vpaddd (@X[1],@X[-3&7],@X[3]); &vpaddd (@X[2],@X[-2&7],@X[3]); &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU &mov (@T[1],$C); &vmovdqa(&QWP(0+16,"esp"),@X[1]); &xor (@T[1],$D); &vmovdqa(&QWP(0+32,"esp"),@X[2]); &and (@T[0],@T[1]); &jmp (&label("loop")); sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[3],@X[3],@X[-1&7]); &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer eval(shift(@insns)); eval(shift(@insns)); &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrld (@X[2],@X[0],31); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword &vpaddd 
(@X[0],@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrld (@X[3],@X[4],30); &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslld (@X[4],@X[4],2); &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[3]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2 eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xupdate_avx_32_79() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]" &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer eval(shift(@insns)); eval(shift(@insns)); if ($Xi%5) { &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... } else { # ... 
or load next one &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); } &vpaddd (@X[3],@X[3],@X[-1&7]); eval(shift(@insns)); # ror eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &vpsrld (@X[2],@X[0],30); &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); &vpslld (@X[0],@X[0],2); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 eval(shift(@insns)); &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xuplast_avx_80() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); &vpaddd (@X[3],@X[3],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &mov ($inp=@T[1],&DWP(192+4,"esp")); &cmp ($inp,&DWP(192+8,"esp")); &je (&label("done")); &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19 &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input &vmovdqu(@X[-3&7],&QWP(16,$inp)); &vmovdqu(@X[-2&7],&QWP(32,$inp)); &vmovdqu(@X[-1&7],&QWP(48,$inp)); &add ($inp,64); &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap &mov (&DWP(192+4,"esp"),$inp); &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last 
backtrace slot $Xi=0; } sub Xloop_avx() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); foreach (@insns) { eval; } $Xi++; } sub Xtail_avx() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); foreach (@insns) { eval; } } &set_label("loop",16); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_32_79(\&body_00_19); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_20_39); &Xuplast_avx_80(\&body_20_39); # can jump to "done" $saved_j=$j; @saved_V=@V; &Xloop_avx(\&body_20_39); &Xloop_avx(\&body_20_39); &Xloop_avx(\&body_20_39); &mov (@T[1],&DWP(192,"esp")); # update context &add ($A,&DWP(0,@T[1])); &add (@T[0],&DWP(4,@T[1])); # $b &add ($C,&DWP(8,@T[1])); &mov (&DWP(0,@T[1]),$A); &add ($D,&DWP(12,@T[1])); &mov (&DWP(4,@T[1]),@T[0]); &add ($E,&DWP(16,@T[1])); &mov ($B,$C); &mov (&DWP(8,@T[1]),$C); &xor ($B,$D); &mov (&DWP(12,@T[1]),$D); &mov (&DWP(16,@T[1]),$E); &mov (@T[1],@T[0]); &and (@T[0],$B); &mov ($B,@T[1]); &jmp (&label("loop")); &set_label("done",16); $j=$saved_j; @V=@saved_V; &Xtail_avx(\&body_20_39); &Xtail_avx(\&body_20_39); &Xtail_avx(\&body_20_39); &vzeroall(); &mov 
(@T[1],&DWP(192,"esp")); # update context &add ($A,&DWP(0,@T[1])); &mov ("esp",&DWP(192+12,"esp")); # restore %esp &add (@T[0],&DWP(4,@T[1])); # $b &add ($C,&DWP(8,@T[1])); &mov (&DWP(0,@T[1]),$A); &add ($D,&DWP(12,@T[1])); &mov (&DWP(4,@T[1]),@T[0]); &add ($E,&DWP(16,@T[1])); &mov (&DWP(8,@T[1]),$C); &mov (&DWP(12,@T[1]),$D); &mov (&DWP(16,@T[1]),$E); &function_end("_sha1_block_data_order_avx"); } &set_label("K_XX_XX",64); &data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19 &data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39 &data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59 &data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79 &data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask &data_byte(0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0); } &asciz("SHA1 block transform for x86, CRYPTOGAMS by "); &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/sha/asm/sha1-mb-x86_64.pl =================================================================== --- head/crypto/openssl/crypto/sha/asm/sha1-mb-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/sha/asm/sha1-mb-x86_64.pl (revision 364822) @@ -1,1628 +1,1628 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. 
# ==================================================================== # Multi-buffer SHA1 procedure processes n buffers in parallel by # placing buffer data to designated lane of SIMD register. n is # naturally limited to 4 on pre-AVX2 processors and to 8 on # AVX2-capable processors such as Haswell. # # this +aesni(i) sha1 aesni-sha1 gain(iv) # ------------------------------------------------------------------- # Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68% # Atom(ii) 18.1/n +3.93=8.46(n=4) 9.37 12.8 +51% # Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80% # Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68% # Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160% # Skylake (8.70 +5.00=13.7)/n 3.64 4.20 +145% # Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64% # # (i) multi-block CBC encrypt with 128-bit key; # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, # because of lower AES-NI instruction throughput; # (iii) "this" is for n=8, when we gather twice as much data, result # for n=4 is 8.00+4.44=12.4; # (iv) presented improvement coefficients are asymptotic limits and # in real-life application are somewhat lower, e.g. 
for 2KB # fragments they range from 30% to 100% (on Haswell); $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; $avx=0; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void sha1_multi_block ( # struct { unsigned int A[8]; # unsigned int B[8]; # unsigned int C[8]; # unsigned int D[8]; # unsigned int E[8]; } *ctx, # struct { void *ptr; int blocks; } inp[8], # int num); /* 1 or 2 */ # $ctx="%rdi"; # 1st arg $inp="%rsi"; # 2nd arg $num="%edx"; @ptr=map("%r$_",(8..11)); $Tbl="%rbp"; @V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4)); ($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9)); @Xi=map("%xmm$_",(10..14)); $K="%xmm15"; if (1) { # Atom-specific optimization aiming to eliminate pshufb with high # registers [and thus get rid of 48 cycles accumulated penalty] @Xi=map("%xmm$_",(0..4)); ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9)); @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14)); } $REG_SZ=16; sub Xi_off { my $off = shift; $off %= 16; $off *= $REG_SZ; $off<256 ? 
"$off-128(%rax)" : "$off-256-128(%rbx)";
}

# BODY_00_19($i,$a,$b,$c,$d,$e)
#
# Appends the assembly for one Ch(b,c,d) round to $code.  $i is the round
# number (the driver loop in this file calls it for 0..19) and $a..$e are
# the names of the SIMD registers currently holding the working variables.
# Depending on $i the round is combined with one of three message-word
# activities:
#   $i==0     - additionally gathers word 0 from the four input pointers;
#   $i<=14    - gathers the input words needed by the following rounds;
#   $i>=15    - computes the message schedule update ("Xupdate") instead.
# The @Xi register window is rotated by one position before returning.
sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;	# index of the message word completed during this round
my $k=$i+2;	# index of the word whose loads are started early
		# (see the distribution table below)

# Loads are performed 2+3/4 iterations in advance. 3/4 means that out
# of 4 words you would expect to be loaded per given iteration one is
# spilled to next iteration. In other words indices in four input
# streams are distributed as following:
#
# $i==0:	0,0,0,0,1,1,1,1,2,2,2,
# $i==1:	2,3,3,3,
# $i==2:	3,4,4,4,
# ...
# $i==13:	14,15,15,15,
# $i==14:	15
#
# Then at $i==15 Xupdate is applied one iteration in advance...

# First round only: read word 0 through each of the four pointers, advance
# every pointer by 16*4 bytes, interleave the four words into @Xi[0] and
# shuffle it with the mask held in $tx.  Loads of word $j are started too.
$code.=<<___ if ($i==0);
	movd	(@ptr[0]),@Xi[0]
	lea	`16*4`(@ptr[0]),@ptr[0]
	movd	(@ptr[1]),@Xi[2]	# borrow @Xi[2]
	lea	`16*4`(@ptr[1]),@ptr[1]
	movd	(@ptr[2]),@Xi[3]	# borrow @Xi[3]
	lea	`16*4`(@ptr[2]),@ptr[2]
	movd	(@ptr[3]),@Xi[4]	# borrow @Xi[4]
	lea	`16*4`(@ptr[3]),@ptr[3]
	punpckldq	@Xi[3],@Xi[0]
	movd	`4*$j-16*4`(@ptr[0]),@Xi[1]
	punpckldq	@Xi[4],@Xi[2]
	movd	`4*$j-16*4`(@ptr[1]),$t3
	punpckldq	@Xi[2],@Xi[0]
	movd	`4*$j-16*4`(@ptr[2]),$t2
	pshufb	$tx,@Xi[0]
___
# Rounds 0..13: round arithmetic interleaved with finishing the gather of
# word $j and starting the loads of word $k.
$code.=<<___ if ($i<14);			# just load input
	movd	`4*$j-16*4`(@ptr[3]),$t1
	punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e			# e+=K_00_19
	punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	pandn	$d,$t1
	pand	$c,$t0
	punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3
	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e		# e+=X[i]
	movd	`4*$k-16*4`(@ptr[0]),@Xi[2]
	psrld	\$27,$t3
	pxor	$t1,$t0			# Ch(b,c,d)
	movdqa	$b,$t1
	por	$t3,$t2			# rol(a,5)
	movd	`4*$k-16*4`(@ptr[1]),$t3
	pslld	\$30,$t1
	paddd	$t0,$e			# e+=Ch(b,c,d)
	psrld	\$2,$b
	paddd	$t2,$e			# e+=rol(a,5)
	pshufb	$tx,@Xi[1]
	movd	`4*$k-16*4`(@ptr[2]),$t2
	por	$t1,$b			# b=rol(b,30)
___
# Round 14: same arithmetic, but no further word is started; the freed
# slots carry prefetches of the data behind each input pointer instead.
$code.=<<___ if ($i==14);			# just load input
	movd	`4*$j-16*4`(@ptr[3]),$t1
	punpckldq	$t2,@Xi[1]
	movdqa	$a,$t2
	paddd	$K,$e			# e+=K_00_19
	punpckldq	$t1,$t3
	movdqa	$b,$t1
	movdqa	$b,$t0
	pslld	\$5,$t2
	prefetcht0	63(@ptr[0])
	pandn	$d,$t1
	pand	$c,$t0
	punpckldq	$t3,@Xi[1]
	movdqa	$a,$t3
	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e		# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0			# Ch(b,c,d)
	movdqa	$b,$t1
	prefetcht0	63(@ptr[1])
	por	$t3,$t2			# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e			# e+=Ch(b,c,d)
	prefetcht0	63(@ptr[2])
	psrld	\$2,$b
	paddd	$t2,$e			# e+=rol(a,5)
	pshufb	$tx,@Xi[1]
	prefetcht0	63(@ptr[3])
	por	$t1,$b			# b=rol(b,30)
___
# Rounds 13 and 14 reload a previously stored word into @Xi[3] so that the
# Xupdate starting at round 15 finds it in a register.
$code.=<<___ if ($i>=13 && $i<15);
	movdqa	`&Xi_off($j+2)`,@Xi[3]	# preload "X[2]"
___
# Rounds 15 and later: the schedule word for the next round is derived in
# @Xi[1] (xor of earlier words, then rotate left by 1) while the current
# round is computed.  Note that $tx is used as scratch here.
$code.=<<___ if ($i>=15);			# apply Xupdate
	pxor	@Xi[-2],@Xi[1]		# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"
	movdqa	$a,$t2
	pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e			# e+=K_00_19
	movdqa	$b,$t1
	pslld	\$5,$t2
	pxor	@Xi[3],@Xi[1]
	movdqa	$b,$t0
	pandn	$d,$t1
	movdqa	@Xi[1],$tx
	pand	$c,$t0
	movdqa	$a,$t3
	psrld	\$31,$tx
	paddd	@Xi[1],@Xi[1]
	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e		# e+=X[i]
	psrld	\$27,$t3
	pxor	$t1,$t0			# Ch(b,c,d)
	movdqa	$b,$t1
	por	$t3,$t2			# rol(a,5)
	pslld	\$30,$t1
	paddd	$t0,$e			# e+=Ch(b,c,d)
	psrld	\$2,$b
	paddd	$t2,$e			# e+=rol(a,5)
	por	$tx,@Xi[1]		# rol	\$1,@Xi[1]
	por	$t1,$b			# b=rol(b,30)
___
push(@Xi,shift(@Xi));	# rotate the @Xi window: next round's word becomes @Xi[0]
}

# BODY_20_39($i,$a,$b,$c,$d,$e)
#
# Appends the assembly for one Parity(b,c,d) round to $code; the driver
# loop in this file uses it for rounds 20..39 and again for 60..79.
# For $i<79 the round is interleaved with the schedule update of @Xi[1];
# the store of @Xi[0] to its Xi_off($i) slot is emitted only for $i<72;
# round 79 is emitted as plain round arithmetic without any update.
# The @Xi register window is rotated by one position before returning.
sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;	# index of the schedule word being computed for the next round

# Opening half of a regular round (everything except the last one).
$code.=<<___ if ($i<79);
	pxor	@Xi[-2],@Xi[1]		# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"
	movdqa	$a,$t2
	movdqa	$d,$t0
	pxor	`&Xi_off($j+8)`,@Xi[1]
	paddd	$K,$e			# e+=K_20_39
	pslld	\$5,$t2
	pxor	$b,$t0
	movdqa	$a,$t3
___
# The current word is written back to the stack only while $i<72.
$code.=<<___ if ($i<72);
	movdqa	@Xi[0],`&Xi_off($i)`
___
# Closing half of a regular round.
$code.=<<___ if ($i<79);
	paddd	@Xi[0],$e		# e+=X[i]
	pxor	@Xi[3],@Xi[1]
	psrld	\$27,$t3
	pxor	$c,$t0			# Parity(b,c,d)
	movdqa	$b,$t1
	pslld	\$30,$t1
	movdqa	@Xi[1],$tx
	por	$t3,$t2			# rol(a,5)
	psrld	\$31,$tx
	paddd	$t0,$e			# e+=Parity(b,c,d)
	paddd	@Xi[1],@Xi[1]
	psrld	\$2,$b
	paddd	$t2,$e			# e+=rol(a,5)
	por	$tx,@Xi[1]		# rol(@Xi[1],1)
	por	$t1,$b			# b=rol(b,30)
___
# Final round: round arithmetic only.
$code.=<<___ if ($i==79);
	movdqa	$a,$t2
	paddd	$K,$e			# e+=K_20_39
	movdqa	$d,$t0
	pslld	\$5,$t2
	pxor	$b,$t0
	movdqa	$a,$t3
	paddd	@Xi[0],$e		# e+=X[i]
	psrld	\$27,$t3
	movdqa	$b,$t1
	pxor	$c,$t0			# Parity(b,c,d)
	pslld	\$30,$t1
	por	$t3,$t2			# rol(a,5)
	paddd	$t0,$e			# e+=Parity(b,c,d)
	psrld	\$2,$b
	paddd	$t2,$e			# e+=rol(a,5)
	por	$t1,$b			# b=rol(b,30)
___
push(@Xi,shift(@Xi));	# rotate the @Xi window: next round's word becomes @Xi[0]
}

sub BODY_40_59 {
my
($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;	# index of the schedule word being computed for the next round

# One template serves every round this sub is called for (the driver loop
# below uses it for rounds 40..59): the schedule update of @Xi[1] is
# interleaved with the majority function, accumulated into $e in two
# parts, (c&d) and (b&(c^d)).
$code.=<<___;
	pxor	@Xi[-2],@Xi[1]		# "X[13]"
	movdqa	`&Xi_off($j+2)`,@Xi[3]	# "X[2]"
	movdqa	$a,$t2
	movdqa	$d,$t1
	pxor	`&Xi_off($j+8)`,@Xi[1]
	pxor	@Xi[3],@Xi[1]
	paddd	$K,$e			# e+=K_40_59
	pslld	\$5,$t2
	movdqa	$a,$t3
	pand	$c,$t1
	movdqa	$d,$t0
	movdqa	@Xi[1],$tx
	psrld	\$27,$t3
	paddd	$t1,$e
	pxor	$c,$t0
	movdqa	@Xi[0],`&Xi_off($i)`
	paddd	@Xi[0],$e		# e+=X[i]
	por	$t3,$t2			# rol(a,5)
	psrld	\$31,$tx
	pand	$b,$t0
	movdqa	$b,$t1
	pslld	\$30,$t1
	paddd	@Xi[1],@Xi[1]
	paddd	$t0,$e			# e+=Maj(b,d,c)
	psrld	\$2,$b
	paddd	$t2,$e			# e+=rol(a,5)
	por	$tx,@Xi[1]		# rol(@Xi[1],1)
	por	$t1,$b			# b=rol(b,30)
___
push(@Xi,shift(@Xi));	# rotate the @Xi window: next round's word becomes @Xi[0]
}

# ----------------------------------------------------------------------
# sha1_multi_block: entry point.  The capability word is tested first and
# control leaves for the SHA-extension path when bit 61 is set.
# ----------------------------------------------------------------------
$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P
.globl	sha1_multi_block
.type	sha1_multi_block,\@function,3
.align	32
sha1_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
# The AVX dispatch is emitted only when the assembler probe set $avx.
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
# Prologue: %rax keeps the caller's %rsp; %rbx and then %rbp are pushed,
# which is why the epilogue reloads them from -8(%rax) and -16(%rax).
# Each .cfi_push must name the register pushed on the line above it; the
# second one used to say %rbx, leaving %rbp undescribed in the unwind
# information.
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
# Win64 only: reserve 0xa8 bytes and save %xmm6-%xmm15 there.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Frame: $REG_SZ*18 bytes, aligned down to 256.  Slot $REG_SZ*17 holds the
# caller's %rsp, slot $REG_SZ*17+8 the value of $num for the outer loop,
# and %rbx points at $REG_SZ*16 where the per-lane block counters live.
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K_XX_XX(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# For each of the four lanes: fetch the input pointer and block count,
# track the largest count in $num, record the count, and redirect lanes
# whose count is not positive to $Tbl so their loads stay harmless.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
# Nothing to do when every lane is empty; otherwise load the five state
# vectors, the shuffle mask and the first round constant.  %rax is reused
# as the base for the Xi_off() stack slots from here on.
$code.=<<___;
	test	$num,$num
	jz	.Ldone
	movdqu	0x00($ctx),$A			# load context
	lea	128(%rsp),%rax
	movdqu	0x20($ctx),$B
	movdqu	0x40($ctx),$C
	movdqu	0x60($ctx),$D
	movdqu	0x80($ctx),$E
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	jmp	.Loop
.align	32
.Loop:
___
# Emit the 80 rounds.  @V is rotated after every round so that the roles
# a..e move across the five registers instead of the data being copied;
# the round constant in $K is reloaded at each 20-round boundary.
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
 $code.=" movdqa 0x00($Tbl),$K\n";	# K_20_39
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 $code.=" movdqa 0x20($Tbl),$K\n";	# K_40_59
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
 $code.=" movdqa 0x40($Tbl),$K\n";	# K_60_79
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# After a block: lanes whose counter is not above 1 get their pointer
# redirected to $Tbl; a per-lane mask built from the counters (counter>0)
# zeroes the fresh state of finished lanes before the saved context is
# added, and the same mask (all-ones == -1) decrements live counters.
$code.=<<___;
	movdqa	(%rbx),@Xi[0]			# pull counters
	mov	\$1,%ecx
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t2,$t2
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	@Xi[0],@Xi[1]
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t2,@Xi[1]			# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	@Xi[1],@Xi[0]			# counters--
	cmovge	$Tbl,@ptr[3]
	movdqu	0x00($ctx),$t0
	pand	@Xi[1],$A
	movdqu	0x20($ctx),$t1
	pand	@Xi[1],$B
	paddd	$t0,$A
	movdqu	0x40($ctx),$t2
	pand	@Xi[1],$C
	paddd	$t1,$B
	movdqu	0x60($ctx),$t3
	pand	@Xi[1],$D
	paddd	$t2,$C
	movdqu	0x80($ctx),$tx
	pand	@Xi[1],$E
	movdqu	$A,0x00($ctx)
	paddd	$t3,$D
	movdqu	$B,0x20($ctx)
	paddd	$tx,$E
	movdqu	$C,0x40($ctx)
	movdqu	$D,0x60($ctx)
	movdqu	$E,0x80($ctx)
	movdqa	@Xi[0],(%rbx)			# save counters
	movdqa	0x60($Tbl),$tx			# pbswap_mask
	movdqa	-0x20($Tbl),$K			# K_00_19
	dec	$num
	jnz	.Loop
	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande
.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___
# Win64 only: restore %xmm6-%xmm15 relative to the recovered %rsp value.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
# Epilogue: reload the callee-saved registers in the reverse of the push
# order (%rbp sits at -16, %rbx at -8) and switch back to the caller's
# stack pointer.
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue: ret .cfi_endproc .size sha1_multi_block,.-sha1_multi_block ___ {{{ my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10)); my @MSG0=map("%xmm$_",(4..7)); my @MSG1=map("%xmm$_",(11..14)); $code.=<<___; .type sha1_multi_block_shaext,\@function,3 .align 32 sha1_multi_block_shaext: .cfi_startproc _shaext_shortcut: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,-0x78(%rax) movaps %xmm11,-0x68(%rax) movaps %xmm12,-0x58(%rax) movaps %xmm13,-0x48(%rax) movaps %xmm14,-0x38(%rax) movaps %xmm15,-0x28(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`,%rsp shl \$1,$num # we process pair at a time and \$-256,%rsp lea 0x40($ctx),$ctx # size optimization mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .Lbody_shaext: lea `$REG_SZ*16`(%rsp),%rbx movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap .Loop_grande_shaext: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<2;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle %rsp,@ptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldone_shaext movq 0x00-0x40($ctx),$ABCD0 # a1.a0 movq 0x20-0x40($ctx),@MSG0[0]# b1.b0 movq 0x40-0x40($ctx),@MSG0[1]# c1.c0 movq 0x60-0x40($ctx),@MSG0[2]# d1.d0 movq 0x80-0x40($ctx),@MSG0[3]# e1.e0 punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0 punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0 movdqa $ABCD0,$ABCD1 punpcklqdq @MSG0[1],$ABCD0 # d0.c0.b0.a0 punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1 pshufd \$0b00111111,@MSG0[3],$E0 pshufd \$0b01111111,@MSG0[3],$E1 pshufd \$0b00011011,$ABCD0,$ABCD0 pshufd \$0b00011011,$ABCD1,$ABCD1 jmp .Loop_shaext .align 32 .Loop_shaext: movdqu 0x00(@ptr[0]),@MSG0[0] movdqu 
0x00(@ptr[1]),@MSG1[0] movdqu 0x10(@ptr[0]),@MSG0[1] movdqu 0x10(@ptr[1]),@MSG1[1] movdqu 0x20(@ptr[0]),@MSG0[2] pshufb $BSWAP,@MSG0[0] movdqu 0x20(@ptr[1]),@MSG1[2] pshufb $BSWAP,@MSG1[0] movdqu 0x30(@ptr[0]),@MSG0[3] lea 0x40(@ptr[0]),@ptr[0] pshufb $BSWAP,@MSG0[1] movdqu 0x30(@ptr[1]),@MSG1[3] lea 0x40(@ptr[1]),@ptr[1] pshufb $BSWAP,@MSG1[1] movdqa $E0,0x50(%rsp) # offload paddd @MSG0[0],$E0 movdqa $E1,0x70(%rsp) paddd @MSG1[0],$E1 movdqa $ABCD0,0x40(%rsp) # offload movdqa $ABCD0,$E0_ movdqa $ABCD1,0x60(%rsp) movdqa $ABCD1,$E1_ sha1rnds4 \$0,$E0,$ABCD0 # 0-3 sha1nexte @MSG0[1],$E0_ sha1rnds4 \$0,$E1,$ABCD1 # 0-3 sha1nexte @MSG1[1],$E1_ pshufb $BSWAP,@MSG0[2] prefetcht0 127(@ptr[0]) sha1msg1 @MSG0[1],@MSG0[0] pshufb $BSWAP,@MSG1[2] prefetcht0 127(@ptr[1]) sha1msg1 @MSG1[1],@MSG1[0] pshufb $BSWAP,@MSG0[3] movdqa $ABCD0,$E0 pshufb $BSWAP,@MSG1[3] movdqa $ABCD1,$E1 sha1rnds4 \$0,$E0_,$ABCD0 # 4-7 sha1nexte @MSG0[2],$E0 sha1rnds4 \$0,$E1_,$ABCD1 # 4-7 sha1nexte @MSG1[2],$E1 pxor @MSG0[2],@MSG0[0] sha1msg1 @MSG0[2],@MSG0[1] pxor @MSG1[2],@MSG1[0] sha1msg1 @MSG1[2],@MSG1[1] ___ for($i=2;$i<20-4;$i++) { $code.=<<___; movdqa $ABCD0,$E0_ movdqa $ABCD1,$E1_ sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11 sha1nexte @MSG0[3],$E0_ sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11 sha1nexte @MSG1[3],$E1_ sha1msg2 @MSG0[3],@MSG0[0] sha1msg2 @MSG1[3],@MSG1[0] pxor @MSG0[3],@MSG0[1] sha1msg1 @MSG0[3],@MSG0[2] pxor @MSG1[3],@MSG1[1] sha1msg1 @MSG1[3],@MSG1[2] ___ ($E0,$E0_)=($E0_,$E0); ($E1,$E1_)=($E1_,$E1); push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); } $code.=<<___; movdqa $ABCD0,$E0_ movdqa $ABCD1,$E1_ sha1rnds4 \$3,$E0,$ABCD0 # 64-67 sha1nexte @MSG0[3],$E0_ sha1rnds4 \$3,$E1,$ABCD1 # 64-67 sha1nexte @MSG1[3],$E1_ sha1msg2 @MSG0[3],@MSG0[0] sha1msg2 @MSG1[3],@MSG1[0] pxor @MSG0[3],@MSG0[1] pxor @MSG1[3],@MSG1[1] mov \$1,%ecx pxor @MSG0[2],@MSG0[2] # zero cmp 4*0(%rbx),%ecx # examine counters cmovge %rsp,@ptr[0] # cancel input movdqa $ABCD0,$E0 movdqa $ABCD1,$E1 sha1rnds4 
\$3,$E0_,$ABCD0 # 68-71 sha1nexte @MSG0[0],$E0 sha1rnds4 \$3,$E1_,$ABCD1 # 68-71 sha1nexte @MSG1[0],$E1 sha1msg2 @MSG0[0],@MSG0[1] sha1msg2 @MSG1[0],@MSG1[1] cmp 4*1(%rbx),%ecx cmovge %rsp,@ptr[1] movq (%rbx),@MSG0[0] # pull counters movdqa $ABCD0,$E0_ movdqa $ABCD1,$E1_ sha1rnds4 \$3,$E0,$ABCD0 # 72-75 sha1nexte @MSG0[1],$E0_ sha1rnds4 \$3,$E1,$ABCD1 # 72-75 sha1nexte @MSG1[1],$E1_ pshufd \$0x00,@MSG0[0],@MSG1[2] pshufd \$0x55,@MSG0[0],@MSG1[3] movdqa @MSG0[0],@MSG0[1] pcmpgtd @MSG0[2],@MSG1[2] pcmpgtd @MSG0[2],@MSG1[3] movdqa $ABCD0,$E0 movdqa $ABCD1,$E1 sha1rnds4 \$3,$E0_,$ABCD0 # 76-79 sha1nexte $MSG0[2],$E0 sha1rnds4 \$3,$E1_,$ABCD1 # 76-79 sha1nexte $MSG0[2],$E1 pcmpgtd @MSG0[2],@MSG0[1] # counter mask pand @MSG1[2],$ABCD0 pand @MSG1[2],$E0 pand @MSG1[3],$ABCD1 pand @MSG1[3],$E1 paddd @MSG0[1],@MSG0[0] # counters-- paddd 0x40(%rsp),$ABCD0 paddd 0x50(%rsp),$E0 paddd 0x60(%rsp),$ABCD1 paddd 0x70(%rsp),$E1 movq @MSG0[0],(%rbx) # save counters dec $num jnz .Loop_shaext mov `$REG_SZ*17+8`(%rsp),$num pshufd \$0b00011011,$ABCD0,$ABCD0 pshufd \$0b00011011,$ABCD1,$ABCD1 movdqa $ABCD0,@MSG0[0] punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0 punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0 punpckhdq $E1,$E0 # e1.e0.xx.xx movq $ABCD0,0x00-0x40($ctx) # a1.a0 psrldq \$8,$ABCD0 movq @MSG0[0],0x40-0x40($ctx)# c1.c0 psrldq \$8,@MSG0[0] movq $ABCD0,0x20-0x40($ctx) # b1.b0 psrldq \$8,$E0 movq @MSG0[0],0x60-0x40($ctx)# d1.d0 movq $E0,0x80-0x40($ctx) # e1.e0 lea `$REG_SZ/2`($ctx),$ctx lea `16*2`($inp),$inp dec $num jnz .Loop_grande_shaext .Ldone_shaext: #mov `$REG_SZ*17`(%rsp),%rax # original %rsp ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 movaps -0xa8(%rax),%xmm7 movaps -0x98(%rax),%xmm8 movaps -0x88(%rax),%xmm9 movaps -0x78(%rax),%xmm10 movaps -0x68(%rax),%xmm11 movaps -0x58(%rax),%xmm12 movaps -0x48(%rax),%xmm13 movaps -0x38(%rax),%xmm14 movaps -0x28(%rax),%xmm15 ___ $code.=<<___; mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp 
.cfi_def_cfa_register %rsp .Lepilogue_shaext: ret .cfi_endproc .size sha1_multi_block_shaext,.-sha1_multi_block_shaext ___ }}} if ($avx) {{{ sub BODY_00_19_avx { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $k=$i+2; my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128"; my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4]; $code.=<<___ if ($i==0 && $REG_SZ==16); vmovd (@ptr[0]),@Xi[0] lea `16*4`(@ptr[0]),@ptr[0] vmovd (@ptr[1]),@Xi[2] # borrow Xi[2] lea `16*4`(@ptr[1]),@ptr[1] vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] lea `16*4`(@ptr[2]),@ptr[2] vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2] lea `16*4`(@ptr[3]),@ptr[3] vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] vpunpckldq @Xi[2],@Xi[0],@Xi[0] vmovd `4*$j-16*4`($ptr_n),$t3 vpshufb $tx,@Xi[0],@Xi[0] ___ $code.=<<___ if ($i<15 && $REG_SZ==16); # just load input vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3 ___ $code.=<<___ if ($i==0 && $REG_SZ==32); vmovd (@ptr[0]),@Xi[0] lea `16*4`(@ptr[0]),@ptr[0] vmovd (@ptr[4]),@Xi[2] # borrow Xi[2] lea `16*4`(@ptr[4]),@ptr[4] vmovd (@ptr[1]),$t2 lea `16*4`(@ptr[1]),@ptr[1] vmovd (@ptr[5]),$t1 lea `16*4`(@ptr[5]),@ptr[5] vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] lea `16*4`(@ptr[2]),@ptr[2] vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2] lea `16*4`(@ptr[6]),@ptr[6] vpinsrd \$1,(@ptr[3]),$t2,$t2 lea `16*4`(@ptr[3]),@ptr[3] vpunpckldq $t2,@Xi[0],@Xi[0] vpinsrd \$1,(@ptr[7]),$t1,$t1 lea `16*4`(@ptr[7]),@ptr[7] vpunpckldq $t1,@Xi[2],@Xi[2] vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] vinserti128 @Xi[2],@Xi[0],@Xi[0] vmovd `4*$j-16*4`($ptr_n),$t3 vpshufb $tx,@Xi[0],@Xi[0] ___ $code.=<<___ if ($i<15 && $REG_SZ==32); # just load input vmovd `4*$j-16*4`(@ptr[1]),$t2 vmovd `4*$j-16*4`(@ptr[5]),$t1 vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3 vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2 vpunpckldq $t2,@Xi[1],@Xi[1] vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1 vpunpckldq $t1,$t3,$t3 ___ $code.=<<___ if ($i<14); vpaddd $K,$e,$e # e+=K_00_19 vpslld \$5,$a,$t2 vpandn $d,$b,$t1 vpand 
$c,$b,$t0 vmovdqa @Xi[0],`&Xi_off($i)` vpaddd @Xi[0],$e,$e # e+=X[i] $vpack $t3,@Xi[1],@Xi[1] vpsrld \$27,$a,$t3 vpxor $t1,$t0,$t0 # Ch(b,c,d) vmovd `4*$k-16*4`(@ptr[0]),@Xi[2] vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) vmovd `4*$k-16*4`($ptr_n),$t3 vpaddd $t0,$e,$e # e+=Ch(b,c,d) vpsrld \$2,$b,$b vpaddd $t2,$e,$e # e+=rol(a,5) vpshufb $tx,@Xi[1],@Xi[1] vpor $t1,$b,$b # b=rol(b,30) ___ $code.=<<___ if ($i==14); vpaddd $K,$e,$e # e+=K_00_19 prefetcht0 63(@ptr[0]) vpslld \$5,$a,$t2 vpandn $d,$b,$t1 vpand $c,$b,$t0 vmovdqa @Xi[0],`&Xi_off($i)` vpaddd @Xi[0],$e,$e # e+=X[i] $vpack $t3,@Xi[1],@Xi[1] vpsrld \$27,$a,$t3 prefetcht0 63(@ptr[1]) vpxor $t1,$t0,$t0 # Ch(b,c,d) vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) prefetcht0 63(@ptr[2]) vpaddd $t0,$e,$e # e+=Ch(b,c,d) vpsrld \$2,$b,$b vpaddd $t2,$e,$e # e+=rol(a,5) prefetcht0 63(@ptr[3]) vpshufb $tx,@Xi[1],@Xi[1] vpor $t1,$b,$b # b=rol(b,30) ___ $code.=<<___ if ($i>=13 && $i<15); vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" ___ $code.=<<___ if ($i>=15); # apply Xupdate vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" vpaddd $K,$e,$e # e+=K_00_19 vpslld \$5,$a,$t2 vpandn $d,$b,$t1 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` vpand $c,$b,$t0 vmovdqa @Xi[0],`&Xi_off($i)` vpaddd @Xi[0],$e,$e # e+=X[i] vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] vpsrld \$27,$a,$t3 vpxor $t1,$t0,$t0 # Ch(b,c,d) vpxor @Xi[3],@Xi[1],@Xi[1] `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) vpaddd $t0,$e,$e # e+=Ch(b,c,d) `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` vpsrld \$31,@Xi[1],$tx vpaddd @Xi[1],@Xi[1],@Xi[1] vpsrld \$2,$b,$b `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` vpaddd $t2,$e,$e # e+=rol(a,5) vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1] vpor $t1,$b,$b # b=rol(b,30) ___ push(@Xi,shift(@Xi)); } sub BODY_20_39_avx { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___ if ($i<79); vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" vmovdqa 
`&Xi_off($j+2)`,@Xi[3] # "X[2]" vpslld \$5,$a,$t2 vpaddd $K,$e,$e # e+=K_20_39 vpxor $b,$d,$t0 ___ $code.=<<___ if ($i<72); vmovdqa @Xi[0],`&Xi_off($i)` ___ $code.=<<___ if ($i<79); vpaddd @Xi[0],$e,$e # e+=X[i] vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] vpsrld \$27,$a,$t3 vpxor $c,$t0,$t0 # Parity(b,c,d) vpxor @Xi[3],@Xi[1],@Xi[1] vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) vpaddd $t0,$e,$e # e+=Parity(b,c,d) vpsrld \$31,@Xi[1],$tx vpaddd @Xi[1],@Xi[1],@Xi[1] vpsrld \$2,$b,$b vpaddd $t2,$e,$e # e+=rol(a,5) vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1) vpor $t1,$b,$b # b=rol(b,30) ___ $code.=<<___ if ($i==79); vpslld \$5,$a,$t2 vpaddd $K,$e,$e # e+=K_20_39 vpxor $b,$d,$t0 vpsrld \$27,$a,$t3 vpaddd @Xi[0],$e,$e # e+=X[i] vpxor $c,$t0,$t0 # Parity(b,c,d) vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) vpaddd $t0,$e,$e # e+=Parity(b,c,d) vpsrld \$2,$b,$b vpaddd $t2,$e,$e # e+=rol(a,5) vpor $t1,$b,$b # b=rol(b,30) ___ push(@Xi,shift(@Xi)); } sub BODY_40_59_avx { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" vpaddd $K,$e,$e # e+=K_40_59 vpslld \$5,$a,$t2 vpand $c,$d,$t1 vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] vpaddd $t1,$e,$e vpsrld \$27,$a,$t3 vpxor $c,$d,$t0 vpxor @Xi[3],@Xi[1],@Xi[1] vmovdqu @Xi[0],`&Xi_off($i)` vpaddd @Xi[0],$e,$e # e+=X[i] vpor $t3,$t2,$t2 # rol(a,5) vpsrld \$31,@Xi[1],$tx vpand $b,$t0,$t0 vpaddd @Xi[1],@Xi[1],@Xi[1] vpslld \$30,$b,$t1 vpaddd $t0,$e,$e # e+=Maj(b,d,c) vpsrld \$2,$b,$b vpaddd $t2,$e,$e # e+=rol(a,5) vpor $tx,@Xi[1],@Xi[1] # rol(@X[1],1) vpor $t1,$b,$b # b=rol(b,30) ___ push(@Xi,shift(@Xi)); } $code.=<<___; .type sha1_multi_block_avx,\@function,3 .align 32 sha1_multi_block_avx: .cfi_startproc _avx_shortcut: ___ $code.=<<___ if ($avx>1); shr \$32,%rcx cmp \$2,$num jb .Lavx test \$`1<<5`,%ecx jnz _avx2_shortcut jmp .Lavx .align 32 .Lavx: ___ $code.=<<___; mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp ___ $code.=<<___ if 
($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,-0x78(%rax) movaps %xmm11,-0x68(%rax) movaps %xmm12,-0x58(%rax) movaps %xmm13,-0x48(%rax) movaps %xmm14,-0x38(%rax) movaps %xmm15,-0x28(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody_avx: lea K_XX_XX(%rip),$Tbl lea `$REG_SZ*16`(%rsp),%rbx vzeroupper .Loop_grande_avx: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldone_avx vmovdqu 0x00($ctx),$A # load context lea 128(%rsp),%rax vmovdqu 0x20($ctx),$B vmovdqu 0x40($ctx),$C vmovdqu 0x60($ctx),$D vmovdqu 0x80($ctx),$E vmovdqu 0x60($Tbl),$tx # pbswap_mask jmp .Loop_avx .align 32 .Loop_avx: ___ $code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } $code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } $code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } $code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; mov \$1,%ecx ___ for($i=0;$i<4;$i++) { $code.=<<___; cmp `4*$i`(%rbx),%ecx # examine counters cmovge $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; vmovdqu (%rbx),$t0 # pull counters vpxor $t2,$t2,$t2 vmovdqa $t0,$t1 vpcmpgtd $t2,$t1,$t1 # mask value vpaddd $t1,$t0,$t0 # counters-- vpand $t1,$A,$A vpand $t1,$B,$B vpaddd 0x00($ctx),$A,$A vpand $t1,$C,$C vpaddd 0x20($ctx),$B,$B vpand $t1,$D,$D vpaddd 
0x40($ctx),$C,$C vpand $t1,$E,$E vpaddd 0x60($ctx),$D,$D vpaddd 0x80($ctx),$E,$E vmovdqu $A,0x00($ctx) vmovdqu $B,0x20($ctx) vmovdqu $C,0x40($ctx) vmovdqu $D,0x60($ctx) vmovdqu $E,0x80($ctx) vmovdqu $t0,(%rbx) # save counters vmovdqu 0x60($Tbl),$tx # pbswap_mask dec $num jnz .Loop_avx mov `$REG_SZ*17+8`(%rsp),$num lea $REG_SZ($ctx),$ctx lea `16*$REG_SZ/4`($inp),$inp dec $num jnz .Loop_grande_avx .Ldone_avx: mov `$REG_SZ*17`(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 vzeroupper ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 movaps -0xa8(%rax),%xmm7 movaps -0x98(%rax),%xmm8 movaps -0x88(%rax),%xmm9 movaps -0x78(%rax),%xmm10 movaps -0x68(%rax),%xmm11 movaps -0x58(%rax),%xmm12 movaps -0x48(%rax),%xmm13 movaps -0x38(%rax),%xmm14 movaps -0x28(%rax),%xmm15 ___ $code.=<<___; mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size sha1_multi_block_avx,.-sha1_multi_block_avx ___ if ($avx>1) { $code =~ s/\`([^\`]*)\`/eval $1/gem; $REG_SZ=32; @ptr=map("%r$_",(12..15,8..11)); @V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4)); ($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9)); @Xi=map("%ymm$_",(10..14)); $K="%ymm15"; $code.=<<___; .type sha1_multi_block_avx2,\@function,3 .align 32 sha1_multi_block_avx2: .cfi_startproc _avx2_shortcut: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) movaps %xmm12,-0x78(%rax) movaps %xmm13,-0x68(%rax) movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody_avx2: lea 
K_XX_XX(%rip),$Tbl shr \$1,$num vzeroupper .Loop_grande_avx2: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num lea `$REG_SZ*16`(%rsp),%rbx ___ for($i=0;$i<8;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; vmovdqu 0x00($ctx),$A # load context lea 128(%rsp),%rax vmovdqu 0x20($ctx),$B lea 256+128(%rsp),%rbx vmovdqu 0x40($ctx),$C vmovdqu 0x60($ctx),$D vmovdqu 0x80($ctx),$E vmovdqu 0x60($Tbl),$tx # pbswap_mask jmp .Loop_avx2 .align 32 .Loop_avx2: ___ $code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } $code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } $code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59 for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } $code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; mov \$1,%ecx lea `$REG_SZ*16`(%rsp),%rbx ___ for($i=0;$i<8;$i++) { $code.=<<___; cmp `4*$i`(%rbx),%ecx # examine counters cmovge $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; vmovdqu (%rbx),$t0 # pull counters vpxor $t2,$t2,$t2 vmovdqa $t0,$t1 vpcmpgtd $t2,$t1,$t1 # mask value vpaddd $t1,$t0,$t0 # counters-- vpand $t1,$A,$A vpand $t1,$B,$B vpaddd 0x00($ctx),$A,$A vpand $t1,$C,$C vpaddd 0x20($ctx),$B,$B vpand $t1,$D,$D vpaddd 0x40($ctx),$C,$C vpand $t1,$E,$E vpaddd 0x60($ctx),$D,$D vpaddd 0x80($ctx),$E,$E vmovdqu $A,0x00($ctx) vmovdqu $B,0x20($ctx) vmovdqu $C,0x40($ctx) vmovdqu $D,0x60($ctx) vmovdqu $E,0x80($ctx) vmovdqu $t0,(%rbx) # save counters lea 256+128(%rsp),%rbx vmovdqu 0x60($Tbl),$tx # pbswap_mask dec $num jnz .Loop_avx2 #mov `$REG_SZ*17+8`(%rsp),$num #lea $REG_SZ($ctx),$ctx #lea `16*$REG_SZ/4`($inp),$inp #dec $num #jnz 
.Loop_grande_avx2 .Ldone_avx2: mov `$REG_SZ*17`(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx2: ret .cfi_endproc .size sha1_multi_block_avx2,.-sha1_multi_block_avx2 ___ } }}} $code.=<<___; .align 256 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 K_XX_XX: .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .asciz "SHA1 multi-block transform for x86_64, CRYPTOGAMS by " ___ if ($win64) { # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull 
context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->Rip<.Lbody jb .Lin_prologue mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lin_prologue mov `16*17`(%rax),%rax # pull saved stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp lea -24-10*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler ___ $code.=<<___ if ($avx>1); .type avx2_handler,\@abi-omnipotent .align 16 avx2_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # 
HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->Rip<body label jb .Lin_prologue mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue mov `32*17`($context),%rax # pull saved stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 lea -56-10*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq jmp .Lin_prologue .size avx2_handler,.-avx2_handler ___ $code.=<<___; .section .pdata .align 4 .rva .LSEH_begin_sha1_multi_block .rva .LSEH_end_sha1_multi_block .rva .LSEH_info_sha1_multi_block .rva .LSEH_begin_sha1_multi_block_shaext .rva .LSEH_end_sha1_multi_block_shaext .rva .LSEH_info_sha1_multi_block_shaext ___ $code.=<<___ if ($avx); .rva .LSEH_begin_sha1_multi_block_avx .rva .LSEH_end_sha1_multi_block_avx .rva .LSEH_info_sha1_multi_block_avx ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_sha1_multi_block_avx2 .rva .LSEH_end_sha1_multi_block_avx2 .rva .LSEH_info_sha1_multi_block_avx2 ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_sha1_multi_block: .byte 9,0,0,0 .rva se_handler .rva .Lbody,.Lepilogue # HandlerData[] .LSEH_info_sha1_multi_block_shaext: .byte 9,0,0,0 .rva se_handler .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[] ___ $code.=<<___ if ($avx); .LSEH_info_sha1_multi_block_avx: .byte 9,0,0,0 .rva se_handler .rva .Lbody_avx,.Lepilogue_avx # HandlerData[] ___ $code.=<<___ if ($avx>1); .LSEH_info_sha1_multi_block_avx2: .byte 9,0,0,0 .rva avx2_handler .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[] ___ } ####################################################################
# Prepend a REX prefix byte to an opcode sequence when one is needed.
#
#   $opcode - reference to the array of opcode bytes being assembled
#   $dst    - register number that goes into the ModR/M reg field
#   $src    - register number that goes into the ModR/M r/m field
#
# Bit 0x04 (REX.R) is set for $dst>=8 and bit 0x01 (REX.B) for $src>=8;
# nothing is prepended when both registers are below 8.
sub rex {
    my ($opcode,$dst,$src)=@_;
    my $rex=0;

    $rex|=0x04 if ($dst>=8);
    $rex|=0x01 if ($src>=8);
    unshift @$opcode,$rex|0x40 if ($rex);
}

# Translate the operand string of "sha1rnds4 \$imm,%xmmS,%xmmD" into an
# equivalent ".byte" directive (0x0f,0x3a,0xcc + ModR/M + imm8).  Operands
# that do not have that shape are passed through as the plain mnemonic.
# Presumably this exists so assemblers without SHA support can still build
# the module -- the code itself only shows the encoding.
sub sha1rnds4 {
    # Copy first: the caller invokes us as sha1rnds4($2) from inside s///e,
    # so $_[0] aliases a capture variable that our own match would disturb.
    my $operands=$_[0];

    if ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($imm,$src,$dst)=($1,$2,$3);
        my @opcode=(0x0f,0x3a,0xcc);

        rex(\@opcode,$dst,$src);
        push @opcode,0xc0|($src&7)|(($dst&7)<<3);   # ModR/M
        # Immediates with a leading 0 (0x.. or octal) are converted by oct();
        # anything else is emitted as written.
        push @opcode,$imm=~/^0/?oct($imm):$imm;
        return ".byte\t".join(',',@opcode);
    } else {
        return "sha1rnds4\t".$operands;
    }
}

# Translate "sha1nexte|sha1msg1|sha1msg2 %xmmS,%xmmD" into a ".byte"
# directive (0x0f,0x38 + opcode byte + ModR/M).  Unknown mnemonics and
# operand strings that are not two %xmm registers are returned unchanged as
# "$instr\t<operands>".
sub sha1op38 {
    my $instr = shift;
    my $operands = $_[0];       # copy: may alias the caller's $2
    my %opcodelet = (
        "sha1nexte" => 0xc8,
        "sha1msg1"  => 0xc9,
        "sha1msg2"  => 0xca,
    );

    if (defined($opcodelet{$instr}) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($src,$dst)=($1,$2);
        my @opcode=(0x0f,0x38);

        rex(\@opcode,$dst,$src);
        push @opcode,$opcodelet{$instr};
        push @opcode,0xc0|($src&7)|(($dst&7)<<3);   # ModR/M
        return ".byte\t".join(',',@opcode);
    } else {
        return $instr."\t".$operands;
    }
}

# Post-process the generated code line by line and print it to STDOUT.
foreach (split("\n",$code)) {
    # Evaluate every `...` expression embedded in the line.
    s/\`([^\`]*)\`/eval($1)/ge;

    # The rewrites below are chained with "or": only the first one that
    # matches is applied to a given line.
    s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or             # hand-encode SHA ops
    s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or

    # For the instructions listed, a %ymm operand is rewritten to %xmm.
    s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
    s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or      # only the "%x%ymmN" spelling
    s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
    s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
    s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or         # also inserts the "$1," immediate
    s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

    print $_,"\n";
}

# Buffered write errors on the output pipe only surface at close time.
close STDOUT or die "error closing STDOUT: $!";

Index: head/crypto/openssl/crypto/sha/asm/sha1-x86_64.pl =================================================================== --- head/crypto/openssl/crypto/sha/asm/sha1-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/sha/asm/sha1-x86_64.pl (revision 364822) @@ -1,2132 +1,2132 @@ #! /usr/bin/env perl # Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License.
You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # sha1_block procedure for x86_64. # # It was brought to my attention that on EM64T compiler-generated code # was far behind 32-bit assembler implementation. This is unlike on # Opteron where compiler-generated code was only 15% behind 32-bit # assembler, which originally made it hard to motivate the effort. # There was suggestion to mechanically translate 32-bit code, but I # dismissed it, reasoning that x86_64 offers enough register bank # capacity to fully utilize SHA-1 parallelism. Therefore this fresh # implementation:-) However! While 64-bit code does perform better # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, # x86_64 does offer larger *addressable* bank, but out-of-order core # reaches for even more registers through dynamic aliasing, and EM64T # core must have managed to run-time optimize even 32-bit code just as # good as 64-bit one. Performance improvement is summarized in the # following table: # # gcc 3.4 32-bit asm cycles/byte # Opteron +45% +20% 6.8 # Xeon P4 +65% +0% 9.9 # Core2 +60% +10% 7.0 # August 2009. # # The code was revised to minimize code size and to maximize # "distance" between instructions producing input to 'lea' # instruction and the 'lea' instruction itself, which is essential # for Intel Atom core. # October 2010. # # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it # is to offload message schedule denoted by Wt in NIST specification, # or Xupdate in OpenSSL source, to SIMD unit.
See sha1-586.pl module # for background and implementation details. The only difference from # 32-bit code is that 64-bit code doesn't have to spill @X[] elements # to free temporary registers. # April 2011. # # Add AVX code path. See sha1-586.pl for further information. # May 2013. # # Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions # and loading pair of consecutive blocks to 256-bit %ymm registers) # did not provide impressive performance improvement till a crucial # hint regarding the number of Xupdate iterations to pre-compute in # advance was provided by Ilya Albrekht of Intel Corp. # March 2014. # # Add support for Intel SHA Extensions. ###################################################################### # Current performance is summarized in following table. Numbers are # CPU clock cycles spent to process single byte (less is better). # # x86_64 SSSE3 AVX[2] # P4 9.05 - # Opteron 6.26 - # Core2 6.55 6.05/+8% - # Westmere 6.73 5.30/+27% - # Sandy Bridge 7.70 6.10/+26% 4.99/+54% # Ivy Bridge 6.06 4.67/+30% 4.60/+32% # Haswell 5.45 4.15/+31% 3.57/+53% # Skylake 5.18 4.06/+28% 3.54/+46% # Bulldozer 9.11 5.95/+53% # Ryzen 4.75 3.80/+24% 1.93/+150%(**) # VIA Nano 9.32 7.15/+30% # Atom 10.3 9.17/+12% # Silvermont 13.1(*) 9.37/+40% # Knights L 13.2(*) 9.68/+36% 8.30/+59% # Goldmont 8.13 6.42/+27% 1.70/+380%(**) # # (*) obviously suboptimal result, nothing was done about it, # because SSSE3 code is compiled unconditionally; # (**) SHAEXT result $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); 
} if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } $shaext=1; ### set to zero if compiling for 1.0.1 $avx=1 if (!$shaext && $avx); open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $ctx="%rdi"; # 1st arg $inp="%rsi"; # 2nd arg $num="%rdx"; # 3rd arg # reassign arguments in order to produce more compact code $ctx="%r8"; $inp="%r9"; $num="%r10"; $t0="%eax"; $t1="%ebx"; $t2="%ecx"; @xi=("%edx","%ebp","%r14d"); $A="%esi"; $B="%edi"; $C="%r11d"; $D="%r12d"; $E="%r13d"; @V=($A,$B,$C,$D,$E); sub BODY_00_19 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___ if ($i==0); mov `4*$i`($inp),$xi[0] bswap $xi[0] ___ $code.=<<___ if ($i<15); mov `4*$j`($inp),$xi[1] mov $d,$t0 mov $xi[0],`4*$i`(%rsp) mov $a,$t2 bswap $xi[1] xor $c,$t0 rol \$5,$t2 and $b,$t0 lea 0x5a827999($xi[0],$e),$e add $t2,$e xor $d,$t0 rol \$30,$b add $t0,$e ___ $code.=<<___ if ($i>=15); xor `4*($j%16)`(%rsp),$xi[1] mov $d,$t0 mov $xi[0],`4*($i%16)`(%rsp) mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $c,$t0 rol \$5,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] and $b,$t0 lea 0x5a827999($xi[0],$e),$e rol \$30,$b xor $d,$t0 add $t2,$e rol \$1,$xi[1] add $t0,$e ___ push(@xi,shift(@xi)); } sub BODY_20_39 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $K=($i<40)?0x6ed9eba1:0xca62c1d6; $code.=<<___ if ($i<79); xor `4*($j%16)`(%rsp),$xi[1] mov $b,$t0 `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)` mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $d,$t0 rol \$5,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] lea $K($xi[0],$e),$e xor $c,$t0 add $t2,$e rol 
\$30,$b add $t0,$e rol \$1,$xi[1] ___ $code.=<<___ if ($i==79); mov $b,$t0 mov $a,$t2 xor $d,$t0 lea $K($xi[0],$e),$e rol \$5,$t2 xor $c,$t0 add $t2,$e rol \$30,$b add $t0,$e ___ push(@xi,shift(@xi)); } sub BODY_40_59 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; xor `4*($j%16)`(%rsp),$xi[1] mov $d,$t0 mov $xi[0],`4*($i%16)`(%rsp) mov $d,$t1 xor `4*(($j+2)%16)`(%rsp),$xi[1] and $c,$t0 mov $a,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] lea 0x8f1bbcdc($xi[0],$e),$e xor $c,$t1 rol \$5,$t2 add $t0,$e rol \$1,$xi[1] and $b,$t1 add $t2,$e rol \$30,$b add $t1,$e ___ push(@xi,shift(@xi)); } $code.=<<___; .text .extern OPENSSL_ia32cap_P .globl sha1_block_data_order .type sha1_block_data_order,\@function,3 .align 16 sha1_block_data_order: .cfi_startproc mov OPENSSL_ia32cap_P+0(%rip),%r9d mov OPENSSL_ia32cap_P+4(%rip),%r8d mov OPENSSL_ia32cap_P+8(%rip),%r10d test \$`1<<9`,%r8d # check SSSE3 bit jz .Lialu ___ $code.=<<___ if ($shaext); test \$`1<<29`,%r10d # check SHA bit jnz _shaext_shortcut ___ $code.=<<___ if ($avx>1); and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2 cmp \$`1<<3|1<<5|1<<8`,%r10d je _avx2_shortcut ___ $code.=<<___ if ($avx); and \$`1<<28`,%r8d # mask AVX bit and \$`1<<30`,%r9d # mask "Intel CPU" bit or %r9d,%r8d cmp \$`1<<28|1<<30`,%r8d je _avx_shortcut ___ $code.=<<___; jmp _ssse3_shortcut .align 16 .Lialu: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 mov %rdi,$ctx # reassigned argument sub \$`8+16*4`,%rsp mov %rsi,$inp # reassigned argument and \$-64,%rsp mov %rdx,$num # reassigned argument mov %rax,`16*4`(%rsp) .cfi_cfa_expression %rsp+64,deref,+8 .Lprologue: mov 0($ctx),$A mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov 16($ctx),$E jmp .Lloop .align 16 .Lloop: ___ for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } for(;$i<60;$i++) { &BODY_40_59($i,@V); 
unshift(@V,pop(@V)); } for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; add 0($ctx),$A add 4($ctx),$B add 8($ctx),$C add 12($ctx),$D add 16($ctx),$E mov $A,0($ctx) mov $B,4($ctx) mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) sub \$1,$num lea `16*4`($inp),$inp jnz .Lloop mov `16*4`(%rsp),%rsi .cfi_def_cfa %rsi,8 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue: ret .cfi_endproc .size sha1_block_data_order,.-sha1_block_data_order ___ if ($shaext) {{{ ###################################################################### # Intel SHA Extensions implementation of SHA1 update function. # my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx"); my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9)); my @MSG=map("%xmm$_",(4..7)); $code.=<<___; .type sha1_block_data_order_shaext,\@function,3 .align 32 sha1_block_data_order_shaext: _shaext_shortcut: .cfi_startproc ___ $code.=<<___ if ($win64); lea `-8-4*16`(%rsp),%rsp movaps %xmm6,-8-4*16(%rax) movaps %xmm7,-8-3*16(%rax) movaps %xmm8,-8-2*16(%rax) movaps %xmm9,-8-1*16(%rax) .Lprologue_shaext: ___ $code.=<<___; movdqu ($ctx),$ABCD movd 16($ctx),$E movdqa K_XX_XX+0xa0(%rip),$BSWAP # byte-n-word swap movdqu ($inp),@MSG[0] pshufd \$0b00011011,$ABCD,$ABCD # flip word order movdqu 0x10($inp),@MSG[1] pshufd \$0b00011011,$E,$E # flip word order movdqu 0x20($inp),@MSG[2] pshufb $BSWAP,@MSG[0] movdqu 0x30($inp),@MSG[3] pshufb $BSWAP,@MSG[1] pshufb $BSWAP,@MSG[2] movdqa $E,$E_SAVE # offload $E pshufb $BSWAP,@MSG[3] jmp .Loop_shaext .align 16 .Loop_shaext: dec $num lea 0x40($inp),%r8 # next input block paddd @MSG[0],$E cmovne %r8,$inp movdqa $ABCD,$ABCD_SAVE # offload $ABCD ___ for($i=0;$i<20-4;$i+=2) { $code.=<<___; sha1msg1 @MSG[1],@MSG[0] movdqa $ABCD,$E_ sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3... 
sha1nexte @MSG[1],$E_ pxor @MSG[2],@MSG[0] sha1msg1 @MSG[2],@MSG[1] sha1msg2 @MSG[3],@MSG[0] movdqa $ABCD,$E sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD sha1nexte @MSG[2],$E pxor @MSG[3],@MSG[1] sha1msg2 @MSG[0],@MSG[1] ___ push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG)); } $code.=<<___; movdqu ($inp),@MSG[0] movdqa $ABCD,$E_ sha1rnds4 \$3,$E,$ABCD # 64-67 sha1nexte @MSG[1],$E_ movdqu 0x10($inp),@MSG[1] pshufb $BSWAP,@MSG[0] movdqa $ABCD,$E sha1rnds4 \$3,$E_,$ABCD # 68-71 sha1nexte @MSG[2],$E movdqu 0x20($inp),@MSG[2] pshufb $BSWAP,@MSG[1] movdqa $ABCD,$E_ sha1rnds4 \$3,$E,$ABCD # 72-75 sha1nexte @MSG[3],$E_ movdqu 0x30($inp),@MSG[3] pshufb $BSWAP,@MSG[2] movdqa $ABCD,$E sha1rnds4 \$3,$E_,$ABCD # 76-79 sha1nexte $E_SAVE,$E pshufb $BSWAP,@MSG[3] paddd $ABCD_SAVE,$ABCD movdqa $E,$E_SAVE # offload $E jnz .Loop_shaext pshufd \$0b00011011,$ABCD,$ABCD pshufd \$0b00011011,$E,$E movdqu $ABCD,($ctx) movd $E,16($ctx) ___ $code.=<<___ if ($win64); movaps -8-4*16(%rax),%xmm6 movaps -8-3*16(%rax),%xmm7 movaps -8-2*16(%rax),%xmm8 movaps -8-1*16(%rax),%xmm9 mov %rax,%rsp .Lepilogue_shaext: ___ $code.=<<___; ret .cfi_endproc .size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext ___ }}} {{{ my $Xi=4; my @X=map("%xmm$_",(4..7,0..3)); my @Tx=map("%xmm$_",(8..10)); my $Kx="%xmm11"; my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization my @T=("%esi","%edi"); my $j=0; my $rx=0; my $K_XX_XX="%r14"; my $fp="%r11"; my $_rol=sub { &rol(@_) }; my $_ror=sub { &ror(@_) }; { my $sn; sub align32() { ++$sn; $code.=<<___; jmp .Lalign32_$sn # see "Decoded ICache" in manual .align 32 .Lalign32_$sn: ___ } } $code.=<<___; .type sha1_block_data_order_ssse3,\@function,3 .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: .cfi_startproc mov %rsp,$fp # frame pointer .cfi_def_cfa_register $fp push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 # redundant, done to share Win64 SE handler .cfi_push %r13 push %r14 .cfi_push %r14 lea 
`-64-($win64?6*16:0)`(%rsp),%rsp
___
# Win64 only: store %xmm6-%xmm11 at fixed offsets from $fp and mark the end
# of the prologue with .Lprologue_ssse3.
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16($fp)
	movaps	%xmm7,-40-5*16($fp)
	movaps	%xmm8,-40-4*16($fp)
	movaps	%xmm9,-40-3*16($fp)
	movaps	%xmm10,-40-2*16($fp)
	movaps	%xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
# Align the stack, load the five context words, load and byte-swap the first
# 64 input bytes, and store X[]+K for the first three vectors at 0/16/32(%rsp).
$code.=<<___;
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument
	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX
	mov	0($ctx),$A		# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]		# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]
	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

# Catch-all for calls to undefined subs such as &pxor(...) or &movdqa(...).
# The called name (package prefix stripped) becomes the opcode. The LAST Perl
# argument is emitted first, prefixed with '$' when it is numeric (i.e. an
# immediate), followed by the remaining arguments in reverse order. The
# resulting line is appended to $code.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# Emit one SIMD message-schedule step interleaved with scalar round code.
# $body is a reference to a round generator (body_*); it is called four times
# and each returned string is eval'ed between the SIMD instructions below, so
# the position of every eval(shift(@insns)) determines the final instruction
# order. On exit $Xi is incremented and both @X and @Tx are rotated by one.
sub Xupdate_ssse3_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa	(@X[0],@X[-3&7]);
	 eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&movdqa	(@Tx[2],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));

	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&psrld	(@Tx[2],30);
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	 eval(shift(@insns));
	  &movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd (@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa	(@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Same idea as Xupdate_ssse3_16_31, for the later schedule steps. Some evals
# are conditional on $Xi or on whether the next pending round snippet matches
# /_ror/ or /_rol/, which keeps the rotates of the scalar code in fixed slots
# regardless of which body_* generator supplied the snippets. The K_XX_XX
# vector is either copied from @Tx[1] or reloaded when $Xi is a multiple of 5.
sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns))		if ($Xi==8);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	 eval(shift(@insns));		# ror
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));

	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns))		if (@insns[0] =~ /_ror/);

	&movdqa	(@Tx[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39

	&pslld	(@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psrld	(@Tx[0],30);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror

	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	 eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	  &pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa	(@Tx[1],@X[0])
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

# Final schedule step of a block: stores the last X[]+K vector, then emits
# "cmp $inp,$num; je .Ldone_ssse3". Everything emitted after the je starts the
# next block: reload of the pbswap mask and K_00_19, load of 64 input bytes,
# byte swap of the first vector and advance of $inp. $Xi is reset to 0.
sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &paddd	(@Tx[1],@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

# Rounds overlapped with preparation of the next block's input: byte-swap
# vector ($Xi-3)&7, add the constant held in @Tx[1] to vector ($Xi-4)&7, store
# the sum at 16*$Xi(%rsp), then subtract the constant again to restore X[].
sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

# Emit four rounds with no SIMD work interleaved (used after .Ldone_ssse3).
sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

# Round generators. Each call returns a list of strings; every string is Perl
# source that emits one or more instructions when eval'ed by the X* subs
# above. The last string also advances $j and rotates @V and @T. $rx counts
# calls: once it reaches 19 (resp. 39) the call is delegated to the next
# generator, so callers may keep passing the same generator across a boundary.
sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror ($b,$j?7:2)',	# $b>>>2
	'&xor (@T[0],$d)',
	'&mov (@T[1],$a)',	# $b for next round

	'&add ($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor ($b,$c)',	# $c^$d for next round

	'&$_rol ($a,5)',
	'&add ($e,@T[0])',
	'&and (@T[1],$b)',	# ($b&($c^$d)) for next round

	'&xor ($b,$c)',	# restore $b
	'&add ($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor (@T[0],$d) if($j==19);'.
	'&xor (@T[0],$c) if($j> 19)',	# ($b^$d^$c)
	'&mov (@T[1],$a)',	# $b for next round

	'&$_rol ($a,5)',
	'&add ($e,@T[0])',
	'&xor (@T[1],$c) if ($j< 79)',	# $b^$d for next round

	'&$_ror ($b,7)',	# $b>>>2
	'&add ($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add ($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and (@T[0],$c) if ($j>=40)',	# (b^c)&(c^d)
	'&xor ($c,$d) if ($j>=40)',	# restore $c

	'&$_ror ($b,7)',	# $b>>>2
	'&mov (@T[1],$a)',	# $b for next round
	'&xor (@T[0],$c)',

	'&$_rol ($a,5)',
	'&add ($e,@T[0])',
	'&xor (@T[1],$c) if ($j==59);'.
	'&xor (@T[1],$b) if ($j< 59)',	# b^c for next round

	'&xor ($b,$c) if ($j< 59)',	# c^d for next round
	'&add ($e,$a);'	.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}
$code.=<<___;
.align	16
.Loop_ssse3:
___
# Main SSSE3 loop body: 16 schedule steps (4 rounds each), the last of which
# may branch to .Ldone_ssse3, followed by 3 steps that prepare the next block.
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				# generator state is saved here so the .Ldone_ssse3
				# tail can replay the same remaining rounds
				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3
.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
&Xtail_ssse3(\&body_20_39); $code.=<<___; add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C mov $A,0($ctx) add 12($ctx),$D mov @T[0],4($ctx) add 16($ctx),$E mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) ___ $code.=<<___ if ($win64); movaps -40-6*16($fp),%xmm6 movaps -40-5*16($fp),%xmm7 movaps -40-4*16($fp),%xmm8 movaps -40-3*16($fp),%xmm9 movaps -40-2*16($fp),%xmm10 movaps -40-1*16($fp),%xmm11 ___ $code.=<<___; mov -40($fp),%r14 .cfi_restore %r14 mov -32($fp),%r13 .cfi_restore %r13 mov -24($fp),%r12 .cfi_restore %r12 mov -16($fp),%rbp .cfi_restore %rbp mov -8($fp),%rbx .cfi_restore %rbx lea ($fp),%rsp .cfi_def_cfa_register %rsp .Lepilogue_ssse3: ret .cfi_endproc .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 ___ if ($avx) { $Xi=4; # reset variables @X=map("%xmm$_",(4..7,0..3)); @Tx=map("%xmm$_",(8..10)); $j=0; $rx=0; my $done_avx_label=".Ldone_avx"; my $_rol=sub { &shld(@_[0],@_) }; my $_ror=sub { &shrd(@_[0],@_) }; $code.=<<___; .type sha1_block_data_order_avx,\@function,3 .align 16 sha1_block_data_order_avx: _avx_shortcut: .cfi_startproc mov %rsp,$fp .cfi_def_cfa_register $fp push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 # redundant, done to share Win64 SE handler .cfi_push %r13 push %r14 .cfi_push %r14 lea `-64-($win64?6*16:0)`(%rsp),%rsp vzeroupper ___ $code.=<<___ if ($win64); vmovaps %xmm6,-40-6*16($fp) vmovaps %xmm7,-40-5*16($fp) vmovaps %xmm8,-40-4*16($fp) vmovaps %xmm9,-40-3*16($fp) vmovaps %xmm10,-40-2*16($fp) vmovaps %xmm11,-40-1*16($fp) .Lprologue_avx: ___ $code.=<<___; and \$-64,%rsp mov %rdi,$ctx # reassigned argument mov %rsi,$inp # reassigned argument mov %rdx,$num # reassigned argument shl \$6,$num add $inp,$num lea K_XX_XX+64(%rip),$K_XX_XX mov 0($ctx),$A # load context mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E mov $C,@T[1] xor $D,@T[1] and @T[1],@T[0] vmovdqa 64($K_XX_XX),@X[2] # pbswap mask vmovdqa -64($K_XX_XX),$Kx # K_00_19 
vmovdqu	0($inp),@X[-4&7]		# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

# AVX flavour of the message-schedule step. $body is a reference to a round
# generator; it is called four times and the returned snippets are eval'ed
# one at a time between the vector instructions, which fixes the interleaving
# of vector and scalar code. The round constant lives in $Kx and is reloaded
# from the K_XX_XX table when $Xi is a multiple of 5. On exit $Xi is
# incremented and @X is rotated by one.
sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vmovdqa	($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));


	 foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

# AVX schedule step for the later rounds. One eval is skipped when the next
# pending snippet matches /&ro[rl]/.
# NOTE(review): the visible body_* generators emit '&$_ror'/'&$_rol', which
# that pattern does not match — confirm whether the condition is ever false.
sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	 eval(shift(@insns));
	 eval(shift(@insns))	if (@insns[0] !~ /&ro[rl]/);
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	  &vmovdqa	($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol

	&vpsrld	(@Tx[0],@X[0],30);
	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# ror
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));		# body_20_39
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));		# rol
	 eval(shift(@insns));

	 foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

# Last AVX schedule step of a block: stores the final X[]+K vector, emits
# "cmp $inp,$num; je $done_avx_label", then starts loading the next block
# (pbswap mask, K_00_19, 64 input bytes, first byte swap) and resets $Xi to 0.
sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	  &vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	  &vmovdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	 foreach (@insns) { eval; }		# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

# Rounds overlapped with preparation of the next block: byte-swap vector
# ($Xi-3)&7, add $Kx to vector ($Xi-4)&7 into @X[$Xi&7] and store the sum at
# 16*$Xi(%rsp).
sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	 eval(shift(@insns));
	 eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

# Emit four rounds with no vector work interleaved (used after the done label).
sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
# Main AVX loop body: 16 schedule steps (4 rounds each), the last of which may
# branch to the done label, followed by 3 steps that prepare the next block.
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				# generator state is saved here so the tail after
				# the done label can replay the remaining rounds
				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C add 12($ctx),$D mov $A,0($ctx) add 16($ctx),$E mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) mov $C,@T[1] mov $D,12($ctx) xor $D,@T[1] mov $E,16($ctx) and @T[1],@T[0] jmp .Loop_avx .align 16 $done_avx_label: ___ $j=$saved_j; @V=@saved_V; &Xtail_avx(\&body_20_39); &Xtail_avx(\&body_20_39); &Xtail_avx(\&body_20_39); $code.=<<___; vzeroupper add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C mov $A,0($ctx) add 12($ctx),$D mov @T[0],4($ctx) add 16($ctx),$E mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) ___ $code.=<<___ if ($win64); movaps -40-6*16($fp),%xmm6 movaps -40-5*16($fp),%xmm7 movaps -40-4*16($fp),%xmm8 movaps -40-3*16($fp),%xmm9 movaps -40-2*16($fp),%xmm10 movaps -40-1*16($fp),%xmm11 ___ $code.=<<___; mov -40($fp),%r14 .cfi_restore %r14 mov -32($fp),%r13 .cfi_restore %r13 mov -24($fp),%r12 .cfi_restore %r12 mov -16($fp),%rbp .cfi_restore %rbp mov -8($fp),%rbx .cfi_restore %rbx lea ($fp),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size sha1_block_data_order_avx,.-sha1_block_data_order_avx ___ if ($avx>1) { use integer; $Xi=4; # reset variables @X=map("%ymm$_",(4..7,0..3)); @Tx=map("%ymm$_",(8..10)); $Kx="%ymm11"; $j=0; my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi"); my ($a5,$t0)=("%r12d","%edi"); my ($A,$F,$B,$C,$D,$E)=@ROTX; my $rx=0; my $frame="%r13"; $code.=<<___; .type sha1_block_data_order_avx2,\@function,3 .align 16 sha1_block_data_order_avx2: _avx2_shortcut: .cfi_startproc mov %rsp,$fp .cfi_def_cfa_register $fp push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 vzeroupper ___ $code.=<<___ if ($win64); lea -6*16(%rsp),%rsp vmovaps %xmm6,-40-6*16($fp) vmovaps %xmm7,-40-5*16($fp) vmovaps %xmm8,-40-4*16($fp) vmovaps %xmm9,-40-3*16($fp) vmovaps %xmm10,-40-2*16($fp) vmovaps %xmm11,-40-1*16($fp) .Lprologue_avx2: ___ $code.=<<___; mov %rdi,$ctx # reassigned argument mov 
%rsi,$inp # reassigned argument mov %rdx,$num # reassigned argument lea -640(%rsp),%rsp shl \$6,$num lea 64($inp),$frame and \$-128,%rsp add $inp,$num lea K_XX_XX+64(%rip),$K_XX_XX mov 0($ctx),$A # load context cmp $num,$frame cmovae $inp,$frame # next or same block mov 4($ctx),$F mov 8($ctx),$C mov 12($ctx),$D mov 16($ctx),$E vmovdqu 64($K_XX_XX),@X[2] # pbswap mask vmovdqu ($inp),%xmm0 vmovdqu 16($inp),%xmm1 vmovdqu 32($inp),%xmm2 vmovdqu 48($inp),%xmm3 lea 64($inp),$inp vinserti128 \$1,($frame),@X[-4&7],@X[-4&7] vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7] vpshufb @X[2],@X[-4&7],@X[-4&7] vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7] vpshufb @X[2],@X[-3&7],@X[-3&7] vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7] vpshufb @X[2],@X[-2&7],@X[-2&7] vmovdqu -64($K_XX_XX),$Kx # K_00_19 vpshufb @X[2],@X[-1&7],@X[-1&7] vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 vpaddd $Kx,@X[-3&7],@X[1] vmovdqu @X[0],0(%rsp) # X[]+K xfer to IALU vpaddd $Kx,@X[-2&7],@X[2] vmovdqu @X[1],32(%rsp) vpaddd $Kx,@X[-1&7],@X[3] vmovdqu @X[2],64(%rsp) vmovdqu @X[3],96(%rsp) ___ for (;$Xi<8;$Xi++) { # Xupdate_avx2_16_31 use integer; &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" &vpsrld (@Tx[0],@X[0],31); &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword &vpaddd (@X[0],@X[0],@X[0]); &vpsrld (@Tx[1],@Tx[2],30); &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 &vpslld (@Tx[2],@Tx[2],2); &vpxor (@X[0],@X[0],@Tx[1]); &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 &vpaddd (@Tx[1],@X[0],$Kx); &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU push(@X,shift(@X)); # "rotate" X[] } $code.=<<___; lea 128(%rsp),$frame jmp .Loop_avx2 .align 32 .Loop_avx2: rorx \$2,$F,$B andn $D,$F,$t0 and $C,$F xor $t0,$F ___ 
# AVX2 round generators. Each call returns a list of strings; every string is
# Perl source that emits one or more instructions when eval'ed by the X* subs
# below. Register roles are taken from @ROTX, which is rotated (together with
# $j++) by the last string of each round. The X[i]+K operand is addressed as
# ((32*($j/4)+4*($j%4))%256-128)($frame), and $frame is advanced by 256 each
# time $j%32 reaches 31. $rx counts calls: when it reaches 19 (resp. 39) the
# call is delegated to the next generator.
sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea ($frame,"256($frame)") if ($j%32==31);',
	'&andn ($t0,$a,$c)',	# ~b&d for next round

	'&add ($e,$f)',	# e+=(b&c)^(~b&d)
	'&rorx ($a5,$a,27)',	# a<<<5
	'&rorx ($f,$a,2)',	# b>>>2 for next round
	'&and ($a,$b)',	# b&c for next round

	'&add ($e,$a5)',	# e+=a<<<5
	'&xor ($a,$t0);'.	# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea ($frame,"256($frame)") if ($j%32==31);',

	'&lea ($e,"($e,$f)")',	# e+=b^c^d
	'&rorx ($a5,$a,27)',	# a<<<5
	'&rorx ($f,$a,2) if ($j<79)',	# b>>>2 in next round
	'&xor ($a,$b) if ($j<79)',	# b^c for next round

	'&add ($e,$a5)',	# e+=a<<<5
	'&xor ($a,$c) if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea ($frame,"256($frame)") if ($j%32==31);',
	'&xor ($f,$c) if ($j>39)',	# (b^c)&(c^d)^c
	'&mov ($t0,$b) if ($j<59)',	# count on zero latency
	'&xor ($t0,$c) if ($j<59)',	# c^d for next round

	'&lea ($e,"($e,$f)")',	# e+=(b^c)&(c^d)^c
	'&rorx ($a5,$a,27)',	# a<<<5
	'&rorx ($f,$a,2)',	# b>>>2 in next round
	'&xor ($a,$b)',	# b^c for next round

	'&add ($e,$a5)',	# e+=a<<<5
	'&and ($a,$t0) if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor ($a,$c) if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

# AVX2 schedule step for the early part of a block. $body is called FIVE
# times here (not four, as in the SSSE3/AVX variants) and the snippets are
# eval'ed between the vector instructions. The finished X[]+K vector is
# stored at 32*$Xi(%rsp); $Xi is then incremented and @X rotated.
sub Xupdate_avx2_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions [if any]

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

# AVX2 schedule step for the later part of a block; same calling convention
# as Xupdate_avx2_16_31. The store offset is emitted as the literal text
# "32*$Xi(%rsp)", i.e. the multiplication is left for the assembler.
sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	#&vpslld	(@X[0],@X[0],2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vpaddd	(@Tx[1],@X[0],$Kx);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));

	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	 foreach (@insns) { eval; }	# remaining instructions

	$Xi++;
	push(@X,shift(@X));	# "rotate" X[]
}

# Emit five rounds with no vector work interleaved.
sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	 foreach (@insns) { eval; }
}

# First half of the AVX2 loop: 12 schedule steps and 4 plain steps of five
# rounds each (80 rounds in total).
	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi			# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame			# next or previous block
	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]		# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	 mov	@ROTX[0],$A			# A=d
	add	16($ctx),@ROTX[5]
	 mov	@ROTX[3],$a5
	mov
@ROTX[3],8($ctx) mov @ROTX[4],$D # D=b #xchg @ROTX[5],$F # F=c, C=f mov @ROTX[4],12($ctx) mov @ROTX[1],$F # F=e mov @ROTX[5],16($ctx) #mov $F,16($ctx) mov @ROTX[5],$E # E=c mov $a5,$C # C=f #xchg $F,$E # E=c, F=e cmp $num,$inp je .Ldone_avx2 ___ $Xi=4; # reset variables @X=map("%ymm$_",(4..7,0..3)); $code.=<<___; vmovdqu 64($K_XX_XX),@X[2] # pbswap mask cmp $num,%rdi # borrowed $t0 ja .Last_avx2 vmovdqu -64(%rdi),%xmm0 # low part of @X[-4&7] vmovdqu -48(%rdi),%xmm1 vmovdqu -32(%rdi),%xmm2 vmovdqu -16(%rdi),%xmm3 vinserti128 \$1,0($frame),@X[-4&7],@X[-4&7] vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7] vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7] vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7] jmp .Last_avx2 .align 32 .Last_avx2: lea 128+16(%rsp),$frame rorx \$2,$F,$B andn $D,$F,$t0 and $C,$F xor $t0,$F sub \$-128,$inp ___ $rx=$j=0; @ROTX=($A,$F,$B,$C,$D,$E); &Xloop_avx2 (\&bodyx_00_19); &Xloop_avx2 (\&bodyx_00_19); &Xloop_avx2 (\&bodyx_00_19); &Xloop_avx2 (\&bodyx_00_19); &Xloop_avx2 (\&bodyx_20_39); &vmovdqu ($Kx,"-64($K_XX_XX)"); # K_00_19 &vpshufb (@X[-4&7],@X[-4&7],@X[2]); # byte swap &Xloop_avx2 (\&bodyx_20_39); &vpshufb (@X[-3&7],@X[-3&7],@X[2]); &vpaddd (@Tx[0],@X[-4&7],$Kx); # add K_00_19 &Xloop_avx2 (\&bodyx_20_39); &vmovdqu ("0(%rsp)",@Tx[0]); &vpshufb (@X[-2&7],@X[-2&7],@X[2]); &vpaddd (@Tx[1],@X[-3&7],$Kx); &Xloop_avx2 (\&bodyx_20_39); &vmovdqu ("32(%rsp)",@Tx[1]); &vpshufb (@X[-1&7],@X[-1&7],@X[2]); &vpaddd (@X[2],@X[-2&7],$Kx); &Xloop_avx2 (\&bodyx_40_59); &align32 (); &vmovdqu ("64(%rsp)",@X[2]); &vpaddd (@X[3],@X[-1&7],$Kx); &Xloop_avx2 (\&bodyx_40_59); &vmovdqu ("96(%rsp)",@X[3]); &Xloop_avx2 (\&bodyx_40_59); &Xupdate_avx2_16_31(\&bodyx_40_59); &Xupdate_avx2_16_31(\&bodyx_20_39); &Xupdate_avx2_16_31(\&bodyx_20_39); &Xupdate_avx2_16_31(\&bodyx_20_39); &Xloop_avx2 (\&bodyx_20_39); $code.=<<___; lea 128(%rsp),$frame # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c add 0($ctx),@ROTX[0] # update context add 4($ctx),@ROTX[1] add 8($ctx),@ROTX[3] mov 
@ROTX[0],0($ctx) add 12($ctx),@ROTX[4] mov @ROTX[1],4($ctx) mov @ROTX[0],$A # A=d add 16($ctx),@ROTX[5] mov @ROTX[3],$a5 mov @ROTX[3],8($ctx) mov @ROTX[4],$D # D=b #xchg @ROTX[5],$F # F=c, C=f mov @ROTX[4],12($ctx) mov @ROTX[1],$F # F=e mov @ROTX[5],16($ctx) #mov $F,16($ctx) mov @ROTX[5],$E # E=c mov $a5,$C # C=f #xchg $F,$E # E=c, F=e cmp $num,$inp jbe .Loop_avx2 .Ldone_avx2: vzeroupper ___ $code.=<<___ if ($win64); movaps -40-6*16($fp),%xmm6 movaps -40-5*16($fp),%xmm7 movaps -40-4*16($fp),%xmm8 movaps -40-3*16($fp),%xmm9 movaps -40-2*16($fp),%xmm10 movaps -40-1*16($fp),%xmm11 ___ $code.=<<___; mov -40($fp),%r14 .cfi_restore %r14 mov -32($fp),%r13 .cfi_restore %r13 mov -24($fp),%r12 .cfi_restore %r12 mov -16($fp),%rbp .cfi_restore %rbp mov -8($fp),%rbx .cfi_restore %rbx lea ($fp),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx2: ret .cfi_endproc .size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 ___ } } $code.=<<___; .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 ___ }}} $code.=<<___; .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by " .align 64 ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent 
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail
	mov	152($context),%rax	# pull context->Rsp
	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail
	mov	`16*4`(%rax),%rax	# pull saved stack pointer
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

# Handler for the SHA-extension code path, emitted only when $shaext is set.
# Inside the [.Lprologue_shaext, .Lepilogue_shaext) range it copies 8 quadwords
# from the frame into context->Xmm6 onwards before the common tail runs.
$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail
	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail
	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

# Handler whose prologue/epilogue bounds come from HandlerData[] (two RVAs
# relative to disp->ImageBase) instead of fixed labels. If Rip is below the
# prologue label the frame is not set up yet and Rax is left as loaded from
# context->Rax. Otherwise the frame base is taken from context->R11
# (presumably the register the generated prologue uses as $fp; confirm
# against the function prologues). The "jb" and the R11 load had been lost to
# text-extraction damage and are restored here.
$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail
	mov	208($context),%rax	# pull context->R11
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	#
epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea -40-6*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$12,%ecx .long 0xa548f3fc # cld; rep movsq mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size ssse3_handler,.-ssse3_handler .section .pdata .align 4 .rva .LSEH_begin_sha1_block_data_order .rva .LSEH_end_sha1_block_data_order .rva .LSEH_info_sha1_block_data_order ___ $code.=<<___ if ($shaext); .rva .LSEH_begin_sha1_block_data_order_shaext .rva .LSEH_end_sha1_block_data_order_shaext .rva .LSEH_info_sha1_block_data_order_shaext ___ $code.=<<___; .rva .LSEH_begin_sha1_block_data_order_ssse3 .rva .LSEH_end_sha1_block_data_order_ssse3 .rva .LSEH_info_sha1_block_data_order_ssse3 ___ $code.=<<___ if ($avx); .rva .LSEH_begin_sha1_block_data_order_avx .rva 
.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
# .xdata: one unwind-info record per routine, each naming its language handler.
# The ssse3/avx/avx2 records share ssse3_handler and pass their own
# prologue/epilogue labels as HandlerData[].
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

# Hand-encode "sha1rnds4 $imm,%xmmS,%xmmD" as a .byte sequence.
#
# Argument: the operand text that followed the mnemonic.
# Returns:  ".byte\t0x0f,0x3a,0xcc,ModR/M,imm" (values joined as decimals by
#           join) when the operands are an immediate and two of %xmm0-%xmm7;
#           otherwise the instruction text unchanged, so the assembler sees
#           the original mnemonic.
# An immediate that starts with "0" is converted with oct(), which also
# handles the 0x... form; any other immediate is passed through as written.
sub sha1rnds4 {
    # $_[0] is a scalar element; the former one-element slice @_[0] yielded
    # the same value but triggers a "better written as $_[0]" warning.
    if ($_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);		# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".$_[0];
    }
}

# Hand-encode the 0F 38 opcode-map SHA-1 instructions (sha1nexte, sha1msg1,
# sha1msg2) with two %xmm register operands.
#
# Arguments: mnemonic, operand text.
# Returns:   a ".byte" line; a REX prefix (0x40 | bits) is prepended when
#            either register number is 8 or higher. For any other mnemonic or
#            operand form the instruction text is returned unchanged.
sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
      my $rex=0;
	$rex|=0x04			if ($2>=8);
	$rex|=0x01			if ($1>=8);
	unshift @opcode,0x40|$rex	if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}

# Post-process the generated text line by line: evaluate `...` expressions,
# replace SHA instructions with their .byte encodings, and print the result.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";
Index: head/crypto/openssl/crypto/sha/asm/sha256-586.pl
=================================================================== --- head/crypto/openssl/crypto/sha/asm/sha256-586.pl (revision 364821) +++ head/crypto/openssl/crypto/sha/asm/sha256-586.pl (revision 364822) @@ -1,1296 +1,1296 @@ #! /usr/bin/env perl # Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # SHA256 block transform for x86. September 2007. # # Performance improvement over compiler generated code varies from # 10% to 40% [see below]. Not very impressive on some µ-archs, but # it's 5 times smaller and optimizes amount of writes. # # May 2012. # # Optimization including two of Pavel Semjanov's ideas, alternative # Maj and full unroll, resulted in ~20-25% improvement on most CPUs, # ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost # 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not # on P4, where it kills performance, nor Sandy Bridge, where folded # loop is approximately as fast... # # June 2012. # # Add AMD XOP-specific code path, >30% improvement on Bulldozer over # May version, >60% over original. Add AVX+shrd code path, >25% # improvement on Sandy Bridge over May version, 60% over original. # # May 2013. # # Replace AMD XOP code path with SSSE3 to cover more processors. # (Biggest improvement coefficient is on upcoming Atom Silvermont, # not shown.) Add AVX+BMI code path. 
# # March 2014. # # Add support for Intel SHA Extensions. # # Performance in clock cycles per processed byte (less is better): # # gcc icc x86 asm(*) SIMD x86_64 asm(**) # Pentium 46 57 40/38 - - # PIII 36 33 27/24 - - # P4 41 38 28 - 17.3 # AMD K8 27 25 19/15.5 - 14.9 # Core2 26 23 18/15.6 14.3 13.8 # Westmere 27 - 19/15.7 13.4 12.3 # Sandy Bridge 25 - 15.9 12.4 11.6 # Ivy Bridge 24 - 15.0 11.4 10.3 # Haswell 22 - 13.9 9.46 7.80 # Skylake 20 - 14.9 9.50 7.70 # Bulldozer 36 - 27/22 17.0 13.6 # VIA Nano 36 - 25/22 16.8 16.5 # Atom 50 - 30/25 21.9 18.9 # Silvermont 40 - 34/31 22.9 20.6 # Goldmont 29 - 20 16.3(***) # # (*) numbers after slash are for unrolled loop, where applicable; # (**) x86_64 assembly performance is presented for reference # purposes, results are best-available; # (***) SHAEXT result is 4.1, strangely enough better than 64-bit one; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; $output=pop; open STDOUT,">$output"; &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $xmm=$avx=0; for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if ($xmm && !$avx && $ARGV[0] eq "win32n" && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.03) + ($1>=2.10); } if ($xmm && !$avx && $ARGV[0] eq "win32" && `ml 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) { +if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } $shaext=$xmm; ### set to zero if compiling for 1.0.1 $unroll_after = 64*4; # If pre-evicted from L1P cache first spin of # fully unrolled loop was measured to run about # 3-4x slower. 
# If slowdown coefficient is N and
# unrolled loop is m times faster, then you break
# even at (N-1)/(m-1) blocks. Then it needs to be
# adjusted for probability of code being evicted,
# code size/cache size=1/4. Typical m is 1.15...

# Register assignment and stack-frame slots used by the compact loop.
$A="eax";
$E="edx";
$T="ebx";
$Aoff=&DWP(4,"esp");
$Boff=&DWP(8,"esp");
$Coff=&DWP(12,"esp");
$Doff=&DWP(16,"esp");
$Eoff=&DWP(20,"esp");
$Foff=&DWP(24,"esp");
$Goff=&DWP(28,"esp");
$Hoff=&DWP(32,"esp");
$Xoff=&DWP(36,"esp");
$K256="ebp";

# Emits the message-schedule prefix for rounds 16..63: accumulates
# sigma0(X[-15]) + X[-16] + X[-7] in $T and leaves the two halves of
# sigma1(X[-2]) in edi/esi.  Expects "ecx" to be preloaded; the final
# xor/add/store of the schedule is deferred into BODY_00_15(1), which
# this sub calls last.
sub BODY_16_63() {
    &mov	($T,"ecx");			# "ecx" is preloaded
    &mov	("esi",&DWP(4*(9+15+16-14),"esp"));
    &ror	("ecx",18-7);
    &mov	("edi","esi");
    &ror	("esi",19-17);
    &xor	("ecx",$T);
    &shr	($T,3);
    &ror	("ecx",7);
    &xor	("esi","edi");
    &xor	($T,"ecx");			# T = sigma0(X[-15])
    &ror	("esi",17);
    &add	($T,&DWP(4*(9+15+16),"esp"));	# T += X[-16]
    &shr	("edi",10);
    &add	($T,&DWP(4*(9+15+16-9),"esp"));	# T += X[-7]
    #&xor	("edi","esi")			# sigma1(X[-2])
    # &add	($T,"edi");			# T += sigma1(X[-2])
    # &mov	(&DWP(4*(9+15),"esp"),$T);	# save X[0]

    &BODY_00_15(1);
}

# Emits one SHA-256 round.  The single optional argument is true when
# called from BODY_16_63: the round then finishes sigma1(X[-2]), stores
# the new X[0] and preloads "ecx" for the next schedule step; otherwise
# $T is loaded from the stack.  Each emitted round moves esp down by 4
# and advances $K256 by 4.
sub BODY_00_15() {
    my $in_16_63=shift;

    &mov	("ecx",$E);
    &xor	("edi","esi")			if ($in_16_63);	# sigma1(X[-2])
    &mov	("esi",$Foff);
    &ror	("ecx",25-11);
    &add	($T,"edi")			if ($in_16_63);	# T += sigma1(X[-2])
    &mov	("edi",$Goff);
    &xor	("ecx",$E);
    &xor	("esi","edi");
    &mov	($T,&DWP(4*(9+15),"esp"))	if (!$in_16_63);
    &mov	(&DWP(4*(9+15),"esp"),$T)	if ($in_16_63);	# save X[0]
    &ror	("ecx",11-6);
    &and	("esi",$E);
    &mov	($Eoff,$E);			# modulo-scheduled
    &xor	($E,"ecx");
    &add	($T,$Hoff);			# T += h
    &xor	("esi","edi");			# Ch(e,f,g)
    &ror	($E,6);				# Sigma1(e)
    &mov	("ecx",$A);
    &add	($T,"esi");			# T += Ch(e,f,g)

    &ror	("ecx",22-13);
    &add	($T,$E);			# T += Sigma1(e)
    &mov	("edi",$Boff);
    &xor	("ecx",$A);
    &mov	($Aoff,$A);			# modulo-scheduled
    &lea	("esp",&DWP(-4,"esp"));		# slide the frame down one dword
    &ror	("ecx",13-2);
    &mov	("esi",&DWP(0,$K256));
    &xor	("ecx",$A);
    &mov	($E,$Eoff);			# e in next iteration, d in this one
    &xor	($A,"edi");			# a ^= b
    &ror	("ecx",2);			# Sigma0(a)

    &add	($T,"esi");			# T+= K[i]
    &mov	(&DWP(0,"esp"),$A);		# (b^c) in next round
    &add	($E,$T);			# d += T
    &and	($A,&DWP(4,"esp"));		# a &= (b^c)
    &add	($T,"ecx");			# T += Sigma0(a)
    &xor	($A,"edi");			# h = Maj(a,b,c) = Ch(a^b,c,b)
    &mov	("ecx",&DWP(4*(9+15+16-1),"esp"))	if ($in_16_63);	# preload T
    &add	($K256,4);
    &add	($A,$T);			# h += T
}

&external_label("OPENSSL_ia32cap_P")		if (!$i386);

# Entry point: loads the three arguments, makes $K256 point at the K256
# table position-independently, builds a 64-byte-aligned 16-byte frame
# {ctx, inp, end-of-input, saved sp} and dispatches on CPU capabilities.
&function_begin("sha256_block_data_order");
    &mov	("esi",wparam(0));	# ctx
    &mov	("edi",wparam(1));	# inp
    &mov	("eax",wparam(2));	# num
    &mov	("ebx","esp");		# saved sp

    &call	(&label("pic_point"));	# make it PIC!
&set_label("pic_point");
    &blindpop($K256);
    &lea	($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256));

    &sub	("esp",16);
    &and	("esp",-64);

    &shl	("eax",6);		# num blocks -> bytes (64 per block)
    &add	("eax","edi");
    &mov	(&DWP(0,"esp"),"esi");	# ctx
    &mov	(&DWP(4,"esp"),"edi");	# inp
    &mov	(&DWP(8,"esp"),"eax");	# inp+num*64
    &mov	(&DWP(12,"esp"),"ebx");	# saved sp
						if (!$i386 && $xmm) {
    &picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256"));
    &mov	("ecx",&DWP(0,"edx"));
    &mov	("ebx",&DWP(4,"edx"));
    &test	("ecx",1<<20);		# check for P4
    &jnz	(&label("loop"));
    &mov	("edx",&DWP(8,"edx"))	if ($xmm);
    &test	("ecx",1<<24);		# check for FXSR
    &jz	($unroll_after?&label("no_xmm"):&label("loop"));
    &and	("ecx",1<<30);		# mask "Intel CPU" bit
    &and	("ebx",1<<28|1<<9);	# mask AVX and SSSE3 bits
    &test	("edx",1<<29)		if ($shaext);	# check for SHA
    &jnz	(&label("shaext"))	if ($shaext);
    &or	("ecx","ebx");
    &and	("ecx",1<<28|1<<30);
    &cmp	("ecx",1<<28|1<<30);
					if ($xmm) {
    &je	(&label("AVX"))		if ($avx);
    &test	("ebx",1<<9);		# check for SSSE3
    &jnz	(&label("SSSE3"));
					} else {
    # not reachable here: the enclosing condition already requires $xmm
    &je	(&label("loop_shrd"));
					}
						if ($unroll_after) {
&set_label("no_xmm");
    &sub	("eax","edi");		# eax = input length in bytes
    &cmp	("eax",$unroll_after);
    &jae	(&label("unrolled"));
						} }
    &jmp	(&label("loop"));

# Emits the compact (rolled) block loop.  The optional $suffix is appended
# to the loop labels; the loop head is aligned to 32 when a suffix is
# given and to 16 otherwise.
sub COMPACT_LOOP() {
my $suffix=shift;

&set_label("loop$suffix",$suffix?32:16);
    # copy input block to stack reversing byte and dword order
    for($i=0;$i<4;$i++) {
    &mov	("eax",&DWP($i*16+0,"edi"));
    &mov	("ebx",&DWP($i*16+4,"edi"));
    &mov	("ecx",&DWP($i*16+8,"edi"));
    &bswap	("eax");
    &mov
("edx",&DWP($i*16+12,"edi")); &bswap ("ebx"); &push ("eax"); &bswap ("ecx"); &push ("ebx"); &bswap ("edx"); &push ("ecx"); &push ("edx"); } &add ("edi",64); &lea ("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H &mov (&DWP(4*(9+16)+4,"esp"),"edi"); # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack &mov ($A,&DWP(0,"esi")); &mov ("ebx",&DWP(4,"esi")); &mov ("ecx",&DWP(8,"esi")); &mov ("edi",&DWP(12,"esi")); # &mov ($Aoff,$A); &mov ($Boff,"ebx"); &xor ("ebx","ecx"); &mov ($Coff,"ecx"); &mov ($Doff,"edi"); &mov (&DWP(0,"esp"),"ebx"); # magic &mov ($E,&DWP(16,"esi")); &mov ("ebx",&DWP(20,"esi")); &mov ("ecx",&DWP(24,"esi")); &mov ("edi",&DWP(28,"esi")); # &mov ($Eoff,$E); &mov ($Foff,"ebx"); &mov ($Goff,"ecx"); &mov ($Hoff,"edi"); &set_label("00_15$suffix",16); &BODY_00_15(); &cmp ("esi",0xc19bf174); &jne (&label("00_15$suffix")); &mov ("ecx",&DWP(4*(9+15+16-1),"esp")); # preloaded in BODY_00_15(1) &jmp (&label("16_63$suffix")); &set_label("16_63$suffix",16); &BODY_16_63(); &cmp ("esi",0xc67178f2); &jne (&label("16_63$suffix")); &mov ("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx # &mov ($A,$Aoff); &mov ("ebx",$Boff); # &mov ("edi",$Coff); &mov ("ecx",$Doff); &add ($A,&DWP(0,"esi")); &add ("ebx",&DWP(4,"esi")); &add ("edi",&DWP(8,"esi")); &add ("ecx",&DWP(12,"esi")); &mov (&DWP(0,"esi"),$A); &mov (&DWP(4,"esi"),"ebx"); &mov (&DWP(8,"esi"),"edi"); &mov (&DWP(12,"esi"),"ecx"); # &mov ($E,$Eoff); &mov ("eax",$Foff); &mov ("ebx",$Goff); &mov ("ecx",$Hoff); &mov ("edi",&DWP(4*(9+16+64)+4,"esp"));#inp &add ($E,&DWP(16,"esi")); &add ("eax",&DWP(20,"esi")); &add ("ebx",&DWP(24,"esi")); &add ("ecx",&DWP(28,"esi")); &mov (&DWP(16,"esi"),$E); &mov (&DWP(20,"esi"),"eax"); &mov (&DWP(24,"esi"),"ebx"); &mov (&DWP(28,"esi"),"ecx"); &lea ("esp",&DWP(4*(9+16+64),"esp"));# destroy frame &sub ($K256,4*64); # rewind K &cmp ("edi",&DWP(8,"esp")); # are we done yet? 
&jb (&label("loop$suffix")); } &COMPACT_LOOP(); &mov ("esp",&DWP(12,"esp")); # restore sp &function_end_A(); if (!$i386 && !$xmm) { # ~20% improvement on Sandy Bridge local *ror = sub { &shrd(@_[0],@_) }; &COMPACT_LOOP("_shrd"); &mov ("esp",&DWP(12,"esp")); # restore sp &function_end_A(); } &set_label("K256",64); # Yes! I keep it in the code segment! @K256=( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); &data_word(@K256); &data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # byte swap mask &asciz("SHA256 block transform for x86, CRYPTOGAMS by "); ($a,$b,$c,$d,$e,$f,$g,$h)=(0..7); # offsets sub off { &DWP(4*(((shift)-$i)&7),"esp"); } if (!$i386 && $unroll_after) { my @AH=($A,$K256); &set_label("unrolled",16); &lea ("esp",&DWP(-96,"esp")); # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack &mov ($AH[0],&DWP(0,"esi")); &mov ($AH[1],&DWP(4,"esi")); &mov ("ecx",&DWP(8,"esi")); &mov ("ebx",&DWP(12,"esi")); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"ecx"); # magic &mov (&DWP(8,"esp"),"ecx"); &mov (&DWP(12,"esp"),"ebx"); &mov ($E,&DWP(16,"esi")); &mov ("ebx",&DWP(20,"esi")); &mov ("ecx",&DWP(24,"esi")); &mov ("esi",&DWP(28,"esi")); #&mov (&DWP(16,"esp"),$E); &mov (&DWP(20,"esp"),"ebx"); &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esp"),"esi"); &jmp (&label("grand_loop")); 
&set_label("grand_loop",16); # copy input block to stack reversing byte order for($i=0;$i<5;$i++) { &mov ("ebx",&DWP(12*$i+0,"edi")); &mov ("ecx",&DWP(12*$i+4,"edi")); &bswap ("ebx"); &mov ("esi",&DWP(12*$i+8,"edi")); &bswap ("ecx"); &mov (&DWP(32+12*$i+0,"esp"),"ebx"); &bswap ("esi"); &mov (&DWP(32+12*$i+4,"esp"),"ecx"); &mov (&DWP(32+12*$i+8,"esp"),"esi"); } &mov ("ebx",&DWP($i*12,"edi")); &add ("edi",64); &bswap ("ebx"); &mov (&DWP(96+4,"esp"),"edi"); &mov (&DWP(32+12*$i,"esp"),"ebx"); my ($t1,$t2) = ("ecx","esi"); for ($i=0;$i<64;$i++) { if ($i>=16) { &mov ($T,$t1); # $t1 is preloaded # &mov ($t2,&DWP(32+4*(($i+14)&15),"esp")); &ror ($t1,18-7); &mov ("edi",$t2); &ror ($t2,19-17); &xor ($t1,$T); &shr ($T,3); &ror ($t1,7); &xor ($t2,"edi"); &xor ($T,$t1); # T = sigma0(X[-15]) &ror ($t2,17); &add ($T,&DWP(32+4*($i&15),"esp")); # T += X[-16] &shr ("edi",10); &add ($T,&DWP(32+4*(($i+9)&15),"esp")); # T += X[-7] #&xor ("edi",$t2) # sigma1(X[-2]) # &add ($T,"edi"); # T += sigma1(X[-2]) # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] } &mov ($t1,$E); &xor ("edi",$t2) if ($i>=16); # sigma1(X[-2]) &mov ($t2,&off($f)); &ror ($E,25-11); &add ($T,"edi") if ($i>=16); # T += sigma1(X[-2]) &mov ("edi",&off($g)); &xor ($E,$t1); &mov ($T,&DWP(32+4*($i&15),"esp")) if ($i<16); # X[i] &mov (&DWP(32+4*($i&15),"esp"),$T) if ($i>=16 && $i<62); # save X[0] &xor ($t2,"edi"); &ror ($E,11-6); &and ($t2,$t1); &mov (&off($e),$t1); # save $E, modulo-scheduled &xor ($E,$t1); &add ($T,&off($h)); # T += h &xor ("edi",$t2); # Ch(e,f,g) &ror ($E,6); # Sigma1(e) &mov ($t1,$AH[0]); &add ($T,"edi"); # T += Ch(e,f,g) &ror ($t1,22-13); &mov ($t2,$AH[0]); &mov ("edi",&off($b)); &xor ($t1,$AH[0]); &mov (&off($a),$AH[0]); # save $A, modulo-scheduled &xor ($AH[0],"edi"); # a ^= b, (b^c) in next round &ror ($t1,13-2); &and ($AH[1],$AH[0]); # (b^c) &= (a^b) &lea ($E,&DWP(@K256[$i],$T,$E)); # T += Sigma1(1)+K[i] &xor ($t1,$t2); &xor ($AH[1],"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) &mov 
($t2,&DWP(32+4*(($i+2)&15),"esp")) if ($i>=15 && $i<63); &ror ($t1,2); # Sigma0(a) &add ($AH[1],$E); # h += T &add ($E,&off($d)); # d += T &add ($AH[1],$t1); # h += Sigma0(a) &mov ($t1,&DWP(32+4*(($i+15)&15),"esp")) if ($i>=15 && $i<63); @AH = reverse(@AH); # rotate(a,h) ($t1,$t2) = ($t2,$t1); # rotate(t1,t2) } &mov ("esi",&DWP(96,"esp")); #ctx #&mov ($AH[0],&DWP(0,"esp")); &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); #&mov ("edi", &DWP(8,"esp")); &mov ("ecx",&DWP(12,"esp")); &add ($AH[0],&DWP(0,"esi")); &add ($AH[1],&DWP(4,"esi")); &add ("edi",&DWP(8,"esi")); &add ("ecx",&DWP(12,"esi")); &mov (&DWP(0,"esi"),$AH[0]); &mov (&DWP(4,"esi"),$AH[1]); &mov (&DWP(8,"esi"),"edi"); &mov (&DWP(12,"esi"),"ecx"); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"edi"); # magic &mov (&DWP(8,"esp"),"edi"); &mov (&DWP(12,"esp"),"ecx"); #&mov ($E,&DWP(16,"esp")); &mov ("edi",&DWP(20,"esp")); &mov ("ebx",&DWP(24,"esp")); &mov ("ecx",&DWP(28,"esp")); &add ($E,&DWP(16,"esi")); &add ("edi",&DWP(20,"esi")); &add ("ebx",&DWP(24,"esi")); &add ("ecx",&DWP(28,"esi")); &mov (&DWP(16,"esi"),$E); &mov (&DWP(20,"esi"),"edi"); &mov (&DWP(24,"esi"),"ebx"); &mov (&DWP(28,"esi"),"ecx"); #&mov (&DWP(16,"esp"),$E); &mov (&DWP(20,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &mov (&DWP(24,"esp"),"ebx"); &mov (&DWP(28,"esp"),"ecx"); &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? &jb (&label("grand_loop")); &mov ("esp",&DWP(96+12,"esp")); # restore sp &function_end_A(); } if (!$i386 && $xmm) {{{ if ($shaext) { ###################################################################### # Intel SHA Extensions implementation of SHA256 update function. 
#
my ($ctx,$inp,$end)=("esi","edi","eax");
my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7));
my @MSG=map("xmm$_",(3..6));

# sha256op38(OPCODELET, DST, SRC)
#
# Hand-assembles a register-to-register SHA256 instruction as raw bytes
# 0x0f,0x38,OPCODELET,ModR/M with DST in the reg field and SRC in the r/m
# field.  Only xmm0-xmm7 operands are recognised; for anything else
# nothing is emitted.
sub sha256op38 {
 my ($opcodelet,$dst,$src)=@_;
    if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {	&data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);	}
}
# Thin wrappers selecting the opcode byte for each SHA256 instruction.
sub sha256rnds2	{ sha256op38(0xcb,@_); }
sub sha256msg1	{ sha256op38(0xcc,@_); }
sub sha256msg2	{ sha256op38(0xcd,@_); }

&set_label("shaext",32);
    &sub		("esp",32);		# two 16-byte slots for the saved state

    &movdqu		($ABEF,&QWP(0,$ctx));		# DCBA
    &lea		($K256,&DWP(0x80,$K256));	# bias pointer; offsets below are -0x80
    &movdqu		($CDGH,&QWP(16,$ctx));		# HGFE
    &movdqa		($TMP,&QWP(0x100-0x80,$K256));	# byte swap mask

    &pshufd		($Wi,$ABEF,0x1b);		# ABCD
    &pshufd		($ABEF,$ABEF,0xb1);		# CDAB
    &pshufd		($CDGH,$CDGH,0x1b);		# EFGH
    &palignr	($ABEF,$CDGH,8);		# ABEF
    &punpcklqdq	($CDGH,$Wi);			# CDGH
    &jmp		(&label("loop_shaext"));

&set_label("loop_shaext",16);
    &movdqu		(@MSG[0],&QWP(0,$inp));
    &movdqu		(@MSG[1],&QWP(0x10,$inp));
    &movdqu		(@MSG[2],&QWP(0x20,$inp));
    &pshufb		(@MSG[0],$TMP);
    &movdqu		(@MSG[3],&QWP(0x30,$inp));
    &movdqa		(&QWP(16,"esp"),$CDGH);		# offload

    &movdqa		($Wi,&QWP(0*16-0x80,$K256));
    &paddd		($Wi,@MSG[0]);
    &pshufb		(@MSG[1],$TMP);
    &sha256rnds2	($CDGH,$ABEF);			# 0-3
    &pshufd		($Wi,$Wi,0x0e);
    &nop		();
    &movdqa		(&QWP(0,"esp"),$ABEF);		# offload
    &sha256rnds2	($ABEF,$CDGH);

    &movdqa		($Wi,&QWP(1*16-0x80,$K256));
    &paddd		($Wi,@MSG[1]);
    &pshufb		(@MSG[2],$TMP);
    &sha256rnds2	($CDGH,$ABEF);			# 4-7
    &pshufd		($Wi,$Wi,0x0e);
    &lea		($inp,&DWP(0x40,$inp));
    &sha256msg1	(@MSG[0],@MSG[1]);
    &sha256rnds2	($ABEF,$CDGH);

    &movdqa		($Wi,&QWP(2*16-0x80,$K256));
    &paddd		($Wi,@MSG[2]);
    &pshufb		(@MSG[3],$TMP);
    &sha256rnds2	($CDGH,$ABEF);			# 8-11
    &pshufd		($Wi,$Wi,0x0e);
    &movdqa		($TMP,@MSG[3]);
    &palignr	($TMP,@MSG[2],4);
    &nop		();
    &paddd		(@MSG[0],$TMP);
    &sha256msg1	(@MSG[1],@MSG[2]);
    &sha256rnds2	($ABEF,$CDGH);

    &movdqa		($Wi,&QWP(3*16-0x80,$K256));
    &paddd		($Wi,@MSG[3]);
    &sha256msg2	(@MSG[0],@MSG[3]);
    &sha256rnds2	($CDGH,$ABEF);			# 12-15
    &pshufd		($Wi,$Wi,0x0e);
    &movdqa		($TMP,@MSG[0]);
    &palignr	($TMP,@MSG[3],4);
    &nop		();
    &paddd		(@MSG[1],$TMP);
    &sha256msg1	(@MSG[2],@MSG[3]);
    &sha256rnds2	($ABEF,$CDGH);

# nine identical 4-round groups; @MSG is rotated after each one
for($i=4;$i<16-3;$i++) {
    &movdqa		($Wi,&QWP($i*16-0x80,$K256));
    &paddd		($Wi,@MSG[0]);
    &sha256msg2	(@MSG[1],@MSG[0]);
    &sha256rnds2	($CDGH,$ABEF);			# 16-19...
    &pshufd		($Wi,$Wi,0x0e);
    &movdqa		($TMP,@MSG[1]);
    &palignr	($TMP,@MSG[0],4);
    &nop		();
    &paddd		(@MSG[2],$TMP);
    &sha256msg1	(@MSG[3],@MSG[0]);
    &sha256rnds2	($ABEF,$CDGH);

    push(@MSG,shift(@MSG));
}
    &movdqa		($Wi,&QWP(13*16-0x80,$K256));
    &paddd		($Wi,@MSG[0]);
    &sha256msg2	(@MSG[1],@MSG[0]);
    &sha256rnds2	($CDGH,$ABEF);			# 52-55
    &pshufd		($Wi,$Wi,0x0e);
    # The statement terminator was missing here, which made this call and
    # the following &palignr one bitwise-AND expression.
    &movdqa		($TMP,@MSG[1]);
    &palignr	($TMP,@MSG[0],4);
    &sha256rnds2	($ABEF,$CDGH);
    &paddd		(@MSG[2],$TMP);

    &movdqa		($Wi,&QWP(14*16-0x80,$K256));
    &paddd		($Wi,@MSG[1]);
    &sha256rnds2	($CDGH,$ABEF);			# 56-59
    &pshufd		($Wi,$Wi,0x0e);
    &sha256msg2	(@MSG[2],@MSG[1]);
    &movdqa		($TMP,&QWP(0x100-0x80,$K256));	# byte swap mask
    &sha256rnds2	($ABEF,$CDGH);

    &movdqa		($Wi,&QWP(15*16-0x80,$K256));
    &paddd		($Wi,@MSG[2]);
    &nop		();
    &sha256rnds2	($CDGH,$ABEF);			# 60-63
    &pshufd		($Wi,$Wi,0x0e);
    &cmp		($end,$inp);
    &nop		();
    &sha256rnds2	($ABEF,$CDGH);

    # add the state saved at the top of the block
    &paddd		($CDGH,&QWP(16,"esp"));
    &paddd		($ABEF,&QWP(0,"esp"));
    &jnz		(&label("loop_shaext"));

    &pshufd		($CDGH,$CDGH,0xb1);		# DCHG
    &pshufd		($TMP,$ABEF,0x1b);		# FEBA
    &pshufd		($ABEF,$ABEF,0xb1);		# BAFE
    &punpckhqdq	($ABEF,$CDGH);			# DCBA
    &palignr	($CDGH,$TMP,8);			# HGFE

    &mov		("esp",&DWP(32+12,"esp"));
    &movdqu		(&QWP(0,$ctx),$ABEF);
    &movdqu		(&QWP(16,$ctx),$CDGH);
&function_end_A();
}

my @X = map("xmm$_",(0..3));
my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7));
my @AH = ($A,$T);

&set_label("SSSE3",32);
    &lea	("esp",&DWP(-96,"esp"));
    # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
    &mov	($AH[0],&DWP(0,"esi"));
    &mov	($AH[1],&DWP(4,"esi"));
    &mov	("ecx",&DWP(8,"esi"));
    &mov	("edi",&DWP(12,"esi"));
    #&mov	(&DWP(0,"esp"),$AH[0]);
    &mov	(&DWP(4,"esp"),$AH[1]);
    &xor	($AH[1],"ecx");			# magic
    &mov	(&DWP(8,"esp"),"ecx");
    &mov	(&DWP(12,"esp"),"edi");
    &mov	($E,&DWP(16,"esi"));
    &mov	("edi",&DWP(20,"esi"));
    &mov	("ecx",&DWP(24,"esi"));
    &mov
("esi",&DWP(28,"esi")); #&mov (&DWP(16,"esp"),$E); &mov (&DWP(20,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esp"),"esi"); &movdqa ($t3,&QWP(256,$K256)); &jmp (&label("grand_ssse3")); &set_label("grand_ssse3",16); # load input, reverse byte order, add K256[0..15], save to stack &movdqu (@X[0],&QWP(0,"edi")); &movdqu (@X[1],&QWP(16,"edi")); &movdqu (@X[2],&QWP(32,"edi")); &movdqu (@X[3],&QWP(48,"edi")); &add ("edi",64); &pshufb (@X[0],$t3); &mov (&DWP(96+4,"esp"),"edi"); &pshufb (@X[1],$t3); &movdqa ($t0,&QWP(0,$K256)); &pshufb (@X[2],$t3); &movdqa ($t1,&QWP(16,$K256)); &paddd ($t0,@X[0]); &pshufb (@X[3],$t3); &movdqa ($t2,&QWP(32,$K256)); &paddd ($t1,@X[1]); &movdqa ($t3,&QWP(48,$K256)); &movdqa (&QWP(32+0,"esp"),$t0); &paddd ($t2,@X[2]); &movdqa (&QWP(32+16,"esp"),$t1); &paddd ($t3,@X[3]); &movdqa (&QWP(32+32,"esp"),$t2); &movdqa (&QWP(32+48,"esp"),$t3); &jmp (&label("ssse3_00_47")); &set_label("ssse3_00_47",16); &add ($K256,64); sub SSSE3_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 120 instructions eval(shift(@insns)); &movdqa ($t0,@X[1]); eval(shift(@insns)); # @ eval(shift(@insns)); &movdqa ($t3,@X[3]); eval(shift(@insns)); eval(shift(@insns)); &palignr ($t0,@X[0],4); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); # @ eval(shift(@insns)); &palignr ($t3,@X[2],4); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t1,$t0); eval(shift(@insns)); # @ eval(shift(@insns)); &movdqa ($t2,$t0); eval(shift(@insns)); eval(shift(@insns)); &psrld ($t0,3); eval(shift(@insns)); eval(shift(@insns)); # @ &paddd (@X[0],$t3); # X[0..3] += X[9..12] eval(shift(@insns)); eval(shift(@insns)); &psrld ($t2,7); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ eval(shift(@insns)); &pshufd ($t3,@X[3],0b11111010); # X[14..15] eval(shift(@insns)); eval(shift(@insns)); &pslld ($t1,32-18); eval(shift(@insns)); eval(shift(@insns)); 
# @ &pxor ($t0,$t2); eval(shift(@insns)); eval(shift(@insns)); &psrld ($t2,18-7); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t0,$t1); eval(shift(@insns)); eval(shift(@insns)); &pslld ($t1,18-7); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t0,$t2); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t2,$t3); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t0,$t1); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &psrld ($t3,10); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &psrlq ($t2,17); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); &psrlq ($t2,19-17); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); &pshufd ($t3,$t3,0b10000000); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ eval(shift(@insns)); &psrldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); # @ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ eval(shift(@insns)); &pshufd ($t3,@X[0],0b01010000); # X[16..17] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t2,$t3); eval(shift(@insns)); # @ &psrld ($t3,10); eval(shift(@insns)); &psrlq ($t2,17); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); &psrlq ($t2,19-17); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); 
eval(shift(@insns)); &pshufd ($t3,$t3,0b00001000); eval(shift(@insns)); eval(shift(@insns)); # @ &movdqa ($t2,&QWP(16*$j,$K256)); eval(shift(@insns)); eval(shift(@insns)); &pslldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd ($t2,@X[0]); eval(shift(@insns)); # @ foreach (@insns) { eval; } # remaining instructions &movdqa (&QWP(32+16*$j,"esp"),$t2); } sub body_00_15 () { ( '&mov ("ecx",$E);', '&ror ($E,25-11);', '&mov ("esi",&off($f));', '&xor ($E,"ecx");', '&mov ("edi",&off($g));', '&xor ("esi","edi");', '&ror ($E,11-6);', '&and ("esi","ecx");', '&mov (&off($e),"ecx");', # save $E, modulo-scheduled '&xor ($E,"ecx");', '&xor ("edi","esi");', # Ch(e,f,g) '&ror ($E,6);', # T = Sigma1(e) '&mov ("ecx",$AH[0]);', '&add ($E,"edi");', # T += Ch(e,f,g) '&mov ("edi",&off($b));', '&mov ("esi",$AH[0]);', '&ror ("ecx",22-13);', '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled '&xor ("ecx",$AH[0]);', '&xor ($AH[0],"edi");', # a ^= b, (b^c) in next round '&add ($E,&off($h));', # T += h '&ror ("ecx",13-2);', '&and ($AH[1],$AH[0]);', # (b^c) &= (a^b) '&xor ("ecx","esi");', '&add ($E,&DWP(32+4*($i&15),"esp"));', # T += K[i]+X[i] '&xor ($AH[1],"edi");', # h = Maj(a,b,c) = Ch(a^b,c,b) '&ror ("ecx",2);', # Sigma0(a) '&add ($AH[1],$E);', # h += T '&add ($E,&off($d));', # d += T '&add ($AH[1],"ecx");'. 
# h += Sigma0(a) '@AH = reverse(@AH); $i++;' # rotate(a,h) ); } for ($i=0,$j=0; $j<4; $j++) { &SSSE3_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmp (&DWP(16*$j,$K256),0x00010203); &jne (&label("ssse3_00_47")); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } &mov ("esi",&DWP(96,"esp")); #ctx #&mov ($AH[0],&DWP(0,"esp")); &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); #&mov ("edi", &DWP(8,"esp")); &mov ("ecx",&DWP(12,"esp")); &add ($AH[0],&DWP(0,"esi")); &add ($AH[1],&DWP(4,"esi")); &add ("edi",&DWP(8,"esi")); &add ("ecx",&DWP(12,"esi")); &mov (&DWP(0,"esi"),$AH[0]); &mov (&DWP(4,"esi"),$AH[1]); &mov (&DWP(8,"esi"),"edi"); &mov (&DWP(12,"esi"),"ecx"); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"edi"); # magic &mov (&DWP(8,"esp"),"edi"); &mov (&DWP(12,"esp"),"ecx"); #&mov ($E,&DWP(16,"esp")); &mov ("edi",&DWP(20,"esp")); &mov ("ecx",&DWP(24,"esp")); &add ($E,&DWP(16,"esi")); &add ("edi",&DWP(20,"esi")); &add ("ecx",&DWP(24,"esi")); &mov (&DWP(16,"esi"),$E); &mov (&DWP(20,"esi"),"edi"); &mov (&DWP(20,"esp"),"edi"); &mov ("edi",&DWP(28,"esp")); &mov (&DWP(24,"esi"),"ecx"); #&mov (&DWP(16,"esp"),$E); &add ("edi",&DWP(28,"esi")); &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esi"),"edi"); &mov (&DWP(28,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &movdqa ($t3,&QWP(64,$K256)); &sub ($K256,3*64); # rewind K &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 
    &jb	(&label("grand_ssse3"));

    &mov	("esp",&DWP(96+12,"esp"));	# restore sp
&function_end_A();
						if ($avx) {
# AVX code path.  When $avx>1 the emitted code first tests the two
# feature bits in edx and branches to the AVX_BMI variant if both are set.
&set_label("AVX",32);
						if ($avx>1) {
    &and	("edx",1<<8|1<<3);		# check for BMI2+BMI1
    &cmp	("edx",1<<8|1<<3);
    &je	(&label("AVX_BMI"));
						}
    &lea	("esp",&DWP(-96,"esp"));
    &vzeroall	();
    # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack
    &mov	($AH[0],&DWP(0,"esi"));
    &mov	($AH[1],&DWP(4,"esi"));
    &mov	("ecx",&DWP(8,"esi"));
    &mov	("edi",&DWP(12,"esi"));
    #&mov	(&DWP(0,"esp"),$AH[0]);
    &mov	(&DWP(4,"esp"),$AH[1]);
    &xor	($AH[1],"ecx");			# magic
    &mov	(&DWP(8,"esp"),"ecx");
    &mov	(&DWP(12,"esp"),"edi");
    &mov	($E,&DWP(16,"esi"));
    &mov	("edi",&DWP(20,"esi"));
    &mov	("ecx",&DWP(24,"esi"));
    &mov	("esi",&DWP(28,"esi"));
    #&mov	(&DWP(16,"esp"),$E);
    &mov	(&DWP(20,"esp"),"edi");
    &mov	("edi",&DWP(96+4,"esp"));	# inp
    &mov	(&DWP(24,"esp"),"ecx");
    &mov	(&DWP(28,"esp"),"esi");
    &vmovdqa	($t3,&QWP(256,$K256));
    &jmp	(&label("grand_avx"));

&set_label("grand_avx",32);
    # load input, reverse byte order, add K256[0..15], save to stack
    &vmovdqu	(@X[0],&QWP(0,"edi"));
    &vmovdqu	(@X[1],&QWP(16,"edi"));
    &vmovdqu	(@X[2],&QWP(32,"edi"));
    &vmovdqu	(@X[3],&QWP(48,"edi"));
    &add		("edi",64);
    &vpshufb	(@X[0],@X[0],$t3);
    &mov		(&DWP(96+4,"esp"),"edi");
    &vpshufb	(@X[1],@X[1],$t3);
    &vpshufb	(@X[2],@X[2],$t3);
    &vpaddd		($t0,@X[0],&QWP(0,$K256));
    &vpshufb	(@X[3],@X[3],$t3);
    &vpaddd		($t1,@X[1],&QWP(16,$K256));
    &vpaddd		($t2,@X[2],&QWP(32,$K256));
    &vpaddd		($t3,@X[3],&QWP(48,$K256));
    &vmovdqa	(&QWP(32+0,"esp"),$t0);
    &vmovdqa	(&QWP(32+16,"esp"),$t1);
    &vmovdqa	(&QWP(32+32,"esp"),$t2);
    &vmovdqa	(&QWP(32+48,"esp"),$t3);
    &jmp		(&label("avx_00_47"));

&set_label("avx_00_47",16);
    &add		($K256,64);

# Returns the message-schedule update for four words as a list of 31
# instruction strings.  The strings are not executed here; the caller
# eval()s them one at a time so they can be interleaved with round code.
# They refer to @X and $t0..$t3 as visible at the point of the eval.
sub Xupdate_AVX () {
    (
    '&vpalignr	($t0,@X[1],@X[0],4);',	# X[1..4]
    '&vpalignr	($t3,@X[3],@X[2],4);',	# X[9..12]
    '&vpsrld	($t2,$t0,7);',
    '&vpaddd	(@X[0],@X[0],$t3);',	# X[0..3] += X[9..12]
    '&vpsrld	($t3,$t0,3);',
    '&vpslld	($t1,$t0,14);',
    '&vpxor		($t0,$t3,$t2);',
    '&vpshufd	($t3,@X[3],0b11111010)',# X[14..15]
    '&vpsrld	($t2,$t2,18-7);',
    '&vpxor		($t0,$t0,$t1);',
    '&vpslld	($t1,$t1,25-14);',
    '&vpxor		($t0,$t0,$t2);',
    '&vpsrld	($t2,$t3,10);',
    '&vpxor		($t0,$t0,$t1);',	# sigma0(X[1..4])
    '&vpsrlq	($t1,$t3,17);',
    '&vpaddd	(@X[0],@X[0],$t0);',	# X[0..3] += sigma0(X[1..4])
    '&vpxor		($t2,$t2,$t1);',
    '&vpsrlq	($t3,$t3,19);',
    '&vpxor		($t2,$t2,$t3);',	# sigma1(X[14..15])
    '&vpshufd	($t3,$t2,0b10000100);',
    '&vpsrldq	($t3,$t3,8);',
    '&vpaddd	(@X[0],@X[0],$t3);',	# X[0..1] += sigma1(X[14..15])
    '&vpshufd	($t3,@X[0],0b01010000)',# X[16..17]
    '&vpsrld	($t2,$t3,10);',
    '&vpsrlq	($t1,$t3,17);',
    '&vpxor		($t2,$t2,$t1);',
    '&vpsrlq	($t3,$t3,19);',
    '&vpxor		($t2,$t2,$t3);',	# sigma1(X[16..17])
    '&vpshufd	($t3,$t2,0b11101000);',
    '&vpslldq	($t3,$t3,8);',
    '&vpaddd	(@X[0],@X[0],$t3);'	# X[2..3] += sigma1(X[16..17])
    );
}

# Redirect &ror to &shrd with the first operand repeated, i.e.
# shrd reg,reg,count, for the remainder of the enclosing scope.
local *ror = sub { &shrd(@_[0],@_) };

# Emits four rounds interleaved with one Xupdate_AVX schedule update.
#   $j    - index of the 16-byte slot: selects the K256 row that is added
#           and the stack slot the sum is stored to
#   $body - code ref returning one round as a list of instruction strings
#   @X    - the four schedule registers, oldest first
# Each schedule instruction is followed by three round instructions, plus
# one more when the one just issued and the next one both match /rorx/.
sub AVX_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 120 instructions
my $insn;

    foreach (Xupdate_AVX()) {		# 31 instructions
        eval;
        eval(shift(@insns));
        eval(shift(@insns));
        eval($insn = shift(@insns));
        eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/);
    }
    &vpaddd		($t2,@X[0],&QWP(16*$j,$K256));
    foreach (@insns) { eval; }		# remaining instructions
    &vmovdqa	(&QWP(32+16*$j,"esp"),$t2);
}

    # four interleaved groups per pass over the emitted loop body
    for ($i=0,$j=0; $j<4; $j++) {
        &AVX_00_47($j,\&body_00_15,@X);
        push(@X,shift(@X));		# rotate(@X)
    }
    # loop until the dword at this K256 offset equals 0x00010203
    &cmp	(&DWP(16*$j,$K256),0x00010203);
    &jne	(&label("avx_00_47"));

    # sixteen more rounds without a schedule update; the eval'ed strings
    # advance $i themselves
    for ($i=0; $i<16; ) {
        foreach(body_00_15()) { eval; }
    }

    &mov	("esi",&DWP(96,"esp"));	#ctx
                    #&mov	($AH[0],&DWP(0,"esp"));
    &xor	($AH[1],"edi");		#&mov	($AH[1],&DWP(4,"esp"));
                    #&mov	("edi", &DWP(8,"esp"));
    &mov	("ecx",&DWP(12,"esp"));
    &add	($AH[0],&DWP(0,"esi"));
    &add	($AH[1],&DWP(4,"esi"));
    &add	("edi",&DWP(8,"esi"));
    &add	("ecx",&DWP(12,"esi"));
    &mov	(&DWP(0,"esi"),$AH[0]);
    &mov	(&DWP(4,"esi"),$AH[1]);
    &mov	(&DWP(8,"esi"),"edi");
    &mov	(&DWP(12,"esi"),"ecx");
     #&mov	(&DWP(0,"esp"),$AH[0]);
    &mov	(&DWP(4,"esp"),$AH[1]);
    &xor	($AH[1],"edi");			# magic
    &mov	(&DWP(8,"esp"),"edi");
    &mov
(&DWP(12,"esp"),"ecx"); #&mov ($E,&DWP(16,"esp")); &mov ("edi",&DWP(20,"esp")); &mov ("ecx",&DWP(24,"esp")); &add ($E,&DWP(16,"esi")); &add ("edi",&DWP(20,"esi")); &add ("ecx",&DWP(24,"esi")); &mov (&DWP(16,"esi"),$E); &mov (&DWP(20,"esi"),"edi"); &mov (&DWP(20,"esp"),"edi"); &mov ("edi",&DWP(28,"esp")); &mov (&DWP(24,"esi"),"ecx"); #&mov (&DWP(16,"esp"),$E); &add ("edi",&DWP(28,"esi")); &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esi"),"edi"); &mov (&DWP(28,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &vmovdqa ($t3,&QWP(64,$K256)); &sub ($K256,3*64); # rewind K &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? &jb (&label("grand_avx")); &mov ("esp",&DWP(96+12,"esp")); # restore sp &vzeroall (); &function_end_A(); if ($avx>1) { sub bodyx_00_15 () { # +10% ( '&rorx ("ecx",$E,6)', '&rorx ("esi",$E,11)', '&mov (&off($e),$E)', # save $E, modulo-scheduled '&rorx ("edi",$E,25)', '&xor ("ecx","esi")', '&andn ("esi",$E,&off($g))', '&xor ("ecx","edi")', # Sigma1(e) '&and ($E,&off($f))', '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled '&or ($E,"esi")', # T = Ch(e,f,g) '&rorx ("edi",$AH[0],2)', '&rorx ("esi",$AH[0],13)', '&lea ($E,&DWP(0,$E,"ecx"))', # T += Sigma1(e) '&rorx ("ecx",$AH[0],22)', '&xor ("esi","edi")', '&mov ("edi",&off($b))', '&xor ("ecx","esi")', # Sigma0(a) '&xor ($AH[0],"edi")', # a ^= b, (b^c) in next round '&add ($E,&off($h))', # T += h '&and ($AH[1],$AH[0])', # (b^c) &= (a^b) '&add ($E,&DWP(32+4*($i&15),"esp"))', # T += K[i]+X[i] '&xor ($AH[1],"edi")', # h = Maj(a,b,c) = Ch(a^b,c,b) '&add ("ecx",$E)', # h += T '&add ($E,&off($d))', # d += T '&lea ($AH[1],&DWP(0,$AH[1],"ecx"));'. 
# h += Sigma0(a) '@AH = reverse(@AH); $i++;' # rotate(a,h) ); } &set_label("AVX_BMI",32); &lea ("esp",&DWP(-96,"esp")); &vzeroall (); # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack &mov ($AH[0],&DWP(0,"esi")); &mov ($AH[1],&DWP(4,"esi")); &mov ("ecx",&DWP(8,"esi")); &mov ("edi",&DWP(12,"esi")); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"ecx"); # magic &mov (&DWP(8,"esp"),"ecx"); &mov (&DWP(12,"esp"),"edi"); &mov ($E,&DWP(16,"esi")); &mov ("edi",&DWP(20,"esi")); &mov ("ecx",&DWP(24,"esi")); &mov ("esi",&DWP(28,"esi")); #&mov (&DWP(16,"esp"),$E); &mov (&DWP(20,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esp"),"esi"); &vmovdqa ($t3,&QWP(256,$K256)); &jmp (&label("grand_avx_bmi")); &set_label("grand_avx_bmi",32); # load input, reverse byte order, add K256[0..15], save to stack &vmovdqu (@X[0],&QWP(0,"edi")); &vmovdqu (@X[1],&QWP(16,"edi")); &vmovdqu (@X[2],&QWP(32,"edi")); &vmovdqu (@X[3],&QWP(48,"edi")); &add ("edi",64); &vpshufb (@X[0],@X[0],$t3); &mov (&DWP(96+4,"esp"),"edi"); &vpshufb (@X[1],@X[1],$t3); &vpshufb (@X[2],@X[2],$t3); &vpaddd ($t0,@X[0],&QWP(0,$K256)); &vpshufb (@X[3],@X[3],$t3); &vpaddd ($t1,@X[1],&QWP(16,$K256)); &vpaddd ($t2,@X[2],&QWP(32,$K256)); &vpaddd ($t3,@X[3],&QWP(48,$K256)); &vmovdqa (&QWP(32+0,"esp"),$t0); &vmovdqa (&QWP(32+16,"esp"),$t1); &vmovdqa (&QWP(32+32,"esp"),$t2); &vmovdqa (&QWP(32+48,"esp"),$t3); &jmp (&label("avx_bmi_00_47")); &set_label("avx_bmi_00_47",16); &add ($K256,64); for ($i=0,$j=0; $j<4; $j++) { &AVX_00_47($j,\&bodyx_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmp (&DWP(16*$j,$K256),0x00010203); &jne (&label("avx_bmi_00_47")); for ($i=0; $i<16; ) { foreach(bodyx_00_15()) { eval; } } &mov ("esi",&DWP(96,"esp")); #ctx #&mov ($AH[0],&DWP(0,"esp")); &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); #&mov ("edi", &DWP(8,"esp")); &mov ("ecx",&DWP(12,"esp")); &add ($AH[0],&DWP(0,"esi")); &add ($AH[1],&DWP(4,"esi")); &add ("edi",&DWP(8,"esi")); 
&add ("ecx",&DWP(12,"esi")); &mov (&DWP(0,"esi"),$AH[0]); &mov (&DWP(4,"esi"),$AH[1]); &mov (&DWP(8,"esi"),"edi"); &mov (&DWP(12,"esi"),"ecx"); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"edi"); # magic &mov (&DWP(8,"esp"),"edi"); &mov (&DWP(12,"esp"),"ecx"); #&mov ($E,&DWP(16,"esp")); &mov ("edi",&DWP(20,"esp")); &mov ("ecx",&DWP(24,"esp")); &add ($E,&DWP(16,"esi")); &add ("edi",&DWP(20,"esi")); &add ("ecx",&DWP(24,"esi")); &mov (&DWP(16,"esi"),$E); &mov (&DWP(20,"esi"),"edi"); &mov (&DWP(20,"esp"),"edi"); &mov ("edi",&DWP(28,"esp")); &mov (&DWP(24,"esi"),"ecx"); #&mov (&DWP(16,"esp"),$E); &add ("edi",&DWP(28,"esi")); &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esi"),"edi"); &mov (&DWP(28,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &vmovdqa ($t3,&QWP(64,$K256)); &sub ($K256,3*64); # rewind K &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? &jb (&label("grand_avx_bmi")); &mov ("esp",&DWP(96+12,"esp")); # restore sp &vzeroall (); &function_end_A(); } } }}} &function_end_B("sha256_block_data_order"); &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/sha/asm/sha256-mb-x86_64.pl =================================================================== --- head/crypto/openssl/crypto/sha/asm/sha256-mb-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/sha/asm/sha256-mb-x86_64.pl (revision 364822) @@ -1,1614 +1,1614 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. 
For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # Multi-buffer SHA256 procedure processes n buffers in parallel by # placing buffer data to designated lane of SIMD register. n is # naturally limited to 4 on pre-AVX2 processors and to 8 on # AVX2-capable processors such as Haswell. # # this +aesni(i) sha256 aesni-sha256 gain(iv) # ------------------------------------------------------------------- # Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126% # Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95% # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103% # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82% # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170% # Skylake (18.9 +5.00=23.9)/n 7.70 8.17 +170% # Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100% # # (i) multi-block CBC encrypt with 128-bit key; # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, # because of lower AES-NI instruction throughput, nor is there # AES-NI-SHA256 stitch for these processors; # (iii) "this" is for n=8, when we gather twice as much data, result # for n=4 is 20.3+4.44=24.7; # (iv) presented improvement coefficients are asymptotic limits and # in real-life application are somewhat lower, e.g. 
for 2KB # fragments they range from 75% to 130% (on Haswell); $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; $avx=0; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void sha256_multi_block ( # struct { unsigned int A[8]; # unsigned int B[8]; # unsigned int C[8]; # unsigned int D[8]; # unsigned int E[8]; # unsigned int F[8]; # unsigned int G[8]; # unsigned int H[8]; } *ctx, # struct { void *ptr; int blocks; } inp[8], # int num); /* 1 or 2 */ # $ctx="%rdi"; # 1st arg $inp="%rsi"; # 2nd arg $num="%edx"; # 3rd arg @ptr=map("%r$_",(8..11)); $Tbl="%rbp"; @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15)); ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7)); $REG_SZ=16; sub Xi_off { my $off = shift; $off %= 16; $off *= $REG_SZ; $off<256 ? 
"$off-128(%rax)" : "$off-256-128(%rbx)"; } sub ROUND_00_15 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; $code.=<<___ if ($i<15); movd `4*$i`(@ptr[0]),$Xi movd `4*$i`(@ptr[1]),$t1 movd `4*$i`(@ptr[2]),$t2 movd `4*$i`(@ptr[3]),$t3 punpckldq $t2,$Xi punpckldq $t3,$t1 punpckldq $t1,$Xi ___ $code.=<<___ if ($i==15); movd `4*$i`(@ptr[0]),$Xi lea `16*4`(@ptr[0]),@ptr[0] movd `4*$i`(@ptr[1]),$t1 lea `16*4`(@ptr[1]),@ptr[1] movd `4*$i`(@ptr[2]),$t2 lea `16*4`(@ptr[2]),@ptr[2] movd `4*$i`(@ptr[3]),$t3 lea `16*4`(@ptr[3]),@ptr[3] punpckldq $t2,$Xi punpckldq $t3,$t1 punpckldq $t1,$Xi ___ $code.=<<___; movdqa $e,$sigma `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)` movdqa $e,$t3 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)` psrld \$6,$sigma movdqa $e,$t2 pslld \$7,$t3 movdqa $Xi,`&Xi_off($i)` paddd $h,$Xi # Xi+=h psrld \$11,$t2 pxor $t3,$sigma pslld \$21-7,$t3 paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round] pxor $t2,$sigma psrld \$25-11,$t2 movdqa $e,$t1 `"prefetcht0 63(@ptr[0])" if ($i==15)` pxor $t3,$sigma movdqa $e,$axb # borrow $axb pslld \$26-21,$t3 pandn $g,$t1 pand $f,$axb pxor $t2,$sigma `"prefetcht0 63(@ptr[1])" if ($i==15)` movdqa $a,$t2 pxor $t3,$sigma # Sigma1(e) movdqa $a,$t3 psrld \$2,$t2 paddd $sigma,$Xi # Xi+=Sigma1(e) pxor $axb,$t1 # Ch(e,f,g) movdqa $b,$axb movdqa $a,$sigma pslld \$10,$t3 pxor $a,$axb # a^b, b^c in next round `"prefetcht0 63(@ptr[2])" if ($i==15)` psrld \$13,$sigma pxor $t3,$t2 paddd $t1,$Xi # Xi+=Ch(e,f,g) pslld \$19-10,$t3 pand $axb,$bxc pxor $sigma,$t2 `"prefetcht0 63(@ptr[3])" if ($i==15)` psrld \$22-13,$sigma pxor $t3,$t2 movdqa $b,$h pslld \$30-19,$t3 pxor $t2,$sigma pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b) paddd $Xi,$d # d+=Xi pxor $t3,$sigma # Sigma0(a) paddd $Xi,$h # h+=Xi paddd $sigma,$h # h+=Sigma0(a) ___ $code.=<<___ if (($i%8)==7); lea `32*8`($Tbl),$Tbl ___ ($axb,$bxc)=($bxc,$axb); } sub ROUND_16_XX { my $i=shift; $code.=<<___; movdqa `&Xi_off($i+1)`,$Xn paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9] movdqa $Xn,$sigma movdqa $Xn,$t2 psrld \$3,$sigma 
movdqa $Xn,$t3 psrld \$7,$t2 movdqa `&Xi_off($i+14)`,$t1 pslld \$14,$t3 pxor $t2,$sigma psrld \$18-7,$t2 movdqa $t1,$axb # borrow $axb pxor $t3,$sigma pslld \$25-14,$t3 pxor $t2,$sigma psrld \$10,$t1 movdqa $axb,$t2 psrld \$17,$axb pxor $t3,$sigma # sigma0(X[i+1]) pslld \$13,$t2 paddd $sigma,$Xi # Xi+=sigma0(e) pxor $axb,$t1 psrld \$19-17,$axb pxor $t2,$t1 pslld \$15-13,$t2 pxor $axb,$t1 pxor $t2,$t1 # sigma0(X[i+14]) paddd $t1,$Xi # Xi+=sigma1(X[i+14]) ___ &ROUND_00_15($i,@_); ($Xi,$Xn)=($Xn,$Xi); } $code.=<<___; .text .extern OPENSSL_ia32cap_P .globl sha256_multi_block .type sha256_multi_block,\@function,3 .align 32 sha256_multi_block: .cfi_startproc mov OPENSSL_ia32cap_P+4(%rip),%rcx bt \$61,%rcx # check SHA bit jc _shaext_shortcut ___ $code.=<<___ if ($avx); test \$`1<<28`,%ecx jnz _avx_shortcut ___ $code.=<<___; mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,-0x78(%rax) movaps %xmm11,-0x68(%rax) movaps %xmm12,-0x58(%rax) movaps %xmm13,-0x48(%rax) movaps %xmm14,-0x38(%rax) movaps %xmm15,-0x28(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody: lea K256+128(%rip),$Tbl lea `$REG_SZ*16`(%rsp),%rbx lea 0x80($ctx),$ctx # size optimization .Loop_grande: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldone movdqu 0x00-0x80($ctx),$A # load context lea 128(%rsp),%rax movdqu 0x20-0x80($ctx),$B movdqu 0x40-0x80($ctx),$C movdqu 0x60-0x80($ctx),$D 
movdqu 0x80-0x80($ctx),$E movdqu 0xa0-0x80($ctx),$F movdqu 0xc0-0x80($ctx),$G movdqu 0xe0-0x80($ctx),$H movdqu .Lpbswap(%rip),$Xn jmp .Loop .align 32 .Loop: movdqa $C,$bxc pxor $B,$bxc # magic seed ___ for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } $code.=<<___; movdqu `&Xi_off($i)`,$Xi mov \$3,%ecx jmp .Loop_16_xx .align 32 .Loop_16_xx: ___ for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); } $code.=<<___; dec %ecx jnz .Loop_16_xx mov \$1,%ecx lea K256+128(%rip),$Tbl movdqa (%rbx),$sigma # pull counters cmp 4*0(%rbx),%ecx # examine counters pxor $t1,$t1 cmovge $Tbl,@ptr[0] # cancel input cmp 4*1(%rbx),%ecx movdqa $sigma,$Xn cmovge $Tbl,@ptr[1] cmp 4*2(%rbx),%ecx pcmpgtd $t1,$Xn # mask value cmovge $Tbl,@ptr[2] cmp 4*3(%rbx),%ecx paddd $Xn,$sigma # counters-- cmovge $Tbl,@ptr[3] movdqu 0x00-0x80($ctx),$t1 pand $Xn,$A movdqu 0x20-0x80($ctx),$t2 pand $Xn,$B movdqu 0x40-0x80($ctx),$t3 pand $Xn,$C movdqu 0x60-0x80($ctx),$Xi pand $Xn,$D paddd $t1,$A movdqu 0x80-0x80($ctx),$t1 pand $Xn,$E paddd $t2,$B movdqu 0xa0-0x80($ctx),$t2 pand $Xn,$F paddd $t3,$C movdqu 0xc0-0x80($ctx),$t3 pand $Xn,$G paddd $Xi,$D movdqu 0xe0-0x80($ctx),$Xi pand $Xn,$H paddd $t1,$E paddd $t2,$F movdqu $A,0x00-0x80($ctx) paddd $t3,$G movdqu $B,0x20-0x80($ctx) paddd $Xi,$H movdqu $C,0x40-0x80($ctx) movdqu $D,0x60-0x80($ctx) movdqu $E,0x80-0x80($ctx) movdqu $F,0xa0-0x80($ctx) movdqu $G,0xc0-0x80($ctx) movdqu $H,0xe0-0x80($ctx) movdqa $sigma,(%rbx) # save counters movdqa .Lpbswap(%rip),$Xn dec $num jnz .Loop mov `$REG_SZ*17+8`(%rsp),$num lea $REG_SZ($ctx),$ctx lea `16*$REG_SZ/4`($inp),$inp dec $num jnz .Loop_grande .Ldone: mov `$REG_SZ*17`(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 movaps -0xa8(%rax),%xmm7 movaps -0x98(%rax),%xmm8 movaps -0x88(%rax),%xmm9 movaps -0x78(%rax),%xmm10 movaps -0x68(%rax),%xmm11 movaps -0x58(%rax),%xmm12 movaps -0x48(%rax),%xmm13 movaps -0x38(%rax),%xmm14 movaps -0x28(%rax),%xmm15 ___ 
$code.=<<___; mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue: ret .cfi_endproc .size sha256_multi_block,.-sha256_multi_block ___ {{{ my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15)); my @MSG0=map("%xmm$_",(4..7)); my @MSG1=map("%xmm$_",(8..11)); $code.=<<___; .type sha256_multi_block_shaext,\@function,3 .align 32 sha256_multi_block_shaext: .cfi_startproc _shaext_shortcut: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,-0x78(%rax) movaps %xmm11,-0x68(%rax) movaps %xmm12,-0x58(%rax) movaps %xmm13,-0x48(%rax) movaps %xmm14,-0x38(%rax) movaps %xmm15,-0x28(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`,%rsp shl \$1,$num # we process pair at a time and \$-256,%rsp lea 0x80($ctx),$ctx # size optimization mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .Lbody_shaext: lea `$REG_SZ*16`(%rsp),%rbx lea K256_shaext+0x80(%rip),$Tbl .Loop_grande_shaext: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<2;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle %rsp,@ptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldone_shaext movq 0x00-0x80($ctx),$ABEF0 # A1.A0 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0 movq 0x40-0x80($ctx),$CDGH0 # C1.C0 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0 punpckldq @MSG1[3],@MSG1[2] # 
H1.G1.H0.G0 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap movdqa $ABEF0,$ABEF1 movdqa $CDGH0,$CDGH1 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1 pshufd \$0b00011011,$ABEF0,$ABEF0 pshufd \$0b00011011,$CDGH0,$CDGH0 pshufd \$0b00011011,$ABEF1,$ABEF1 pshufd \$0b00011011,$CDGH1,$CDGH1 jmp .Loop_shaext .align 32 .Loop_shaext: movdqu 0x00(@ptr[0]),@MSG0[0] movdqu 0x00(@ptr[1]),@MSG1[0] movdqu 0x10(@ptr[0]),@MSG0[1] movdqu 0x10(@ptr[1]),@MSG1[1] movdqu 0x20(@ptr[0]),@MSG0[2] pshufb $TMPx,@MSG0[0] movdqu 0x20(@ptr[1]),@MSG1[2] pshufb $TMPx,@MSG1[0] movdqu 0x30(@ptr[0]),@MSG0[3] lea 0x40(@ptr[0]),@ptr[0] movdqu 0x30(@ptr[1]),@MSG1[3] lea 0x40(@ptr[1]),@ptr[1] movdqa 0*16-0x80($Tbl),$Wi pshufb $TMPx,@MSG0[1] paddd @MSG0[0],$Wi pxor $ABEF0,@MSG0[0] # black magic movdqa $Wi,$TMP0 movdqa 0*16-0x80($Tbl),$TMP1 pshufb $TMPx,@MSG1[1] paddd @MSG1[0],$TMP1 movdqa $CDGH0,0x50(%rsp) # offload sha256rnds2 $ABEF0,$CDGH0 # 0-3 pxor $ABEF1,@MSG1[0] # black magic movdqa $TMP1,$Wi movdqa $CDGH1,0x70(%rsp) sha256rnds2 $ABEF1,$CDGH1 # 0-3 pshufd \$0x0e,$TMP0,$Wi pxor $ABEF0,@MSG0[0] # black magic movdqa $ABEF0,0x40(%rsp) # offload sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi pxor $ABEF1,@MSG1[0] # black magic movdqa $ABEF1,0x60(%rsp) movdqa 1*16-0x80($Tbl),$TMP0 paddd @MSG0[1],$TMP0 pshufb $TMPx,@MSG0[2] sha256rnds2 $CDGH1,$ABEF1 movdqa $TMP0,$Wi movdqa 1*16-0x80($Tbl),$TMP1 paddd @MSG1[1],$TMP1 sha256rnds2 $ABEF0,$CDGH0 # 4-7 movdqa $TMP1,$Wi prefetcht0 127(@ptr[0]) pshufb $TMPx,@MSG0[3] pshufb $TMPx,@MSG1[2] prefetcht0 127(@ptr[1]) sha256rnds2 $ABEF1,$CDGH1 # 4-7 pshufd \$0x0e,$TMP0,$Wi pshufb $TMPx,@MSG1[3] sha256msg1 @MSG0[1],@MSG0[0] sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi movdqa 2*16-0x80($Tbl),$TMP0 paddd @MSG0[2],$TMP0 sha256rnds2 $CDGH1,$ABEF1 movdqa $TMP0,$Wi movdqa 2*16-0x80($Tbl),$TMP1 paddd @MSG1[2],$TMP1 sha256rnds2 $ABEF0,$CDGH0 # 8-11 
sha256msg1 @MSG1[1],@MSG1[0] movdqa $TMP1,$Wi movdqa @MSG0[3],$TMPx sha256rnds2 $ABEF1,$CDGH1 # 8-11 pshufd \$0x0e,$TMP0,$Wi palignr \$4,@MSG0[2],$TMPx paddd $TMPx,@MSG0[0] movdqa @MSG1[3],$TMPx palignr \$4,@MSG1[2],$TMPx sha256msg1 @MSG0[2],@MSG0[1] sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi movdqa 3*16-0x80($Tbl),$TMP0 paddd @MSG0[3],$TMP0 sha256rnds2 $CDGH1,$ABEF1 sha256msg1 @MSG1[2],@MSG1[1] movdqa $TMP0,$Wi movdqa 3*16-0x80($Tbl),$TMP1 paddd $TMPx,@MSG1[0] paddd @MSG1[3],$TMP1 sha256msg2 @MSG0[3],@MSG0[0] sha256rnds2 $ABEF0,$CDGH0 # 12-15 movdqa $TMP1,$Wi movdqa @MSG0[0],$TMPx palignr \$4,@MSG0[3],$TMPx sha256rnds2 $ABEF1,$CDGH1 # 12-15 sha256msg2 @MSG1[3],@MSG1[0] pshufd \$0x0e,$TMP0,$Wi paddd $TMPx,@MSG0[1] movdqa @MSG1[0],$TMPx palignr \$4,@MSG1[3],$TMPx sha256msg1 @MSG0[3],@MSG0[2] sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi movdqa 4*16-0x80($Tbl),$TMP0 paddd @MSG0[0],$TMP0 sha256rnds2 $CDGH1,$ABEF1 sha256msg1 @MSG1[3],@MSG1[2] ___ for($i=4;$i<16-3;$i++) { $code.=<<___; movdqa $TMP0,$Wi movdqa $i*16-0x80($Tbl),$TMP1 paddd $TMPx,@MSG1[1] paddd @MSG1[0],$TMP1 sha256msg2 @MSG0[0],@MSG0[1] sha256rnds2 $ABEF0,$CDGH0 # 16-19... movdqa $TMP1,$Wi movdqa @MSG0[1],$TMPx palignr \$4,@MSG0[0],$TMPx sha256rnds2 $ABEF1,$CDGH1 # 16-19... 
sha256msg2 @MSG1[0],@MSG1[1] pshufd \$0x0e,$TMP0,$Wi paddd $TMPx,@MSG0[2] movdqa @MSG1[1],$TMPx palignr \$4,@MSG1[0],$TMPx sha256msg1 @MSG0[0],@MSG0[3] sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi movdqa `($i+1)*16`-0x80($Tbl),$TMP0 paddd @MSG0[1],$TMP0 sha256rnds2 $CDGH1,$ABEF1 sha256msg1 @MSG1[0],@MSG1[3] ___ push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); } $code.=<<___; movdqa $TMP0,$Wi movdqa 13*16-0x80($Tbl),$TMP1 paddd $TMPx,@MSG1[1] paddd @MSG1[0],$TMP1 sha256msg2 @MSG0[0],@MSG0[1] sha256rnds2 $ABEF0,$CDGH0 # 52-55 movdqa $TMP1,$Wi movdqa @MSG0[1],$TMPx palignr \$4,@MSG0[0],$TMPx sha256rnds2 $ABEF1,$CDGH1 # 52-55 sha256msg2 @MSG1[0],@MSG1[1] pshufd \$0x0e,$TMP0,$Wi paddd $TMPx,@MSG0[2] movdqa @MSG1[1],$TMPx palignr \$4,@MSG1[0],$TMPx nop sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi movdqa 14*16-0x80($Tbl),$TMP0 paddd @MSG0[1],$TMP0 sha256rnds2 $CDGH1,$ABEF1 movdqa $TMP0,$Wi movdqa 14*16-0x80($Tbl),$TMP1 paddd $TMPx,@MSG1[2] paddd @MSG1[1],$TMP1 sha256msg2 @MSG0[1],@MSG0[2] nop sha256rnds2 $ABEF0,$CDGH0 # 56-59 movdqa $TMP1,$Wi mov \$1,%ecx pxor @MSG0[1],@MSG0[1] # zero sha256rnds2 $ABEF1,$CDGH1 # 56-59 sha256msg2 @MSG1[1],@MSG1[2] pshufd \$0x0e,$TMP0,$Wi movdqa 15*16-0x80($Tbl),$TMP0 paddd @MSG0[2],$TMP0 movq (%rbx),@MSG0[2] # pull counters nop sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi movdqa 15*16-0x80($Tbl),$TMP1 paddd @MSG1[2],$TMP1 sha256rnds2 $CDGH1,$ABEF1 movdqa $TMP0,$Wi cmp 4*0(%rbx),%ecx # examine counters cmovge %rsp,@ptr[0] # cancel input cmp 4*1(%rbx),%ecx cmovge %rsp,@ptr[1] pshufd \$0x00,@MSG0[2],@MSG1[0] sha256rnds2 $ABEF0,$CDGH0 # 60-63 movdqa $TMP1,$Wi pshufd \$0x55,@MSG0[2],@MSG1[1] movdqa @MSG0[2],@MSG1[2] sha256rnds2 $ABEF1,$CDGH1 # 60-63 pshufd \$0x0e,$TMP0,$Wi pcmpgtd @MSG0[1],@MSG1[0] pcmpgtd @MSG0[1],@MSG1[1] sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi pcmpgtd @MSG0[1],@MSG1[2] # counter mask movdqa K256_shaext-0x10(%rip),$TMPx sha256rnds2 $CDGH1,$ABEF1 pand @MSG1[0],$CDGH0 pand @MSG1[1],$CDGH1 pand 
@MSG1[0],$ABEF0 pand @MSG1[1],$ABEF1 paddd @MSG0[2],@MSG1[2] # counters-- paddd 0x50(%rsp),$CDGH0 paddd 0x70(%rsp),$CDGH1 paddd 0x40(%rsp),$ABEF0 paddd 0x60(%rsp),$ABEF1 movq @MSG1[2],(%rbx) # save counters dec $num jnz .Loop_shaext mov `$REG_SZ*17+8`(%rsp),$num pshufd \$0b00011011,$ABEF0,$ABEF0 pshufd \$0b00011011,$CDGH0,$CDGH0 pshufd \$0b00011011,$ABEF1,$ABEF1 pshufd \$0b00011011,$CDGH1,$CDGH1 movdqa $ABEF0,@MSG0[0] movdqa $CDGH0,@MSG0[1] punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0 movq $ABEF0,0x00-0x80($ctx) # A1.A0 psrldq \$8,$ABEF0 movq @MSG0[0],0x80-0x80($ctx) # E1.E0 psrldq \$8,@MSG0[0] movq $ABEF0,0x20-0x80($ctx) # B1.B0 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0 movq $CDGH0,0x40-0x80($ctx) # C1.C0 psrldq \$8,$CDGH0 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0 psrldq \$8,@MSG0[1] movq $CDGH0,0x60-0x80($ctx) # D1.D0 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0 lea `$REG_SZ/2`($ctx),$ctx lea `16*2`($inp),$inp dec $num jnz .Loop_grande_shaext .Ldone_shaext: #mov `$REG_SZ*17`(%rsp),%rax # original %rsp ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 movaps -0xa8(%rax),%xmm7 movaps -0x98(%rax),%xmm8 movaps -0x88(%rax),%xmm9 movaps -0x78(%rax),%xmm10 movaps -0x68(%rax),%xmm11 movaps -0x58(%rax),%xmm12 movaps -0x48(%rax),%xmm13 movaps -0x38(%rax),%xmm14 movaps -0x28(%rax),%xmm15 ___ $code.=<<___; mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_shaext: ret .cfi_endproc .size sha256_multi_block_shaext,.-sha256_multi_block_shaext ___ }}} if ($avx) {{{ sub ROUND_00_15_avx { my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; $code.=<<___ if ($i<15 && $REG_SZ==16); vmovd `4*$i`(@ptr[0]),$Xi vmovd `4*$i`(@ptr[1]),$t1 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1 vpunpckldq $t1,$Xi,$Xi vpshufb $Xn,$Xi,$Xi ___ $code.=<<___ if ($i==15 && $REG_SZ==16); vmovd `4*$i`(@ptr[0]),$Xi lea 
`16*4`(@ptr[0]),@ptr[0] vmovd `4*$i`(@ptr[1]),$t1 lea `16*4`(@ptr[1]),@ptr[1] vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi lea `16*4`(@ptr[2]),@ptr[2] vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1 lea `16*4`(@ptr[3]),@ptr[3] vpunpckldq $t1,$Xi,$Xi vpshufb $Xn,$Xi,$Xi ___ $code.=<<___ if ($i<15 && $REG_SZ==32); vmovd `4*$i`(@ptr[0]),$Xi vmovd `4*$i`(@ptr[4]),$t1 vmovd `4*$i`(@ptr[1]),$t2 vmovd `4*$i`(@ptr[5]),$t3 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2 vpunpckldq $t2,$Xi,$Xi vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3 vpunpckldq $t3,$t1,$t1 vinserti128 $t1,$Xi,$Xi vpshufb $Xn,$Xi,$Xi ___ $code.=<<___ if ($i==15 && $REG_SZ==32); vmovd `4*$i`(@ptr[0]),$Xi lea `16*4`(@ptr[0]),@ptr[0] vmovd `4*$i`(@ptr[4]),$t1 lea `16*4`(@ptr[4]),@ptr[4] vmovd `4*$i`(@ptr[1]),$t2 lea `16*4`(@ptr[1]),@ptr[1] vmovd `4*$i`(@ptr[5]),$t3 lea `16*4`(@ptr[5]),@ptr[5] vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi lea `16*4`(@ptr[2]),@ptr[2] vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1 lea `16*4`(@ptr[6]),@ptr[6] vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2 lea `16*4`(@ptr[3]),@ptr[3] vpunpckldq $t2,$Xi,$Xi vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3 lea `16*4`(@ptr[7]),@ptr[7] vpunpckldq $t3,$t1,$t1 vinserti128 $t1,$Xi,$Xi vpshufb $Xn,$Xi,$Xi ___ $code.=<<___; vpsrld \$6,$e,$sigma vpslld \$26,$e,$t3 vmovdqu $Xi,`&Xi_off($i)` vpaddd $h,$Xi,$Xi # Xi+=h vpsrld \$11,$e,$t2 vpxor $t3,$sigma,$sigma vpslld \$21,$e,$t3 vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round] vpxor $t2,$sigma,$sigma vpsrld \$25,$e,$t2 vpxor $t3,$sigma,$sigma `"prefetcht0 63(@ptr[0])" if ($i==15)` vpslld \$7,$e,$t3 vpandn $g,$e,$t1 vpand $f,$e,$axb # borrow $axb `"prefetcht0 63(@ptr[1])" if ($i==15)` vpxor $t2,$sigma,$sigma vpsrld \$2,$a,$h # borrow $h vpxor $t3,$sigma,$sigma # Sigma1(e) `"prefetcht0 63(@ptr[2])" if ($i==15)` vpslld \$30,$a,$t2 vpxor $axb,$t1,$t1 # Ch(e,f,g) vpxor $a,$b,$axb # a^b, b^c in next round `"prefetcht0 63(@ptr[3])" if ($i==15)` vpxor $t2,$h,$h vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e) vpsrld 
\$13,$a,$t2 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` vpslld \$19,$a,$t3 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g) vpand $axb,$bxc,$bxc `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` vpxor $t2,$h,$sigma vpsrld \$22,$a,$t2 vpxor $t3,$sigma,$sigma `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` vpslld \$10,$a,$t3 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b) vpaddd $Xi,$d,$d # d+=Xi `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` vpxor $t2,$sigma,$sigma vpxor $t3,$sigma,$sigma # Sigma0(a) vpaddd $Xi,$h,$h # h+=Xi vpaddd $sigma,$h,$h # h+=Sigma0(a) ___ $code.=<<___ if (($i%8)==7); add \$`32*8`,$Tbl ___ ($axb,$bxc)=($bxc,$axb); } sub ROUND_16_XX_avx { my $i=shift; $code.=<<___; vmovdqu `&Xi_off($i+1)`,$Xn vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9] vpsrld \$3,$Xn,$sigma vpsrld \$7,$Xn,$t2 vpslld \$25,$Xn,$t3 vpxor $t2,$sigma,$sigma vpsrld \$18,$Xn,$t2 vpxor $t3,$sigma,$sigma vpslld \$14,$Xn,$t3 vmovdqu `&Xi_off($i+14)`,$t1 vpsrld \$10,$t1,$axb # borrow $axb vpxor $t2,$sigma,$sigma vpsrld \$17,$t1,$t2 vpxor $t3,$sigma,$sigma # sigma0(X[i+1]) vpslld \$15,$t1,$t3 vpaddd $sigma,$Xi,$Xi # Xi+=sigma0(e) vpxor $t2,$axb,$sigma vpsrld \$19,$t1,$t2 vpxor $t3,$sigma,$sigma vpslld \$13,$t1,$t3 vpxor $t2,$sigma,$sigma vpxor $t3,$sigma,$sigma # sigma0(X[i+14]) vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14]) ___ &ROUND_00_15_avx($i,@_); ($Xi,$Xn)=($Xn,$Xi); } $code.=<<___; .type sha256_multi_block_avx,\@function,3 .align 32 sha256_multi_block_avx: .cfi_startproc _avx_shortcut: ___ $code.=<<___ if ($avx>1); shr \$32,%rcx cmp \$2,$num jb .Lavx test \$`1<<5`,%ecx jnz _avx2_shortcut jmp .Lavx .align 32 .Lavx: ___ $code.=<<___; mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,-0x78(%rax) movaps %xmm11,-0x68(%rax) movaps %xmm12,-0x58(%rax) movaps %xmm13,-0x48(%rax) movaps 
%xmm14,-0x38(%rax) movaps %xmm15,-0x28(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody_avx: lea K256+128(%rip),$Tbl lea `$REG_SZ*16`(%rsp),%rbx lea 0x80($ctx),$ctx # size optimization .Loop_grande_avx: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldone_avx vmovdqu 0x00-0x80($ctx),$A # load context lea 128(%rsp),%rax vmovdqu 0x20-0x80($ctx),$B vmovdqu 0x40-0x80($ctx),$C vmovdqu 0x60-0x80($ctx),$D vmovdqu 0x80-0x80($ctx),$E vmovdqu 0xa0-0x80($ctx),$F vmovdqu 0xc0-0x80($ctx),$G vmovdqu 0xe0-0x80($ctx),$H vmovdqu .Lpbswap(%rip),$Xn jmp .Loop_avx .align 32 .Loop_avx: vpxor $B,$C,$bxc # magic seed ___ for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; vmovdqu `&Xi_off($i)`,$Xi mov \$3,%ecx jmp .Loop_16_xx_avx .align 32 .Loop_16_xx_avx: ___ for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; dec %ecx jnz .Loop_16_xx_avx mov \$1,%ecx lea K256+128(%rip),$Tbl ___ for($i=0;$i<4;$i++) { $code.=<<___; cmp `4*$i`(%rbx),%ecx # examine counters cmovge $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; vmovdqa (%rbx),$sigma # pull counters vpxor $t1,$t1,$t1 vmovdqa $sigma,$Xn vpcmpgtd $t1,$Xn,$Xn # mask value vpaddd $Xn,$sigma,$sigma # counters-- vmovdqu 0x00-0x80($ctx),$t1 vpand $Xn,$A,$A vmovdqu 0x20-0x80($ctx),$t2 vpand $Xn,$B,$B vmovdqu 0x40-0x80($ctx),$t3 vpand $Xn,$C,$C vmovdqu 0x60-0x80($ctx),$Xi vpand $Xn,$D,$D vpaddd $t1,$A,$A vmovdqu 0x80-0x80($ctx),$t1 vpand $Xn,$E,$E vpaddd $t2,$B,$B vmovdqu 0xa0-0x80($ctx),$t2 vpand $Xn,$F,$F vpaddd $t3,$C,$C vmovdqu 0xc0-0x80($ctx),$t3 vpand 
$Xn,$G,$G vpaddd $Xi,$D,$D vmovdqu 0xe0-0x80($ctx),$Xi vpand $Xn,$H,$H vpaddd $t1,$E,$E vpaddd $t2,$F,$F vmovdqu $A,0x00-0x80($ctx) vpaddd $t3,$G,$G vmovdqu $B,0x20-0x80($ctx) vpaddd $Xi,$H,$H vmovdqu $C,0x40-0x80($ctx) vmovdqu $D,0x60-0x80($ctx) vmovdqu $E,0x80-0x80($ctx) vmovdqu $F,0xa0-0x80($ctx) vmovdqu $G,0xc0-0x80($ctx) vmovdqu $H,0xe0-0x80($ctx) vmovdqu $sigma,(%rbx) # save counters vmovdqu .Lpbswap(%rip),$Xn dec $num jnz .Loop_avx mov `$REG_SZ*17+8`(%rsp),$num lea $REG_SZ($ctx),$ctx lea `16*$REG_SZ/4`($inp),$inp dec $num jnz .Loop_grande_avx .Ldone_avx: mov `$REG_SZ*17`(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 vzeroupper ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 movaps -0xa8(%rax),%xmm7 movaps -0x98(%rax),%xmm8 movaps -0x88(%rax),%xmm9 movaps -0x78(%rax),%xmm10 movaps -0x68(%rax),%xmm11 movaps -0x58(%rax),%xmm12 movaps -0x48(%rax),%xmm13 movaps -0x38(%rax),%xmm14 movaps -0x28(%rax),%xmm15 ___ $code.=<<___; mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size sha256_multi_block_avx,.-sha256_multi_block_avx ___ if ($avx>1) { $code =~ s/\`([^\`]*)\`/eval $1/gem; $REG_SZ=32; @ptr=map("%r$_",(12..15,8..11)); @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15)); ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7)); $code.=<<___; .type sha256_multi_block_avx2,\@function,3 .align 32 sha256_multi_block_avx2: .cfi_startproc _avx2_shortcut: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) movaps %xmm12,-0x78(%rax) movaps %xmm13,-0x68(%rax) movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) ___ $code.=<<___; sub 
\$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody_avx2: lea K256+128(%rip),$Tbl lea 0x80($ctx),$ctx # size optimization .Loop_grande_avx2: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num lea `$REG_SZ*16`(%rsp),%rbx ___ for($i=0;$i<8;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; vmovdqu 0x00-0x80($ctx),$A # load context lea 128(%rsp),%rax vmovdqu 0x20-0x80($ctx),$B lea 256+128(%rsp),%rbx vmovdqu 0x40-0x80($ctx),$C vmovdqu 0x60-0x80($ctx),$D vmovdqu 0x80-0x80($ctx),$E vmovdqu 0xa0-0x80($ctx),$F vmovdqu 0xc0-0x80($ctx),$G vmovdqu 0xe0-0x80($ctx),$H vmovdqu .Lpbswap(%rip),$Xn jmp .Loop_avx2 .align 32 .Loop_avx2: vpxor $B,$C,$bxc # magic seed ___ for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; vmovdqu `&Xi_off($i)`,$Xi mov \$3,%ecx jmp .Loop_16_xx_avx2 .align 32 .Loop_16_xx_avx2: ___ for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; dec %ecx jnz .Loop_16_xx_avx2 mov \$1,%ecx lea `$REG_SZ*16`(%rsp),%rbx lea K256+128(%rip),$Tbl ___ for($i=0;$i<8;$i++) { $code.=<<___; cmp `4*$i`(%rbx),%ecx # examine counters cmovge $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; vmovdqa (%rbx),$sigma # pull counters vpxor $t1,$t1,$t1 vmovdqa $sigma,$Xn vpcmpgtd $t1,$Xn,$Xn # mask value vpaddd $Xn,$sigma,$sigma # counters-- vmovdqu 0x00-0x80($ctx),$t1 vpand $Xn,$A,$A vmovdqu 0x20-0x80($ctx),$t2 vpand $Xn,$B,$B vmovdqu 0x40-0x80($ctx),$t3 vpand $Xn,$C,$C vmovdqu 0x60-0x80($ctx),$Xi vpand $Xn,$D,$D vpaddd $t1,$A,$A vmovdqu 0x80-0x80($ctx),$t1 vpand $Xn,$E,$E vpaddd $t2,$B,$B vmovdqu 0xa0-0x80($ctx),$t2 vpand $Xn,$F,$F vpaddd $t3,$C,$C vmovdqu 0xc0-0x80($ctx),$t3 vpand $Xn,$G,$G vpaddd $Xi,$D,$D vmovdqu 
0xe0-0x80($ctx),$Xi vpand $Xn,$H,$H vpaddd $t1,$E,$E vpaddd $t2,$F,$F vmovdqu $A,0x00-0x80($ctx) vpaddd $t3,$G,$G vmovdqu $B,0x20-0x80($ctx) vpaddd $Xi,$H,$H vmovdqu $C,0x40-0x80($ctx) vmovdqu $D,0x60-0x80($ctx) vmovdqu $E,0x80-0x80($ctx) vmovdqu $F,0xa0-0x80($ctx) vmovdqu $G,0xc0-0x80($ctx) vmovdqu $H,0xe0-0x80($ctx) vmovdqu $sigma,(%rbx) # save counters lea 256+128(%rsp),%rbx vmovdqu .Lpbswap(%rip),$Xn dec $num jnz .Loop_avx2 #mov `$REG_SZ*17+8`(%rsp),$num #lea $REG_SZ($ctx),$ctx #lea `16*$REG_SZ/4`($inp),$inp #dec $num #jnz .Loop_grande_avx2 .Ldone_avx2: mov `$REG_SZ*17`(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx2: ret .cfi_endproc .size sha256_multi_block_avx2,.-sha256_multi_block_avx2 ___ } }}} $code.=<<___; .align 256 K256: ___ sub TABLE { foreach (@_) { $code.=<<___; .long $_,$_,$_,$_ .long $_,$_,$_,$_ ___ } } &TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, 
0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); $code.=<<___; .Lpbswap: .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap K256_shaext: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by " ___ if ($win64) { # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->Rip<.Lbody jb .Lin_prologue mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue 
label cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lin_prologue mov `16*17`(%rax),%rax # pull saved stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp lea -24-10*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler ___ $code.=<<___ if ($avx>1); .type avx2_handler,\@abi-omnipotent .align 16 avx2_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue mov `32*17`($context),%rax # pull saved stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp mov 
-24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 lea -56-10*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq jmp .Lin_prologue .size avx2_handler,.-avx2_handler ___ $code.=<<___; .section .pdata .align 4 .rva .LSEH_begin_sha256_multi_block .rva .LSEH_end_sha256_multi_block .rva .LSEH_info_sha256_multi_block .rva .LSEH_begin_sha256_multi_block_shaext .rva .LSEH_end_sha256_multi_block_shaext .rva .LSEH_info_sha256_multi_block_shaext ___ $code.=<<___ if ($avx); .rva .LSEH_begin_sha256_multi_block_avx .rva .LSEH_end_sha256_multi_block_avx .rva .LSEH_info_sha256_multi_block_avx ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_sha256_multi_block_avx2 .rva .LSEH_end_sha256_multi_block_avx2 .rva .LSEH_info_sha256_multi_block_avx2 ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_sha256_multi_block: .byte 9,0,0,0 .rva se_handler .rva .Lbody,.Lepilogue # HandlerData[] .LSEH_info_sha256_multi_block_shaext: .byte 9,0,0,0 .rva se_handler .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[] ___ $code.=<<___ if ($avx); .LSEH_info_sha256_multi_block_avx: .byte 9,0,0,0 .rva se_handler .rva .Lbody_avx,.Lepilogue_avx # HandlerData[] ___ $code.=<<___ if ($avx>1); .LSEH_info_sha256_multi_block_avx2: .byte 9,0,0,0 .rva avx2_handler .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[] ___ } #################################################################### sub rex { local *opcode=shift; my ($dst,$src)=@_; my $rex=0; $rex|=0x04 if ($dst>=8); $rex|=0x01 if ($src>=8); unshift @opcode,$rex|0x40 if ($rex); } sub sha256op38 { my $instr = shift; my %opcodelet = ( "sha256rnds2" => 0xcb, "sha256msg1" => 0xcc, "sha256msg2" => 0xcd ); if 
(defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x0f,0x38); rex(\@opcode,$2,$1); push @opcode,$opcodelet{$instr}; push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } else { return $instr."\t".@_[0]; } } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; Index: head/crypto/openssl/crypto/sha/asm/sha512-x86_64.pl =================================================================== --- head/crypto/openssl/crypto/sha/asm/sha512-x86_64.pl (revision 364821) +++ head/crypto/openssl/crypto/sha/asm/sha512-x86_64.pl (revision 364822) @@ -1,2558 +1,2558 @@ #! /usr/bin/env perl # Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. Rights for redistribution and usage in source and binary # forms are granted according to the OpenSSL license. # ==================================================================== # # sha256/512_block procedure for x86_64. # # 40% improvement over compiler-generated code on Opteron. On EM64T # sha256 was observed to run >80% faster and sha512 - >40%. No magical # tricks, just straight implementation... 
I really wonder why gcc # [being armed with inline assembler] fails to generate as fast code. # The only thing which is cool about this module is that it's the very # same instruction sequence used for both SHA-256 and SHA-512. In # the former case the instructions operate on 32-bit operands, while in # the latter - on 64-bit ones. All I had to do is to get one flavor right, # the other one passed the test right away:-) # # sha256_block runs in ~1005 cycles on Opteron, which gives you # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock # frequency in GHz. sha512_block runs in ~1275 cycles, which results # in 128*1000/1275=100MBps per GHz. Is there room for improvement? # Well, if you compare it to IA-64 implementation, which maintains # X[16] in register bank[!], tends to 4 instructions per CPU clock # cycle and runs in 1003 cycles, 1275 is a very good result for 3-way # issue Opteron pipeline and X[16] maintained in memory. So that *if* # there is a way to improve it, *then* the only way would be to try to # offload X[16] updates to SSE unit, but that would require "deeper" # loop unroll, which in turn would naturally cause size blow-up, not # to mention increased complexity! And once again, only *if* it's # actually possible to noticeably improve overall ILP, instruction # level parallelism, on a given CPU implementation in this case. # # Special note on Intel EM64T. While Opteron CPU exhibits perfect # performance ratio of 1.5 between 64- and 32-bit flavors [see above], # [currently available] EM64T CPUs apparently are far from it. On the # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit # sha256_block:-( This is presumably because 64-bit shifts/rotates # apparently are not atomic instructions, but implemented in microcode. # # May 2012. 
# # Optimization including one of Pavel Semjanov's ideas, alternative # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and # unfortunately -2% SHA512 on P4 [which nobody should care about # that much]. # # June 2012. # # Add SIMD code paths, see below for improvement coefficients. SSSE3 # code path was not attempted for SHA512, because improvement is not # estimated to be high enough, noticeably less than 9%, to justify # the effort, not on pre-AVX processors. [Obviously with exclusion # for VIA Nano, but it has a SHA512 instruction that is faster and # should be used instead.] For reference, corresponding estimated # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that # higher coefficients are observed on VIA Nano and Bulldozer has more # to do with specifics of their architecture [which is a topic for # separate discussion]. # # November 2012. # # Add AVX2 code path. Two consecutive input blocks are loaded to # 256-bit %ymm registers, with data from first block to least # significant 128-bit halves and data from second to most significant. # The data is then processed with the same SIMD instruction sequence as # for AVX, but with %ymm as operands. Side effect is increased stack # frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB # code size increase. # # March 2014. # # Add support for Intel SHA Extensions. 
###################################################################### # Current performance in cycles per processed byte (less is better): # # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) # # AMD K8 14.9 - - 9.57 - # P4 17.3 - - 30.8 - # Core 2 15.6 13.8(+13%) - 9.97 - # Westmere 14.8 12.3(+19%) - 9.58 - # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) # Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) # Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%) # VIA Nano 23.0 16.5(+39%) - 14.7 - # Atom 23.0 18.9(+22%) - 14.7 - # Silvermont 27.4 20.6(+33%) - 17.5 - # Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%) # Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - # # (*) whichever best applicable, including SHAEXT; # (**) switch from ror to shrd stands for fair share of improvement; # (***) execution time is fully determined by remaining integer-only # part, body_00_15; reducing the amount of SIMD instructions # below certain limit makes no difference/sense; to conserve # space SHA256 XOP code path is therefore omitted; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = 
($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } $shaext=1; ### set to zero if compiling for 1.0.1 $avx=1 if (!$shaext && $avx); open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if ($output =~ /512/) { $func="sha512_block_data_order"; $TABLE="K512"; $SZ=8; @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", "%r8", "%r9", "%r10","%r11"); ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi"); @Sigma0=(28,34,39); @Sigma1=(14,18,41); @sigma0=(1, 8, 7); @sigma1=(19,61, 6); $rounds=80; } else { $func="sha256_block_data_order"; $TABLE="K256"; $SZ=4; @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", "%r8d","%r9d","%r10d","%r11d"); ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @sigma1=(17,19,10); $rounds=64; } $ctx="%rdi"; # 1st arg, zapped by $a3 $inp="%rsi"; # 2nd arg $Tbl="%rbp"; $_ctx="16*$SZ+0*8(%rsp)"; $_inp="16*$SZ+1*8(%rsp)"; $_end="16*$SZ+2*8(%rsp)"; $_rsp="`16*$SZ+3*8`(%rsp)"; $framesz="16*$SZ+4*8"; sub ROUND_00_15() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; my $STRIDE=$SZ; $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); $code.=<<___; ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 mov $f,$a2 xor $e,$a0 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 xor $g,$a2 # f^g mov $T1,`$SZ*($i&0xf)`(%rsp) xor $a,$a1 and $e,$a2 # (f^g)&e ror \$`$Sigma1[1]-$Sigma1[0]`,$a0 add $h,$T1 # T1+=h xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 xor $e,$a0 add $a2,$T1 # T1+=Ch(e,f,g) mov $a,$a2 add ($Tbl),$T1 # T1+=K[round] xor $a,$a1 xor $b,$a2 # a^b, b^c in next round ror \$$Sigma1[0],$a0 # Sigma1(e) mov $b,$h and $a2,$a3 ror \$$Sigma0[0],$a1 # Sigma0(a) add $a0,$T1 # T1+=Sigma1(e) xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) add $T1,$d # d+=T1 add $T1,$h # h+=T1 lea 
$STRIDE($Tbl),$Tbl # round++ ___ $code.=<<___ if ($i<15); add $a1,$h # h+=Sigma0(a) ___ ($a2,$a3) = ($a3,$a2); } sub ROUND_16_XX() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 mov $a0,$T1 ror \$`$sigma0[1]-$sigma0[0]`,$a0 add $a1,$a # modulo-scheduled h+=Sigma0(a) mov $a2,$a1 ror \$`$sigma1[1]-$sigma1[0]`,$a2 xor $T1,$a0 shr \$$sigma0[2],$T1 ror \$$sigma0[0],$a0 xor $a1,$a2 shr \$$sigma1[2],$a1 ror \$$sigma1[0],$a2 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) xor $a1,$a2 # sigma1(X[(i+14)&0xf]) add `$SZ*(($i+9)&0xf)`(%rsp),$T1 add `$SZ*($i&0xf)`(%rsp),$T1 mov $e,$a0 add $a2,$T1 mov $a,$a1 ___ &ROUND_00_15(@_); } $code=<<___; .text .extern OPENSSL_ia32cap_P .globl $func .type $func,\@function,3 .align 16 $func: .cfi_startproc ___ $code.=<<___ if ($SZ==4 || $avx); lea OPENSSL_ia32cap_P(%rip),%r11 mov 0(%r11),%r9d mov 4(%r11),%r10d mov 8(%r11),%r11d ___ $code.=<<___ if ($SZ==4 && $shaext); test \$`1<<29`,%r11d # check for SHA jnz _shaext_shortcut ___ $code.=<<___ if ($avx && $SZ==8); test \$`1<<11`,%r10d # check for XOP jnz .Lxop_shortcut ___ $code.=<<___ if ($avx>1); and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 cmp \$`1<<8|1<<5|1<<3`,%r11d je .Lavx2_shortcut ___ $code.=<<___ if ($avx); and \$`1<<30`,%r9d # mask "Intel CPU" bit and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits or %r9d,%r10d cmp \$`1<<28|1<<9|1<<30`,%r10d je .Lavx_shortcut ___ $code.=<<___ if ($SZ==4); test \$`1<<9`,%r10d jnz .Lssse3_shortcut ___ $code.=<<___; mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 shl \$4,%rdx # num*16 sub \$$framesz,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ and \$-64,%rsp # align stack frame mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg mov %rax,$_rsp # save copy of %rsp 
.cfi_cfa_expression $_rsp,deref,+8 .Lprologue: mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H jmp .Lloop .align 16 .Lloop: mov $B,$a3 lea $TABLE(%rip),$Tbl xor $C,$a3 # magic ___ for($i=0;$i<16;$i++) { $code.=" mov $SZ*$i($inp),$T1\n"; $code.=" mov @ROT[4],$a0\n"; $code.=" mov @ROT[0],$a1\n"; $code.=" bswap $T1\n"; &ROUND_00_15($i,@ROT); unshift(@ROT,pop(@ROT)); } $code.=<<___; jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: ___ for(;$i<32;$i++) { &ROUND_16_XX($i,@ROT); unshift(@ROT,pop(@ROT)); } $code.=<<___; cmpb \$0,`$SZ-1`($Tbl) jnz .Lrounds_16_xx mov $_ctx,$ctx add $a1,$A # modulo-scheduled h+=Sigma0(a) lea 16*$SZ($inp),$inp add $SZ*0($ctx),$A add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop mov $_rsp,%rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue: ret .cfi_endproc .size $func,.-$func ___ if ($SZ==4) { $code.=<<___; .align 64 .type $TABLE,\@object $TABLE: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 
0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by " ___ } else { $code.=<<___; .align 64 .type $TABLE,\@object $TABLE: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 
0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 
0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by " ___ } ###################################################################### # SIMD code paths # if ($SZ==4 && $shaext) {{{ ###################################################################### # Intel SHA Extensions implementation of SHA256 update function. 
#
# Register assignment for the SHA-NI path (all fixed by the lists below):
#   $ctx=%rdi  $inp=%rsi  $num=%rdx  $Tbl=%rcx
#   $Wi=%xmm0  $ABEF=%xmm1  $CDGH=%xmm2  $TMP=%xmm7  $BSWAP=%xmm8
#   $ABEF_SAVE=%xmm9  $CDGH_SAVE=%xmm10  @MSG=%xmm3..%xmm6
my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
my @MSG=map("%xmm$_",(3..6));

# Function header; _shaext_shortcut is a second label on the same entry point.
$code.=<<___;
.type	sha256_block_data_order_shaext,\@function,3
.align	64
sha256_block_data_order_shaext:
_shaext_shortcut:
.cfi_startproc
___
# Win64 only: reserve 8+5*16 bytes and spill %xmm6-%xmm10 at negative offsets
# from %rax.
# NOTE(review): %rax is not assigned anywhere in the visible code; presumably
# the perlasm translator's Win64 function prologue sets it to the entry %rsp --
# confirm against x86_64-xlate.pl.
$code.=<<___ if ($win64);
	lea	`-8-5*16`(%rsp),%rsp
	movaps	%xmm6,-8-5*16(%rax)
	movaps	%xmm7,-8-4*16(%rax)
	movaps	%xmm8,-8-3*16(%rax)
	movaps	%xmm9,-8-2*16(%rax)
	movaps	%xmm10,-8-1*16(%rax)
.Lprologue_shaext:
___
# Setup plus rounds 0-15.  $Tbl is biased to K256+0x80, so every table access
# below is written as N*32-0x80($Tbl); the byte-swap mask is fetched from
# 0x200-0x80($Tbl).  The state words are shuffled from memory order into the
# ABEF/CDGH pairing that sha256rnds2 operates on (see the inline lane comments).
# At the top of each iteration the current state is copied to
# $CDGH_SAVE/$ABEF_SAVE so it can be added back after round 63.
$code.=<<___;
	lea	K256+0x80(%rip),$Tbl
	movdqu	($ctx),$ABEF	# DCBA
	movdqu	16($ctx),$CDGH	# HGFE
	movdqa	0x200-0x80($Tbl),$TMP	# byte swap mask

	pshufd	\$0x1b,$ABEF,$Wi	# ABCD
	pshufd	\$0xb1,$ABEF,$ABEF	# CDAB
	pshufd	\$0x1b,$CDGH,$CDGH	# EFGH
	movdqa	$TMP,$BSWAP	# offload
	palignr	\$8,$CDGH,$ABEF	# ABEF
	punpcklqdq	$Wi,$CDGH	# CDGH
	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	movdqu	($inp),@MSG[0]
	movdqu	0x10($inp),@MSG[1]
	movdqu	0x20($inp),@MSG[2]
	pshufb	$TMP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]

	movdqa	0*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	pshufb	$TMP,@MSG[1]
	movdqa	$CDGH,$CDGH_SAVE	# offload
	sha256rnds2	$ABEF,$CDGH	# 0-3
	pshufd	\$0x0e,$Wi,$Wi
	nop
	movdqa	$ABEF,$ABEF_SAVE	# offload
	sha256rnds2	$CDGH,$ABEF

	movdqa	1*32-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	pshufb	$TMP,@MSG[2]
	sha256rnds2	$ABEF,$CDGH	# 4-7
	pshufd	\$0x0e,$Wi,$Wi
	lea	0x40($inp),$inp
	sha256msg1	@MSG[1],@MSG[0]
	sha256rnds2	$CDGH,$ABEF

	movdqa	2*32-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
	pshufb	$TMP,@MSG[3]
	sha256rnds2	$ABEF,$CDGH	# 8-11
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[3],$TMP
	palignr	\$4,@MSG[2],$TMP
	nop
	paddd	$TMP,@MSG[0]
	sha256msg1	@MSG[2],@MSG[1]
	sha256rnds2	$CDGH,$ABEF

	movdqa	3*32-0x80($Tbl),$Wi
	paddd	@MSG[3],$Wi
	sha256msg2	@MSG[3],@MSG[0]
	sha256rnds2	$ABEF,$CDGH	# 12-15
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[0],$TMP
	palignr	\$4,@MSG[3],$TMP
	nop
	paddd	$TMP,@MSG[1]
	sha256msg1	@MSG[3],@MSG[2]
	sha256rnds2	$CDGH,$ABEF
___
# Rounds 16-51: nine identical four-round groups ($i = 4..12).  The Perl-level
# rotation of @MSG after each group renames the message registers so the same
# template text can be reused; $i selects the K256 row in the emitted operand.
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa	$i*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 16-19...
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	nop
	paddd	$TMP,@MSG[2]
	sha256msg1	@MSG[0],@MSG[3]
	sha256rnds2	$CDGH,$ABEF
___
	push(@MSG,shift(@MSG));
}
# Rounds 52-63, then the feed-forward add of the saved state, the loop branch
# on $num (decremented once per 64-byte block), and the shuffle back to the
# in-memory word order before storing to ($ctx).  $TMP is reloaded from $BSWAP
# because it was reused as scratch during the message schedule.
$code.=<<___;
	movdqa	13*32-0x80($Tbl),$Wi
	paddd	@MSG[0],$Wi
	sha256msg2	@MSG[0],@MSG[1]
	sha256rnds2	$ABEF,$CDGH	# 52-55
	pshufd	\$0x0e,$Wi,$Wi
	movdqa	@MSG[1],$TMP
	palignr	\$4,@MSG[0],$TMP
	sha256rnds2	$CDGH,$ABEF
	paddd	$TMP,@MSG[2]

	movdqa	14*32-0x80($Tbl),$Wi
	paddd	@MSG[1],$Wi
	sha256rnds2	$ABEF,$CDGH	# 56-59
	pshufd	\$0x0e,$Wi,$Wi
	sha256msg2	@MSG[1],@MSG[2]
	movdqa	$BSWAP,$TMP
	sha256rnds2	$CDGH,$ABEF

	movdqa	15*32-0x80($Tbl),$Wi
	paddd	@MSG[2],$Wi
	nop
	sha256rnds2	$ABEF,$CDGH	# 60-63
	pshufd	\$0x0e,$Wi,$Wi
	dec	$num
	nop
	sha256rnds2	$CDGH,$ABEF

	paddd	$CDGH_SAVE,$CDGH
	paddd	$ABEF_SAVE,$ABEF
	jnz	.Loop_shaext

	pshufd	\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd	\$0x1b,$ABEF,$TMP	# FEBA
	pshufd	\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF	# DCBA
	palignr	\$8,$TMP,$CDGH	# HGFE

	movdqu	$ABEF,($ctx)
	movdqu	$CDGH,16($ctx)
___
# Win64 only: reload %xmm6-%xmm10 from the same %rax-relative slots used in
# the prologue and restore %rsp from %rax.
$code.=<<___ if ($win64);
	movaps	-8-5*16(%rax),%xmm6
	movaps	-8-4*16(%rax),%xmm7
	movaps	-8-3*16(%rax),%xmm8
	movaps	-8-2*16(%rax),%xmm9
	movaps	-8-1*16(%rax),%xmm10
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.cfi_endproc
.size	sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
___
# Closes the "if (\$SZ==4 && \$shaext) {{{" scope opened above this span.
}}}
# New lexical scope for the SIMD generators; it is not closed within this span.
{{{

# $a4 aliases $T1 (defined earlier in the file); $a..$h are reloaded from @ROT
# at the start of every generated round.
my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

# Catch-all for calls to undefined subs such as &ror(...), &mov(...), &add(...).
# The sub name (package prefix stripped) becomes the mnemonic.  Arguments are
# given destination-first; the last argument is popped, prefixed with '$' when
# it is purely numeric (immediate), and emitted first, followed by the
# remaining arguments in reverse order -- i.e. the operand list is reversed
# into source,...,destination order.  One tab-delimited line is appended to
# $code per call.
sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}

# Returns one scalar round as a list of 26 Perl snippets (strings) that callers
# eval one at a time, which lets them interleave these integer instructions
# with SIMD message-schedule instructions.  The first snippet also reloads
# $a..$h from @ROT; the last one also swaps $a2/$a3, rotates @ROT and bumps $i,
# so the snippets must be evaluated in order and in full.
# Relies on $a0..$a3, @ROT, @Sigma0, @Sigma1, $SZ and $i from the enclosing
# file scope (defined outside this span).
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

	'&ror ($a0,$Sigma1[2]-$Sigma1[1])',
	'&mov ($a,$a1)',
	'&mov ($a4,$f)',

	'&ror ($a1,$Sigma0[2]-$Sigma0[1])',
	'&xor ($a0,$e)',
	'&xor ($a4,$g)',			# f^g

	'&ror ($a0,$Sigma1[1]-$Sigma1[0])',
	'&xor ($a1,$a)',
	'&and ($a4,$e)',			# (f^g)&e

	'&xor ($a0,$e)',
	'&add ($h,$SZ*($i&15)."(%rsp)")',	# h+=X[i]+K[i]
	'&mov ($a2,$a)',

	'&xor ($a4,$g)',			# Ch(e,f,g)=((f^g)&e)^g
	'&ror ($a1,$Sigma0[1]-$Sigma0[0])',
	'&xor ($a2,$b)',			# a^b, b^c in next round

	'&add ($h,$a4)',			# h+=Ch(e,f,g)
	'&ror ($a0,$Sigma1[0])',		# Sigma1(e)
	'&and ($a3,$a2)',			# (b^c)&(a^b)

	'&xor ($a1,$a)',
	'&add ($h,$a0)',			# h+=Sigma1(e)
	'&xor ($a3,$b)',			# Maj(a,b,c)=Ch(a^b,c,b)

	'&ror ($a1,$Sigma0[0])',		# Sigma0(a)
	'&add ($d,$h)',				# d+=h
	'&add ($h,$a3)',			# h+=Maj(a,b,c)

	'&mov ($a0,$d)',
	'&add ($a1,$h);'.			# h+=Sigma0(a)
	'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
	);
}

######################################################################
# SSSE3 code path
#
# Emitted for SHA-256 only.  @X holds the 16-word message window in
# %xmm0-%xmm3; $t0..$t5 are scratch in %xmm4-%xmm9.  This "if" block continues
# past the end of this span.
if ($SZ==4) {	# SHA256 only
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));

# Prologue: keep the entry %rsp in %rax, push the six callee-saved GPRs, turn
# the block count in %rdx into an end-of-input pointer, carve out a 64-byte
# aligned frame, and stash ctx/inp/end/original-%rsp in the frame slots named
# by $_ctx/$_inp/$_end/$_rsp (defined outside this span).
$code.=<<___;
.type	${func}_ssse3,\@function,3
.align	64
${func}_ssse3:
.cfi_startproc
.Lssse3_shortcut:
	mov	%rsp,%rax	# copy %rsp
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
	shl	\$4,%rdx	# num*16
	sub	\$`$framesz+$win64*16*4`,%rsp
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	and	\$-64,%rsp	# align stack frame
	mov	$ctx,$_ctx	# save ctx, 1st arg
	mov	$inp,$_inp	# save inp, 2nd arh
	mov	%rdx,$_end	# save end pointer, "3rd" arg
	mov	%rax,$_rsp	# save copy of %rsp
.cfi_cfa_expression	$_rsp,deref,+8
___
# Win64 only: spill %xmm6-%xmm9 into the extra 4*16 bytes reserved above.
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
# Load the eight working variables from the context.
$code.=<<___;
.Lprologue_ssse3:

	mov	$SZ*0($ctx),$A
	mov	$SZ*1($ctx),$B
	mov	$SZ*2($ctx),$C
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___
$code.=<<___; #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Lloop_ssse3 .align 16 .Lloop_ssse3: movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 movdqu 0x00($inp),@X[0] movdqu 0x10($inp),@X[1] movdqu 0x20($inp),@X[2] pshufb $t3,@X[0] movdqu 0x30($inp),@X[3] lea $TABLE(%rip),$Tbl pshufb $t3,@X[1] movdqa 0x00($Tbl),$t0 movdqa 0x20($Tbl),$t1 pshufb $t3,@X[2] paddd @X[0],$t0 movdqa 0x40($Tbl),$t2 pshufb $t3,@X[3] movdqa 0x60($Tbl),$t3 paddd @X[1],$t1 paddd @X[2],$t2 paddd @X[3],$t3 movdqa $t0,0x00(%rsp) mov $A,$a1 movdqa $t1,0x10(%rsp) mov $B,$a3 movdqa $t2,0x20(%rsp) xor $C,$a3 # magic movdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lssse3_00_47 .align 16 .Lssse3_00_47: sub \$`-16*2*$SZ`,$Tbl # size optimization ___ sub Xupdate_256_SSSE3 () { ( '&movdqa ($t0,@X[1]);', '&movdqa ($t3,@X[3])', '&palignr ($t0,@X[0],$SZ)', # X[1..4] '&palignr ($t3,@X[2],$SZ);', # X[9..12] '&movdqa ($t1,$t0)', '&movdqa ($t2,$t0);', '&psrld ($t0,$sigma0[2])', '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] '&psrld ($t2,$sigma0[0])', '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] '&pslld ($t1,8*$SZ-$sigma0[1]);'. '&pxor ($t0,$t2)', '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. '&pxor ($t0,$t1)', '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
'&pxor ($t0,$t2);', '&movdqa ($t2,$t3)', '&pxor ($t0,$t1);', # sigma0(X[1..4]) '&psrld ($t3,$sigma1[2])', '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) '&psrlq ($t2,$sigma1[0])', '&pxor ($t3,$t2);', '&psrlq ($t2,$sigma1[1]-$sigma1[0])', '&pxor ($t3,$t2)', '&pshufb ($t3,$t4)', # sigma1(X[14..15]) '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] '&movdqa ($t2,$t3);', '&psrld ($t3,$sigma1[2])', '&psrlq ($t2,$sigma1[0])', '&pxor ($t3,$t2);', '&psrlq ($t2,$sigma1[1]-$sigma1[0])', '&pxor ($t3,$t2);', '&movdqa ($t2,16*2*$j."($Tbl)")', '&pshufb ($t3,$t5)', '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) ); } sub SSSE3_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions if (0) { foreach (Xupdate_256_SSSE3()) { # 36 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } } else { # squeeze extra 4% on Westmere and 19% on Atom eval(shift(@insns)); #@ &movdqa ($t0,@X[1]); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t3,@X[3]); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &palignr ($t0,@X[0],$SZ); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); &palignr ($t3,@X[2],$SZ); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &movdqa ($t1,$t0); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t2,$t0); eval(shift(@insns)); #@ eval(shift(@insns)); &psrld ($t0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t3); # X[0..3] += X[9..12] eval(shift(@insns)); #@ eval(shift(@insns)); &psrld ($t2,$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &pshufd ($t3,@X[3],0b11111010); # X[4..15] eval(shift(@insns)); eval(shift(@insns)); #@ &pslld ($t1,8*$SZ-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t0,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); 
eval(shift(@insns)); eval(shift(@insns)); #@ &psrld ($t2,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); &pxor ($t0,$t1); eval(shift(@insns)); eval(shift(@insns)); &pslld ($t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t0,$t2); eval(shift(@insns)); eval(shift(@insns)); #@ &movdqa ($t2,$t3); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t0,$t1); # sigma0(X[1..4]) eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); &psrld ($t3,$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); #@ eval(shift(@insns)); &psrlq ($t2,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &psrlq ($t2,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); #&pshufb ($t3,$t4); # sigma1(X[14..15]) &pshufd ($t3,$t3,0b10000000); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psrldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufd ($t3,@X[0],0b01010000); # X[16..17] eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &movdqa ($t2,$t3); eval(shift(@insns)); eval(shift(@insns)); &psrld ($t3,$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); #@ &psrlq ($t2,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &psrlq ($t2,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ #&pshufb ($t3,$t5); 
&pshufd ($t3,$t3,0b00001000); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t2,16*2*$j."($Tbl)"); eval(shift(@insns)); #@ eval(shift(@insns)); &pslldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); } &paddd ($t2,@X[0]); foreach (@insns) { eval; } # remaining instructions &movdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<4; $j++) { &SSSE3_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lssse3_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } $code.=<<___; mov $_ctx,$ctx mov $a1,$A add $SZ*0($ctx),$A lea 16*$SZ($inp),$inp add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop_ssse3 mov $_rsp,%rsi .cfi_def_cfa %rsi,8 ___ $code.=<<___ if ($win64); movaps 16*$SZ+32(%rsp),%xmm6 movaps 16*$SZ+48(%rsp),%xmm7 movaps 16*$SZ+64(%rsp),%xmm8 movaps 16*$SZ+80(%rsp),%xmm9 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_ssse3: ret .cfi_endproc .size ${func}_ssse3,.-${func}_ssse3 ___ } if ($avx) {{ ###################################################################### # XOP code path # if ($SZ==8) { # SHA512 only $code.=<<___; .type ${func}_xop,\@function,3 .align 64 ${func}_xop: .cfi_startproc .Lxop_shortcut: mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 
.cfi_push %r14 push %r15 .cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ and \$-64,%rsp # align stack frame mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg mov %rax,$_rsp # save copy of %rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) movaps %xmm7,16*$SZ+48(%rsp) movaps %xmm8,16*$SZ+64(%rsp) movaps %xmm9,16*$SZ+80(%rsp) ___ $code.=<<___ if ($win64 && $SZ>4); movaps %xmm10,16*$SZ+96(%rsp) movaps %xmm11,16*$SZ+112(%rsp) ___ $code.=<<___; .Lprologue_xop: vzeroupper mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H jmp .Lloop_xop ___ if ($SZ==4) { # SHA256 my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); $code.=<<___; .align 16 .Lloop_xop: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vmovdqu 0x30($inp),@X[3] vpshufb $t3,@X[0],@X[0] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[1],@X[1] vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) mov $B,$a3 vmovdqa $t2,0x20(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lxop_00_47 .align 16 .Lxop_00_47: sub \$`-16*2*$SZ`,$Tbl # size optimization ___ sub XOP_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t0,$t0,$sigma0[2]); eval(shift(@insns)); 
eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t1); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t2,@X[3],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrldq ($t3,$t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t2,@X[0],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslldq ($t3,$t3,8); # 22 instructions eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd 
($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<4; $j++) { &XOP_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lxop_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } else { # SHA512 my @X = map("%xmm$_",(0..7)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); $code.=<<___; .align 16 .Lloop_xop: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] lea $TABLE+0x80(%rip),$Tbl # size optimization vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vpshufb $t3,@X[0],@X[0] vmovdqu 0x30($inp),@X[3] vpshufb $t3,@X[1],@X[1] vmovdqu 0x40($inp),@X[4] vpshufb $t3,@X[2],@X[2] vmovdqu 0x50($inp),@X[5] vpshufb $t3,@X[3],@X[3] vmovdqu 0x60($inp),@X[6] vpshufb $t3,@X[4],@X[4] vmovdqu 0x70($inp),@X[7] vpshufb $t3,@X[5],@X[5] vpaddq -0x80($Tbl),@X[0],$t0 vpshufb $t3,@X[6],@X[6] vpaddq -0x60($Tbl),@X[1],$t1 vpshufb $t3,@X[7],@X[7] vpaddq -0x40($Tbl),@X[2],$t2 vpaddq -0x20($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) vpaddq 0x00($Tbl),@X[4],$t0 vmovdqa $t1,0x10(%rsp) vpaddq 0x20($Tbl),@X[5],$t1 vmovdqa $t2,0x20(%rsp) vpaddq 0x40($Tbl),@X[6],$t2 vmovdqa $t3,0x30(%rsp) vpaddq 0x60($Tbl),@X[7],$t3 vmovdqa $t0,0x40(%rsp) mov $A,$a1 vmovdqa $t1,0x50(%rsp) mov $B,$a3 vmovdqa $t2,0x60(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x70(%rsp) mov $E,$a0 jmp .Lxop_00_47 .align 16 .Lxop_00_47: add \$`16*2*$SZ`,$Tbl ___ sub XOP_512_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body); # 52 instructions &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] eval(shift(@insns)); eval(shift(@insns)); &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] eval(shift(@insns)); eval(shift(@insns)); &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &vpsrlq ($t0,$t0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] eval(shift(@insns)); eval(shift(@insns)); 
eval(shift(@insns)); eval(shift(@insns)); &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t1); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) eval(shift(@insns)); eval(shift(@insns)); &vpsrlq ($t2,@X[7],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) eval(shift(@insns)); eval(shift(@insns)); &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<8; $j++) { &XOP_512_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); &jne (".Lxop_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } $code.=<<___; mov $_ctx,$ctx mov $a1,$A add $SZ*0($ctx),$A lea 16*$SZ($inp),$inp add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop_xop mov $_rsp,%rsi .cfi_def_cfa %rsi,8 vzeroupper ___ $code.=<<___ if ($win64); movaps 16*$SZ+32(%rsp),%xmm6 movaps 16*$SZ+48(%rsp),%xmm7 movaps 16*$SZ+64(%rsp),%xmm8 movaps 16*$SZ+80(%rsp),%xmm9 ___ $code.=<<___ if ($win64 && $SZ>4); 
movaps 16*$SZ+96(%rsp),%xmm10 movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_xop: ret .cfi_endproc .size ${func}_xop,.-${func}_xop ___ } ###################################################################### # AVX+shrd code path # local *ror = sub { &shrd(@_[0],@_) }; $code.=<<___; .type ${func}_avx,\@function,3 .align 64 ${func}_avx: .cfi_startproc .Lavx_shortcut: mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ and \$-64,%rsp # align stack frame mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg mov %rax,$_rsp # save copy of %rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) movaps %xmm7,16*$SZ+48(%rsp) movaps %xmm8,16*$SZ+64(%rsp) movaps %xmm9,16*$SZ+80(%rsp) ___ $code.=<<___ if ($win64 && $SZ>4); movaps %xmm10,16*$SZ+96(%rsp) movaps %xmm11,16*$SZ+112(%rsp) ___ $code.=<<___; .Lprologue_avx: vzeroupper mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H ___ if ($SZ==4) { # SHA256 my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Lloop_avx .align 16 .Lloop_avx: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vmovdqu 
0x30($inp),@X[3] vpshufb $t3,@X[0],@X[0] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[1],@X[1] vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) mov $B,$a3 vmovdqa $t2,0x20(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lavx_00_47 .align 16 .Lavx_00_47: sub \$`-16*2*$SZ`,$Tbl # size optimization ___ sub Xupdate_256_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] '&vpsrld ($t2,$t0,$sigma0[0]);', '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] '&vpsrld ($t3,$t0,$sigma0[2])', '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpxor ($t0,$t3,$t2)', '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t1)', '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t2)', '&vpsrld ($t2,$t3,$sigma1[2]);', '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) '&vpsrlq ($t3,$t3,$sigma1[0]);', '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) '&vpxor ($t2,$t2,$t3);', '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', '&vpxor ($t2,$t2,$t3)', '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] '&vpsrld ($t2,$t3,$sigma1[2])', '&vpsrlq ($t3,$t3,$sigma1[0])', '&vpxor ($t2,$t2,$t3);', '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', '&vpxor ($t2,$t2,$t3)', '&vpshufb ($t2,$t2,$t5)', '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) ); } sub AVX_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions foreach (Xupdate_256_AVX()) { # 29 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; 
$j<4; $j++) { &AVX_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lavx_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } else { # SHA512 my @X = map("%xmm$_",(0..7)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); $code.=<<___; jmp .Lloop_avx .align 16 .Lloop_avx: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] lea $TABLE+0x80(%rip),$Tbl # size optimization vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vpshufb $t3,@X[0],@X[0] vmovdqu 0x30($inp),@X[3] vpshufb $t3,@X[1],@X[1] vmovdqu 0x40($inp),@X[4] vpshufb $t3,@X[2],@X[2] vmovdqu 0x50($inp),@X[5] vpshufb $t3,@X[3],@X[3] vmovdqu 0x60($inp),@X[6] vpshufb $t3,@X[4],@X[4] vmovdqu 0x70($inp),@X[7] vpshufb $t3,@X[5],@X[5] vpaddq -0x80($Tbl),@X[0],$t0 vpshufb $t3,@X[6],@X[6] vpaddq -0x60($Tbl),@X[1],$t1 vpshufb $t3,@X[7],@X[7] vpaddq -0x40($Tbl),@X[2],$t2 vpaddq -0x20($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) vpaddq 0x00($Tbl),@X[4],$t0 vmovdqa $t1,0x10(%rsp) vpaddq 0x20($Tbl),@X[5],$t1 vmovdqa $t2,0x20(%rsp) vpaddq 0x40($Tbl),@X[6],$t2 vmovdqa $t3,0x30(%rsp) vpaddq 0x60($Tbl),@X[7],$t3 vmovdqa $t0,0x40(%rsp) mov $A,$a1 vmovdqa $t1,0x50(%rsp) mov $B,$a3 vmovdqa $t2,0x60(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x70(%rsp) mov $E,$a0 jmp .Lavx_00_47 .align 16 .Lavx_00_47: add \$`16*2*$SZ`,$Tbl ___ sub Xupdate_512_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] '&vpsrlq ($t2,$t0,$sigma0[0])', '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] '&vpsrlq ($t3,$t0,$sigma0[2])', '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpxor ($t0,$t3,$t2)', '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t1)', '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t2)', '&vpsrlq ($t3,@X[7],$sigma1[2]);', '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) '&vpsrlq ($t1,@X[7],$sigma1[0]);', 
'&vpxor ($t3,$t3,$t2)', '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', '&vpxor ($t3,$t3,$t1)', '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', '&vpxor ($t3,$t3,$t2)', '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) ); } sub AVX_512_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body); # 52 instructions foreach (Xupdate_512_AVX()) { # 23 instructions eval; eval(shift(@insns)); eval(shift(@insns)); } &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<8; $j++) { &AVX_512_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); &jne (".Lavx_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } $code.=<<___; mov $_ctx,$ctx mov $a1,$A add $SZ*0($ctx),$A lea 16*$SZ($inp),$inp add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop_avx mov $_rsp,%rsi .cfi_def_cfa %rsi,8 vzeroupper ___ $code.=<<___ if ($win64); movaps 16*$SZ+32(%rsp),%xmm6 movaps 16*$SZ+48(%rsp),%xmm7 movaps 16*$SZ+64(%rsp),%xmm8 movaps 16*$SZ+80(%rsp),%xmm9 ___ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+96(%rsp),%xmm10 movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size ${func}_avx,.-${func}_avx ___ if ($avx>1) {{ ###################################################################### # AVX2+BMI code path # my 
$a5=$SZ==4?"%esi":"%rsi"; # zap $inp my $PUSH8=8*2*$SZ; use integer; sub bodyx_00_15 () { # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f ( '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] '&and ($a4,$e)', # f&e '&rorx ($a0,$e,$Sigma1[2])', '&rorx ($a2,$e,$Sigma1[1])', '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past '&lea ($h,"($h,$a4)")', '&andn ($a4,$e,$g)', # ~e&g '&xor ($a0,$a2)', '&rorx ($a1,$e,$Sigma1[0])', '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) '&xor ($a0,$a1)', # Sigma1(e) '&mov ($a2,$a)', '&rorx ($a4,$a,$Sigma0[2])', '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) '&xor ($a2,$b)', # a^b, b^c in next round '&rorx ($a1,$a,$Sigma0[1])', '&rorx ($a0,$a,$Sigma0[0])', '&lea ($d,"($d,$h)")', # d+=h '&and ($a3,$a2)', # (b^c)&(a^b) '&xor ($a1,$a4)', '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) '&xor ($a1,$a0)', # Sigma0(a) '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) '&mov ($a4,$e)', # copy of f in future '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' ); # and at the finish one has to $a+=$a1 } $code.=<<___; .type ${func}_avx2,\@function,3 .align 64 ${func}_avx2: .cfi_startproc .Lavx2_shortcut: mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp shl \$4,%rdx # num*16 and \$-256*$SZ,%rsp # align stack frame lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ add \$`2*$SZ*($rounds-8)`,%rsp mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg mov %rax,$_rsp # save copy of %rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) movaps %xmm7,16*$SZ+48(%rsp) movaps %xmm8,16*$SZ+64(%rsp) movaps %xmm9,16*$SZ+80(%rsp) ___ $code.=<<___ if ($win64 && $SZ>4); movaps %xmm10,16*$SZ+96(%rsp) movaps 
%xmm11,16*$SZ+112(%rsp) ___ $code.=<<___; .Lprologue_avx2: vzeroupper sub \$-16*$SZ,$inp # inp++, size optimization mov $SZ*0($ctx),$A mov $inp,%r12 # borrow $T1 mov $SZ*1($ctx),$B cmp %rdx,$inp # $_end mov $SZ*2($ctx),$C cmove %rsp,%r12 # next block or random data mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H ___ if ($SZ==4) { # SHA256 my @X = map("%ymm$_",(0..3)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); $code.=<<___; vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Loop_avx2 .align 16 .Loop_avx2: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu -16*$SZ+0($inp),%xmm0 vmovdqu -16*$SZ+16($inp),%xmm1 vmovdqu -16*$SZ+32($inp),%xmm2 vmovdqu -16*$SZ+48($inp),%xmm3 #mov $inp,$_inp # offload $inp vinserti128 \$1,(%r12),@X[0],@X[0] vinserti128 \$1,16(%r12),@X[1],@X[1] vpshufb $t3,@X[0],@X[0] vinserti128 \$1,32(%r12),@X[2],@X[2] vpshufb $t3,@X[1],@X[1] vinserti128 \$1,48(%r12),@X[3],@X[3] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) xor $a1,$a1 vmovdqa $t1,0x20(%rsp) ___ $code.=<<___ if (!$win64); # temporarily use %rdi as frame pointer mov $_rsp,%rdi .cfi_def_cfa %rdi,8 ___ $code.=<<___; lea -$PUSH8(%rsp),%rsp ___ $code.=<<___ if (!$win64); # the frame info is at $_rsp, but the stack is moving... 
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
	mov	%rdi,-8(%rsp)
.cfi_cfa_expression	%rsp-8,deref,+8
___
$code.=<<___;
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	jmp	.Lavx2_00_47
.align	16
.Lavx2_00_47:
___

# AVX2_256_00_47($j, $body, @X) -- emit one step of the SHA-256 AVX2
# .Lavx2_00_47 loop body.
#   $j     step index; selects the table operand 16*2*$j($Tbl) and the stack
#          slot (32*$j)%$PUSH8(%rsp); on even values %rsp is first lowered
#          by $PUSH8
#   $body  code ref; called four times, the returned lists are concatenated
#          into @insns and every element is later handed to eval
#   @X     register names; only $X[0] is referenced directly in this sub
# Every element returned by Xupdate_256_AVX() is eval'ed and followed by
# three elements shifted off @insns; whatever remains in @insns is eval'ed
# after the vpaddd.
# NOTE(review): the eval'ed strings execute in this sub's lexical scope, so
# they can see $j, @X and $base; presumably that is why $base is declared
# although nothing else here reads it -- confirm against bodyx_00_15 and
# Xupdate_256_AVX, which are defined outside this chunk.
# The sub carries an empty prototype but is invoked below with a leading
# '&', a call form that does not apply prototypes.
sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	# every second step: move the stack window down by $PUSH8 and, on
	# non-Win64 targets, re-save the secondary frame pointer for the CFI
	if (($j%2)==0) {
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
	lea	8(%rsp),%rsp
.cfi_cfa_expression	%rsp-8,deref,+8
___
	}
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd		($t2,@X[0],16*2*$j."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    # four steps, rotating @X by one register name after each
    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	# advance $Tbl and branch back to .Lavx2_00_47 while the byte at
	# ($SZ-1)($Tbl) compares non-zero
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    # NOTE(review): the loop header never advances $i; presumably the
    # eval'ed strings from bodyx_00_15 increment it -- confirm.
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
					} else {	# SHA512
    my @X = map("%ymm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp	# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
___
$code.=<<___ if (!$win64);
# temporarily use %rdi as frame pointer
	mov	$_rsp,%rdi
.cfi_def_cfa	%rdi,8
___
$code.=<<___;
	lea	-$PUSH8(%rsp),%rsp
___
$code.=<<___ if (!$win64);
# the frame info is at $_rsp, but the stack is moving...
# so a second frame pointer is saved at -8(%rsp)
# that is in the red zone
	mov	%rdi,-8(%rsp)
.cfi_cfa_expression	%rsp-8,deref,+8
___
$code.=<<___;
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x40(%rsp)
	xor	$C,$a3			# magic
	vmovdqa	$t3,0x60(%rsp)
	mov	$F,$a4
	add	\$16*2*$SZ,$Tbl
	jmp	.Lavx2_00_47
.align	16
.Lavx2_00_47:
___

# AVX2_512_00_47($j, $body, @X) -- SHA-512 counterpart of the step emitter.
#   $j     step index; selects the table operand (16*2*$j-0x80)($Tbl) and
#          the stack slot (32*$j)%$PUSH8(%rsp); on every fourth value %rsp
#          is first lowered by $PUSH8
#   $body  code ref; called twice, the returned lists are concatenated into
#          @insns and every element is later handed to eval
#   @X     register names; only $X[0] is referenced directly in this sub
# Every element returned by Xupdate_512_AVX() is eval'ed; unless that
# element ends in ';', three elements shifted off @insns are eval'ed after
# it.  Whatever remains in @insns is eval'ed after the vpaddq.
# NOTE(review): as the eval'ed strings run in this sub's lexical scope,
# $base is presumably consumed by them -- confirm against bodyx_00_15.
sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	# every fourth step: move the stack window down by $PUSH8 and, on
	# non-Win64 targets, re-save the secondary frame pointer for the CFI
	if (($j%4)==0) {
	&lea	("%rsp","-$PUSH8(%rsp)");
$code.=<<___ if (!$win64);
.cfi_cfa_expression	%rsp+`$PUSH8-8`,deref,+8
# copy secondary frame pointer to new location again at -8(%rsp)
	pushq	$PUSH8-8(%rsp)
.cfi_cfa_expression	%rsp,deref,+8
	lea	8(%rsp),%rsp
.cfi_cfa_expression	%rsp-8,deref,+8
___
	}
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    if ($_ !~ /\;$/) {
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
	    }
	}
	&vpaddq		($t2,@X[0],16*2*$j-0x80."($Tbl)");
	  foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    # eight steps, rotating @X by one register name after each
    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	# advance $Tbl and branch back to .Lavx2_00_47 while the byte at
	# ($SZ-1-0x80)($Tbl) compares non-zero
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    # NOTE(review): the loop header never advances $i; presumably the
    # eval'ed strings from bodyx_00_15 increment it -- confirm.
    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
}
$code.=<<___;
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H
	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2
	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3			# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
    # body of the .Lower_avx2 loop; the eval'ed strings see $base="+16($Tbl)"
    for ($i=0; $i<8; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
    }
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
# restore frame pointer to original location at $_rsp
.cfi_cfa_expression	$_rsp,deref,+8
	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp
	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jbe	.Loop_avx2
	lea	(%rsp),$Tbl
# temporarily use $Tbl as index to $_rsp
# this avoids the need to save a secondary frame pointer at -8(%rsp)
.cfi_cfa_expression	$Tbl+`16*$SZ+3*8`,deref,+8
.Ldone_avx2:
	mov	`16*$SZ+3*8`($Tbl),%rsi
.cfi_def_cfa	%rsi,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32($Tbl),%xmm6
	movaps	16*$SZ+48($Tbl),%xmm7
	movaps	16*$SZ+64($Tbl),%xmm8
movaps	16*$SZ+80($Tbl),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96($Tbl),%xmm10
	movaps	16*$SZ+112($Tbl),%xmm11
___
$code.=<<___;
	mov	-48(%rsi),%r15
.cfi_restore	%r15
	mov	-40(%rsi),%r14
.cfi_restore	%r14
	mov	-32(%rsi),%r13
.cfi_restore	%r13
	mov	-24(%rsi),%r12
.cfi_restore	%r12
	mov	-16(%rsi),%rbp
.cfi_restore	%rbp
	mov	-8(%rsi),%rbx
.cfi_restore	%rbx
	lea	(%rsi),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64-only structured-exception support.  se_handler is shared by every
# code path whose .xdata entry supplies a (prologue label, epilogue label)
# pair as HandlerData[]; shaext_handler serves the SHA-extension path and
# jumps into se_handler's .Lin_prologue tail.  Both copy the adjusted
# CONTEXT into disp->ContextRecord, call RtlVirtualUnwind and return
# ExceptionContinueSearch (1).
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# NOTE(review): in the text this block was recovered from, two runs of
# instructions in se_handler had been deleted (everything between a '<' in a
# comment and the next '->').  The spans between "context->Rip<prologue
# label" and "pull context->Rsp", and between "context->Rip<avx2_shortcut"
# and "restore context->Rbx", are reconstructed; verify them against the
# pristine generator before relying on the emitted handler.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
# AVX2 code moves %rsp while it runs; step back to the frame base first.
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code

	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___

# Handler for the SHA-extension code path: between .Lprologue_shaext and
# .Lepilogue_shaext it copies ten quadwords from -8-5*16(%rax) into
# context->Xmm6 onwards, then continues at .Lin_prologue in se_handler.
$code.=<<___ if ($SZ==4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lin_prologue

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___

# .pdata: one (begin, end, unwind-info) RVA triple per emitted code path.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
# .xdata: one unwind-info record per code path.  Records that name
# se_handler carry the (prologue label, epilogue label) pair the handler
# reads as HandlerData[0] and HandlerData[1].
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

# sha256op38($instr, $operands) -- hand-encode a SHA-256 instruction.
#   $instr     mnemonic as it appears in the generated text
#   $operands  the rest of the line after the mnemonic
# Returns a ".byte" directive (0x0f,0x38,opcode,ModR/M) when $instr is one
# of sha256rnds2/sha256msg1/sha256msg2 and both operands are %xmm0..%xmm7;
# otherwise returns the instruction text unchanged ("$instr\t$operands") so
# the assembler deals with it.
sub sha256op38 {
    # Copy the arguments first: the caller passes $1 and $2, which @_
    # aliases, and the match below would otherwise change what they hold.
    my ($instr, $operands) = @_;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    # (?![0-9]) keeps %xmm10..%xmm15 in the second operand from being
    # matched as %xmm1 and silently encoded as the wrong register.
    if (defined($opcodelet{$instr})
	&& $operands =~ /%xmm([0-7]),\s*%xmm([0-7])(?![0-9])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    }
    return $instr."\t".$operands;
}

# Final pass over the accumulated text, one line at a time: evaluate every
# `...` expression, replace sha256* instructions through sha256op38, and
# print the line to STDOUT.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";