Index: stable/12/crypto/openssl/crypto/aes/asm/aesni-mb-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/aes/asm/aesni-mb-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/aes/asm/aesni-mb-x86_64.pl (revision 364963) @@ -1,1474 +1,1474 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # Multi-buffer AES-NI procedures process several independent buffers # in parallel by interleaving independent instructions. # # Cycles per byte for interleave factor 4: # # asymptotic measured # --------------------------- # Westmere 5.00/4=1.25 5.13/4=1.28 # Atom 15.0/4=3.75 ?15.7/4=3.93 # Sandy Bridge 5.06/4=1.27 5.18/4=1.29 # Ivy Bridge 5.06/4=1.27 5.14/4=1.29 # Haswell 4.44/4=1.11 4.44/4=1.11 # Bulldozer 5.75/4=1.44 5.76/4=1.44 # # Cycles per byte for interleave factor 8 (not implemented for # pre-AVX processors, where higher interleave factor incidentally # doesn't result in improvement): # # asymptotic measured # --------------------------- # Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*) # Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*) # Haswell 5.00/8=0.63 5.00/8=0.63 # Bulldozer 5.75/8=0.72 5.77/8=0.72 # # (*) Sandy/Ivy Bridge are known to handle high interleave factors # suboptimally; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; $avx=0; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void aesni_multi_cbc_encrypt ( # struct { void *inp,*out; int blocks; double iv[2]; } inp[8]; # const AES_KEY *key, # int num); /* 1 or 2 */ # $inp="%rdi"; # 1st arg $key="%rsi"; # 2nd arg $num="%edx"; @inptr=map("%r$_",(8..11)); @outptr=map("%r$_",(12..15)); ($rndkey0,$rndkey1)=("%xmm0","%xmm1"); @out=map("%xmm$_",(2..5)); @inp=map("%xmm$_",(6..9)); ($counters,$mask,$zero)=map("%xmm$_",(10..12)); ($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx"); $code.=<<___; .text .extern OPENSSL_ia32cap_P 
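# The only functional change in this hunk is dropping the '^' anchor from the
# clang detection pattern, so 'cc -v' banners that put a vendor prefix before
# the word "clang" still enable the AVX code paths. A minimal standalone check
# of the two patterns; the banner strings are made-up examples, not captured
# compiler output.

my @banners = (
    "clang version 3.4.1",
    "FreeBSD clang version 10.0.1",	# vendor prefix defeats the old '^clang'
);
for my $banner (@banners) {
    my ($old,$new) = (0,0);
    $old = ($2>=3.0) + ($2>3.0)
	if ($banner =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/);
    $new = ($2>=3.0) + ($2>3.0)
	if ($banner =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/);
    printf "old=%d new=%d  %s\n", $old, $new, $banner;	# second banner: old=0, new=2
}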
.globl aesni_multi_cbc_encrypt .type aesni_multi_cbc_encrypt,\@function,3 .align 32 aesni_multi_cbc_encrypt: .cfi_startproc ___ $code.=<<___ if ($avx); cmp \$2,$num jb .Lenc_non_avx mov OPENSSL_ia32cap_P+4(%rip),%ecx test \$`1<<28`,%ecx # AVX bit jnz _avx_cbc_enc_shortcut jmp .Lenc_non_avx .align 16 .Lenc_non_avx: ___ $code.=<<___; mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) movaps %xmm12,0x60(%rsp) movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) ___ $code.=<<___; # stack layout # # +0 output sink # +16 input sink [original %rsp and $num] # +32 counters sub \$48,%rsp and \$-64,%rsp mov %rax,16(%rsp) # original %rsp .cfi_cfa_expression %rsp+16,deref,+8 .Lenc4x_body: movdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization lea 40*2($inp),$inp .Lenc4x_loop_grande: mov $num,24(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { $code.=<<___; mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks mov `40*$i+0-40*2`($inp),@inptr[$i] cmp $num,$one mov `40*$i+8-40*2`($inp),@outptr[$i] cmovg $one,$num # find maximum test $one,$one movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@inptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Lenc4x_done movups 0x10-0x78($key),$rndkey1 pxor $zero,@out[0] movups 0x20-0x78($key),$rndkey0 pxor $zero,@out[1] mov 0xf0-0x78($key),$rounds pxor $zero,@out[2] movdqu (@inptr[0]),@inp[0] # load inputs pxor $zero,@out[3] movdqu (@inptr[1]),@inp[1] pxor @inp[0],@out[0] movdqu (@inptr[2]),@inp[2] pxor @inp[1],@out[1] movdqu (@inptr[3]),@inp[3] pxor @inp[2],@out[2] pxor @inp[3],@out[3] movdqa 32(%rsp),$counters # load counters xor $offset,$offset jmp .Loop_enc4x .align 32 .Loop_enc4x: add \$16,$offset lea 16(%rsp),$sink # sink pointer mov \$1,$one # constant of 1 sub $offset,$sink aesenc $rndkey1,@out[0] prefetcht0 31(@inptr[0],$offset) # prefetch input prefetcht0 31(@inptr[1],$offset) aesenc $rndkey1,@out[1] prefetcht0 31(@inptr[2],$offset) prefetcht0 31(@inptr[2],$offset) aesenc $rndkey1,@out[2] aesenc $rndkey1,@out[3] movups 0x30-0x78($key),$rndkey1 ___ for($i=0;$i<4;$i++) { my $rndkey = ($i&1) ? 
$rndkey1 : $rndkey0; $code.=<<___; cmp `32+4*$i`(%rsp),$one aesenc $rndkey,@out[0] aesenc $rndkey,@out[1] aesenc $rndkey,@out[2] cmovge $sink,@inptr[$i] # cancel input cmovg $sink,@outptr[$i] # sink output aesenc $rndkey,@out[3] movups `0x40+16*$i-0x78`($key),$rndkey ___ } $code.=<<___; movdqa $counters,$mask aesenc $rndkey0,@out[0] prefetcht0 15(@outptr[0],$offset) # prefetch output prefetcht0 15(@outptr[1],$offset) aesenc $rndkey0,@out[1] prefetcht0 15(@outptr[2],$offset) prefetcht0 15(@outptr[3],$offset) aesenc $rndkey0,@out[2] aesenc $rndkey0,@out[3] movups 0x80-0x78($key),$rndkey0 pxor $zero,$zero aesenc $rndkey1,@out[0] pcmpgtd $zero,$mask movdqu -0x78($key),$zero # reload 0-round key aesenc $rndkey1,@out[1] paddd $mask,$counters # decrement counters movdqa $counters,32(%rsp) # update counters aesenc $rndkey1,@out[2] aesenc $rndkey1,@out[3] movups 0x90-0x78($key),$rndkey1 cmp \$11,$rounds aesenc $rndkey0,@out[0] aesenc $rndkey0,@out[1] aesenc $rndkey0,@out[2] aesenc $rndkey0,@out[3] movups 0xa0-0x78($key),$rndkey0 jb .Lenc4x_tail aesenc $rndkey1,@out[0] aesenc $rndkey1,@out[1] aesenc $rndkey1,@out[2] aesenc $rndkey1,@out[3] movups 0xb0-0x78($key),$rndkey1 aesenc $rndkey0,@out[0] aesenc $rndkey0,@out[1] aesenc $rndkey0,@out[2] aesenc $rndkey0,@out[3] movups 0xc0-0x78($key),$rndkey0 je .Lenc4x_tail aesenc $rndkey1,@out[0] aesenc $rndkey1,@out[1] aesenc $rndkey1,@out[2] aesenc $rndkey1,@out[3] movups 0xd0-0x78($key),$rndkey1 aesenc $rndkey0,@out[0] aesenc $rndkey0,@out[1] aesenc $rndkey0,@out[2] aesenc $rndkey0,@out[3] movups 0xe0-0x78($key),$rndkey0 jmp .Lenc4x_tail .align 32 .Lenc4x_tail: aesenc $rndkey1,@out[0] aesenc $rndkey1,@out[1] aesenc $rndkey1,@out[2] aesenc $rndkey1,@out[3] movdqu (@inptr[0],$offset),@inp[0] movdqu 0x10-0x78($key),$rndkey1 aesenclast $rndkey0,@out[0] movdqu (@inptr[1],$offset),@inp[1] pxor $zero,@inp[0] aesenclast $rndkey0,@out[1] movdqu (@inptr[2],$offset),@inp[2] pxor $zero,@inp[1] aesenclast $rndkey0,@out[2] movdqu (@inptr[3],$offset),@inp[3] pxor $zero,@inp[2] aesenclast $rndkey0,@out[3] movdqu 0x20-0x78($key),$rndkey0 pxor $zero,@inp[3] movups @out[0],-16(@outptr[0],$offset) pxor @inp[0],@out[0] movups @out[1],-16(@outptr[1],$offset) pxor @inp[1],@out[1] movups @out[2],-16(@outptr[2],$offset) pxor @inp[2],@out[2] movups @out[3],-16(@outptr[3],$offset) pxor @inp[3],@out[3] dec $num jnz .Loop_enc4x mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 mov 24(%rsp),$num #pxor @inp[0],@out[0] #pxor @inp[1],@out[1] #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME! #pxor @inp[2],@out[2] #movdqu @out[1],`40*1+24-40*2`($inp) #pxor @inp[3],@out[3] #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out... 
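# A minimal scalar model (illustration only, hypothetical block counts) of the
# cmovge/cmovg lane-cancel logic generated for .Loop_enc4x above: buffers
# shorter than the longest one are never branched around; once a lane's
# counter runs out, its pointers are redirected to the on-stack sink, so all
# four lanes keep executing the same straight-line code for max(blocks)
# iterations.

my @blocks  = (4, 2, 3, 1);			# hypothetical per-lane block counts
my @counter = @blocks;
my $num = (sort { $b <=> $a } @blocks)[0];	# "find maximum", as cmovg does
for my $t (1 .. $num) {
    for my $lane (0 .. 3) {
	my $sink_next_in = $counter[$lane] <= 1;   # cmovge: next input load -> sink
	my $sink_out     = $counter[$lane] <= 0;   # cmovg:  this output store -> sink
	printf "iter %d lane %d: store %s, next load %s\n", $t, $lane,
	       $sink_out ? "sunk" : "written", $sink_next_in ? "sunk" : "real";
	$counter[$lane]-- if $counter[$lane] > 0;  # pcmpgtd/paddd: decrement, floor at 0
    }
}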
lea `40*4`($inp),$inp dec $num jnz .Lenc4x_loop_grande .Lenc4x_done: ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 #movaps -0x68(%rax),%xmm13 #movaps -0x58(%rax),%xmm14 #movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lenc4x_epilogue: ret .cfi_endproc .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt .globl aesni_multi_cbc_decrypt .type aesni_multi_cbc_decrypt,\@function,3 .align 32 aesni_multi_cbc_decrypt: .cfi_startproc ___ $code.=<<___ if ($avx); cmp \$2,$num jb .Ldec_non_avx mov OPENSSL_ia32cap_P+4(%rip),%ecx test \$`1<<28`,%ecx # AVX bit jnz _avx_cbc_dec_shortcut jmp .Ldec_non_avx .align 16 .Ldec_non_avx: ___ $code.=<<___; mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) movaps %xmm12,0x60(%rsp) movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) ___ $code.=<<___; # stack layout # # +0 output sink # +16 input sink [original %rsp and $num] # +32 counters sub \$48,%rsp and \$-64,%rsp mov %rax,16(%rsp) # original %rsp .cfi_cfa_expression %rsp+16,deref,+8 .Ldec4x_body: movdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization lea 40*2($inp),$inp .Ldec4x_loop_grande: mov $num,24(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { $code.=<<___; mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks mov `40*$i+0-40*2`($inp),@inptr[$i] cmp $num,$one mov `40*$i+8-40*2`($inp),@outptr[$i] cmovg $one,$num # find maximum test $one,$one movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@inptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldec4x_done movups 0x10-0x78($key),$rndkey1 movups 0x20-0x78($key),$rndkey0 mov 0xf0-0x78($key),$rounds movdqu (@inptr[0]),@out[0] # load inputs movdqu (@inptr[1]),@out[1] pxor $zero,@out[0] movdqu (@inptr[2]),@out[2] pxor $zero,@out[1] movdqu (@inptr[3]),@out[3] pxor $zero,@out[2] pxor $zero,@out[3] movdqa 32(%rsp),$counters # load counters xor $offset,$offset jmp .Loop_dec4x .align 32 .Loop_dec4x: add \$16,$offset lea 16(%rsp),$sink # sink pointer mov \$1,$one # constant of 1 sub $offset,$sink aesdec $rndkey1,@out[0] prefetcht0 31(@inptr[0],$offset) # prefetch input prefetcht0 31(@inptr[1],$offset) aesdec $rndkey1,@out[1] prefetcht0 31(@inptr[2],$offset) prefetcht0 31(@inptr[3],$offset) aesdec $rndkey1,@out[2] aesdec $rndkey1,@out[3] movups 0x30-0x78($key),$rndkey1 ___ for($i=0;$i<4;$i++) { my $rndkey = ($i&1) ? 
$rndkey1 : $rndkey0; $code.=<<___; cmp `32+4*$i`(%rsp),$one aesdec $rndkey,@out[0] aesdec $rndkey,@out[1] aesdec $rndkey,@out[2] cmovge $sink,@inptr[$i] # cancel input cmovg $sink,@outptr[$i] # sink output aesdec $rndkey,@out[3] movups `0x40+16*$i-0x78`($key),$rndkey ___ } $code.=<<___; movdqa $counters,$mask aesdec $rndkey0,@out[0] prefetcht0 15(@outptr[0],$offset) # prefetch output prefetcht0 15(@outptr[1],$offset) aesdec $rndkey0,@out[1] prefetcht0 15(@outptr[2],$offset) prefetcht0 15(@outptr[3],$offset) aesdec $rndkey0,@out[2] aesdec $rndkey0,@out[3] movups 0x80-0x78($key),$rndkey0 pxor $zero,$zero aesdec $rndkey1,@out[0] pcmpgtd $zero,$mask movdqu -0x78($key),$zero # reload 0-round key aesdec $rndkey1,@out[1] paddd $mask,$counters # decrement counters movdqa $counters,32(%rsp) # update counters aesdec $rndkey1,@out[2] aesdec $rndkey1,@out[3] movups 0x90-0x78($key),$rndkey1 cmp \$11,$rounds aesdec $rndkey0,@out[0] aesdec $rndkey0,@out[1] aesdec $rndkey0,@out[2] aesdec $rndkey0,@out[3] movups 0xa0-0x78($key),$rndkey0 jb .Ldec4x_tail aesdec $rndkey1,@out[0] aesdec $rndkey1,@out[1] aesdec $rndkey1,@out[2] aesdec $rndkey1,@out[3] movups 0xb0-0x78($key),$rndkey1 aesdec $rndkey0,@out[0] aesdec $rndkey0,@out[1] aesdec $rndkey0,@out[2] aesdec $rndkey0,@out[3] movups 0xc0-0x78($key),$rndkey0 je .Ldec4x_tail aesdec $rndkey1,@out[0] aesdec $rndkey1,@out[1] aesdec $rndkey1,@out[2] aesdec $rndkey1,@out[3] movups 0xd0-0x78($key),$rndkey1 aesdec $rndkey0,@out[0] aesdec $rndkey0,@out[1] aesdec $rndkey0,@out[2] aesdec $rndkey0,@out[3] movups 0xe0-0x78($key),$rndkey0 jmp .Ldec4x_tail .align 32 .Ldec4x_tail: aesdec $rndkey1,@out[0] aesdec $rndkey1,@out[1] aesdec $rndkey1,@out[2] pxor $rndkey0,@inp[0] pxor $rndkey0,@inp[1] aesdec $rndkey1,@out[3] movdqu 0x10-0x78($key),$rndkey1 pxor $rndkey0,@inp[2] pxor $rndkey0,@inp[3] movdqu 0x20-0x78($key),$rndkey0 aesdeclast @inp[0],@out[0] aesdeclast @inp[1],@out[1] movdqu -16(@inptr[0],$offset),@inp[0] # load next IV movdqu -16(@inptr[1],$offset),@inp[1] aesdeclast @inp[2],@out[2] aesdeclast @inp[3],@out[3] movdqu -16(@inptr[2],$offset),@inp[2] movdqu -16(@inptr[3],$offset),@inp[3] movups @out[0],-16(@outptr[0],$offset) movdqu (@inptr[0],$offset),@out[0] movups @out[1],-16(@outptr[1],$offset) movdqu (@inptr[1],$offset),@out[1] pxor $zero,@out[0] movups @out[2],-16(@outptr[2],$offset) movdqu (@inptr[2],$offset),@out[2] pxor $zero,@out[1] movups @out[3],-16(@outptr[3],$offset) movdqu (@inptr[3],$offset),@out[3] pxor $zero,@out[2] pxor $zero,@out[3] dec $num jnz .Loop_dec4x mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 mov 24(%rsp),$num lea `40*4`($inp),$inp dec $num jnz .Ldec4x_loop_grande .Ldec4x_done: ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 #movaps -0x68(%rax),%xmm13 #movaps -0x58(%rax),%xmm14 #movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Ldec4x_epilogue: ret .cfi_endproc .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt ___ if ($avx) {{{ my @ptr=map("%r$_",(8..15)); my $offload=$sink; my @out=map("%xmm$_",(2..9)); my @inp=map("%xmm$_",(10..13)); my ($counters,$zero)=("%xmm14","%xmm15"); $code.=<<___; .type 
aesni_multi_cbc_encrypt_avx,\@function,3 .align 32 aesni_multi_cbc_encrypt_avx: .cfi_startproc _avx_cbc_enc_shortcut: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) movaps %xmm12,-0x78(%rax) movaps %xmm13,-0x68(%rax) movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) ___ $code.=<<___; # stack layout # # +0 output sink # +16 input sink [original %rsp and $num] # +32 counters # +64 distances between inputs and outputs # +128 off-load area for @inp[0..3] sub \$192,%rsp and \$-128,%rsp mov %rax,16(%rsp) # original %rsp .cfi_cfa_expression %rsp+16,deref,+8 .Lenc8x_body: vzeroupper vmovdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization lea 40*4($inp),$inp shr \$1,$num .Lenc8x_loop_grande: #mov $num,24(%rsp) # original $num xor $num,$num ___ for($i=0;$i<8;$i++) { my $temp = $i ? $offload : $offset; $code.=<<___; mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer cmp $num,$one mov `40*$i+8-40*4`($inp),$temp # output pointer cmovg $one,$num # find maximum test $one,$one vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@ptr[$i] # cancel input sub @ptr[$i],$temp # distance between input and output mov $temp,`64+8*$i`(%rsp) # initialize distances ___ } $code.=<<___; test $num,$num jz .Lenc8x_done vmovups 0x10-0x78($key),$rndkey1 vmovups 0x20-0x78($key),$rndkey0 mov 0xf0-0x78($key),$rounds vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round lea 128(%rsp),$offload # offload area vpxor (@ptr[1]),$zero,@inp[1] vpxor (@ptr[2]),$zero,@inp[2] vpxor (@ptr[3]),$zero,@inp[3] vpxor @inp[0],@out[0],@out[0] vpxor (@ptr[4]),$zero,@inp[0] vpxor @inp[1],@out[1],@out[1] vpxor (@ptr[5]),$zero,@inp[1] vpxor @inp[2],@out[2],@out[2] vpxor (@ptr[6]),$zero,@inp[2] vpxor @inp[3],@out[3],@out[3] vpxor (@ptr[7]),$zero,@inp[3] vpxor @inp[0],@out[4],@out[4] mov \$1,$one # constant of 1 vpxor @inp[1],@out[5],@out[5] vpxor @inp[2],@out[6],@out[6] vpxor @inp[3],@out[7],@out[7] jmp .Loop_enc8x .align 32 .Loop_enc8x: ___ for($i=0;$i<8;$i++) { my $rndkey=($i&1)?$rndkey0:$rndkey1; $code.=<<___; vaesenc $rndkey,@out[0],@out[0] cmp 32+4*$i(%rsp),$one ___ $code.=<<___ if ($i); mov 64+8*$i(%rsp),$offset ___ $code.=<<___; vaesenc $rndkey,@out[1],@out[1] prefetcht0 31(@ptr[$i]) # prefetch input vaesenc $rndkey,@out[2],@out[2] ___ $code.=<<___ if ($i>1); prefetcht0 15(@ptr[$i-2]) # prefetch output ___ $code.=<<___; vaesenc $rndkey,@out[3],@out[3] lea (@ptr[$i],$offset),$offset cmovge %rsp,@ptr[$i] # cancel input vaesenc $rndkey,@out[4],@out[4] cmovg %rsp,$offset # sink output vaesenc $rndkey,@out[5],@out[5] sub @ptr[$i],$offset vaesenc $rndkey,@out[6],@out[6] vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round mov $offset,64+8*$i(%rsp) vaesenc $rndkey,@out[7],@out[7] vmovups `16*(3+$i)-0x78`($key),$rndkey lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output ___ $code.=<<___ if ($i<4) vmovdqu @inp[$i%4],`16*$i`($offload) # off-load ___ } $code.=<<___; vmovdqu 32(%rsp),$counters prefetcht0 15(@ptr[$i-2]) # prefetch output prefetcht0 15(@ptr[$i-1]) cmp \$11,$rounds jb .Lenc8x_tail vaesenc $rndkey1,@out[0],@out[0] vaesenc 
$rndkey1,@out[1],@out[1] vaesenc $rndkey1,@out[2],@out[2] vaesenc $rndkey1,@out[3],@out[3] vaesenc $rndkey1,@out[4],@out[4] vaesenc $rndkey1,@out[5],@out[5] vaesenc $rndkey1,@out[6],@out[6] vaesenc $rndkey1,@out[7],@out[7] vmovups 0xb0-0x78($key),$rndkey1 vaesenc $rndkey0,@out[0],@out[0] vaesenc $rndkey0,@out[1],@out[1] vaesenc $rndkey0,@out[2],@out[2] vaesenc $rndkey0,@out[3],@out[3] vaesenc $rndkey0,@out[4],@out[4] vaesenc $rndkey0,@out[5],@out[5] vaesenc $rndkey0,@out[6],@out[6] vaesenc $rndkey0,@out[7],@out[7] vmovups 0xc0-0x78($key),$rndkey0 je .Lenc8x_tail vaesenc $rndkey1,@out[0],@out[0] vaesenc $rndkey1,@out[1],@out[1] vaesenc $rndkey1,@out[2],@out[2] vaesenc $rndkey1,@out[3],@out[3] vaesenc $rndkey1,@out[4],@out[4] vaesenc $rndkey1,@out[5],@out[5] vaesenc $rndkey1,@out[6],@out[6] vaesenc $rndkey1,@out[7],@out[7] vmovups 0xd0-0x78($key),$rndkey1 vaesenc $rndkey0,@out[0],@out[0] vaesenc $rndkey0,@out[1],@out[1] vaesenc $rndkey0,@out[2],@out[2] vaesenc $rndkey0,@out[3],@out[3] vaesenc $rndkey0,@out[4],@out[4] vaesenc $rndkey0,@out[5],@out[5] vaesenc $rndkey0,@out[6],@out[6] vaesenc $rndkey0,@out[7],@out[7] vmovups 0xe0-0x78($key),$rndkey0 .Lenc8x_tail: vaesenc $rndkey1,@out[0],@out[0] vpxor $zero,$zero,$zero vaesenc $rndkey1,@out[1],@out[1] vaesenc $rndkey1,@out[2],@out[2] vpcmpgtd $zero,$counters,$zero vaesenc $rndkey1,@out[3],@out[3] vaesenc $rndkey1,@out[4],@out[4] vpaddd $counters,$zero,$zero # decrement counters vmovdqu 48(%rsp),$counters vaesenc $rndkey1,@out[5],@out[5] mov 64(%rsp),$offset # pre-load 1st offset vaesenc $rndkey1,@out[6],@out[6] vaesenc $rndkey1,@out[7],@out[7] vmovups 0x10-0x78($key),$rndkey1 vaesenclast $rndkey0,@out[0],@out[0] vmovdqa $zero,32(%rsp) # update counters vpxor $zero,$zero,$zero vaesenclast $rndkey0,@out[1],@out[1] vaesenclast $rndkey0,@out[2],@out[2] vpcmpgtd $zero,$counters,$zero vaesenclast $rndkey0,@out[3],@out[3] vaesenclast $rndkey0,@out[4],@out[4] vpaddd $zero,$counters,$counters # decrement counters vmovdqu -0x78($key),$zero # 0-round vaesenclast $rndkey0,@out[5],@out[5] vaesenclast $rndkey0,@out[6],@out[6] vmovdqa $counters,48(%rsp) # update counters vaesenclast $rndkey0,@out[7],@out[7] vmovups 0x20-0x78($key),$rndkey0 vmovups @out[0],-16(@ptr[0]) # write output sub $offset,@ptr[0] # switch to input vpxor 0x00($offload),@out[0],@out[0] vmovups @out[1],-16(@ptr[1]) sub `64+1*8`(%rsp),@ptr[1] vpxor 0x10($offload),@out[1],@out[1] vmovups @out[2],-16(@ptr[2]) sub `64+2*8`(%rsp),@ptr[2] vpxor 0x20($offload),@out[2],@out[2] vmovups @out[3],-16(@ptr[3]) sub `64+3*8`(%rsp),@ptr[3] vpxor 0x30($offload),@out[3],@out[3] vmovups @out[4],-16(@ptr[4]) sub `64+4*8`(%rsp),@ptr[4] vpxor @inp[0],@out[4],@out[4] vmovups @out[5],-16(@ptr[5]) sub `64+5*8`(%rsp),@ptr[5] vpxor @inp[1],@out[5],@out[5] vmovups @out[6],-16(@ptr[6]) sub `64+6*8`(%rsp),@ptr[6] vpxor @inp[2],@out[6],@out[6] vmovups @out[7],-16(@ptr[7]) sub `64+7*8`(%rsp),@ptr[7] vpxor @inp[3],@out[7],@out[7] dec $num jnz .Loop_enc8x mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 #mov 24(%rsp),$num #lea `40*8`($inp),$inp #dec $num #jnz .Lenc8x_loop_grande .Lenc8x_done: vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov 
-24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lenc8x_epilogue: ret .cfi_endproc .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx .type aesni_multi_cbc_decrypt_avx,\@function,3 .align 32 aesni_multi_cbc_decrypt_avx: .cfi_startproc _avx_cbc_dec_shortcut: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) movaps %xmm12,-0x78(%rax) movaps %xmm13,-0x68(%rax) movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) ___ $code.=<<___; # stack layout # # +0 output sink # +16 input sink [original %rsp and $num] # +32 counters # +64 distances between inputs and outputs # +128 off-load area for @inp[0..3] # +192 IV/input offload sub \$256,%rsp and \$-256,%rsp sub \$192,%rsp mov %rax,16(%rsp) # original %rsp .cfi_cfa_expression %rsp+16,deref,+8 .Ldec8x_body: vzeroupper vmovdqu ($key),$zero # 0-round key lea 0x78($key),$key # size optimization lea 40*4($inp),$inp shr \$1,$num .Ldec8x_loop_grande: #mov $num,24(%rsp) # original $num xor $num,$num ___ for($i=0;$i<8;$i++) { my $temp = $i ? $offload : $offset; $code.=<<___; mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer cmp $num,$one mov `40*$i+8-40*4`($inp),$temp # output pointer cmovg $one,$num # find maximum test $one,$one vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV mov $one,`32+4*$i`(%rsp) # initialize counters cmovle %rsp,@ptr[$i] # cancel input sub @ptr[$i],$temp # distance between input and output mov $temp,`64+8*$i`(%rsp) # initialize distances vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV ___ } $code.=<<___; test $num,$num jz .Ldec8x_done vmovups 0x10-0x78($key),$rndkey1 vmovups 0x20-0x78($key),$rndkey0 mov 0xf0-0x78($key),$rounds lea 192+128(%rsp),$offload # offload area vmovdqu (@ptr[0]),@out[0] # load inputs vmovdqu (@ptr[1]),@out[1] vmovdqu (@ptr[2]),@out[2] vmovdqu (@ptr[3]),@out[3] vmovdqu (@ptr[4]),@out[4] vmovdqu (@ptr[5]),@out[5] vmovdqu (@ptr[6]),@out[6] vmovdqu (@ptr[7]),@out[7] vmovdqu @out[0],0x00($offload) # offload inputs vpxor $zero,@out[0],@out[0] # xor inputs with 0-round vmovdqu @out[1],0x10($offload) vpxor $zero,@out[1],@out[1] vmovdqu @out[2],0x20($offload) vpxor $zero,@out[2],@out[2] vmovdqu @out[3],0x30($offload) vpxor $zero,@out[3],@out[3] vmovdqu @out[4],0x40($offload) vpxor $zero,@out[4],@out[4] vmovdqu @out[5],0x50($offload) vpxor $zero,@out[5],@out[5] vmovdqu @out[6],0x60($offload) vpxor $zero,@out[6],@out[6] vmovdqu @out[7],0x70($offload) vpxor $zero,@out[7],@out[7] xor \$0x80,$offload mov \$1,$one # constant of 1 jmp .Loop_dec8x .align 32 .Loop_dec8x: ___ for($i=0;$i<8;$i++) { my $rndkey=($i&1)?$rndkey0:$rndkey1; $code.=<<___; vaesdec $rndkey,@out[0],@out[0] cmp 32+4*$i(%rsp),$one ___ $code.=<<___ if ($i); mov 64+8*$i(%rsp),$offset ___ $code.=<<___; vaesdec $rndkey,@out[1],@out[1] prefetcht0 31(@ptr[$i]) # prefetch input vaesdec $rndkey,@out[2],@out[2] ___ $code.=<<___ if ($i>1); prefetcht0 15(@ptr[$i-2]) # prefetch output ___ $code.=<<___; vaesdec $rndkey,@out[3],@out[3] lea (@ptr[$i],$offset),$offset cmovge %rsp,@ptr[$i] # cancel input vaesdec $rndkey,@out[4],@out[4] cmovg %rsp,$offset # sink output 
vaesdec $rndkey,@out[5],@out[5] sub @ptr[$i],$offset vaesdec $rndkey,@out[6],@out[6] vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input mov $offset,64+8*$i(%rsp) vaesdec $rndkey,@out[7],@out[7] vmovups `16*(3+$i)-0x78`($key),$rndkey lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output ___ $code.=<<___ if ($i<4); vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load ___ } $code.=<<___; vmovdqu 32(%rsp),$counters prefetcht0 15(@ptr[$i-2]) # prefetch output prefetcht0 15(@ptr[$i-1]) cmp \$11,$rounds jb .Ldec8x_tail vaesdec $rndkey1,@out[0],@out[0] vaesdec $rndkey1,@out[1],@out[1] vaesdec $rndkey1,@out[2],@out[2] vaesdec $rndkey1,@out[3],@out[3] vaesdec $rndkey1,@out[4],@out[4] vaesdec $rndkey1,@out[5],@out[5] vaesdec $rndkey1,@out[6],@out[6] vaesdec $rndkey1,@out[7],@out[7] vmovups 0xb0-0x78($key),$rndkey1 vaesdec $rndkey0,@out[0],@out[0] vaesdec $rndkey0,@out[1],@out[1] vaesdec $rndkey0,@out[2],@out[2] vaesdec $rndkey0,@out[3],@out[3] vaesdec $rndkey0,@out[4],@out[4] vaesdec $rndkey0,@out[5],@out[5] vaesdec $rndkey0,@out[6],@out[6] vaesdec $rndkey0,@out[7],@out[7] vmovups 0xc0-0x78($key),$rndkey0 je .Ldec8x_tail vaesdec $rndkey1,@out[0],@out[0] vaesdec $rndkey1,@out[1],@out[1] vaesdec $rndkey1,@out[2],@out[2] vaesdec $rndkey1,@out[3],@out[3] vaesdec $rndkey1,@out[4],@out[4] vaesdec $rndkey1,@out[5],@out[5] vaesdec $rndkey1,@out[6],@out[6] vaesdec $rndkey1,@out[7],@out[7] vmovups 0xd0-0x78($key),$rndkey1 vaesdec $rndkey0,@out[0],@out[0] vaesdec $rndkey0,@out[1],@out[1] vaesdec $rndkey0,@out[2],@out[2] vaesdec $rndkey0,@out[3],@out[3] vaesdec $rndkey0,@out[4],@out[4] vaesdec $rndkey0,@out[5],@out[5] vaesdec $rndkey0,@out[6],@out[6] vaesdec $rndkey0,@out[7],@out[7] vmovups 0xe0-0x78($key),$rndkey0 .Ldec8x_tail: vaesdec $rndkey1,@out[0],@out[0] vpxor $zero,$zero,$zero vaesdec $rndkey1,@out[1],@out[1] vaesdec $rndkey1,@out[2],@out[2] vpcmpgtd $zero,$counters,$zero vaesdec $rndkey1,@out[3],@out[3] vaesdec $rndkey1,@out[4],@out[4] vpaddd $counters,$zero,$zero # decrement counters vmovdqu 48(%rsp),$counters vaesdec $rndkey1,@out[5],@out[5] mov 64(%rsp),$offset # pre-load 1st offset vaesdec $rndkey1,@out[6],@out[6] vaesdec $rndkey1,@out[7],@out[7] vmovups 0x10-0x78($key),$rndkey1 vaesdeclast $rndkey0,@out[0],@out[0] vmovdqa $zero,32(%rsp) # update counters vpxor $zero,$zero,$zero vaesdeclast $rndkey0,@out[1],@out[1] vpxor 0x00($offload),@out[0],@out[0] # xor with IV vaesdeclast $rndkey0,@out[2],@out[2] vpxor 0x10($offload),@out[1],@out[1] vpcmpgtd $zero,$counters,$zero vaesdeclast $rndkey0,@out[3],@out[3] vpxor 0x20($offload),@out[2],@out[2] vaesdeclast $rndkey0,@out[4],@out[4] vpxor 0x30($offload),@out[3],@out[3] vpaddd $zero,$counters,$counters # decrement counters vmovdqu -0x78($key),$zero # 0-round vaesdeclast $rndkey0,@out[5],@out[5] vpxor 0x40($offload),@out[4],@out[4] vaesdeclast $rndkey0,@out[6],@out[6] vpxor 0x50($offload),@out[5],@out[5] vmovdqa $counters,48(%rsp) # update counters vaesdeclast $rndkey0,@out[7],@out[7] vpxor 0x60($offload),@out[6],@out[6] vmovups 0x20-0x78($key),$rndkey0 vmovups @out[0],-16(@ptr[0]) # write output sub $offset,@ptr[0] # switch to input vmovdqu 128+0(%rsp),@out[0] vpxor 0x70($offload),@out[7],@out[7] vmovups @out[1],-16(@ptr[1]) sub `64+1*8`(%rsp),@ptr[1] vmovdqu @out[0],0x00($offload) vpxor $zero,@out[0],@out[0] vmovdqu 128+16(%rsp),@out[1] vmovups @out[2],-16(@ptr[2]) sub `64+2*8`(%rsp),@ptr[2] vmovdqu @out[1],0x10($offload) vpxor $zero,@out[1],@out[1] vmovdqu 128+32(%rsp),@out[2] vmovups @out[3],-16(@ptr[3]) sub `64+3*8`(%rsp),@ptr[3] vmovdqu @out[2],0x20($offload) 
vpxor $zero,@out[2],@out[2] vmovdqu 128+48(%rsp),@out[3] vmovups @out[4],-16(@ptr[4]) sub `64+4*8`(%rsp),@ptr[4] vmovdqu @out[3],0x30($offload) vpxor $zero,@out[3],@out[3] vmovdqu @inp[0],0x40($offload) vpxor @inp[0],$zero,@out[4] vmovups @out[5],-16(@ptr[5]) sub `64+5*8`(%rsp),@ptr[5] vmovdqu @inp[1],0x50($offload) vpxor @inp[1],$zero,@out[5] vmovups @out[6],-16(@ptr[6]) sub `64+6*8`(%rsp),@ptr[6] vmovdqu @inp[2],0x60($offload) vpxor @inp[2],$zero,@out[6] vmovups @out[7],-16(@ptr[7]) sub `64+7*8`(%rsp),@ptr[7] vmovdqu @inp[3],0x70($offload) vpxor @inp[3],$zero,@out[7] xor \$128,$offload dec $num jnz .Loop_dec8x mov 16(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 #mov 24(%rsp),$num #lea `40*8`($inp),$inp #dec $num #jnz .Ldec8x_loop_grande .Ldec8x_done: vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Ldec8x_epilogue: ret .cfi_endproc .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx ___ }}} if ($win64) { # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->Rip<.Lprologue jb .Lin_prologue mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lin_prologue mov 16(%rax),%rax # pull saved stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 lea -56-10*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 
mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler .section .pdata .align 4 .rva .LSEH_begin_aesni_multi_cbc_encrypt .rva .LSEH_end_aesni_multi_cbc_encrypt .rva .LSEH_info_aesni_multi_cbc_encrypt .rva .LSEH_begin_aesni_multi_cbc_decrypt .rva .LSEH_end_aesni_multi_cbc_decrypt .rva .LSEH_info_aesni_multi_cbc_decrypt ___ $code.=<<___ if ($avx); .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx .rva .LSEH_end_aesni_multi_cbc_encrypt_avx .rva .LSEH_info_aesni_multi_cbc_encrypt_avx .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx .rva .LSEH_end_aesni_multi_cbc_decrypt_avx .rva .LSEH_info_aesni_multi_cbc_decrypt_avx ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_aesni_multi_cbc_encrypt: .byte 9,0,0,0 .rva se_handler .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[] .LSEH_info_aesni_multi_cbc_decrypt: .byte 9,0,0,0 .rva se_handler .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[] ___ $code.=<<___ if ($avx); .LSEH_info_aesni_multi_cbc_encrypt_avx: .byte 9,0,0,0 .rva se_handler .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[] .LSEH_info_aesni_multi_cbc_decrypt_avx: .byte 9,0,0,0 .rva se_handler .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[] ___ } #################################################################### sub rex { local *opcode=shift; my ($dst,$src)=@_; my $rex=0; $rex|=0x04 if($dst>=8); $rex|=0x01 if($src>=8); push @opcode,$rex|0x40 if($rex); } sub aesni { my $line=shift; my @opcode=(0x66); if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { rex(\@opcode,$4,$3); push @opcode,0x0f,0x3a,0xdf; push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M my $c=$2; push @opcode,$c=~/^0/?oct($c):$c; return ".byte\t".join(',',@opcode); } elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { my %opcodelet = ( "aesimc" => 0xdb, "aesenc" => 0xdc, "aesenclast" => 0xdd, "aesdec" => 0xde, "aesdeclast" => 0xdf ); return undef if (!defined($opcodelet{$1})); rex(\@opcode,$3,$2); push @opcode,0x0f,0x38,$opcodelet{$1}; push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) { my %opcodelet = ( "aesenc" => 0xdc, "aesenclast" => 0xdd, "aesdec" => 0xde, "aesdeclast" => 0xdf ); return undef if (!defined($opcodelet{$1})); my $off = $2; push @opcode,0x44 if ($3>=8); push @opcode,0x0f,0x38,$opcodelet{$1}; push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M push @opcode,($off=~/^0/?oct($off):$off)&0xff; return ".byte\t".join(',',@opcode); } return $line; } $code =~ s/\`([^\`]*)\`/eval($1)/gem; $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/aes/asm/aesni-sha1-x86_64.pl (revision 364963) @@ -1,2146 +1,2146 @@ #! /usr/bin/env perl # Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. 
You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # June 2011 # # This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled # in http://download.intel.com/design/intarch/papers/323686.pdf, is # that since AESNI-CBC encrypt exhibit *very* low instruction-level # parallelism, interleaving it with another algorithm would allow to # utilize processor resources better and achieve better performance. # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and # AESNI code is weaved into it. Below are performance numbers in # cycles per processed byte, less is better, for standalone AESNI-CBC # encrypt, sum of the latter and standalone SHA1, and "stitched" # subroutine: # # AES-128-CBC +SHA1 stitch gain # Westmere 3.77[+5.3] 9.07 6.55 +38% # Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%) # Ivy Bridge 5.05[+4.6] 9.65 5.54 +74% # Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%) # Skylake 2.63[+3.5(4.1)] 6.17(6.69) 4.23(4.44) +46%(+51%) # Bulldozer 5.77[+6.0] 11.72 6.37 +84% # Ryzen(**) 2.71[+1.93] 4.64 2.74 +69% # Goldmont(**) 3.82[+1.70] 5.52 4.20 +31% # # AES-192-CBC # Westmere 4.51 9.81 6.80 +44% # Sandy Bridge 6.05 11.06(12.15) 6.11(7.19) +81%(+69%) # Ivy Bridge 6.05 10.65 6.07 +75% # Haswell 5.29 8.86(9.44) 5.32(5.32) +67%(+77%) # Bulldozer 6.89 12.84 6.96 +84% # # AES-256-CBC # Westmere 5.25 10.55 7.21 +46% # Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%) # Ivy Bridge 7.05 11.65 7.12 +64% # Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%) # Skylake 3.62 7.16(7.68) 4.56(4.76) +57%(+61%) # Bulldozer 8.00 13.95 8.25 +69% # Ryzen(**) 3.71 5.64 3.72 +52% # Goldmont(**) 5.35 7.05 5.76 +22% # # (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for # background information. Above numbers in parentheses are SSSE3 # results collected on AVX-capable CPU, i.e. apply on OSes that # don't support AVX. # (**) SHAEXT results. # # Needless to mention that it makes no sense to implement "stitched" # *decrypt* subroutine. Because *both* AESNI-CBC decrypt and SHA1 # fully utilize parallelism, so stitching would not give any gain # anyway. Well, there might be some, e.g. because of better cache # locality... For reference, here are performance results for # standalone AESNI-CBC decrypt: # # AES-128-CBC AES-192-CBC AES-256-CBC # Westmere 1.25 1.50 1.75 # Sandy Bridge 0.74 0.91 1.09 # Ivy Bridge 0.74 0.90 1.11 # Haswell 0.63 0.76 0.88 # Bulldozer 0.70 0.85 0.99 # And indeed: # # AES-256-CBC +SHA1 stitch gain # Westmere 1.75 7.20 6.68 +7.8% # Sandy Bridge 1.09 6.09(7.22) 5.82(6.95) +4.6%(+3.9%) # Ivy Bridge 1.11 5.70 5.45 +4.6% # Haswell 0.88 4.45(5.00) 4.39(4.69) +1.4%(*)(+6.6%) # Bulldozer 0.99 6.95 5.95 +17%(**) # # (*) Tiny improvement coefficient on Haswell is because we compare # AVX1 stitch to sum with AVX2 SHA1. 
# (**) Execution is fully dominated by integer code sequence and # SIMD still hardly shows [in single-process benchmark;-] $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/ && $1>=2.19); $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && $1>=2.09); $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./ && $1>=10); -$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/ && $2>=3.0); +$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/ && $2>=3.0); $shaext=1; ### set to zero if compiling for 1.0.1 $stitched_decrypt=0; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void aesni_cbc_sha1_enc(const void *inp, # void *out, # size_t length, # const AES_KEY *key, # unsigned char *iv, # SHA_CTX *ctx, # const void *in0); $code.=<<___; .text .extern OPENSSL_ia32cap_P .globl aesni_cbc_sha1_enc .type aesni_cbc_sha1_enc,\@abi-omnipotent .align 32 aesni_cbc_sha1_enc: .cfi_startproc # caller should check for SSSE3 and AES-NI bits mov OPENSSL_ia32cap_P+0(%rip),%r10d mov OPENSSL_ia32cap_P+4(%rip),%r11 ___ $code.=<<___ if ($shaext); bt \$61,%r11 # check SHA bit jc aesni_cbc_sha1_enc_shaext ___ $code.=<<___ if ($avx); and \$`1<<28`,%r11d # mask AVX bit and \$`1<<30`,%r10d # mask "Intel CPU" bit or %r11d,%r10d cmp \$`1<<28|1<<30`,%r10d je aesni_cbc_sha1_enc_avx ___ $code.=<<___; jmp aesni_cbc_sha1_enc_ssse3 ret .cfi_endproc .size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc ___ my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); my $Xi=4; my @X=map("%xmm$_",(4..7,0..3)); my @Tx=map("%xmm$_",(8..10)); my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization my @T=("%esi","%edi"); my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0; my $K_XX_XX="%r11"; my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13)); # for enc my @rndkey=("%xmm14","%xmm15"); # for enc my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec if (1) { # reassign for Atom Silvermont # The goal is to minimize amount of instructions with more than # 3 prefix bytes. Or in more practical terms to keep AES-NI *and* # SSSE3 instructions to upper half of the register bank. 
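# A rough model of the capability dispatch at the top of aesni_cbc_sha1_enc
# above, assuming both the SHAEXT and AVX paths were compiled in; the bit
# positions are taken from the comments in the assembly (word 0 is read at
# OPENSSL_ia32cap_P+0, words 1..2 as a single 64-bit value at +4).

sub pick_sha1_path {
    my ($cap0, $cap1_2) = @_;
    return "shaext" if ($cap1_2 & (1 << 61));		# bt $61: SHA extension bit
    return "avx"    if (($cap1_2 & (1 << 28)) &&	# AVX bit
			($cap0   & (1 << 30)));		# synthetic "Intel CPU" bit
    return "ssse3";					# default fallback
}
print pick_sha1_path(1<<30, 1<<28), "\n";		# hypothetical caps -> "avx"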
@X=map("%xmm$_",(8..11,4..7)); @Tx=map("%xmm$_",(12,13,3)); ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15)); @rndkey=("%xmm0","%xmm1"); } sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop; $arg = "\$$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } my $_rol=sub { &rol(@_) }; my $_ror=sub { &ror(@_) }; $code.=<<___; .type aesni_cbc_sha1_enc_ssse3,\@function,6 .align 32 aesni_cbc_sha1_enc_ssse3: .cfi_startproc mov `($win64?56:8)`(%rsp),$inp # load 7th argument #shr \$6,$len # debugging artefact #jz .Lepilogue_ssse3 # debugging artefact push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea `-104-($win64?10*16:0)`(%rsp),%rsp .cfi_adjust_cfa_offset `104+($win64?10*16:0)` #mov $in0,$inp # debugging artefact #lea 64(%rsp),$ctx # debugging artefact ___ $code.=<<___ if ($win64); movaps %xmm6,96+0(%rsp) movaps %xmm7,96+16(%rsp) movaps %xmm8,96+32(%rsp) movaps %xmm9,96+48(%rsp) movaps %xmm10,96+64(%rsp) movaps %xmm11,96+80(%rsp) movaps %xmm12,96+96(%rsp) movaps %xmm13,96+112(%rsp) movaps %xmm14,96+128(%rsp) movaps %xmm15,96+144(%rsp) .Lprologue_ssse3: ___ $code.=<<___; mov $in0,%r12 # reassign arguments mov $out,%r13 mov $len,%r14 lea 112($key),%r15 # size optimization movdqu ($ivp),$iv # load IV mov $ivp,88(%rsp) # save $ivp ___ ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments my $rounds="${ivp}d"; $code.=<<___; shl \$6,$len sub $in0,$out mov 240-112($key),$rounds add $inp,$len # end of input lea K_XX_XX(%rip),$K_XX_XX mov 0($ctx),$A # load context mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E mov $C,@T[1] xor $D,@T[1] and @T[1],@T[0] movdqa 64($K_XX_XX),@Tx[2] # pbswap mask movdqa 0($K_XX_XX),@Tx[1] # K_00_19 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] movdqu 16($inp),@X[-3&7] movdqu 32($inp),@X[-2&7] movdqu 48($inp),@X[-1&7] pshufb @Tx[2],@X[-4&7] # byte swap pshufb @Tx[2],@X[-3&7] pshufb @Tx[2],@X[-2&7] add \$64,$inp paddd @Tx[1],@X[-4&7] # add K_00_19 pshufb @Tx[2],@X[-1&7] paddd @Tx[1],@X[-3&7] paddd @Tx[1],@X[-2&7] movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU psubd @Tx[1],@X[-4&7] # restore X[] movdqa @X[-3&7],16(%rsp) psubd @Tx[1],@X[-3&7] movdqa @X[-2&7],32(%rsp) psubd @Tx[1],@X[-2&7] movups -112($key),$rndkey0 # $key[0] movups 16-112($key),$rndkey[0] # forward reference jmp .Loop_ssse3 ___ my $aesenc=sub { use integer; my ($n,$k)=($r/10,$r%10); if ($k==0) { $code.=<<___; movups `16*$n`($in0),$in # load input xorps $rndkey0,$in ___ $code.=<<___ if ($n); movups $iv,`16*($n-1)`($out,$in0) # write output ___ $code.=<<___; xorps $in,$iv movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv ___ } elsif ($k==9) { $sn++; $code.=<<___; cmp \$11,$rounds jb .Laesenclast$sn movups `32+16*($k+0)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv movups `32+16*($k+1)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv je .Laesenclast$sn movups `32+16*($k+2)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv movups `32+16*($k+3)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv .Laesenclast$sn: aesenclast $rndkey[0],$iv movups 16-112($key),$rndkey[1] # forward reference ___ } else { $code.=<<___; movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv ___ } $r++; unshift(@rndkey,pop(@rndkey)); }; sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my 
($a,$b,$c,$d,$e); eval(shift(@insns)); # ror &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); eval(shift(@insns)); &movdqa (@Tx[0],@X[-1&7]); &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@Tx[0],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); # ror &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[2],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[0],31); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); &movdqa (@Tx[1],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[2],30); eval(shift(@insns)); eval(shift(@insns)); # ror &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pslld (@Tx[1],2); &pxor (@X[0],@Tx[2]); eval(shift(@insns)); &movdqa (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)"); # K_XX_XX eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79 foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] push(@Tx,shift(@Tx)); } sub Xupdate_ssse3_32_79() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)) if ($Xi==8); &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" eval(shift(@insns)) if ($Xi==8); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)) if (@insns[1] =~ /_ror/); eval(shift(@insns)) if (@insns[0] =~ /_ror/); &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); eval(shift(@insns)); if ($Xi%5) { &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... } else { # ... 
or load next one &movdqa (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)"); } eval(shift(@insns)); # ror &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)) if (@insns[0] =~ /_ror/); &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)); # body_20_39 &pslld (@X[0],2); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[0],30); eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &por (@X[0],@Tx[0]); # "X[0]"<<<=2 eval(shift(@insns)); eval(shift(@insns)); # body_20_39 eval(shift(@insns)) if (@insns[1] =~ /_rol/); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0]) eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] push(@Tx,shift(@Tx)); } sub Xuplast_ssse3_80() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &cmp ($inp,$len); &je (shift); unshift(@Tx,pop(@Tx)); &movdqa (@Tx[2],"64($K_XX_XX)"); # pbswap mask &movdqa (@Tx[1],"0($K_XX_XX)"); # K_00_19 &movdqu (@X[-4&7],"0($inp)"); # load input &movdqu (@X[-3&7],"16($inp)"); &movdqu (@X[-2&7],"32($inp)"); &movdqu (@X[-1&7],"48($inp)"); &pshufb (@X[-4&7],@Tx[2]); # byte swap &add ($inp,64); $Xi=0; } sub Xloop_ssse3() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufb (@X[($Xi-3)&7],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@Tx[1]); foreach (@insns) { eval; } $Xi++; } sub Xtail_ssse3() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); foreach (@insns) { eval; } } my @body_00_19 = ( '($a,$b,$c,$d,$e)=@V;'. 
'&$_ror ($b,$j?7:2);', # $b>>>2 '&xor (@T[0],$d);', '&mov (@T[1],$a);', # $b for next round '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer '&xor ($b,$c);', # $c^$d for next round '&$_rol ($a,5);', '&add ($e,@T[0]);', '&and (@T[1],$b);', # ($b&($c^$d)) for next round '&xor ($b,$c);', # restore $b '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); sub body_00_19 () { # ((c^d)&b)^d # on start @T[0]=(c^d)&b return &body_20_39() if ($rx==19); $rx++; use integer; my ($k,$n); my @r=@body_00_19; $n = scalar(@r); $k = (($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds @r[$k%$n].='&$aesenc();' if ($jj==$k/$n); $jj++; return @r; } my @body_20_39 = ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer '&xor (@T[0],$d) if($j==19);'. '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c) '&mov (@T[1],$a);', # $b for next round '&$_rol ($a,5);', '&add ($e,@T[0]);', '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round '&$_ror ($b,7);', # $b>>>2 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); sub body_20_39 () { # b^d^c # on entry @T[0]=b^d return &body_40_59() if ($rx==39); $rx++; use integer; my ($k,$n); my @r=@body_20_39; $n = scalar(@r); $k = (($jj+1)*8/20)*20*$n/8; # 8 aesencs per these 20 rounds @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=20); $jj++; return @r; } my @body_40_59 = ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d) '&xor ($c,$d) if ($j>=40);', # restore $c '&$_ror ($b,7);', # $b>>>2 '&mov (@T[1],$a);', # $b for next round '&xor (@T[0],$c);', '&$_rol ($a,5);', '&add ($e,@T[0]);', '&xor (@T[1],$c) if ($j==59);'. '&xor (@T[1],$b) if ($j< 59);', # b^c for next round '&xor ($b,$c) if ($j< 59);', # c^d for next round '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); sub body_40_59 () { # ((b^c)&(c^d))^c # on entry @T[0]=(b^c), (c^=d) $rx++; use integer; my ($k,$n); my @r=@body_40_59; $n = scalar(@r); $k=(($jj+1)*12/20)*20*$n/12; # 12 aesencs per these 20 rounds @r[$k%$n].='&$aesenc();' if ($jj==$k/$n && $rx!=40); $jj++; return @r; } $code.=<<___; .align 32 .Loop_ssse3: ___ &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_32_79(\&body_00_19); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_20_39); &Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done" $saved_j=$j; @saved_V=@V; $saved_r=$r; @saved_rndkey=@rndkey; &Xloop_ssse3(\&body_20_39); &Xloop_ssse3(\&body_20_39); &Xloop_ssse3(\&body_20_39); $code.=<<___; movups $iv,48($out,$in0) # write output lea 64($in0),$in0 add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C add 12($ctx),$D mov $A,0($ctx) add 16($ctx),$E mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) mov $C,@T[1] mov $D,12($ctx) xor $D,@T[1] mov $E,16($ctx) and @T[1],@T[0] jmp .Loop_ssse3 .Ldone_ssse3: ___ $jj=$j=$saved_j; @V=@saved_V; $r=$saved_r; @rndkey=@saved_rndkey; &Xtail_ssse3(\&body_20_39); &Xtail_ssse3(\&body_20_39); &Xtail_ssse3(\&body_20_39); $code.=<<___; movups $iv,48($out,$in0) # write output mov 88(%rsp),$ivp # restore $ivp add 
0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C mov $A,0($ctx) add 12($ctx),$D mov @T[0],4($ctx) add 16($ctx),$E mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) movups $iv,($ivp) # write IV ___ $code.=<<___ if ($win64); movaps 96+0(%rsp),%xmm6 movaps 96+16(%rsp),%xmm7 movaps 96+32(%rsp),%xmm8 movaps 96+48(%rsp),%xmm9 movaps 96+64(%rsp),%xmm10 movaps 96+80(%rsp),%xmm11 movaps 96+96(%rsp),%xmm12 movaps 96+112(%rsp),%xmm13 movaps 96+128(%rsp),%xmm14 movaps 96+144(%rsp),%xmm15 ___ $code.=<<___; lea `104+($win64?10*16:0)`(%rsp),%rsi .cfi_def_cfa %rsi,56 mov 0(%rsi),%r15 .cfi_restore %r15 mov 8(%rsi),%r14 .cfi_restore %r14 mov 16(%rsi),%r13 .cfi_restore %r13 mov 24(%rsi),%r12 .cfi_restore %r12 mov 32(%rsi),%rbp .cfi_restore %rbp mov 40(%rsi),%rbx .cfi_restore %rbx lea 48(%rsi),%rsp .cfi_def_cfa %rsp,8 .Lepilogue_ssse3: ret .cfi_endproc .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 ___ if ($stitched_decrypt) {{{ # reset ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); $j=$jj=$r=$rx=0; $Xi=4; # reassign for Atom Silvermont (see above) ($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4)); @X=map("%xmm$_",(8..13,6,7)); @Tx=map("%xmm$_",(14,15,5)); my @aes256_dec = ( '&movdqu($inout0,"0x00($in0)");', '&movdqu($inout1,"0x10($in0)"); &pxor ($inout0,$rndkey0);', '&movdqu($inout2,"0x20($in0)"); &pxor ($inout1,$rndkey0);', '&movdqu($inout3,"0x30($in0)"); &pxor ($inout2,$rndkey0);', '&pxor ($inout3,$rndkey0); &movups ($rndkey0,"16-112($key)");', '&movaps("64(%rsp)",@X[2]);', # save IV, originally @X[3] undef,undef ); for ($i=0;$i<13;$i++) { push (@aes256_dec,( '&aesdec ($inout0,$rndkey0);', '&aesdec ($inout1,$rndkey0);', '&aesdec ($inout2,$rndkey0);', '&aesdec ($inout3,$rndkey0); &movups($rndkey0,"'.(16*($i+2)-112).'($key)");' )); push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11); push (@aes256_dec,(undef,undef)) if ($i==5); } push(@aes256_dec,( '&aesdeclast ($inout0,$rndkey0); &movups (@X[0],"0x00($in0)");', '&aesdeclast ($inout1,$rndkey0); &movups (@X[1],"0x10($in0)");', '&aesdeclast ($inout2,$rndkey0); &movups (@X[2],"0x20($in0)");', '&aesdeclast ($inout3,$rndkey0); &movups (@X[3],"0x30($in0)");', '&xorps ($inout0,"64(%rsp)"); &movdqu ($rndkey0,"-112($key)");', '&xorps ($inout1,@X[0]); &movups ("0x00($out,$in0)",$inout0);', '&xorps ($inout2,@X[1]); &movups ("0x10($out,$in0)",$inout1);', '&xorps ($inout3,@X[2]); &movups ("0x20($out,$in0)",$inout2);', '&movups ("0x30($out,$in0)",$inout3);' )); sub body_00_19_dec () { # ((c^d)&b)^d # on start @T[0]=(c^d)&b return &body_20_39_dec() if ($rx==19); my @r=@body_00_19; unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); $rx++; return @r; } sub body_20_39_dec () { # b^d^c # on entry @T[0]=b^d return &body_40_59_dec() if ($rx==39); my @r=@body_20_39; unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); $rx++; return @r; } sub body_40_59_dec () { # ((b^c)&(c^d))^c # on entry @T[0]=(b^c), (c^=d) my @r=@body_40_59; unshift (@r,@aes256_dec[$rx]) if (@aes256_dec[$rx]); $rx++; return @r; } $code.=<<___; .globl aesni256_cbc_sha1_dec .type aesni256_cbc_sha1_dec,\@abi-omnipotent .align 32 aesni256_cbc_sha1_dec: .cfi_startproc # caller should check for SSSE3 and AES-NI bits mov OPENSSL_ia32cap_P+0(%rip),%r10d mov OPENSSL_ia32cap_P+4(%rip),%r11d ___ $code.=<<___ if ($avx); and \$`1<<28`,%r11d # mask AVX bit and \$`1<<30`,%r10d # mask "Intel CPU" bit or %r11d,%r10d cmp \$`1<<28|1<<30`,%r10d je aesni256_cbc_sha1_dec_avx ___ $code.=<<___; jmp aesni256_cbc_sha1_dec_ssse3 ret .cfi_endproc 
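# The stitch itself is driven by the body_00_19/20_39/40_59 helpers above:
# $jj counts SHA-1 rounds, and a truncating integer formula decides after
# which instruction fragment of which round an aesenc step is appended.
# A standalone sketch of the first 20-round group, assuming 10 fragments per
# round as in @body_00_19 and ignoring the $rx corner cases; the 20_39 groups
# use the same idea with 8 aesencs per 20 rounds, giving 40 aesenc slots
# (four CBC blocks, i.e. one 64-byte chunk) per SHA-1 block with AES-128.

use integer;				# same truncating arithmetic as body_00_19
my $n = 10;				# fragments per SHA-1 round
my $total = 0;
for my $jj (0 .. 19) {
    my $k = (($jj+1)*12/20)*20*$n/12;	# 12 aesencs per these 20 rounds
    next unless $jj == $k/$n;
    $total++;
    printf "SHA-1 round %2d: aesenc appended after fragment %d\n", $jj, $k % $n;
}
print "aesenc slots in this group: $total\n";	# prints 12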
.size aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec .type aesni256_cbc_sha1_dec_ssse3,\@function,6 .align 32 aesni256_cbc_sha1_dec_ssse3: .cfi_startproc mov `($win64?56:8)`(%rsp),$inp # load 7th argument push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea `-104-($win64?10*16:0)`(%rsp),%rsp .cfi_adjust_cfa_offset `104+($win64?10*16:0)` ___ $code.=<<___ if ($win64); movaps %xmm6,96+0(%rsp) movaps %xmm7,96+16(%rsp) movaps %xmm8,96+32(%rsp) movaps %xmm9,96+48(%rsp) movaps %xmm10,96+64(%rsp) movaps %xmm11,96+80(%rsp) movaps %xmm12,96+96(%rsp) movaps %xmm13,96+112(%rsp) movaps %xmm14,96+128(%rsp) movaps %xmm15,96+144(%rsp) .Lprologue_dec_ssse3: ___ $code.=<<___; mov $in0,%r12 # reassign arguments mov $out,%r13 mov $len,%r14 lea 112($key),%r15 # size optimization movdqu ($ivp),@X[3] # load IV #mov $ivp,88(%rsp) # save $ivp ___ ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments $code.=<<___; shl \$6,$len sub $in0,$out add $inp,$len # end of input lea K_XX_XX(%rip),$K_XX_XX mov 0($ctx),$A # load context mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E mov $C,@T[1] xor $D,@T[1] and @T[1],@T[0] movdqa 64($K_XX_XX),@Tx[2] # pbswap mask movdqa 0($K_XX_XX),@Tx[1] # K_00_19 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] movdqu 16($inp),@X[-3&7] movdqu 32($inp),@X[-2&7] movdqu 48($inp),@X[-1&7] pshufb @Tx[2],@X[-4&7] # byte swap add \$64,$inp pshufb @Tx[2],@X[-3&7] pshufb @Tx[2],@X[-2&7] pshufb @Tx[2],@X[-1&7] paddd @Tx[1],@X[-4&7] # add K_00_19 paddd @Tx[1],@X[-3&7] paddd @Tx[1],@X[-2&7] movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU psubd @Tx[1],@X[-4&7] # restore X[] movdqa @X[-3&7],16(%rsp) psubd @Tx[1],@X[-3&7] movdqa @X[-2&7],32(%rsp) psubd @Tx[1],@X[-2&7] movdqu -112($key),$rndkey0 # $key[0] jmp .Loop_dec_ssse3 .align 32 .Loop_dec_ssse3: ___ &Xupdate_ssse3_16_31(\&body_00_19_dec); &Xupdate_ssse3_16_31(\&body_00_19_dec); &Xupdate_ssse3_16_31(\&body_00_19_dec); &Xupdate_ssse3_16_31(\&body_00_19_dec); &Xupdate_ssse3_32_79(\&body_00_19_dec); &Xupdate_ssse3_32_79(\&body_20_39_dec); &Xupdate_ssse3_32_79(\&body_20_39_dec); &Xupdate_ssse3_32_79(\&body_20_39_dec); &Xupdate_ssse3_32_79(\&body_20_39_dec); &Xupdate_ssse3_32_79(\&body_20_39_dec); &Xupdate_ssse3_32_79(\&body_40_59_dec); &Xupdate_ssse3_32_79(\&body_40_59_dec); &Xupdate_ssse3_32_79(\&body_40_59_dec); &Xupdate_ssse3_32_79(\&body_40_59_dec); &Xupdate_ssse3_32_79(\&body_40_59_dec); &Xupdate_ssse3_32_79(\&body_20_39_dec); &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done" $saved_j=$j; @saved_V=@V; $saved_rx=$rx; &Xloop_ssse3(\&body_20_39_dec); &Xloop_ssse3(\&body_20_39_dec); &Xloop_ssse3(\&body_20_39_dec); eval(@aes256_dec[-1]); # last store $code.=<<___; lea 64($in0),$in0 add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C add 12($ctx),$D mov $A,0($ctx) add 16($ctx),$E mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) mov $C,@T[1] mov $D,12($ctx) xor $D,@T[1] mov $E,16($ctx) and @T[1],@T[0] jmp .Loop_dec_ssse3 .Ldone_dec_ssse3: ___ $jj=$j=$saved_j; @V=@saved_V; $rx=$saved_rx; &Xtail_ssse3(\&body_20_39_dec); &Xtail_ssse3(\&body_20_39_dec); &Xtail_ssse3(\&body_20_39_dec); eval(@aes256_dec[-1]); # last store $code.=<<___; add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C mov $A,0($ctx) add 12($ctx),$D mov @T[0],4($ctx) add 16($ctx),$E mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) movups @X[3],($ivp) # write IV ___ $code.=<<___ if 
($win64); movaps 96+0(%rsp),%xmm6 movaps 96+16(%rsp),%xmm7 movaps 96+32(%rsp),%xmm8 movaps 96+48(%rsp),%xmm9 movaps 96+64(%rsp),%xmm10 movaps 96+80(%rsp),%xmm11 movaps 96+96(%rsp),%xmm12 movaps 96+112(%rsp),%xmm13 movaps 96+128(%rsp),%xmm14 movaps 96+144(%rsp),%xmm15 ___ $code.=<<___; lea `104+($win64?10*16:0)`(%rsp),%rsi .cfi_cfa_def %rsi,56 mov 0(%rsi),%r15 .cfi_restore %r15 mov 8(%rsi),%r14 .cfi_restore %r14 mov 16(%rsi),%r13 .cfi_restore %r13 mov 24(%rsi),%r12 .cfi_restore %r12 mov 32(%rsi),%rbp .cfi_restore %rbp mov 40(%rsi),%rbx .cfi_restore %rbx lea 48(%rsi),%rsp .cfi_cfa_def %rsp,8 .Lepilogue_dec_ssse3: ret .cfi_endproc .size aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3 ___ }}} $j=$jj=$r=$rx=0; if ($avx) { my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); my $Xi=4; my @X=map("%xmm$_",(4..7,0..3)); my @Tx=map("%xmm$_",(8..10)); my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization my @T=("%esi","%edi"); my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13)); my @rndkey=("%xmm14","%xmm15"); my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15)); # for dec my $Kx=@Tx[2]; my $_rol=sub { &shld(@_[0],@_) }; my $_ror=sub { &shrd(@_[0],@_) }; $code.=<<___; .type aesni_cbc_sha1_enc_avx,\@function,6 .align 32 aesni_cbc_sha1_enc_avx: .cfi_startproc mov `($win64?56:8)`(%rsp),$inp # load 7th argument #shr \$6,$len # debugging artefact #jz .Lepilogue_avx # debugging artefact push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea `-104-($win64?10*16:0)`(%rsp),%rsp .cfi_adjust_cfa_offset `104+($win64?10*16:0)` #mov $in0,$inp # debugging artefact #lea 64(%rsp),$ctx # debugging artefact ___ $code.=<<___ if ($win64); movaps %xmm6,96+0(%rsp) movaps %xmm7,96+16(%rsp) movaps %xmm8,96+32(%rsp) movaps %xmm9,96+48(%rsp) movaps %xmm10,96+64(%rsp) movaps %xmm11,96+80(%rsp) movaps %xmm12,96+96(%rsp) movaps %xmm13,96+112(%rsp) movaps %xmm14,96+128(%rsp) movaps %xmm15,96+144(%rsp) .Lprologue_avx: ___ $code.=<<___; vzeroall mov $in0,%r12 # reassign arguments mov $out,%r13 mov $len,%r14 lea 112($key),%r15 # size optimization vmovdqu ($ivp),$iv # load IV mov $ivp,88(%rsp) # save $ivp ___ ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments my $rounds="${ivp}d"; $code.=<<___; shl \$6,$len sub $in0,$out mov 240-112($key),$rounds add $inp,$len # end of input lea K_XX_XX(%rip),$K_XX_XX mov 0($ctx),$A # load context mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E mov $C,@T[1] xor $D,@T[1] and @T[1],@T[0] vmovdqa 64($K_XX_XX),@X[2] # pbswap mask vmovdqa 0($K_XX_XX),$Kx # K_00_19 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] vmovdqu 16($inp),@X[-3&7] vmovdqu 32($inp),@X[-2&7] vmovdqu 48($inp),@X[-1&7] vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap add \$64,$inp vpshufb @X[2],@X[-3&7],@X[-3&7] vpshufb @X[2],@X[-2&7],@X[-2&7] vpshufb @X[2],@X[-1&7],@X[-1&7] vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 vpaddd $Kx,@X[-3&7],@X[1] vpaddd $Kx,@X[-2&7],@X[2] vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU vmovdqa @X[1],16(%rsp) vmovdqa @X[2],32(%rsp) vmovups -112($key),$rndkey[1] # $key[0] vmovups 16-112($key),$rndkey[0] # forward reference jmp .Loop_avx ___ my $aesenc=sub { use integer; my ($n,$k)=($r/10,$r%10); if ($k==0) { $code.=<<___; vmovdqu `16*$n`($in0),$in # load input vpxor $rndkey[1],$in,$in ___ $code.=<<___ if ($n); vmovups $iv,`16*($n-1)`($out,$in0) # write output ___ $code.=<<___; vpxor $in,$iv,$iv 
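#
# Each call to this helper emits one slice of the AES-CBC work for the
# current block (the load/whitening above, a single vaesenc, or the
# vaesenclast tail), so the CBC recurrence C[i] = E_K(P[i] xor C[i-1]) is
# spread through the SHA-1 round code one instruction at a time.  Note the
# round-0 key is folded into the plaintext load above, so the first vaesenc
# below starts at round 1.  A plain C sketch of the net effect (illustrative
# only, helper names assumed):
#
#	for (i = 0; i < nblocks; i++) {
#		xor_into(iv, plaintext + 16*i);		/* iv ^= P[i]    */
#		aes_encrypt_block(iv, key);		/* iv  = E_K(iv) */
#		memcpy(ciphertext + 16*i, iv, 16);	/* C[i] = iv     */
#	}
#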
vaesenc $rndkey[0],$iv,$iv vmovups `32+16*$k-112`($key),$rndkey[1] ___ } elsif ($k==9) { $sn++; $code.=<<___; cmp \$11,$rounds jb .Lvaesenclast$sn vaesenc $rndkey[0],$iv,$iv vmovups `32+16*($k+0)-112`($key),$rndkey[1] vaesenc $rndkey[1],$iv,$iv vmovups `32+16*($k+1)-112`($key),$rndkey[0] je .Lvaesenclast$sn vaesenc $rndkey[0],$iv,$iv vmovups `32+16*($k+2)-112`($key),$rndkey[1] vaesenc $rndkey[1],$iv,$iv vmovups `32+16*($k+3)-112`($key),$rndkey[0] .Lvaesenclast$sn: vaesenclast $rndkey[0],$iv,$iv vmovups -112($key),$rndkey[0] vmovups 16-112($key),$rndkey[1] # forward reference ___ } else { $code.=<<___; vaesenc $rndkey[0],$iv,$iv vmovups `32+16*$k-112`($key),$rndkey[1] ___ } $r++; unshift(@rndkey,pop(@rndkey)); }; sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@Tx[1],$Kx,@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &vpsrld (@Tx[0],@X[0],31); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslldq(@Tx[1],@X[0],12); # "X[0]"<<96, extract one dword &vpaddd (@X[0],@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 &vpsrld (@Tx[0],@Tx[1],30); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslld (@Tx[1],@Tx[1],2); &vpxor (@X[0],@X[0],@Tx[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 eval(shift(@insns)); eval(shift(@insns)); &vmovdqa ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xupdate_avx_32_79() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 to 48 instructions my ($a,$b,$c,$d,$e); &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); &vpaddd (@Tx[1],$Kx,@X[-1&7]); &vmovdqa ($Kx,eval(16*($Xi/5))."($K_XX_XX)") if ($Xi%5==0); eval(shift(@insns)); # ror eval(shift(@insns)); &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &vpsrld (@Tx[0],@X[0],30); &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); &vpslld (@X[0],@X[0],2); eval(shift(@insns)); # 
body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xuplast_avx_80() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); &vpaddd (@Tx[1],$Kx,@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &cmp ($inp,$len); &je (shift); &vmovdqa(@Tx[1],"64($K_XX_XX)"); # pbswap mask &vmovdqa($Kx,"0($K_XX_XX)"); # K_00_19 &vmovdqu(@X[-4&7],"0($inp)"); # load input &vmovdqu(@X[-3&7],"16($inp)"); &vmovdqu(@X[-2&7],"32($inp)"); &vmovdqu(@X[-1&7],"48($inp)"); &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]); # byte swap &add ($inp,64); $Xi=0; } sub Xloop_avx() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); foreach (@insns) { eval; } $Xi++; } sub Xtail_avx() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); foreach (@insns) { eval; } } $code.=<<___; .align 32 .Loop_avx: ___ &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_32_79(\&body_00_19); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_20_39); &Xuplast_avx_80(\&body_20_39,".Ldone_avx"); # can jump to "done" $saved_j=$j; @saved_V=@V; $saved_r=$r; @saved_rndkey=@rndkey; &Xloop_avx(\&body_20_39); &Xloop_avx(\&body_20_39); &Xloop_avx(\&body_20_39); $code.=<<___; vmovups $iv,48($out,$in0) # write output lea 64($in0),$in0 add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C add 12($ctx),$D mov $A,0($ctx) add 16($ctx),$E mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) mov $C,@T[1] mov $D,12($ctx) xor $D,@T[1] mov $E,16($ctx) and @T[1],@T[0] jmp .Loop_avx .Ldone_avx: ___ $jj=$j=$saved_j; @V=@saved_V; $r=$saved_r; @rndkey=@saved_rndkey; &Xtail_avx(\&body_20_39); &Xtail_avx(\&body_20_39); &Xtail_avx(\&body_20_39); $code.=<<___; vmovups $iv,48($out,$in0) # write output mov 88(%rsp),$ivp # restore $ivp add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C mov $A,0($ctx) add 12($ctx),$D mov @T[0],4($ctx) add 16($ctx),$E mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) vmovups $iv,($ivp) # write IV vzeroall ___ $code.=<<___ if ($win64); movaps 96+0(%rsp),%xmm6 movaps 
96+16(%rsp),%xmm7 movaps 96+32(%rsp),%xmm8 movaps 96+48(%rsp),%xmm9 movaps 96+64(%rsp),%xmm10 movaps 96+80(%rsp),%xmm11 movaps 96+96(%rsp),%xmm12 movaps 96+112(%rsp),%xmm13 movaps 96+128(%rsp),%xmm14 movaps 96+144(%rsp),%xmm15 ___ $code.=<<___; lea `104+($win64?10*16:0)`(%rsp),%rsi .cfi_def_cfa %rsi,56 mov 0(%rsi),%r15 .cfi_restore %r15 mov 8(%rsi),%r14 .cfi_restore %r14 mov 16(%rsi),%r13 .cfi_restore %r13 mov 24(%rsi),%r12 .cfi_restore %r12 mov 32(%rsi),%rbp .cfi_restore %rbp mov 40(%rsi),%rbx .cfi_restore %rbx lea 48(%rsi),%rsp .cfi_def_cfa %rsp,8 .Lepilogue_avx: ret .cfi_endproc .size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx ___ if ($stitched_decrypt) {{{ # reset ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); $j=$jj=$r=$rx=0; $Xi=4; @aes256_dec = ( '&vpxor ($inout0,$rndkey0,"0x00($in0)");', '&vpxor ($inout1,$rndkey0,"0x10($in0)");', '&vpxor ($inout2,$rndkey0,"0x20($in0)");', '&vpxor ($inout3,$rndkey0,"0x30($in0)");', '&vmovups($rndkey0,"16-112($key)");', '&vmovups("64(%rsp)",@X[2]);', # save IV, originally @X[3] undef,undef ); for ($i=0;$i<13;$i++) { push (@aes256_dec,( '&vaesdec ($inout0,$inout0,$rndkey0);', '&vaesdec ($inout1,$inout1,$rndkey0);', '&vaesdec ($inout2,$inout2,$rndkey0);', '&vaesdec ($inout3,$inout3,$rndkey0); &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");' )); push (@aes256_dec,(undef,undef)) if (($i>=3 && $i<=5) || $i>=11); push (@aes256_dec,(undef,undef)) if ($i==5); } push(@aes256_dec,( '&vaesdeclast ($inout0,$inout0,$rndkey0); &vmovups(@X[0],"0x00($in0)");', '&vaesdeclast ($inout1,$inout1,$rndkey0); &vmovups(@X[1],"0x10($in0)");', '&vaesdeclast ($inout2,$inout2,$rndkey0); &vmovups(@X[2],"0x20($in0)");', '&vaesdeclast ($inout3,$inout3,$rndkey0); &vmovups(@X[3],"0x30($in0)");', '&vxorps ($inout0,$inout0,"64(%rsp)"); &vmovdqu($rndkey0,"-112($key)");', '&vxorps ($inout1,$inout1,@X[0]); &vmovups("0x00($out,$in0)",$inout0);', '&vxorps ($inout2,$inout2,@X[1]); &vmovups("0x10($out,$in0)",$inout1);', '&vxorps ($inout3,$inout3,@X[2]); &vmovups("0x20($out,$in0)",$inout2);', '&vmovups ("0x30($out,$in0)",$inout3);' )); $code.=<<___; .type aesni256_cbc_sha1_dec_avx,\@function,6 .align 32 aesni256_cbc_sha1_dec_avx: .cfi_startproc mov `($win64?56:8)`(%rsp),$inp # load 7th argument push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea `-104-($win64?10*16:0)`(%rsp),%rsp .cfi_adjust_cfa_offset `104+($win64?10*16:0)` ___ $code.=<<___ if ($win64); movaps %xmm6,96+0(%rsp) movaps %xmm7,96+16(%rsp) movaps %xmm8,96+32(%rsp) movaps %xmm9,96+48(%rsp) movaps %xmm10,96+64(%rsp) movaps %xmm11,96+80(%rsp) movaps %xmm12,96+96(%rsp) movaps %xmm13,96+112(%rsp) movaps %xmm14,96+128(%rsp) movaps %xmm15,96+144(%rsp) .Lprologue_dec_avx: ___ $code.=<<___; vzeroall mov $in0,%r12 # reassign arguments mov $out,%r13 mov $len,%r14 lea 112($key),%r15 # size optimization vmovdqu ($ivp),@X[3] # load IV ___ ($in0,$out,$len,$key)=map("%r$_",(12..15)); # reassign arguments $code.=<<___; shl \$6,$len sub $in0,$out add $inp,$len # end of input lea K_XX_XX(%rip),$K_XX_XX mov 0($ctx),$A # load context mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E mov $C,@T[1] xor $D,@T[1] and @T[1],@T[0] vmovdqa 64($K_XX_XX),@X[2] # pbswap mask vmovdqa 0($K_XX_XX),$Kx # K_00_19 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] vmovdqu 16($inp),@X[-3&7] vmovdqu 32($inp),@X[-2&7] vmovdqu 48($inp),@X[-1&7] vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap 
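#
# SHA-1 treats each 64-byte block as sixteen big-endian 32-bit words, so the
# four lanes loaded above are run through the pbswap mask (stored at
# K_XX_XX+64) to reverse the byte order within every dword before the
# message schedule is built.  In scalar terms the swap amounts to (sketch,
# W and p are just notation here):
#
#	W[t] = ((uint32_t)p[4*t] << 24) | ((uint32_t)p[4*t+1] << 16) |
#	       ((uint32_t)p[4*t+2] << 8) |  (uint32_t)p[4*t+3];   /* t = 0..15 */
#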
add \$64,$inp vpshufb @X[2],@X[-3&7],@X[-3&7] vpshufb @X[2],@X[-2&7],@X[-2&7] vpshufb @X[2],@X[-1&7],@X[-1&7] vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 vpaddd $Kx,@X[-3&7],@X[1] vpaddd $Kx,@X[-2&7],@X[2] vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU vmovdqa @X[1],16(%rsp) vmovdqa @X[2],32(%rsp) vmovups -112($key),$rndkey0 # $key[0] jmp .Loop_dec_avx .align 32 .Loop_dec_avx: ___ &Xupdate_avx_16_31(\&body_00_19_dec); &Xupdate_avx_16_31(\&body_00_19_dec); &Xupdate_avx_16_31(\&body_00_19_dec); &Xupdate_avx_16_31(\&body_00_19_dec); &Xupdate_avx_32_79(\&body_00_19_dec); &Xupdate_avx_32_79(\&body_20_39_dec); &Xupdate_avx_32_79(\&body_20_39_dec); &Xupdate_avx_32_79(\&body_20_39_dec); &Xupdate_avx_32_79(\&body_20_39_dec); &Xupdate_avx_32_79(\&body_20_39_dec); &Xupdate_avx_32_79(\&body_40_59_dec); &Xupdate_avx_32_79(\&body_40_59_dec); &Xupdate_avx_32_79(\&body_40_59_dec); &Xupdate_avx_32_79(\&body_40_59_dec); &Xupdate_avx_32_79(\&body_40_59_dec); &Xupdate_avx_32_79(\&body_20_39_dec); &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx"); # can jump to "done" $saved_j=$j; @saved_V=@V; $saved_rx=$rx; &Xloop_avx(\&body_20_39_dec); &Xloop_avx(\&body_20_39_dec); &Xloop_avx(\&body_20_39_dec); eval(@aes256_dec[-1]); # last store $code.=<<___; lea 64($in0),$in0 add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C add 12($ctx),$D mov $A,0($ctx) add 16($ctx),$E mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) mov $C,@T[1] mov $D,12($ctx) xor $D,@T[1] mov $E,16($ctx) and @T[1],@T[0] jmp .Loop_dec_avx .Ldone_dec_avx: ___ $jj=$j=$saved_j; @V=@saved_V; $rx=$saved_rx; &Xtail_avx(\&body_20_39_dec); &Xtail_avx(\&body_20_39_dec); &Xtail_avx(\&body_20_39_dec); eval(@aes256_dec[-1]); # last store $code.=<<___; add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C mov $A,0($ctx) add 12($ctx),$D mov @T[0],4($ctx) add 16($ctx),$E mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) vmovups @X[3],($ivp) # write IV vzeroall ___ $code.=<<___ if ($win64); movaps 96+0(%rsp),%xmm6 movaps 96+16(%rsp),%xmm7 movaps 96+32(%rsp),%xmm8 movaps 96+48(%rsp),%xmm9 movaps 96+64(%rsp),%xmm10 movaps 96+80(%rsp),%xmm11 movaps 96+96(%rsp),%xmm12 movaps 96+112(%rsp),%xmm13 movaps 96+128(%rsp),%xmm14 movaps 96+144(%rsp),%xmm15 ___ $code.=<<___; lea `104+($win64?10*16:0)`(%rsp),%rsi .cfi_def_cfa %rsi,56 mov 0(%rsi),%r15 .cfi_restore %r15 mov 8(%rsi),%r14 .cfi_restore %r14 mov 16(%rsi),%r13 .cfi_restore %r13 mov 24(%rsi),%r12 .cfi_restore %r12 mov 32(%rsi),%rbp .cfi_restore %rbp mov 40(%rsi),%rbx .cfi_restore %rbx lea 48(%rsi),%rsp .cfi_def_cfa %rsp,8 .Lepilogue_dec_avx: ret .cfi_endproc .size aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx ___ }}} } $code.=<<___; .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by " .align 64 ___ if ($shaext) {{{ ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); $rounds="%r11d"; ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15)); @rndkey=("%xmm0","%xmm1"); $r=0; my ($BSWAP,$ABCD,$E,$E_,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(7..12)); my @MSG=map("%xmm$_",(3..6)); $code.=<<___; .type aesni_cbc_sha1_enc_shaext,\@function,6 .align 32 aesni_cbc_sha1_enc_shaext: .cfi_startproc mov 
`($win64?56:8)`(%rsp),$inp # load 7th argument ___ $code.=<<___ if ($win64); lea `-8-10*16`(%rsp),%rsp movaps %xmm6,-8-10*16(%rax) movaps %xmm7,-8-9*16(%rax) movaps %xmm8,-8-8*16(%rax) movaps %xmm9,-8-7*16(%rax) movaps %xmm10,-8-6*16(%rax) movaps %xmm11,-8-5*16(%rax) movaps %xmm12,-8-4*16(%rax) movaps %xmm13,-8-3*16(%rax) movaps %xmm14,-8-2*16(%rax) movaps %xmm15,-8-1*16(%rax) .Lprologue_shaext: ___ $code.=<<___; movdqu ($ctx),$ABCD movd 16($ctx),$E movdqa K_XX_XX+0x50(%rip),$BSWAP # byte-n-word swap mov 240($key),$rounds sub $in0,$out movups ($key),$rndkey0 # $key[0] movups ($ivp),$iv # load IV movups 16($key),$rndkey[0] # forward reference lea 112($key),$key # size optimization pshufd \$0b00011011,$ABCD,$ABCD # flip word order pshufd \$0b00011011,$E,$E # flip word order jmp .Loop_shaext .align 16 .Loop_shaext: ___ &$aesenc(); $code.=<<___; movdqu ($inp),@MSG[0] movdqa $E,$E_SAVE # offload $E pshufb $BSWAP,@MSG[0] movdqu 0x10($inp),@MSG[1] movdqa $ABCD,$ABCD_SAVE # offload $ABCD ___ &$aesenc(); $code.=<<___; pshufb $BSWAP,@MSG[1] paddd @MSG[0],$E movdqu 0x20($inp),@MSG[2] lea 0x40($inp),$inp pxor $E_SAVE,@MSG[0] # black magic ___ &$aesenc(); $code.=<<___; pxor $E_SAVE,@MSG[0] # black magic movdqa $ABCD,$E_ pshufb $BSWAP,@MSG[2] sha1rnds4 \$0,$E,$ABCD # 0-3 sha1nexte @MSG[1],$E_ ___ &$aesenc(); $code.=<<___; sha1msg1 @MSG[1],@MSG[0] movdqu -0x10($inp),@MSG[3] movdqa $ABCD,$E pshufb $BSWAP,@MSG[3] ___ &$aesenc(); $code.=<<___; sha1rnds4 \$0,$E_,$ABCD # 4-7 sha1nexte @MSG[2],$E pxor @MSG[2],@MSG[0] sha1msg1 @MSG[2],@MSG[1] ___ &$aesenc(); for($i=2;$i<20-4;$i++) { $code.=<<___; movdqa $ABCD,$E_ sha1rnds4 \$`int($i/5)`,$E,$ABCD # 8-11 sha1nexte @MSG[3],$E_ ___ &$aesenc(); $code.=<<___; sha1msg2 @MSG[3],@MSG[0] pxor @MSG[3],@MSG[1] sha1msg1 @MSG[3],@MSG[2] ___ ($E,$E_)=($E_,$E); push(@MSG,shift(@MSG)); &$aesenc(); } $code.=<<___; movdqa $ABCD,$E_ sha1rnds4 \$3,$E,$ABCD # 64-67 sha1nexte @MSG[3],$E_ sha1msg2 @MSG[3],@MSG[0] pxor @MSG[3],@MSG[1] ___ &$aesenc(); $code.=<<___; movdqa $ABCD,$E sha1rnds4 \$3,$E_,$ABCD # 68-71 sha1nexte @MSG[0],$E sha1msg2 @MSG[0],@MSG[1] ___ &$aesenc(); $code.=<<___; movdqa $E_SAVE,@MSG[0] movdqa $ABCD,$E_ sha1rnds4 \$3,$E,$ABCD # 72-75 sha1nexte @MSG[1],$E_ ___ &$aesenc(); $code.=<<___; movdqa $ABCD,$E sha1rnds4 \$3,$E_,$ABCD # 76-79 sha1nexte $MSG[0],$E ___ while($r<40) { &$aesenc(); } # remaining aesenc's $code.=<<___; dec $len paddd $ABCD_SAVE,$ABCD movups $iv,48($out,$in0) # write output lea 64($in0),$in0 jnz .Loop_shaext pshufd \$0b00011011,$ABCD,$ABCD pshufd \$0b00011011,$E,$E movups $iv,($ivp) # write IV movdqu $ABCD,($ctx) movd $E,16($ctx) ___ $code.=<<___ if ($win64); movaps -8-10*16(%rax),%xmm6 movaps -8-9*16(%rax),%xmm7 movaps -8-8*16(%rax),%xmm8 movaps -8-7*16(%rax),%xmm9 movaps -8-6*16(%rax),%xmm10 movaps -8-5*16(%rax),%xmm11 movaps -8-4*16(%rax),%xmm12 movaps -8-3*16(%rax),%xmm13 movaps -8-2*16(%rax),%xmm14 movaps -8-1*16(%rax),%xmm15 mov %rax,%rsp .Lepilogue_shaext: ___ $code.=<<___; ret .cfi_endproc .size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext ___ }}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type ssse3_handler,\@abi-omnipotent .align 16 ssse3_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 
8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail ___ $code.=<<___ if ($shaext); lea aesni_cbc_sha1_enc_shaext(%rip),%r10 cmp %r10,%rbx jb .Lseh_no_shaext lea (%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq lea 168(%rax),%rax # adjust stack pointer jmp .Lcommon_seh_tail .Lseh_no_shaext: ___ $code.=<<___; lea 96(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq lea `104+10*16`(%rax),%rax # adjust stack pointer mov 0(%rax),%r15 mov 8(%rax),%r14 mov 16(%rax),%r13 mov 24(%rax),%r12 mov 32(%rax),%rbp mov 40(%rax),%rbx lea 48(%rax),%rax mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size ssse3_handler,.-ssse3_handler .section .pdata .align 4 .rva .LSEH_begin_aesni_cbc_sha1_enc_ssse3 .rva .LSEH_end_aesni_cbc_sha1_enc_ssse3 .rva .LSEH_info_aesni_cbc_sha1_enc_ssse3 ___ $code.=<<___ if ($avx); .rva .LSEH_begin_aesni_cbc_sha1_enc_avx .rva .LSEH_end_aesni_cbc_sha1_enc_avx .rva .LSEH_info_aesni_cbc_sha1_enc_avx ___ $code.=<<___ if ($shaext); .rva .LSEH_begin_aesni_cbc_sha1_enc_shaext .rva .LSEH_end_aesni_cbc_sha1_enc_shaext .rva .LSEH_info_aesni_cbc_sha1_enc_shaext ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_aesni_cbc_sha1_enc_ssse3: .byte 9,0,0,0 .rva ssse3_handler .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] ___ $code.=<<___ if ($avx); .LSEH_info_aesni_cbc_sha1_enc_avx: .byte 9,0,0,0 .rva ssse3_handler .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ $code.=<<___ if ($shaext); .LSEH_info_aesni_cbc_sha1_enc_shaext: .byte 9,0,0,0 .rva ssse3_handler .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] ___ } #################################################################### sub rex { local *opcode=shift; my ($dst,$src)=@_; my $rex=0; $rex|=0x04 if($dst>=8); $rex|=0x01 if($src>=8); unshift @opcode,$rex|0x40 if($rex); } sub sha1rnds4 { if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x0f,0x3a,0xcc); rex(\@opcode,$3,$2); push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M my $c=$1; push @opcode,$c=~/^0/?oct($c):$c; return ".byte\t".join(',',@opcode); } else { return 
"sha1rnds4\t".@_[0]; } } sub sha1op38 { my $instr = shift; my %opcodelet = ( "sha1nexte" => 0xc8, "sha1msg1" => 0xc9, "sha1msg2" => 0xca ); if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x0f,0x38); rex(\@opcode,$2,$1); push @opcode,$opcodelet{$instr}; push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } else { return $instr."\t".@_[0]; } } sub aesni { my $line=shift; my @opcode=(0x0f,0x38); if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) { my %opcodelet = ( "aesenc" => 0xdc, "aesenclast" => 0xdd, "aesdec" => 0xde, "aesdeclast" => 0xdf ); return undef if (!defined($opcodelet{$1})); rex(\@opcode,$3,$2); push @opcode,$opcodelet{$1},0xc0|($2&7)|(($3&7)<<3); # ModR/M unshift @opcode,0x66; return ".byte\t".join(',',@opcode); } return $line; } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/geo; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/aes/asm/aesni-sha256-x86_64.pl (revision 364963) @@ -1,1802 +1,1802 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # January 2013 # # This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled # in http://download.intel.com/design/intarch/papers/323686.pdf, is # that since AESNI-CBC encrypt exhibit *very* low instruction-level # parallelism, interleaving it with another algorithm would allow to # utilize processor resources better and achieve better performance. # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and # AESNI code is weaved into it. As SHA256 dominates execution time, # stitch performance does not depend on AES key length. 
Below are # performance numbers in cycles per processed byte, less is better, # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched # subroutine: # # AES-128/-192/-256+SHA256 this(**) gain # Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43% # Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50% # Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59% # Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40% # Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58% # Ryzen(***) 2.71/-/3.71+2.05 2.74/-/3.73 +74%/-/54% # Goldmont(***) 3.82/-/5.35+4.16 4.73/-/5.94 +69%/-/60% # # (*) there are XOP, AVX1 and AVX2 code paths, meaning that # Westmere is omitted from loop, this is because gain was not # estimated high enough to justify the effort; # (**) these are EVP-free results, results obtained with 'speed # -evp aes-256-cbc-hmac-sha256' will vary by percent or two; # (***) these are SHAEXT results; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=12); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } $shaext=$avx; ### set to zero if compiling for 1.0.1 $avx=1 if (!$shaext && $avx); open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $func="aesni_cbc_sha256_enc"; $TABLE="K256"; $SZ=4; @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", "%r8d","%r9d","%r10d","%r11d"); ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi"); @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @sigma1=(17,19,10); $rounds=64; ######################################################################## # void aesni_cbc_sha256_enc(const void *inp, # void *out, # size_t length, # const AES_KEY *key, # unsigned char *iv, # SHA256_CTX *ctx, # const void *in0); ($inp, $out, $len, $key, $ivp, $ctx, $in0) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); $Tbl="%rbp"; $_inp="16*$SZ+0*8(%rsp)"; $_out="16*$SZ+1*8(%rsp)"; $_end="16*$SZ+2*8(%rsp)"; $_key="16*$SZ+3*8(%rsp)"; $_ivp="16*$SZ+4*8(%rsp)"; $_ctx="16*$SZ+5*8(%rsp)"; $_in0="16*$SZ+6*8(%rsp)"; $_rsp="`16*$SZ+7*8`(%rsp)"; $framesz=16*$SZ+8*8; $code=<<___; .text .extern OPENSSL_ia32cap_P .globl $func .type $func,\@abi-omnipotent .align 16 $func: .cfi_startproc ___ if ($avx) { $code.=<<___; lea OPENSSL_ia32cap_P(%rip),%r11 mov \$1,%eax cmp \$0,`$win64?"%rcx":"%rdi"` je .Lprobe mov 0(%r11),%eax mov 4(%r11),%r10 ___ $code.=<<___ if ($shaext); bt \$61,%r10 # check for SHA jc ${func}_shaext ___ $code.=<<___; mov %r10,%r11 shr \$32,%r11 test \$`1<<11`,%r10d # check for XOP jnz ${func}_xop ___ $code.=<<___ if ($avx>1); and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 cmp \$`1<<8|1<<5|1<<3`,%r11d je ${func}_avx2 ___ $code.=<<___; and 
\$`1<<28`,%r10d # check for AVX jnz ${func}_avx ud2 ___ } $code.=<<___; xor %eax,%eax cmp \$0,`$win64?"%rcx":"%rdi"` je .Lprobe ud2 .Lprobe: ret .cfi_endproc .size $func,.-$func .align 64 .type $TABLE,\@object $TABLE: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 .long 0,0,0,0, 0,0,0,0 .asciz "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by " .align 64 ___ ###################################################################### # SIMD code paths # {{{ ($iv,$inout,$roundkey,$temp, $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15)); $aesni_cbc_idx=0; @aesni_cbc_block = ( ## &vmovdqu ($roundkey,"0x00-0x80($inp)");' ## &vmovdqu ($inout,($inp)); ## &mov ($_inp,$inp); '&vpxor ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x10-0x80($inp)");', '&vpxor ($inout,$inout,$iv);', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x20-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x30-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x40-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x50-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x60-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x70-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x80-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x90-0x80($inp)");', '&vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0xa0-0x80($inp)");', '&vaesenclast ($temp,$inout,$roundkey);'. ' &vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0xb0-0x80($inp)");', '&vpand ($iv,$temp,$mask10);'. ' &vaesenc ($inout,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0xc0-0x80($inp)");', '&vaesenclast ($temp,$inout,$roundkey);'. ' &vaesenc ($inout,$inout,$roundkey);'. 
' &vmovdqu ($roundkey,"0xd0-0x80($inp)");', '&vpand ($temp,$temp,$mask12);'. ' &vaesenc ($inout,$inout,$roundkey);'. '&vmovdqu ($roundkey,"0xe0-0x80($inp)");', '&vpor ($iv,$iv,$temp);'. ' &vaesenclast ($temp,$inout,$roundkey);'. ' &vmovdqu ($roundkey,"0x00-0x80($inp)");' ## &mov ($inp,$_inp); ## &mov ($out,$_out); ## &vpand ($temp,$temp,$mask14); ## &vpor ($iv,$iv,$temp); ## &vmovdqu ($iv,($out,$inp); ## &lea (inp,16($inp)); ); my $a4=$T1; my ($a,$b,$c,$d,$e,$f,$g,$h); sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop; $arg = "\$$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. '&ror ($a0,$Sigma1[2]-$Sigma1[1])', '&mov ($a,$a1)', '&mov ($a4,$f)', '&xor ($a0,$e)', '&ror ($a1,$Sigma0[2]-$Sigma0[1])', '&xor ($a4,$g)', # f^g '&ror ($a0,$Sigma1[1]-$Sigma1[0])', '&xor ($a1,$a)', '&and ($a4,$e)', # (f^g)&e @aesni_cbc_block[$aesni_cbc_idx++]. '&xor ($a0,$e)', '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] '&mov ($a2,$a)', '&ror ($a1,$Sigma0[1]-$Sigma0[0])', '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g '&xor ($a2,$b)', # a^b, b^c in next round '&ror ($a0,$Sigma1[0])', # Sigma1(e) '&add ($h,$a4)', # h+=Ch(e,f,g) '&and ($a3,$a2)', # (b^c)&(a^b) '&xor ($a1,$a)', '&add ($h,$a0)', # h+=Sigma1(e) '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) '&add ($d,$h)', # d+=h '&ror ($a1,$Sigma0[0])', # Sigma0(a) '&add ($h,$a3)', # h+=Maj(a,b,c) '&mov ($a0,$d)', '&add ($a1,$h);'. # h+=Sigma0(a) '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' ); } if ($avx) {{ ###################################################################### # XOP code path # $code.=<<___; .type ${func}_xop,\@function,6 .align 64 ${func}_xop: .cfi_startproc .Lxop_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame shl \$6,$len sub $inp,$out # re-bias sub $inp,$in0 add $inp,$len # end of input #mov $inp,$_inp # saved later mov $out,$_out mov $len,$_end #mov $key,$_key # remains resident in $inp register mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 mov %rax,$_rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) movaps %xmm7,`$framesz+16*1`(%rsp) movaps %xmm8,`$framesz+16*2`(%rsp) movaps %xmm9,`$framesz+16*3`(%rsp) movaps %xmm10,`$framesz+16*4`(%rsp) movaps %xmm11,`$framesz+16*5`(%rsp) movaps %xmm12,`$framesz+16*6`(%rsp) movaps %xmm13,`$framesz+16*7`(%rsp) movaps %xmm14,`$framesz+16*8`(%rsp) movaps %xmm15,`$framesz+16*9`(%rsp) ___ $code.=<<___; .Lprologue_xop: vzeroall mov $inp,%r12 # borrow $a4 lea 0x80($key),$inp # size optimization, reassign lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 mov $ctx,%r15 # borrow $a2 mov $in0,%rsi # borrow $a3 vmovdqu ($ivp),$iv # load IV sub \$9,%r14 mov $SZ*0(%r15),$A mov $SZ*1(%r15),$B mov $SZ*2(%r15),$C mov $SZ*3(%r15),$D mov $SZ*4(%r15),$E mov $SZ*5(%r15),$F mov $SZ*6(%r15),$G mov $SZ*7(%r15),$H vmovdqa 0x00(%r13,%r14,8),$mask14 vmovdqa 0x10(%r13,%r14,8),$mask12 vmovdqa 0x20(%r13,%r14,8),$mask10 vmovdqu 0x00-0x80($inp),$roundkey jmp .Lloop_xop ___ if ($SZ==4) { # SHA256 my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); $code.=<<___; .align 16 
.Lloop_xop: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00(%rsi,%r12),@X[0] vmovdqu 0x10(%rsi,%r12),@X[1] vmovdqu 0x20(%rsi,%r12),@X[2] vmovdqu 0x30(%rsi,%r12),@X[3] vpshufb $t3,@X[0],@X[0] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[1],@X[1] vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) mov $B,$a3 vmovdqa $t2,0x20(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lxop_00_47 .align 16 .Lxop_00_47: sub \$-16*2*$SZ,$Tbl # size optimization vmovdqu (%r12),$inout # $a4 mov %r12,$_inp # $a4 ___ sub XOP_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t0,$t0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t1); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t2,@X[3],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrldq ($t3,$t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t2,@X[0],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslldq ($t3,$t3,8); # 22 instructions eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } $aesni_cbc_idx=0; for ($i=0,$j=0; $j<4; $j++) { &XOP_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &mov ("%r12",$_inp); # borrow $a4 &vpand ($temp,$temp,$mask14); &mov ("%r15",$_out); # borrow $a2 &vpor ($iv,$iv,$temp); &vmovdqu ("(%r15,%r12)",$iv); # write output &lea ("%r12","16(%r12)"); # inp++ &cmpb 
($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lxop_00_47"); &vmovdqu ($inout,"(%r12)"); &mov ($_inp,"%r12"); $aesni_cbc_idx=0; for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } $code.=<<___; mov $_inp,%r12 # borrow $a4 mov $_out,%r13 # borrow $a0 mov $_ctx,%r15 # borrow $a2 mov $_in0,%rsi # borrow $a3 vpand $mask14,$temp,$temp mov $a1,$A vpor $temp,$iv,$iv vmovdqu $iv,(%r13,%r12) # write output lea 16(%r12),%r12 # inp++ add $SZ*0(%r15),$A add $SZ*1(%r15),$B add $SZ*2(%r15),$C add $SZ*3(%r15),$D add $SZ*4(%r15),$E add $SZ*5(%r15),$F add $SZ*6(%r15),$G add $SZ*7(%r15),$H cmp $_end,%r12 mov $A,$SZ*0(%r15) mov $B,$SZ*1(%r15) mov $C,$SZ*2(%r15) mov $D,$SZ*3(%r15) mov $E,$SZ*4(%r15) mov $F,$SZ*5(%r15) mov $G,$SZ*6(%r15) mov $H,$SZ*7(%r15) jb .Lloop_xop mov $_ivp,$ivp mov $_rsp,%rsi .cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ $code.=<<___ if ($win64); movaps `$framesz+16*0`(%rsp),%xmm6 movaps `$framesz+16*1`(%rsp),%xmm7 movaps `$framesz+16*2`(%rsp),%xmm8 movaps `$framesz+16*3`(%rsp),%xmm9 movaps `$framesz+16*4`(%rsp),%xmm10 movaps `$framesz+16*5`(%rsp),%xmm11 movaps `$framesz+16*6`(%rsp),%xmm12 movaps `$framesz+16*7`(%rsp),%xmm13 movaps `$framesz+16*8`(%rsp),%xmm14 movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_xop: ret .cfi_endproc .size ${func}_xop,.-${func}_xop ___ ###################################################################### # AVX+shrd code path # local *ror = sub { &shrd(@_[0],@_) }; $code.=<<___; .type ${func}_avx,\@function,6 .align 64 ${func}_avx: .cfi_startproc .Lavx_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$`$framesz+$win64*16*10`,%rsp and \$-64,%rsp # align stack frame shl \$6,$len sub $inp,$out # re-bias sub $inp,$in0 add $inp,$len # end of input #mov $inp,$_inp # saved later mov $out,$_out mov $len,$_end #mov $key,$_key # remains resident in $inp register mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 mov %rax,$_rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) movaps %xmm7,`$framesz+16*1`(%rsp) movaps %xmm8,`$framesz+16*2`(%rsp) movaps %xmm9,`$framesz+16*3`(%rsp) movaps %xmm10,`$framesz+16*4`(%rsp) movaps %xmm11,`$framesz+16*5`(%rsp) movaps %xmm12,`$framesz+16*6`(%rsp) movaps %xmm13,`$framesz+16*7`(%rsp) movaps %xmm14,`$framesz+16*8`(%rsp) movaps %xmm15,`$framesz+16*9`(%rsp) ___ $code.=<<___; .Lprologue_avx: vzeroall mov $inp,%r12 # borrow $a4 lea 0x80($key),$inp # size optimization, reassign lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r13 # borrow $a0 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 mov $ctx,%r15 # borrow $a2 mov $in0,%rsi # borrow $a3 vmovdqu ($ivp),$iv # load IV sub \$9,%r14 mov $SZ*0(%r15),$A mov $SZ*1(%r15),$B mov $SZ*2(%r15),$C mov $SZ*3(%r15),$D mov $SZ*4(%r15),$E mov $SZ*5(%r15),$F mov $SZ*6(%r15),$G mov $SZ*7(%r15),$H vmovdqa 0x00(%r13,%r14,8),$mask14 vmovdqa 0x10(%r13,%r14,8),$mask12 vmovdqa 0x20(%r13,%r14,8),$mask10 vmovdqu 0x00-0x80($inp),$roundkey ___ if ($SZ==4) { # SHA256 my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); $code.=<<___; jmp .Lloop_avx .align 
16 .Lloop_avx: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00(%rsi,%r12),@X[0] vmovdqu 0x10(%rsi,%r12),@X[1] vmovdqu 0x20(%rsi,%r12),@X[2] vmovdqu 0x30(%rsi,%r12),@X[3] vpshufb $t3,@X[0],@X[0] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[1],@X[1] vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) mov $B,$a3 vmovdqa $t2,0x20(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lavx_00_47 .align 16 .Lavx_00_47: sub \$-16*2*$SZ,$Tbl # size optimization vmovdqu (%r12),$inout # $a4 mov %r12,$_inp # $a4 ___ sub Xupdate_256_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] '&vpsrld ($t2,$t0,$sigma0[0]);', '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] '&vpsrld ($t3,$t0,$sigma0[2])', '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpxor ($t0,$t3,$t2)', '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t1)', '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t2)', '&vpsrld ($t2,$t3,$sigma1[2]);', '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) '&vpsrlq ($t3,$t3,$sigma1[0]);', '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) '&vpxor ($t2,$t2,$t3);', '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', '&vpxor ($t2,$t2,$t3)', # sigma1(X[14..15]) '&vpshufd ($t2,$t2,0b10000100)', '&vpsrldq ($t2,$t2,8)', '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] '&vpsrld ($t2,$t3,$sigma1[2])', '&vpsrlq ($t3,$t3,$sigma1[0])', '&vpxor ($t2,$t2,$t3);', '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', '&vpxor ($t2,$t2,$t3)', '&vpshufd ($t2,$t2,0b11101000)', '&vpslldq ($t2,$t2,8)', '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) ); } sub AVX_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions foreach (Xupdate_256_AVX()) { # 29 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } $aesni_cbc_idx=0; for ($i=0,$j=0; $j<4; $j++) { &AVX_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &mov ("%r12",$_inp); # borrow $a4 &vpand ($temp,$temp,$mask14); &mov ("%r15",$_out); # borrow $a2 &vpor ($iv,$iv,$temp); &vmovdqu ("(%r15,%r12)",$iv); # write output &lea ("%r12","16(%r12)"); # inp++ &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lavx_00_47"); &vmovdqu ($inout,"(%r12)"); &mov ($_inp,"%r12"); $aesni_cbc_idx=0; for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } $code.=<<___; mov $_inp,%r12 # borrow $a4 mov $_out,%r13 # borrow $a0 mov $_ctx,%r15 # borrow $a2 mov $_in0,%rsi # borrow $a3 vpand $mask14,$temp,$temp mov $a1,$A vpor $temp,$iv,$iv vmovdqu $iv,(%r13,%r12) # write output lea 16(%r12),%r12 # inp++ add $SZ*0(%r15),$A add $SZ*1(%r15),$B add $SZ*2(%r15),$C add $SZ*3(%r15),$D add $SZ*4(%r15),$E add $SZ*5(%r15),$F add $SZ*6(%r15),$G add $SZ*7(%r15),$H cmp $_end,%r12 mov $A,$SZ*0(%r15) mov $B,$SZ*1(%r15) mov $C,$SZ*2(%r15) mov $D,$SZ*3(%r15) mov $E,$SZ*4(%r15) mov $F,$SZ*5(%r15) mov $G,$SZ*6(%r15) mov $H,$SZ*7(%r15) jb .Lloop_avx mov $_ivp,$ivp mov $_rsp,%rsi .cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ $code.=<<___ if ($win64); movaps `$framesz+16*0`(%rsp),%xmm6 movaps `$framesz+16*1`(%rsp),%xmm7 movaps 
`$framesz+16*2`(%rsp),%xmm8 movaps `$framesz+16*3`(%rsp),%xmm9 movaps `$framesz+16*4`(%rsp),%xmm10 movaps `$framesz+16*5`(%rsp),%xmm11 movaps `$framesz+16*6`(%rsp),%xmm12 movaps `$framesz+16*7`(%rsp),%xmm13 movaps `$framesz+16*8`(%rsp),%xmm14 movaps `$framesz+16*9`(%rsp),%xmm15 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size ${func}_avx,.-${func}_avx ___ if ($avx>1) {{ ###################################################################### # AVX2+BMI code path # my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp my $PUSH8=8*2*$SZ; use integer; sub bodyx_00_15 () { # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f ( '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] '&and ($a4,$e)', # f&e '&rorx ($a0,$e,$Sigma1[2])', '&rorx ($a2,$e,$Sigma1[1])', '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past '&lea ($h,"($h,$a4)")', '&andn ($a4,$e,$g)', # ~e&g '&xor ($a0,$a2)', '&rorx ($a1,$e,$Sigma1[0])', '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) '&xor ($a0,$a1)', # Sigma1(e) '&mov ($a2,$a)', '&rorx ($a4,$a,$Sigma0[2])', '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) '&xor ($a2,$b)', # a^b, b^c in next round '&rorx ($a1,$a,$Sigma0[1])', '&rorx ($a0,$a,$Sigma0[0])', '&lea ($d,"($d,$h)")', # d+=h '&and ($a3,$a2)', # (b^c)&(a^b) @aesni_cbc_block[$aesni_cbc_idx++]. '&xor ($a1,$a4)', '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) '&xor ($a1,$a0)', # Sigma0(a) '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) '&mov ($a4,$e)', # copy of f in future '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' ); # and at the finish one has to $a+=$a1 } $code.=<<___; .type ${func}_avx2,\@function,6 .align 64 ${func}_avx2: .cfi_startproc .Lavx2_shortcut: mov `($win64?56:8)`(%rsp),$in0 # load 7th parameter mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp and \$-256*$SZ,%rsp # align stack frame add \$`2*$SZ*($rounds-8)`,%rsp shl \$6,$len sub $inp,$out # re-bias sub $inp,$in0 add $inp,$len # end of input #mov $inp,$_inp # saved later #mov $out,$_out # kept in $offload mov $len,$_end #mov $key,$_key # remains resident in $inp register mov $ivp,$_ivp mov $ctx,$_ctx mov $in0,$_in0 mov %rax,$_rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,`$framesz+16*0`(%rsp) movaps %xmm7,`$framesz+16*1`(%rsp) movaps %xmm8,`$framesz+16*2`(%rsp) movaps %xmm9,`$framesz+16*3`(%rsp) movaps %xmm10,`$framesz+16*4`(%rsp) movaps %xmm11,`$framesz+16*5`(%rsp) movaps %xmm12,`$framesz+16*6`(%rsp) movaps %xmm13,`$framesz+16*7`(%rsp) movaps %xmm14,`$framesz+16*8`(%rsp) movaps %xmm15,`$framesz+16*9`(%rsp) ___ $code.=<<___; .Lprologue_avx2: vzeroall mov $inp,%r13 # borrow $a0 vpinsrq \$1,$out,$offload,$offload lea 0x80($key),$inp # size optimization, reassign lea $TABLE+`$SZ*2*$rounds+32`(%rip),%r12 # borrow $a4 mov 0xf0-0x80($inp),%r14d # rounds, borrow $a1 mov $ctx,%r15 # borrow $a2 mov $in0,%rsi # borrow $a3 vmovdqu ($ivp),$iv # load IV lea -9(%r14),%r14 vmovdqa 0x00(%r12,%r14,8),$mask14 vmovdqa 0x10(%r12,%r14,8),$mask12 vmovdqa 0x20(%r12,%r14,8),$mask10 sub \$-16*$SZ,%r13 # inp++, size optimization mov 
$SZ*0(%r15),$A lea (%rsi,%r13),%r12 # borrow $a0 mov $SZ*1(%r15),$B cmp $len,%r13 # $_end mov $SZ*2(%r15),$C cmove %rsp,%r12 # next block or random data mov $SZ*3(%r15),$D mov $SZ*4(%r15),$E mov $SZ*5(%r15),$F mov $SZ*6(%r15),$G mov $SZ*7(%r15),$H vmovdqu 0x00-0x80($inp),$roundkey ___ if ($SZ==4) { # SHA256 my @X = map("%ymm$_",(0..3)); my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7)); $code.=<<___; jmp .Loop_avx2 .align 16 .Loop_avx2: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0 vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1 vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2 vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3 vinserti128 \$1,(%r12),@X[0],@X[0] vinserti128 \$1,16(%r12),@X[1],@X[1] vpshufb $t3,@X[0],@X[0] vinserti128 \$1,32(%r12),@X[2],@X[2] vpshufb $t3,@X[1],@X[1] vinserti128 \$1,48(%r12),@X[3],@X[3] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[2],@X[2] lea -16*$SZ(%r13),%r13 vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) xor $a1,$a1 vmovdqa $t1,0x20(%rsp) ___ $code.=<<___ if (!$win64); # temporarily use %rsi as frame pointer mov $_rsp,%rsi .cfi_def_cfa %rsi,8 ___ $code.=<<___; lea -$PUSH8(%rsp),%rsp ___ $code.=<<___ if (!$win64); # the frame info is at $_rsp, but the stack is moving... # so a second frame pointer is saved at -8(%rsp) # that is in the red zone mov %rsi,-8(%rsp) .cfi_cfa_expression %rsp-8,deref,+8 ___ $code.=<<___; mov $B,$a3 vmovdqa $t2,0x00(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x20(%rsp) mov $F,$a4 sub \$-16*2*$SZ,$Tbl # size optimization jmp .Lavx2_00_47 .align 16 .Lavx2_00_47: vmovdqu (%r13),$inout vpinsrq \$0,%r13,$offload,$offload ___ sub AVX2_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 96 instructions my $base = "+2*$PUSH8(%rsp)"; if (($j%2)==0) { &lea ("%rsp","-$PUSH8(%rsp)"); $code.=<<___ if (!$win64); .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 # copy secondary frame pointer to new location again at -8(%rsp) pushq $PUSH8-8(%rsp) .cfi_cfa_expression %rsp,deref,+8 lea 8(%rsp),%rsp .cfi_cfa_expression %rsp-8,deref,+8 ___ } foreach (Xupdate_256_AVX()) { # 29 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); } $aesni_cbc_idx=0; for ($i=0,$j=0; $j<4; $j++) { &AVX2_256_00_47($j,\&bodyx_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &vmovq ("%r13",$offload); # borrow $a0 &vpextrq ("%r15",$offload,1); # borrow $a2 &vpand ($temp,$temp,$mask14); &vpor ($iv,$iv,$temp); &vmovdqu ("(%r15,%r13)",$iv); # write output &lea ("%r13","16(%r13)"); # inp++ &lea ($Tbl,16*2*$SZ."($Tbl)"); &cmpb (($SZ-1)."($Tbl)",0); &jne (".Lavx2_00_47"); &vmovdqu ($inout,"(%r13)"); &vpinsrq ($offload,$offload,"%r13",0); $aesni_cbc_idx=0; for ($i=0; $i<16; ) { my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; foreach(bodyx_00_15()) { eval; } } } $code.=<<___; vpextrq \$1,$offload,%r12 # $_out, borrow $a4 vmovq $offload,%r13 # $_inp, borrow $a0 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 add $a1,$A lea `2*$SZ*($rounds-8)`(%rsp),$Tbl vpand $mask14,$temp,$temp vpor $temp,$iv,$iv vmovdqu $iv,(%r12,%r13) # write output lea 16(%r13),%r13 add $SZ*0(%r15),$A add $SZ*1(%r15),$B add $SZ*2(%r15),$C add $SZ*3(%r15),$D add $SZ*4(%r15),$E add $SZ*5(%r15),$F add $SZ*6(%r15),$G add $SZ*7(%r15),$H mov $A,$SZ*0(%r15) mov $B,$SZ*1(%r15) mov $C,$SZ*2(%r15) mov $D,$SZ*3(%r15) mov $E,$SZ*4(%r15) mov 
$F,$SZ*5(%r15) mov $G,$SZ*6(%r15) mov $H,$SZ*7(%r15) cmp `$PUSH8+2*8`($Tbl),%r13 # $_end je .Ldone_avx2 xor $a1,$a1 mov $B,$a3 mov $F,$a4 xor $C,$a3 # magic jmp .Lower_avx2 .align 16 .Lower_avx2: vmovdqu (%r13),$inout vpinsrq \$0,%r13,$offload,$offload ___ $aesni_cbc_idx=0; for ($i=0; $i<16; ) { my $base="+16($Tbl)"; foreach(bodyx_00_15()) { eval; } &lea ($Tbl,"-$PUSH8($Tbl)") if ($i==8); } $code.=<<___; vmovq $offload,%r13 # borrow $a0 vpextrq \$1,$offload,%r15 # borrow $a2 vpand $mask14,$temp,$temp vpor $temp,$iv,$iv lea -$PUSH8($Tbl),$Tbl vmovdqu $iv,(%r15,%r13) # write output lea 16(%r13),%r13 # inp++ cmp %rsp,$Tbl jae .Lower_avx2 mov `2*$SZ*$rounds+5*8`(%rsp),%r15 # $_ctx, borrow $a2 lea 16*$SZ(%r13),%r13 mov `2*$SZ*$rounds+6*8`(%rsp),%rsi # $_in0, borrow $a3 add $a1,$A lea `2*$SZ*($rounds-8)`(%rsp),%rsp add $SZ*0(%r15),$A add $SZ*1(%r15),$B add $SZ*2(%r15),$C add $SZ*3(%r15),$D add $SZ*4(%r15),$E add $SZ*5(%r15),$F add $SZ*6(%r15),$G lea (%rsi,%r13),%r12 add $SZ*7(%r15),$H cmp $_end,%r13 mov $A,$SZ*0(%r15) cmove %rsp,%r12 # next block or stale data mov $B,$SZ*1(%r15) mov $C,$SZ*2(%r15) mov $D,$SZ*3(%r15) mov $E,$SZ*4(%r15) mov $F,$SZ*5(%r15) mov $G,$SZ*6(%r15) mov $H,$SZ*7(%r15) jbe .Loop_avx2 lea (%rsp),$Tbl # temporarily use $Tbl as index to $_rsp # this avoids the need to save a secondary frame pointer at -8(%rsp) .cfi_cfa_expression $Tbl+`16*$SZ+7*8`,deref,+8 .Ldone_avx2: mov 16*$SZ+4*8($Tbl),$ivp mov 16*$SZ+7*8($Tbl),%rsi .cfi_def_cfa %rsi,8 vmovdqu $iv,($ivp) # output IV vzeroall ___ $code.=<<___ if ($win64); movaps `$framesz+16*0`($Tbl),%xmm6 movaps `$framesz+16*1`($Tbl),%xmm7 movaps `$framesz+16*2`($Tbl),%xmm8 movaps `$framesz+16*3`($Tbl),%xmm9 movaps `$framesz+16*4`($Tbl),%xmm10 movaps `$framesz+16*5`($Tbl),%xmm11 movaps `$framesz+16*6`($Tbl),%xmm12 movaps `$framesz+16*7`($Tbl),%xmm13 movaps `$framesz+16*8`($Tbl),%xmm14 movaps `$framesz+16*9`($Tbl),%xmm15 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx2: ret .cfi_endproc .size ${func}_avx2,.-${func}_avx2 ___ }} }} {{ my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10"); my ($rounds,$Tbl)=("%r11d","%rbx"); my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15)); my @rndkey=("%xmm4","%xmm5"); my $r=0; my $sn=0; my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9)); my @MSG=map("%xmm$_",(10..13)); my $aesenc=sub { use integer; my ($n,$k)=($r/10,$r%10); if ($k==0) { $code.=<<___; movups `16*$n`($in0),$in # load input xorps $rndkey0,$in ___ $code.=<<___ if ($n); movups $iv,`16*($n-1)`($out,$in0) # write output ___ $code.=<<___; xorps $in,$iv movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv ___ } elsif ($k==9) { $sn++; $code.=<<___; cmp \$11,$rounds jb .Laesenclast$sn movups `32+16*($k+0)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv movups `32+16*($k+1)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv je .Laesenclast$sn movups `32+16*($k+2)-112`($key),$rndkey[1] aesenc $rndkey[0],$iv movups `32+16*($k+3)-112`($key),$rndkey[0] aesenc $rndkey[1],$iv .Laesenclast$sn: aesenclast $rndkey[0],$iv movups 16-112($key),$rndkey[1] # forward reference nop ___ } else { $code.=<<___; movups `32+16*$k-112`($key),$rndkey[1] aesenc $rndkey[0],$iv ___ } $r++; unshift(@rndkey,pop(@rndkey)); }; if ($shaext) { my $Tbl="%rax"; $code.=<<___; .type 
${func}_shaext,\@function,6 .align 32 ${func}_shaext: .cfi_startproc mov `($win64?56:8)`(%rsp),$inp # load 7th argument ___ $code.=<<___ if ($win64); lea `-8-10*16`(%rsp),%rsp movaps %xmm6,-8-10*16(%rax) movaps %xmm7,-8-9*16(%rax) movaps %xmm8,-8-8*16(%rax) movaps %xmm9,-8-7*16(%rax) movaps %xmm10,-8-6*16(%rax) movaps %xmm11,-8-5*16(%rax) movaps %xmm12,-8-4*16(%rax) movaps %xmm13,-8-3*16(%rax) movaps %xmm14,-8-2*16(%rax) movaps %xmm15,-8-1*16(%rax) .Lprologue_shaext: ___ $code.=<<___; lea K256+0x80(%rip),$Tbl movdqu ($ctx),$ABEF # DCBA movdqu 16($ctx),$CDGH # HGFE movdqa 0x200-0x80($Tbl),$TMP # byte swap mask mov 240($key),$rounds sub $in0,$out movups ($key),$rndkey0 # $key[0] movups ($ivp),$iv # load IV movups 16($key),$rndkey[0] # forward reference lea 112($key),$key # size optimization pshufd \$0x1b,$ABEF,$Wi # ABCD pshufd \$0xb1,$ABEF,$ABEF # CDAB pshufd \$0x1b,$CDGH,$CDGH # EFGH movdqa $TMP,$BSWAP # offload palignr \$8,$CDGH,$ABEF # ABEF punpcklqdq $Wi,$CDGH # CDGH jmp .Loop_shaext .align 16 .Loop_shaext: movdqu ($inp),@MSG[0] movdqu 0x10($inp),@MSG[1] movdqu 0x20($inp),@MSG[2] pshufb $TMP,@MSG[0] movdqu 0x30($inp),@MSG[3] movdqa 0*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi pshufb $TMP,@MSG[1] movdqa $CDGH,$CDGH_SAVE # offload movdqa $ABEF,$ABEF_SAVE # offload ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 0-3 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 1*32-0x80($Tbl),$Wi paddd @MSG[1],$Wi pshufb $TMP,@MSG[2] lea 0x40($inp),$inp ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 4-7 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 2*32-0x80($Tbl),$Wi paddd @MSG[2],$Wi pshufb $TMP,@MSG[3] sha256msg1 @MSG[1],@MSG[0] ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 8-11 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[3],$TMP palignr \$4,@MSG[2],$TMP paddd $TMP,@MSG[0] ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 3*32-0x80($Tbl),$Wi paddd @MSG[3],$Wi sha256msg2 @MSG[3],@MSG[0] sha256msg1 @MSG[2],@MSG[1] ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 12-15 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; movdqa @MSG[0],$TMP palignr \$4,@MSG[3],$TMP paddd $TMP,@MSG[1] sha256rnds2 $CDGH,$ABEF ___ for($i=4;$i<16-3;$i++) { &$aesenc() if (($r%10)==0); $code.=<<___; movdqa $i*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi sha256msg2 @MSG[0],@MSG[1] sha256msg1 @MSG[3],@MSG[2] ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 16-19... 
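The stitched pattern above is: paddd folds the round constants into four message words, sha256rnds2 performs two rounds into the $CDGH register, pshufd $0x0e moves the upper W+K pair into the low lanes, a second sha256rnds2 performs the next two rounds into $ABEF, and an aesenc of the CBC block is slipped in between to hide its latency. A minimal C-intrinsics sketch of that four-round step follows; the helper and its names are illustrative only, not OpenSSL API, and assume a compiler with SHA and AES-NI support enabled (e.g. -msha -maes).

#include <immintrin.h>

/* One four-round SHA-256 step with SHA-NI, stitched with one AES-CBC round,
 * mirroring the paddd / sha256rnds2 / pshufd 0x0e / sha256rnds2 sequence
 * emitted above.  state0 holds the ABEF half and state1 the CDGH half of the
 * hash state (the arrangement set up before .Loop_shaext); msg holds four
 * byte-swapped message words and k the matching K[i] constants. */
static inline void
sha256_4rounds_with_aes(__m128i *state0, __m128i *state1, __m128i msg,
                        __m128i k, __m128i *cbc_block, __m128i round_key)
{
    __m128i wk = _mm_add_epi32(msg, k);                    /* W[i] + K[i]      */

    *cbc_block = _mm_aesenc_si128(*cbc_block, round_key);  /* stitched AES round */
    *state1 = _mm_sha256rnds2_epu32(*state1, *state0, wk); /* rounds i, i+1    */
    wk = _mm_shuffle_epi32(wk, 0x0e);                      /* upper W+K pair down */
    *state0 = _mm_sha256rnds2_epu32(*state0, *state1, wk); /* rounds i+2, i+3  */
}

The real loop interleaves the AES rounds at the rate chosen by the $aesenc closure rather than one per step, but the data flow per four SHA-256 rounds is as shown.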
pshufd \$0x0e,$Wi,$Wi movdqa @MSG[1],$TMP palignr \$4,@MSG[0],$TMP paddd $TMP,@MSG[2] ___ &$aesenc(); &$aesenc() if ($r==19); $code.=<<___; sha256rnds2 $CDGH,$ABEF ___ push(@MSG,shift(@MSG)); } $code.=<<___; movdqa 13*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi sha256msg2 @MSG[0],@MSG[1] sha256msg1 @MSG[3],@MSG[2] ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 52-55 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[1],$TMP palignr \$4,@MSG[0],$TMP paddd $TMP,@MSG[2] ___ &$aesenc(); &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 14*32-0x80($Tbl),$Wi paddd @MSG[1],$Wi sha256msg2 @MSG[1],@MSG[2] movdqa $BSWAP,$TMP ___ &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 56-59 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF movdqa 15*32-0x80($Tbl),$Wi paddd @MSG[2],$Wi ___ &$aesenc(); &$aesenc(); $code.=<<___; sha256rnds2 $ABEF,$CDGH # 60-63 pshufd \$0x0e,$Wi,$Wi ___ &$aesenc(); $code.=<<___; sha256rnds2 $CDGH,$ABEF #pxor $CDGH,$rndkey0 # black magic ___ while ($r<40) { &$aesenc(); } # remaining aesenc's $code.=<<___; #xorps $CDGH,$rndkey0 # black magic paddd $CDGH_SAVE,$CDGH paddd $ABEF_SAVE,$ABEF dec $len movups $iv,48($out,$in0) # write output lea 64($in0),$in0 jnz .Loop_shaext pshufd \$0xb1,$CDGH,$CDGH # DCHG pshufd \$0x1b,$ABEF,$TMP # FEBA pshufd \$0xb1,$ABEF,$ABEF # BAFE punpckhqdq $CDGH,$ABEF # DCBA palignr \$8,$TMP,$CDGH # HGFE movups $iv,($ivp) # write IV movdqu $ABEF,($ctx) movdqu $CDGH,16($ctx) ___ $code.=<<___ if ($win64); movaps 0*16(%rsp),%xmm6 movaps 1*16(%rsp),%xmm7 movaps 2*16(%rsp),%xmm8 movaps 3*16(%rsp),%xmm9 movaps 4*16(%rsp),%xmm10 movaps 5*16(%rsp),%xmm11 movaps 6*16(%rsp),%xmm12 movaps 7*16(%rsp),%xmm13 movaps 8*16(%rsp),%xmm14 movaps 9*16(%rsp),%xmm15 lea 8+10*16(%rsp),%rsp .Lepilogue_shaext: ___ $code.=<<___; ret .cfi_endproc .size ${func}_shaext,.-${func}_shaext ___ } }}}}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64 && $avx) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->Rip<prologue label jb .Lin_prologue mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue ___ $code.=<<___ if ($shaext); lea aesni_cbc_sha256_enc_shaext(%rip),%r10 cmp %r10,%rbx jb .Lnot_in_shaext lea (%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq lea 168(%rax),%rax # adjust stack pointer jmp .Lin_prologue .Lnot_in_shaext: ___ $code.=<<___ if ($avx>1); lea .Lavx2_shortcut(%rip),%r10 cmp %r10,%rbx # context->Rip<avx2_shortcut jb .Lnot_in_avx2 and \$-256*$SZ,%rax add \$`2*$SZ*($rounds-8)`,%rax .Lnot_in_avx2: ___ $code.=<<___; mov %rax,%rsi # put aside Rsp mov 16*$SZ+7*8(%rax),%rax # pull $_rsp mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 lea 16*$SZ+8*8(%rsi),%rsi # Xmm6- save area lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context)
# restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler .section .pdata .rva .LSEH_begin_${func}_xop .rva .LSEH_end_${func}_xop .rva .LSEH_info_${func}_xop .rva .LSEH_begin_${func}_avx .rva .LSEH_end_${func}_avx .rva .LSEH_info_${func}_avx ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_${func}_avx2 .rva .LSEH_end_${func}_avx2 .rva .LSEH_info_${func}_avx2 ___ $code.=<<___ if ($shaext); .rva .LSEH_begin_${func}_shaext .rva .LSEH_end_${func}_shaext .rva .LSEH_info_${func}_shaext ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_${func}_xop: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] .LSEH_info_${func}_avx: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ $code.=<<___ if ($avx>1); .LSEH_info_${func}_avx2: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] ___ $code.=<<___ if ($shaext); .LSEH_info_${func}_shaext: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[] ___ } #################################################################### sub rex { local *opcode=shift; my ($dst,$src)=@_; my $rex=0; $rex|=0x04 if($dst>=8); $rex|=0x01 if($src>=8); unshift @opcode,$rex|0x40 if($rex); } { my %opcodelet = ( "sha256rnds2" => 0xcb, "sha256msg1" => 0xcc, "sha256msg2" => 0xcd ); sub sha256op38 { my $instr = shift; if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x0f,0x38); rex(\@opcode,$2,$1); push @opcode,$opcodelet{$instr}; push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } else { return $instr."\t".@_[0]; } } } $code =~ s/\`([^\`]*)\`/eval $1/gem; $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/bn/asm/rsaz-avx2.pl =================================================================== --- stable/12/crypto/openssl/crypto/bn/asm/rsaz-avx2.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/bn/asm/rsaz-avx2.pl (revision 364963) @@ -1,1982 +1,1982 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # Copyright (c) 2012, Intel Corporation. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) # (1) Intel Corporation, Israel Development Center, Haifa, Israel # (2) University of Haifa, Israel # # References: # [1] S. Gueron, V. Krasnov: "Software Implementation of Modular # Exponentiation, Using Advanced Vector Instructions Architectures", # F. Ozbudak and F. 
Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, # pp. 119?135, 2012. Springer-Verlag Berlin Heidelberg 2012 # [2] S. Gueron: "Efficient Software Implementations of Modular # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). # [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE # Proceedings of 9th International Conference on Information Technology: # New Generations (ITNG 2012), pp.821-823 (2012) # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis # resistant 1024-bit modular exponentiation, for optimizing RSA2048 # on AVX2 capable x86_64 platforms", # http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest # # +13% improvement over original submission by # # rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this # 2.3GHz Haswell 621 765/+23% 1113/+79% # 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63% # # (*) if system doesn't support AVX2, for reference purposes; # (**) scaled to 2.3GHz to simplify comparison; # (***) scalar AD*X code is faster than AVX2 and is preferred code # path for Broadwell; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); $addx = ($1>=2.23); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); $addx = ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); $addx = ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $avx = ($ver>=3.0) + ($ver>=3.01); $addx = ($ver>=3.03); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT = *OUT; if ($avx>1) {{{ { # void AMS_WW( my $rp="%rdi"; # BN_ULONG *rp, my $ap="%rsi"; # const BN_ULONG *ap, my $np="%rdx"; # const BN_ULONG *np, my $n0="%ecx"; # const BN_ULONG n0, my $rep="%r8d"; # int repeat); # The registers that hold the accumulated redundant result # The AMM works on 1024 bit operands, and redundant word size is 29 # Therefore: ceil(1024/29)/4 = 9 my $ACC0="%ymm0"; my $ACC1="%ymm1"; my $ACC2="%ymm2"; my $ACC3="%ymm3"; my $ACC4="%ymm4"; my $ACC5="%ymm5"; my $ACC6="%ymm6"; my $ACC7="%ymm7"; my $ACC8="%ymm8"; my $ACC9="%ymm9"; # Registers that hold the broadcasted words of bp, currently used my $B1="%ymm10"; my $B2="%ymm11"; # Registers that hold the broadcasted words of Y, currently used my $Y1="%ymm12"; my $Y2="%ymm13"; # Helper registers my $TEMP1="%ymm14"; my $AND_MASK="%ymm15"; # alu registers that hold the first words of the ACC my $r0="%r9"; my $r1="%r10"; my $r2="%r11"; my $r3="%r12"; my $i="%r14d"; # loop counter my $tmp = "%r15"; my $FrameSize=32*18+32*8; # place for A^2 and 2*A my $aap=$r0; my $tp0="%rbx"; my $tp1=$r3; my $tpa=$tmp; $np="%r13"; # reassigned argument $code.=<<___; .text .globl rsaz_1024_sqr_avx2 .type rsaz_1024_sqr_avx2,\@function,5 .align 64 rsaz_1024_sqr_avx2: # 702 
cycles, 14% faster than rsaz_1024_mul_avx2 .cfi_startproc lea (%rsp), %rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 vzeroupper ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp vmovaps %xmm6,-0xd8(%rax) vmovaps %xmm7,-0xc8(%rax) vmovaps %xmm8,-0xb8(%rax) vmovaps %xmm9,-0xa8(%rax) vmovaps %xmm10,-0x98(%rax) vmovaps %xmm11,-0x88(%rax) vmovaps %xmm12,-0x78(%rax) vmovaps %xmm13,-0x68(%rax) vmovaps %xmm14,-0x58(%rax) vmovaps %xmm15,-0x48(%rax) .Lsqr_1024_body: ___ $code.=<<___; mov %rax,%rbp .cfi_def_cfa_register %rbp mov %rdx, $np # reassigned argument sub \$$FrameSize, %rsp mov $np, $tmp sub \$-128, $rp # size optimization sub \$-128, $ap sub \$-128, $np and \$4095, $tmp # see if $np crosses page add \$32*10, $tmp shr \$12, $tmp vpxor $ACC9,$ACC9,$ACC9 jz .Lsqr_1024_no_n_copy # unaligned 256-bit load that crosses page boundary can # cause >2x performance degradation here, so if $np does # cross page boundary, copy it to stack and make sure stack # frame doesn't... sub \$32*10,%rsp vmovdqu 32*0-128($np), $ACC0 and \$-2048, %rsp vmovdqu 32*1-128($np), $ACC1 vmovdqu 32*2-128($np), $ACC2 vmovdqu 32*3-128($np), $ACC3 vmovdqu 32*4-128($np), $ACC4 vmovdqu 32*5-128($np), $ACC5 vmovdqu 32*6-128($np), $ACC6 vmovdqu 32*7-128($np), $ACC7 vmovdqu 32*8-128($np), $ACC8 lea $FrameSize+128(%rsp),$np vmovdqu $ACC0, 32*0-128($np) vmovdqu $ACC1, 32*1-128($np) vmovdqu $ACC2, 32*2-128($np) vmovdqu $ACC3, 32*3-128($np) vmovdqu $ACC4, 32*4-128($np) vmovdqu $ACC5, 32*5-128($np) vmovdqu $ACC6, 32*6-128($np) vmovdqu $ACC7, 32*7-128($np) vmovdqu $ACC8, 32*8-128($np) vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero .Lsqr_1024_no_n_copy: and \$-1024, %rsp vmovdqu 32*1-128($ap), $ACC1 vmovdqu 32*2-128($ap), $ACC2 vmovdqu 32*3-128($ap), $ACC3 vmovdqu 32*4-128($ap), $ACC4 vmovdqu 32*5-128($ap), $ACC5 vmovdqu 32*6-128($ap), $ACC6 vmovdqu 32*7-128($ap), $ACC7 vmovdqu 32*8-128($ap), $ACC8 lea 192(%rsp), $tp0 # 64+128=192 vmovdqu .Land_mask(%rip), $AND_MASK jmp .LOOP_GRANDE_SQR_1024 .align 32 .LOOP_GRANDE_SQR_1024: lea 32*18+128(%rsp), $aap # size optimization lea 448(%rsp), $tp1 # 64+128+256=448 # the squaring is performed as described in Variant B of # "Speeding up Big-Number Squaring", so start by calculating # the A*2=A+A vector vpaddq $ACC1, $ACC1, $ACC1 vpbroadcastq 32*0-128($ap), $B1 vpaddq $ACC2, $ACC2, $ACC2 vmovdqa $ACC1, 32*0-128($aap) vpaddq $ACC3, $ACC3, $ACC3 vmovdqa $ACC2, 32*1-128($aap) vpaddq $ACC4, $ACC4, $ACC4 vmovdqa $ACC3, 32*2-128($aap) vpaddq $ACC5, $ACC5, $ACC5 vmovdqa $ACC4, 32*3-128($aap) vpaddq $ACC6, $ACC6, $ACC6 vmovdqa $ACC5, 32*4-128($aap) vpaddq $ACC7, $ACC7, $ACC7 vmovdqa $ACC6, 32*5-128($aap) vpaddq $ACC8, $ACC8, $ACC8 vmovdqa $ACC7, 32*6-128($aap) vpxor $ACC9, $ACC9, $ACC9 vmovdqa $ACC8, 32*7-128($aap) vpmuludq 32*0-128($ap), $B1, $ACC0 vpbroadcastq 32*1-128($ap), $B2 vmovdqu $ACC9, 32*9-192($tp0) # zero upper half vpmuludq $B1, $ACC1, $ACC1 vmovdqu $ACC9, 32*10-448($tp1) vpmuludq $B1, $ACC2, $ACC2 vmovdqu $ACC9, 32*11-448($tp1) vpmuludq $B1, $ACC3, $ACC3 vmovdqu $ACC9, 32*12-448($tp1) vpmuludq $B1, $ACC4, $ACC4 vmovdqu $ACC9, 32*13-448($tp1) vpmuludq $B1, $ACC5, $ACC5 vmovdqu $ACC9, 32*14-448($tp1) vpmuludq $B1, $ACC6, $ACC6 vmovdqu $ACC9, 32*15-448($tp1) vpmuludq $B1, $ACC7, $ACC7 vmovdqu $ACC9, 32*16-448($tp1) vpmuludq $B1, $ACC8, $ACC8 vpbroadcastq 32*2-128($ap), $B1 vmovdqu $ACC9, 32*17-448($tp1) mov $ap, $tpa mov \$4, $i jmp .Lsqr_entry_1024 ___ $TEMP0=$Y1; 
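The comment above notes that the squaring follows Variant B of the cited paper, which is why the loop first stores an A*2 = A+A copy of the operand into $aap: each cross product a[i]*a[j] with i < j belongs in A^2 twice, so doubling one operand once up front replaces doubling every product. A scalar C model of the same idea over the 36 redundant 29-bit limbs described in the header; this is an illustrative sketch with made-up names, not the vectorized routine itself.

#include <stdint.h>

enum { LIMBS = 36 };   /* 1024 bits in 29-bit limbs: ceil(1024/29) = 36, held in 9 ymm vectors */

/* Reference (non-vector) model of the squaring strategy used above: one
 * operand is doubled once up front ("Variant B"), the diagonal a[i]*a[i] is
 * added separately.  Limbs are assumed < 2^29, so at most 18 doubled cross
 * terms plus one diagonal land in any column and a 64-bit accumulator
 * cannot overflow here. */
static void sqr_redundant(uint64_t r[2 * LIMBS], const uint32_t a[LIMBS])
{
    uint32_t a2[LIMBS];
    for (int i = 0; i < LIMBS; i++) a2[i] = a[i] << 1;      /* the A*2 = A+A vector */
    for (int i = 0; i < 2 * LIMBS; i++) r[i] = 0;

    for (int i = 0; i < LIMBS; i++) {
        r[2 * i] += (uint64_t)a[i] * a[i];                  /* diagonal, added once */
        for (int j = i + 1; j < LIMBS; j++)
            r[i + j] += (uint64_t)a2[j] * a[i];             /* doubled cross term   */
    }
}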
$TEMP2=$Y2; $code.=<<___; .align 32 .LOOP_SQR_1024: vpbroadcastq 32*1-128($tpa), $B2 vpmuludq 32*0-128($ap), $B1, $ACC0 vpaddq 32*0-192($tp0), $ACC0, $ACC0 vpmuludq 32*0-128($aap), $B1, $ACC1 vpaddq 32*1-192($tp0), $ACC1, $ACC1 vpmuludq 32*1-128($aap), $B1, $ACC2 vpaddq 32*2-192($tp0), $ACC2, $ACC2 vpmuludq 32*2-128($aap), $B1, $ACC3 vpaddq 32*3-192($tp0), $ACC3, $ACC3 vpmuludq 32*3-128($aap), $B1, $ACC4 vpaddq 32*4-192($tp0), $ACC4, $ACC4 vpmuludq 32*4-128($aap), $B1, $ACC5 vpaddq 32*5-192($tp0), $ACC5, $ACC5 vpmuludq 32*5-128($aap), $B1, $ACC6 vpaddq 32*6-192($tp0), $ACC6, $ACC6 vpmuludq 32*6-128($aap), $B1, $ACC7 vpaddq 32*7-192($tp0), $ACC7, $ACC7 vpmuludq 32*7-128($aap), $B1, $ACC8 vpbroadcastq 32*2-128($tpa), $B1 vpaddq 32*8-192($tp0), $ACC8, $ACC8 .Lsqr_entry_1024: vmovdqu $ACC0, 32*0-192($tp0) vmovdqu $ACC1, 32*1-192($tp0) vpmuludq 32*1-128($ap), $B2, $TEMP0 vpaddq $TEMP0, $ACC2, $ACC2 vpmuludq 32*1-128($aap), $B2, $TEMP1 vpaddq $TEMP1, $ACC3, $ACC3 vpmuludq 32*2-128($aap), $B2, $TEMP2 vpaddq $TEMP2, $ACC4, $ACC4 vpmuludq 32*3-128($aap), $B2, $TEMP0 vpaddq $TEMP0, $ACC5, $ACC5 vpmuludq 32*4-128($aap), $B2, $TEMP1 vpaddq $TEMP1, $ACC6, $ACC6 vpmuludq 32*5-128($aap), $B2, $TEMP2 vpaddq $TEMP2, $ACC7, $ACC7 vpmuludq 32*6-128($aap), $B2, $TEMP0 vpaddq $TEMP0, $ACC8, $ACC8 vpmuludq 32*7-128($aap), $B2, $ACC0 vpbroadcastq 32*3-128($tpa), $B2 vpaddq 32*9-192($tp0), $ACC0, $ACC0 vmovdqu $ACC2, 32*2-192($tp0) vmovdqu $ACC3, 32*3-192($tp0) vpmuludq 32*2-128($ap), $B1, $TEMP2 vpaddq $TEMP2, $ACC4, $ACC4 vpmuludq 32*2-128($aap), $B1, $TEMP0 vpaddq $TEMP0, $ACC5, $ACC5 vpmuludq 32*3-128($aap), $B1, $TEMP1 vpaddq $TEMP1, $ACC6, $ACC6 vpmuludq 32*4-128($aap), $B1, $TEMP2 vpaddq $TEMP2, $ACC7, $ACC7 vpmuludq 32*5-128($aap), $B1, $TEMP0 vpaddq $TEMP0, $ACC8, $ACC8 vpmuludq 32*6-128($aap), $B1, $TEMP1 vpaddq $TEMP1, $ACC0, $ACC0 vpmuludq 32*7-128($aap), $B1, $ACC1 vpbroadcastq 32*4-128($tpa), $B1 vpaddq 32*10-448($tp1), $ACC1, $ACC1 vmovdqu $ACC4, 32*4-192($tp0) vmovdqu $ACC5, 32*5-192($tp0) vpmuludq 32*3-128($ap), $B2, $TEMP0 vpaddq $TEMP0, $ACC6, $ACC6 vpmuludq 32*3-128($aap), $B2, $TEMP1 vpaddq $TEMP1, $ACC7, $ACC7 vpmuludq 32*4-128($aap), $B2, $TEMP2 vpaddq $TEMP2, $ACC8, $ACC8 vpmuludq 32*5-128($aap), $B2, $TEMP0 vpaddq $TEMP0, $ACC0, $ACC0 vpmuludq 32*6-128($aap), $B2, $TEMP1 vpaddq $TEMP1, $ACC1, $ACC1 vpmuludq 32*7-128($aap), $B2, $ACC2 vpbroadcastq 32*5-128($tpa), $B2 vpaddq 32*11-448($tp1), $ACC2, $ACC2 vmovdqu $ACC6, 32*6-192($tp0) vmovdqu $ACC7, 32*7-192($tp0) vpmuludq 32*4-128($ap), $B1, $TEMP0 vpaddq $TEMP0, $ACC8, $ACC8 vpmuludq 32*4-128($aap), $B1, $TEMP1 vpaddq $TEMP1, $ACC0, $ACC0 vpmuludq 32*5-128($aap), $B1, $TEMP2 vpaddq $TEMP2, $ACC1, $ACC1 vpmuludq 32*6-128($aap), $B1, $TEMP0 vpaddq $TEMP0, $ACC2, $ACC2 vpmuludq 32*7-128($aap), $B1, $ACC3 vpbroadcastq 32*6-128($tpa), $B1 vpaddq 32*12-448($tp1), $ACC3, $ACC3 vmovdqu $ACC8, 32*8-192($tp0) vmovdqu $ACC0, 32*9-192($tp0) lea 8($tp0), $tp0 vpmuludq 32*5-128($ap), $B2, $TEMP2 vpaddq $TEMP2, $ACC1, $ACC1 vpmuludq 32*5-128($aap), $B2, $TEMP0 vpaddq $TEMP0, $ACC2, $ACC2 vpmuludq 32*6-128($aap), $B2, $TEMP1 vpaddq $TEMP1, $ACC3, $ACC3 vpmuludq 32*7-128($aap), $B2, $ACC4 vpbroadcastq 32*7-128($tpa), $B2 vpaddq 32*13-448($tp1), $ACC4, $ACC4 vmovdqu $ACC1, 32*10-448($tp1) vmovdqu $ACC2, 32*11-448($tp1) vpmuludq 32*6-128($ap), $B1, $TEMP0 vpaddq $TEMP0, $ACC3, $ACC3 vpmuludq 32*6-128($aap), $B1, $TEMP1 vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1 vpaddq $TEMP1, $ACC4, $ACC4 vpmuludq 32*7-128($aap), $B1, $ACC5 vpbroadcastq 
32*0+8-128($tpa), $B1 # for next iteration vpaddq 32*14-448($tp1), $ACC5, $ACC5 vmovdqu $ACC3, 32*12-448($tp1) vmovdqu $ACC4, 32*13-448($tp1) lea 8($tpa), $tpa vpmuludq 32*7-128($ap), $B2, $TEMP0 vpaddq $TEMP0, $ACC5, $ACC5 vpmuludq 32*7-128($aap), $B2, $ACC6 vpaddq 32*15-448($tp1), $ACC6, $ACC6 vpmuludq 32*8-128($ap), $ACC0, $ACC7 vmovdqu $ACC5, 32*14-448($tp1) vpaddq 32*16-448($tp1), $ACC7, $ACC7 vmovdqu $ACC6, 32*15-448($tp1) vmovdqu $ACC7, 32*16-448($tp1) lea 8($tp1), $tp1 dec $i jnz .LOOP_SQR_1024 ___ $ZERO = $ACC9; $TEMP0 = $B1; $TEMP2 = $B2; $TEMP3 = $Y1; $TEMP4 = $Y2; $code.=<<___; # we need to fix indices 32-39 to avoid overflow vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0), vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0) vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0) lea 192(%rsp), $tp0 # 64+128=192 vpsrlq \$29, $ACC8, $TEMP1 vpand $AND_MASK, $ACC8, $ACC8 vpsrlq \$29, $ACC1, $TEMP2 vpand $AND_MASK, $ACC1, $ACC1 vpermq \$0x93, $TEMP1, $TEMP1 vpxor $ZERO, $ZERO, $ZERO vpermq \$0x93, $TEMP2, $TEMP2 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC8, $ACC8 vpblendd \$3, $TEMP2, $ZERO, $TEMP2 vpaddq $TEMP1, $ACC1, $ACC1 vpaddq $TEMP2, $ACC2, $ACC2 vmovdqu $ACC1, 32*9-192($tp0) vmovdqu $ACC2, 32*10-192($tp0) mov (%rsp), %rax mov 8(%rsp), $r1 mov 16(%rsp), $r2 mov 24(%rsp), $r3 vmovdqu 32*1(%rsp), $ACC1 vmovdqu 32*2-192($tp0), $ACC2 vmovdqu 32*3-192($tp0), $ACC3 vmovdqu 32*4-192($tp0), $ACC4 vmovdqu 32*5-192($tp0), $ACC5 vmovdqu 32*6-192($tp0), $ACC6 vmovdqu 32*7-192($tp0), $ACC7 mov %rax, $r0 imull $n0, %eax and \$0x1fffffff, %eax vmovd %eax, $Y1 mov %rax, %rdx imulq -128($np), %rax vpbroadcastq $Y1, $Y1 add %rax, $r0 mov %rdx, %rax imulq 8-128($np), %rax shr \$29, $r0 add %rax, $r1 mov %rdx, %rax imulq 16-128($np), %rax add $r0, $r1 add %rax, $r2 imulq 24-128($np), %rdx add %rdx, $r3 mov $r1, %rax imull $n0, %eax and \$0x1fffffff, %eax mov \$9, $i jmp .LOOP_REDUCE_1024 .align 32 .LOOP_REDUCE_1024: vmovd %eax, $Y2 vpbroadcastq $Y2, $Y2 vpmuludq 32*1-128($np), $Y1, $TEMP0 mov %rax, %rdx imulq -128($np), %rax vpaddq $TEMP0, $ACC1, $ACC1 add %rax, $r1 vpmuludq 32*2-128($np), $Y1, $TEMP1 mov %rdx, %rax imulq 8-128($np), %rax vpaddq $TEMP1, $ACC2, $ACC2 vpmuludq 32*3-128($np), $Y1, $TEMP2 .byte 0x67 add %rax, $r2 .byte 0x67 mov %rdx, %rax imulq 16-128($np), %rax shr \$29, $r1 vpaddq $TEMP2, $ACC3, $ACC3 vpmuludq 32*4-128($np), $Y1, $TEMP0 add %rax, $r3 add $r1, $r2 vpaddq $TEMP0, $ACC4, $ACC4 vpmuludq 32*5-128($np), $Y1, $TEMP1 mov $r2, %rax imull $n0, %eax vpaddq $TEMP1, $ACC5, $ACC5 vpmuludq 32*6-128($np), $Y1, $TEMP2 and \$0x1fffffff, %eax vpaddq $TEMP2, $ACC6, $ACC6 vpmuludq 32*7-128($np), $Y1, $TEMP0 vpaddq $TEMP0, $ACC7, $ACC7 vpmuludq 32*8-128($np), $Y1, $TEMP1 vmovd %eax, $Y1 #vmovdqu 32*1-8-128($np), $TEMP2 # moved below vpaddq $TEMP1, $ACC8, $ACC8 #vmovdqu 32*2-8-128($np), $TEMP0 # moved below vpbroadcastq $Y1, $Y1 vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above vmovdqu 32*3-8-128($np), $TEMP1 mov %rax, %rdx imulq -128($np), %rax vpaddq $TEMP2, $ACC1, $ACC1 vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above vmovdqu 32*4-8-128($np), $TEMP2 add %rax, $r2 mov %rdx, %rax imulq 8-128($np), %rax vpaddq $TEMP0, $ACC2, $ACC2 add $r3, %rax shr \$29, $r2 vpmuludq $Y2, $TEMP1, $TEMP1 vmovdqu 32*5-8-128($np), $TEMP0 add $r2, %rax vpaddq $TEMP1, $ACC3, $ACC3 vpmuludq $Y2, $TEMP2, $TEMP2 vmovdqu 32*6-8-128($np), $TEMP1 .byte 0x67 mov %rax, $r3 imull $n0, %eax vpaddq $TEMP2, $ACC4, $ACC4 vpmuludq $Y2, $TEMP0, $TEMP0 .byte 
0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2 and \$0x1fffffff, %eax vpaddq $TEMP0, $ACC5, $ACC5 vpmuludq $Y2, $TEMP1, $TEMP1 vmovdqu 32*8-8-128($np), $TEMP0 vpaddq $TEMP1, $ACC6, $ACC6 vpmuludq $Y2, $TEMP2, $TEMP2 vmovdqu 32*9-8-128($np), $ACC9 vmovd %eax, $ACC0 # borrow ACC0 for Y2 imulq -128($np), %rax vpaddq $TEMP2, $ACC7, $ACC7 vpmuludq $Y2, $TEMP0, $TEMP0 vmovdqu 32*1-16-128($np), $TEMP1 vpbroadcastq $ACC0, $ACC0 vpaddq $TEMP0, $ACC8, $ACC8 vpmuludq $Y2, $ACC9, $ACC9 vmovdqu 32*2-16-128($np), $TEMP2 add %rax, $r3 ___ ($ACC0,$Y2)=($Y2,$ACC0); $code.=<<___; vmovdqu 32*1-24-128($np), $ACC0 vpmuludq $Y1, $TEMP1, $TEMP1 vmovdqu 32*3-16-128($np), $TEMP0 vpaddq $TEMP1, $ACC1, $ACC1 vpmuludq $Y2, $ACC0, $ACC0 vpmuludq $Y1, $TEMP2, $TEMP2 .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1 vpaddq $ACC1, $ACC0, $ACC0 vpaddq $TEMP2, $ACC2, $ACC2 vpmuludq $Y1, $TEMP0, $TEMP0 vmovdqu 32*5-16-128($np), $TEMP2 .byte 0x67 vmovq $ACC0, %rax vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 vpaddq $TEMP0, $ACC3, $ACC3 vpmuludq $Y1, $TEMP1, $TEMP1 vmovdqu 32*6-16-128($np), $TEMP0 vpaddq $TEMP1, $ACC4, $ACC4 vpmuludq $Y1, $TEMP2, $TEMP2 vmovdqu 32*7-16-128($np), $TEMP1 vpaddq $TEMP2, $ACC5, $ACC5 vpmuludq $Y1, $TEMP0, $TEMP0 vmovdqu 32*8-16-128($np), $TEMP2 vpaddq $TEMP0, $ACC6, $ACC6 vpmuludq $Y1, $TEMP1, $TEMP1 shr \$29, $r3 vmovdqu 32*9-16-128($np), $TEMP0 add $r3, %rax vpaddq $TEMP1, $ACC7, $ACC7 vpmuludq $Y1, $TEMP2, $TEMP2 #vmovdqu 32*2-24-128($np), $TEMP1 # moved below mov %rax, $r0 imull $n0, %eax vpaddq $TEMP2, $ACC8, $ACC8 vpmuludq $Y1, $TEMP0, $TEMP0 and \$0x1fffffff, %eax vmovd %eax, $Y1 vmovdqu 32*3-24-128($np), $TEMP2 .byte 0x67 vpaddq $TEMP0, $ACC9, $ACC9 vpbroadcastq $Y1, $Y1 vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above vmovdqu 32*4-24-128($np), $TEMP0 mov %rax, %rdx imulq -128($np), %rax mov 8(%rsp), $r1 vpaddq $TEMP1, $ACC2, $ACC1 vpmuludq $Y2, $TEMP2, $TEMP2 vmovdqu 32*5-24-128($np), $TEMP1 add %rax, $r0 mov %rdx, %rax imulq 8-128($np), %rax .byte 0x67 shr \$29, $r0 mov 16(%rsp), $r2 vpaddq $TEMP2, $ACC3, $ACC2 vpmuludq $Y2, $TEMP0, $TEMP0 vmovdqu 32*6-24-128($np), $TEMP2 add %rax, $r1 mov %rdx, %rax imulq 16-128($np), %rax vpaddq $TEMP0, $ACC4, $ACC3 vpmuludq $Y2, $TEMP1, $TEMP1 vmovdqu 32*7-24-128($np), $TEMP0 imulq 24-128($np), %rdx # future $r3 add %rax, $r2 lea ($r0,$r1), %rax vpaddq $TEMP1, $ACC5, $ACC4 vpmuludq $Y2, $TEMP2, $TEMP2 vmovdqu 32*8-24-128($np), $TEMP1 mov %rax, $r1 imull $n0, %eax vpmuludq $Y2, $TEMP0, $TEMP0 vpaddq $TEMP2, $ACC6, $ACC5 vmovdqu 32*9-24-128($np), $TEMP2 and \$0x1fffffff, %eax vpaddq $TEMP0, $ACC7, $ACC6 vpmuludq $Y2, $TEMP1, $TEMP1 add 24(%rsp), %rdx vpaddq $TEMP1, $ACC8, $ACC7 vpmuludq $Y2, $TEMP2, $TEMP2 vpaddq $TEMP2, $ACC9, $ACC8 vmovq $r3, $ACC9 mov %rdx, $r3 dec $i jnz .LOOP_REDUCE_1024 ___ ($ACC0,$Y2)=($Y2,$ACC0); $code.=<<___; lea 448(%rsp), $tp1 # size optimization vpaddq $ACC9, $Y2, $ACC0 vpxor $ZERO, $ZERO, $ZERO vpaddq 32*9-192($tp0), $ACC0, $ACC0 vpaddq 32*10-448($tp1), $ACC1, $ACC1 vpaddq 32*11-448($tp1), $ACC2, $ACC2 vpaddq 32*12-448($tp1), $ACC3, $ACC3 vpaddq 32*13-448($tp1), $ACC4, $ACC4 vpaddq 32*14-448($tp1), $ACC5, $ACC5 vpaddq 32*15-448($tp1), $ACC6, $ACC6 vpaddq 32*16-448($tp1), $ACC7, $ACC7 vpaddq 32*17-448($tp1), $ACC8, $ACC8 vpsrlq \$29, $ACC0, $TEMP1 vpand $AND_MASK, $ACC0, $ACC0 vpsrlq \$29, $ACC1, $TEMP2 vpand $AND_MASK, $ACC1, $ACC1 vpsrlq \$29, $ACC2, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC2, $ACC2 vpsrlq \$29, $ACC3, $TEMP4 vpermq \$0x93, 
$TEMP2, $TEMP2 vpand $AND_MASK, $ACC3, $ACC3 vpermq \$0x93, $TEMP3, $TEMP3 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC0, $ACC0 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC1, $ACC1 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC2, $ACC2 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 vpaddq $TEMP3, $ACC3, $ACC3 vpaddq $TEMP4, $ACC4, $ACC4 vpsrlq \$29, $ACC0, $TEMP1 vpand $AND_MASK, $ACC0, $ACC0 vpsrlq \$29, $ACC1, $TEMP2 vpand $AND_MASK, $ACC1, $ACC1 vpsrlq \$29, $ACC2, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC2, $ACC2 vpsrlq \$29, $ACC3, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC3, $ACC3 vpermq \$0x93, $TEMP3, $TEMP3 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC0, $ACC0 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC1, $ACC1 vmovdqu $ACC0, 32*0-128($rp) vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC2, $ACC2 vmovdqu $ACC1, 32*1-128($rp) vpblendd \$3, $TEMP4, $ZERO, $TEMP4 vpaddq $TEMP3, $ACC3, $ACC3 vmovdqu $ACC2, 32*2-128($rp) vpaddq $TEMP4, $ACC4, $ACC4 vmovdqu $ACC3, 32*3-128($rp) ___ $TEMP5=$ACC0; $code.=<<___; vpsrlq \$29, $ACC4, $TEMP1 vpand $AND_MASK, $ACC4, $ACC4 vpsrlq \$29, $ACC5, $TEMP2 vpand $AND_MASK, $ACC5, $ACC5 vpsrlq \$29, $ACC6, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC6, $ACC6 vpsrlq \$29, $ACC7, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC7, $ACC7 vpsrlq \$29, $ACC8, $TEMP5 vpermq \$0x93, $TEMP3, $TEMP3 vpand $AND_MASK, $ACC8, $ACC8 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP5, $TEMP5 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC4, $ACC4 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC5, $ACC5 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC6, $ACC6 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 vpaddq $TEMP3, $ACC7, $ACC7 vpaddq $TEMP4, $ACC8, $ACC8 vpsrlq \$29, $ACC4, $TEMP1 vpand $AND_MASK, $ACC4, $ACC4 vpsrlq \$29, $ACC5, $TEMP2 vpand $AND_MASK, $ACC5, $ACC5 vpsrlq \$29, $ACC6, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC6, $ACC6 vpsrlq \$29, $ACC7, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC7, $ACC7 vpsrlq \$29, $ACC8, $TEMP5 vpermq \$0x93, $TEMP3, $TEMP3 vpand $AND_MASK, $ACC8, $ACC8 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP5, $TEMP5 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC4, $ACC4 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC5, $ACC5 vmovdqu $ACC4, 32*4-128($rp) vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC6, $ACC6 vmovdqu $ACC5, 32*5-128($rp) vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 vpaddq $TEMP3, $ACC7, $ACC7 vmovdqu $ACC6, 32*6-128($rp) vpaddq $TEMP4, $ACC8, $ACC8 vmovdqu $ACC7, 32*7-128($rp) vmovdqu $ACC8, 32*8-128($rp) mov $rp, $ap dec $rep jne .LOOP_GRANDE_SQR_1024 vzeroall mov %rbp, %rax .cfi_def_cfa_register %rax ___ $code.=<<___ if ($win64); .Lsqr_1024_in_tail: movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx 
lea (%rax),%rsp # restore %rsp .cfi_def_cfa_register %rsp .Lsqr_1024_epilogue: ret .cfi_endproc .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 ___ } { # void AMM_WW( my $rp="%rdi"; # BN_ULONG *rp, my $ap="%rsi"; # const BN_ULONG *ap, my $bp="%rdx"; # const BN_ULONG *bp, my $np="%rcx"; # const BN_ULONG *np, my $n0="%r8d"; # unsigned int n0); # The registers that hold the accumulated redundant result # The AMM works on 1024 bit operands, and redundant word size is 29 # Therefore: ceil(1024/29)/4 = 9 my $ACC0="%ymm0"; my $ACC1="%ymm1"; my $ACC2="%ymm2"; my $ACC3="%ymm3"; my $ACC4="%ymm4"; my $ACC5="%ymm5"; my $ACC6="%ymm6"; my $ACC7="%ymm7"; my $ACC8="%ymm8"; my $ACC9="%ymm9"; # Registers that hold the broadcasted words of multiplier, currently used my $Bi="%ymm10"; my $Yi="%ymm11"; # Helper registers my $TEMP0=$ACC0; my $TEMP1="%ymm12"; my $TEMP2="%ymm13"; my $ZERO="%ymm14"; my $AND_MASK="%ymm15"; # alu registers that hold the first words of the ACC my $r0="%r9"; my $r1="%r10"; my $r2="%r11"; my $r3="%r12"; my $i="%r14d"; my $tmp="%r15"; $bp="%r13"; # reassigned argument $code.=<<___; .globl rsaz_1024_mul_avx2 .type rsaz_1024_mul_avx2,\@function,5 .align 64 rsaz_1024_mul_avx2: .cfi_startproc lea (%rsp), %rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); vzeroupper lea -0xa8(%rsp),%rsp vmovaps %xmm6,-0xd8(%rax) vmovaps %xmm7,-0xc8(%rax) vmovaps %xmm8,-0xb8(%rax) vmovaps %xmm9,-0xa8(%rax) vmovaps %xmm10,-0x98(%rax) vmovaps %xmm11,-0x88(%rax) vmovaps %xmm12,-0x78(%rax) vmovaps %xmm13,-0x68(%rax) vmovaps %xmm14,-0x58(%rax) vmovaps %xmm15,-0x48(%rax) .Lmul_1024_body: ___ $code.=<<___; mov %rax,%rbp .cfi_def_cfa_register %rbp vzeroall mov %rdx, $bp # reassigned argument sub \$64,%rsp # unaligned 256-bit load that crosses page boundary can # cause severe performance degradation here, so if $ap does # cross page boundary, swap it with $bp [meaning that caller # is advised to lay down $ap and $bp next to each other, so # that only one can cross page boundary]. .byte 0x67,0x67 mov $ap, $tmp and \$4095, $tmp add \$32*10, $tmp shr \$12, $tmp mov $ap, $tmp cmovnz $bp, $ap cmovnz $tmp, $bp mov $np, $tmp sub \$-128,$ap # size optimization sub \$-128,$np sub \$-128,$rp and \$4095, $tmp # see if $np crosses page add \$32*10, $tmp .byte 0x67,0x67 shr \$12, $tmp jz .Lmul_1024_no_n_copy # unaligned 256-bit load that crosses page boundary can # cause severe performance degradation here, so if $np does # cross page boundary, copy it to stack and make sure stack # frame doesn't... 
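The page test used here, and in rsaz_1024_sqr_avx2 above, is just: take the pointer's offset within its 4 KiB page, add the 32*10 bytes the unaligned 256-bit loads will touch, and shift right by 12; a nonzero result selects the copy-to-stack or operand-swap path. A C equivalent of the check (hypothetical helper name, shown only for clarity):

#include <stddef.h>
#include <stdint.h>

/* Nonzero when offset-within-page plus len reaches the next 4 KiB page;
 * this mirrors the "and $4095 / add $32*10 / shr $12" sequence above with
 * len = 32*10 = 320.  Like the assembly, it conservatively flags a region
 * that ends exactly on a page boundary. */
static int crosses_page(const void *p, size_t len)
{
    return (int)((((uintptr_t)p & 4095) + len) >> 12);
}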
sub \$32*10,%rsp vmovdqu 32*0-128($np), $ACC0 and \$-512, %rsp vmovdqu 32*1-128($np), $ACC1 vmovdqu 32*2-128($np), $ACC2 vmovdqu 32*3-128($np), $ACC3 vmovdqu 32*4-128($np), $ACC4 vmovdqu 32*5-128($np), $ACC5 vmovdqu 32*6-128($np), $ACC6 vmovdqu 32*7-128($np), $ACC7 vmovdqu 32*8-128($np), $ACC8 lea 64+128(%rsp),$np vmovdqu $ACC0, 32*0-128($np) vpxor $ACC0, $ACC0, $ACC0 vmovdqu $ACC1, 32*1-128($np) vpxor $ACC1, $ACC1, $ACC1 vmovdqu $ACC2, 32*2-128($np) vpxor $ACC2, $ACC2, $ACC2 vmovdqu $ACC3, 32*3-128($np) vpxor $ACC3, $ACC3, $ACC3 vmovdqu $ACC4, 32*4-128($np) vpxor $ACC4, $ACC4, $ACC4 vmovdqu $ACC5, 32*5-128($np) vpxor $ACC5, $ACC5, $ACC5 vmovdqu $ACC6, 32*6-128($np) vpxor $ACC6, $ACC6, $ACC6 vmovdqu $ACC7, 32*7-128($np) vpxor $ACC7, $ACC7, $ACC7 vmovdqu $ACC8, 32*8-128($np) vmovdqa $ACC0, $ACC8 vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall .Lmul_1024_no_n_copy: and \$-64,%rsp mov ($bp), %rbx vpbroadcastq ($bp), $Bi vmovdqu $ACC0, (%rsp) # clear top of stack xor $r0, $r0 .byte 0x67 xor $r1, $r1 xor $r2, $r2 xor $r3, $r3 vmovdqu .Land_mask(%rip), $AND_MASK mov \$9, $i vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall jmp .Loop_mul_1024 .align 32 .Loop_mul_1024: vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*) mov %rbx, %rax imulq -128($ap), %rax add $r0, %rax mov %rbx, $r1 imulq 8-128($ap), $r1 add 8(%rsp), $r1 mov %rax, $r0 imull $n0, %eax and \$0x1fffffff, %eax mov %rbx, $r2 imulq 16-128($ap), $r2 add 16(%rsp), $r2 mov %rbx, $r3 imulq 24-128($ap), $r3 add 24(%rsp), $r3 vpmuludq 32*1-128($ap),$Bi,$TEMP0 vmovd %eax, $Yi vpaddq $TEMP0,$ACC1,$ACC1 vpmuludq 32*2-128($ap),$Bi,$TEMP1 vpbroadcastq $Yi, $Yi vpaddq $TEMP1,$ACC2,$ACC2 vpmuludq 32*3-128($ap),$Bi,$TEMP2 vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3 vpaddq $TEMP2,$ACC3,$ACC3 vpmuludq 32*4-128($ap),$Bi,$TEMP0 vpaddq $TEMP0,$ACC4,$ACC4 vpmuludq 32*5-128($ap),$Bi,$TEMP1 vpaddq $TEMP1,$ACC5,$ACC5 vpmuludq 32*6-128($ap),$Bi,$TEMP2 vpaddq $TEMP2,$ACC6,$ACC6 vpmuludq 32*7-128($ap),$Bi,$TEMP0 vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3 vpaddq $TEMP0,$ACC7,$ACC7 vpmuludq 32*8-128($ap),$Bi,$TEMP1 vpbroadcastq 8($bp), $Bi vpaddq $TEMP1,$ACC8,$ACC8 mov %rax,%rdx imulq -128($np),%rax add %rax,$r0 mov %rdx,%rax imulq 8-128($np),%rax add %rax,$r1 mov %rdx,%rax imulq 16-128($np),%rax add %rax,$r2 shr \$29, $r0 imulq 24-128($np),%rdx add %rdx,$r3 add $r0, $r1 vpmuludq 32*1-128($np),$Yi,$TEMP2 vmovq $Bi, %rbx vpaddq $TEMP2,$ACC1,$ACC1 vpmuludq 32*2-128($np),$Yi,$TEMP0 vpaddq $TEMP0,$ACC2,$ACC2 vpmuludq 32*3-128($np),$Yi,$TEMP1 vpaddq $TEMP1,$ACC3,$ACC3 vpmuludq 32*4-128($np),$Yi,$TEMP2 vpaddq $TEMP2,$ACC4,$ACC4 vpmuludq 32*5-128($np),$Yi,$TEMP0 vpaddq $TEMP0,$ACC5,$ACC5 vpmuludq 32*6-128($np),$Yi,$TEMP1 vpaddq $TEMP1,$ACC6,$ACC6 vpmuludq 32*7-128($np),$Yi,$TEMP2 vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3 vpaddq $TEMP2,$ACC7,$ACC7 vpmuludq 32*8-128($np),$Yi,$TEMP0 vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3 vpaddq $TEMP0,$ACC8,$ACC8 mov %rbx, %rax imulq -128($ap),%rax add %rax,$r1 vmovdqu -8+32*1-128($ap),$TEMP1 mov %rbx, %rax imulq 8-128($ap),%rax add %rax,$r2 vmovdqu -8+32*2-128($ap),$TEMP2 mov $r1, %rax vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3 imull $n0, %eax vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3 and \$0x1fffffff, %eax imulq 16-128($ap),%rbx add %rbx,$r3 vpmuludq $Bi,$TEMP1,$TEMP1 vmovd %eax, $Yi vmovdqu -8+32*3-128($ap),$TEMP0 vpaddq $TEMP1,$ACC1,$ACC1 vpmuludq $Bi,$TEMP2,$TEMP2 vpbroadcastq $Yi, $Yi vmovdqu -8+32*4-128($ap),$TEMP1 vpaddq $TEMP2,$ACC2,$ACC2 vpmuludq $Bi,$TEMP0,$TEMP0 vmovdqu 
-8+32*5-128($ap),$TEMP2 vpaddq $TEMP0,$ACC3,$ACC3 vpmuludq $Bi,$TEMP1,$TEMP1 vmovdqu -8+32*6-128($ap),$TEMP0 vpaddq $TEMP1,$ACC4,$ACC4 vpmuludq $Bi,$TEMP2,$TEMP2 vmovdqu -8+32*7-128($ap),$TEMP1 vpaddq $TEMP2,$ACC5,$ACC5 vpmuludq $Bi,$TEMP0,$TEMP0 vmovdqu -8+32*8-128($ap),$TEMP2 vpaddq $TEMP0,$ACC6,$ACC6 vpmuludq $Bi,$TEMP1,$TEMP1 vmovdqu -8+32*9-128($ap),$ACC9 vpaddq $TEMP1,$ACC7,$ACC7 vpmuludq $Bi,$TEMP2,$TEMP2 vpaddq $TEMP2,$ACC8,$ACC8 vpmuludq $Bi,$ACC9,$ACC9 vpbroadcastq 16($bp), $Bi mov %rax,%rdx imulq -128($np),%rax add %rax,$r1 vmovdqu -8+32*1-128($np),$TEMP0 mov %rdx,%rax imulq 8-128($np),%rax add %rax,$r2 vmovdqu -8+32*2-128($np),$TEMP1 shr \$29, $r1 imulq 16-128($np),%rdx add %rdx,$r3 add $r1, $r2 vpmuludq $Yi,$TEMP0,$TEMP0 vmovq $Bi, %rbx vmovdqu -8+32*3-128($np),$TEMP2 vpaddq $TEMP0,$ACC1,$ACC1 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu -8+32*4-128($np),$TEMP0 vpaddq $TEMP1,$ACC2,$ACC2 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -8+32*5-128($np),$TEMP1 vpaddq $TEMP2,$ACC3,$ACC3 vpmuludq $Yi,$TEMP0,$TEMP0 vmovdqu -8+32*6-128($np),$TEMP2 vpaddq $TEMP0,$ACC4,$ACC4 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu -8+32*7-128($np),$TEMP0 vpaddq $TEMP1,$ACC5,$ACC5 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -8+32*8-128($np),$TEMP1 vpaddq $TEMP2,$ACC6,$ACC6 vpmuludq $Yi,$TEMP0,$TEMP0 vmovdqu -8+32*9-128($np),$TEMP2 vpaddq $TEMP0,$ACC7,$ACC7 vpmuludq $Yi,$TEMP1,$TEMP1 vpaddq $TEMP1,$ACC8,$ACC8 vpmuludq $Yi,$TEMP2,$TEMP2 vpaddq $TEMP2,$ACC9,$ACC9 vmovdqu -16+32*1-128($ap),$TEMP0 mov %rbx,%rax imulq -128($ap),%rax add $r2,%rax vmovdqu -16+32*2-128($ap),$TEMP1 mov %rax,$r2 imull $n0, %eax and \$0x1fffffff, %eax imulq 8-128($ap),%rbx add %rbx,$r3 vpmuludq $Bi,$TEMP0,$TEMP0 vmovd %eax, $Yi vmovdqu -16+32*3-128($ap),$TEMP2 vpaddq $TEMP0,$ACC1,$ACC1 vpmuludq $Bi,$TEMP1,$TEMP1 vpbroadcastq $Yi, $Yi vmovdqu -16+32*4-128($ap),$TEMP0 vpaddq $TEMP1,$ACC2,$ACC2 vpmuludq $Bi,$TEMP2,$TEMP2 vmovdqu -16+32*5-128($ap),$TEMP1 vpaddq $TEMP2,$ACC3,$ACC3 vpmuludq $Bi,$TEMP0,$TEMP0 vmovdqu -16+32*6-128($ap),$TEMP2 vpaddq $TEMP0,$ACC4,$ACC4 vpmuludq $Bi,$TEMP1,$TEMP1 vmovdqu -16+32*7-128($ap),$TEMP0 vpaddq $TEMP1,$ACC5,$ACC5 vpmuludq $Bi,$TEMP2,$TEMP2 vmovdqu -16+32*8-128($ap),$TEMP1 vpaddq $TEMP2,$ACC6,$ACC6 vpmuludq $Bi,$TEMP0,$TEMP0 vmovdqu -16+32*9-128($ap),$TEMP2 vpaddq $TEMP0,$ACC7,$ACC7 vpmuludq $Bi,$TEMP1,$TEMP1 vpaddq $TEMP1,$ACC8,$ACC8 vpmuludq $Bi,$TEMP2,$TEMP2 vpbroadcastq 24($bp), $Bi vpaddq $TEMP2,$ACC9,$ACC9 vmovdqu -16+32*1-128($np),$TEMP0 mov %rax,%rdx imulq -128($np),%rax add %rax,$r2 vmovdqu -16+32*2-128($np),$TEMP1 imulq 8-128($np),%rdx add %rdx,$r3 shr \$29, $r2 vpmuludq $Yi,$TEMP0,$TEMP0 vmovq $Bi, %rbx vmovdqu -16+32*3-128($np),$TEMP2 vpaddq $TEMP0,$ACC1,$ACC1 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu -16+32*4-128($np),$TEMP0 vpaddq $TEMP1,$ACC2,$ACC2 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -16+32*5-128($np),$TEMP1 vpaddq $TEMP2,$ACC3,$ACC3 vpmuludq $Yi,$TEMP0,$TEMP0 vmovdqu -16+32*6-128($np),$TEMP2 vpaddq $TEMP0,$ACC4,$ACC4 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu -16+32*7-128($np),$TEMP0 vpaddq $TEMP1,$ACC5,$ACC5 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -16+32*8-128($np),$TEMP1 vpaddq $TEMP2,$ACC6,$ACC6 vpmuludq $Yi,$TEMP0,$TEMP0 vmovdqu -16+32*9-128($np),$TEMP2 vpaddq $TEMP0,$ACC7,$ACC7 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu -24+32*1-128($ap),$TEMP0 vpaddq $TEMP1,$ACC8,$ACC8 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -24+32*2-128($ap),$TEMP1 vpaddq $TEMP2,$ACC9,$ACC9 add $r2, $r3 imulq -128($ap),%rbx add %rbx,$r3 mov $r3, %rax imull $n0, %eax and \$0x1fffffff, %eax vpmuludq $Bi,$TEMP0,$TEMP0 vmovd %eax, $Yi vmovdqu -24+32*3-128($ap),$TEMP2 vpaddq 
$TEMP0,$ACC1,$ACC1 vpmuludq $Bi,$TEMP1,$TEMP1 vpbroadcastq $Yi, $Yi vmovdqu -24+32*4-128($ap),$TEMP0 vpaddq $TEMP1,$ACC2,$ACC2 vpmuludq $Bi,$TEMP2,$TEMP2 vmovdqu -24+32*5-128($ap),$TEMP1 vpaddq $TEMP2,$ACC3,$ACC3 vpmuludq $Bi,$TEMP0,$TEMP0 vmovdqu -24+32*6-128($ap),$TEMP2 vpaddq $TEMP0,$ACC4,$ACC4 vpmuludq $Bi,$TEMP1,$TEMP1 vmovdqu -24+32*7-128($ap),$TEMP0 vpaddq $TEMP1,$ACC5,$ACC5 vpmuludq $Bi,$TEMP2,$TEMP2 vmovdqu -24+32*8-128($ap),$TEMP1 vpaddq $TEMP2,$ACC6,$ACC6 vpmuludq $Bi,$TEMP0,$TEMP0 vmovdqu -24+32*9-128($ap),$TEMP2 vpaddq $TEMP0,$ACC7,$ACC7 vpmuludq $Bi,$TEMP1,$TEMP1 vpaddq $TEMP1,$ACC8,$ACC8 vpmuludq $Bi,$TEMP2,$TEMP2 vpbroadcastq 32($bp), $Bi vpaddq $TEMP2,$ACC9,$ACC9 add \$32, $bp # $bp++ vmovdqu -24+32*1-128($np),$TEMP0 imulq -128($np),%rax add %rax,$r3 shr \$29, $r3 vmovdqu -24+32*2-128($np),$TEMP1 vpmuludq $Yi,$TEMP0,$TEMP0 vmovq $Bi, %rbx vmovdqu -24+32*3-128($np),$TEMP2 vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu $ACC0, (%rsp) # transfer $r0-$r3 vpaddq $TEMP1,$ACC2,$ACC1 vmovdqu -24+32*4-128($np),$TEMP0 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -24+32*5-128($np),$TEMP1 vpaddq $TEMP2,$ACC3,$ACC2 vpmuludq $Yi,$TEMP0,$TEMP0 vmovdqu -24+32*6-128($np),$TEMP2 vpaddq $TEMP0,$ACC4,$ACC3 vpmuludq $Yi,$TEMP1,$TEMP1 vmovdqu -24+32*7-128($np),$TEMP0 vpaddq $TEMP1,$ACC5,$ACC4 vpmuludq $Yi,$TEMP2,$TEMP2 vmovdqu -24+32*8-128($np),$TEMP1 vpaddq $TEMP2,$ACC6,$ACC5 vpmuludq $Yi,$TEMP0,$TEMP0 vmovdqu -24+32*9-128($np),$TEMP2 mov $r3, $r0 vpaddq $TEMP0,$ACC7,$ACC6 vpmuludq $Yi,$TEMP1,$TEMP1 add (%rsp), $r0 vpaddq $TEMP1,$ACC8,$ACC7 vpmuludq $Yi,$TEMP2,$TEMP2 vmovq $r3, $TEMP1 vpaddq $TEMP2,$ACC9,$ACC8 dec $i jnz .Loop_mul_1024 ___ # (*) Original implementation was correcting ACC1-ACC3 for overflow # after 7 loop runs, or after 28 iterations, or 56 additions. # But as we underutilize resources, it's possible to correct in # each iteration with marginal performance loss. But then, as # we do it in each iteration, we can correct less digits, and # avoid performance penalties completely. 
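The bound behind this comment can be checked directly: every 64-bit lane accumulates products of 29-bit digits, each below 2^58, so close to 64 of them fit in a lane before it can wrap; the original scheme's 56 additions sit safely under that, and correcting one vector per iteration, as done here, keeps the count far lower. A small self-contained check of that arithmetic (uses the GCC/Clang __int128 extension; illustrative, not part of the module):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const unsigned __int128 max_digit   = ((uint64_t)1 << 29) - 1;
    const unsigned __int128 max_product = max_digit * max_digit;   /* < 2^58 */
    const unsigned __int128 limit       = (unsigned __int128)1 << 64;

    unsigned adds = 0;
    unsigned __int128 acc = max_digit;          /* worst-case initial limb value */
    while (acc + max_product < limit) {         /* count additions that cannot wrap */
        acc += max_product;
        adds++;
    }
    printf("safe accumulations of 29x29-bit products: %u\n", adds);  /* prints 64 */
    assert(adds >= 56);                         /* the original correction point */
    return 0;
}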
$TEMP0 = $ACC9; $TEMP3 = $Bi; $TEMP4 = $Yi; $code.=<<___; vpaddq (%rsp), $TEMP1, $ACC0 vpsrlq \$29, $ACC0, $TEMP1 vpand $AND_MASK, $ACC0, $ACC0 vpsrlq \$29, $ACC1, $TEMP2 vpand $AND_MASK, $ACC1, $ACC1 vpsrlq \$29, $ACC2, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC2, $ACC2 vpsrlq \$29, $ACC3, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC3, $ACC3 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP3, $TEMP3 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpermq \$0x93, $TEMP4, $TEMP4 vpaddq $TEMP0, $ACC0, $ACC0 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC1, $ACC1 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC2, $ACC2 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 vpaddq $TEMP3, $ACC3, $ACC3 vpaddq $TEMP4, $ACC4, $ACC4 vpsrlq \$29, $ACC0, $TEMP1 vpand $AND_MASK, $ACC0, $ACC0 vpsrlq \$29, $ACC1, $TEMP2 vpand $AND_MASK, $ACC1, $ACC1 vpsrlq \$29, $ACC2, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC2, $ACC2 vpsrlq \$29, $ACC3, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC3, $ACC3 vpermq \$0x93, $TEMP3, $TEMP3 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC0, $ACC0 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC1, $ACC1 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC2, $ACC2 vpblendd \$3, $TEMP4, $ZERO, $TEMP4 vpaddq $TEMP3, $ACC3, $ACC3 vpaddq $TEMP4, $ACC4, $ACC4 vmovdqu $ACC0, 0-128($rp) vmovdqu $ACC1, 32-128($rp) vmovdqu $ACC2, 64-128($rp) vmovdqu $ACC3, 96-128($rp) ___ $TEMP5=$ACC0; $code.=<<___; vpsrlq \$29, $ACC4, $TEMP1 vpand $AND_MASK, $ACC4, $ACC4 vpsrlq \$29, $ACC5, $TEMP2 vpand $AND_MASK, $ACC5, $ACC5 vpsrlq \$29, $ACC6, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC6, $ACC6 vpsrlq \$29, $ACC7, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC7, $ACC7 vpsrlq \$29, $ACC8, $TEMP5 vpermq \$0x93, $TEMP3, $TEMP3 vpand $AND_MASK, $ACC8, $ACC8 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP5, $TEMP5 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC4, $ACC4 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC5, $ACC5 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC6, $ACC6 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 vpaddq $TEMP3, $ACC7, $ACC7 vpaddq $TEMP4, $ACC8, $ACC8 vpsrlq \$29, $ACC4, $TEMP1 vpand $AND_MASK, $ACC4, $ACC4 vpsrlq \$29, $ACC5, $TEMP2 vpand $AND_MASK, $ACC5, $ACC5 vpsrlq \$29, $ACC6, $TEMP3 vpermq \$0x93, $TEMP1, $TEMP1 vpand $AND_MASK, $ACC6, $ACC6 vpsrlq \$29, $ACC7, $TEMP4 vpermq \$0x93, $TEMP2, $TEMP2 vpand $AND_MASK, $ACC7, $ACC7 vpsrlq \$29, $ACC8, $TEMP5 vpermq \$0x93, $TEMP3, $TEMP3 vpand $AND_MASK, $ACC8, $ACC8 vpermq \$0x93, $TEMP4, $TEMP4 vpblendd \$3, $ZERO, $TEMP1, $TEMP0 vpermq \$0x93, $TEMP5, $TEMP5 vpblendd \$3, $TEMP1, $TEMP2, $TEMP1 vpaddq $TEMP0, $ACC4, $ACC4 vpblendd \$3, $TEMP2, $TEMP3, $TEMP2 vpaddq $TEMP1, $ACC5, $ACC5 vpblendd \$3, $TEMP3, $TEMP4, $TEMP3 vpaddq $TEMP2, $ACC6, $ACC6 vpblendd \$3, $TEMP4, $TEMP5, $TEMP4 vpaddq $TEMP3, $ACC7, $ACC7 vpaddq $TEMP4, $ACC8, $ACC8 vmovdqu $ACC4, 128-128($rp) vmovdqu $ACC5, 160-128($rp) vmovdqu $ACC6, 192-128($rp) vmovdqu $ACC7, 224-128($rp) vmovdqu $ACC8, 256-128($rp) vzeroupper mov %rbp, %rax .cfi_def_cfa_register %rax ___ $code.=<<___ if ($win64); .Lmul_1024_in_tail: movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps 
-0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp # restore %rsp .cfi_def_cfa_register %rsp .Lmul_1024_epilogue: ret .cfi_endproc .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 ___ } { my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi"); my @T = map("%r$_",(8..11)); $code.=<<___; .globl rsaz_1024_red2norm_avx2 .type rsaz_1024_red2norm_avx2,\@abi-omnipotent .align 32 rsaz_1024_red2norm_avx2: .cfi_startproc sub \$-128,$inp # size optimization xor %rax,%rax ___ for ($j=0,$i=0; $i<16; $i++) { my $k=0; while (29*$j<64*($i+1)) { # load data till boundary $code.=" mov `8*$j-128`($inp), @T[0]\n"; $j++; $k++; push(@T,shift(@T)); } $l=$k; while ($k>1) { # shift loaded data but last value $code.=" shl \$`29*($j-$k)`,@T[-$k]\n"; $k--; } $code.=<<___; # shift last value mov @T[-1], @T[0] shl \$`29*($j-1)`, @T[-1] shr \$`-29*($j-1)`, @T[0] ___ while ($l) { # accumulate all values $code.=" add @T[-$l], %rax\n"; $l--; } $code.=<<___; adc \$0, @T[0] # consume eventual carry mov %rax, 8*$i($out) mov @T[0], %rax ___ push(@T,shift(@T)); } $code.=<<___; ret .cfi_endproc .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 .globl rsaz_1024_norm2red_avx2 .type rsaz_1024_norm2red_avx2,\@abi-omnipotent .align 32 rsaz_1024_norm2red_avx2: .cfi_startproc sub \$-128,$out # size optimization mov ($inp),@T[0] mov \$0x1fffffff,%eax ___ for ($j=0,$i=0; $i<16; $i++) { $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15); $code.=" xor @T[1],@T[1]\n" if ($i==15); my $k=1; while (29*($j+1)<64*($i+1)) { $code.=<<___; mov @T[0],@T[-$k] shr \$`29*$j`,@T[-$k] and %rax,@T[-$k] # &0x1fffffff mov @T[-$k],`8*$j-128`($out) ___ $j++; $k++; } $code.=<<___; shrd \$`29*$j`,@T[1],@T[0] and %rax,@T[0] mov @T[0],`8*$j-128`($out) ___ $j++; push(@T,shift(@T)); } $code.=<<___; mov @T[0],`8*$j-128`($out) # zero mov @T[0],`8*($j+1)-128`($out) mov @T[0],`8*($j+2)-128`($out) mov @T[0],`8*($j+3)-128`($out) ret .cfi_endproc .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 ___ } { my ($out,$inp,$power) = $win64 ? 
("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); $code.=<<___; .globl rsaz_1024_scatter5_avx2 .type rsaz_1024_scatter5_avx2,\@abi-omnipotent .align 32 rsaz_1024_scatter5_avx2: .cfi_startproc vzeroupper vmovdqu .Lscatter_permd(%rip),%ymm5 shl \$4,$power lea ($out,$power),$out mov \$9,%eax jmp .Loop_scatter_1024 .align 32 .Loop_scatter_1024: vmovdqu ($inp),%ymm0 lea 32($inp),$inp vpermd %ymm0,%ymm5,%ymm0 vmovdqu %xmm0,($out) lea 16*32($out),$out dec %eax jnz .Loop_scatter_1024 vzeroupper ret .cfi_endproc .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 .globl rsaz_1024_gather5_avx2 .type rsaz_1024_gather5_avx2,\@abi-omnipotent .align 32 rsaz_1024_gather5_avx2: .cfi_startproc vzeroupper mov %rsp,%r11 .cfi_def_cfa_register %r11 ___ $code.=<<___ if ($win64); lea -0x88(%rsp),%rax .LSEH_begin_rsaz_1024_gather5: # I can't trust assembler to use specific encoding:-( .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax) .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax) .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax) .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax) .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax) .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax) .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax) .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax) .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax) .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax) ___ $code.=<<___; lea -0x100(%rsp),%rsp and \$-32, %rsp lea .Linc(%rip), %r10 lea -128(%rsp),%rax # control u-op density vmovd $power, %xmm4 vmovdqa (%r10),%ymm0 vmovdqa 32(%r10),%ymm1 vmovdqa 64(%r10),%ymm5 vpbroadcastd %xmm4,%ymm4 vpaddd %ymm5, %ymm0, %ymm2 vpcmpeqd %ymm4, %ymm0, %ymm0 vpaddd %ymm5, %ymm1, %ymm3 vpcmpeqd %ymm4, %ymm1, %ymm1 vmovdqa %ymm0, 32*0+128(%rax) vpaddd %ymm5, %ymm2, %ymm0 vpcmpeqd %ymm4, %ymm2, %ymm2 vmovdqa %ymm1, 32*1+128(%rax) vpaddd %ymm5, %ymm3, %ymm1 vpcmpeqd %ymm4, %ymm3, %ymm3 vmovdqa %ymm2, 32*2+128(%rax) vpaddd %ymm5, %ymm0, %ymm2 vpcmpeqd %ymm4, %ymm0, %ymm0 vmovdqa %ymm3, 32*3+128(%rax) vpaddd %ymm5, %ymm1, %ymm3 vpcmpeqd %ymm4, %ymm1, %ymm1 vmovdqa %ymm0, 32*4+128(%rax) vpaddd %ymm5, %ymm2, %ymm8 vpcmpeqd %ymm4, %ymm2, %ymm2 vmovdqa %ymm1, 32*5+128(%rax) vpaddd %ymm5, %ymm3, %ymm9 vpcmpeqd %ymm4, %ymm3, %ymm3 vmovdqa %ymm2, 32*6+128(%rax) vpaddd %ymm5, %ymm8, %ymm10 vpcmpeqd %ymm4, %ymm8, %ymm8 vmovdqa %ymm3, 32*7+128(%rax) vpaddd %ymm5, %ymm9, %ymm11 vpcmpeqd %ymm4, %ymm9, %ymm9 vpaddd %ymm5, %ymm10, %ymm12 vpcmpeqd %ymm4, %ymm10, %ymm10 vpaddd %ymm5, %ymm11, %ymm13 vpcmpeqd %ymm4, %ymm11, %ymm11 vpaddd %ymm5, %ymm12, %ymm14 vpcmpeqd %ymm4, %ymm12, %ymm12 vpaddd %ymm5, %ymm13, %ymm15 vpcmpeqd %ymm4, %ymm13, %ymm13 vpcmpeqd %ymm4, %ymm14, %ymm14 vpcmpeqd %ymm4, %ymm15, %ymm15 vmovdqa -32(%r10),%ymm7 # .Lgather_permd lea 128($inp), $inp mov \$9,$power .Loop_gather_1024: vmovdqa 32*0-128($inp), %ymm0 vmovdqa 32*1-128($inp), %ymm1 vmovdqa 32*2-128($inp), %ymm2 vmovdqa 32*3-128($inp), %ymm3 vpand 32*0+128(%rax), %ymm0, %ymm0 vpand 32*1+128(%rax), %ymm1, %ymm1 vpand 32*2+128(%rax), %ymm2, %ymm2 vpor %ymm0, %ymm1, %ymm4 vpand 32*3+128(%rax), %ymm3, %ymm3 vmovdqa 32*4-128($inp), %ymm0 vmovdqa 32*5-128($inp), %ymm1 vpor %ymm2, %ymm3, %ymm5 vmovdqa 32*6-128($inp), %ymm2 vmovdqa 32*7-128($inp), %ymm3 vpand 32*4+128(%rax), %ymm0, %ymm0 vpand 32*5+128(%rax), %ymm1, %ymm1 vpand 32*6+128(%rax), %ymm2, %ymm2 vpor %ymm0, %ymm4, %ymm4 vpand 32*7+128(%rax), %ymm3, %ymm3 vpand 
32*8-128($inp), %ymm8, %ymm0 vpor %ymm1, %ymm5, %ymm5 vpand 32*9-128($inp), %ymm9, %ymm1 vpor %ymm2, %ymm4, %ymm4 vpand 32*10-128($inp),%ymm10, %ymm2 vpor %ymm3, %ymm5, %ymm5 vpand 32*11-128($inp),%ymm11, %ymm3 vpor %ymm0, %ymm4, %ymm4 vpand 32*12-128($inp),%ymm12, %ymm0 vpor %ymm1, %ymm5, %ymm5 vpand 32*13-128($inp),%ymm13, %ymm1 vpor %ymm2, %ymm4, %ymm4 vpand 32*14-128($inp),%ymm14, %ymm2 vpor %ymm3, %ymm5, %ymm5 vpand 32*15-128($inp),%ymm15, %ymm3 lea 32*16($inp), $inp vpor %ymm0, %ymm4, %ymm4 vpor %ymm1, %ymm5, %ymm5 vpor %ymm2, %ymm4, %ymm4 vpor %ymm3, %ymm5, %ymm5 vpor %ymm5, %ymm4, %ymm4 vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared vpor %xmm4, %xmm5, %xmm5 vpermd %ymm5,%ymm7,%ymm5 vmovdqu %ymm5,($out) lea 32($out),$out dec $power jnz .Loop_gather_1024 vpxor %ymm0,%ymm0,%ymm0 vmovdqu %ymm0,($out) vzeroupper ___ $code.=<<___ if ($win64); movaps -0xa8(%r11),%xmm6 movaps -0x98(%r11),%xmm7 movaps -0x88(%r11),%xmm8 movaps -0x78(%r11),%xmm9 movaps -0x68(%r11),%xmm10 movaps -0x58(%r11),%xmm11 movaps -0x48(%r11),%xmm12 movaps -0x38(%r11),%xmm13 movaps -0x28(%r11),%xmm14 movaps -0x18(%r11),%xmm15 ___ $code.=<<___; lea (%r11),%rsp .cfi_def_cfa_register %rsp ret .cfi_endproc .LSEH_end_rsaz_1024_gather5: .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 ___ } $code.=<<___; .extern OPENSSL_ia32cap_P .globl rsaz_avx2_eligible .type rsaz_avx2_eligible,\@abi-omnipotent .align 32 rsaz_avx2_eligible: mov OPENSSL_ia32cap_P+8(%rip),%eax ___ $code.=<<___ if ($addx); mov \$`1<<8|1<<19`,%ecx mov \$0,%edx and %eax,%ecx cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X cmove %edx,%eax ___ $code.=<<___; and \$`1<<5`,%eax shr \$5,%eax ret .size rsaz_avx2_eligible,.-rsaz_avx2_eligible .align 64 .Land_mask: .quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff .Lscatter_permd: .long 0,2,4,6,7,7,7,7 .Lgather_permd: .long 0,7,1,7,2,7,3,7 .Linc: .long 0,0,0,0, 1,1,1,1 .long 2,2,2,2, 3,3,3,3 .long 4,4,4,4, 4,4,4,4 .align 64 ___ if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___ .extern __imp_RtlVirtualUnwind .type rsaz_se_handler,\@abi-omnipotent .align 16 rsaz_se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->Rip<prologue label jb .Lcommon_seh_tail mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 160($context),%rbp # pull context->Rbp mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # "in tail" label cmp %r10,%rbx # context->Rip>="in tail" label cmovc %rbp,%rax mov -48(%rax),%r15 mov -40(%rax),%r14 mov -32(%rax),%r13 mov -24(%rax),%r12 mov -16(%rax),%rbp mov -8(%rax),%rbx mov %r15,240($context) mov %r14,232($context) mov %r13,224($context) mov %r12,216($context) mov %rbp,160($context) mov %rbx,144($context) lea -0xd8(%rax),%rsi # %xmm save area lea 512($context),%rdi # & context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8
# arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size rsaz_se_handler,.-rsaz_se_handler .section .pdata .align 4 .rva .LSEH_begin_rsaz_1024_sqr_avx2 .rva .LSEH_end_rsaz_1024_sqr_avx2 .rva .LSEH_info_rsaz_1024_sqr_avx2 .rva .LSEH_begin_rsaz_1024_mul_avx2 .rva .LSEH_end_rsaz_1024_mul_avx2 .rva .LSEH_info_rsaz_1024_mul_avx2 .rva .LSEH_begin_rsaz_1024_gather5 .rva .LSEH_end_rsaz_1024_gather5 .rva .LSEH_info_rsaz_1024_gather5 .section .xdata .align 8 .LSEH_info_rsaz_1024_sqr_avx2: .byte 9,0,0,0 .rva rsaz_se_handler .rva .Lsqr_1024_body,.Lsqr_1024_epilogue,.Lsqr_1024_in_tail .long 0 .LSEH_info_rsaz_1024_mul_avx2: .byte 9,0,0,0 .rva rsaz_se_handler .rva .Lmul_1024_body,.Lmul_1024_epilogue,.Lmul_1024_in_tail .long 0 .LSEH_info_rsaz_1024_gather5: .byte 0x01,0x36,0x17,0x0b .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 ___ } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; print $_,"\n"; } }}} else {{{ print <<___; # assembler is too old .text .globl rsaz_avx2_eligible .type rsaz_avx2_eligible,\@abi-omnipotent rsaz_avx2_eligible: xor %eax,%eax ret .size rsaz_avx2_eligible,.-rsaz_avx2_eligible .globl rsaz_1024_sqr_avx2 .globl rsaz_1024_mul_avx2 .globl rsaz_1024_norm2red_avx2 .globl rsaz_1024_red2norm_avx2 .globl rsaz_1024_scatter5_avx2 .globl rsaz_1024_gather5_avx2 .type rsaz_1024_sqr_avx2,\@abi-omnipotent rsaz_1024_sqr_avx2: rsaz_1024_mul_avx2: rsaz_1024_norm2red_avx2: rsaz_1024_red2norm_avx2: rsaz_1024_scatter5_avx2: rsaz_1024_gather5_avx2: .byte 0x0f,0x0b # ud2 ret .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 ___ }}} close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/bn/asm/rsaz-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/bn/asm/rsaz-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/bn/asm/rsaz-x86_64.pl (revision 364963) @@ -1,2431 +1,2431 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # Copyright (c) 2012, Intel Corporation. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. 
You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) # (1) Intel Corporation, Israel Development Center, Haifa, Israel # (2) University of Haifa, Israel # # References: # [1] S. Gueron, "Efficient Software Implementations of Modular # Exponentiation", http://eprint.iacr.org/2011/239 # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". # IEEE Proceedings of 9th International Conference on Information # Technology: New Generations (ITNG 2012), 821-823 (2012). # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation # Journal of Cryptographic Engineering 2:31-43 (2012). # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis # resistant 512-bit and 1024-bit modular exponentiation for optimizing # RSA1024 and RSA2048 on x86_64 platforms", # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest # # While original submission covers 512- and 1024-bit exponentiation, # this module is limited to 512-bit version only (and as such # accelerates RSA1024 sign). This is because improvement for longer # keys is not high enough to justify the effort, highest measured # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming # for the moment of this writing!] Nor does this module implement # "monolithic" complete exponentiation jumbo-subroutine, but adheres # to more modular mixture of C and assembly. And it's optimized even # for processors other than Intel Core family (see table below for # improvement coefficients). # # # RSA1024 sign/sec this/original |this/rsax(*) this/fips(*) # ----------------+--------------------------- # Opteron +13% |+5% +20% # Bulldozer -0% |-1% +10% # P4 +11% |+7% +8% # Westmere +5% |+14% +17% # Sandy Bridge +2% |+12% +29% # Ivy Bridge +1% |+11% +35% # Haswell(**) -0% |+12% +39% # Atom +13% |+11% +4% # VIA Nano +70% |+9% +25% # # (*) rsax engine and fips numbers are presented for reference # purposes; # (**) MULX was attempted, but found to give only marginal improvement; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.23); } if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.10); } if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $addx = ($ver>=3.03); } ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API { my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d"); $code.=<<___; .text .extern OPENSSL_ia32cap_P .globl rsaz_512_sqr .type rsaz_512_sqr,\@function,5 .align 32 rsaz_512_sqr: # 25-29% faster than rsaz_512_mul 
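				# The body below is a schoolbook 8x8-limb squaring: every
				# off-diagonal 64x64-bit product is computed once and doubled
				# (the "<< 1" steps), the diagonal squares are folded in, and
				# the 1024-bit result is brought back to 512 bits with
				# __rsaz_512_reduce and a final __rsaz_512_subtract.  The whole
				# sequence is repeated for the iteration count passed by the
				# caller (the 5th argument).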
.cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 subq \$128+24, %rsp .cfi_adjust_cfa_offset 128+24 .Lsqr_body: movq $mod, %xmm1 # common off-load movq ($inp), %rdx movq 8($inp), %rax movq $n0, 128(%rsp) ___ $code.=<<___ if ($addx); movl \$0x80100,%r11d andl OPENSSL_ia32cap_P+8(%rip),%r11d cmpl \$0x80100,%r11d # check for MULX and ADO/CX je .Loop_sqrx ___ $code.=<<___; jmp .Loop_sqr .align 32 .Loop_sqr: movl $times,128+8(%rsp) #first iteration movq %rdx, %rbx # 0($inp) mov %rax, %rbp # 8($inp) mulq %rdx movq %rax, %r8 movq 16($inp), %rax movq %rdx, %r9 mulq %rbx addq %rax, %r9 movq 24($inp), %rax movq %rdx, %r10 adcq \$0, %r10 mulq %rbx addq %rax, %r10 movq 32($inp), %rax movq %rdx, %r11 adcq \$0, %r11 mulq %rbx addq %rax, %r11 movq 40($inp), %rax movq %rdx, %r12 adcq \$0, %r12 mulq %rbx addq %rax, %r12 movq 48($inp), %rax movq %rdx, %r13 adcq \$0, %r13 mulq %rbx addq %rax, %r13 movq 56($inp), %rax movq %rdx, %r14 adcq \$0, %r14 mulq %rbx addq %rax, %r14 movq %rbx, %rax adcq \$0, %rdx xorq %rcx,%rcx # rcx:r8 = r8 << 1 addq %r8, %r8 movq %rdx, %r15 adcq \$0, %rcx mulq %rax addq %r8, %rdx adcq \$0, %rcx movq %rax, (%rsp) movq %rdx, 8(%rsp) #second iteration movq 16($inp), %rax mulq %rbp addq %rax, %r10 movq 24($inp), %rax movq %rdx, %rbx adcq \$0, %rbx mulq %rbp addq %rax, %r11 movq 32($inp), %rax adcq \$0, %rdx addq %rbx, %r11 movq %rdx, %rbx adcq \$0, %rbx mulq %rbp addq %rax, %r12 movq 40($inp), %rax adcq \$0, %rdx addq %rbx, %r12 movq %rdx, %rbx adcq \$0, %rbx mulq %rbp addq %rax, %r13 movq 48($inp), %rax adcq \$0, %rdx addq %rbx, %r13 movq %rdx, %rbx adcq \$0, %rbx mulq %rbp addq %rax, %r14 movq 56($inp), %rax adcq \$0, %rdx addq %rbx, %r14 movq %rdx, %rbx adcq \$0, %rbx mulq %rbp addq %rax, %r15 movq %rbp, %rax adcq \$0, %rdx addq %rbx, %r15 adcq \$0, %rdx xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1 addq %r9, %r9 movq %rdx, %r8 adcq %r10, %r10 adcq \$0, %rbx mulq %rax # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rcx, %rax movq 16($inp), %rbp addq %rax, %r9 movq 24($inp), %rax adcq %rdx, %r10 adcq \$0, %rbx movq %r9, 16(%rsp) movq %r10, 24(%rsp) #third iteration mulq %rbp addq %rax, %r12 movq 32($inp), %rax movq %rdx, %rcx adcq \$0, %rcx mulq %rbp addq %rax, %r13 movq 40($inp), %rax adcq \$0, %rdx addq %rcx, %r13 movq %rdx, %rcx adcq \$0, %rcx mulq %rbp addq %rax, %r14 movq 48($inp), %rax adcq \$0, %rdx addq %rcx, %r14 movq %rdx, %rcx adcq \$0, %rcx mulq %rbp addq %rax, %r15 movq 56($inp), %rax adcq \$0, %rdx addq %rcx, %r15 movq %rdx, %rcx adcq \$0, %rcx mulq %rbp addq %rax, %r8 movq %rbp, %rax adcq \$0, %rdx addq %rcx, %r8 adcq \$0, %rdx xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1 addq %r11, %r11 movq %rdx, %r9 adcq %r12, %r12 adcq \$0, %rcx mulq %rax # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rbx, %rax movq 24($inp), %r10 addq %rax, %r11 movq 32($inp), %rax adcq %rdx, %r12 adcq \$0, %rcx movq %r11, 32(%rsp) movq %r12, 40(%rsp) #fourth iteration mov %rax, %r11 # 32($inp) mulq %r10 addq %rax, %r14 movq 40($inp), %rax movq %rdx, %rbx adcq \$0, %rbx mov %rax, %r12 # 40($inp) mulq %r10 addq %rax, %r15 movq 48($inp), %rax adcq \$0, %rdx addq %rbx, %r15 movq %rdx, %rbx adcq \$0, %rbx mov %rax, %rbp # 48($inp) mulq %r10 addq %rax, %r8 movq 56($inp), %rax adcq \$0, %rdx addq %rbx, %r8 movq %rdx, %rbx adcq \$0, %rbx mulq %r10 addq %rax, %r9 movq %r10, %rax adcq \$0, %rdx addq %rbx, %r9 adcq \$0, %rdx xorq %rbx, %rbx # rbx:r13:r14 = 
r13:r14 << 1 addq %r13, %r13 movq %rdx, %r10 adcq %r14, %r14 adcq \$0, %rbx mulq %rax # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rcx, %rax addq %rax, %r13 movq %r12, %rax # 40($inp) adcq %rdx, %r14 adcq \$0, %rbx movq %r13, 48(%rsp) movq %r14, 56(%rsp) #fifth iteration mulq %r11 addq %rax, %r8 movq %rbp, %rax # 48($inp) movq %rdx, %rcx adcq \$0, %rcx mulq %r11 addq %rax, %r9 movq 56($inp), %rax adcq \$0, %rdx addq %rcx, %r9 movq %rdx, %rcx adcq \$0, %rcx mov %rax, %r14 # 56($inp) mulq %r11 addq %rax, %r10 movq %r11, %rax adcq \$0, %rdx addq %rcx, %r10 adcq \$0, %rdx xorq %rcx, %rcx # rcx:r8:r15 = r8:r15 << 1 addq %r15, %r15 movq %rdx, %r11 adcq %r8, %r8 adcq \$0, %rcx mulq %rax # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rbx, %rax addq %rax, %r15 movq %rbp, %rax # 48($inp) adcq %rdx, %r8 adcq \$0, %rcx movq %r15, 64(%rsp) movq %r8, 72(%rsp) #sixth iteration mulq %r12 addq %rax, %r10 movq %r14, %rax # 56($inp) movq %rdx, %rbx adcq \$0, %rbx mulq %r12 addq %rax, %r11 movq %r12, %rax adcq \$0, %rdx addq %rbx, %r11 adcq \$0, %rdx xorq %rbx, %rbx # rbx:r10:r9 = r10:r9 << 1 addq %r9, %r9 movq %rdx, %r12 adcq %r10, %r10 adcq \$0, %rbx mulq %rax # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rcx, %rax addq %rax, %r9 movq %r14, %rax # 56($inp) adcq %rdx, %r10 adcq \$0, %rbx movq %r9, 80(%rsp) movq %r10, 88(%rsp) #seventh iteration mulq %rbp addq %rax, %r12 movq %rbp, %rax adcq \$0, %rdx xorq %rcx, %rcx # rcx:r12:r11 = r12:r11 << 1 addq %r11, %r11 movq %rdx, %r13 adcq %r12, %r12 adcq \$0, %rcx mulq %rax # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rbx, %rax addq %rax, %r11 movq %r14, %rax # 56($inp) adcq %rdx, %r12 adcq \$0, %rcx movq %r11, 96(%rsp) movq %r12, 104(%rsp) #eighth iteration xorq %rbx, %rbx # rbx:r13 = r13 << 1 addq %r13, %r13 adcq \$0, %rbx mulq %rax # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here addq %rcx, %rax addq %r13, %rax adcq %rbx, %rdx movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 movq %xmm1, %rbp movq %rax, 112(%rsp) movq %rdx, 120(%rsp) call __rsaz_512_reduce addq 64(%rsp), %r8 adcq 72(%rsp), %r9 adcq 80(%rsp), %r10 adcq 88(%rsp), %r11 adcq 96(%rsp), %r12 adcq 104(%rsp), %r13 adcq 112(%rsp), %r14 adcq 120(%rsp), %r15 sbbq %rcx, %rcx call __rsaz_512_subtract movq %r8, %rdx movq %r9, %rax movl 128+8(%rsp), $times movq $out, $inp decl $times jnz .Loop_sqr ___ if ($addx) { $code.=<<___; jmp .Lsqr_tail .align 32 .Loop_sqrx: movl $times,128+8(%rsp) movq $out, %xmm0 # off-load #first iteration mulx %rax, %r8, %r9 mov %rax, %rbx mulx 16($inp), %rcx, %r10 xor %rbp, %rbp # cf=0, of=0 mulx 24($inp), %rax, %r11 adcx %rcx, %r9 .byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($inp), %rcx, %r12 adcx %rax, %r10 .byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 # mulx 40($inp), %rax, %r13 adcx %rcx, %r11 mulx 48($inp), %rcx, %r14 adcx %rax, %r12 adcx %rcx, %r13 mulx 56($inp), %rax, %r15 adcx %rax, %r14 adcx %rbp, %r15 # %rbp is 0 mulx %rdx, %rax, $out mov %rbx, %rdx # 8($inp) xor %rcx, %rcx adox %r8, %r8 adcx $out, %r8 adox %rbp, %rcx adcx %rbp, %rcx mov %rax, (%rsp) mov %r8, 8(%rsp) #second iteration .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 # mulx 16($inp), %rax, %rbx adox %rax, %r10 adcx %rbx, %r11 mulx 24($inp), $out, %r8 adox $out, %r11 .byte 0x66 adcx %r8, %r12 mulx 32($inp), %rax, %rbx adox %rax, %r12 adcx %rbx, %r13 mulx 40($inp), $out, %r8 adox $out, 
%r13 adcx %r8, %r14 .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx adox %rax, %r14 adcx %rbx, %r15 .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8 adox $out, %r15 adcx %rbp, %r8 mulx %rdx, %rax, $out adox %rbp, %r8 .byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 # mov 16($inp), %rdx xor %rbx, %rbx adox %r9, %r9 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rcx, %rax adox %r10, %r10 adcx %rax, %r9 adox %rbp, %rbx adcx $out, %r10 adcx %rbp, %rbx mov %r9, 16(%rsp) .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp) #third iteration mulx 24($inp), $out, %r9 adox $out, %r12 adcx %r9, %r13 mulx 32($inp), %rax, %rcx adox %rax, %r13 adcx %rcx, %r14 .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r9 adox $out, %r14 adcx %r9, %r15 .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx adox %rax, %r15 adcx %rcx, %r8 mulx 56($inp), $out, %r9 adox $out, %r8 adcx %rbp, %r9 mulx %rdx, %rax, $out adox %rbp, %r9 mov 24($inp), %rdx xor %rcx, %rcx adox %r11, %r11 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rbx, %rax adox %r12, %r12 adcx %rax, %r11 adox %rbp, %rcx adcx $out, %r12 adcx %rbp, %rcx mov %r11, 32(%rsp) mov %r12, 40(%rsp) #fourth iteration mulx 32($inp), %rax, %rbx adox %rax, %r14 adcx %rbx, %r15 mulx 40($inp), $out, %r10 adox $out, %r15 adcx %r10, %r8 mulx 48($inp), %rax, %rbx adox %rax, %r8 adcx %rbx, %r9 mulx 56($inp), $out, %r10 adox $out, %r9 adcx %rbp, %r10 mulx %rdx, %rax, $out adox %rbp, %r10 mov 32($inp), %rdx xor %rbx, %rbx adox %r13, %r13 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rcx, %rax adox %r14, %r14 adcx %rax, %r13 adox %rbp, %rbx adcx $out, %r14 adcx %rbp, %rbx mov %r13, 48(%rsp) mov %r14, 56(%rsp) #fifth iteration mulx 40($inp), $out, %r11 adox $out, %r8 adcx %r11, %r9 mulx 48($inp), %rax, %rcx adox %rax, %r9 adcx %rcx, %r10 mulx 56($inp), $out, %r11 adox $out, %r10 adcx %rbp, %r11 mulx %rdx, %rax, $out mov 40($inp), %rdx adox %rbp, %r11 xor %rcx, %rcx adox %r15, %r15 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rbx, %rax adox %r8, %r8 adcx %rax, %r15 adox %rbp, %rcx adcx $out, %r8 adcx %rbp, %rcx mov %r15, 64(%rsp) mov %r8, 72(%rsp) #sixth iteration .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx adox %rax, %r10 adcx %rbx, %r11 .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12 adox $out, %r11 adcx %rbp, %r12 mulx %rdx, %rax, $out adox %rbp, %r12 mov 48($inp), %rdx xor %rbx, %rbx adox %r9, %r9 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rcx, %rax adox %r10, %r10 adcx %rax, %r9 adcx $out, %r10 adox %rbp, %rbx adcx %rbp, %rbx mov %r9, 80(%rsp) mov %r10, 88(%rsp) #seventh iteration .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13 adox %rax, %r12 adox %rbp, %r13 mulx %rdx, %rax, $out xor %rcx, %rcx mov 56($inp), %rdx adox %r11, %r11 # rbx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rbx, %rax adox %r12, %r12 adcx %rax, %r11 adox %rbp, %rcx adcx $out, %r12 adcx %rbp, %rcx .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp) .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp) #eighth iteration mulx %rdx, %rax, %rdx xor %rbx, %rbx adox %r13, %r13 # rcx <= 2 and rax <= 0xFFFF..F9, so carry must be zero here adcx %rcx, %rax adox %rbp, %rbx adcx %r13, %rax adcx %rdx, %rbx movq %xmm0, $out movq %xmm1, %rbp movq 128(%rsp), %rdx 
# pull $n0 movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 movq %rax, 112(%rsp) movq %rbx, 120(%rsp) call __rsaz_512_reducex addq 64(%rsp), %r8 adcq 72(%rsp), %r9 adcq 80(%rsp), %r10 adcq 88(%rsp), %r11 adcq 96(%rsp), %r12 adcq 104(%rsp), %r13 adcq 112(%rsp), %r14 adcq 120(%rsp), %r15 sbbq %rcx, %rcx call __rsaz_512_subtract movq %r8, %rdx movq %r9, %rax movl 128+8(%rsp), $times movq $out, $inp decl $times jnz .Loop_sqrx .Lsqr_tail: ___ } $code.=<<___; leaq 128+24+48(%rsp), %rax .cfi_def_cfa %rax,8 movq -48(%rax), %r15 .cfi_restore %r15 movq -40(%rax), %r14 .cfi_restore %r14 movq -32(%rax), %r13 .cfi_restore %r13 movq -24(%rax), %r12 .cfi_restore %r12 movq -16(%rax), %rbp .cfi_restore %rbp movq -8(%rax), %rbx .cfi_restore %rbx leaq (%rax), %rsp .cfi_def_cfa_register %rsp .Lsqr_epilogue: ret .cfi_endproc .size rsaz_512_sqr,.-rsaz_512_sqr ___ } { my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8"); $code.=<<___; .globl rsaz_512_mul .type rsaz_512_mul,\@function,5 .align 32 rsaz_512_mul: .cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 subq \$128+24, %rsp .cfi_adjust_cfa_offset 128+24 .Lmul_body: movq $out, %xmm0 # off-load arguments movq $mod, %xmm1 movq $n0, 128(%rsp) ___ $code.=<<___ if ($addx); movl \$0x80100,%r11d andl OPENSSL_ia32cap_P+8(%rip),%r11d cmpl \$0x80100,%r11d # check for MULX and ADO/CX je .Lmulx ___ $code.=<<___; movq ($bp), %rbx # pass b[0] movq $bp, %rbp # pass argument call __rsaz_512_mul movq %xmm0, $out movq %xmm1, %rbp movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 call __rsaz_512_reduce ___ $code.=<<___ if ($addx); jmp .Lmul_tail .align 32 .Lmulx: movq $bp, %rbp # pass argument movq ($bp), %rdx # pass b[0] call __rsaz_512_mulx movq %xmm0, $out movq %xmm1, %rbp movq 128(%rsp), %rdx # pull $n0 movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 call __rsaz_512_reducex .Lmul_tail: ___ $code.=<<___; addq 64(%rsp), %r8 adcq 72(%rsp), %r9 adcq 80(%rsp), %r10 adcq 88(%rsp), %r11 adcq 96(%rsp), %r12 adcq 104(%rsp), %r13 adcq 112(%rsp), %r14 adcq 120(%rsp), %r15 sbbq %rcx, %rcx call __rsaz_512_subtract leaq 128+24+48(%rsp), %rax .cfi_def_cfa %rax,8 movq -48(%rax), %r15 .cfi_restore %r15 movq -40(%rax), %r14 .cfi_restore %r14 movq -32(%rax), %r13 .cfi_restore %r13 movq -24(%rax), %r12 .cfi_restore %r12 movq -16(%rax), %rbp .cfi_restore %rbp movq -8(%rax), %rbx .cfi_restore %rbx leaq (%rax), %rsp .cfi_def_cfa_register %rsp .Lmul_epilogue: ret .cfi_endproc .size rsaz_512_mul,.-rsaz_512_mul ___ } { my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); $code.=<<___; .globl rsaz_512_mul_gather4 .type rsaz_512_mul_gather4,\@function,6 .align 32 rsaz_512_mul_gather4: .cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 subq \$`128+24+($win64?0xb0:0)`, %rsp .cfi_adjust_cfa_offset `128+24+($win64?0xb0:0)` ___ $code.=<<___ if ($win64); movaps %xmm6,0xa0(%rsp) movaps %xmm7,0xb0(%rsp) movaps %xmm8,0xc0(%rsp) movaps %xmm9,0xd0(%rsp) movaps %xmm10,0xe0(%rsp) movaps %xmm11,0xf0(%rsp) movaps %xmm12,0x100(%rsp) movaps 
%xmm13,0x110(%rsp) movaps %xmm14,0x120(%rsp) movaps %xmm15,0x130(%rsp) ___ $code.=<<___; .Lmul_gather4_body: movd $pwr,%xmm8 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 pshufd \$0,%xmm8,%xmm8 # broadcast $power movdqa %xmm1,%xmm7 movdqa %xmm1,%xmm2 ___ ######################################################################## # calculate mask by comparing 0..15 to $power # for($i=0;$i<4;$i++) { $code.=<<___; paddd %xmm`$i`,%xmm`$i+1` pcmpeqd %xmm8,%xmm`$i` movdqa %xmm7,%xmm`$i+3` ___ } for(;$i<7;$i++) { $code.=<<___; paddd %xmm`$i`,%xmm`$i+1` pcmpeqd %xmm8,%xmm`$i` ___ } $code.=<<___; pcmpeqd %xmm8,%xmm7 movdqa 16*0($bp),%xmm8 movdqa 16*1($bp),%xmm9 movdqa 16*2($bp),%xmm10 movdqa 16*3($bp),%xmm11 pand %xmm0,%xmm8 movdqa 16*4($bp),%xmm12 pand %xmm1,%xmm9 movdqa 16*5($bp),%xmm13 pand %xmm2,%xmm10 movdqa 16*6($bp),%xmm14 pand %xmm3,%xmm11 movdqa 16*7($bp),%xmm15 leaq 128($bp), %rbp pand %xmm4,%xmm12 pand %xmm5,%xmm13 pand %xmm6,%xmm14 pand %xmm7,%xmm15 por %xmm10,%xmm8 por %xmm11,%xmm9 por %xmm12,%xmm8 por %xmm13,%xmm9 por %xmm14,%xmm8 por %xmm15,%xmm9 por %xmm9,%xmm8 pshufd \$0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 ___ $code.=<<___ if ($addx); movl \$0x80100,%r11d andl OPENSSL_ia32cap_P+8(%rip),%r11d cmpl \$0x80100,%r11d # check for MULX and ADO/CX je .Lmulx_gather ___ $code.=<<___; movq %xmm8,%rbx movq $n0, 128(%rsp) # off-load arguments movq $out, 128+8(%rsp) movq $mod, 128+16(%rsp) movq ($ap), %rax movq 8($ap), %rcx mulq %rbx # 0 iteration movq %rax, (%rsp) movq %rcx, %rax movq %rdx, %r8 mulq %rbx addq %rax, %r8 movq 16($ap), %rax movq %rdx, %r9 adcq \$0, %r9 mulq %rbx addq %rax, %r9 movq 24($ap), %rax movq %rdx, %r10 adcq \$0, %r10 mulq %rbx addq %rax, %r10 movq 32($ap), %rax movq %rdx, %r11 adcq \$0, %r11 mulq %rbx addq %rax, %r11 movq 40($ap), %rax movq %rdx, %r12 adcq \$0, %r12 mulq %rbx addq %rax, %r12 movq 48($ap), %rax movq %rdx, %r13 adcq \$0, %r13 mulq %rbx addq %rax, %r13 movq 56($ap), %rax movq %rdx, %r14 adcq \$0, %r14 mulq %rbx addq %rax, %r14 movq ($ap), %rax movq %rdx, %r15 adcq \$0, %r15 leaq 8(%rsp), %rdi movl \$7, %ecx jmp .Loop_mul_gather .align 32 .Loop_mul_gather: movdqa 16*0(%rbp),%xmm8 movdqa 16*1(%rbp),%xmm9 movdqa 16*2(%rbp),%xmm10 movdqa 16*3(%rbp),%xmm11 pand %xmm0,%xmm8 movdqa 16*4(%rbp),%xmm12 pand %xmm1,%xmm9 movdqa 16*5(%rbp),%xmm13 pand %xmm2,%xmm10 movdqa 16*6(%rbp),%xmm14 pand %xmm3,%xmm11 movdqa 16*7(%rbp),%xmm15 leaq 128(%rbp), %rbp pand %xmm4,%xmm12 pand %xmm5,%xmm13 pand %xmm6,%xmm14 pand %xmm7,%xmm15 por %xmm10,%xmm8 por %xmm11,%xmm9 por %xmm12,%xmm8 por %xmm13,%xmm9 por %xmm14,%xmm8 por %xmm15,%xmm9 por %xmm9,%xmm8 pshufd \$0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 movq %xmm8,%rbx mulq %rbx addq %rax, %r8 movq 8($ap), %rax movq %r8, (%rdi) movq %rdx, %r8 adcq \$0, %r8 mulq %rbx addq %rax, %r9 movq 16($ap), %rax adcq \$0, %rdx addq %r9, %r8 movq %rdx, %r9 adcq \$0, %r9 mulq %rbx addq %rax, %r10 movq 24($ap), %rax adcq \$0, %rdx addq %r10, %r9 movq %rdx, %r10 adcq \$0, %r10 mulq %rbx addq %rax, %r11 movq 32($ap), %rax adcq \$0, %rdx addq %r11, %r10 movq %rdx, %r11 adcq \$0, %r11 mulq %rbx addq %rax, %r12 movq 40($ap), %rax adcq \$0, %rdx addq %r12, %r11 movq %rdx, %r12 adcq \$0, %r12 mulq %rbx addq %rax, %r13 movq 48($ap), %rax adcq \$0, %rdx addq %r13, %r12 movq %rdx, %r13 adcq \$0, %r13 mulq %rbx addq %rax, %r14 movq 56($ap), %rax adcq \$0, %rdx addq %r14, %r13 movq %rdx, %r14 adcq \$0, %r14 mulq %rbx addq %rax, %r15 movq ($ap), %rax adcq \$0, %rdx addq %r15, %r14 movq %rdx, %r15 adcq \$0, 
%r15 leaq 8(%rdi), %rdi decl %ecx jnz .Loop_mul_gather movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, 32(%rdi) movq %r13, 40(%rdi) movq %r14, 48(%rdi) movq %r15, 56(%rdi) movq 128+8(%rsp), $out movq 128+16(%rsp), %rbp movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 call __rsaz_512_reduce ___ $code.=<<___ if ($addx); jmp .Lmul_gather_tail .align 32 .Lmulx_gather: movq %xmm8,%rdx mov $n0, 128(%rsp) # off-load arguments mov $out, 128+8(%rsp) mov $mod, 128+16(%rsp) mulx ($ap), %rbx, %r8 # 0 iteration mov %rbx, (%rsp) xor %edi, %edi # cf=0, of=0 mulx 8($ap), %rax, %r9 mulx 16($ap), %rbx, %r10 adcx %rax, %r8 mulx 24($ap), %rax, %r11 adcx %rbx, %r9 mulx 32($ap), %rbx, %r12 adcx %rax, %r10 mulx 40($ap), %rax, %r13 adcx %rbx, %r11 mulx 48($ap), %rbx, %r14 adcx %rax, %r12 mulx 56($ap), %rax, %r15 adcx %rbx, %r13 adcx %rax, %r14 .byte 0x67 mov %r8, %rbx adcx %rdi, %r15 # %rdi is 0 mov \$-7, %rcx jmp .Loop_mulx_gather .align 32 .Loop_mulx_gather: movdqa 16*0(%rbp),%xmm8 movdqa 16*1(%rbp),%xmm9 movdqa 16*2(%rbp),%xmm10 movdqa 16*3(%rbp),%xmm11 pand %xmm0,%xmm8 movdqa 16*4(%rbp),%xmm12 pand %xmm1,%xmm9 movdqa 16*5(%rbp),%xmm13 pand %xmm2,%xmm10 movdqa 16*6(%rbp),%xmm14 pand %xmm3,%xmm11 movdqa 16*7(%rbp),%xmm15 leaq 128(%rbp), %rbp pand %xmm4,%xmm12 pand %xmm5,%xmm13 pand %xmm6,%xmm14 pand %xmm7,%xmm15 por %xmm10,%xmm8 por %xmm11,%xmm9 por %xmm12,%xmm8 por %xmm13,%xmm9 por %xmm14,%xmm8 por %xmm15,%xmm9 por %xmm9,%xmm8 pshufd \$0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 movq %xmm8,%rdx .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8 adcx %rax, %rbx adox %r9, %r8 mulx 8($ap), %rax, %r9 adcx %rax, %r8 adox %r10, %r9 mulx 16($ap), %rax, %r10 adcx %rax, %r9 adox %r11, %r10 .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11 adcx %rax, %r10 adox %r12, %r11 mulx 32($ap), %rax, %r12 adcx %rax, %r11 adox %r13, %r12 mulx 40($ap), %rax, %r13 adcx %rax, %r12 adox %r14, %r13 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 adcx %rax, %r13 .byte 0x67 adox %r15, %r14 mulx 56($ap), %rax, %r15 mov %rbx, 64(%rsp,%rcx,8) adcx %rax, %r14 adox %rdi, %r15 mov %r8, %rbx adcx %rdi, %r15 # cf=0 inc %rcx # of=0 jnz .Loop_mulx_gather mov %r8, 64(%rsp) mov %r9, 64+8(%rsp) mov %r10, 64+16(%rsp) mov %r11, 64+24(%rsp) mov %r12, 64+32(%rsp) mov %r13, 64+40(%rsp) mov %r14, 64+48(%rsp) mov %r15, 64+56(%rsp) mov 128(%rsp), %rdx # pull arguments mov 128+8(%rsp), $out mov 128+16(%rsp), %rbp mov (%rsp), %r8 mov 8(%rsp), %r9 mov 16(%rsp), %r10 mov 24(%rsp), %r11 mov 32(%rsp), %r12 mov 40(%rsp), %r13 mov 48(%rsp), %r14 mov 56(%rsp), %r15 call __rsaz_512_reducex .Lmul_gather_tail: ___ $code.=<<___; addq 64(%rsp), %r8 adcq 72(%rsp), %r9 adcq 80(%rsp), %r10 adcq 88(%rsp), %r11 adcq 96(%rsp), %r12 adcq 104(%rsp), %r13 adcq 112(%rsp), %r14 adcq 120(%rsp), %r15 sbbq %rcx, %rcx call __rsaz_512_subtract leaq 128+24+48(%rsp), %rax ___ $code.=<<___ if ($win64); movaps 0xa0-0xc8(%rax),%xmm6 movaps 0xb0-0xc8(%rax),%xmm7 movaps 0xc0-0xc8(%rax),%xmm8 movaps 0xd0-0xc8(%rax),%xmm9 movaps 0xe0-0xc8(%rax),%xmm10 movaps 0xf0-0xc8(%rax),%xmm11 movaps 0x100-0xc8(%rax),%xmm12 movaps 0x110-0xc8(%rax),%xmm13 movaps 0x120-0xc8(%rax),%xmm14 movaps 0x130-0xc8(%rax),%xmm15 lea 0xb0(%rax),%rax ___ $code.=<<___; .cfi_def_cfa %rax,8 movq -48(%rax), %r15 .cfi_restore %r15 movq -40(%rax), %r14 .cfi_restore %r14 movq -32(%rax), %r13 .cfi_restore %r13 movq 
-24(%rax), %r12 .cfi_restore %r12 movq -16(%rax), %rbp .cfi_restore %rbp movq -8(%rax), %rbx .cfi_restore %rbx leaq (%rax), %rsp .cfi_def_cfa_register %rsp .Lmul_gather4_epilogue: ret .cfi_endproc .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 ___ } { my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d"); $code.=<<___; .globl rsaz_512_mul_scatter4 .type rsaz_512_mul_scatter4,\@function,6 .align 32 rsaz_512_mul_scatter4: .cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 mov $pwr, $pwr subq \$128+24, %rsp .cfi_adjust_cfa_offset 128+24 .Lmul_scatter4_body: leaq ($tbl,$pwr,8), $tbl movq $out, %xmm0 # off-load arguments movq $mod, %xmm1 movq $tbl, %xmm2 movq $n0, 128(%rsp) movq $out, %rbp ___ $code.=<<___ if ($addx); movl \$0x80100,%r11d andl OPENSSL_ia32cap_P+8(%rip),%r11d cmpl \$0x80100,%r11d # check for MULX and ADO/CX je .Lmulx_scatter ___ $code.=<<___; movq ($out),%rbx # pass b[0] call __rsaz_512_mul movq %xmm0, $out movq %xmm1, %rbp movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 call __rsaz_512_reduce ___ $code.=<<___ if ($addx); jmp .Lmul_scatter_tail .align 32 .Lmulx_scatter: movq ($out), %rdx # pass b[0] call __rsaz_512_mulx movq %xmm0, $out movq %xmm1, %rbp movq 128(%rsp), %rdx # pull $n0 movq (%rsp), %r8 movq 8(%rsp), %r9 movq 16(%rsp), %r10 movq 24(%rsp), %r11 movq 32(%rsp), %r12 movq 40(%rsp), %r13 movq 48(%rsp), %r14 movq 56(%rsp), %r15 call __rsaz_512_reducex .Lmul_scatter_tail: ___ $code.=<<___; addq 64(%rsp), %r8 adcq 72(%rsp), %r9 adcq 80(%rsp), %r10 adcq 88(%rsp), %r11 adcq 96(%rsp), %r12 adcq 104(%rsp), %r13 adcq 112(%rsp), %r14 adcq 120(%rsp), %r15 movq %xmm2, $inp sbbq %rcx, %rcx call __rsaz_512_subtract movq %r8, 128*0($inp) # scatter movq %r9, 128*1($inp) movq %r10, 128*2($inp) movq %r11, 128*3($inp) movq %r12, 128*4($inp) movq %r13, 128*5($inp) movq %r14, 128*6($inp) movq %r15, 128*7($inp) leaq 128+24+48(%rsp), %rax .cfi_def_cfa %rax,8 movq -48(%rax), %r15 .cfi_restore %r15 movq -40(%rax), %r14 .cfi_restore %r14 movq -32(%rax), %r13 .cfi_restore %r13 movq -24(%rax), %r12 .cfi_restore %r12 movq -16(%rax), %rbp .cfi_restore %rbp movq -8(%rax), %rbx .cfi_restore %rbx leaq (%rax), %rsp .cfi_def_cfa_register %rsp .Lmul_scatter4_epilogue: ret .cfi_endproc .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 ___ } { my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx"); $code.=<<___; .globl rsaz_512_mul_by_one .type rsaz_512_mul_by_one,\@function,4 .align 32 rsaz_512_mul_by_one: .cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 subq \$128+24, %rsp .cfi_adjust_cfa_offset 128+24 .Lmul_by_one_body: ___ $code.=<<___ if ($addx); movl OPENSSL_ia32cap_P+8(%rip),%eax ___ $code.=<<___; movq $mod, %rbp # reassign argument movq $n0, 128(%rsp) movq ($inp), %r8 pxor %xmm0, %xmm0 movq 8($inp), %r9 movq 16($inp), %r10 movq 24($inp), %r11 movq 32($inp), %r12 movq 40($inp), %r13 movq 48($inp), %r14 movq 56($inp), %r15 movdqa %xmm0, (%rsp) movdqa %xmm0, 16(%rsp) movdqa %xmm0, 32(%rsp) movdqa %xmm0, 48(%rsp) movdqa %xmm0, 64(%rsp) movdqa %xmm0, 80(%rsp) movdqa %xmm0, 96(%rsp) ___ $code.=<<___ if ($addx); andl \$0x80100,%eax cmpl \$0x80100,%eax # check for MULX and ADO/CX je .Lby_one_callx ___ $code.=<<___; call __rsaz_512_reduce ___ 
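# The __rsaz_512_reduce and __rsaz_512_reducex helpers used above and defined
# below both perform word-wise Montgomery (REDC) reduction on eight 64-bit
# limbs; only the instruction selection (mulq vs. mulx/adcx/adox) differs.
# As a rough model of the arithmetic only, not of the register scheduling,
# the following illustrative Math::BigInt sketch does the same job; the name
# redc512 and its calling convention are made up for this example.

use Math::BigInt;

sub redc512 {
	my ($t, $n, $n0) = @_;	# product, modulus, n0 = -1/n mod 2^64 (Math::BigInt)
	my $b = Math::BigInt->new(2)->bpow(64);	# limb base, 2^64

	for (1..8) {				# one pass per 64-bit limb
		my $m = ($t % $b) * $n0 % $b;	# m = t[0]*n0 mod 2^64
		$t = ($t + $m * $n) / $b;	# low limb is now zero, shift it out
	}
	$t -= $n if $t >= $n;			# for t < n*2^512 one subtract suffices
	return $t;				# t * 2^-512 mod n
}

# E.g. reducing the square produced by the code above corresponds to
# redc512($a_mont * $a_mont, $n, $n0), which keeps the result in Montgomery form.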
$code.=<<___ if ($addx); jmp .Lby_one_tail .align 32 .Lby_one_callx: movq 128(%rsp), %rdx # pull $n0 call __rsaz_512_reducex .Lby_one_tail: ___ $code.=<<___; movq %r8, ($out) movq %r9, 8($out) movq %r10, 16($out) movq %r11, 24($out) movq %r12, 32($out) movq %r13, 40($out) movq %r14, 48($out) movq %r15, 56($out) leaq 128+24+48(%rsp), %rax .cfi_def_cfa %rax,8 movq -48(%rax), %r15 .cfi_restore %r15 movq -40(%rax), %r14 .cfi_restore %r14 movq -32(%rax), %r13 .cfi_restore %r13 movq -24(%rax), %r12 .cfi_restore %r12 movq -16(%rax), %rbp .cfi_restore %rbp movq -8(%rax), %rbx .cfi_restore %rbx leaq (%rax), %rsp .cfi_def_cfa_register %rsp .Lmul_by_one_epilogue: ret .cfi_endproc .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one ___ } { # __rsaz_512_reduce # # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 # output: %r8-%r15 # clobbers: everything except %rbp and %rdi $code.=<<___; .type __rsaz_512_reduce,\@abi-omnipotent .align 32 __rsaz_512_reduce: .cfi_startproc movq %r8, %rbx imulq 128+8(%rsp), %rbx movq 0(%rbp), %rax movl \$8, %ecx jmp .Lreduction_loop .align 32 .Lreduction_loop: mulq %rbx movq 8(%rbp), %rax negq %r8 movq %rdx, %r8 adcq \$0, %r8 mulq %rbx addq %rax, %r9 movq 16(%rbp), %rax adcq \$0, %rdx addq %r9, %r8 movq %rdx, %r9 adcq \$0, %r9 mulq %rbx addq %rax, %r10 movq 24(%rbp), %rax adcq \$0, %rdx addq %r10, %r9 movq %rdx, %r10 adcq \$0, %r10 mulq %rbx addq %rax, %r11 movq 32(%rbp), %rax adcq \$0, %rdx addq %r11, %r10 movq 128+8(%rsp), %rsi #movq %rdx, %r11 #adcq \$0, %r11 adcq \$0, %rdx movq %rdx, %r11 mulq %rbx addq %rax, %r12 movq 40(%rbp), %rax adcq \$0, %rdx imulq %r8, %rsi addq %r12, %r11 movq %rdx, %r12 adcq \$0, %r12 mulq %rbx addq %rax, %r13 movq 48(%rbp), %rax adcq \$0, %rdx addq %r13, %r12 movq %rdx, %r13 adcq \$0, %r13 mulq %rbx addq %rax, %r14 movq 56(%rbp), %rax adcq \$0, %rdx addq %r14, %r13 movq %rdx, %r14 adcq \$0, %r14 mulq %rbx movq %rsi, %rbx addq %rax, %r15 movq 0(%rbp), %rax adcq \$0, %rdx addq %r15, %r14 movq %rdx, %r15 adcq \$0, %r15 decl %ecx jne .Lreduction_loop ret .cfi_endproc .size __rsaz_512_reduce,.-__rsaz_512_reduce ___ } if ($addx) { # __rsaz_512_reducex # # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0 # output: %r8-%r15 # clobbers: everything except %rbp and %rdi $code.=<<___; .type __rsaz_512_reducex,\@abi-omnipotent .align 32 __rsaz_512_reducex: .cfi_startproc #movq 128+8(%rsp), %rdx # pull $n0 imulq %r8, %rdx xorq %rsi, %rsi # cf=0,of=0 movl \$8, %ecx jmp .Lreduction_loopx .align 32 .Lreduction_loopx: mov %r8, %rbx mulx 0(%rbp), %rax, %r8 adcx %rbx, %rax adox %r9, %r8 mulx 8(%rbp), %rax, %r9 adcx %rax, %r8 adox %r10, %r9 mulx 16(%rbp), %rbx, %r10 adcx %rbx, %r9 adox %r11, %r10 mulx 24(%rbp), %rbx, %r11 adcx %rbx, %r10 adox %r12, %r11 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12 mov %rdx, %rax mov %r8, %rdx adcx %rbx, %r11 adox %r13, %r12 mulx 128+8(%rsp), %rbx, %rdx mov %rax, %rdx mulx 40(%rbp), %rax, %r13 adcx %rax, %r12 adox %r14, %r13 .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14 adcx %rax, %r13 adox %r15, %r14 mulx 56(%rbp), %rax, %r15 mov %rbx, %rdx adcx %rax, %r14 adox %rsi, %r15 # %rsi is 0 adcx %rsi, %r15 # cf=0 decl %ecx # of=0 jne .Lreduction_loopx ret .cfi_endproc .size __rsaz_512_reducex,.-__rsaz_512_reducex ___ } { # __rsaz_512_subtract # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask # output: # clobbers: everything but %rdi, %rsi and %rbp $code.=<<___; .type __rsaz_512_subtract,\@abi-omnipotent .align 32 __rsaz_512_subtract: .cfi_startproc movq %r8, ($out) movq %r9, 
8($out) movq %r10, 16($out) movq %r11, 24($out) movq %r12, 32($out) movq %r13, 40($out) movq %r14, 48($out) movq %r15, 56($out) movq 0($mod), %r8 movq 8($mod), %r9 negq %r8 notq %r9 andq %rcx, %r8 movq 16($mod), %r10 andq %rcx, %r9 notq %r10 movq 24($mod), %r11 andq %rcx, %r10 notq %r11 movq 32($mod), %r12 andq %rcx, %r11 notq %r12 movq 40($mod), %r13 andq %rcx, %r12 notq %r13 movq 48($mod), %r14 andq %rcx, %r13 notq %r14 movq 56($mod), %r15 andq %rcx, %r14 notq %r15 andq %rcx, %r15 addq ($out), %r8 adcq 8($out), %r9 adcq 16($out), %r10 adcq 24($out), %r11 adcq 32($out), %r12 adcq 40($out), %r13 adcq 48($out), %r14 adcq 56($out), %r15 movq %r8, ($out) movq %r9, 8($out) movq %r10, 16($out) movq %r11, 24($out) movq %r12, 32($out) movq %r13, 40($out) movq %r14, 48($out) movq %r15, 56($out) ret .cfi_endproc .size __rsaz_512_subtract,.-__rsaz_512_subtract ___ } { # __rsaz_512_mul # # input: %rsi - ap, %rbp - bp # output: # clobbers: everything my ($ap,$bp) = ("%rsi","%rbp"); $code.=<<___; .type __rsaz_512_mul,\@abi-omnipotent .align 32 __rsaz_512_mul: .cfi_startproc leaq 8(%rsp), %rdi movq ($ap), %rax mulq %rbx movq %rax, (%rdi) movq 8($ap), %rax movq %rdx, %r8 mulq %rbx addq %rax, %r8 movq 16($ap), %rax movq %rdx, %r9 adcq \$0, %r9 mulq %rbx addq %rax, %r9 movq 24($ap), %rax movq %rdx, %r10 adcq \$0, %r10 mulq %rbx addq %rax, %r10 movq 32($ap), %rax movq %rdx, %r11 adcq \$0, %r11 mulq %rbx addq %rax, %r11 movq 40($ap), %rax movq %rdx, %r12 adcq \$0, %r12 mulq %rbx addq %rax, %r12 movq 48($ap), %rax movq %rdx, %r13 adcq \$0, %r13 mulq %rbx addq %rax, %r13 movq 56($ap), %rax movq %rdx, %r14 adcq \$0, %r14 mulq %rbx addq %rax, %r14 movq ($ap), %rax movq %rdx, %r15 adcq \$0, %r15 leaq 8($bp), $bp leaq 8(%rdi), %rdi movl \$7, %ecx jmp .Loop_mul .align 32 .Loop_mul: movq ($bp), %rbx mulq %rbx addq %rax, %r8 movq 8($ap), %rax movq %r8, (%rdi) movq %rdx, %r8 adcq \$0, %r8 mulq %rbx addq %rax, %r9 movq 16($ap), %rax adcq \$0, %rdx addq %r9, %r8 movq %rdx, %r9 adcq \$0, %r9 mulq %rbx addq %rax, %r10 movq 24($ap), %rax adcq \$0, %rdx addq %r10, %r9 movq %rdx, %r10 adcq \$0, %r10 mulq %rbx addq %rax, %r11 movq 32($ap), %rax adcq \$0, %rdx addq %r11, %r10 movq %rdx, %r11 adcq \$0, %r11 mulq %rbx addq %rax, %r12 movq 40($ap), %rax adcq \$0, %rdx addq %r12, %r11 movq %rdx, %r12 adcq \$0, %r12 mulq %rbx addq %rax, %r13 movq 48($ap), %rax adcq \$0, %rdx addq %r13, %r12 movq %rdx, %r13 adcq \$0, %r13 mulq %rbx addq %rax, %r14 movq 56($ap), %rax adcq \$0, %rdx addq %r14, %r13 movq %rdx, %r14 leaq 8($bp), $bp adcq \$0, %r14 mulq %rbx addq %rax, %r15 movq ($ap), %rax adcq \$0, %rdx addq %r15, %r14 movq %rdx, %r15 adcq \$0, %r15 leaq 8(%rdi), %rdi decl %ecx jnz .Loop_mul movq %r8, (%rdi) movq %r9, 8(%rdi) movq %r10, 16(%rdi) movq %r11, 24(%rdi) movq %r12, 32(%rdi) movq %r13, 40(%rdi) movq %r14, 48(%rdi) movq %r15, 56(%rdi) ret .cfi_endproc .size __rsaz_512_mul,.-__rsaz_512_mul ___ } if ($addx) { # __rsaz_512_mulx # # input: %rsi - ap, %rbp - bp # output: # clobbers: everything my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi"); $code.=<<___; .type __rsaz_512_mulx,\@abi-omnipotent .align 32 __rsaz_512_mulx: .cfi_startproc mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller mov \$-6, %rcx mulx 8($ap), %rax, %r9 movq %rbx, 8(%rsp) mulx 16($ap), %rbx, %r10 adc %rax, %r8 mulx 24($ap), %rax, %r11 adc %rbx, %r9 mulx 32($ap), %rbx, %r12 adc %rax, %r10 mulx 40($ap), %rax, %r13 adc %rbx, %r11 mulx 48($ap), %rbx, %r14 adc %rax, %r12 mulx 56($ap), %rax, %r15 mov 8($bp), %rdx adc %rbx, %r13 adc %rax, %r14 adc \$0, %r15 xor $zero, 
$zero # cf=0,of=0 jmp .Loop_mulx .align 32 .Loop_mulx: movq %r8, %rbx mulx ($ap), %rax, %r8 adcx %rax, %rbx adox %r9, %r8 mulx 8($ap), %rax, %r9 adcx %rax, %r8 adox %r10, %r9 mulx 16($ap), %rax, %r10 adcx %rax, %r9 adox %r11, %r10 mulx 24($ap), %rax, %r11 adcx %rax, %r10 adox %r12, %r11 .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12 adcx %rax, %r11 adox %r13, %r12 mulx 40($ap), %rax, %r13 adcx %rax, %r12 adox %r14, %r13 mulx 48($ap), %rax, %r14 adcx %rax, %r13 adox %r15, %r14 mulx 56($ap), %rax, %r15 movq 64($bp,%rcx,8), %rdx movq %rbx, 8+64-8(%rsp,%rcx,8) adcx %rax, %r14 adox $zero, %r15 adcx $zero, %r15 # cf=0 inc %rcx # of=0 jnz .Loop_mulx movq %r8, %rbx mulx ($ap), %rax, %r8 adcx %rax, %rbx adox %r9, %r8 .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9 adcx %rax, %r8 adox %r10, %r9 .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10 adcx %rax, %r9 adox %r11, %r10 mulx 24($ap), %rax, %r11 adcx %rax, %r10 adox %r12, %r11 mulx 32($ap), %rax, %r12 adcx %rax, %r11 adox %r13, %r12 mulx 40($ap), %rax, %r13 adcx %rax, %r12 adox %r14, %r13 .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14 adcx %rax, %r13 adox %r15, %r14 .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15 adcx %rax, %r14 adox $zero, %r15 adcx $zero, %r15 mov %rbx, 8+64-8(%rsp) mov %r8, 8+64(%rsp) mov %r9, 8+64+8(%rsp) mov %r10, 8+64+16(%rsp) mov %r11, 8+64+24(%rsp) mov %r12, 8+64+32(%rsp) mov %r13, 8+64+40(%rsp) mov %r14, 8+64+48(%rsp) mov %r15, 8+64+56(%rsp) ret .cfi_endproc .size __rsaz_512_mulx,.-__rsaz_512_mulx ___ } { my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx"); $code.=<<___; .globl rsaz_512_scatter4 .type rsaz_512_scatter4,\@abi-omnipotent .align 16 rsaz_512_scatter4: .cfi_startproc leaq ($out,$power,8), $out movl \$8, %r9d jmp .Loop_scatter .align 16 .Loop_scatter: movq ($inp), %rax leaq 8($inp), $inp movq %rax, ($out) leaq 128($out), $out decl %r9d jnz .Loop_scatter ret .cfi_endproc .size rsaz_512_scatter4,.-rsaz_512_scatter4 .globl rsaz_512_gather4 .type rsaz_512_gather4,\@abi-omnipotent .align 16 rsaz_512_gather4: .cfi_startproc ___ $code.=<<___ if ($win64); .LSEH_begin_rsaz_512_gather4: .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp) .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp) .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp) .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp) .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp) .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp) .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp) .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp) .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp) .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp) ___ $code.=<<___; movd $power,%xmm8 movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002 movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000 pshufd \$0,%xmm8,%xmm8 # broadcast $power movdqa %xmm1,%xmm7 movdqa %xmm1,%xmm2 ___ ######################################################################## # calculate mask by comparing 0..15 to $power # for($i=0;$i<4;$i++) { $code.=<<___; paddd %xmm`$i`,%xmm`$i+1` pcmpeqd %xmm8,%xmm`$i` movdqa %xmm7,%xmm`$i+3` ___ } for(;$i<7;$i++) { $code.=<<___; paddd %xmm`$i`,%xmm`$i+1` pcmpeqd %xmm8,%xmm`$i` ___ } $code.=<<___; pcmpeqd %xmm8,%xmm7 
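	# at this point each of %xmm0..%xmm7 carries the comparison result for a
	# pair of adjacent table indices (2*i and 2*i+1): the 64-bit half that
	# matches the requested power is all-ones, every other lane is zero.  The
	# loop below therefore loads the corresponding limb of all 16 table
	# entries on every iteration, masks the loads with these registers and
	# ORs them together, so the memory access pattern does not depend on the
	# (potentially secret) power index.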
movl \$8, %r9d jmp .Loop_gather .align 16 .Loop_gather: movdqa 16*0($inp),%xmm8 movdqa 16*1($inp),%xmm9 movdqa 16*2($inp),%xmm10 movdqa 16*3($inp),%xmm11 pand %xmm0,%xmm8 movdqa 16*4($inp),%xmm12 pand %xmm1,%xmm9 movdqa 16*5($inp),%xmm13 pand %xmm2,%xmm10 movdqa 16*6($inp),%xmm14 pand %xmm3,%xmm11 movdqa 16*7($inp),%xmm15 leaq 128($inp), $inp pand %xmm4,%xmm12 pand %xmm5,%xmm13 pand %xmm6,%xmm14 pand %xmm7,%xmm15 por %xmm10,%xmm8 por %xmm11,%xmm9 por %xmm12,%xmm8 por %xmm13,%xmm9 por %xmm14,%xmm8 por %xmm15,%xmm9 por %xmm9,%xmm8 pshufd \$0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 movq %xmm8,($out) leaq 8($out), $out decl %r9d jnz .Loop_gather ___ $code.=<<___ if ($win64); movaps 0x00(%rsp),%xmm6 movaps 0x10(%rsp),%xmm7 movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 0x40(%rsp),%xmm10 movaps 0x50(%rsp),%xmm11 movaps 0x60(%rsp),%xmm12 movaps 0x70(%rsp),%xmm13 movaps 0x80(%rsp),%xmm14 movaps 0x90(%rsp),%xmm15 add \$0xa8,%rsp ___ $code.=<<___; ret .LSEH_end_rsaz_512_gather4: .cfi_endproc .size rsaz_512_gather4,.-rsaz_512_gather4 .align 64 .Linc: .long 0,0, 1,1 .long 2,2, 2,2 ___ } # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea 128+24+48(%rax),%rax lea .Lmul_gather4_epilogue(%rip),%rbx cmp %r10,%rbx jne .Lse_not_in_mul_gather4 lea 0xb0(%rax),%rax lea -48-0xa8(%rax),%rsi lea 512($context),%rdi mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq .Lse_not_in_mul_gather4: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler .section .pdata .align 4 .rva .LSEH_begin_rsaz_512_sqr .rva .LSEH_end_rsaz_512_sqr .rva 
.LSEH_info_rsaz_512_sqr .rva .LSEH_begin_rsaz_512_mul .rva .LSEH_end_rsaz_512_mul .rva .LSEH_info_rsaz_512_mul .rva .LSEH_begin_rsaz_512_mul_gather4 .rva .LSEH_end_rsaz_512_mul_gather4 .rva .LSEH_info_rsaz_512_mul_gather4 .rva .LSEH_begin_rsaz_512_mul_scatter4 .rva .LSEH_end_rsaz_512_mul_scatter4 .rva .LSEH_info_rsaz_512_mul_scatter4 .rva .LSEH_begin_rsaz_512_mul_by_one .rva .LSEH_end_rsaz_512_mul_by_one .rva .LSEH_info_rsaz_512_mul_by_one .rva .LSEH_begin_rsaz_512_gather4 .rva .LSEH_end_rsaz_512_gather4 .rva .LSEH_info_rsaz_512_gather4 .section .xdata .align 8 .LSEH_info_rsaz_512_sqr: .byte 9,0,0,0 .rva se_handler .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] .LSEH_info_rsaz_512_mul: .byte 9,0,0,0 .rva se_handler .rva .Lmul_body,.Lmul_epilogue # HandlerData[] .LSEH_info_rsaz_512_mul_gather4: .byte 9,0,0,0 .rva se_handler .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[] .LSEH_info_rsaz_512_mul_scatter4: .byte 9,0,0,0 .rva se_handler .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[] .LSEH_info_rsaz_512_mul_by_one: .byte 9,0,0,0 .rva se_handler .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[] .LSEH_info_rsaz_512_gather4: .byte 0x01,0x46,0x16,0x00 .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8 ___ } $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/bn/asm/x86_64-mont.pl =================================================================== --- stable/12/crypto/openssl/crypto/bn/asm/x86_64-mont.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/bn/asm/x86_64-mont.pl (revision 364963) @@ -1,1592 +1,1592 @@ #! /usr/bin/env perl # Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # October 2005. # # Montgomery multiplication routine for x86_64. While it gives modest # 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more # than twice, >2x, as fast. Most common rsa1024 sign is improved by # respectful 50%. It remains to be seen if loop unrolling and # dedicated squaring routine can provide further improvement... # July 2011. # # Add dedicated squaring procedure. Performance improvement varies # from platform to platform, but in average it's ~5%/15%/25%/33% # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. # August 2011. 
# # Unroll and modulo-schedule inner loops in such manner that they # are "fallen through" for input lengths of 8, which is critical for # 1024-bit RSA *sign*. Average performance improvement in comparison # to *initial* version of this module from 2005 is ~0%/30%/40%/45% # for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively. # June 2013. # # Optimize reduction in squaring procedure and improve 1024+-bit RSA # sign performance by 10-16% on Intel Sandy Bridge and later # (virtually same on non-Intel processors). # August 2013. # # Add MULX/ADOX/ADCX code path. $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.23); } if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.10); } if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $addx = ($ver>=3.03); } # int bn_mul_mont( $rp="%rdi"; # BN_ULONG *rp, $ap="%rsi"; # const BN_ULONG *ap, $bp="%rdx"; # const BN_ULONG *bp, $np="%rcx"; # const BN_ULONG *np, $n0="%r8"; # const BN_ULONG *n0, $num="%r9"; # int num); $lo0="%r10"; $hi0="%r11"; $hi1="%r13"; $i="%r14"; $j="%r15"; $m0="%rbx"; $m1="%rbp"; $code=<<___; .text .extern OPENSSL_ia32cap_P .globl bn_mul_mont .type bn_mul_mont,\@function,6 .align 16 bn_mul_mont: .cfi_startproc mov ${num}d,${num}d mov %rsp,%rax .cfi_def_cfa_register %rax test \$3,${num}d jnz .Lmul_enter cmp \$8,${num}d jb .Lmul_enter ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d ___ $code.=<<___; cmp $ap,$bp jne .Lmul4x_enter test \$7,${num}d jz .Lsqr8x_enter jmp .Lmul4x_enter .align 16 .Lmul_enter: push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 neg $num mov %rsp,%r11 lea -16(%rsp,$num,8),%r10 # future alloca(8*(num+2)) neg $num # restore $num and \$-1024,%r10 # minimize TLB usage # An OS-agnostic version of __chkstk. # # Some OSes (Windows) insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... 
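	# The code below implements that page walk: %rsp is moved down towards
	# the new stack top in %r10 in steps of at most one 4K page, with a
	# dummy load from each intermediate location, so the pages of the new
	# frame are touched in order and a runaway allocation hits the guard
	# page before any out-of-bounds store can happen.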
sub %r10,%r11 and \$-4096,%r11 lea (%r10,%r11),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul_page_walk jmp .Lmul_page_walk_done .align 16 .Lmul_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul_page_walk .Lmul_page_walk_done: mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul_body: mov $bp,%r12 # reassign $bp ___ $bp="%r12"; $code.=<<___; mov ($n0),$n0 # pull n0[0] value mov ($bp),$m0 # m0=bp[0] mov ($ap),%rax xor $i,$i # i=0 xor $j,$j # j=0 mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$lo0 mov ($np),%rax imulq $lo0,$m1 # "tp[0]"*n0 mov %rdx,$hi0 mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax adc \$0,%rdx mov %rdx,$hi1 lea 1($j),$j # j++ jmp .L1st_enter .align 16 .L1st: add %rax,$hi1 mov ($ap,$j,8),%rax adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] mov $lo0,$hi0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 .L1st_enter: mulq $m0 # ap[j]*bp[0] add %rax,$hi0 mov ($np,$j,8),%rax adc \$0,%rdx lea 1($j),$j # j++ mov %rdx,$lo0 mulq $m1 # np[j]*m1 cmp $num,$j jne .L1st add %rax,$hi1 mov ($ap),%rax # ap[0] adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 mov $lo0,$hi0 xor %rdx,%rdx add $hi0,$hi1 adc \$0,%rdx mov $hi1,-8(%rsp,$num,8) mov %rdx,(%rsp,$num,8) # store upmost overflow bit lea 1($i),$i # i++ jmp .Louter .align 16 .Louter: mov ($bp,$i,8),$m0 # m0=bp[i] xor $j,$j # j=0 mov $n0,$m1 mov (%rsp),$lo0 mulq $m0 # ap[0]*bp[i] add %rax,$lo0 # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx imulq $lo0,$m1 # tp[0]*n0 mov %rdx,$hi0 mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax adc \$0,%rdx mov 8(%rsp),$lo0 # tp[1] mov %rdx,$hi1 lea 1($j),$j # j++ jmp .Linner_enter .align 16 .Linner: add %rax,$hi1 mov ($ap,$j,8),%rax adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] mov (%rsp,$j,8),$lo0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 .Linner_enter: mulq $m0 # ap[j]*bp[i] add %rax,$hi0 mov ($np,$j,8),%rax adc \$0,%rdx add $hi0,$lo0 # ap[j]*bp[i]+tp[j] mov %rdx,$hi0 adc \$0,$hi0 lea 1($j),$j # j++ mulq $m1 # np[j]*m1 cmp $num,$j jne .Linner add %rax,$hi1 mov ($ap),%rax # ap[0] adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] mov (%rsp,$j,8),$lo0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 xor %rdx,%rdx add $hi0,$hi1 adc \$0,%rdx add $lo0,$hi1 # pull upmost overflow bit adc \$0,%rdx mov $hi1,-8(%rsp,$num,8) mov %rdx,(%rsp,$num,8) # store upmost overflow bit lea 1($i),$i # i++ cmp $num,$i jb .Louter xor $i,$i # i=0 and clear CF! mov (%rsp),%rax # tp[0] mov $num,$j # j=num .align 16 .Lsub: sbb ($np,$i,8),%rax mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] mov 8(%rsp,$i,8),%rax # tp[i+1] lea 1($i),$i # i++ dec $j # doesn't affect CF! 
jnz .Lsub sbb \$0,%rax # handle upmost overflow bit mov \$-1,%rbx xor %rax,%rbx # not %rax xor $i,$i mov $num,$j # j=num .Lcopy: # conditional copy mov ($rp,$i,8),%rcx mov (%rsp,$i,8),%rdx and %rbx,%rcx and %rax,%rdx mov $num,(%rsp,$i,8) # zap temporary vector or %rcx,%rdx mov %rdx,($rp,$i,8) # rp[i]=tp[i] lea 1($i),$i sub \$1,$j jnz .Lcopy mov 8(%rsp,$num,8),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul_epilogue: ret .cfi_endproc .size bn_mul_mont,.-bn_mul_mont ___ {{{ my @A=("%r10","%r11"); my @N=("%r13","%rdi"); $code.=<<___; .type bn_mul4x_mont,\@function,6 .align 16 bn_mul4x_mont: .cfi_startproc mov ${num}d,${num}d mov %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); and \$0x80100,%r11d cmp \$0x80100,%r11d je .Lmulx4x_enter ___ $code.=<<___; push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 neg $num mov %rsp,%r11 lea -32(%rsp,$num,8),%r10 # future alloca(8*(num+4)) neg $num # restore and \$-1024,%r10 # minimize TLB usage sub %r10,%r11 and \$-4096,%r11 lea (%r10,%r11),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul4x_page_walk jmp .Lmul4x_page_walk_done .Lmul4x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul4x_page_walk .Lmul4x_page_walk_done: mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul4x_body: mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp mov %rdx,%r12 # reassign $bp ___ $bp="%r12"; $code.=<<___; mov ($n0),$n0 # pull n0[0] value mov ($bp),$m0 # m0=bp[0] mov ($ap),%rax xor $i,$i # i=0 xor $j,$j # j=0 mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$A[0] mov ($np),%rax imulq $A[0],$m1 # "tp[0]"*n0 mov %rdx,$A[1] mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded mov 8($ap),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 add %rax,$A[1] mov 8($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 add %rax,$N[1] mov 16($ap),%rax adc \$0,%rdx add $A[1],$N[1] lea 4($j),$j # j++ adc \$0,%rdx mov $N[1],(%rsp) mov %rdx,$N[0] jmp .L1st4x .align 16 .L1st4x: mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov -16($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov -8($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$j,8),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov ($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-8(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov 8($np,$j,8),%rax adc \$0,%rdx lea 4($j),$j # j++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov -16($ap,$j,8),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-32(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] cmp $num,$j jb .L1st4x mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov -16($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 
-8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov -8($np,$j,8),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] mov $N[0],-8(%rsp,$j,8) mov $N[1],(%rsp,$j,8) # store upmost overflow bit lea 1($i),$i # i++ .align 4 .Louter4x: mov ($bp,$i,8),$m0 # m0=bp[i] xor $j,$j # j=0 mov (%rsp),$A[0] mov $n0,$m1 mulq $m0 # ap[0]*bp[i] add %rax,$A[0] # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx imulq $A[0],$m1 # tp[0]*n0 mov %rdx,$A[1] mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded mov 8($ap),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov 8($np),%rax adc \$0,%rdx add 8(%rsp),$A[1] # +tp[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov 16($ap),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] lea 4($j),$j # j+=2 adc \$0,%rdx mov $N[1],(%rsp) # tp[j-1] mov %rdx,$N[0] jmp .Linner4x .align 16 .Linner4x: mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov -16($np,$j,8),%rax adc \$0,%rdx add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov -8($np,$j,8),%rax adc \$0,%rdx add -8(%rsp,$j,8),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$j,8),%rax adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov ($np,$j,8),%rax adc \$0,%rdx add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[0],-8(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov 8($np,$j,8),%rax adc \$0,%rdx add 8(%rsp,$j,8),$A[1] adc \$0,%rdx lea 4($j),$j # j++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov -16($ap,$j,8),%rax adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[1],-32(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] cmp $num,$j jb .Linner4x mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov -16($np,$j,8),%rax adc \$0,%rdx add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j,8),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[0],-24(%rsp,$j,8) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov -8($np,$j,8),%rax adc \$0,%rdx add -8(%rsp,$j,8),$A[1] adc \$0,%rdx lea 1($i),$i # i++ mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[1],-16(%rsp,$j,8) # tp[j-1] mov %rdx,$N[0] xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] add (%rsp,$num,8),$N[0] # pull upmost overflow bit adc \$0,$N[1] mov $N[0],-8(%rsp,$j,8) mov $N[1],(%rsp,$j,8) # store upmost overflow bit cmp $num,$i jb .Louter4x ___ { my @ri=("%rax","%rdx",$m0,$m1); $code.=<<___; mov 16(%rsp,$num,8),$rp # restore $rp lea -4($num),$j mov 0(%rsp),@ri[0] # tp[0] mov 8(%rsp),@ri[1] # tp[1] shr \$2,$j # j=num/4-1 lea (%rsp),$ap # borrow ap for tp xor $i,$i # i=0 and clear CF! 
sub 0($np),@ri[0] mov 16($ap),@ri[2] # tp[2] mov 24($ap),@ri[3] # tp[3] sbb 8($np),@ri[1] .Lsub4x: mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] sbb 16($np,$i,8),@ri[2] mov 32($ap,$i,8),@ri[0] # tp[i+1] mov 40($ap,$i,8),@ri[1] sbb 24($np,$i,8),@ri[3] mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] sbb 32($np,$i,8),@ri[0] mov 48($ap,$i,8),@ri[2] mov 56($ap,$i,8),@ri[3] sbb 40($np,$i,8),@ri[1] lea 4($i),$i # i++ dec $j # doesn't affect CF! jnz .Lsub4x mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i] mov 32($ap,$i,8),@ri[0] # load overflow bit sbb 16($np,$i,8),@ri[2] mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i] sbb 24($np,$i,8),@ri[3] mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i] sbb \$0,@ri[0] # handle upmost overflow bit mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i] pxor %xmm0,%xmm0 movq @ri[0],%xmm4 pcmpeqd %xmm5,%xmm5 pshufd \$0,%xmm4,%xmm4 mov $num,$j pxor %xmm4,%xmm5 shr \$2,$j # j=num/4 xor %eax,%eax # i=0 jmp .Lcopy4x .align 16 .Lcopy4x: # conditional copy movdqa (%rsp,%rax),%xmm1 movdqu ($rp,%rax),%xmm2 pand %xmm4,%xmm1 pand %xmm5,%xmm2 movdqa 16(%rsp,%rax),%xmm3 movdqa %xmm0,(%rsp,%rax) por %xmm2,%xmm1 movdqu 16($rp,%rax),%xmm2 movdqu %xmm1,($rp,%rax) pand %xmm4,%xmm3 pand %xmm5,%xmm2 movdqa %xmm0,16(%rsp,%rax) por %xmm2,%xmm3 movdqu %xmm3,16($rp,%rax) lea 32(%rax),%rax dec $j jnz .Lcopy4x ___ } $code.=<<___; mov 8(%rsp,$num,8),%rsi # restore %rsp .cfi_def_cfa %rsi, 8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul4x_epilogue: ret .cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont ___ }}} {{{ ###################################################################### # void bn_sqr8x_mont( my $rptr="%rdi"; # const BN_ULONG *rptr, my $aptr="%rsi"; # const BN_ULONG *aptr, my $bptr="%rdx"; # not used my $nptr="%rcx"; # const BN_ULONG *nptr, my $n0 ="%r8"; # const BN_ULONG *n0); my $num ="%r9"; # int num, has to be divisible by 8 my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); my @A0=("%r10","%r11"); my @A1=("%r12","%r13"); my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); $code.=<<___ if ($addx); .extern bn_sqrx8x_internal # see x86_64-mont5 module ___ $code.=<<___; .extern bn_sqr8x_internal # see x86_64-mont5 module .type bn_sqr8x_mont,\@function,6 .align 32 bn_sqr8x_mont: .cfi_startproc mov %rsp,%rax .cfi_def_cfa_register %rax .Lsqr8x_enter: push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lsqr8x_prologue: mov ${num}d,%r10d shl \$3,${num}d # convert $num to bytes shl \$3+2,%r10 # 4*$num neg $num ############################################################## # ensure that stack frame doesn't alias with $aptr modulo # 4096. this is done to allow memory disambiguation logic # do its job. 
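	##############################################################
	# [Editorial note, not part of the original]  The 4096 above is
	# the page size: on the CPUs this path targets, store-to-load
	# forwarding and memory disambiguation compare only the low 12
	# address bits, so a store into the tp[] frame whose page offset
	# matches a pending load from a[] can be flagged as a (false)
	# dependency and stall the pipeline.  The lea/and/cmov sequence
	# below nudges the future stack frame away from such a conflict.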
# lea -64(%rsp,$num,2),%r11 mov %rsp,%rbp mov ($n0),$n0 # *n0 sub $aptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lsqr8x_sp_alt sub %r11,%rbp # align with $aptr lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: lea 4096-64(,$num,2),%r10 # 4096-frame-2*$num lea -64(%rbp,$num,2),%rbp # future alloca(frame+2*$num) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lsqr8x_sp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lsqr8x_page_walk jmp .Lsqr8x_page_walk_done .align 16 .Lsqr8x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lsqr8x_page_walk .Lsqr8x_page_walk_done: mov $num,%r10 neg $num mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lsqr8x_body: movq $nptr, %xmm2 # save pointer to modulus pxor %xmm0,%xmm0 movq $rptr,%xmm1 # save $rptr movq %r10, %xmm3 # -$num ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%eax and \$0x80100,%eax cmp \$0x80100,%eax jne .Lsqr8x_nox call bn_sqrx8x_internal # see x86_64-mont5 module # %rax top-most carry # %rbp nptr # %rcx -8*num # %r8 end of tp[2*num] lea (%r8,%rcx),%rbx mov %rcx,$num mov %rcx,%rdx movq %xmm1,$rptr sar \$3+2,%rcx # %cf=0 jmp .Lsqr8x_sub .align 32 .Lsqr8x_nox: ___ $code.=<<___; call bn_sqr8x_internal # see x86_64-mont5 module # %rax top-most carry # %rbp nptr # %r8 -8*num # %rdi end of tp[2*num] lea (%rdi,$num),%rbx mov $num,%rcx mov $num,%rdx movq %xmm1,$rptr sar \$3+2,%rcx # %cf=0 jmp .Lsqr8x_sub .align 32 .Lsqr8x_sub: mov 8*0(%rbx),%r12 mov 8*1(%rbx),%r13 mov 8*2(%rbx),%r14 mov 8*3(%rbx),%r15 lea 8*4(%rbx),%rbx sbb 8*0(%rbp),%r12 sbb 8*1(%rbp),%r13 sbb 8*2(%rbp),%r14 sbb 8*3(%rbp),%r15 lea 8*4(%rbp),%rbp mov %r12,8*0($rptr) mov %r13,8*1($rptr) mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx # preserves %cf jnz .Lsqr8x_sub sbb \$0,%rax # top-most carry lea (%rbx,$num),%rbx # rewind lea ($rptr,$num),$rptr # rewind movq %rax,%xmm1 pxor %xmm0,%xmm0 pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 jmp .Lsqr8x_cond_copy .align 32 .Lsqr8x_cond_copy: movdqa 16*0(%rbx),%xmm2 movdqa 16*1(%rbx),%xmm3 lea 16*2(%rbx),%rbx movdqu 16*0($rptr),%xmm4 movdqu 16*1($rptr),%xmm5 lea 16*2($rptr),$rptr movdqa %xmm0,-16*2(%rbx) # zero tp movdqa %xmm0,-16*1(%rbx) movdqa %xmm0,-16*2(%rbx,%rdx) movdqa %xmm0,-16*1(%rbx,%rdx) pcmpeqd %xmm1,%xmm0 pand %xmm1,%xmm2 pand %xmm1,%xmm3 pand %xmm0,%xmm4 pand %xmm0,%xmm5 pxor %xmm0,%xmm0 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqu %xmm4,-16*2($rptr) movdqu %xmm5,-16*1($rptr) add \$32,$num jnz .Lsqr8x_cond_copy mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lsqr8x_epilogue: ret .cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont ___ }}} if ($addx) {{{ my $bp="%rdx"; # original value $code.=<<___; .type bn_mulx4x_mont,\@function,6 .align 32 bn_mulx4x_mont: .cfi_startproc mov %rsp,%rax .cfi_def_cfa_register %rax .Lmulx4x_enter: push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes xor %r10,%r10 sub $num,%r10 # -$num mov ($n0),$n0 # *n0 lea -72(%rsp,%r10),%rbp # future alloca(frame+$num+8) and \$-128,%rbp 
mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmulx4x_page_walk jmp .Lmulx4x_page_walk_done .align 16 .Lmulx4x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmulx4x_page_walk .Lmulx4x_page_walk_done: lea ($bp,$num),%r10 ############################################################## # Stack layout # +0 num # +8 off-loaded &b[i] # +16 end of b[num] # +24 saved n0 # +32 saved rp # +40 saved %rsp # +48 inner counter # +56 # +64 tmp[num+1] # mov $num,0(%rsp) # save $num shr \$5,$num mov %r10,16(%rsp) # end of b[num] sub \$1,$num mov $n0, 24(%rsp) # save *n0 mov $rp, 32(%rsp) # save $rp mov %rax,40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 mov $num,48(%rsp) # inner counter jmp .Lmulx4x_body .align 32 .Lmulx4x_body: ___ my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); my $rptr=$bptr; $code.=<<___; lea 8($bp),$bptr mov ($bp),%rdx # b[0], $bp==%rdx actually lea 64+32(%rsp),$tptr mov %rdx,$bi mulx 0*8($aptr),$mi,%rax # a[0]*b[0] mulx 1*8($aptr),%r11,%r14 # a[1]*b[0] add %rax,%r11 mov $bptr,8(%rsp) # off-load &b[i] mulx 2*8($aptr),%r12,%r13 # ... adc %r14,%r12 adc \$0,%r13 mov $mi,$bptr # borrow $bptr imulq 24(%rsp),$mi # "t[0]"*n0 xor $zero,$zero # cf=0, of=0 mulx 3*8($aptr),%rax,%r14 mov $mi,%rdx lea 4*8($aptr),$aptr adcx %rax,%r13 adcx $zero,%r14 # cf=0 mulx 0*8($nptr),%rax,%r10 adcx %rax,$bptr # discarded adox %r11,%r10 mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12 mov 48(%rsp),$bptr # counter value mov %r10,-4*8($tptr) adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-3*8($tptr) adcx %rax,%r12 adox $zero,%r15 # of=0 lea 4*8($nptr),$nptr mov %r12,-2*8($tptr) jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: adcx $zero,%r15 # cf=0, modulo-scheduled mulx 0*8($aptr),%r10,%rax # a[4]*b[0] adcx %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] adcx %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... adcx %r14,%r12 mulx 3*8($aptr),%r13,%r14 .byte 0x67,0x67 mov $mi,%rdx adcx %rax,%r13 adcx $zero,%r14 # cf=0 lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adox %r15,%r10 mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 mov %r11,-4*8($tptr) adox %r15,%r13 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_1st mov 0(%rsp),$num # load num mov 8(%rsp),$bptr # re-load &b[i] adc $zero,%r15 # modulo-scheduled add %r15,%r14 sbb %r15,%r15 # top-most carry mov %r14,-1*8($tptr) jmp .Lmulx4x_outer .align 32 .Lmulx4x_outer: mov ($bptr),%rdx # b[i] lea 8($bptr),$bptr # b++ sub $num,$aptr # rewind $aptr mov %r15,($tptr) # save top-most carry lea 64+4*8(%rsp),$tptr sub $num,$nptr # rewind $nptr mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 mov %rdx,$bi mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] adox -4*8($tptr),$mi adcx %r14,%r11 mulx 2*8($aptr),%r15,%r13 # ... 
adox -3*8($tptr),%r11 adcx %r15,%r12 adox -2*8($tptr),%r12 adcx $zero,%r13 adox $zero,%r13 mov $bptr,8(%rsp) # off-load &b[i] mov $mi,%r15 imulq 24(%rsp),$mi # "t[0]"*n0 xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0 mulx 3*8($aptr),%rax,%r14 mov $mi,%rdx adcx %rax,%r13 adox -1*8($tptr),%r13 adcx $zero,%r14 lea 4*8($aptr),$aptr adox $zero,%r14 mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 mov %r10,-4*8($tptr) adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-3*8($tptr) lea 4*8($nptr),$nptr adcx %rax,%r12 adox $zero,%r15 # of=0 mov 48(%rsp),$bptr # counter value mov %r12,-2*8($tptr) jmp .Lmulx4x_inner .align 32 .Lmulx4x_inner: mulx 0*8($aptr),%r10,%rax # a[4]*b[i] adcx $zero,%r15 # cf=0, modulo-scheduled adox %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] adcx 0*8($tptr),%r10 adox %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... adcx 1*8($tptr),%r11 adox %r14,%r12 mulx 3*8($aptr),%r13,%r14 mov $mi,%rdx adcx 2*8($tptr),%r12 adox %rax,%r13 adcx 3*8($tptr),%r13 adox $zero,%r14 # of=0 lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adcx $zero,%r14 # cf=0 adox %r15,%r10 mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 adox %r15,%r13 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-4*8($tptr) mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_inner mov 0(%rsp),$num # load num mov 8(%rsp),$bptr # re-load &b[i] adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$zero # pull top-most carry adc %r15,%r14 sbb %r15,%r15 # top-most carry mov %r14,-1*8($tptr) cmp 16(%rsp),$bptr jne .Lmulx4x_outer lea 64(%rsp),$tptr sub $num,$nptr # rewind $nptr neg %r15 mov $num,%rdx shr \$3+2,$num # %cf=0 mov 32(%rsp),$rptr # restore rp jmp .Lmulx4x_sub .align 32 .Lmulx4x_sub: mov 8*0($tptr),%r11 mov 8*1($tptr),%r12 mov 8*2($tptr),%r13 mov 8*3($tptr),%r14 lea 8*4($tptr),$tptr sbb 8*0($nptr),%r11 sbb 8*1($nptr),%r12 sbb 8*2($nptr),%r13 sbb 8*3($nptr),%r14 lea 8*4($nptr),$nptr mov %r11,8*0($rptr) mov %r12,8*1($rptr) mov %r13,8*2($rptr) mov %r14,8*3($rptr) lea 8*4($rptr),$rptr dec $num # preserves %cf jnz .Lmulx4x_sub sbb \$0,%r15 # top-most carry lea 64(%rsp),$tptr sub %rdx,$rptr # rewind movq %r15,%xmm1 pxor %xmm0,%xmm0 pshufd \$0,%xmm1,%xmm1 mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 jmp .Lmulx4x_cond_copy .align 32 .Lmulx4x_cond_copy: movdqa 16*0($tptr),%xmm2 movdqa 16*1($tptr),%xmm3 lea 16*2($tptr),$tptr movdqu 16*0($rptr),%xmm4 movdqu 16*1($rptr),%xmm5 lea 16*2($rptr),$rptr movdqa %xmm0,-16*2($tptr) # zero tp movdqa %xmm0,-16*1($tptr) pcmpeqd %xmm1,%xmm0 pand %xmm1,%xmm2 pand %xmm1,%xmm3 pand %xmm0,%xmm4 pand %xmm0,%xmm5 pxor %xmm0,%xmm0 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqu %xmm4,-16*2($rptr) movdqu %xmm5,-16*1($rptr) sub \$32,%rdx jnz .Lmulx4x_cond_copy mov %rdx,($tptr) mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmulx4x_epilogue: ret .cfi_endproc .size bn_mulx4x_mont,.-bn_mulx4x_mont ___ }}} $code.=<<___; .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by " .align 16 ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # 
CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type mul_handler,\@abi-omnipotent .align 16 mul_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer jmp .Lcommon_pop_regs .size mul_handler,.-mul_handler .type sqr_handler,\@abi-omnipotent .align 16 sqr_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->Rip<.Lsqr_prologue jb .Lcommon_seh_tail mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # body label cmp %r10,%rbx # context->Rip<.Lsqr_body jb .Lcommon_pop_regs mov 152($context),%rax # pull context->Rsp mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lsqr_epilogue jae .Lcommon_seh_tail mov 40(%rax),%rax # pull saved stack pointer .Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size sqr_handler,.-sqr_handler .section .pdata .align 4 .rva .LSEH_begin_bn_mul_mont .rva .LSEH_end_bn_mul_mont .rva .LSEH_info_bn_mul_mont .rva .LSEH_begin_bn_mul4x_mont .rva .LSEH_end_bn_mul4x_mont .rva .LSEH_info_bn_mul4x_mont .rva .LSEH_begin_bn_sqr8x_mont .rva .LSEH_end_bn_sqr8x_mont .rva .LSEH_info_bn_sqr8x_mont ___ $code.=<<___ if ($addx); .rva .LSEH_begin_bn_mulx4x_mont .rva .LSEH_end_bn_mulx4x_mont .rva .LSEH_info_bn_mulx4x_mont ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_bn_mul_mont: .byte 9,0,0,0 .rva mul_handler 
.rva .Lmul_body,.Lmul_epilogue # HandlerData[] .LSEH_info_bn_mul4x_mont: .byte 9,0,0,0 .rva mul_handler .rva .Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .LSEH_info_bn_sqr8x_mont: .byte 9,0,0,0 .rva sqr_handler .rva .Lsqr8x_prologue,.Lsqr8x_body,.Lsqr8x_epilogue # HandlerData[] .align 8 ___ $code.=<<___ if ($addx); .LSEH_info_bn_mulx4x_mont: .byte 9,0,0,0 .rva sqr_handler .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] .align 8 ___ } print $code; close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/bn/asm/x86_64-mont5.pl =================================================================== --- stable/12/crypto/openssl/crypto/bn/asm/x86_64-mont5.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/bn/asm/x86_64-mont5.pl (revision 364963) @@ -1,3963 +1,3963 @@ #! /usr/bin/env perl # Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # August 2011. # # Companion to x86_64-mont.pl that optimizes cache-timing attack # countermeasures. The subroutines are produced by replacing bp[i] # references in their x86_64-mont.pl counterparts with cache-neutral # references to powers table computed in BN_mod_exp_mont_consttime. # In addition subroutine that scatters elements of the powers table # is implemented, so that scatter-/gathering can be tuned without # bn_exp.c modifications. # August 2013. # # Add MULX/AD*X code paths and additional interfaces to optimize for # branch prediction unit. For input lengths that are multiples of 8 # the np argument is not just modulus value, but one interleaved # with 0. This is to optimize post-condition... 
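# [Editorial sketch, not part of the original module]  What the
# "cache-neutral references to powers table" above means in practice:
# a power is never loaded through its secret index.  Instead all 2^5
# slots are read and the wanted one is selected with equality masks
# (pcmpeqd/pand/por in the code further down).  A functional model in
# plain Perl; the sub name and calling convention are illustrative only
# and appear nowhere else in this module.

sub gather5_model {
    my ($powers, $idx) = @_;	# $powers: 32 array-refs of limbs, $idx: 0..31
    my @out = (0) x scalar @{ $powers->[0] };
    for my $k (0 .. $#{$powers}) {
	# all-ones when $k == $idx, all-zeros otherwise; the assembly
	# builds this mask with pcmpeqd instead of a comparison+branch
	my $mask = ($k == $idx) ? ~0 : 0;
	for my $w (0 .. $#out) {
	    $out[$w] |= $powers->[$k][$w] & $mask;
	}
    }
    return \@out;		# equals $powers->[$idx], but every slot was touched
}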
$flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.23); } if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.10); } if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $addx = ($ver>=3.03); } # int bn_mul_mont_gather5( $rp="%rdi"; # BN_ULONG *rp, $ap="%rsi"; # const BN_ULONG *ap, $bp="%rdx"; # const BN_ULONG *bp, $np="%rcx"; # const BN_ULONG *np, $n0="%r8"; # const BN_ULONG *n0, $num="%r9"; # int num, # int idx); # 0 to 2^5-1, "index" in $bp holding # pre-computed powers of a', interlaced # in such manner that b[0] is $bp[idx], # b[1] is [2^5+idx], etc. $lo0="%r10"; $hi0="%r11"; $hi1="%r13"; $i="%r14"; $j="%r15"; $m0="%rbx"; $m1="%rbp"; $code=<<___; .text .extern OPENSSL_ia32cap_P .globl bn_mul_mont_gather5 .type bn_mul_mont_gather5,\@function,6 .align 64 bn_mul_mont_gather5: .cfi_startproc mov ${num}d,${num}d mov %rsp,%rax .cfi_def_cfa_register %rax test \$7,${num}d jnz .Lmul_enter ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d ___ $code.=<<___; jmp .Lmul4x_enter .align 16 .Lmul_enter: movd `($win64?56:8)`(%rsp),%xmm5 # load 7th argument push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 neg $num mov %rsp,%r11 lea -280(%rsp,$num,8),%r10 # future alloca(8*(num+2)+256+8) neg $num # restore $num and \$-1024,%r10 # minimize TLB usage # An OS-agnostic version of __chkstk. # # Some OSes (Windows) insist on stack being "wired" to # physical memory in strictly sequential manner, i.e. if stack # allocation spans two pages, then reference to farmost one can # be punishable by SEGV. But page walking can do good even on # other OSes, because it guarantees that villain thread hits # the guard page before it can make damage to innocent one... 
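	# [Editorial note, not part of the original]  Concretely: the code
	# below first sets %rsp to %r10 (the frame bottom) plus a whole
	# number of 4096-byte pages, then .Lmul_page_walk drops %rsp one
	# page at a time, touching one word of each page ("mov (%rsp),%r11")
	# so that every page of the new frame has been faulted in before
	# tp[] is written.  The same probe loop reappears, with different
	# labels, in the other prologues of this module.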
sub %r10,%r11 and \$-4096,%r11 lea (%r10,%r11),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul_page_walk jmp .Lmul_page_walk_done .Lmul_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r11 cmp %r10,%rsp ja .Lmul_page_walk .Lmul_page_walk_done: lea .Linc(%rip),%r10 mov %rax,8(%rsp,$num,8) # tp[num+1]=%rsp .cfi_cfa_expression %rsp+8,$num,8,mul,plus,deref,+8 .Lmul_body: lea 128($bp),%r12 # reassign $bp (+size optimization) ___ $bp="%r12"; $STRIDE=2**5*8; # 5 is "window size" $N=$STRIDE/4; # should match cache line size $code.=<<___; movdqa 0(%r10),%xmm0 # 00000001000000010000000000000000 movdqa 16(%r10),%xmm1 # 00000002000000020000000200000002 lea 24-112(%rsp,$num,8),%r10# place the mask after tp[num+3] (+ICache optimization) and \$-16,%r10 pshufd \$0,%xmm5,%xmm5 # broadcast index movdqa %xmm1,%xmm4 movdqa %xmm1,%xmm2 ___ ######################################################################## # calculate mask by comparing 0..31 to index and save result to stack # $code.=<<___; paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 # compare to 1,0 .byte 0x67 movdqa %xmm4,%xmm3 ___ for($k=0;$k<$STRIDE/16-4;$k+=4) { $code.=<<___; paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 # compare to 3,2 movdqa %xmm0,`16*($k+0)+112`(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 # compare to 5,4 movdqa %xmm1,`16*($k+1)+112`(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 # compare to 7,6 movdqa %xmm2,`16*($k+2)+112`(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,`16*($k+3)+112`(%r10) movdqa %xmm4,%xmm3 ___ } $code.=<<___; # last iteration can be optimized paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,`16*($k+0)+112`(%r10) paddd %xmm2,%xmm3 .byte 0x67 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,`16*($k+1)+112`(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,`16*($k+2)+112`(%r10) pand `16*($k+0)-128`($bp),%xmm0 # while it's still in register pand `16*($k+1)-128`($bp),%xmm1 pand `16*($k+2)-128`($bp),%xmm2 movdqa %xmm3,`16*($k+3)+112`(%r10) pand `16*($k+3)-128`($bp),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ for($k=0;$k<$STRIDE/16-4;$k+=4) { $code.=<<___; movdqa `16*($k+0)-128`($bp),%xmm4 movdqa `16*($k+1)-128`($bp),%xmm5 movdqa `16*($k+2)-128`($bp),%xmm2 pand `16*($k+0)+112`(%r10),%xmm4 movdqa `16*($k+3)-128`($bp),%xmm3 pand `16*($k+1)+112`(%r10),%xmm5 por %xmm4,%xmm0 pand `16*($k+2)+112`(%r10),%xmm2 por %xmm5,%xmm1 pand `16*($k+3)+112`(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ } $code.=<<___; por %xmm1,%xmm0 pshufd \$0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 lea $STRIDE($bp),$bp movq %xmm0,$m0 # m0=bp[0] mov ($n0),$n0 # pull n0[0] value mov ($ap),%rax xor $i,$i # i=0 xor $j,$j # j=0 mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$lo0 mov ($np),%rax imulq $lo0,$m1 # "tp[0]"*n0 mov %rdx,$hi0 mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax adc \$0,%rdx mov %rdx,$hi1 lea 1($j),$j # j++ jmp .L1st_enter .align 16 .L1st: add %rax,$hi1 mov ($ap,$j,8),%rax adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] mov $lo0,$hi0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 .L1st_enter: mulq $m0 # ap[j]*bp[0] add %rax,$hi0 mov ($np,$j,8),%rax adc \$0,%rdx lea 1($j),$j # j++ mov %rdx,$lo0 mulq $m1 # np[j]*m1 cmp $num,$j jne .L1st # note that upon exit $j==$num, so # they can be used interchangeably add %rax,$hi1 adc \$0,%rdx add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $hi1,-16(%rsp,$num,8) # tp[num-1] mov %rdx,$hi1 mov $lo0,$hi0 xor %rdx,%rdx add $hi0,$hi1 adc \$0,%rdx mov $hi1,-8(%rsp,$num,8) mov %rdx,(%rsp,$num,8) # store upmost overflow bit lea 1($i),$i # i++ jmp .Louter .align 16 
.Louter: lea 24+128(%rsp,$num,8),%rdx # where 256-byte mask is (+size optimization) and \$-16,%rdx pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 ___ for($k=0;$k<$STRIDE/16;$k+=4) { $code.=<<___; movdqa `16*($k+0)-128`($bp),%xmm0 movdqa `16*($k+1)-128`($bp),%xmm1 movdqa `16*($k+2)-128`($bp),%xmm2 movdqa `16*($k+3)-128`($bp),%xmm3 pand `16*($k+0)-128`(%rdx),%xmm0 pand `16*($k+1)-128`(%rdx),%xmm1 por %xmm0,%xmm4 pand `16*($k+2)-128`(%rdx),%xmm2 por %xmm1,%xmm5 pand `16*($k+3)-128`(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 ___ } $code.=<<___; por %xmm5,%xmm4 pshufd \$0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 lea $STRIDE($bp),$bp mov ($ap),%rax # ap[0] movq %xmm0,$m0 # m0=bp[i] xor $j,$j # j=0 mov $n0,$m1 mov (%rsp),$lo0 mulq $m0 # ap[0]*bp[i] add %rax,$lo0 # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx imulq $lo0,$m1 # tp[0]*n0 mov %rdx,$hi0 mulq $m1 # np[0]*m1 add %rax,$lo0 # discarded mov 8($ap),%rax adc \$0,%rdx mov 8(%rsp),$lo0 # tp[1] mov %rdx,$hi1 lea 1($j),$j # j++ jmp .Linner_enter .align 16 .Linner: add %rax,$hi1 mov ($ap,$j,8),%rax adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] mov (%rsp,$j,8),$lo0 adc \$0,%rdx mov $hi1,-16(%rsp,$j,8) # tp[j-1] mov %rdx,$hi1 .Linner_enter: mulq $m0 # ap[j]*bp[i] add %rax,$hi0 mov ($np,$j,8),%rax adc \$0,%rdx add $hi0,$lo0 # ap[j]*bp[i]+tp[j] mov %rdx,$hi0 adc \$0,$hi0 lea 1($j),$j # j++ mulq $m1 # np[j]*m1 cmp $num,$j jne .Linner # note that upon exit $j==$num, so # they can be used interchangeably add %rax,$hi1 adc \$0,%rdx add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j] mov (%rsp,$num,8),$lo0 adc \$0,%rdx mov $hi1,-16(%rsp,$num,8) # tp[num-1] mov %rdx,$hi1 xor %rdx,%rdx add $hi0,$hi1 adc \$0,%rdx add $lo0,$hi1 # pull upmost overflow bit adc \$0,%rdx mov $hi1,-8(%rsp,$num,8) mov %rdx,(%rsp,$num,8) # store upmost overflow bit lea 1($i),$i # i++ cmp $num,$i jb .Louter xor $i,$i # i=0 and clear CF! mov (%rsp),%rax # tp[0] lea (%rsp),$ap # borrow ap for tp mov $num,$j # j=num jmp .Lsub .align 16 .Lsub: sbb ($np,$i,8),%rax mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] mov 8($ap,$i,8),%rax # tp[i+1] lea 1($i),$i # i++ dec $j # doesn't affect CF! 
jnz .Lsub sbb \$0,%rax # handle upmost overflow bit mov \$-1,%rbx xor %rax,%rbx xor $i,$i mov $num,$j # j=num .Lcopy: # conditional copy mov ($rp,$i,8),%rcx mov (%rsp,$i,8),%rdx and %rbx,%rcx and %rax,%rdx mov $i,(%rsp,$i,8) # zap temporary vector or %rcx,%rdx mov %rdx,($rp,$i,8) # rp[i]=tp[i] lea 1($i),$i sub \$1,$j jnz .Lcopy mov 8(%rsp,$num,8),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul_epilogue: ret .cfi_endproc .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 ___ {{{ my @A=("%r10","%r11"); my @N=("%r13","%rdi"); $code.=<<___; .type bn_mul4x_mont_gather5,\@function,6 .align 32 bn_mul4x_mont_gather5: .cfi_startproc .byte 0x67 mov %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: ___ $code.=<<___ if ($addx); and \$0x80108,%r11d cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 je .Lmulx4x_enter ___ $code.=<<___; push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmul4x_prologue: .byte 0x67 shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). This is done to allow memory disambiguation # logic do its magic. [Extra [num] is allocated in order # to align with bn_power5's frame, which is cleansed after # completing exponentiation. Extra 256 bytes is for power mask # calculated from 7th argument, the index.] 
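	# [Editorial note, not part of the original]  The 0x80108 mask
	# tested at .Lmul4x_enter above covers three CPUID.(EAX=7).EBX
	# feature bits as cached in OPENSSL_ia32cap_P: bit 3 (BMI1),
	# bit 8 (BMI2) and bit 19 (ADX), i.e. (1<<3)|(1<<8)|(1<<19) ==
	# 0x80108.  Only when all three are present is the MULX/ADCX/ADOX
	# path (.Lmulx4x_enter) taken; the 0x80100 mask used in the
	# companion x86_64-mont.pl checks just BMI2 and ADX.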
# lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmul4xsp_alt sub %r11,%rbp # align with $rp lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lmul4xsp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmul4x_page_walk jmp .Lmul4x_page_walk_done .Lmul4x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmul4x_page_walk .Lmul4x_page_walk_done: neg $num mov %rax,40(%rsp) .cfi_cfa_expression %rsp+40,deref,+8 .Lmul4x_body: call mul4x_internal mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul4x_epilogue: ret .cfi_endproc .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 .type mul4x_internal,\@abi-omnipotent .align 32 mul4x_internal: .cfi_startproc shl \$5,$num # $num was in bytes movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument, index lea .Linc(%rip),%rax lea 128(%rdx,$num),%r13 # end of powers table (+size optimization) shr \$5,$num # restore $num ___ $bp="%r12"; $STRIDE=2**5*8; # 5 is "window size" $N=$STRIDE/4; # should match cache line size $tp=$i; $code.=<<___; movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 lea 88-112(%rsp,$num),%r10 # place the mask after tp[num+1] (+ICache optimization) lea 128(%rdx),$bp # size optimization pshufd \$0,%xmm5,%xmm5 # broadcast index movdqa %xmm1,%xmm4 .byte 0x67,0x67 movdqa %xmm1,%xmm2 ___ ######################################################################## # calculate mask by comparing 0..31 to index and save result to stack # $code.=<<___; paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 # compare to 1,0 .byte 0x67 movdqa %xmm4,%xmm3 ___ for($i=0;$i<$STRIDE/16-4;$i+=4) { $code.=<<___; paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 # compare to 3,2 movdqa %xmm0,`16*($i+0)+112`(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 # compare to 5,4 movdqa %xmm1,`16*($i+1)+112`(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 # compare to 7,6 movdqa %xmm2,`16*($i+2)+112`(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,`16*($i+3)+112`(%r10) movdqa %xmm4,%xmm3 ___ } $code.=<<___; # last iteration can be optimized paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,`16*($i+0)+112`(%r10) paddd %xmm2,%xmm3 .byte 0x67 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,`16*($i+1)+112`(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,`16*($i+2)+112`(%r10) pand `16*($i+0)-128`($bp),%xmm0 # while it's still in register pand `16*($i+1)-128`($bp),%xmm1 pand `16*($i+2)-128`($bp),%xmm2 movdqa %xmm3,`16*($i+3)+112`(%r10) pand `16*($i+3)-128`($bp),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ for($i=0;$i<$STRIDE/16-4;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`($bp),%xmm4 movdqa `16*($i+1)-128`($bp),%xmm5 movdqa `16*($i+2)-128`($bp),%xmm2 pand `16*($i+0)+112`(%r10),%xmm4 movdqa `16*($i+3)-128`($bp),%xmm3 pand `16*($i+1)+112`(%r10),%xmm5 por %xmm4,%xmm0 pand `16*($i+2)+112`(%r10),%xmm2 por %xmm5,%xmm1 pand `16*($i+3)+112`(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ } $code.=<<___; por 
%xmm1,%xmm0 pshufd \$0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 lea $STRIDE($bp),$bp movq %xmm0,$m0 # m0=bp[0] mov %r13,16+8(%rsp) # save end of b[num] mov $rp, 56+8(%rsp) # save $rp mov ($n0),$n0 # pull n0[0] value mov ($ap),%rax lea ($ap,$num),$ap # end of a[num] neg $num mov $n0,$m1 mulq $m0 # ap[0]*bp[0] mov %rax,$A[0] mov ($np),%rax imulq $A[0],$m1 # "tp[0]"*n0 lea 64+8(%rsp),$tp mov %rdx,$A[1] mulq $m1 # np[0]*m1 add %rax,$A[0] # discarded mov 8($ap,$num),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 add %rax,$A[1] mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 add %rax,$N[1] mov 16($ap,$num),%rax adc \$0,%rdx add $A[1],$N[1] lea 4*8($num),$j # j=4 lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) mov %rdx,$N[0] jmp .L1st4x .align 32 .L1st4x: mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov 8*0($np),%rax adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov 8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] lea 8*4($np),$np adc \$0,%rdx mov $N[1],($tp) # tp[j-1] mov %rdx,$N[0] add \$32,$j # j+=4 jnz .L1st4x mulq $m0 # ap[j]*bp[0] add %rax,$A[0] mov -8*2($np),%rax lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap),%rax adc \$0,%rdx add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[0] add %rax,$A[1] mov -8*1($np),%rax adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$num),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0] adc \$0,%rdx mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[0] lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] mov $N[0],-8($tp) jmp .Louter4x .align 32 .Louter4x: lea 16+128($tp),%rdx # where 256-byte mask is (+size optimization) pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 ___ for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`($bp),%xmm0 movdqa `16*($i+1)-128`($bp),%xmm1 movdqa `16*($i+2)-128`($bp),%xmm2 movdqa `16*($i+3)-128`($bp),%xmm3 pand `16*($i+0)-128`(%rdx),%xmm0 pand `16*($i+1)-128`(%rdx),%xmm1 por %xmm0,%xmm4 pand `16*($i+2)-128`(%rdx),%xmm2 por %xmm1,%xmm5 pand `16*($i+3)-128`(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 ___ } $code.=<<___; por %xmm5,%xmm4 pshufd \$0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 lea $STRIDE($bp),$bp movq %xmm0,$m0 # m0=bp[i] mov ($tp,$num),$A[0] mov $n0,$m1 mulq $m0 # ap[0]*bp[i] add %rax,$A[0] # ap[0]*bp[i]+tp[0] mov ($np),%rax adc \$0,%rdx imulq $A[0],$m1 # tp[0]*n0 mov %rdx,$A[1] mov $N[1],($tp) # store upmost overflow bit lea ($tp,$num),$tp # rewind $tp mulq $m1 # np[0]*m1 add %rax,$A[0] # "$N[0]", discarded mov 8($ap,$num),%rax adc \$0,%rdx mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] # +tp[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add 
%rax,$N[1] mov 16($ap,$num),%rax adc \$0,%rdx add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j] lea 4*8($num),$j # j=4 lea 8*4($np),$np adc \$0,%rdx mov %rdx,$N[0] jmp .Linner4x .align 32 .Linner4x: mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[1],-32($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov -8*1($np),%rax adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov 8*0($np),%rax adc \$0,%rdx add ($tp),$A[0] # ap[j]*bp[i]+tp[j] adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov 8($ap,$j),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[1],-16($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov 8*1($np),%rax adc \$0,%rdx add 8($tp),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov 16($ap,$j),%rax adc \$0,%rdx add $A[1],$N[1] lea 8*4($np),$np adc \$0,%rdx mov $N[0],-8($tp) # tp[j-1] mov %rdx,$N[0] add \$32,$j # j+=4 jnz .Linner4x mulq $m0 # ap[j]*bp[i] add %rax,$A[0] mov -8*2($np),%rax adc \$0,%rdx add 16($tp),$A[0] # ap[j]*bp[i]+tp[j] lea 32($tp),$tp adc \$0,%rdx mov %rdx,$A[1] mulq $m1 # np[j]*m1 add %rax,$N[0] mov -8($ap),%rax adc \$0,%rdx add $A[0],$N[0] adc \$0,%rdx mov $N[1],-32($tp) # tp[j-1] mov %rdx,$N[1] mulq $m0 # ap[j]*bp[i] add %rax,$A[1] mov $m1,%rax mov -8*1($np),$m1 adc \$0,%rdx add -8($tp),$A[1] adc \$0,%rdx mov %rdx,$A[0] mulq $m1 # np[j]*m1 add %rax,$N[1] mov ($ap,$num),%rax # ap[0] adc \$0,%rdx add $A[1],$N[1] adc \$0,%rdx mov $N[0],-24($tp) # tp[j-1] mov %rdx,$N[0] mov $N[1],-16($tp) # tp[j-1] lea ($np,$num),$np # rewind $np xor $N[1],$N[1] add $A[0],$N[0] adc \$0,$N[1] add ($tp),$N[0] # pull upmost overflow bit adc \$0,$N[1] # upmost overflow bit mov $N[0],-8($tp) cmp 16+8(%rsp),$bp jb .Louter4x ___ if (1) { $code.=<<___; xor %rax,%rax sub $N[0],$m1 # compare top-most words adc $j,$j # $j is zero or $j,$N[1] sub $N[1],%rax # %rax=-$N[1] lea ($tp,$num),%rbx # tptr in .sqr4x_sub mov ($np),%r12 lea ($np),%rbp # nptr in .sqr4x_sub mov %r9,%rcx sar \$3+2,%rcx mov 56+8(%rsp),%rdi # rptr in .sqr4x_sub dec %r12 # so that after 'not' we get -n[0] xor %r10,%r10 mov 8*1(%rbp),%r13 mov 8*2(%rbp),%r14 mov 8*3(%rbp),%r15 jmp .Lsqr4x_sub_entry ___ } else { my @ri=("%rax",$bp,$m0,$m1); my $rp="%rdx"; $code.=<<___ xor \$1,$N[1] lea ($tp,$num),$tp # rewind $tp sar \$5,$num # cf=0 lea ($np,$N[1],8),$np mov 56+8(%rsp),$rp # restore $rp jmp .Lsub4x .align 32 .Lsub4x: .byte 0x66 mov 8*0($tp),@ri[0] mov 8*1($tp),@ri[1] .byte 0x66 sbb 16*0($np),@ri[0] mov 8*2($tp),@ri[2] sbb 16*1($np),@ri[1] mov 3*8($tp),@ri[3] lea 4*8($tp),$tp sbb 16*2($np),@ri[2] mov @ri[0],8*0($rp) sbb 16*3($np),@ri[3] lea 16*4($np),$np mov @ri[1],8*1($rp) mov @ri[2],8*2($rp) mov @ri[3],8*3($rp) lea 8*4($rp),$rp inc $num jnz .Lsub4x ret ___ } $code.=<<___; .cfi_endproc .size mul4x_internal,.-mul4x_internal ___ }}} {{{ ###################################################################### # void bn_power5( my $rptr="%rdi"; # BN_ULONG *rptr, my $aptr="%rsi"; # const BN_ULONG *aptr, my $bptr="%rdx"; # const void *table, my $nptr="%rcx"; # const BN_ULONG *nptr, my $n0 ="%r8"; # const BN_ULONG *n0); my $num ="%r9"; # int num, has to be divisible by 8 # int 
pwr my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); my @A0=("%r10","%r11"); my @A1=("%r12","%r13"); my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); $code.=<<___; .globl bn_power5 .type bn_power5,\@function,6 .align 32 bn_power5: .cfi_startproc mov %rsp,%rax .cfi_def_cfa_register %rax ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d and \$0x80108,%r11d cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 je .Lpowerx5_enter ___ $code.=<<___; push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lpower5_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10d # 3*$num neg $num mov ($n0),$n0 # *n0 ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). This is done to allow memory disambiguation # logic do its magic. [Extra 256 bytes is for power mask # calculated from 7th argument, the index.] # lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwr_sp_alt sub %r11,%rbp # align with $aptr lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lpwr_sp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lpwr_page_walk jmp .Lpwr_page_walk_done .Lpwr_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lpwr_page_walk .Lpwr_page_walk_done: mov $num,%r10 neg $num ############################################################## # Stack layout # # +0 saved $num, used in reduction section # +8 &t[2*$num], used in reduction section # +32 saved *n0 # +40 saved %rsp # +48 t[2*$num] # mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lpower5_body: movq $rptr,%xmm1 # save $rptr, used in sqr8x movq $nptr,%xmm2 # save $nptr movq %r10, %xmm3 # -$num, used in sqr8x movq $bptr,%xmm4 call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal movq %xmm2,$nptr movq %xmm4,$bptr mov $aptr,$rptr mov 40(%rsp),%rax lea 32(%rsp),$n0 call mul4x_internal mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpower5_epilogue: ret .cfi_endproc .size bn_power5,.-bn_power5 .globl bn_sqr8x_internal .hidden bn_sqr8x_internal .type bn_sqr8x_internal,\@abi-omnipotent .align 32 bn_sqr8x_internal: __bn_sqr8x_internal: .cfi_startproc ############################################################## # Squaring part: # # a) multiply-n-add everything but a[i]*a[i]; # b) shift result of a) by 1 to the left and accumulate # a[i]*a[i] products; # ############################################################## # a[1]a[0] # a[2]a[0] # a[3]a[0] # a[2]a[1] # a[4]a[0] # a[3]a[1] # a[5]a[0] # a[4]a[1] # a[3]a[2] # a[6]a[0] # a[5]a[1] # a[4]a[2] # a[7]a[0] # a[6]a[1] # a[5]a[2] # 
a[4]a[3] # a[7]a[1] # a[6]a[2] # a[5]a[3] # a[7]a[2] # a[6]a[3] # a[5]a[4] # a[7]a[3] # a[6]a[4] # a[7]a[4] # a[6]a[5] # a[7]a[5] # a[7]a[6] # a[1]a[0] # a[2]a[0] # a[3]a[0] # a[4]a[0] # a[5]a[0] # a[6]a[0] # a[7]a[0] # a[2]a[1] # a[3]a[1] # a[4]a[1] # a[5]a[1] # a[6]a[1] # a[7]a[1] # a[3]a[2] # a[4]a[2] # a[5]a[2] # a[6]a[2] # a[7]a[2] # a[4]a[3] # a[5]a[3] # a[6]a[3] # a[7]a[3] # a[5]a[4] # a[6]a[4] # a[7]a[4] # a[6]a[5] # a[7]a[5] # a[7]a[6] # a[0]a[0] # a[1]a[1] # a[2]a[2] # a[3]a[3] # a[4]a[4] # a[5]a[5] # a[6]a[6] # a[7]a[7] lea 32(%r10),$i # $i=-($num-32) lea ($aptr,$num),$aptr # end of a[] buffer, ($aptr,$i)=&ap[2] mov $num,$j # $j=$num # comments apply to $num==8 case mov -32($aptr,$i),$a0 # a[0] lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] mov -24($aptr,$i),%rax # a[1] lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] mov -16($aptr,$i),$ai # a[2] mov %rax,$a1 mul $a0 # a[1]*a[0] mov %rax,$A0[0] # a[1]*a[0] mov $ai,%rax # a[2] mov %rdx,$A0[1] mov $A0[0],-24($tptr,$i) # t[1] mul $a0 # a[2]*a[0] add %rax,$A0[1] mov $ai,%rax adc \$0,%rdx mov $A0[1],-16($tptr,$i) # t[2] mov %rdx,$A0[0] mov -8($aptr,$i),$ai # a[3] mul $a1 # a[2]*a[1] mov %rax,$A1[0] # a[2]*a[1]+t[3] mov $ai,%rax mov %rdx,$A1[1] lea ($i),$j mul $a0 # a[3]*a[0] add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] mov $ai,%rax mov %rdx,$A0[1] adc \$0,$A0[1] add $A1[0],$A0[0] adc \$0,$A0[1] mov $A0[0],-8($tptr,$j) # t[3] jmp .Lsqr4x_1st .align 32 .Lsqr4x_1st: mov ($aptr,$j),$ai # a[4] mul $a1 # a[3]*a[1] add %rax,$A1[1] # a[3]*a[1]+t[4] mov $ai,%rax mov %rdx,$A1[0] adc \$0,$A1[0] mul $a0 # a[4]*a[0] add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] mov $ai,%rax # a[3] mov 8($aptr,$j),$ai # a[5] mov %rdx,$A0[0] adc \$0,$A0[0] add $A1[1],$A0[1] adc \$0,$A0[0] mul $a1 # a[4]*a[3] add %rax,$A1[0] # a[4]*a[3]+t[5] mov $ai,%rax mov $A0[1],($tptr,$j) # t[4] mov %rdx,$A1[1] adc \$0,$A1[1] mul $a0 # a[5]*a[2] add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] mov $ai,%rax mov 16($aptr,$j),$ai # a[6] mov %rdx,$A0[1] adc \$0,$A0[1] add $A1[0],$A0[0] adc \$0,$A0[1] mul $a1 # a[5]*a[3] add %rax,$A1[1] # a[5]*a[3]+t[6] mov $ai,%rax mov $A0[0],8($tptr,$j) # t[5] mov %rdx,$A1[0] adc \$0,$A1[0] mul $a0 # a[6]*a[2] add %rax,$A0[1] # a[6]*a[2]+a[5]*a[3]+t[6] mov $ai,%rax # a[3] mov 24($aptr,$j),$ai # a[7] mov %rdx,$A0[0] adc \$0,$A0[0] add $A1[1],$A0[1] adc \$0,$A0[0] mul $a1 # a[6]*a[5] add %rax,$A1[0] # a[6]*a[5]+t[7] mov $ai,%rax mov $A0[1],16($tptr,$j) # t[6] mov %rdx,$A1[1] adc \$0,$A1[1] lea 32($j),$j mul $a0 # a[7]*a[4] add %rax,$A0[0] # a[7]*a[4]+a[6]*a[5]+t[6] mov $ai,%rax mov %rdx,$A0[1] adc \$0,$A0[1] add $A1[0],$A0[0] adc \$0,$A0[1] mov $A0[0],-8($tptr,$j) # t[7] cmp \$0,$j jne .Lsqr4x_1st mul $a1 # a[7]*a[5] add %rax,$A1[1] lea 16($i),$i adc \$0,%rdx add $A0[1],$A1[1] adc \$0,%rdx mov $A1[1],($tptr) # t[8] mov %rdx,$A1[0] mov %rdx,8($tptr) # t[9] jmp .Lsqr4x_outer .align 32 .Lsqr4x_outer: # comments apply to $num==6 case mov -32($aptr,$i),$a0 # a[0] lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] mov -24($aptr,$i),%rax # a[1] lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] mov -16($aptr,$i),$ai # a[2] mov %rax,$a1 mul $a0 # a[1]*a[0] mov -24($tptr,$i),$A0[0] # t[1] add %rax,$A0[0] # a[1]*a[0]+t[1] mov $ai,%rax # a[2] adc \$0,%rdx mov $A0[0],-24($tptr,$i) # t[1] mov %rdx,$A0[1] mul $a0 # a[2]*a[0] add %rax,$A0[1] mov $ai,%rax adc \$0,%rdx add -16($tptr,$i),$A0[1] # a[2]*a[0]+t[2] mov %rdx,$A0[0] adc \$0,$A0[0] mov $A0[1],-16($tptr,$i) # t[2] xor $A1[0],$A1[0] mov -8($aptr,$i),$ai # a[3] mul $a1 # 
a[2]*a[1] add %rax,$A1[0] # a[2]*a[1]+t[3] mov $ai,%rax adc \$0,%rdx add -8($tptr,$i),$A1[0] mov %rdx,$A1[1] adc \$0,$A1[1] mul $a0 # a[3]*a[0] add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] mov $ai,%rax adc \$0,%rdx add $A1[0],$A0[0] mov %rdx,$A0[1] adc \$0,$A0[1] mov $A0[0],-8($tptr,$i) # t[3] lea ($i),$j jmp .Lsqr4x_inner .align 32 .Lsqr4x_inner: mov ($aptr,$j),$ai # a[4] mul $a1 # a[3]*a[1] add %rax,$A1[1] # a[3]*a[1]+t[4] mov $ai,%rax mov %rdx,$A1[0] adc \$0,$A1[0] add ($tptr,$j),$A1[1] adc \$0,$A1[0] .byte 0x67 mul $a0 # a[4]*a[0] add %rax,$A0[1] # a[4]*a[0]+a[3]*a[1]+t[4] mov $ai,%rax # a[3] mov 8($aptr,$j),$ai # a[5] mov %rdx,$A0[0] adc \$0,$A0[0] add $A1[1],$A0[1] adc \$0,$A0[0] mul $a1 # a[4]*a[3] add %rax,$A1[0] # a[4]*a[3]+t[5] mov $A0[1],($tptr,$j) # t[4] mov $ai,%rax mov %rdx,$A1[1] adc \$0,$A1[1] add 8($tptr,$j),$A1[0] lea 16($j),$j # j++ adc \$0,$A1[1] mul $a0 # a[5]*a[2] add %rax,$A0[0] # a[5]*a[2]+a[4]*a[3]+t[5] mov $ai,%rax adc \$0,%rdx add $A1[0],$A0[0] mov %rdx,$A0[1] adc \$0,$A0[1] mov $A0[0],-8($tptr,$j) # t[5], "preloaded t[1]" below cmp \$0,$j jne .Lsqr4x_inner .byte 0x67 mul $a1 # a[5]*a[3] add %rax,$A1[1] adc \$0,%rdx add $A0[1],$A1[1] adc \$0,%rdx mov $A1[1],($tptr) # t[6], "preloaded t[2]" below mov %rdx,$A1[0] mov %rdx,8($tptr) # t[7], "preloaded t[3]" below add \$16,$i jnz .Lsqr4x_outer # comments apply to $num==4 case mov -32($aptr),$a0 # a[0] lea 48+8(%rsp,$num,2),$tptr # end of tp[] buffer, &tp[2*$num] mov -24($aptr),%rax # a[1] lea -32($tptr,$i),$tptr # end of tp[] window, &tp[2*$num-"$i"] mov -16($aptr),$ai # a[2] mov %rax,$a1 mul $a0 # a[1]*a[0] add %rax,$A0[0] # a[1]*a[0]+t[1], preloaded t[1] mov $ai,%rax # a[2] mov %rdx,$A0[1] adc \$0,$A0[1] mul $a0 # a[2]*a[0] add %rax,$A0[1] mov $ai,%rax mov $A0[0],-24($tptr) # t[1] mov %rdx,$A0[0] adc \$0,$A0[0] add $A1[1],$A0[1] # a[2]*a[0]+t[2], preloaded t[2] mov -8($aptr),$ai # a[3] adc \$0,$A0[0] mul $a1 # a[2]*a[1] add %rax,$A1[0] # a[2]*a[1]+t[3], preloaded t[3] mov $ai,%rax mov $A0[1],-16($tptr) # t[2] mov %rdx,$A1[1] adc \$0,$A1[1] mul $a0 # a[3]*a[0] add %rax,$A0[0] # a[3]*a[0]+a[2]*a[1]+t[3] mov $ai,%rax mov %rdx,$A0[1] adc \$0,$A0[1] add $A1[0],$A0[0] adc \$0,$A0[1] mov $A0[0],-8($tptr) # t[3] mul $a1 # a[3]*a[1] add %rax,$A1[1] mov -16($aptr),%rax # a[2] adc \$0,%rdx add $A0[1],$A1[1] adc \$0,%rdx mov $A1[1],($tptr) # t[4] mov %rdx,$A1[0] mov %rdx,8($tptr) # t[5] mul $ai # a[2]*a[3] ___ { my ($shift,$carry)=($a0,$a1); my @S=(@A1,$ai,$n0); $code.=<<___; add \$16,$i xor $shift,$shift sub $num,$i # $i=16-$num xor $carry,$carry add $A1[0],%rax # t[5] adc \$0,%rdx mov %rax,8($tptr) # t[5] mov %rdx,16($tptr) # t[6] mov $carry,24($tptr) # t[7] mov -16($aptr,$i),%rax # a[0] lea 48+8(%rsp),$tptr xor $A0[0],$A0[0] # t[0] mov 8($tptr),$A0[1] # t[1] lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov -8($aptr,$i),%rax # a[i+1] # prefetch mov $S[0],($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift mov $S[1],8($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 40($tptr),$A0[1] # t[2*i+2+1] 
# prefetch adc %rax,$S[2] mov 0($aptr,$i),%rax # a[i+1] # prefetch mov $S[2],16($tptr) adc %rdx,$S[3] lea 16($i),$i mov $S[3],24($tptr) sbb $carry,$carry # mov cf,$carry lea 64($tptr),$tptr jmp .Lsqr4x_shift_n_add .align 32 .Lsqr4x_shift_n_add: lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov -8($aptr,$i),%rax # a[i+1] # prefetch mov $S[0],-32($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift mov $S[1],-24($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 mov 0($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 8($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[2] mov 0($aptr,$i),%rax # a[i+1] # prefetch mov $S[2],-16($tptr) adc %rdx,$S[3] lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift mov $S[3],-8($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 mov 16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 24($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov 8($aptr,$i),%rax # a[i+1] # prefetch mov $S[0],0($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift mov $S[1],8($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 mov 32($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov 40($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[2] mov 16($aptr,$i),%rax # a[i+1] # prefetch mov $S[2],16($tptr) adc %rdx,$S[3] mov $S[3],24($tptr) sbb $carry,$carry # mov cf,$carry lea 64($tptr),$tptr add \$32,$i jnz .Lsqr4x_shift_n_add lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift .byte 0x67 shr \$63,$A0[0] lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[1] # | t[2*i]>>63 mov -16($tptr),$A0[0] # t[2*i+2] # prefetch mov $A0[1],$shift # shift=t[2*i+1]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch adc %rax,$S[0] mov -8($aptr),%rax # a[i+1] # prefetch mov $S[0],-32($tptr) adc %rdx,$S[1] lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift mov $S[1],-24($tptr) sbb $carry,$carry # mov cf,$carry shr \$63,$A0[0] lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 | shr \$63,$A0[1] or $A0[0],$S[3] # | t[2*i]>>63 mul %rax # a[i]*a[i] neg $carry # mov $carry,cf adc %rax,$S[2] adc %rdx,$S[3] mov $S[2],-16($tptr) mov $S[3],-8($tptr) ___ } ###################################################################### # Montgomery reduction part, "word-by-word" algorithm. # # This new path is inspired by multiple submissions from Intel, by # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, # Vinodh Gopal... 
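# [Editorial sketch, not part of the original module]  A whole-operand
# model of the word-by-word reduction below, written with Math::BigInt
# for clarity rather than speed.  Here $t is the 2*num-limb input,
# $n the (odd) modulus and $n0 == -n^{-1} mod 2^64, the saved *n0 that
# the loop below pulls with imulq 32+8(%rsp); the return value is
# t*2^(-64*num) mod n, i.e. what t[] holds once the reduction loop and
# the final conditional subtraction (the post-condition further down)
# have both run.  The sub name is illustrative only and is never called
# by this module.

use Math::BigInt;			# needed only for this sketch

sub redc_model {
    my ($t, $n, $n0, $num) = @_;	# $t, $n, $n0 are Math::BigInt objects
    my $b = Math::BigInt->bone->blsft(64);	# one 64-bit limb
    for (1 .. $num) {
	my $m = (($t % $b) * $n0) % $b;	# multiplier that clears the low limb
	$t = ($t + $m * $n) / $b;	# low limb is now zero; shift it out
    }
    $t = $t - $n if $t >= $n;		# final conditional subtraction
    return $t;
}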
{ my ($nptr,$tptr,$carry,$m0)=("%rbp","%rdi","%rsi","%rbx"); $code.=<<___; movq %xmm2,$nptr __bn_sqr8x_reduction: xor %rax,%rax lea ($nptr,$num),%rcx # end of n[] lea 48+8(%rsp,$num,2),%rdx # end of t[] buffer mov %rcx,0+8(%rsp) lea 48+8(%rsp,$num),$tptr # end of initial t[] window mov %rdx,8+8(%rsp) neg $num jmp .L8x_reduction_loop .align 32 .L8x_reduction_loop: lea ($tptr,$num),$tptr # start of current t[] window .byte 0x66 mov 8*0($tptr),$m0 mov 8*1($tptr),%r9 mov 8*2($tptr),%r10 mov 8*3($tptr),%r11 mov 8*4($tptr),%r12 mov 8*5($tptr),%r13 mov 8*6($tptr),%r14 mov 8*7($tptr),%r15 mov %rax,(%rdx) # store top-most carry bit lea 8*8($tptr),$tptr .byte 0x67 mov $m0,%r8 imulq 32+8(%rsp),$m0 # n0*a[0] mov 8*0($nptr),%rax # n[0] mov \$8,%ecx jmp .L8x_reduce .align 32 .L8x_reduce: mulq $m0 mov 8*1($nptr),%rax # n[1] neg %r8 mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 mov $m0,48-8+8(%rsp,%rcx,8) # put aside n0*a[i] mov %rdx,%r9 adc \$0,%r9 mulq $m0 add %rax,%r10 mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov 32+8(%rsp),$carry # pull n0, borrow $carry mov %rdx,%r10 adc \$0,%r10 mulq $m0 add %rax,%r11 mov 8*4($nptr),%rax adc \$0,%rdx imulq %r8,$carry # modulo-scheduled add %r11,%r10 mov %rdx,%r11 adc \$0,%r11 mulq $m0 add %rax,%r12 mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 adc \$0,%r12 mulq $m0 add %rax,%r13 mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 adc \$0,%r13 mulq $m0 add %rax,%r14 mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 adc \$0,%r14 mulq $m0 mov $carry,$m0 # n0*a[i] add %rax,%r15 mov 8*0($nptr),%rax # n[0] adc \$0,%rdx add %r15,%r14 mov %rdx,%r15 adc \$0,%r15 dec %ecx jnz .L8x_reduce lea 8*8($nptr),$nptr xor %rax,%rax mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? jae .L8x_no_tail .byte 0x66 add 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 sbb $carry,$carry # top carry mov 48+56+8(%rsp),$m0 # pull n0*a[0] mov \$8,%ecx mov 8*0($nptr),%rax jmp .L8x_tail .align 32 .L8x_tail: mulq $m0 add %rax,%r8 mov 8*1($nptr),%rax mov %r8,($tptr) # save result mov %rdx,%r8 adc \$0,%r8 mulq $m0 add %rax,%r9 mov 8*2($nptr),%rax adc \$0,%rdx add %r9,%r8 lea 8($tptr),$tptr # $tptr++ mov %rdx,%r9 adc \$0,%r9 mulq $m0 add %rax,%r10 mov 8*3($nptr),%rax adc \$0,%rdx add %r10,%r9 mov %rdx,%r10 adc \$0,%r10 mulq $m0 add %rax,%r11 mov 8*4($nptr),%rax adc \$0,%rdx add %r11,%r10 mov %rdx,%r11 adc \$0,%r11 mulq $m0 add %rax,%r12 mov 8*5($nptr),%rax adc \$0,%rdx add %r12,%r11 mov %rdx,%r12 adc \$0,%r12 mulq $m0 add %rax,%r13 mov 8*6($nptr),%rax adc \$0,%rdx add %r13,%r12 mov %rdx,%r13 adc \$0,%r13 mulq $m0 add %rax,%r14 mov 8*7($nptr),%rax adc \$0,%rdx add %r14,%r13 mov %rdx,%r14 adc \$0,%r14 mulq $m0 mov 48-16+8(%rsp,%rcx,8),$m0# pull n0*a[i] add %rax,%r15 adc \$0,%rdx add %r15,%r14 mov 8*0($nptr),%rax # pull n[0] mov %rdx,%r15 adc \$0,%r15 dec %ecx jnz .L8x_tail lea 8*8($nptr),$nptr mov 8+8(%rsp),%rdx # pull end of t[] cmp 0+8(%rsp),$nptr # end of n[]? jae .L8x_tail_done # break out of loop mov 48+56+8(%rsp),$m0 # pull n0*a[0] neg $carry mov 8*0($nptr),%rax # pull n[0] adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 sbb $carry,$carry # top carry mov \$8,%ecx jmp .L8x_tail .align 32 .L8x_tail_done: xor %rax,%rax add (%rdx),%r8 # can this overflow? 
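# The 4x-unrolled post-condition that follows (__bn_post4x_internal and its
# MULX twin) subtracts the modulus exactly when the reduced value needs it,
# by ANDing n's limbs with an all-ones/all-zero mask (not/and, resp. andn)
# instead of branching.  A functional Perl stand-in; Perl itself gives no
# timing guarantee, and the real code derives the mask from the reduction's
# top-most carry rather than a full comparison.  Names are illustrative.
sub cond_subtract_ref {
	my ($t, $n, $w) = @_;		# equal-length limb arrays, LSW first
	my $mask_w = (1 << $w) - 1;
	my $ge = 1;			# does t >= n ?
	for (my $i = $#$t; $i >= 0; $i--) {
		next if $t->[$i] == $n->[$i];
		$ge = $t->[$i] > $n->[$i] ? 1 : 0;
		last;
	}
	my $mask = $ge ? $mask_w : 0;	# all-ones or all-zero
	my ($borrow, @r) = (0);
	for my $i (0 .. $#$t) {
		my $d = $t->[$i] - ($n->[$i] & $mask) - $borrow;
		$borrow = $d < 0 ? 1 : 0;
		push @r, $d & $mask_w;
	}
	return \@r;
}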
adc \$0,%r9 adc \$0,%r10 adc \$0,%r11 adc \$0,%r12 adc \$0,%r13 adc \$0,%r14 adc \$0,%r15 adc \$0,%rax neg $carry .L8x_no_tail: adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 adc \$0,%rax # top-most carry mov -8($nptr),%rcx # np[num-1] xor $carry,$carry movq %xmm2,$nptr # restore $nptr mov %r8,8*0($tptr) # store top 512 bits mov %r9,8*1($tptr) movq %xmm3,$num # $num is %r9, can't be moved upwards mov %r10,8*2($tptr) mov %r11,8*3($tptr) mov %r12,8*4($tptr) mov %r13,8*5($tptr) mov %r14,8*6($tptr) mov %r15,8*7($tptr) lea 8*8($tptr),$tptr cmp %rdx,$tptr # end of t[]? jb .L8x_reduction_loop ret .cfi_endproc .size bn_sqr8x_internal,.-bn_sqr8x_internal ___ } ############################################################## # Post-condition, 4x unrolled # { my ($tptr,$nptr)=("%rbx","%rbp"); $code.=<<___; .type __bn_post4x_internal,\@abi-omnipotent .align 32 __bn_post4x_internal: .cfi_startproc mov 8*0($nptr),%r12 lea (%rdi,$num),$tptr # %rdi was $tptr above mov $num,%rcx movq %xmm1,$rptr # restore $rptr neg %rax movq %xmm1,$aptr # prepare for back-to-back call sar \$3+2,%rcx dec %r12 # so that after 'not' we get -n[0] xor %r10,%r10 mov 8*1($nptr),%r13 mov 8*2($nptr),%r14 mov 8*3($nptr),%r15 jmp .Lsqr4x_sub_entry .align 16 .Lsqr4x_sub: mov 8*0($nptr),%r12 mov 8*1($nptr),%r13 mov 8*2($nptr),%r14 mov 8*3($nptr),%r15 .Lsqr4x_sub_entry: lea 8*4($nptr),$nptr not %r12 not %r13 not %r14 not %r15 and %rax,%r12 and %rax,%r13 and %rax,%r14 and %rax,%r15 neg %r10 # mov %r10,%cf adc 8*0($tptr),%r12 adc 8*1($tptr),%r13 adc 8*2($tptr),%r14 adc 8*3($tptr),%r15 mov %r12,8*0($rptr) lea 8*4($tptr),$tptr mov %r13,8*1($rptr) sbb %r10,%r10 # mov %cf,%r10 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx # pass %cf jnz .Lsqr4x_sub mov $num,%r10 # prepare for back-to-back call neg $num # restore $num ret .cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal ___ } { $code.=<<___; .globl bn_from_montgomery .type bn_from_montgomery,\@abi-omnipotent .align 32 bn_from_montgomery: .cfi_startproc testl \$7,`($win64?"48(%rsp)":"%r9d")` jz bn_from_mont8x xor %eax,%eax ret .cfi_endproc .size bn_from_montgomery,.-bn_from_montgomery .type bn_from_mont8x,\@function,6 .align 32 bn_from_mont8x: .cfi_startproc .byte 0x67 mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lfrom_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). The stack is allocated to aligned with # bn_power5's frame, and as bn_from_montgomery happens to be # last operation, we use the opportunity to cleanse it. 
# lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lfrom_sp_alt sub %r11,%rbp # align with $aptr lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lfrom_sp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lfrom_page_walk jmp .Lfrom_page_walk_done .Lfrom_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lfrom_page_walk .Lfrom_page_walk_done: mov $num,%r10 neg $num ############################################################## # Stack layout # # +0 saved $num, used in reduction section # +8 &t[2*$num], used in reduction section # +32 saved *n0 # +40 saved %rsp # +48 t[2*$num] # mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lfrom_body: mov $num,%r11 lea 48(%rsp),%rax pxor %xmm0,%xmm0 jmp .Lmul_by_1 .align 32 .Lmul_by_1: movdqu ($aptr),%xmm1 movdqu 16($aptr),%xmm2 movdqu 32($aptr),%xmm3 movdqa %xmm0,(%rax,$num) movdqu 48($aptr),%xmm4 movdqa %xmm0,16(%rax,$num) .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 # lea 64($aptr),$aptr movdqa %xmm1,(%rax) movdqa %xmm0,32(%rax,$num) movdqa %xmm2,16(%rax) movdqa %xmm0,48(%rax,$num) movdqa %xmm3,32(%rax) movdqa %xmm4,48(%rax) lea 64(%rax),%rax sub \$64,%r11 jnz .Lmul_by_1 movq $rptr,%xmm1 movq $nptr,%xmm2 .byte 0x67 mov $nptr,%rbp movq %r10, %xmm3 # -num ___ $code.=<<___ if ($addx); mov OPENSSL_ia32cap_P+8(%rip),%r11d and \$0x80108,%r11d cmp \$0x80108,%r11d # check for AD*X+BMI2+BMI1 jne .Lfrom_mont_nox lea (%rax,$num),$rptr call __bn_sqrx8x_reduction call __bn_postx4x_internal pxor %xmm0,%xmm0 lea 48(%rsp),%rax jmp .Lfrom_mont_zero .align 32 .Lfrom_mont_nox: ___ $code.=<<___; call __bn_sqr8x_reduction call __bn_post4x_internal pxor %xmm0,%xmm0 lea 48(%rsp),%rax jmp .Lfrom_mont_zero .align 32 .Lfrom_mont_zero: mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 movdqa %xmm0,16*0(%rax) movdqa %xmm0,16*1(%rax) movdqa %xmm0,16*2(%rax) movdqa %xmm0,16*3(%rax) lea 16*4(%rax),%rax sub \$32,$num jnz .Lfrom_mont_zero mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lfrom_epilogue: ret .cfi_endproc .size bn_from_mont8x,.-bn_from_mont8x ___ } }}} if ($addx) {{{ my $bp="%rdx"; # restore original value $code.=<<___; .type bn_mulx4x_mont_gather5,\@function,6 .align 32 bn_mulx4x_mont_gather5: .cfi_startproc mov %rsp,%rax .cfi_def_cfa_register %rax .Lmulx4x_enter: push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmulx4x_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num # -$num mov ($n0),$n0 # *n0 ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). This is done to allow memory disambiguation # logic do its magic. [Extra [num] is allocated in order # to align with bn_power5's frame, which is cleansed after # completing exponentiation. 
Extra 256 bytes is for power mask # calculated from 7th argument, the index.] # lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rp,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lmulx4xsp_alt sub %r11,%rbp # align with $aptr lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lmulx4xsp_done .Lmulx4xsp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lmulx4xsp_done: and \$-64,%rbp # ensure alignment mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmulx4x_page_walk jmp .Lmulx4x_page_walk_done .Lmulx4x_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lmulx4x_page_walk .Lmulx4x_page_walk_done: ############################################################## # Stack layout # +0 -num # +8 off-loaded &b[i] # +16 end of b[num] # +24 inner counter # +32 saved n0 # +40 saved %rsp # +48 # +56 saved rp # +64 tmp[num+1] # mov $n0, 32(%rsp) # save *n0 mov %rax,40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lmulx4x_body: call mulx4x_internal mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmulx4x_epilogue: ret .cfi_endproc .size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 .type mulx4x_internal,\@abi-omnipotent .align 32 mulx4x_internal: .cfi_startproc mov $num,8(%rsp) # save -$num (it was in bytes) mov $num,%r10 neg $num # restore $num shl \$5,$num neg %r10 # restore $num lea 128($bp,$num),%r13 # end of powers table (+size optimization) shr \$5+5,$num movd `($win64?56:8)`(%rax),%xmm5 # load 7th argument sub \$1,$num lea .Linc(%rip),%rax mov %r13,16+8(%rsp) # end of b[num] mov $num,24+8(%rsp) # inner counter mov $rp, 56+8(%rsp) # save $rp ___ my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)= ("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax"); my $rptr=$bptr; my $STRIDE=2**5*8; # 5 is "window size" my $N=$STRIDE/4; # should match cache line size $code.=<<___; movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 lea 88-112(%rsp,%r10),%r10 # place the mask after tp[num+1] (+ICache optimization) lea 128($bp),$bptr # size optimization pshufd \$0,%xmm5,%xmm5 # broadcast index movdqa %xmm1,%xmm4 .byte 0x67 movdqa %xmm1,%xmm2 ___ ######################################################################## # calculate mask by comparing 0..31 to index and save result to stack # $code.=<<___; .byte 0x67 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 # compare to 1,0 movdqa %xmm4,%xmm3 ___ for($i=0;$i<$STRIDE/16-4;$i+=4) { $code.=<<___; paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 # compare to 3,2 movdqa %xmm0,`16*($i+0)+112`(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 # compare to 5,4 movdqa %xmm1,`16*($i+1)+112`(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 # compare to 7,6 movdqa %xmm2,`16*($i+2)+112`(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,`16*($i+3)+112`(%r10) movdqa %xmm4,%xmm3 ___ } $code.=<<___; # last iteration can be optimized .byte 0x67 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,`16*($i+0)+112`(%r10) paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,`16*($i+1)+112`(%r10) pcmpeqd %xmm5,%xmm3 movdqa 
%xmm2,`16*($i+2)+112`(%r10) pand `16*($i+0)-128`($bptr),%xmm0 # while it's still in register pand `16*($i+1)-128`($bptr),%xmm1 pand `16*($i+2)-128`($bptr),%xmm2 movdqa %xmm3,`16*($i+3)+112`(%r10) pand `16*($i+3)-128`($bptr),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ for($i=0;$i<$STRIDE/16-4;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`($bptr),%xmm4 movdqa `16*($i+1)-128`($bptr),%xmm5 movdqa `16*($i+2)-128`($bptr),%xmm2 pand `16*($i+0)+112`(%r10),%xmm4 movdqa `16*($i+3)-128`($bptr),%xmm3 pand `16*($i+1)+112`(%r10),%xmm5 por %xmm4,%xmm0 pand `16*($i+2)+112`(%r10),%xmm2 por %xmm5,%xmm1 pand `16*($i+3)+112`(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 ___ } $code.=<<___; pxor %xmm1,%xmm0 pshufd \$0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 lea $STRIDE($bptr),$bptr movq %xmm0,%rdx # bp[0] lea 64+8*4+8(%rsp),$tptr mov %rdx,$bi mulx 0*8($aptr),$mi,%rax # a[0]*b[0] mulx 1*8($aptr),%r11,%r12 # a[1]*b[0] add %rax,%r11 mulx 2*8($aptr),%rax,%r13 # ... adc %rax,%r12 adc \$0,%r13 mulx 3*8($aptr),%rax,%r14 mov $mi,%r15 imulq 32+8(%rsp),$mi # "t[0]"*n0 xor $zero,$zero # cf=0, of=0 mov $mi,%rdx mov $bptr,8+8(%rsp) # off-load &b[i] lea 4*8($aptr),$aptr adcx %rax,%r13 adcx $zero,%r14 # cf=0 mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 mov 24+8(%rsp),$bptr # counter value mov %r10,-8*4($tptr) adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r11,-8*3($tptr) adcx %rax,%r12 adox $zero,%r15 # of=0 lea 4*8($nptr),$nptr mov %r12,-8*2($tptr) jmp .Lmulx4x_1st .align 32 .Lmulx4x_1st: adcx $zero,%r15 # cf=0, modulo-scheduled mulx 0*8($aptr),%r10,%rax # a[4]*b[0] adcx %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[0] adcx %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... adcx %r14,%r12 mulx 3*8($aptr),%r13,%r14 .byte 0x67,0x67 mov $mi,%rdx adcx %rax,%r13 adcx $zero,%r14 # cf=0 lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adox %r15,%r10 mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 mov %r11,-4*8($tptr) adox %r15,%r13 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 lea 4*8($nptr),$nptr mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_1st mov 8(%rsp),$num # load -num adc $zero,%r15 # modulo-scheduled lea ($aptr,$num),$aptr # rewind $aptr add %r15,%r14 mov 8+8(%rsp),$bptr # re-load &b[i] adc $zero,$zero # top-most carry mov %r14,-1*8($tptr) jmp .Lmulx4x_outer .align 32 .Lmulx4x_outer: lea 16-256($tptr),%r10 # where 256-byte mask is (+density control) pxor %xmm4,%xmm4 .byte 0x67,0x67 pxor %xmm5,%xmm5 ___ for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`($bptr),%xmm0 movdqa `16*($i+1)-128`($bptr),%xmm1 movdqa `16*($i+2)-128`($bptr),%xmm2 pand `16*($i+0)+256`(%r10),%xmm0 movdqa `16*($i+3)-128`($bptr),%xmm3 pand `16*($i+1)+256`(%r10),%xmm1 por %xmm0,%xmm4 pand `16*($i+2)+256`(%r10),%xmm2 por %xmm1,%xmm5 pand `16*($i+3)+256`(%r10),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 ___ } $code.=<<___; por %xmm5,%xmm4 pshufd \$0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 lea $STRIDE($bptr),$bptr movq %xmm0,%rdx # m0=bp[i] mov $zero,($tptr) # save top-most carry lea 4*8($tptr,$num),$tptr # rewind $tptr mulx 0*8($aptr),$mi,%r11 # a[0]*b[i] xor $zero,$zero # cf=0, of=0 mov %rdx,$bi mulx 1*8($aptr),%r14,%r12 # a[1]*b[i] adox -4*8($tptr),$mi # +t[0] adcx %r14,%r11 mulx 2*8($aptr),%r15,%r13 # ... 
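# The pcmpeqd/pand/por sequence above is a constant-time gather: every one
# of the 32 entries in the power table is read and ANDed with a mask that is
# all-ones only for the entry whose position equals the (secret) index, so
# the memory access pattern never depends on the index.  Functional Perl
# stand-in with each entry shown as a single scalar; Perl comparisons are
# not themselves constant-time, and the names are illustrative only.
sub gather_ref {
	my ($table, $idx) = @_;		# 32 entries, 5-bit secret index
	my $acc = 0;
	for my $i (0 .. 31) {
		my $mask = ($i == $idx) ? ~0 : 0;	# what pcmpeqd produces
		$acc |= $table->[$i] & $mask;		# pand + por
	}
	return $acc;
}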
adox -3*8($tptr),%r11 adcx %r15,%r12 mulx 3*8($aptr),%rdx,%r14 adox -2*8($tptr),%r12 adcx %rdx,%r13 lea ($nptr,$num),$nptr # rewind $nptr lea 4*8($aptr),$aptr adox -1*8($tptr),%r13 adcx $zero,%r14 adox $zero,%r14 mov $mi,%r15 imulq 32+8(%rsp),$mi # "t[0]"*n0 mov $mi,%rdx xor $zero,$zero # cf=0, of=0 mov $bptr,8+8(%rsp) # off-load &b[i] mulx 0*8($nptr),%rax,%r10 adcx %rax,%r15 # discarded adox %r11,%r10 mulx 1*8($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 mulx 2*8($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx mov 24+8(%rsp),$bptr # counter value mov %r10,-8*4($tptr) adcx %rax,%r12 mov %r11,-8*3($tptr) adox $zero,%r15 # of=0 mov %r12,-8*2($tptr) lea 4*8($nptr),$nptr jmp .Lmulx4x_inner .align 32 .Lmulx4x_inner: mulx 0*8($aptr),%r10,%rax # a[4]*b[i] adcx $zero,%r15 # cf=0, modulo-scheduled adox %r14,%r10 mulx 1*8($aptr),%r11,%r14 # a[5]*b[i] adcx 0*8($tptr),%r10 adox %rax,%r11 mulx 2*8($aptr),%r12,%rax # ... adcx 1*8($tptr),%r11 adox %r14,%r12 mulx 3*8($aptr),%r13,%r14 mov $mi,%rdx adcx 2*8($tptr),%r12 adox %rax,%r13 adcx 3*8($tptr),%r13 adox $zero,%r14 # of=0 lea 4*8($aptr),$aptr lea 4*8($tptr),$tptr adcx $zero,%r14 # cf=0 adox %r15,%r10 mulx 0*8($nptr),%rax,%r15 adcx %rax,%r10 adox %r15,%r11 mulx 1*8($nptr),%rax,%r15 adcx %rax,%r11 adox %r15,%r12 mulx 2*8($nptr),%rax,%r15 mov %r10,-5*8($tptr) adcx %rax,%r12 adox %r15,%r13 mov %r11,-4*8($tptr) mulx 3*8($nptr),%rax,%r15 mov $bi,%rdx lea 4*8($nptr),$nptr mov %r12,-3*8($tptr) adcx %rax,%r13 adox $zero,%r15 mov %r13,-2*8($tptr) dec $bptr # of=0, pass cf jnz .Lmulx4x_inner mov 0+8(%rsp),$num # load -num adc $zero,%r15 # modulo-scheduled sub 0*8($tptr),$bptr # pull top-most carry to %cf mov 8+8(%rsp),$bptr # re-load &b[i] mov 16+8(%rsp),%r10 adc %r15,%r14 lea ($aptr,$num),$aptr # rewind $aptr adc $zero,$zero # top-most carry mov %r14,-1*8($tptr) cmp %r10,$bptr jb .Lmulx4x_outer mov -8($nptr),%r10 mov $zero,%r8 mov ($nptr,$num),%r12 lea ($nptr,$num),%rbp # rewind $nptr mov $num,%rcx lea ($tptr,$num),%rdi # rewind $tptr xor %eax,%eax xor %r15,%r15 sub %r14,%r10 # compare top-most words adc %r15,%r15 or %r15,%r8 sar \$3+2,%rcx sub %r8,%rax # %rax=-%r8 mov 56+8(%rsp),%rdx # restore rp dec %r12 # so that after 'not' we get -n[0] mov 8*1(%rbp),%r13 xor %r8,%r8 mov 8*2(%rbp),%r14 mov 8*3(%rbp),%r15 jmp .Lsqrx4x_sub_entry # common post-condition .cfi_endproc .size mulx4x_internal,.-mulx4x_internal ___ } { ###################################################################### # void bn_power5( my $rptr="%rdi"; # BN_ULONG *rptr, my $aptr="%rsi"; # const BN_ULONG *aptr, my $bptr="%rdx"; # const void *table, my $nptr="%rcx"; # const BN_ULONG *nptr, my $n0 ="%r8"; # const BN_ULONG *n0); my $num ="%r9"; # int num, has to be divisible by 8 # int pwr); my ($i,$j,$tptr)=("%rbp","%rcx",$rptr); my @A0=("%r10","%r11"); my @A1=("%r12","%r13"); my ($a0,$a1,$ai)=("%r14","%r15","%rbx"); $code.=<<___; .type bn_powerx5,\@function,6 .align 32 bn_powerx5: .cfi_startproc mov %rsp,%rax .cfi_def_cfa_register %rax .Lpowerx5_enter: push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lpowerx5_prologue: shl \$3,${num}d # convert $num to bytes lea ($num,$num,2),%r10 # 3*$num in bytes neg $num mov ($n0),$n0 # *n0 ############################################################## # Ensure that stack frame doesn't alias with $rptr+3*$num # modulo 4096, which covers ret[num], am[num] and n[num] # (see bn_exp.c). 
This is done to allow memory disambiguation # logic do its magic. [Extra 256 bytes is for power mask # calculated from 7th argument, the index.] # lea -320(%rsp,$num,2),%r11 mov %rsp,%rbp sub $rptr,%r11 and \$4095,%r11 cmp %r11,%r10 jb .Lpwrx_sp_alt sub %r11,%rbp # align with $aptr lea -320(%rbp,$num,2),%rbp # future alloca(frame+2*$num*8+256) jmp .Lpwrx_sp_done .align 32 .Lpwrx_sp_alt: lea 4096-320(,$num,2),%r10 lea -320(%rbp,$num,2),%rbp # alloca(frame+2*$num*8+256) sub %r10,%r11 mov \$0,%r10 cmovc %r10,%r11 sub %r11,%rbp .Lpwrx_sp_done: and \$-64,%rbp mov %rsp,%r11 sub %rbp,%r11 and \$-4096,%r11 lea (%rbp,%r11),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lpwrx_page_walk jmp .Lpwrx_page_walk_done .Lpwrx_page_walk: lea -4096(%rsp),%rsp mov (%rsp),%r10 cmp %rbp,%rsp ja .Lpwrx_page_walk .Lpwrx_page_walk_done: mov $num,%r10 neg $num ############################################################## # Stack layout # # +0 saved $num, used in reduction section # +8 &t[2*$num], used in reduction section # +16 intermediate carry bit # +24 top-most carry bit, used in reduction section # +32 saved *n0 # +40 saved %rsp # +48 t[2*$num] # pxor %xmm0,%xmm0 movq $rptr,%xmm1 # save $rptr movq $nptr,%xmm2 # save $nptr movq %r10, %xmm3 # -$num movq $bptr,%xmm4 mov $n0, 32(%rsp) mov %rax, 40(%rsp) # save original %rsp .cfi_cfa_expression %rsp+40,deref,+8 .Lpowerx5_body: call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal call __bn_sqrx8x_internal call __bn_postx4x_internal mov %r10,$num # -num mov $aptr,$rptr movq %xmm2,$nptr movq %xmm4,$bptr mov 40(%rsp),%rax call mulx4x_internal mov 40(%rsp),%rsi # restore %rsp .cfi_def_cfa %rsi,8 mov \$1,%rax mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpowerx5_epilogue: ret .cfi_endproc .size bn_powerx5,.-bn_powerx5 .globl bn_sqrx8x_internal .hidden bn_sqrx8x_internal .type bn_sqrx8x_internal,\@abi-omnipotent .align 32 bn_sqrx8x_internal: __bn_sqrx8x_internal: .cfi_startproc ################################################################## # Squaring part: # # a) multiply-n-add everything but a[i]*a[i]; # b) shift result of a) by 1 to the left and accumulate # a[i]*a[i] products; # ################################################################## # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] # a[1]a[0] # a[2]a[0] # a[3]a[0] # a[2]a[1] # a[3]a[1] # a[3]a[2] # # a[4]a[0] # a[5]a[0] # a[6]a[0] # a[7]a[0] # a[4]a[1] # a[5]a[1] # a[6]a[1] # a[7]a[1] # a[4]a[2] # a[5]a[2] # a[6]a[2] # a[7]a[2] # a[4]a[3] # a[5]a[3] # a[6]a[3] # a[7]a[3] # # a[5]a[4] # a[6]a[4] # a[7]a[4] # a[6]a[5] # a[7]a[5] # a[7]a[6] # a[7]a[7]a[6]a[6]a[5]a[5]a[4]a[4]a[3]a[3]a[2]a[2]a[1]a[1]a[0]a[0] ___ { my ($zero,$carry)=("%rbp","%rcx"); my $aaptr=$zero; $code.=<<___; lea 48+8(%rsp),$tptr lea ($aptr,$num),$aaptr mov $num,0+8(%rsp) # save $num mov $aaptr,8+8(%rsp) # save end of $aptr jmp .Lsqr8x_zero_start .align 32 .byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 .Lsqrx8x_zero: .byte 0x3e movdqa %xmm0,0*8($tptr) movdqa %xmm0,2*8($tptr) movdqa %xmm0,4*8($tptr) movdqa %xmm0,6*8($tptr) .Lsqr8x_zero_start: # aligned at 32 movdqa %xmm0,8*8($tptr) movdqa %xmm0,10*8($tptr) movdqa %xmm0,12*8($tptr) movdqa 
%xmm0,14*8($tptr) lea 16*8($tptr),$tptr sub \$64,$num jnz .Lsqrx8x_zero mov 0*8($aptr),%rdx # a[0], modulo-scheduled #xor %r9,%r9 # t[1], ex-$num, zero already xor %r10,%r10 xor %r11,%r11 xor %r12,%r12 xor %r13,%r13 xor %r14,%r14 xor %r15,%r15 lea 48+8(%rsp),$tptr xor $zero,$zero # cf=0, cf=0 jmp .Lsqrx8x_outer_loop .align 32 .Lsqrx8x_outer_loop: mulx 1*8($aptr),%r8,%rax # a[1]*a[0] adcx %r9,%r8 # a[1]*a[0]+=t[1] adox %rax,%r10 mulx 2*8($aptr),%r9,%rax # a[2]*a[0] adcx %r10,%r9 adox %rax,%r11 .byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 3*8($aptr),%r10,%rax # ... adcx %r11,%r10 adox %rax,%r12 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 # mulx 4*8($aptr),%r11,%rax adcx %r12,%r11 adox %rax,%r13 mulx 5*8($aptr),%r12,%rax adcx %r13,%r12 adox %rax,%r14 mulx 6*8($aptr),%r13,%rax adcx %r14,%r13 adox %r15,%rax mulx 7*8($aptr),%r14,%r15 mov 1*8($aptr),%rdx # a[1] adcx %rax,%r14 adox $zero,%r15 adc 8*8($tptr),%r15 mov %r8,1*8($tptr) # t[1] mov %r9,2*8($tptr) # t[2] sbb $carry,$carry # mov %cf,$carry xor $zero,$zero # cf=0, of=0 mulx 2*8($aptr),%r8,%rbx # a[2]*a[1] mulx 3*8($aptr),%r9,%rax # a[3]*a[1] adcx %r10,%r8 adox %rbx,%r9 mulx 4*8($aptr),%r10,%rbx # ... adcx %r11,%r9 adox %rax,%r10 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 # mulx 5*8($aptr),%r11,%rax adcx %r12,%r10 adox %rbx,%r11 .byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r12,%rbx adcx %r13,%r11 adox %r14,%r12 .byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r13,%r14 mov 2*8($aptr),%rdx # a[2] adcx %rax,%r12 adox %rbx,%r13 adcx %r15,%r13 adox $zero,%r14 # of=0 adcx $zero,%r14 # cf=0 mov %r8,3*8($tptr) # t[3] mov %r9,4*8($tptr) # t[4] mulx 3*8($aptr),%r8,%rbx # a[3]*a[2] mulx 4*8($aptr),%r9,%rax # a[4]*a[2] adcx %r10,%r8 adox %rbx,%r9 mulx 5*8($aptr),%r10,%rbx # ... adcx %r11,%r9 adox %rax,%r10 .byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 # mulx 6*8($aptr),%r11,%rax adcx %r12,%r10 adox %r13,%r11 .byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 7*8($aptr),%r12,%r13 .byte 0x3e mov 3*8($aptr),%rdx # a[3] adcx %rbx,%r11 adox %rax,%r12 adcx %r14,%r12 mov %r8,5*8($tptr) # t[5] mov %r9,6*8($tptr) # t[6] mulx 4*8($aptr),%r8,%rax # a[4]*a[3] adox $zero,%r13 # of=0 adcx $zero,%r13 # cf=0 mulx 5*8($aptr),%r9,%rbx # a[5]*a[3] adcx %r10,%r8 adox %rax,%r9 mulx 6*8($aptr),%r10,%rax # ... adcx %r11,%r9 adox %r12,%r10 mulx 7*8($aptr),%r11,%r12 mov 4*8($aptr),%rdx # a[4] mov 5*8($aptr),%r14 # a[5] adcx %rbx,%r10 adox %rax,%r11 mov 6*8($aptr),%r15 # a[6] adcx %r13,%r11 adox $zero,%r12 # of=0 adcx $zero,%r12 # cf=0 mov %r8,7*8($tptr) # t[7] mov %r9,8*8($tptr) # t[8] mulx %r14,%r9,%rax # a[5]*a[4] mov 7*8($aptr),%r8 # a[7] adcx %r10,%r9 mulx %r15,%r10,%rbx # a[6]*a[4] adox %rax,%r10 adcx %r11,%r10 mulx %r8,%r11,%rax # a[7]*a[4] mov %r14,%rdx # a[5] adox %rbx,%r11 adcx %r12,%r11 #adox $zero,%rax # of=0 adcx $zero,%rax # cf=0 mulx %r15,%r14,%rbx # a[6]*a[5] mulx %r8,%r12,%r13 # a[7]*a[5] mov %r15,%rdx # a[6] lea 8*8($aptr),$aptr adcx %r14,%r11 adox %rbx,%r12 adcx %rax,%r12 adox $zero,%r13 .byte 0x67,0x67 mulx %r8,%r8,%r14 # a[7]*a[6] adcx %r8,%r13 adcx $zero,%r14 cmp 8+8(%rsp),$aptr je .Lsqrx8x_outer_break neg $carry # mov $carry,%cf mov \$-8,%rcx mov $zero,%r15 mov 8*8($tptr),%r8 adcx 9*8($tptr),%r9 # +=t[9] adcx 10*8($tptr),%r10 # ... 
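# Scalar reference for the squaring strategy described above: accumulate
# each cross product a[i]*a[j] (i<j) exactly once, double the whole triangle
# by a 1-bit left shift, then fold in the diagonal squares a[i]*a[i].
# Sketch only -- names and limb width are illustrative; keep $w small
# (e.g. 16) so native Perl integers suffice.
sub sqr_ref {
	my ($a, $w) = @_;		# limbs, least-significant first
	my $num  = scalar(@$a);
	my $mask = (1 << $w) - 1;
	my @t = (0) x (2*$num);
	for my $i (0 .. $num-1) {	# a) everything but a[i]*a[i]
		my $carry = 0;
		for my $j ($i+1 .. $num-1) {
			my $acc = $t[$i+$j] + $a->[$i] * $a->[$j] + $carry;
			$t[$i+$j] = $acc & $mask;
			$carry    = $acc >> $w;
		}
		$t[$i+$num] = $carry;
	}
	my $carry = 0;			# b) shift left by 1, add the diagonal
	for my $k (0 .. 2*$num-1) {
		my $sq  = $a->[$k >> 1] * $a->[$k >> 1];
		my $add = ($k & 1) ? ($sq >> $w) : ($sq & $mask);
		my $acc = ($t[$k] << 1) + $add + $carry;
		$t[$k]  = $acc & $mask;
		$carry  = $acc >> $w;
	}
	return \@t;			# 2*num limbs of a*a
}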
adcx 11*8($tptr),%r11 adc 12*8($tptr),%r12 adc 13*8($tptr),%r13 adc 14*8($tptr),%r14 adc 15*8($tptr),%r15 lea ($aptr),$aaptr lea 2*64($tptr),$tptr sbb %rax,%rax # mov %cf,$carry mov -64($aptr),%rdx # a[0] mov %rax,16+8(%rsp) # offload $carry mov $tptr,24+8(%rsp) #lea 8*8($tptr),$tptr # see 2*8*8($tptr) above xor %eax,%eax # cf=0, of=0 jmp .Lsqrx8x_loop .align 32 .Lsqrx8x_loop: mov %r8,%rbx mulx 0*8($aaptr),%rax,%r8 # a[8]*a[i] adcx %rax,%rbx # +=t[8] adox %r9,%r8 mulx 1*8($aaptr),%rax,%r9 # ... adcx %rax,%r8 adox %r10,%r9 mulx 2*8($aaptr),%rax,%r10 adcx %rax,%r9 adox %r11,%r10 mulx 3*8($aaptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 4*8($aaptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 mulx 5*8($aaptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 mulx 6*8($aaptr),%rax,%r14 mov %rbx,($tptr,%rcx,8) # store t[8+i] mov \$0,%ebx adcx %rax,%r13 adox %r15,%r14 .byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 # mulx 7*8($aaptr),%rax,%r15 mov 8($aptr,%rcx,8),%rdx # a[i] adcx %rax,%r14 adox %rbx,%r15 # %rbx is 0, of=0 adcx %rbx,%r15 # cf=0 .byte 0x67 inc %rcx # of=0 jnz .Lsqrx8x_loop lea 8*8($aaptr),$aaptr mov \$-8,%rcx cmp 8+8(%rsp),$aaptr # done? je .Lsqrx8x_break sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf .byte 0x66 mov -64($aptr),%rdx adcx 0*8($tptr),%r8 adcx 1*8($tptr),%r9 adc 2*8($tptr),%r10 adc 3*8($tptr),%r11 adc 4*8($tptr),%r12 adc 5*8($tptr),%r13 adc 6*8($tptr),%r14 adc 7*8($tptr),%r15 lea 8*8($tptr),$tptr .byte 0x67 sbb %rax,%rax # mov %cf,%rax xor %ebx,%ebx # cf=0, of=0 mov %rax,16+8(%rsp) # offload carry jmp .Lsqrx8x_loop .align 32 .Lsqrx8x_break: xor $zero,$zero sub 16+8(%rsp),%rbx # mov 16(%rsp),%cf adcx $zero,%r8 mov 24+8(%rsp),$carry # initial $tptr, borrow $carry adcx $zero,%r9 mov 0*8($aptr),%rdx # a[8], modulo-scheduled adc \$0,%r10 mov %r8,0*8($tptr) adc \$0,%r11 adc \$0,%r12 adc \$0,%r13 adc \$0,%r14 adc \$0,%r15 cmp $carry,$tptr # cf=0, of=0 je .Lsqrx8x_outer_loop mov %r9,1*8($tptr) mov 1*8($carry),%r9 mov %r10,2*8($tptr) mov 2*8($carry),%r10 mov %r11,3*8($tptr) mov 3*8($carry),%r11 mov %r12,4*8($tptr) mov 4*8($carry),%r12 mov %r13,5*8($tptr) mov 5*8($carry),%r13 mov %r14,6*8($tptr) mov 6*8($carry),%r14 mov %r15,7*8($tptr) mov 7*8($carry),%r15 mov $carry,$tptr jmp .Lsqrx8x_outer_loop .align 32 .Lsqrx8x_outer_break: mov %r9,9*8($tptr) # t[9] movq %xmm3,%rcx # -$num mov %r10,10*8($tptr) # ... 
mov %r11,11*8($tptr) mov %r12,12*8($tptr) mov %r13,13*8($tptr) mov %r14,14*8($tptr) ___ } { my $i="%rcx"; $code.=<<___; lea 48+8(%rsp),$tptr mov ($aptr,$i),%rdx # a[0] mov 8($tptr),$A0[1] # t[1] xor $A0[0],$A0[0] # t[0], of=0, cf=0 mov 0+8(%rsp),$num # restore $num adox $A0[1],$A0[1] mov 16($tptr),$A1[0] # t[2] # prefetch mov 24($tptr),$A1[1] # t[3] # prefetch #jmp .Lsqrx4x_shift_n_add # happens to be aligned .align 32 .Lsqrx4x_shift_n_add: mulx %rdx,%rax,%rbx adox $A1[0],$A1[0] adcx $A0[0],%rax .byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 # mov 8($aptr,$i),%rdx # a[i+1] # prefetch .byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 # mov 32($tptr),$A0[0] # t[2*i+4] # prefetch adox $A1[1],$A1[1] adcx $A0[1],%rbx mov 40($tptr),$A0[1] # t[2*i+4+1] # prefetch mov %rax,0($tptr) mov %rbx,8($tptr) mulx %rdx,%rax,%rbx adox $A0[0],$A0[0] adcx $A1[0],%rax mov 16($aptr,$i),%rdx # a[i+2] # prefetch mov 48($tptr),$A1[0] # t[2*i+6] # prefetch adox $A0[1],$A0[1] adcx $A1[1],%rbx mov 56($tptr),$A1[1] # t[2*i+6+1] # prefetch mov %rax,16($tptr) mov %rbx,24($tptr) mulx %rdx,%rax,%rbx adox $A1[0],$A1[0] adcx $A0[0],%rax mov 24($aptr,$i),%rdx # a[i+3] # prefetch lea 32($i),$i mov 64($tptr),$A0[0] # t[2*i+8] # prefetch adox $A1[1],$A1[1] adcx $A0[1],%rbx mov 72($tptr),$A0[1] # t[2*i+8+1] # prefetch mov %rax,32($tptr) mov %rbx,40($tptr) mulx %rdx,%rax,%rbx adox $A0[0],$A0[0] adcx $A1[0],%rax jrcxz .Lsqrx4x_shift_n_add_break .byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 # mov 0($aptr,$i),%rdx # a[i+4] # prefetch adox $A0[1],$A0[1] adcx $A1[1],%rbx mov 80($tptr),$A1[0] # t[2*i+10] # prefetch mov 88($tptr),$A1[1] # t[2*i+10+1] # prefetch mov %rax,48($tptr) mov %rbx,56($tptr) lea 64($tptr),$tptr nop jmp .Lsqrx4x_shift_n_add .align 32 .Lsqrx4x_shift_n_add_break: adcx $A1[1],%rbx mov %rax,48($tptr) mov %rbx,56($tptr) lea 64($tptr),$tptr # end of t[] buffer ___ } ###################################################################### # Montgomery reduction part, "word-by-word" algorithm. # # This new path is inspired by multiple submissions from Intel, by # Shay Gueron, Vlad Krasnov, Erdinc Ozturk, James Guilford, # Vinodh Gopal... 
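# Both reduction paths consume n0 = -n[0]^-1 mod 2^w, supplied by the caller
# through the n0 argument of the entry points.  A minimal sketch of how such
# an inverse can be obtained by Newton/Hensel lifting (each step doubles the
# number of correct low bits); the helper is illustrative and not part of
# this module -- keep $w small so native Perl integers suffice.
sub neg_n0_ref {
	my ($n, $w) = @_;		# $n must be odd
	my $m = 1 << $w;
	my $x = $n % $m;		# odd n is its own inverse mod 8, i.e. 3 bits
	for (my $bits = 3; $bits < $w; $bits *= 2) {
		$x = ($x * (2 - $n * $x)) % $m;	# lifting step: correct bits double
	}
	return (-$x) % $m;		# -n^-1 mod 2^w
}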
{ my ($nptr,$carry,$m0)=("%rbp","%rsi","%rdx"); $code.=<<___; movq %xmm2,$nptr __bn_sqrx8x_reduction: xor %eax,%eax # initial top-most carry bit mov 32+8(%rsp),%rbx # n0 mov 48+8(%rsp),%rdx # "%r8", 8*0($tptr) lea -8*8($nptr,$num),%rcx # end of n[] #lea 48+8(%rsp,$num,2),$tptr # end of t[] buffer mov %rcx, 0+8(%rsp) # save end of n[] mov $tptr,8+8(%rsp) # save end of t[] lea 48+8(%rsp),$tptr # initial t[] window jmp .Lsqrx8x_reduction_loop .align 32 .Lsqrx8x_reduction_loop: mov 8*1($tptr),%r9 mov 8*2($tptr),%r10 mov 8*3($tptr),%r11 mov 8*4($tptr),%r12 mov %rdx,%r8 imulq %rbx,%rdx # n0*a[i] mov 8*5($tptr),%r13 mov 8*6($tptr),%r14 mov 8*7($tptr),%r15 mov %rax,24+8(%rsp) # store top-most carry bit lea 8*8($tptr),$tptr xor $carry,$carry # cf=0,of=0 mov \$-8,%rcx jmp .Lsqrx8x_reduce .align 32 .Lsqrx8x_reduce: mov %r8, %rbx mulx 8*0($nptr),%rax,%r8 # n[0] adcx %rbx,%rax # discarded adox %r9,%r8 mulx 8*1($nptr),%rbx,%r9 # n[1] adcx %rbx,%r8 adox %r10,%r9 mulx 8*2($nptr),%rbx,%r10 adcx %rbx,%r9 adox %r11,%r10 mulx 8*3($nptr),%rbx,%r11 adcx %rbx,%r10 adox %r12,%r11 .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rbx,%r12 mov %rdx,%rax mov %r8,%rdx adcx %rbx,%r11 adox %r13,%r12 mulx 32+8(%rsp),%rbx,%rdx # %rdx discarded mov %rax,%rdx mov %rax,64+48+8(%rsp,%rcx,8) # put aside n0*a[i] mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 mulx 8*7($nptr),%rax,%r15 mov %rbx,%rdx adcx %rax,%r14 adox $carry,%r15 # $carry is 0 adcx $carry,%r15 # cf=0 .byte 0x67,0x67,0x67 inc %rcx # of=0 jnz .Lsqrx8x_reduce mov $carry,%rax # xor %rax,%rax cmp 0+8(%rsp),$nptr # end of n[]? jae .Lsqrx8x_no_tail mov 48+8(%rsp),%rdx # pull n0*a[0] add 8*0($tptr),%r8 lea 8*8($nptr),$nptr mov \$-8,%rcx adcx 8*1($tptr),%r9 adcx 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 lea 8*8($tptr),$tptr sbb %rax,%rax # top carry xor $carry,$carry # of=0, cf=0 mov %rax,16+8(%rsp) jmp .Lsqrx8x_tail .align 32 .Lsqrx8x_tail: mov %r8,%rbx mulx 8*0($nptr),%rax,%r8 adcx %rax,%rbx adox %r9,%r8 mulx 8*1($nptr),%rax,%r9 adcx %rax,%r8 adox %r10,%r9 mulx 8*2($nptr),%rax,%r10 adcx %rax,%r9 adox %r11,%r10 mulx 8*3($nptr),%rax,%r11 adcx %rax,%r10 adox %r12,%r11 .byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 8*4($nptr),%rax,%r12 adcx %rax,%r11 adox %r13,%r12 mulx 8*5($nptr),%rax,%r13 adcx %rax,%r12 adox %r14,%r13 mulx 8*6($nptr),%rax,%r14 adcx %rax,%r13 adox %r15,%r14 mulx 8*7($nptr),%rax,%r15 mov 72+48+8(%rsp,%rcx,8),%rdx # pull n0*a[i] adcx %rax,%r14 adox $carry,%r15 mov %rbx,($tptr,%rcx,8) # save result mov %r8,%rbx adcx $carry,%r15 # cf=0 inc %rcx # of=0 jnz .Lsqrx8x_tail cmp 0+8(%rsp),$nptr # end of n[]? jae .Lsqrx8x_tail_done # break out of loop sub 16+8(%rsp),$carry # mov 16(%rsp),%cf mov 48+8(%rsp),%rdx # pull n0*a[0] lea 8*8($nptr),$nptr adc 8*0($tptr),%r8 adc 8*1($tptr),%r9 adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 lea 8*8($tptr),$tptr sbb %rax,%rax sub \$8,%rcx # mov \$-8,%rcx xor $carry,$carry # of=0, cf=0 mov %rax,16+8(%rsp) jmp .Lsqrx8x_tail .align 32 .Lsqrx8x_tail_done: xor %rax,%rax add 24+8(%rsp),%r8 # can this overflow? 
adc \$0,%r9 adc \$0,%r10 adc \$0,%r11 adc \$0,%r12 adc \$0,%r13 adc \$0,%r14 adc \$0,%r15 adc \$0,%rax sub 16+8(%rsp),$carry # mov 16(%rsp),%cf .Lsqrx8x_no_tail: # %cf is 0 if jumped here adc 8*0($tptr),%r8 movq %xmm3,%rcx adc 8*1($tptr),%r9 mov 8*7($nptr),$carry movq %xmm2,$nptr # restore $nptr adc 8*2($tptr),%r10 adc 8*3($tptr),%r11 adc 8*4($tptr),%r12 adc 8*5($tptr),%r13 adc 8*6($tptr),%r14 adc 8*7($tptr),%r15 adc \$0,%rax # top-most carry mov 32+8(%rsp),%rbx # n0 mov 8*8($tptr,%rcx),%rdx # modulo-scheduled "%r8" mov %r8,8*0($tptr) # store top 512 bits lea 8*8($tptr),%r8 # borrow %r8 mov %r9,8*1($tptr) mov %r10,8*2($tptr) mov %r11,8*3($tptr) mov %r12,8*4($tptr) mov %r13,8*5($tptr) mov %r14,8*6($tptr) mov %r15,8*7($tptr) lea 8*8($tptr,%rcx),$tptr # start of current t[] window cmp 8+8(%rsp),%r8 # end of t[]? jb .Lsqrx8x_reduction_loop ret .cfi_endproc .size bn_sqrx8x_internal,.-bn_sqrx8x_internal ___ } ############################################################## # Post-condition, 4x unrolled # { my ($rptr,$nptr)=("%rdx","%rbp"); $code.=<<___; .align 32 __bn_postx4x_internal: .cfi_startproc mov 8*0($nptr),%r12 mov %rcx,%r10 # -$num mov %rcx,%r9 # -$num neg %rax sar \$3+2,%rcx #lea 48+8(%rsp,%r9),$tptr movq %xmm1,$rptr # restore $rptr movq %xmm1,$aptr # prepare for back-to-back call dec %r12 # so that after 'not' we get -n[0] mov 8*1($nptr),%r13 xor %r8,%r8 mov 8*2($nptr),%r14 mov 8*3($nptr),%r15 jmp .Lsqrx4x_sub_entry .align 16 .Lsqrx4x_sub: mov 8*0($nptr),%r12 mov 8*1($nptr),%r13 mov 8*2($nptr),%r14 mov 8*3($nptr),%r15 .Lsqrx4x_sub_entry: andn %rax,%r12,%r12 lea 8*4($nptr),$nptr andn %rax,%r13,%r13 andn %rax,%r14,%r14 andn %rax,%r15,%r15 neg %r8 # mov %r8,%cf adc 8*0($tptr),%r12 adc 8*1($tptr),%r13 adc 8*2($tptr),%r14 adc 8*3($tptr),%r15 mov %r12,8*0($rptr) lea 8*4($tptr),$tptr mov %r13,8*1($rptr) sbb %r8,%r8 # mov %cf,%r8 mov %r14,8*2($rptr) mov %r15,8*3($rptr) lea 8*4($rptr),$rptr inc %rcx jnz .Lsqrx4x_sub neg %r9 # restore $num ret .cfi_endproc .size __bn_postx4x_internal,.-__bn_postx4x_internal ___ } }}} { my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%edx","%r8", "%r9d") : # Win64 order ("%rdi","%esi","%rdx","%ecx"); # Unix order my $out=$inp; my $STRIDE=2**5*8; my $N=$STRIDE/4; $code.=<<___; .globl bn_get_bits5 .type bn_get_bits5,\@abi-omnipotent .align 16 bn_get_bits5: .cfi_startproc lea 0($inp),%r10 lea 1($inp),%r11 mov $num,%ecx shr \$4,$num and \$15,%ecx lea -8(%ecx),%eax cmp \$11,%ecx cmova %r11,%r10 cmova %eax,%ecx movzw (%r10,$num,2),%eax shrl %cl,%eax and \$31,%eax ret .cfi_endproc .size bn_get_bits5,.-bn_get_bits5 .globl bn_scatter5 .type bn_scatter5,\@abi-omnipotent .align 16 bn_scatter5: .cfi_startproc cmp \$0, $num jz .Lscatter_epilogue lea ($tbl,$idx,8),$tbl .Lscatter: mov ($inp),%rax lea 8($inp),$inp mov %rax,($tbl) lea 32*8($tbl),$tbl sub \$1,$num jnz .Lscatter .Lscatter_epilogue: ret .cfi_endproc .size bn_scatter5,.-bn_scatter5 .globl bn_gather5 .type bn_gather5,\@abi-omnipotent .align 32 bn_gather5: .LSEH_begin_bn_gather5: # Win64 thing, but harmless in other cases .cfi_startproc # I can't trust assembler to use specific encoding:-( .byte 0x4c,0x8d,0x14,0x24 #lea (%rsp),%r10 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 #sub $0x108,%rsp lea .Linc(%rip),%rax and \$-16,%rsp # shouldn't be formally required movd $idx,%xmm5 movdqa 0(%rax),%xmm0 # 00000001000000010000000000000000 movdqa 16(%rax),%xmm1 # 00000002000000020000000200000002 lea 128($tbl),%r11 # size optimization lea 128(%rsp),%rax # size optimization pshufd \$0,%xmm5,%xmm5 # broadcast $idx movdqa %xmm1,%xmm4 movdqa 
%xmm1,%xmm2 ___ ######################################################################## # calculate mask by comparing 0..31 to $idx and save result to stack # for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 # compare to 1,0 ___ $code.=<<___ if ($i); movdqa %xmm3,`16*($i-1)-128`(%rax) ___ $code.=<<___; movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 # compare to 3,2 movdqa %xmm0,`16*($i+0)-128`(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 # compare to 5,4 movdqa %xmm1,`16*($i+1)-128`(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 # compare to 7,6 movdqa %xmm2,`16*($i+2)-128`(%rax) movdqa %xmm4,%xmm2 ___ } $code.=<<___; movdqa %xmm3,`16*($i-1)-128`(%rax) jmp .Lgather .align 32 .Lgather: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 ___ for($i=0;$i<$STRIDE/16;$i+=4) { $code.=<<___; movdqa `16*($i+0)-128`(%r11),%xmm0 movdqa `16*($i+1)-128`(%r11),%xmm1 movdqa `16*($i+2)-128`(%r11),%xmm2 pand `16*($i+0)-128`(%rax),%xmm0 movdqa `16*($i+3)-128`(%r11),%xmm3 pand `16*($i+1)-128`(%rax),%xmm1 por %xmm0,%xmm4 pand `16*($i+2)-128`(%rax),%xmm2 por %xmm1,%xmm5 pand `16*($i+3)-128`(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 ___ } $code.=<<___; por %xmm5,%xmm4 lea $STRIDE(%r11),%r11 pshufd \$0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 movq %xmm0,($out) # m0=bp[0] lea 8($out),$out sub \$1,$num jnz .Lgather lea (%r10),%rsp ret .LSEH_end_bn_gather5: .cfi_endproc .size bn_gather5,.-bn_gather5 ___ } $code.=<<___; .align 64 .Linc: .long 0,0, 1,1 .long 2,2, 2,2 .asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by " ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type mul_handler,\@abi-omnipotent .align 16 mul_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRipRsp mov 8(%r11),%r10d # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea .Lmul_epilogue(%rip),%r10 cmp %r10,%rbx ja .Lbody_40 mov 192($context),%r10 # pull $num mov 8(%rax,%r10,8),%rax # pull saved stack pointer jmp .Lcommon_pop_regs .Lbody_40: mov 40(%rax),%rax # pull saved stack pointer .Lcommon_pop_regs: mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 
40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size mul_handler,.-mul_handler .section .pdata .align 4 .rva .LSEH_begin_bn_mul_mont_gather5 .rva .LSEH_end_bn_mul_mont_gather5 .rva .LSEH_info_bn_mul_mont_gather5 .rva .LSEH_begin_bn_mul4x_mont_gather5 .rva .LSEH_end_bn_mul4x_mont_gather5 .rva .LSEH_info_bn_mul4x_mont_gather5 .rva .LSEH_begin_bn_power5 .rva .LSEH_end_bn_power5 .rva .LSEH_info_bn_power5 .rva .LSEH_begin_bn_from_mont8x .rva .LSEH_end_bn_from_mont8x .rva .LSEH_info_bn_from_mont8x ___ $code.=<<___ if ($addx); .rva .LSEH_begin_bn_mulx4x_mont_gather5 .rva .LSEH_end_bn_mulx4x_mont_gather5 .rva .LSEH_info_bn_mulx4x_mont_gather5 .rva .LSEH_begin_bn_powerx5 .rva .LSEH_end_bn_powerx5 .rva .LSEH_info_bn_powerx5 ___ $code.=<<___; .rva .LSEH_begin_bn_gather5 .rva .LSEH_end_bn_gather5 .rva .LSEH_info_bn_gather5 .section .xdata .align 8 .LSEH_info_bn_mul_mont_gather5: .byte 9,0,0,0 .rva mul_handler .rva .Lmul_body,.Lmul_body,.Lmul_epilogue # HandlerData[] .align 8 .LSEH_info_bn_mul4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler .rva .Lmul4x_prologue,.Lmul4x_body,.Lmul4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_power5: .byte 9,0,0,0 .rva mul_handler .rva .Lpower5_prologue,.Lpower5_body,.Lpower5_epilogue # HandlerData[] .align 8 .LSEH_info_bn_from_mont8x: .byte 9,0,0,0 .rva mul_handler .rva .Lfrom_prologue,.Lfrom_body,.Lfrom_epilogue # HandlerData[] ___ $code.=<<___ if ($addx); .align 8 .LSEH_info_bn_mulx4x_mont_gather5: .byte 9,0,0,0 .rva mul_handler .rva .Lmulx4x_prologue,.Lmulx4x_body,.Lmulx4x_epilogue # HandlerData[] .align 8 .LSEH_info_bn_powerx5: .byte 9,0,0,0 .rva mul_handler .rva .Lpowerx5_prologue,.Lpowerx5_body,.Lpowerx5_epilogue # HandlerData[] ___ $code.=<<___; .align 8 .LSEH_info_bn_gather5: .byte 0x01,0x0b,0x03,0x0a .byte 0x0b,0x01,0x21,0x00 # sub rsp,0x108 .byte 0x04,0xa3,0x00,0x00 # lea r10,(rsp) .align 8 ___ } $code =~ s/\`([^\`]*)\`/eval($1)/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/chacha/asm/chacha-x86.pl =================================================================== --- stable/12/crypto/openssl/crypto/chacha/asm/chacha-x86.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/chacha/asm/chacha-x86.pl (revision 364963) @@ -1,1155 +1,1155 @@ #! /usr/bin/env perl # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # January 2015 # # ChaCha20 for x86. # # Performance in cycles per byte out of large buffer. 
# # 1xIALU/gcc 4xSSSE3 # Pentium 17.5/+80% # PIII 14.2/+60% # P4 18.6/+84% # Core2 9.56/+89% 4.83 # Westmere 9.50/+45% 3.35 # Sandy Bridge 10.5/+47% 3.20 # Haswell 8.15/+50% 2.83 # Skylake 7.53/+22% 2.75 # Silvermont 17.4/+36% 8.35 # Goldmont 13.4/+40% 4.36 # Sledgehammer 10.2/+54% # Bulldozer 13.4/+50% 4.38(*) # # (*) Bulldozer actually executes 4xXOP code path that delivers 3.55; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; $output=pop; open STDOUT,">$output"; &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $xmm=$ymm=0; for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } $ymm=1 if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/ && ($gasver=$1)>=2.19); # first version supporting AVX $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && $1>=2.03); # first version supporting AVX $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" && `ml 2>&1` =~ /Version ([0-9]+)\./ && $1>=10); # first version supporting AVX $ymm=1 if ($xmm && !$ymm && - `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ && + `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ && $2>=3.0); # first version supporting AVX $a="eax"; ($b,$b_)=("ebx","ebp"); ($c,$c_)=("ecx","esi"); ($d,$d_)=("edx","edi"); sub QUARTERROUND { my ($ai,$bi,$ci,$di,$i)=@_; my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 if ($i==0) { my $j=4; ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); } elsif ($i==3) { my $j=0; ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); } elsif ($i==4) { my $j=4; ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); } elsif ($i==7) { my $j=0; ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); } #&add ($a,$b); # see elsewhere &xor ($d,$a); &mov (&DWP(4*$cp,"esp"),$c_) if ($ai>0 && $ai<3); &rol ($d,16); &mov (&DWP(4*$bp,"esp"),$b_) if ($i!=0); &add ($c,$d); &mov ($c_,&DWP(4*$cn,"esp")) if ($ai>0 && $ai<3); &xor ($b,$c); &mov ($d_,&DWP(4*$dn,"esp")) if ($di!=$dn); &rol ($b,12); &mov ($b_,&DWP(4*$bn,"esp")) if ($i<7); &mov ($b_,&DWP(128,"esp")) if ($i==7); # loop counter &add ($a,$b); &xor ($d,$a); &mov (&DWP(4*$ai,"esp"),$a); &rol ($d,8); &mov ($a,&DWP(4*$an,"esp")); &add ($c,$d); &mov (&DWP(4*$di,"esp"),$d) if ($di!=$dn); &mov ($d_,$d) if ($di==$dn); &xor ($b,$c); &add ($a,$b_) if ($i<7); # elsewhere &rol ($b,7); ($b,$b_)=($b_,$b); ($c,$c_)=($c_,$c); ($d,$d_)=($d_,$d); } &static_label("ssse3_shortcut"); &static_label("xop_shortcut"); &static_label("ssse3_data"); &static_label("pic_point"); &function_begin("ChaCha20_ctr32"); &xor ("eax","eax"); &cmp ("eax",&wparam(2)); # len==0? 
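# Reference quarter-round matching the scheduling above: the same
# add/xor/rotate ladder with rotations 16, 12, 8 and 7 on 32-bit words.
# Even rounds apply it to the columns (0,4,8,12)...(3,7,11,15), odd rounds
# to the diagonals (0,5,10,15)...(3,4,9,14), exactly the index pattern
# tabulated in QUARTERROUND.  Plain-Perl helpers for illustration only;
# assumes a 64-bit perl so the shifts do not overflow.
sub rotl32_ref { my ($v, $r) = @_; (($v << $r) | ($v >> (32 - $r))) & 0xffffffff }
sub quarter_round_ref {
	my ($a, $b, $c, $d) = @_;
	$a = ($a + $b) & 0xffffffff;	$d = rotl32_ref($d ^ $a, 16);
	$c = ($c + $d) & 0xffffffff;	$b = rotl32_ref($b ^ $c, 12);
	$a = ($a + $b) & 0xffffffff;	$d = rotl32_ref($d ^ $a,  8);
	$c = ($c + $d) & 0xffffffff;	$b = rotl32_ref($b ^ $c,  7);
	return ($a, $b, $c, $d);
}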
&je (&label("no_data")); if ($xmm) { &call (&label("pic_point")); &set_label("pic_point"); &blindpop("eax"); &picmeup("ebp","OPENSSL_ia32cap_P","eax",&label("pic_point")); &test (&DWP(0,"ebp"),1<<24); # test FXSR bit &jz (&label("x86")); &test (&DWP(4,"ebp"),1<<9); # test SSSE3 bit &jz (&label("x86")); &jmp (&label("ssse3_shortcut")); &set_label("x86"); } &mov ("esi",&wparam(3)); # key &mov ("edi",&wparam(4)); # counter and nonce &stack_push(33); &mov ("eax",&DWP(4*0,"esi")); # copy key &mov ("ebx",&DWP(4*1,"esi")); &mov ("ecx",&DWP(4*2,"esi")); &mov ("edx",&DWP(4*3,"esi")); &mov (&DWP(64+4*4,"esp"),"eax"); &mov (&DWP(64+4*5,"esp"),"ebx"); &mov (&DWP(64+4*6,"esp"),"ecx"); &mov (&DWP(64+4*7,"esp"),"edx"); &mov ("eax",&DWP(4*4,"esi")); &mov ("ebx",&DWP(4*5,"esi")); &mov ("ecx",&DWP(4*6,"esi")); &mov ("edx",&DWP(4*7,"esi")); &mov (&DWP(64+4*8,"esp"),"eax"); &mov (&DWP(64+4*9,"esp"),"ebx"); &mov (&DWP(64+4*10,"esp"),"ecx"); &mov (&DWP(64+4*11,"esp"),"edx"); &mov ("eax",&DWP(4*0,"edi")); # copy counter and nonce &mov ("ebx",&DWP(4*1,"edi")); &mov ("ecx",&DWP(4*2,"edi")); &mov ("edx",&DWP(4*3,"edi")); &sub ("eax",1); &mov (&DWP(64+4*12,"esp"),"eax"); &mov (&DWP(64+4*13,"esp"),"ebx"); &mov (&DWP(64+4*14,"esp"),"ecx"); &mov (&DWP(64+4*15,"esp"),"edx"); &jmp (&label("entry")); &set_label("outer_loop",16); &mov (&wparam(1),$b); # save input &mov (&wparam(0),$a); # save output &mov (&wparam(2),$c); # save len &set_label("entry"); &mov ($a,0x61707865); &mov (&DWP(4*1,"esp"),0x3320646e); &mov (&DWP(4*2,"esp"),0x79622d32); &mov (&DWP(4*3,"esp"),0x6b206574); &mov ($b, &DWP(64+4*5,"esp")); # copy key material &mov ($b_,&DWP(64+4*6,"esp")); &mov ($c, &DWP(64+4*10,"esp")); &mov ($c_,&DWP(64+4*11,"esp")); &mov ($d, &DWP(64+4*13,"esp")); &mov ($d_,&DWP(64+4*14,"esp")); &mov (&DWP(4*5,"esp"),$b); &mov (&DWP(4*6,"esp"),$b_); &mov (&DWP(4*10,"esp"),$c); &mov (&DWP(4*11,"esp"),$c_); &mov (&DWP(4*13,"esp"),$d); &mov (&DWP(4*14,"esp"),$d_); &mov ($b, &DWP(64+4*7,"esp")); &mov ($d_,&DWP(64+4*15,"esp")); &mov ($d, &DWP(64+4*12,"esp")); &mov ($b_,&DWP(64+4*4,"esp")); &mov ($c, &DWP(64+4*8,"esp")); &mov ($c_,&DWP(64+4*9,"esp")); &add ($d,1); # counter value &mov (&DWP(4*7,"esp"),$b); &mov (&DWP(4*15,"esp"),$d_); &mov (&DWP(64+4*12,"esp"),$d); # save counter value &mov ($b,10); # loop counter &jmp (&label("loop")); &set_label("loop",16); &add ($a,$b_); # elsewhere &mov (&DWP(128,"esp"),$b); # save loop counter &mov ($b,$b_); &QUARTERROUND(0, 4, 8, 12, 0); &QUARTERROUND(1, 5, 9, 13, 1); &QUARTERROUND(2, 6,10, 14, 2); &QUARTERROUND(3, 7,11, 15, 3); &QUARTERROUND(0, 5,10, 15, 4); &QUARTERROUND(1, 6,11, 12, 5); &QUARTERROUND(2, 7, 8, 13, 6); &QUARTERROUND(3, 4, 9, 14, 7); &dec ($b); &jnz (&label("loop")); &mov ($b,&wparam(2)); # load len &add ($a,0x61707865); # accumulate key material &add ($b_,&DWP(64+4*4,"esp")); &add ($c, &DWP(64+4*8,"esp")); &add ($c_,&DWP(64+4*9,"esp")); &cmp ($b,64); &jb (&label("tail")); &mov ($b,&wparam(1)); # load input pointer &add ($d, &DWP(64+4*12,"esp")); &add ($d_,&DWP(64+4*14,"esp")); &xor ($a, &DWP(4*0,$b)); # xor with input &xor ($b_,&DWP(4*4,$b)); &mov (&DWP(4*0,"esp"),$a); &mov ($a,&wparam(0)); # load output pointer &xor ($c, &DWP(4*8,$b)); &xor ($c_,&DWP(4*9,$b)); &xor ($d, &DWP(4*12,$b)); &xor ($d_,&DWP(4*14,$b)); &mov (&DWP(4*4,$a),$b_); # write output &mov (&DWP(4*8,$a),$c); &mov (&DWP(4*9,$a),$c_); &mov (&DWP(4*12,$a),$d); &mov (&DWP(4*14,$a),$d_); &mov ($b_,&DWP(4*1,"esp")); &mov ($c, &DWP(4*2,"esp")); &mov ($c_,&DWP(4*3,"esp")); &mov ($d, &DWP(4*5,"esp")); &mov 
($d_,&DWP(4*6,"esp")); &add ($b_,0x3320646e); # accumulate key material &add ($c, 0x79622d32); &add ($c_,0x6b206574); &add ($d, &DWP(64+4*5,"esp")); &add ($d_,&DWP(64+4*6,"esp")); &xor ($b_,&DWP(4*1,$b)); &xor ($c, &DWP(4*2,$b)); &xor ($c_,&DWP(4*3,$b)); &xor ($d, &DWP(4*5,$b)); &xor ($d_,&DWP(4*6,$b)); &mov (&DWP(4*1,$a),$b_); &mov (&DWP(4*2,$a),$c); &mov (&DWP(4*3,$a),$c_); &mov (&DWP(4*5,$a),$d); &mov (&DWP(4*6,$a),$d_); &mov ($b_,&DWP(4*7,"esp")); &mov ($c, &DWP(4*10,"esp")); &mov ($c_,&DWP(4*11,"esp")); &mov ($d, &DWP(4*13,"esp")); &mov ($d_,&DWP(4*15,"esp")); &add ($b_,&DWP(64+4*7,"esp")); &add ($c, &DWP(64+4*10,"esp")); &add ($c_,&DWP(64+4*11,"esp")); &add ($d, &DWP(64+4*13,"esp")); &add ($d_,&DWP(64+4*15,"esp")); &xor ($b_,&DWP(4*7,$b)); &xor ($c, &DWP(4*10,$b)); &xor ($c_,&DWP(4*11,$b)); &xor ($d, &DWP(4*13,$b)); &xor ($d_,&DWP(4*15,$b)); &lea ($b,&DWP(4*16,$b)); &mov (&DWP(4*7,$a),$b_); &mov ($b_,&DWP(4*0,"esp")); &mov (&DWP(4*10,$a),$c); &mov ($c,&wparam(2)); # len &mov (&DWP(4*11,$a),$c_); &mov (&DWP(4*13,$a),$d); &mov (&DWP(4*15,$a),$d_); &mov (&DWP(4*0,$a),$b_); &lea ($a,&DWP(4*16,$a)); &sub ($c,64); &jnz (&label("outer_loop")); &jmp (&label("done")); &set_label("tail"); &add ($d, &DWP(64+4*12,"esp")); &add ($d_,&DWP(64+4*14,"esp")); &mov (&DWP(4*0,"esp"),$a); &mov (&DWP(4*4,"esp"),$b_); &mov (&DWP(4*8,"esp"),$c); &mov (&DWP(4*9,"esp"),$c_); &mov (&DWP(4*12,"esp"),$d); &mov (&DWP(4*14,"esp"),$d_); &mov ($b_,&DWP(4*1,"esp")); &mov ($c, &DWP(4*2,"esp")); &mov ($c_,&DWP(4*3,"esp")); &mov ($d, &DWP(4*5,"esp")); &mov ($d_,&DWP(4*6,"esp")); &add ($b_,0x3320646e); # accumulate key material &add ($c, 0x79622d32); &add ($c_,0x6b206574); &add ($d, &DWP(64+4*5,"esp")); &add ($d_,&DWP(64+4*6,"esp")); &mov (&DWP(4*1,"esp"),$b_); &mov (&DWP(4*2,"esp"),$c); &mov (&DWP(4*3,"esp"),$c_); &mov (&DWP(4*5,"esp"),$d); &mov (&DWP(4*6,"esp"),$d_); &mov ($b_,&DWP(4*7,"esp")); &mov ($c, &DWP(4*10,"esp")); &mov ($c_,&DWP(4*11,"esp")); &mov ($d, &DWP(4*13,"esp")); &mov ($d_,&DWP(4*15,"esp")); &add ($b_,&DWP(64+4*7,"esp")); &add ($c, &DWP(64+4*10,"esp")); &add ($c_,&DWP(64+4*11,"esp")); &add ($d, &DWP(64+4*13,"esp")); &add ($d_,&DWP(64+4*15,"esp")); &mov (&DWP(4*7,"esp"),$b_); &mov ($b_,&wparam(1)); # load input &mov (&DWP(4*10,"esp"),$c); &mov ($c,&wparam(0)); # load output &mov (&DWP(4*11,"esp"),$c_); &xor ($c_,$c_); &mov (&DWP(4*13,"esp"),$d); &mov (&DWP(4*15,"esp"),$d_); &xor ("eax","eax"); &xor ("edx","edx"); &set_label("tail_loop"); &movb ("al",&BP(0,$c_,$b_)); &movb ("dl",&BP(0,"esp",$c_)); &lea ($c_,&DWP(1,$c_)); &xor ("al","dl"); &mov (&BP(-1,$c,$c_),"al"); &dec ($b); &jnz (&label("tail_loop")); &set_label("done"); &stack_pop(33); &set_label("no_data"); &function_end("ChaCha20_ctr32"); if ($xmm) { my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7)); my ($out,$inp,$len)=("edi","esi","ecx"); sub QUARTERROUND_SSSE3 { my ($ai,$bi,$ci,$di,$i)=@_; my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 if ($i==0) { my $j=4; ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); } elsif ($i==3) { my $j=0; ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); } elsif ($i==4) { my $j=4; ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); } elsif ($i==7) { my $j=0; ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); } #&paddd ($xa,$xb); # 
see elsewhere #&pxor ($xd,$xa); # see elsewhere &movdqa(&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3); &pshufb ($xd,&QWP(0,"eax")); # rot16 &movdqa(&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0); &paddd ($xc,$xd); &movdqa($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3); &pxor ($xb,$xc); &movdqa($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7); &movdqa ($xa_,$xb); # borrow as temporary &pslld ($xb,12); &psrld ($xa_,20); &por ($xb,$xa_); &movdqa($xa_,&QWP(16*$an-128,"ebx")); &paddd ($xa,$xb); &movdqa($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn); &pxor ($xd,$xa); &movdqa (&QWP(16*$ai-128,"ebx"),$xa); &pshufb ($xd,&QWP(16,"eax")); # rot8 &paddd ($xc,$xd); &movdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn); &movdqa ($xd_,$xd) if ($di==$dn); &pxor ($xb,$xc); &paddd ($xa_,$xb_) if ($i<7); # elsewhere &movdqa ($xa,$xb); # borrow as temporary &pslld ($xb,7); &psrld ($xa,25); &pxor ($xd_,$xa_) if ($i<7); # elsewhere &por ($xb,$xa); ($xa,$xa_)=($xa_,$xa); ($xb,$xb_)=($xb_,$xb); ($xc,$xc_)=($xc_,$xc); ($xd,$xd_)=($xd_,$xd); } &function_begin("ChaCha20_ssse3"); &set_label("ssse3_shortcut"); if ($ymm) { &test (&DWP(4,"ebp"),1<<11); # test XOP bit &jnz (&label("xop_shortcut")); } &mov ($out,&wparam(0)); &mov ($inp,&wparam(1)); &mov ($len,&wparam(2)); &mov ("edx",&wparam(3)); # key &mov ("ebx",&wparam(4)); # counter and nonce &mov ("ebp","esp"); &stack_push (131); &and ("esp",-64); &mov (&DWP(512,"esp"),"ebp"); &lea ("eax",&DWP(&label("ssse3_data")."-". &label("pic_point"),"eax")); &movdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce if (defined($gasver) && $gasver>=2.17) { # even though we encode # pshufb manually, we # handle only register # operands, while this # segment uses memory # operand... &cmp ($len,64*4); &jb (&label("1x")); &mov (&DWP(512+4,"esp"),"edx"); # offload pointers &mov (&DWP(512+8,"esp"),"ebx"); &sub ($len,64*4); # bias len &lea ("ebp",&DWP(256+128,"esp")); # size optimization &movdqu ("xmm7",&QWP(0,"edx")); # key &pshufd ("xmm0","xmm3",0x00); &pshufd ("xmm1","xmm3",0x55); &pshufd ("xmm2","xmm3",0xaa); &pshufd ("xmm3","xmm3",0xff); &paddd ("xmm0",&QWP(16*3,"eax")); # fix counters &pshufd ("xmm4","xmm7",0x00); &pshufd ("xmm5","xmm7",0x55); &psubd ("xmm0",&QWP(16*4,"eax")); &pshufd ("xmm6","xmm7",0xaa); &pshufd ("xmm7","xmm7",0xff); &movdqa (&QWP(16*12-128,"ebp"),"xmm0"); &movdqa (&QWP(16*13-128,"ebp"),"xmm1"); &movdqa (&QWP(16*14-128,"ebp"),"xmm2"); &movdqa (&QWP(16*15-128,"ebp"),"xmm3"); &movdqu ("xmm3",&QWP(16,"edx")); # key &movdqa (&QWP(16*4-128,"ebp"),"xmm4"); &movdqa (&QWP(16*5-128,"ebp"),"xmm5"); &movdqa (&QWP(16*6-128,"ebp"),"xmm6"); &movdqa (&QWP(16*7-128,"ebp"),"xmm7"); &movdqa ("xmm7",&QWP(16*2,"eax")); # sigma &lea ("ebx",&DWP(128,"esp")); # size optimization &pshufd ("xmm0","xmm3",0x00); &pshufd ("xmm1","xmm3",0x55); &pshufd ("xmm2","xmm3",0xaa); &pshufd ("xmm3","xmm3",0xff); &pshufd ("xmm4","xmm7",0x00); &pshufd ("xmm5","xmm7",0x55); &pshufd ("xmm6","xmm7",0xaa); &pshufd ("xmm7","xmm7",0xff); &movdqa (&QWP(16*8-128,"ebp"),"xmm0"); &movdqa (&QWP(16*9-128,"ebp"),"xmm1"); &movdqa (&QWP(16*10-128,"ebp"),"xmm2"); &movdqa (&QWP(16*11-128,"ebp"),"xmm3"); &movdqa (&QWP(16*0-128,"ebp"),"xmm4"); &movdqa (&QWP(16*1-128,"ebp"),"xmm5"); &movdqa (&QWP(16*2-128,"ebp"),"xmm6"); &movdqa (&QWP(16*3-128,"ebp"),"xmm7"); &lea ($inp,&DWP(128,$inp)); # size optimization &lea ($out,&DWP(128,$out)); # size optimization &jmp (&label("outer_loop")); &set_label("outer_loop",16); #&movdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material &movdqa ("xmm1",&QWP(16*1-128,"ebp")); &movdqa ("xmm2",&QWP(16*2-128,"ebp")); 
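# In the 4x SSSE3 path each xmm "row" stored at 16*k-128 off %ebp holds
# state word k for four independent 64-byte blocks, with the lanes carrying
# consecutive counter values, so one SIMD quarter-round advances all four
# blocks at once.  A sketch of that layout; the helper and its argument
# names are illustrative, not part of the module.
sub build_4x_state_ref {
	my ($key, $counter, $nonce) = @_;	# $key: 8 words, $nonce: 3 words
	my @sigma = (0x61707865, 0x3320646e, 0x79622d32, 0x6b206574);
	my @rows;
	for my $k (0 .. 15) {			# 16 rows of 4 lanes each
		my @lanes;
		for my $blk (0 .. 3) {
			push @lanes,
			      $k <  4 ? $sigma[$k]
			    : $k < 12 ? $key->[$k - 4]
			    : $k == 12 ? (($counter + $blk) & 0xffffffff)
			    :            $nonce->[$k - 13];
		}
		push @rows, \@lanes;
	}
	return \@rows;
}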
&movdqa ("xmm3",&QWP(16*3-128,"ebp")); #&movdqa ("xmm4",&QWP(16*4-128,"ebp")); &movdqa ("xmm5",&QWP(16*5-128,"ebp")); &movdqa ("xmm6",&QWP(16*6-128,"ebp")); &movdqa ("xmm7",&QWP(16*7-128,"ebp")); #&movdqa (&QWP(16*0-128,"ebx"),"xmm0"); &movdqa (&QWP(16*1-128,"ebx"),"xmm1"); &movdqa (&QWP(16*2-128,"ebx"),"xmm2"); &movdqa (&QWP(16*3-128,"ebx"),"xmm3"); #&movdqa (&QWP(16*4-128,"ebx"),"xmm4"); &movdqa (&QWP(16*5-128,"ebx"),"xmm5"); &movdqa (&QWP(16*6-128,"ebx"),"xmm6"); &movdqa (&QWP(16*7-128,"ebx"),"xmm7"); #&movdqa ("xmm0",&QWP(16*8-128,"ebp")); #&movdqa ("xmm1",&QWP(16*9-128,"ebp")); &movdqa ("xmm2",&QWP(16*10-128,"ebp")); &movdqa ("xmm3",&QWP(16*11-128,"ebp")); &movdqa ("xmm4",&QWP(16*12-128,"ebp")); &movdqa ("xmm5",&QWP(16*13-128,"ebp")); &movdqa ("xmm6",&QWP(16*14-128,"ebp")); &movdqa ("xmm7",&QWP(16*15-128,"ebp")); &paddd ("xmm4",&QWP(16*4,"eax")); # counter value #&movdqa (&QWP(16*8-128,"ebx"),"xmm0"); #&movdqa (&QWP(16*9-128,"ebx"),"xmm1"); &movdqa (&QWP(16*10-128,"ebx"),"xmm2"); &movdqa (&QWP(16*11-128,"ebx"),"xmm3"); &movdqa (&QWP(16*12-128,"ebx"),"xmm4"); &movdqa (&QWP(16*13-128,"ebx"),"xmm5"); &movdqa (&QWP(16*14-128,"ebx"),"xmm6"); &movdqa (&QWP(16*15-128,"ebx"),"xmm7"); &movdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value &movdqa ($xa, &QWP(16*0-128,"ebp")); &movdqa ($xd, "xmm4"); &movdqa ($xb_,&QWP(16*4-128,"ebp")); &movdqa ($xc, &QWP(16*8-128,"ebp")); &movdqa ($xc_,&QWP(16*9-128,"ebp")); &mov ("edx",10); # loop counter &nop (); &set_label("loop",16); &paddd ($xa,$xb_); # elsewhere &movdqa ($xb,$xb_); &pxor ($xd,$xa); # elsewhere &QUARTERROUND_SSSE3(0, 4, 8, 12, 0); &QUARTERROUND_SSSE3(1, 5, 9, 13, 1); &QUARTERROUND_SSSE3(2, 6,10, 14, 2); &QUARTERROUND_SSSE3(3, 7,11, 15, 3); &QUARTERROUND_SSSE3(0, 5,10, 15, 4); &QUARTERROUND_SSSE3(1, 6,11, 12, 5); &QUARTERROUND_SSSE3(2, 7, 8, 13, 6); &QUARTERROUND_SSSE3(3, 4, 9, 14, 7); &dec ("edx"); &jnz (&label("loop")); &movdqa (&QWP(16*4-128,"ebx"),$xb_); &movdqa (&QWP(16*8-128,"ebx"),$xc); &movdqa (&QWP(16*9-128,"ebx"),$xc_); &movdqa (&QWP(16*12-128,"ebx"),$xd); &movdqa (&QWP(16*14-128,"ebx"),$xd_); my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7)); #&movdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there &movdqa ($xa1,&QWP(16*1-128,"ebx")); &movdqa ($xa2,&QWP(16*2-128,"ebx")); &movdqa ($xa3,&QWP(16*3-128,"ebx")); for($i=0;$i<256;$i+=64) { &paddd ($xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material &paddd ($xa1,&QWP($i+16*1-128,"ebp")); &paddd ($xa2,&QWP($i+16*2-128,"ebp")); &paddd ($xa3,&QWP($i+16*3-128,"ebp")); &movdqa ($xt2,$xa0); # "de-interlace" data &punpckldq ($xa0,$xa1); &movdqa ($xt3,$xa2); &punpckldq ($xa2,$xa3); &punpckhdq ($xt2,$xa1); &punpckhdq ($xt3,$xa3); &movdqa ($xa1,$xa0); &punpcklqdq ($xa0,$xa2); # "a0" &movdqa ($xa3,$xt2); &punpcklqdq ($xt2,$xt3); # "a2" &punpckhqdq ($xa1,$xa2); # "a1" &punpckhqdq ($xa3,$xt3); # "a3" #($xa2,$xt2)=($xt2,$xa2); &movdqu ($xt0,&QWP(64*0-128,$inp)); # load input &movdqu ($xt1,&QWP(64*1-128,$inp)); &movdqu ($xa2,&QWP(64*2-128,$inp)); &movdqu ($xt3,&QWP(64*3-128,$inp)); &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp)); &pxor ($xt0,$xa0); &movdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192); &pxor ($xt1,$xa1); &movdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192); &pxor ($xt2,$xa2); &movdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192); &pxor ($xt3,$xa3); &movdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192); &movdqu (&QWP(64*0-128,$out),$xt0); # store output &movdqu (&QWP(64*1-128,$out),$xt1); &movdqu (&QWP(64*2-128,$out),$xt2); &movdqu (&QWP(64*3-128,$out),$xt3); &lea 
($out,&QWP($i<192?16:(64*4-16*3),$out)); } &sub ($len,64*4); &jnc (&label("outer_loop")); &add ($len,64*4); &jz (&label("done")); &mov ("ebx",&DWP(512+8,"esp")); # restore pointers &lea ($inp,&DWP(-128,$inp)); &mov ("edx",&DWP(512+4,"esp")); &lea ($out,&DWP(-128,$out)); &movd ("xmm2",&DWP(16*12-128,"ebp")); # counter value &movdqu ("xmm3",&QWP(0,"ebx")); &paddd ("xmm2",&QWP(16*6,"eax")); # +four &pand ("xmm3",&QWP(16*7,"eax")); &por ("xmm3","xmm2"); # counter value } { my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7)); sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round &paddd ($a,$b); &pxor ($d,$a); &pshufb ($d,$rot16); &paddd ($c,$d); &pxor ($b,$c); &movdqa ($t,$b); &psrld ($b,20); &pslld ($t,12); &por ($b,$t); &paddd ($a,$b); &pxor ($d,$a); &pshufb ($d,$rot24); &paddd ($c,$d); &pxor ($b,$c); &movdqa ($t,$b); &psrld ($b,25); &pslld ($t,7); &por ($b,$t); } &set_label("1x"); &movdqa ($a,&QWP(16*2,"eax")); # sigma &movdqu ($b,&QWP(0,"edx")); &movdqu ($c,&QWP(16,"edx")); #&movdqu ($d,&QWP(0,"ebx")); # already loaded &movdqa ($rot16,&QWP(0,"eax")); &movdqa ($rot24,&QWP(16,"eax")); &mov (&DWP(16*3,"esp"),"ebp"); &movdqa (&QWP(16*0,"esp"),$a); &movdqa (&QWP(16*1,"esp"),$b); &movdqa (&QWP(16*2,"esp"),$c); &movdqa (&QWP(16*3,"esp"),$d); &mov ("edx",10); &jmp (&label("loop1x")); &set_label("outer1x",16); &movdqa ($d,&QWP(16*5,"eax")); # one &movdqa ($a,&QWP(16*0,"esp")); &movdqa ($b,&QWP(16*1,"esp")); &movdqa ($c,&QWP(16*2,"esp")); &paddd ($d,&QWP(16*3,"esp")); &mov ("edx",10); &movdqa (&QWP(16*3,"esp"),$d); &jmp (&label("loop1x")); &set_label("loop1x",16); &SSSE3ROUND(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b00111001); &pshufd ($d,$d,0b10010011); &nop (); &SSSE3ROUND(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b10010011); &pshufd ($d,$d,0b00111001); &dec ("edx"); &jnz (&label("loop1x")); &paddd ($a,&QWP(16*0,"esp")); &paddd ($b,&QWP(16*1,"esp")); &paddd ($c,&QWP(16*2,"esp")); &paddd ($d,&QWP(16*3,"esp")); &cmp ($len,64); &jb (&label("tail")); &movdqu ($t,&QWP(16*0,$inp)); &movdqu ($t1,&QWP(16*1,$inp)); &pxor ($a,$t); # xor with input &movdqu ($t,&QWP(16*2,$inp)); &pxor ($b,$t1); &movdqu ($t1,&QWP(16*3,$inp)); &pxor ($c,$t); &pxor ($d,$t1); &lea ($inp,&DWP(16*4,$inp)); # inp+=64 &movdqu (&QWP(16*0,$out),$a); # write output &movdqu (&QWP(16*1,$out),$b); &movdqu (&QWP(16*2,$out),$c); &movdqu (&QWP(16*3,$out),$d); &lea ($out,&DWP(16*4,$out)); # inp+=64 &sub ($len,64); &jnz (&label("outer1x")); &jmp (&label("done")); &set_label("tail"); &movdqa (&QWP(16*0,"esp"),$a); &movdqa (&QWP(16*1,"esp"),$b); &movdqa (&QWP(16*2,"esp"),$c); &movdqa (&QWP(16*3,"esp"),$d); &xor ("eax","eax"); &xor ("edx","edx"); &xor ("ebp","ebp"); &set_label("tail_loop"); &movb ("al",&BP(0,"esp","ebp")); &movb ("dl",&BP(0,$inp,"ebp")); &lea ("ebp",&DWP(1,"ebp")); &xor ("al","dl"); &movb (&BP(-1,$out,"ebp"),"al"); &dec ($len); &jnz (&label("tail_loop")); } &set_label("done"); &mov ("esp",&DWP(512,"esp")); &function_end("ChaCha20_ssse3"); &align (64); &set_label("ssse3_data"); &data_byte(0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd); &data_byte(0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe); &data_word(0x61707865,0x3320646e,0x79622d32,0x6b206574); &data_word(0,1,2,3); &data_word(4,4,4,4); &data_word(1,0,0,0); &data_word(4,0,0,0); &data_word(0,-1,-1,-1); &align (64); } &asciz ("ChaCha20 for x86, CRYPTOGAMS by "); if ($ymm) { my ($xa,$xa_,$xb,$xb_,$xc,$xc_,$xd,$xd_)=map("xmm$_",(0..7)); my ($out,$inp,$len)=("edi","esi","ecx"); sub QUARTERROUND_XOP { my 
($ai,$bi,$ci,$di,$i)=@_; my ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+1)&3),($ai,$bi,$ci,$di)); # next my ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-1)&3),($ai,$bi,$ci,$di)); # previous # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 if ($i==0) { my $j=4; ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_-$j--)&3),($ap,$bp,$cp,$dp)); } elsif ($i==3) { my $j=0; ($an,$bn,$cn,$dn)=map(($_&~3)+(($_+$j++)&3),($an,$bn,$cn,$dn)); } elsif ($i==4) { my $j=4; ($ap,$bp,$cp,$dp)=map(($_&~3)+(($_+$j--)&3),($ap,$bp,$cp,$dp)); } elsif ($i==7) { my $j=0; ($an,$bn,$cn,$dn)=map(($_&~3)+(($_-$j++)&3),($an,$bn,$cn,$dn)); } #&vpaddd ($xa,$xa,$xb); # see elsewhere #&vpxor ($xd,$xd,$xa); # see elsewhere &vmovdqa (&QWP(16*$cp-128,"ebx"),$xc_) if ($ai>0 && $ai<3); &vprotd ($xd,$xd,16); &vmovdqa (&QWP(16*$bp-128,"ebx"),$xb_) if ($i!=0); &vpaddd ($xc,$xc,$xd); &vmovdqa ($xc_,&QWP(16*$cn-128,"ebx")) if ($ai>0 && $ai<3); &vpxor ($xb,$i!=0?$xb:$xb_,$xc); &vmovdqa ($xa_,&QWP(16*$an-128,"ebx")); &vprotd ($xb,$xb,12); &vmovdqa ($xb_,&QWP(16*$bn-128,"ebx")) if ($i<7); &vpaddd ($xa,$xa,$xb); &vmovdqa ($xd_,&QWP(16*$dn-128,"ebx")) if ($di!=$dn); &vpxor ($xd,$xd,$xa); &vpaddd ($xa_,$xa_,$xb_) if ($i<7); # elsewhere &vprotd ($xd,$xd,8); &vmovdqa (&QWP(16*$ai-128,"ebx"),$xa); &vpaddd ($xc,$xc,$xd); &vmovdqa (&QWP(16*$di-128,"ebx"),$xd) if ($di!=$dn); &vpxor ($xb,$xb,$xc); &vpxor ($xd_,$di==$dn?$xd:$xd_,$xa_) if ($i<7); # elsewhere &vprotd ($xb,$xb,7); ($xa,$xa_)=($xa_,$xa); ($xb,$xb_)=($xb_,$xb); ($xc,$xc_)=($xc_,$xc); ($xd,$xd_)=($xd_,$xd); } &function_begin("ChaCha20_xop"); &set_label("xop_shortcut"); &mov ($out,&wparam(0)); &mov ($inp,&wparam(1)); &mov ($len,&wparam(2)); &mov ("edx",&wparam(3)); # key &mov ("ebx",&wparam(4)); # counter and nonce &vzeroupper (); &mov ("ebp","esp"); &stack_push (131); &and ("esp",-64); &mov (&DWP(512,"esp"),"ebp"); &lea ("eax",&DWP(&label("ssse3_data")."-". 
&label("pic_point"),"eax")); &vmovdqu ("xmm3",&QWP(0,"ebx")); # counter and nonce &cmp ($len,64*4); &jb (&label("1x")); &mov (&DWP(512+4,"esp"),"edx"); # offload pointers &mov (&DWP(512+8,"esp"),"ebx"); &sub ($len,64*4); # bias len &lea ("ebp",&DWP(256+128,"esp")); # size optimization &vmovdqu ("xmm7",&QWP(0,"edx")); # key &vpshufd ("xmm0","xmm3",0x00); &vpshufd ("xmm1","xmm3",0x55); &vpshufd ("xmm2","xmm3",0xaa); &vpshufd ("xmm3","xmm3",0xff); &vpaddd ("xmm0","xmm0",&QWP(16*3,"eax")); # fix counters &vpshufd ("xmm4","xmm7",0x00); &vpshufd ("xmm5","xmm7",0x55); &vpsubd ("xmm0","xmm0",&QWP(16*4,"eax")); &vpshufd ("xmm6","xmm7",0xaa); &vpshufd ("xmm7","xmm7",0xff); &vmovdqa (&QWP(16*12-128,"ebp"),"xmm0"); &vmovdqa (&QWP(16*13-128,"ebp"),"xmm1"); &vmovdqa (&QWP(16*14-128,"ebp"),"xmm2"); &vmovdqa (&QWP(16*15-128,"ebp"),"xmm3"); &vmovdqu ("xmm3",&QWP(16,"edx")); # key &vmovdqa (&QWP(16*4-128,"ebp"),"xmm4"); &vmovdqa (&QWP(16*5-128,"ebp"),"xmm5"); &vmovdqa (&QWP(16*6-128,"ebp"),"xmm6"); &vmovdqa (&QWP(16*7-128,"ebp"),"xmm7"); &vmovdqa ("xmm7",&QWP(16*2,"eax")); # sigma &lea ("ebx",&DWP(128,"esp")); # size optimization &vpshufd ("xmm0","xmm3",0x00); &vpshufd ("xmm1","xmm3",0x55); &vpshufd ("xmm2","xmm3",0xaa); &vpshufd ("xmm3","xmm3",0xff); &vpshufd ("xmm4","xmm7",0x00); &vpshufd ("xmm5","xmm7",0x55); &vpshufd ("xmm6","xmm7",0xaa); &vpshufd ("xmm7","xmm7",0xff); &vmovdqa (&QWP(16*8-128,"ebp"),"xmm0"); &vmovdqa (&QWP(16*9-128,"ebp"),"xmm1"); &vmovdqa (&QWP(16*10-128,"ebp"),"xmm2"); &vmovdqa (&QWP(16*11-128,"ebp"),"xmm3"); &vmovdqa (&QWP(16*0-128,"ebp"),"xmm4"); &vmovdqa (&QWP(16*1-128,"ebp"),"xmm5"); &vmovdqa (&QWP(16*2-128,"ebp"),"xmm6"); &vmovdqa (&QWP(16*3-128,"ebp"),"xmm7"); &lea ($inp,&DWP(128,$inp)); # size optimization &lea ($out,&DWP(128,$out)); # size optimization &jmp (&label("outer_loop")); &set_label("outer_loop",32); #&vmovdqa ("xmm0",&QWP(16*0-128,"ebp")); # copy key material &vmovdqa ("xmm1",&QWP(16*1-128,"ebp")); &vmovdqa ("xmm2",&QWP(16*2-128,"ebp")); &vmovdqa ("xmm3",&QWP(16*3-128,"ebp")); #&vmovdqa ("xmm4",&QWP(16*4-128,"ebp")); &vmovdqa ("xmm5",&QWP(16*5-128,"ebp")); &vmovdqa ("xmm6",&QWP(16*6-128,"ebp")); &vmovdqa ("xmm7",&QWP(16*7-128,"ebp")); #&vmovdqa (&QWP(16*0-128,"ebx"),"xmm0"); &vmovdqa (&QWP(16*1-128,"ebx"),"xmm1"); &vmovdqa (&QWP(16*2-128,"ebx"),"xmm2"); &vmovdqa (&QWP(16*3-128,"ebx"),"xmm3"); #&vmovdqa (&QWP(16*4-128,"ebx"),"xmm4"); &vmovdqa (&QWP(16*5-128,"ebx"),"xmm5"); &vmovdqa (&QWP(16*6-128,"ebx"),"xmm6"); &vmovdqa (&QWP(16*7-128,"ebx"),"xmm7"); #&vmovdqa ("xmm0",&QWP(16*8-128,"ebp")); #&vmovdqa ("xmm1",&QWP(16*9-128,"ebp")); &vmovdqa ("xmm2",&QWP(16*10-128,"ebp")); &vmovdqa ("xmm3",&QWP(16*11-128,"ebp")); &vmovdqa ("xmm4",&QWP(16*12-128,"ebp")); &vmovdqa ("xmm5",&QWP(16*13-128,"ebp")); &vmovdqa ("xmm6",&QWP(16*14-128,"ebp")); &vmovdqa ("xmm7",&QWP(16*15-128,"ebp")); &vpaddd ("xmm4","xmm4",&QWP(16*4,"eax")); # counter value #&vmovdqa (&QWP(16*8-128,"ebx"),"xmm0"); #&vmovdqa (&QWP(16*9-128,"ebx"),"xmm1"); &vmovdqa (&QWP(16*10-128,"ebx"),"xmm2"); &vmovdqa (&QWP(16*11-128,"ebx"),"xmm3"); &vmovdqa (&QWP(16*12-128,"ebx"),"xmm4"); &vmovdqa (&QWP(16*13-128,"ebx"),"xmm5"); &vmovdqa (&QWP(16*14-128,"ebx"),"xmm6"); &vmovdqa (&QWP(16*15-128,"ebx"),"xmm7"); &vmovdqa (&QWP(16*12-128,"ebp"),"xmm4"); # save counter value &vmovdqa ($xa, &QWP(16*0-128,"ebp")); &vmovdqa ($xd, "xmm4"); &vmovdqa ($xb_,&QWP(16*4-128,"ebp")); &vmovdqa ($xc, &QWP(16*8-128,"ebp")); &vmovdqa ($xc_,&QWP(16*9-128,"ebp")); &mov ("edx",10); # loop counter &nop (); &set_label("loop",32); &vpaddd 
($xa,$xa,$xb_); # elsewhere &vpxor ($xd,$xd,$xa); # elsewhere &QUARTERROUND_XOP(0, 4, 8, 12, 0); &QUARTERROUND_XOP(1, 5, 9, 13, 1); &QUARTERROUND_XOP(2, 6,10, 14, 2); &QUARTERROUND_XOP(3, 7,11, 15, 3); &QUARTERROUND_XOP(0, 5,10, 15, 4); &QUARTERROUND_XOP(1, 6,11, 12, 5); &QUARTERROUND_XOP(2, 7, 8, 13, 6); &QUARTERROUND_XOP(3, 4, 9, 14, 7); &dec ("edx"); &jnz (&label("loop")); &vmovdqa (&QWP(16*4-128,"ebx"),$xb_); &vmovdqa (&QWP(16*8-128,"ebx"),$xc); &vmovdqa (&QWP(16*9-128,"ebx"),$xc_); &vmovdqa (&QWP(16*12-128,"ebx"),$xd); &vmovdqa (&QWP(16*14-128,"ebx"),$xd_); my ($xa0,$xa1,$xa2,$xa3,$xt0,$xt1,$xt2,$xt3)=map("xmm$_",(0..7)); #&vmovdqa ($xa0,&QWP(16*0-128,"ebx")); # it's there &vmovdqa ($xa1,&QWP(16*1-128,"ebx")); &vmovdqa ($xa2,&QWP(16*2-128,"ebx")); &vmovdqa ($xa3,&QWP(16*3-128,"ebx")); for($i=0;$i<256;$i+=64) { &vpaddd ($xa0,$xa0,&QWP($i+16*0-128,"ebp")); # accumulate key material &vpaddd ($xa1,$xa1,&QWP($i+16*1-128,"ebp")); &vpaddd ($xa2,$xa2,&QWP($i+16*2-128,"ebp")); &vpaddd ($xa3,$xa3,&QWP($i+16*3-128,"ebp")); &vpunpckldq ($xt2,$xa0,$xa1); # "de-interlace" data &vpunpckldq ($xt3,$xa2,$xa3); &vpunpckhdq ($xa0,$xa0,$xa1); &vpunpckhdq ($xa2,$xa2,$xa3); &vpunpcklqdq ($xa1,$xt2,$xt3); # "a0" &vpunpckhqdq ($xt2,$xt2,$xt3); # "a1" &vpunpcklqdq ($xt3,$xa0,$xa2); # "a2" &vpunpckhqdq ($xa3,$xa0,$xa2); # "a3" &vpxor ($xt0,$xa1,&QWP(64*0-128,$inp)); &vpxor ($xt1,$xt2,&QWP(64*1-128,$inp)); &vpxor ($xt2,$xt3,&QWP(64*2-128,$inp)); &vpxor ($xt3,$xa3,&QWP(64*3-128,$inp)); &lea ($inp,&QWP($i<192?16:(64*4-16*3),$inp)); &vmovdqa ($xa0,&QWP($i+16*4-128,"ebx")) if ($i<192); &vmovdqa ($xa1,&QWP($i+16*5-128,"ebx")) if ($i<192); &vmovdqa ($xa2,&QWP($i+16*6-128,"ebx")) if ($i<192); &vmovdqa ($xa3,&QWP($i+16*7-128,"ebx")) if ($i<192); &vmovdqu (&QWP(64*0-128,$out),$xt0); # store output &vmovdqu (&QWP(64*1-128,$out),$xt1); &vmovdqu (&QWP(64*2-128,$out),$xt2); &vmovdqu (&QWP(64*3-128,$out),$xt3); &lea ($out,&QWP($i<192?16:(64*4-16*3),$out)); } &sub ($len,64*4); &jnc (&label("outer_loop")); &add ($len,64*4); &jz (&label("done")); &mov ("ebx",&DWP(512+8,"esp")); # restore pointers &lea ($inp,&DWP(-128,$inp)); &mov ("edx",&DWP(512+4,"esp")); &lea ($out,&DWP(-128,$out)); &vmovd ("xmm2",&DWP(16*12-128,"ebp")); # counter value &vmovdqu ("xmm3",&QWP(0,"ebx")); &vpaddd ("xmm2","xmm2",&QWP(16*6,"eax"));# +four &vpand ("xmm3","xmm3",&QWP(16*7,"eax")); &vpor ("xmm3","xmm3","xmm2"); # counter value { my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("xmm$_",(0..7)); sub XOPROUND { &vpaddd ($a,$a,$b); &vpxor ($d,$d,$a); &vprotd ($d,$d,16); &vpaddd ($c,$c,$d); &vpxor ($b,$b,$c); &vprotd ($b,$b,12); &vpaddd ($a,$a,$b); &vpxor ($d,$d,$a); &vprotd ($d,$d,8); &vpaddd ($c,$c,$d); &vpxor ($b,$b,$c); &vprotd ($b,$b,7); } &set_label("1x"); &vmovdqa ($a,&QWP(16*2,"eax")); # sigma &vmovdqu ($b,&QWP(0,"edx")); &vmovdqu ($c,&QWP(16,"edx")); #&vmovdqu ($d,&QWP(0,"ebx")); # already loaded &vmovdqa ($rot16,&QWP(0,"eax")); &vmovdqa ($rot24,&QWP(16,"eax")); &mov (&DWP(16*3,"esp"),"ebp"); &vmovdqa (&QWP(16*0,"esp"),$a); &vmovdqa (&QWP(16*1,"esp"),$b); &vmovdqa (&QWP(16*2,"esp"),$c); &vmovdqa (&QWP(16*3,"esp"),$d); &mov ("edx",10); &jmp (&label("loop1x")); &set_label("outer1x",16); &vmovdqa ($d,&QWP(16*5,"eax")); # one &vmovdqa ($a,&QWP(16*0,"esp")); &vmovdqa ($b,&QWP(16*1,"esp")); &vmovdqa ($c,&QWP(16*2,"esp")); &vpaddd ($d,$d,&QWP(16*3,"esp")); &mov ("edx",10); &vmovdqa (&QWP(16*3,"esp"),$d); &jmp (&label("loop1x")); &set_label("loop1x",16); &XOPROUND(); &vpshufd ($c,$c,0b01001110); &vpshufd ($b,$b,0b00111001); &vpshufd ($d,$d,0b10010011); 
&XOPROUND(); &vpshufd ($c,$c,0b01001110); &vpshufd ($b,$b,0b10010011); &vpshufd ($d,$d,0b00111001); &dec ("edx"); &jnz (&label("loop1x")); &vpaddd ($a,$a,&QWP(16*0,"esp")); &vpaddd ($b,$b,&QWP(16*1,"esp")); &vpaddd ($c,$c,&QWP(16*2,"esp")); &vpaddd ($d,$d,&QWP(16*3,"esp")); &cmp ($len,64); &jb (&label("tail")); &vpxor ($a,$a,&QWP(16*0,$inp)); # xor with input &vpxor ($b,$b,&QWP(16*1,$inp)); &vpxor ($c,$c,&QWP(16*2,$inp)); &vpxor ($d,$d,&QWP(16*3,$inp)); &lea ($inp,&DWP(16*4,$inp)); # inp+=64 &vmovdqu (&QWP(16*0,$out),$a); # write output &vmovdqu (&QWP(16*1,$out),$b); &vmovdqu (&QWP(16*2,$out),$c); &vmovdqu (&QWP(16*3,$out),$d); &lea ($out,&DWP(16*4,$out)); # inp+=64 &sub ($len,64); &jnz (&label("outer1x")); &jmp (&label("done")); &set_label("tail"); &vmovdqa (&QWP(16*0,"esp"),$a); &vmovdqa (&QWP(16*1,"esp"),$b); &vmovdqa (&QWP(16*2,"esp"),$c); &vmovdqa (&QWP(16*3,"esp"),$d); &xor ("eax","eax"); &xor ("edx","edx"); &xor ("ebp","ebp"); &set_label("tail_loop"); &movb ("al",&BP(0,"esp","ebp")); &movb ("dl",&BP(0,$inp,"ebp")); &lea ("ebp",&DWP(1,"ebp")); &xor ("al","dl"); &movb (&BP(-1,$out,"ebp"),"al"); &dec ($len); &jnz (&label("tail_loop")); } &set_label("done"); &vzeroupper (); &mov ("esp",&DWP(512,"esp")); &function_end("ChaCha20_xop"); } &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/chacha/asm/chacha-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/chacha/asm/chacha-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/chacha/asm/chacha-x86_64.pl (revision 364963) @@ -1,4005 +1,4005 @@ #! /usr/bin/env perl # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # November 2014 # # ChaCha20 for x86_64. # # December 2016 # # Add AVX512F code path. # # December 2017 # # Add AVX512VL code path. # # Performance in cycles per byte out of large buffer. # # IALU/gcc 4.8(i) 1x/2xSSSE3(ii) 4xSSSE3 NxAVX(v) # # P4 9.48/+99% - - # Core2 7.83/+55% 7.90/5.76 4.35 # Westmere 7.19/+50% 5.60/4.50 3.00 # Sandy Bridge 8.31/+42% 5.45/4.00 2.72 # Ivy Bridge 6.71/+46% 5.40/? 2.41 # Haswell 5.92/+43% 5.20/3.45 2.42 1.23 # Skylake[-X] 5.87/+39% 4.70/3.22 2.31 1.19[0.80(vi)] # Silvermont 12.0/+33% 7.75/6.90 7.03(iii) # Knights L 11.7/- ? 
9.60(iii) 0.80 # Goldmont 10.6/+17% 5.10/3.52 3.28 # Sledgehammer 7.28/+52% - - # Bulldozer 9.66/+28% 9.85/5.35(iv) 3.06(iv) # Ryzen 5.96/+50% 5.19/3.00 2.40 2.09 # VIA Nano 10.5/+46% 6.72/6.88 6.05 # # (i) compared to older gcc 3.x one can observe >2x improvement on # most platforms; # (ii) 2xSSSE3 is code path optimized specifically for 128 bytes used # by chacha20_poly1305_tls_cipher, results are EVP-free; # (iii) this is not optimal result for Atom because of MSROM # limitations, SSE2 can do better, but gain is considered too # low to justify the [maintenance] effort; # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20 # and 4.85 for 128-byte inputs; # (v) 8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable; # (vi) even though Skylake-X can execute AVX512F code and deliver 0.57 # cpb in single thread, the corresponding capability is suppressed; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { $avx = ($1>=2.09) + ($1>=2.10) + ($1>=2.12); $avx += 1 if ($1==2.11 && $2>=8); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # input parameter block ($out,$inp,$len,$key,$counter)=("%rdi","%rsi","%rdx","%rcx","%r8"); $code.=<<___; .text .extern OPENSSL_ia32cap_P .align 64 .Lzero: .long 0,0,0,0 .Lone: .long 1,0,0,0 .Linc: .long 0,1,2,3 .Lfour: .long 4,4,4,4 .Lincy: .long 0,2,4,6,1,3,5,7 .Leight: .long 8,8,8,8,8,8,8,8 .Lrot16: .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd .Lrot24: .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe .Ltwoy: .long 2,0,0,0, 2,0,0,0 .align 64 .Lzeroz: .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 .Lfourz: .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 .Lincz: .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .Lsixteen: .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .Lsigma: .asciz "expand 32-byte k" .asciz "ChaCha20 for x86_64, CRYPTOGAMS by " ___ sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop; $arg = "\$$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } @x=("%eax","%ebx","%ecx","%edx",map("%r${_}d",(8..11)), "%nox","%nox","%nox","%nox",map("%r${_}d",(12..15))); @t=("%esi","%edi"); sub ROUND { # critical path is 24 cycles per round my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my ($xc,$xc_)=map("\"$_\"",@t); my @x=map("\"$_\"",@x); # Consider order 
in which variables are addressed by their # index: # # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 # # 'a', 'b' and 'd's are permanently allocated in registers, # @x[0..7,12..15], while 'c's are maintained in memory. If # you observe 'c' column, you'll notice that pair of 'c's is # invariant between rounds. This means that we have to reload # them once per round, in the middle. This is why you'll see # bunch of 'c' stores and loads in the middle, but none in # the beginning or end. # Normally instructions would be interleaved to favour in-order # execution. Generally out-of-order cores manage it gracefully, # but not this time for some reason. As in-order execution # cores are dying breed, old Atom is the only one around, # instructions are left uninterleaved. Besides, Atom is better # off executing 1xSSSE3 code anyway... ( "&add (@x[$a0],@x[$b0])", # Q1 "&xor (@x[$d0],@x[$a0])", "&rol (@x[$d0],16)", "&add (@x[$a1],@x[$b1])", # Q2 "&xor (@x[$d1],@x[$a1])", "&rol (@x[$d1],16)", "&add ($xc,@x[$d0])", "&xor (@x[$b0],$xc)", "&rol (@x[$b0],12)", "&add ($xc_,@x[$d1])", "&xor (@x[$b1],$xc_)", "&rol (@x[$b1],12)", "&add (@x[$a0],@x[$b0])", "&xor (@x[$d0],@x[$a0])", "&rol (@x[$d0],8)", "&add (@x[$a1],@x[$b1])", "&xor (@x[$d1],@x[$a1])", "&rol (@x[$d1],8)", "&add ($xc,@x[$d0])", "&xor (@x[$b0],$xc)", "&rol (@x[$b0],7)", "&add ($xc_,@x[$d1])", "&xor (@x[$b1],$xc_)", "&rol (@x[$b1],7)", "&mov (\"4*$c0(%rsp)\",$xc)", # reload pair of 'c's "&mov (\"4*$c1(%rsp)\",$xc_)", "&mov ($xc,\"4*$c2(%rsp)\")", "&mov ($xc_,\"4*$c3(%rsp)\")", "&add (@x[$a2],@x[$b2])", # Q3 "&xor (@x[$d2],@x[$a2])", "&rol (@x[$d2],16)", "&add (@x[$a3],@x[$b3])", # Q4 "&xor (@x[$d3],@x[$a3])", "&rol (@x[$d3],16)", "&add ($xc,@x[$d2])", "&xor (@x[$b2],$xc)", "&rol (@x[$b2],12)", "&add ($xc_,@x[$d3])", "&xor (@x[$b3],$xc_)", "&rol (@x[$b3],12)", "&add (@x[$a2],@x[$b2])", "&xor (@x[$d2],@x[$a2])", "&rol (@x[$d2],8)", "&add (@x[$a3],@x[$b3])", "&xor (@x[$d3],@x[$a3])", "&rol (@x[$d3],8)", "&add ($xc,@x[$d2])", "&xor (@x[$b2],$xc)", "&rol (@x[$b2],7)", "&add ($xc_,@x[$d3])", "&xor (@x[$b3],$xc_)", "&rol (@x[$b3],7)" ); } ######################################################################## # Generic code path that handles all lengths on pre-SSSE3 processors. 
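For readers following the ROUND generator above, here is a minimal stand-alone Perl sketch of the scalar quarter-round it unrolls four-wide (rotations 16/12/8/7) and of one double round using the same column/diagonal index sets as the &ROUND(0,4,8,12) and &ROUND(0,5,10,15) expansions. This is a reference model only, not code emitted by the module; the names rotl32, qround and double_round are illustrative and do not appear in the source.

#!/usr/bin/env perl
use strict;
use warnings;

# 32-bit left rotate, masked back to 32 bits
sub rotl32 {
	my ($v, $n) = @_;
	return (($v << $n) | ($v >> (32 - $n))) & 0xffffffff;
}

# one ChaCha20 quarter-round over state words $a,$b,$c,$d
sub qround {
	my ($st, $a, $b, $c, $d) = @_;
	$st->[$a] = ($st->[$a] + $st->[$b]) & 0xffffffff;
	$st->[$d] = rotl32($st->[$d] ^ $st->[$a], 16);
	$st->[$c] = ($st->[$c] + $st->[$d]) & 0xffffffff;
	$st->[$b] = rotl32($st->[$b] ^ $st->[$c], 12);
	$st->[$a] = ($st->[$a] + $st->[$b]) & 0xffffffff;
	$st->[$d] = rotl32($st->[$d] ^ $st->[$a], 8);
	$st->[$c] = ($st->[$c] + $st->[$d]) & 0xffffffff;
	$st->[$b] = rotl32($st->[$b] ^ $st->[$c], 7);
}

# column round followed by diagonal round, i.e. one double round;
# the index sets match the comment table in ROUND above
sub double_round {
	my $st = shift;
	qround($st, 0, 4,  8, 12); qround($st, 1, 5,  9, 13);
	qround($st, 2, 6, 10, 14); qround($st, 3, 7, 11, 15);
	qround($st, 0, 5, 10, 15); qround($st, 1, 6, 11, 12);
	qround($st, 2, 7,  8, 13); qround($st, 3, 4,  9, 14);
}

# demo: ten double rounds over a state with zero key/counter/nonce,
# mirroring the "mov $10,%ebp ... .Loop" sequence emitted below
my @st = (0x61707865, 0x3320646e, 0x79622d32, 0x6b206574, (0) x 12);
double_round(\@st) for 1 .. 10;

The emitted assembly interleaves two quarter-rounds at a time and spills the 'c' words to the stack mid-round, but the per-word arithmetic is exactly the sequence above.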
$code.=<<___; .globl ChaCha20_ctr32 .type ChaCha20_ctr32,\@function,5 .align 64 ChaCha20_ctr32: .cfi_startproc cmp \$0,$len je .Lno_data mov OPENSSL_ia32cap_P+4(%rip),%r10 ___ $code.=<<___ if ($avx>2); bt \$48,%r10 # check for AVX512F jc .LChaCha20_avx512 test %r10,%r10 # check for AVX512VL js .LChaCha20_avx512vl ___ $code.=<<___; test \$`1<<(41-32)`,%r10d jnz .LChaCha20_ssse3 push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$64+24,%rsp .cfi_adjust_cfa_offset 64+24 .Lctr32_body: #movdqa .Lsigma(%rip),%xmm0 movdqu ($key),%xmm1 movdqu 16($key),%xmm2 movdqu ($counter),%xmm3 movdqa .Lone(%rip),%xmm4 #movdqa %xmm0,4*0(%rsp) # key[0] movdqa %xmm1,4*4(%rsp) # key[1] movdqa %xmm2,4*8(%rsp) # key[2] movdqa %xmm3,4*12(%rsp) # key[3] mov $len,%rbp # reassign $len jmp .Loop_outer .align 32 .Loop_outer: mov \$0x61707865,@x[0] # 'expa' mov \$0x3320646e,@x[1] # 'nd 3' mov \$0x79622d32,@x[2] # '2-by' mov \$0x6b206574,@x[3] # 'te k' mov 4*4(%rsp),@x[4] mov 4*5(%rsp),@x[5] mov 4*6(%rsp),@x[6] mov 4*7(%rsp),@x[7] movd %xmm3,@x[12] mov 4*13(%rsp),@x[13] mov 4*14(%rsp),@x[14] mov 4*15(%rsp),@x[15] mov %rbp,64+0(%rsp) # save len mov \$10,%ebp mov $inp,64+8(%rsp) # save inp movq %xmm2,%rsi # "@x[8]" mov $out,64+16(%rsp) # save out mov %rsi,%rdi shr \$32,%rdi # "@x[9]" jmp .Loop .align 32 .Loop: ___ foreach (&ROUND (0, 4, 8,12)) { eval; } foreach (&ROUND (0, 5,10,15)) { eval; } &dec ("%ebp"); &jnz (".Loop"); $code.=<<___; mov @t[1],4*9(%rsp) # modulo-scheduled mov @t[0],4*8(%rsp) mov 64(%rsp),%rbp # load len movdqa %xmm2,%xmm1 mov 64+8(%rsp),$inp # load inp paddd %xmm4,%xmm3 # increment counter mov 64+16(%rsp),$out # load out add \$0x61707865,@x[0] # 'expa' add \$0x3320646e,@x[1] # 'nd 3' add \$0x79622d32,@x[2] # '2-by' add \$0x6b206574,@x[3] # 'te k' add 4*4(%rsp),@x[4] add 4*5(%rsp),@x[5] add 4*6(%rsp),@x[6] add 4*7(%rsp),@x[7] add 4*12(%rsp),@x[12] add 4*13(%rsp),@x[13] add 4*14(%rsp),@x[14] add 4*15(%rsp),@x[15] paddd 4*8(%rsp),%xmm1 cmp \$64,%rbp jb .Ltail xor 4*0($inp),@x[0] # xor with input xor 4*1($inp),@x[1] xor 4*2($inp),@x[2] xor 4*3($inp),@x[3] xor 4*4($inp),@x[4] xor 4*5($inp),@x[5] xor 4*6($inp),@x[6] xor 4*7($inp),@x[7] movdqu 4*8($inp),%xmm0 xor 4*12($inp),@x[12] xor 4*13($inp),@x[13] xor 4*14($inp),@x[14] xor 4*15($inp),@x[15] lea 4*16($inp),$inp # inp+=64 pxor %xmm1,%xmm0 movdqa %xmm2,4*8(%rsp) movd %xmm3,4*12(%rsp) mov @x[0],4*0($out) # write output mov @x[1],4*1($out) mov @x[2],4*2($out) mov @x[3],4*3($out) mov @x[4],4*4($out) mov @x[5],4*5($out) mov @x[6],4*6($out) mov @x[7],4*7($out) movdqu %xmm0,4*8($out) mov @x[12],4*12($out) mov @x[13],4*13($out) mov @x[14],4*14($out) mov @x[15],4*15($out) lea 4*16($out),$out # out+=64 sub \$64,%rbp jnz .Loop_outer jmp .Ldone .align 16 .Ltail: mov @x[0],4*0(%rsp) mov @x[1],4*1(%rsp) xor %rbx,%rbx mov @x[2],4*2(%rsp) mov @x[3],4*3(%rsp) mov @x[4],4*4(%rsp) mov @x[5],4*5(%rsp) mov @x[6],4*6(%rsp) mov @x[7],4*7(%rsp) movdqa %xmm1,4*8(%rsp) mov @x[12],4*12(%rsp) mov @x[13],4*13(%rsp) mov @x[14],4*14(%rsp) mov @x[15],4*15(%rsp) .Loop_tail: movzb ($inp,%rbx),%eax movzb (%rsp,%rbx),%edx lea 1(%rbx),%rbx xor %edx,%eax mov %al,-1($out,%rbx) dec %rbp jnz .Loop_tail .Ldone: lea 64+24+48(%rsp),%rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp 
.cfi_def_cfa_register %rsp .Lno_data: ret .cfi_endproc .size ChaCha20_ctr32,.-ChaCha20_ctr32 ___ ######################################################################## # SSSE3 code path that handles shorter lengths { my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(0..7)); sub SSSE3ROUND { # critical path is 20 "SIMD ticks" per round &paddd ($a,$b); &pxor ($d,$a); &pshufb ($d,$rot16); &paddd ($c,$d); &pxor ($b,$c); &movdqa ($t,$b); &psrld ($b,20); &pslld ($t,12); &por ($b,$t); &paddd ($a,$b); &pxor ($d,$a); &pshufb ($d,$rot24); &paddd ($c,$d); &pxor ($b,$c); &movdqa ($t,$b); &psrld ($b,25); &pslld ($t,7); &por ($b,$t); } my $xframe = $win64 ? 32+8 : 8; $code.=<<___; .type ChaCha20_ssse3,\@function,5 .align 32 ChaCha20_ssse3: .cfi_startproc .LChaCha20_ssse3: mov %rsp,%r9 # frame pointer .cfi_def_cfa_register %r9 ___ $code.=<<___ if ($avx); test \$`1<<(43-32)`,%r10d jnz .LChaCha20_4xop # XOP is fastest even if we use 1/4 ___ $code.=<<___; cmp \$128,$len # we might throw away some data, je .LChaCha20_128 ja .LChaCha20_4x # but overall it won't be slower .Ldo_sse3_after_all: sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); movaps %xmm6,-0x28(%r9) movaps %xmm7,-0x18(%r9) .Lssse3_body: ___ $code.=<<___; movdqa .Lsigma(%rip),$a movdqu ($key),$b movdqu 16($key),$c movdqu ($counter),$d movdqa .Lrot16(%rip),$rot16 movdqa .Lrot24(%rip),$rot24 movdqa $a,0x00(%rsp) movdqa $b,0x10(%rsp) movdqa $c,0x20(%rsp) movdqa $d,0x30(%rsp) mov \$10,$counter # reuse $counter jmp .Loop_ssse3 .align 32 .Loop_outer_ssse3: movdqa .Lone(%rip),$d movdqa 0x00(%rsp),$a movdqa 0x10(%rsp),$b movdqa 0x20(%rsp),$c paddd 0x30(%rsp),$d mov \$10,$counter movdqa $d,0x30(%rsp) jmp .Loop_ssse3 .align 32 .Loop_ssse3: ___ &SSSE3ROUND(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b00111001); &pshufd ($d,$d,0b10010011); &nop (); &SSSE3ROUND(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b10010011); &pshufd ($d,$d,0b00111001); &dec ($counter); &jnz (".Loop_ssse3"); $code.=<<___; paddd 0x00(%rsp),$a paddd 0x10(%rsp),$b paddd 0x20(%rsp),$c paddd 0x30(%rsp),$d cmp \$64,$len jb .Ltail_ssse3 movdqu 0x00($inp),$t movdqu 0x10($inp),$t1 pxor $t,$a # xor with input movdqu 0x20($inp),$t pxor $t1,$b movdqu 0x30($inp),$t1 lea 0x40($inp),$inp # inp+=64 pxor $t,$c pxor $t1,$d movdqu $a,0x00($out) # write output movdqu $b,0x10($out) movdqu $c,0x20($out) movdqu $d,0x30($out) lea 0x40($out),$out # out+=64 sub \$64,$len jnz .Loop_outer_ssse3 jmp .Ldone_ssse3 .align 16 .Ltail_ssse3: movdqa $a,0x00(%rsp) movdqa $b,0x10(%rsp) movdqa $c,0x20(%rsp) movdqa $d,0x30(%rsp) xor $counter,$counter .Loop_tail_ssse3: movzb ($inp,$counter),%eax movzb (%rsp,$counter),%ecx lea 1($counter),$counter xor %ecx,%eax mov %al,-1($out,$counter) dec $len jnz .Loop_tail_ssse3 .Ldone_ssse3: ___ $code.=<<___ if ($win64); movaps -0x28(%r9),%xmm6 movaps -0x18(%r9),%xmm7 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .Lssse3_epilogue: ret .cfi_endproc .size ChaCha20_ssse3,.-ChaCha20_ssse3 ___ } ######################################################################## # SSSE3 code path that handles 128-byte inputs { my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7)); my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1)); sub SSSE3ROUND_2x { &paddd ($a,$b); &pxor ($d,$a); &paddd ($a1,$b1); &pxor ($d1,$a1); &pshufb ($d,$rot16); &pshufb($d1,$rot16); &paddd ($c,$d); &paddd ($c1,$d1); &pxor ($b,$c); &pxor ($b1,$c1); &movdqa ($t,$b); &psrld ($b,20); &movdqa($t1,$b1); &pslld ($t,12); &psrld ($b1,20); &por ($b,$t); &pslld ($t1,12); &por ($b1,$t1); &paddd ($a,$b); &pxor 
($d,$a); &paddd ($a1,$b1); &pxor ($d1,$a1); &pshufb ($d,$rot24); &pshufb($d1,$rot24); &paddd ($c,$d); &paddd ($c1,$d1); &pxor ($b,$c); &pxor ($b1,$c1); &movdqa ($t,$b); &psrld ($b,25); &movdqa($t1,$b1); &pslld ($t,7); &psrld ($b1,25); &por ($b,$t); &pslld ($t1,7); &por ($b1,$t1); } my $xframe = $win64 ? 0x68 : 8; $code.=<<___; .type ChaCha20_128,\@function,5 .align 32 ChaCha20_128: .cfi_startproc .LChaCha20_128: mov %rsp,%r9 # frame pointer .cfi_def_cfa_register %r9 sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); movaps %xmm6,-0x68(%r9) movaps %xmm7,-0x58(%r9) movaps %xmm8,-0x48(%r9) movaps %xmm9,-0x38(%r9) movaps %xmm10,-0x28(%r9) movaps %xmm11,-0x18(%r9) .L128_body: ___ $code.=<<___; movdqa .Lsigma(%rip),$a movdqu ($key),$b movdqu 16($key),$c movdqu ($counter),$d movdqa .Lone(%rip),$d1 movdqa .Lrot16(%rip),$rot16 movdqa .Lrot24(%rip),$rot24 movdqa $a,$a1 movdqa $a,0x00(%rsp) movdqa $b,$b1 movdqa $b,0x10(%rsp) movdqa $c,$c1 movdqa $c,0x20(%rsp) paddd $d,$d1 movdqa $d,0x30(%rsp) mov \$10,$counter # reuse $counter jmp .Loop_128 .align 32 .Loop_128: ___ &SSSE3ROUND_2x(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b00111001); &pshufd ($d,$d,0b10010011); &pshufd ($c1,$c1,0b01001110); &pshufd ($b1,$b1,0b00111001); &pshufd ($d1,$d1,0b10010011); &SSSE3ROUND_2x(); &pshufd ($c,$c,0b01001110); &pshufd ($b,$b,0b10010011); &pshufd ($d,$d,0b00111001); &pshufd ($c1,$c1,0b01001110); &pshufd ($b1,$b1,0b10010011); &pshufd ($d1,$d1,0b00111001); &dec ($counter); &jnz (".Loop_128"); $code.=<<___; paddd 0x00(%rsp),$a paddd 0x10(%rsp),$b paddd 0x20(%rsp),$c paddd 0x30(%rsp),$d paddd .Lone(%rip),$d1 paddd 0x00(%rsp),$a1 paddd 0x10(%rsp),$b1 paddd 0x20(%rsp),$c1 paddd 0x30(%rsp),$d1 movdqu 0x00($inp),$t movdqu 0x10($inp),$t1 pxor $t,$a # xor with input movdqu 0x20($inp),$t pxor $t1,$b movdqu 0x30($inp),$t1 pxor $t,$c movdqu 0x40($inp),$t pxor $t1,$d movdqu 0x50($inp),$t1 pxor $t,$a1 movdqu 0x60($inp),$t pxor $t1,$b1 movdqu 0x70($inp),$t1 pxor $t,$c1 pxor $t1,$d1 movdqu $a,0x00($out) # write output movdqu $b,0x10($out) movdqu $c,0x20($out) movdqu $d,0x30($out) movdqu $a1,0x40($out) movdqu $b1,0x50($out) movdqu $c1,0x60($out) movdqu $d1,0x70($out) ___ $code.=<<___ if ($win64); movaps -0x68(%r9),%xmm6 movaps -0x58(%r9),%xmm7 movaps -0x48(%r9),%xmm8 movaps -0x38(%r9),%xmm9 movaps -0x28(%r9),%xmm10 movaps -0x18(%r9),%xmm11 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .L128_epilogue: ret .cfi_endproc .size ChaCha20_128,.-ChaCha20_128 ___ } ######################################################################## # SSSE3 code path that handles longer messages. { # assign variables to favor Atom front-end my ($xd0,$xd1,$xd2,$xd3, $xt0,$xt1,$xt2,$xt3, $xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3)=map("%xmm$_",(0..15)); my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); sub SSSE3_lane_ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); my @x=map("\"$_\"",@xx); # Consider order in which variables are addressed by their # index: # # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 # # 'a', 'b' and 'd's are permanently allocated in registers, # @x[0..7,12..15], while 'c's are maintained in memory. 
If # you observe 'c' column, you'll notice that pair of 'c's is # invariant between rounds. This means that we have to reload # them once per round, in the middle. This is why you'll see # bunch of 'c' stores and loads in the middle, but none in # the beginning or end. ( "&paddd (@x[$a0],@x[$b0])", # Q1 "&paddd (@x[$a1],@x[$b1])", # Q2 "&pxor (@x[$d0],@x[$a0])", "&pxor (@x[$d1],@x[$a1])", "&pshufb (@x[$d0],$t1)", "&pshufb (@x[$d1],$t1)", "&paddd ($xc,@x[$d0])", "&paddd ($xc_,@x[$d1])", "&pxor (@x[$b0],$xc)", "&pxor (@x[$b1],$xc_)", "&movdqa ($t0,@x[$b0])", "&pslld (@x[$b0],12)", "&psrld ($t0,20)", "&movdqa ($t1,@x[$b1])", "&pslld (@x[$b1],12)", "&por (@x[$b0],$t0)", "&psrld ($t1,20)", "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) "&por (@x[$b1],$t1)", "&paddd (@x[$a0],@x[$b0])", "&paddd (@x[$a1],@x[$b1])", "&pxor (@x[$d0],@x[$a0])", "&pxor (@x[$d1],@x[$a1])", "&pshufb (@x[$d0],$t0)", "&pshufb (@x[$d1],$t0)", "&paddd ($xc,@x[$d0])", "&paddd ($xc_,@x[$d1])", "&pxor (@x[$b0],$xc)", "&pxor (@x[$b1],$xc_)", "&movdqa ($t1,@x[$b0])", "&pslld (@x[$b0],7)", "&psrld ($t1,25)", "&movdqa ($t0,@x[$b1])", "&pslld (@x[$b1],7)", "&por (@x[$b0],$t1)", "&psrld ($t0,25)", "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) "&por (@x[$b1],$t0)", "&movdqa (\"`16*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's "&movdqa (\"`16*($c1-8)`(%rsp)\",$xc_)", "&movdqa ($xc,\"`16*($c2-8)`(%rsp)\")", "&movdqa ($xc_,\"`16*($c3-8)`(%rsp)\")", "&paddd (@x[$a2],@x[$b2])", # Q3 "&paddd (@x[$a3],@x[$b3])", # Q4 "&pxor (@x[$d2],@x[$a2])", "&pxor (@x[$d3],@x[$a3])", "&pshufb (@x[$d2],$t1)", "&pshufb (@x[$d3],$t1)", "&paddd ($xc,@x[$d2])", "&paddd ($xc_,@x[$d3])", "&pxor (@x[$b2],$xc)", "&pxor (@x[$b3],$xc_)", "&movdqa ($t0,@x[$b2])", "&pslld (@x[$b2],12)", "&psrld ($t0,20)", "&movdqa ($t1,@x[$b3])", "&pslld (@x[$b3],12)", "&por (@x[$b2],$t0)", "&psrld ($t1,20)", "&movdqa ($t0,'(%r11)')", # .Lrot24(%rip) "&por (@x[$b3],$t1)", "&paddd (@x[$a2],@x[$b2])", "&paddd (@x[$a3],@x[$b3])", "&pxor (@x[$d2],@x[$a2])", "&pxor (@x[$d3],@x[$a3])", "&pshufb (@x[$d2],$t0)", "&pshufb (@x[$d3],$t0)", "&paddd ($xc,@x[$d2])", "&paddd ($xc_,@x[$d3])", "&pxor (@x[$b2],$xc)", "&pxor (@x[$b3],$xc_)", "&movdqa ($t1,@x[$b2])", "&pslld (@x[$b2],7)", "&psrld ($t1,25)", "&movdqa ($t0,@x[$b3])", "&pslld (@x[$b3],7)", "&por (@x[$b2],$t1)", "&psrld ($t0,25)", "&movdqa ($t1,'(%r10)')", # .Lrot16(%rip) "&por (@x[$b3],$t0)" ); } my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; .type ChaCha20_4x,\@function,5 .align 32 ChaCha20_4x: .cfi_startproc .LChaCha20_4x: mov %rsp,%r9 # frame pointer .cfi_def_cfa_register %r9 mov %r10,%r11 ___ $code.=<<___ if ($avx>1); shr \$32,%r10 # OPENSSL_ia32cap_P+8 test \$`1<<5`,%r10 # test AVX2 jnz .LChaCha20_8x ___ $code.=<<___; cmp \$192,$len ja .Lproceed4x and \$`1<<26|1<<22`,%r11 # isolate XSAVE+MOVBE cmp \$`1<<22`,%r11 # check for MOVBE without XSAVE je .Ldo_sse3_after_all # to detect Atom .Lproceed4x: sub \$0x140+$xframe,%rsp ___ ################ stack layout # +0x00 SIMD equivalent of @x[8-12] # ... # +0x40 constant copy of key[0-2] smashed by lanes # ... # +0x100 SIMD counters (with nonce smashed by lanes) # ... 
# +0x140 $code.=<<___ if ($win64); movaps %xmm6,-0xa8(%r9) movaps %xmm7,-0x98(%r9) movaps %xmm8,-0x88(%r9) movaps %xmm9,-0x78(%r9) movaps %xmm10,-0x68(%r9) movaps %xmm11,-0x58(%r9) movaps %xmm12,-0x48(%r9) movaps %xmm13,-0x38(%r9) movaps %xmm14,-0x28(%r9) movaps %xmm15,-0x18(%r9) .L4x_body: ___ $code.=<<___; movdqa .Lsigma(%rip),$xa3 # key[0] movdqu ($key),$xb3 # key[1] movdqu 16($key),$xt3 # key[2] movdqu ($counter),$xd3 # key[3] lea 0x100(%rsp),%rcx # size optimization lea .Lrot16(%rip),%r10 lea .Lrot24(%rip),%r11 pshufd \$0x00,$xa3,$xa0 # smash key by lanes... pshufd \$0x55,$xa3,$xa1 movdqa $xa0,0x40(%rsp) # ... and offload pshufd \$0xaa,$xa3,$xa2 movdqa $xa1,0x50(%rsp) pshufd \$0xff,$xa3,$xa3 movdqa $xa2,0x60(%rsp) movdqa $xa3,0x70(%rsp) pshufd \$0x00,$xb3,$xb0 pshufd \$0x55,$xb3,$xb1 movdqa $xb0,0x80-0x100(%rcx) pshufd \$0xaa,$xb3,$xb2 movdqa $xb1,0x90-0x100(%rcx) pshufd \$0xff,$xb3,$xb3 movdqa $xb2,0xa0-0x100(%rcx) movdqa $xb3,0xb0-0x100(%rcx) pshufd \$0x00,$xt3,$xt0 # "$xc0" pshufd \$0x55,$xt3,$xt1 # "$xc1" movdqa $xt0,0xc0-0x100(%rcx) pshufd \$0xaa,$xt3,$xt2 # "$xc2" movdqa $xt1,0xd0-0x100(%rcx) pshufd \$0xff,$xt3,$xt3 # "$xc3" movdqa $xt2,0xe0-0x100(%rcx) movdqa $xt3,0xf0-0x100(%rcx) pshufd \$0x00,$xd3,$xd0 pshufd \$0x55,$xd3,$xd1 paddd .Linc(%rip),$xd0 # don't save counters yet pshufd \$0xaa,$xd3,$xd2 movdqa $xd1,0x110-0x100(%rcx) pshufd \$0xff,$xd3,$xd3 movdqa $xd2,0x120-0x100(%rcx) movdqa $xd3,0x130-0x100(%rcx) jmp .Loop_enter4x .align 32 .Loop_outer4x: movdqa 0x40(%rsp),$xa0 # re-load smashed key movdqa 0x50(%rsp),$xa1 movdqa 0x60(%rsp),$xa2 movdqa 0x70(%rsp),$xa3 movdqa 0x80-0x100(%rcx),$xb0 movdqa 0x90-0x100(%rcx),$xb1 movdqa 0xa0-0x100(%rcx),$xb2 movdqa 0xb0-0x100(%rcx),$xb3 movdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" movdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" movdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" movdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" movdqa 0x100-0x100(%rcx),$xd0 movdqa 0x110-0x100(%rcx),$xd1 movdqa 0x120-0x100(%rcx),$xd2 movdqa 0x130-0x100(%rcx),$xd3 paddd .Lfour(%rip),$xd0 # next SIMD counters .Loop_enter4x: movdqa $xt2,0x20(%rsp) # SIMD equivalent of "@x[10]" movdqa $xt3,0x30(%rsp) # SIMD equivalent of "@x[11]" movdqa (%r10),$xt3 # .Lrot16(%rip) mov \$10,%eax movdqa $xd0,0x100-0x100(%rcx) # save SIMD counters jmp .Loop4x .align 32 .Loop4x: ___ foreach (&SSSE3_lane_ROUND(0, 4, 8,12)) { eval; } foreach (&SSSE3_lane_ROUND(0, 5,10,15)) { eval; } $code.=<<___; dec %eax jnz .Loop4x paddd 0x40(%rsp),$xa0 # accumulate key material paddd 0x50(%rsp),$xa1 paddd 0x60(%rsp),$xa2 paddd 0x70(%rsp),$xa3 movdqa $xa0,$xt2 # "de-interlace" data punpckldq $xa1,$xa0 movdqa $xa2,$xt3 punpckldq $xa3,$xa2 punpckhdq $xa1,$xt2 punpckhdq $xa3,$xt3 movdqa $xa0,$xa1 punpcklqdq $xa2,$xa0 # "a0" movdqa $xt2,$xa3 punpcklqdq $xt3,$xt2 # "a2" punpckhqdq $xa2,$xa1 # "a1" punpckhqdq $xt3,$xa3 # "a3" ___ ($xa2,$xt2)=($xt2,$xa2); $code.=<<___; paddd 0x80-0x100(%rcx),$xb0 paddd 0x90-0x100(%rcx),$xb1 paddd 0xa0-0x100(%rcx),$xb2 paddd 0xb0-0x100(%rcx),$xb3 movdqa $xa0,0x00(%rsp) # offload $xaN movdqa $xa1,0x10(%rsp) movdqa 0x20(%rsp),$xa0 # "xc2" movdqa 0x30(%rsp),$xa1 # "xc3" movdqa $xb0,$xt2 punpckldq $xb1,$xb0 movdqa $xb2,$xt3 punpckldq $xb3,$xb2 punpckhdq $xb1,$xt2 punpckhdq $xb3,$xt3 movdqa $xb0,$xb1 punpcklqdq $xb2,$xb0 # "b0" movdqa $xt2,$xb3 punpcklqdq $xt3,$xt2 # "b2" punpckhqdq $xb2,$xb1 # "b1" punpckhqdq $xt3,$xb3 # "b3" ___ ($xb2,$xt2)=($xt2,$xb2); my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); $code.=<<___; paddd 0xc0-0x100(%rcx),$xc0 paddd 0xd0-0x100(%rcx),$xc1 paddd 0xe0-0x100(%rcx),$xc2 paddd 
0xf0-0x100(%rcx),$xc3 movdqa $xa2,0x20(%rsp) # keep offloading $xaN movdqa $xa3,0x30(%rsp) movdqa $xc0,$xt2 punpckldq $xc1,$xc0 movdqa $xc2,$xt3 punpckldq $xc3,$xc2 punpckhdq $xc1,$xt2 punpckhdq $xc3,$xt3 movdqa $xc0,$xc1 punpcklqdq $xc2,$xc0 # "c0" movdqa $xt2,$xc3 punpcklqdq $xt3,$xt2 # "c2" punpckhqdq $xc2,$xc1 # "c1" punpckhqdq $xt3,$xc3 # "c3" ___ ($xc2,$xt2)=($xt2,$xc2); ($xt0,$xt1)=($xa2,$xa3); # use $xaN as temporary $code.=<<___; paddd 0x100-0x100(%rcx),$xd0 paddd 0x110-0x100(%rcx),$xd1 paddd 0x120-0x100(%rcx),$xd2 paddd 0x130-0x100(%rcx),$xd3 movdqa $xd0,$xt2 punpckldq $xd1,$xd0 movdqa $xd2,$xt3 punpckldq $xd3,$xd2 punpckhdq $xd1,$xt2 punpckhdq $xd3,$xt3 movdqa $xd0,$xd1 punpcklqdq $xd2,$xd0 # "d0" movdqa $xt2,$xd3 punpcklqdq $xt3,$xt2 # "d2" punpckhqdq $xd2,$xd1 # "d1" punpckhqdq $xt3,$xd3 # "d3" ___ ($xd2,$xt2)=($xt2,$xd2); $code.=<<___; cmp \$64*4,$len jb .Ltail4x movdqu 0x00($inp),$xt0 # xor with input movdqu 0x10($inp),$xt1 movdqu 0x20($inp),$xt2 movdqu 0x30($inp),$xt3 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? pxor $xb0,$xt1 pxor $xc0,$xt2 pxor $xd0,$xt3 movdqu $xt0,0x00($out) movdqu 0x40($inp),$xt0 movdqu $xt1,0x10($out) movdqu 0x50($inp),$xt1 movdqu $xt2,0x20($out) movdqu 0x60($inp),$xt2 movdqu $xt3,0x30($out) movdqu 0x70($inp),$xt3 lea 0x80($inp),$inp # size optimization pxor 0x10(%rsp),$xt0 pxor $xb1,$xt1 pxor $xc1,$xt2 pxor $xd1,$xt3 movdqu $xt0,0x40($out) movdqu 0x00($inp),$xt0 movdqu $xt1,0x50($out) movdqu 0x10($inp),$xt1 movdqu $xt2,0x60($out) movdqu 0x20($inp),$xt2 movdqu $xt3,0x70($out) lea 0x80($out),$out # size optimization movdqu 0x30($inp),$xt3 pxor 0x20(%rsp),$xt0 pxor $xb2,$xt1 pxor $xc2,$xt2 pxor $xd2,$xt3 movdqu $xt0,0x00($out) movdqu 0x40($inp),$xt0 movdqu $xt1,0x10($out) movdqu 0x50($inp),$xt1 movdqu $xt2,0x20($out) movdqu 0x60($inp),$xt2 movdqu $xt3,0x30($out) movdqu 0x70($inp),$xt3 lea 0x80($inp),$inp # inp+=64*4 pxor 0x30(%rsp),$xt0 pxor $xb3,$xt1 pxor $xc3,$xt2 pxor $xd3,$xt3 movdqu $xt0,0x40($out) movdqu $xt1,0x50($out) movdqu $xt2,0x60($out) movdqu $xt3,0x70($out) lea 0x80($out),$out # out+=64*4 sub \$64*4,$len jnz .Loop_outer4x jmp .Ldone4x .Ltail4x: cmp \$192,$len jae .L192_or_more4x cmp \$128,$len jae .L128_or_more4x cmp \$64,$len jae .L64_or_more4x #movdqa 0x00(%rsp),$xt0 # $xaN is offloaded, remember? xor %r10,%r10 #movdqa $xt0,0x00(%rsp) movdqa $xb0,0x10(%rsp) movdqa $xc0,0x20(%rsp) movdqa $xd0,0x30(%rsp) jmp .Loop_tail4x .align 32 .L64_or_more4x: movdqu 0x00($inp),$xt0 # xor with input movdqu 0x10($inp),$xt1 movdqu 0x20($inp),$xt2 movdqu 0x30($inp),$xt3 pxor 0x00(%rsp),$xt0 # $xaxN is offloaded, remember? pxor $xb0,$xt1 pxor $xc0,$xt2 pxor $xd0,$xt3 movdqu $xt0,0x00($out) movdqu $xt1,0x10($out) movdqu $xt2,0x20($out) movdqu $xt3,0x30($out) je .Ldone4x movdqa 0x10(%rsp),$xt0 # $xaN is offloaded, remember? lea 0x40($inp),$inp # inp+=64*1 xor %r10,%r10 movdqa $xt0,0x00(%rsp) movdqa $xb1,0x10(%rsp) lea 0x40($out),$out # out+=64*1 movdqa $xc1,0x20(%rsp) sub \$64,$len # len-=64*1 movdqa $xd1,0x30(%rsp) jmp .Loop_tail4x .align 32 .L128_or_more4x: movdqu 0x00($inp),$xt0 # xor with input movdqu 0x10($inp),$xt1 movdqu 0x20($inp),$xt2 movdqu 0x30($inp),$xt3 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? 
pxor $xb0,$xt1 pxor $xc0,$xt2 pxor $xd0,$xt3 movdqu $xt0,0x00($out) movdqu 0x40($inp),$xt0 movdqu $xt1,0x10($out) movdqu 0x50($inp),$xt1 movdqu $xt2,0x20($out) movdqu 0x60($inp),$xt2 movdqu $xt3,0x30($out) movdqu 0x70($inp),$xt3 pxor 0x10(%rsp),$xt0 pxor $xb1,$xt1 pxor $xc1,$xt2 pxor $xd1,$xt3 movdqu $xt0,0x40($out) movdqu $xt1,0x50($out) movdqu $xt2,0x60($out) movdqu $xt3,0x70($out) je .Ldone4x movdqa 0x20(%rsp),$xt0 # $xaN is offloaded, remember? lea 0x80($inp),$inp # inp+=64*2 xor %r10,%r10 movdqa $xt0,0x00(%rsp) movdqa $xb2,0x10(%rsp) lea 0x80($out),$out # out+=64*2 movdqa $xc2,0x20(%rsp) sub \$128,$len # len-=64*2 movdqa $xd2,0x30(%rsp) jmp .Loop_tail4x .align 32 .L192_or_more4x: movdqu 0x00($inp),$xt0 # xor with input movdqu 0x10($inp),$xt1 movdqu 0x20($inp),$xt2 movdqu 0x30($inp),$xt3 pxor 0x00(%rsp),$xt0 # $xaN is offloaded, remember? pxor $xb0,$xt1 pxor $xc0,$xt2 pxor $xd0,$xt3 movdqu $xt0,0x00($out) movdqu 0x40($inp),$xt0 movdqu $xt1,0x10($out) movdqu 0x50($inp),$xt1 movdqu $xt2,0x20($out) movdqu 0x60($inp),$xt2 movdqu $xt3,0x30($out) movdqu 0x70($inp),$xt3 lea 0x80($inp),$inp # size optimization pxor 0x10(%rsp),$xt0 pxor $xb1,$xt1 pxor $xc1,$xt2 pxor $xd1,$xt3 movdqu $xt0,0x40($out) movdqu 0x00($inp),$xt0 movdqu $xt1,0x50($out) movdqu 0x10($inp),$xt1 movdqu $xt2,0x60($out) movdqu 0x20($inp),$xt2 movdqu $xt3,0x70($out) lea 0x80($out),$out # size optimization movdqu 0x30($inp),$xt3 pxor 0x20(%rsp),$xt0 pxor $xb2,$xt1 pxor $xc2,$xt2 pxor $xd2,$xt3 movdqu $xt0,0x00($out) movdqu $xt1,0x10($out) movdqu $xt2,0x20($out) movdqu $xt3,0x30($out) je .Ldone4x movdqa 0x30(%rsp),$xt0 # $xaN is offloaded, remember? lea 0x40($inp),$inp # inp+=64*3 xor %r10,%r10 movdqa $xt0,0x00(%rsp) movdqa $xb3,0x10(%rsp) lea 0x40($out),$out # out+=64*3 movdqa $xc3,0x20(%rsp) sub \$192,$len # len-=64*3 movdqa $xd3,0x30(%rsp) .Loop_tail4x: movzb ($inp,%r10),%eax movzb (%rsp,%r10),%ecx lea 1(%r10),%r10 xor %ecx,%eax mov %al,-1($out,%r10) dec $len jnz .Loop_tail4x .Ldone4x: ___ $code.=<<___ if ($win64); movaps -0xa8(%r9),%xmm6 movaps -0x98(%r9),%xmm7 movaps -0x88(%r9),%xmm8 movaps -0x78(%r9),%xmm9 movaps -0x68(%r9),%xmm10 movaps -0x58(%r9),%xmm11 movaps -0x48(%r9),%xmm12 movaps -0x38(%r9),%xmm13 movaps -0x28(%r9),%xmm14 movaps -0x18(%r9),%xmm15 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .L4x_epilogue: ret .cfi_endproc .size ChaCha20_4x,.-ChaCha20_4x ___ } ######################################################################## # XOP code path that handles all lengths. if ($avx) { # There is some "anomaly" observed depending on instructions' size or # alignment. If you look closely at below code you'll notice that # sometimes argument order varies. The order affects instruction # encoding by making it larger, and such fiddling gives 5% performance # improvement. This is on FX-4100... 
my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%xmm$_",(0..15)); my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, $xt0,$xt1,$xt2,$xt3, $xd0,$xd1,$xd2,$xd3); sub XOP_lane_ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my @x=map("\"$_\"",@xx); ( "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", "&vpxor (@x[$d1],@x[$a1],@x[$d1])", "&vpxor (@x[$d2],@x[$a2],@x[$d2])", "&vpxor (@x[$d3],@x[$a3],@x[$d3])", "&vprotd (@x[$d0],@x[$d0],16)", "&vprotd (@x[$d1],@x[$d1],16)", "&vprotd (@x[$d2],@x[$d2],16)", "&vprotd (@x[$d3],@x[$d3],16)", "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", "&vpxor (@x[$b0],@x[$c0],@x[$b0])", "&vpxor (@x[$b1],@x[$c1],@x[$b1])", "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip "&vprotd (@x[$b0],@x[$b0],12)", "&vprotd (@x[$b1],@x[$b1],12)", "&vprotd (@x[$b2],@x[$b2],12)", "&vprotd (@x[$b3],@x[$b3],12)", "&vpaddd (@x[$a0],@x[$b0],@x[$a0])", # flip "&vpaddd (@x[$a1],@x[$b1],@x[$a1])", # flip "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", "&vpxor (@x[$d0],@x[$a0],@x[$d0])", "&vpxor (@x[$d1],@x[$a1],@x[$d1])", "&vpxor (@x[$d2],@x[$a2],@x[$d2])", "&vpxor (@x[$d3],@x[$a3],@x[$d3])", "&vprotd (@x[$d0],@x[$d0],8)", "&vprotd (@x[$d1],@x[$d1],8)", "&vprotd (@x[$d2],@x[$d2],8)", "&vprotd (@x[$d3],@x[$d3],8)", "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", "&vpxor (@x[$b0],@x[$c0],@x[$b0])", "&vpxor (@x[$b1],@x[$c1],@x[$b1])", "&vpxor (@x[$b2],@x[$b2],@x[$c2])", # flip "&vpxor (@x[$b3],@x[$b3],@x[$c3])", # flip "&vprotd (@x[$b0],@x[$b0],7)", "&vprotd (@x[$b1],@x[$b1],7)", "&vprotd (@x[$b2],@x[$b2],7)", "&vprotd (@x[$b3],@x[$b3],7)" ); } my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; .type ChaCha20_4xop,\@function,5 .align 32 ChaCha20_4xop: .cfi_startproc .LChaCha20_4xop: mov %rsp,%r9 # frame pointer .cfi_def_cfa_register %r9 sub \$0x140+$xframe,%rsp ___ ################ stack layout # +0x00 SIMD equivalent of @x[8-12] # ... # +0x40 constant copy of key[0-2] smashed by lanes # ... # +0x100 SIMD counters (with nonce smashed by lanes) # ... # +0x140 $code.=<<___ if ($win64); movaps %xmm6,-0xa8(%r9) movaps %xmm7,-0x98(%r9) movaps %xmm8,-0x88(%r9) movaps %xmm9,-0x78(%r9) movaps %xmm10,-0x68(%r9) movaps %xmm11,-0x58(%r9) movaps %xmm12,-0x48(%r9) movaps %xmm13,-0x38(%r9) movaps %xmm14,-0x28(%r9) movaps %xmm15,-0x18(%r9) .L4xop_body: ___ $code.=<<___; vzeroupper vmovdqa .Lsigma(%rip),$xa3 # key[0] vmovdqu ($key),$xb3 # key[1] vmovdqu 16($key),$xt3 # key[2] vmovdqu ($counter),$xd3 # key[3] lea 0x100(%rsp),%rcx # size optimization vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... vpshufd \$0x55,$xa3,$xa1 vmovdqa $xa0,0x40(%rsp) # ... 
and offload vpshufd \$0xaa,$xa3,$xa2 vmovdqa $xa1,0x50(%rsp) vpshufd \$0xff,$xa3,$xa3 vmovdqa $xa2,0x60(%rsp) vmovdqa $xa3,0x70(%rsp) vpshufd \$0x00,$xb3,$xb0 vpshufd \$0x55,$xb3,$xb1 vmovdqa $xb0,0x80-0x100(%rcx) vpshufd \$0xaa,$xb3,$xb2 vmovdqa $xb1,0x90-0x100(%rcx) vpshufd \$0xff,$xb3,$xb3 vmovdqa $xb2,0xa0-0x100(%rcx) vmovdqa $xb3,0xb0-0x100(%rcx) vpshufd \$0x00,$xt3,$xt0 # "$xc0" vpshufd \$0x55,$xt3,$xt1 # "$xc1" vmovdqa $xt0,0xc0-0x100(%rcx) vpshufd \$0xaa,$xt3,$xt2 # "$xc2" vmovdqa $xt1,0xd0-0x100(%rcx) vpshufd \$0xff,$xt3,$xt3 # "$xc3" vmovdqa $xt2,0xe0-0x100(%rcx) vmovdqa $xt3,0xf0-0x100(%rcx) vpshufd \$0x00,$xd3,$xd0 vpshufd \$0x55,$xd3,$xd1 vpaddd .Linc(%rip),$xd0,$xd0 # don't save counters yet vpshufd \$0xaa,$xd3,$xd2 vmovdqa $xd1,0x110-0x100(%rcx) vpshufd \$0xff,$xd3,$xd3 vmovdqa $xd2,0x120-0x100(%rcx) vmovdqa $xd3,0x130-0x100(%rcx) jmp .Loop_enter4xop .align 32 .Loop_outer4xop: vmovdqa 0x40(%rsp),$xa0 # re-load smashed key vmovdqa 0x50(%rsp),$xa1 vmovdqa 0x60(%rsp),$xa2 vmovdqa 0x70(%rsp),$xa3 vmovdqa 0x80-0x100(%rcx),$xb0 vmovdqa 0x90-0x100(%rcx),$xb1 vmovdqa 0xa0-0x100(%rcx),$xb2 vmovdqa 0xb0-0x100(%rcx),$xb3 vmovdqa 0xc0-0x100(%rcx),$xt0 # "$xc0" vmovdqa 0xd0-0x100(%rcx),$xt1 # "$xc1" vmovdqa 0xe0-0x100(%rcx),$xt2 # "$xc2" vmovdqa 0xf0-0x100(%rcx),$xt3 # "$xc3" vmovdqa 0x100-0x100(%rcx),$xd0 vmovdqa 0x110-0x100(%rcx),$xd1 vmovdqa 0x120-0x100(%rcx),$xd2 vmovdqa 0x130-0x100(%rcx),$xd3 vpaddd .Lfour(%rip),$xd0,$xd0 # next SIMD counters .Loop_enter4xop: mov \$10,%eax vmovdqa $xd0,0x100-0x100(%rcx) # save SIMD counters jmp .Loop4xop .align 32 .Loop4xop: ___ foreach (&XOP_lane_ROUND(0, 4, 8,12)) { eval; } foreach (&XOP_lane_ROUND(0, 5,10,15)) { eval; } $code.=<<___; dec %eax jnz .Loop4xop vpaddd 0x40(%rsp),$xa0,$xa0 # accumulate key material vpaddd 0x50(%rsp),$xa1,$xa1 vpaddd 0x60(%rsp),$xa2,$xa2 vpaddd 0x70(%rsp),$xa3,$xa3 vmovdqa $xt2,0x20(%rsp) # offload $xc2,3 vmovdqa $xt3,0x30(%rsp) vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data vpunpckldq $xa3,$xa2,$xt3 vpunpckhdq $xa1,$xa0,$xa0 vpunpckhdq $xa3,$xa2,$xa2 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" vpunpckhqdq $xt3,$xt2,$xt2 # "a1" vpunpcklqdq $xa2,$xa0,$xa3 # "a2" vpunpckhqdq $xa2,$xa0,$xa0 # "a3" ___ ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); $code.=<<___; vpaddd 0x80-0x100(%rcx),$xb0,$xb0 vpaddd 0x90-0x100(%rcx),$xb1,$xb1 vpaddd 0xa0-0x100(%rcx),$xb2,$xb2 vpaddd 0xb0-0x100(%rcx),$xb3,$xb3 vmovdqa $xa0,0x00(%rsp) # offload $xa0,1 vmovdqa $xa1,0x10(%rsp) vmovdqa 0x20(%rsp),$xa0 # "xc2" vmovdqa 0x30(%rsp),$xa1 # "xc3" vpunpckldq $xb1,$xb0,$xt2 vpunpckldq $xb3,$xb2,$xt3 vpunpckhdq $xb1,$xb0,$xb0 vpunpckhdq $xb3,$xb2,$xb2 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" vpunpckhqdq $xt3,$xt2,$xt2 # "b1" vpunpcklqdq $xb2,$xb0,$xb3 # "b2" vpunpckhqdq $xb2,$xb0,$xb0 # "b3" ___ ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); $code.=<<___; vpaddd 0xc0-0x100(%rcx),$xc0,$xc0 vpaddd 0xd0-0x100(%rcx),$xc1,$xc1 vpaddd 0xe0-0x100(%rcx),$xc2,$xc2 vpaddd 0xf0-0x100(%rcx),$xc3,$xc3 vpunpckldq $xc1,$xc0,$xt2 vpunpckldq $xc3,$xc2,$xt3 vpunpckhdq $xc1,$xc0,$xc0 vpunpckhdq $xc3,$xc2,$xc2 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" vpunpckhqdq $xt3,$xt2,$xt2 # "c1" vpunpcklqdq $xc2,$xc0,$xc3 # "c2" vpunpckhqdq $xc2,$xc0,$xc0 # "c3" ___ ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); $code.=<<___; vpaddd 0x100-0x100(%rcx),$xd0,$xd0 vpaddd 0x110-0x100(%rcx),$xd1,$xd1 vpaddd 0x120-0x100(%rcx),$xd2,$xd2 vpaddd 0x130-0x100(%rcx),$xd3,$xd3 vpunpckldq $xd1,$xd0,$xt2 vpunpckldq $xd3,$xd2,$xt3 vpunpckhdq 
$xd1,$xd0,$xd0 vpunpckhdq $xd3,$xd2,$xd2 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" vpunpckhqdq $xt3,$xt2,$xt2 # "d1" vpunpcklqdq $xd2,$xd0,$xd3 # "d2" vpunpckhqdq $xd2,$xd0,$xd0 # "d3" ___ ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); ($xa0,$xa1)=($xt2,$xt3); $code.=<<___; vmovdqa 0x00(%rsp),$xa0 # restore $xa0,1 vmovdqa 0x10(%rsp),$xa1 cmp \$64*4,$len jb .Ltail4xop vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x10($inp),$xb0,$xb0 vpxor 0x20($inp),$xc0,$xc0 vpxor 0x30($inp),$xd0,$xd0 vpxor 0x40($inp),$xa1,$xa1 vpxor 0x50($inp),$xb1,$xb1 vpxor 0x60($inp),$xc1,$xc1 vpxor 0x70($inp),$xd1,$xd1 lea 0x80($inp),$inp # size optimization vpxor 0x00($inp),$xa2,$xa2 vpxor 0x10($inp),$xb2,$xb2 vpxor 0x20($inp),$xc2,$xc2 vpxor 0x30($inp),$xd2,$xd2 vpxor 0x40($inp),$xa3,$xa3 vpxor 0x50($inp),$xb3,$xb3 vpxor 0x60($inp),$xc3,$xc3 vpxor 0x70($inp),$xd3,$xd3 lea 0x80($inp),$inp # inp+=64*4 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x10($out) vmovdqu $xc0,0x20($out) vmovdqu $xd0,0x30($out) vmovdqu $xa1,0x40($out) vmovdqu $xb1,0x50($out) vmovdqu $xc1,0x60($out) vmovdqu $xd1,0x70($out) lea 0x80($out),$out # size optimization vmovdqu $xa2,0x00($out) vmovdqu $xb2,0x10($out) vmovdqu $xc2,0x20($out) vmovdqu $xd2,0x30($out) vmovdqu $xa3,0x40($out) vmovdqu $xb3,0x50($out) vmovdqu $xc3,0x60($out) vmovdqu $xd3,0x70($out) lea 0x80($out),$out # out+=64*4 sub \$64*4,$len jnz .Loop_outer4xop jmp .Ldone4xop .align 32 .Ltail4xop: cmp \$192,$len jae .L192_or_more4xop cmp \$128,$len jae .L128_or_more4xop cmp \$64,$len jae .L64_or_more4xop xor %r10,%r10 vmovdqa $xa0,0x00(%rsp) vmovdqa $xb0,0x10(%rsp) vmovdqa $xc0,0x20(%rsp) vmovdqa $xd0,0x30(%rsp) jmp .Loop_tail4xop .align 32 .L64_or_more4xop: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x10($inp),$xb0,$xb0 vpxor 0x20($inp),$xc0,$xc0 vpxor 0x30($inp),$xd0,$xd0 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x10($out) vmovdqu $xc0,0x20($out) vmovdqu $xd0,0x30($out) je .Ldone4xop lea 0x40($inp),$inp # inp+=64*1 vmovdqa $xa1,0x00(%rsp) xor %r10,%r10 vmovdqa $xb1,0x10(%rsp) lea 0x40($out),$out # out+=64*1 vmovdqa $xc1,0x20(%rsp) sub \$64,$len # len-=64*1 vmovdqa $xd1,0x30(%rsp) jmp .Loop_tail4xop .align 32 .L128_or_more4xop: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x10($inp),$xb0,$xb0 vpxor 0x20($inp),$xc0,$xc0 vpxor 0x30($inp),$xd0,$xd0 vpxor 0x40($inp),$xa1,$xa1 vpxor 0x50($inp),$xb1,$xb1 vpxor 0x60($inp),$xc1,$xc1 vpxor 0x70($inp),$xd1,$xd1 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x10($out) vmovdqu $xc0,0x20($out) vmovdqu $xd0,0x30($out) vmovdqu $xa1,0x40($out) vmovdqu $xb1,0x50($out) vmovdqu $xc1,0x60($out) vmovdqu $xd1,0x70($out) je .Ldone4xop lea 0x80($inp),$inp # inp+=64*2 vmovdqa $xa2,0x00(%rsp) xor %r10,%r10 vmovdqa $xb2,0x10(%rsp) lea 0x80($out),$out # out+=64*2 vmovdqa $xc2,0x20(%rsp) sub \$128,$len # len-=64*2 vmovdqa $xd2,0x30(%rsp) jmp .Loop_tail4xop .align 32 .L192_or_more4xop: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x10($inp),$xb0,$xb0 vpxor 0x20($inp),$xc0,$xc0 vpxor 0x30($inp),$xd0,$xd0 vpxor 0x40($inp),$xa1,$xa1 vpxor 0x50($inp),$xb1,$xb1 vpxor 0x60($inp),$xc1,$xc1 vpxor 0x70($inp),$xd1,$xd1 lea 0x80($inp),$inp # size optimization vpxor 0x00($inp),$xa2,$xa2 vpxor 0x10($inp),$xb2,$xb2 vpxor 0x20($inp),$xc2,$xc2 vpxor 0x30($inp),$xd2,$xd2 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x10($out) vmovdqu $xc0,0x20($out) vmovdqu $xd0,0x30($out) vmovdqu $xa1,0x40($out) vmovdqu $xb1,0x50($out) vmovdqu $xc1,0x60($out) vmovdqu $xd1,0x70($out) lea 0x80($out),$out # size optimization vmovdqu $xa2,0x00($out) vmovdqu $xb2,0x10($out) vmovdqu $xc2,0x20($out) vmovdqu 
$xd2,0x30($out) je .Ldone4xop lea 0x40($inp),$inp # inp+=64*3 vmovdqa $xa3,0x00(%rsp) xor %r10,%r10 vmovdqa $xb3,0x10(%rsp) lea 0x40($out),$out # out+=64*3 vmovdqa $xc3,0x20(%rsp) sub \$192,$len # len-=64*3 vmovdqa $xd3,0x30(%rsp) .Loop_tail4xop: movzb ($inp,%r10),%eax movzb (%rsp,%r10),%ecx lea 1(%r10),%r10 xor %ecx,%eax mov %al,-1($out,%r10) dec $len jnz .Loop_tail4xop .Ldone4xop: vzeroupper ___ $code.=<<___ if ($win64); movaps -0xa8(%r9),%xmm6 movaps -0x98(%r9),%xmm7 movaps -0x88(%r9),%xmm8 movaps -0x78(%r9),%xmm9 movaps -0x68(%r9),%xmm10 movaps -0x58(%r9),%xmm11 movaps -0x48(%r9),%xmm12 movaps -0x38(%r9),%xmm13 movaps -0x28(%r9),%xmm14 movaps -0x18(%r9),%xmm15 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .L4xop_epilogue: ret .cfi_endproc .size ChaCha20_4xop,.-ChaCha20_4xop ___ } ######################################################################## # AVX2 code path if ($avx>1) { my ($xb0,$xb1,$xb2,$xb3, $xd0,$xd1,$xd2,$xd3, $xa0,$xa1,$xa2,$xa3, $xt0,$xt1,$xt2,$xt3)=map("%ymm$_",(0..15)); my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, "%nox","%nox","%nox","%nox", $xd0,$xd1,$xd2,$xd3); sub AVX2_lane_ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my ($xc,$xc_,$t0,$t1)=map("\"$_\"",$xt0,$xt1,$xt2,$xt3); my @x=map("\"$_\"",@xx); # Consider order in which variables are addressed by their # index: # # a b c d # # 0 4 8 12 < even round # 1 5 9 13 # 2 6 10 14 # 3 7 11 15 # 0 5 10 15 < odd round # 1 6 11 12 # 2 7 8 13 # 3 4 9 14 # # 'a', 'b' and 'd's are permanently allocated in registers, # @x[0..7,12..15], while 'c's are maintained in memory. If # you observe 'c' column, you'll notice that pair of 'c's is # invariant between rounds. This means that we have to reload # them once per round, in the middle. This is why you'll see # bunch of 'c' stores and loads in the middle, but none in # the beginning or end. 
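	#
	# For reference, a minimal scalar sketch of the quarter-round that the
	# interleaved SIMD sequence below implements.  This helper is purely
	# illustrative (an editorial addition, never called by the generator):
	# even rounds apply it to the columns (0,4,8,12)..(3,7,11,15), odd
	# rounds to the diagonals (0,5,10,15)..(3,4,9,14), i.e. exactly the
	# index table above.  In the vector code the 16- and 8-bit rotates are
	# done with vpshufb (the .Lrot16/.Lrot24 byte-shuffle masks) and the
	# 12- and 7-bit ones with vpslld/vpsrld/vpor.
	my $quarter_round_ref = sub {
		my ($x,$a,$b,$c,$d) = @_;	# $x: ref to 16 32-bit state words
		my $rotl = sub { (($_[0]<<$_[1]) | ($_[0]>>(32-$_[1]))) & 0xffffffff };

		$x->[$a] = ($x->[$a] + $x->[$b]) & 0xffffffff;	# a += b
		$x->[$d] = $rotl->($x->[$d] ^ $x->[$a], 16);	# d = (d^a)<<<16
		$x->[$c] = ($x->[$c] + $x->[$d]) & 0xffffffff;	# c += d
		$x->[$b] = $rotl->($x->[$b] ^ $x->[$c], 12);	# b = (b^c)<<<12
		$x->[$a] = ($x->[$a] + $x->[$b]) & 0xffffffff;	# a += b
		$x->[$d] = $rotl->($x->[$d] ^ $x->[$a], 8);	# d = (d^a)<<<8
		$x->[$c] = ($x->[$c] + $x->[$d]) & 0xffffffff;	# c += d
		$x->[$b] = $rotl->($x->[$b] ^ $x->[$c], 7);	# b = (b^c)<<<7
	};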
( "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 "&vpxor (@x[$d0],@x[$a0],@x[$d0])", "&vpshufb (@x[$d0],@x[$d0],$t1)", "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 "&vpxor (@x[$d1],@x[$a1],@x[$d1])", "&vpshufb (@x[$d1],@x[$d1],$t1)", "&vpaddd ($xc,$xc,@x[$d0])", "&vpxor (@x[$b0],$xc,@x[$b0])", "&vpslld ($t0,@x[$b0],12)", "&vpsrld (@x[$b0],@x[$b0],20)", "&vpor (@x[$b0],$t0,@x[$b0])", "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) "&vpaddd ($xc_,$xc_,@x[$d1])", "&vpxor (@x[$b1],$xc_,@x[$b1])", "&vpslld ($t1,@x[$b1],12)", "&vpsrld (@x[$b1],@x[$b1],20)", "&vpor (@x[$b1],$t1,@x[$b1])", "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", "&vpxor (@x[$d0],@x[$a0],@x[$d0])", "&vpshufb (@x[$d0],@x[$d0],$t0)", "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", "&vpxor (@x[$d1],@x[$a1],@x[$d1])", "&vpshufb (@x[$d1],@x[$d1],$t0)", "&vpaddd ($xc,$xc,@x[$d0])", "&vpxor (@x[$b0],$xc,@x[$b0])", "&vpslld ($t1,@x[$b0],7)", "&vpsrld (@x[$b0],@x[$b0],25)", "&vpor (@x[$b0],$t1,@x[$b0])", "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) "&vpaddd ($xc_,$xc_,@x[$d1])", "&vpxor (@x[$b1],$xc_,@x[$b1])", "&vpslld ($t0,@x[$b1],7)", "&vpsrld (@x[$b1],@x[$b1],25)", "&vpor (@x[$b1],$t0,@x[$b1])", "&vmovdqa (\"`32*($c0-8)`(%rsp)\",$xc)", # reload pair of 'c's "&vmovdqa (\"`32*($c1-8)`(%rsp)\",$xc_)", "&vmovdqa ($xc,\"`32*($c2-8)`(%rsp)\")", "&vmovdqa ($xc_,\"`32*($c3-8)`(%rsp)\")", "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 "&vpxor (@x[$d2],@x[$a2],@x[$d2])", "&vpshufb (@x[$d2],@x[$d2],$t1)", "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 "&vpxor (@x[$d3],@x[$a3],@x[$d3])", "&vpshufb (@x[$d3],@x[$d3],$t1)", "&vpaddd ($xc,$xc,@x[$d2])", "&vpxor (@x[$b2],$xc,@x[$b2])", "&vpslld ($t0,@x[$b2],12)", "&vpsrld (@x[$b2],@x[$b2],20)", "&vpor (@x[$b2],$t0,@x[$b2])", "&vbroadcasti128($t0,'(%r11)')", # .Lrot24(%rip) "&vpaddd ($xc_,$xc_,@x[$d3])", "&vpxor (@x[$b3],$xc_,@x[$b3])", "&vpslld ($t1,@x[$b3],12)", "&vpsrld (@x[$b3],@x[$b3],20)", "&vpor (@x[$b3],$t1,@x[$b3])", "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", "&vpxor (@x[$d2],@x[$a2],@x[$d2])", "&vpshufb (@x[$d2],@x[$d2],$t0)", "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", "&vpxor (@x[$d3],@x[$a3],@x[$d3])", "&vpshufb (@x[$d3],@x[$d3],$t0)", "&vpaddd ($xc,$xc,@x[$d2])", "&vpxor (@x[$b2],$xc,@x[$b2])", "&vpslld ($t1,@x[$b2],7)", "&vpsrld (@x[$b2],@x[$b2],25)", "&vpor (@x[$b2],$t1,@x[$b2])", "&vbroadcasti128($t1,'(%r10)')", # .Lrot16(%rip) "&vpaddd ($xc_,$xc_,@x[$d3])", "&vpxor (@x[$b3],$xc_,@x[$b3])", "&vpslld ($t0,@x[$b3],7)", "&vpsrld (@x[$b3],@x[$b3],25)", "&vpor (@x[$b3],$t0,@x[$b3])" ); } my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; .type ChaCha20_8x,\@function,5 .align 32 ChaCha20_8x: .cfi_startproc .LChaCha20_8x: mov %rsp,%r9 # frame register .cfi_def_cfa_register %r9 sub \$0x280+$xframe,%rsp and \$-32,%rsp ___ $code.=<<___ if ($win64); movaps %xmm6,-0xa8(%r9) movaps %xmm7,-0x98(%r9) movaps %xmm8,-0x88(%r9) movaps %xmm9,-0x78(%r9) movaps %xmm10,-0x68(%r9) movaps %xmm11,-0x58(%r9) movaps %xmm12,-0x48(%r9) movaps %xmm13,-0x38(%r9) movaps %xmm14,-0x28(%r9) movaps %xmm15,-0x18(%r9) .L8x_body: ___ $code.=<<___; vzeroupper ################ stack layout # +0x00 SIMD equivalent of @x[8-12] # ... # +0x80 constant copy of key[0-2] smashed by lanes # ... # +0x200 SIMD counters (with nonce smashed by lanes) # ... 
# +0x280 vbroadcasti128 .Lsigma(%rip),$xa3 # key[0] vbroadcasti128 ($key),$xb3 # key[1] vbroadcasti128 16($key),$xt3 # key[2] vbroadcasti128 ($counter),$xd3 # key[3] lea 0x100(%rsp),%rcx # size optimization lea 0x200(%rsp),%rax # size optimization lea .Lrot16(%rip),%r10 lea .Lrot24(%rip),%r11 vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... vpshufd \$0x55,$xa3,$xa1 vmovdqa $xa0,0x80-0x100(%rcx) # ... and offload vpshufd \$0xaa,$xa3,$xa2 vmovdqa $xa1,0xa0-0x100(%rcx) vpshufd \$0xff,$xa3,$xa3 vmovdqa $xa2,0xc0-0x100(%rcx) vmovdqa $xa3,0xe0-0x100(%rcx) vpshufd \$0x00,$xb3,$xb0 vpshufd \$0x55,$xb3,$xb1 vmovdqa $xb0,0x100-0x100(%rcx) vpshufd \$0xaa,$xb3,$xb2 vmovdqa $xb1,0x120-0x100(%rcx) vpshufd \$0xff,$xb3,$xb3 vmovdqa $xb2,0x140-0x100(%rcx) vmovdqa $xb3,0x160-0x100(%rcx) vpshufd \$0x00,$xt3,$xt0 # "xc0" vpshufd \$0x55,$xt3,$xt1 # "xc1" vmovdqa $xt0,0x180-0x200(%rax) vpshufd \$0xaa,$xt3,$xt2 # "xc2" vmovdqa $xt1,0x1a0-0x200(%rax) vpshufd \$0xff,$xt3,$xt3 # "xc3" vmovdqa $xt2,0x1c0-0x200(%rax) vmovdqa $xt3,0x1e0-0x200(%rax) vpshufd \$0x00,$xd3,$xd0 vpshufd \$0x55,$xd3,$xd1 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet vpshufd \$0xaa,$xd3,$xd2 vmovdqa $xd1,0x220-0x200(%rax) vpshufd \$0xff,$xd3,$xd3 vmovdqa $xd2,0x240-0x200(%rax) vmovdqa $xd3,0x260-0x200(%rax) jmp .Loop_enter8x .align 32 .Loop_outer8x: vmovdqa 0x80-0x100(%rcx),$xa0 # re-load smashed key vmovdqa 0xa0-0x100(%rcx),$xa1 vmovdqa 0xc0-0x100(%rcx),$xa2 vmovdqa 0xe0-0x100(%rcx),$xa3 vmovdqa 0x100-0x100(%rcx),$xb0 vmovdqa 0x120-0x100(%rcx),$xb1 vmovdqa 0x140-0x100(%rcx),$xb2 vmovdqa 0x160-0x100(%rcx),$xb3 vmovdqa 0x180-0x200(%rax),$xt0 # "xc0" vmovdqa 0x1a0-0x200(%rax),$xt1 # "xc1" vmovdqa 0x1c0-0x200(%rax),$xt2 # "xc2" vmovdqa 0x1e0-0x200(%rax),$xt3 # "xc3" vmovdqa 0x200-0x200(%rax),$xd0 vmovdqa 0x220-0x200(%rax),$xd1 vmovdqa 0x240-0x200(%rax),$xd2 vmovdqa 0x260-0x200(%rax),$xd3 vpaddd .Leight(%rip),$xd0,$xd0 # next SIMD counters .Loop_enter8x: vmovdqa $xt2,0x40(%rsp) # SIMD equivalent of "@x[10]" vmovdqa $xt3,0x60(%rsp) # SIMD equivalent of "@x[11]" vbroadcasti128 (%r10),$xt3 vmovdqa $xd0,0x200-0x200(%rax) # save SIMD counters mov \$10,%eax jmp .Loop8x .align 32 .Loop8x: ___ foreach (&AVX2_lane_ROUND(0, 4, 8,12)) { eval; } foreach (&AVX2_lane_ROUND(0, 5,10,15)) { eval; } $code.=<<___; dec %eax jnz .Loop8x lea 0x200(%rsp),%rax # size optimization vpaddd 0x80-0x100(%rcx),$xa0,$xa0 # accumulate key vpaddd 0xa0-0x100(%rcx),$xa1,$xa1 vpaddd 0xc0-0x100(%rcx),$xa2,$xa2 vpaddd 0xe0-0x100(%rcx),$xa3,$xa3 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data vpunpckldq $xa3,$xa2,$xt3 vpunpckhdq $xa1,$xa0,$xa0 vpunpckhdq $xa3,$xa2,$xa2 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" vpunpckhqdq $xt3,$xt2,$xt2 # "a1" vpunpcklqdq $xa2,$xa0,$xa3 # "a2" vpunpckhqdq $xa2,$xa0,$xa0 # "a3" ___ ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); $code.=<<___; vpaddd 0x100-0x100(%rcx),$xb0,$xb0 vpaddd 0x120-0x100(%rcx),$xb1,$xb1 vpaddd 0x140-0x100(%rcx),$xb2,$xb2 vpaddd 0x160-0x100(%rcx),$xb3,$xb3 vpunpckldq $xb1,$xb0,$xt2 vpunpckldq $xb3,$xb2,$xt3 vpunpckhdq $xb1,$xb0,$xb0 vpunpckhdq $xb3,$xb2,$xb2 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" vpunpckhqdq $xt3,$xt2,$xt2 # "b1" vpunpcklqdq $xb2,$xb0,$xb3 # "b2" vpunpckhqdq $xb2,$xb0,$xb0 # "b3" ___ ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); $code.=<<___; vperm2i128 \$0x20,$xb0,$xa0,$xt3 # "de-interlace" further vperm2i128 \$0x31,$xb0,$xa0,$xb0 vperm2i128 \$0x20,$xb1,$xa1,$xa0 vperm2i128 \$0x31,$xb1,$xa1,$xb1 vperm2i128 \$0x20,$xb2,$xa2,$xa1 vperm2i128 \$0x31,$xb2,$xa2,$xb2 vperm2i128 \$0x20,$xb3,$xa3,$xa2 
vperm2i128 \$0x31,$xb3,$xa3,$xb3 ___ ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); my ($xc0,$xc1,$xc2,$xc3)=($xt0,$xt1,$xa0,$xa1); $code.=<<___; vmovdqa $xa0,0x00(%rsp) # offload $xaN vmovdqa $xa1,0x20(%rsp) vmovdqa 0x40(%rsp),$xc2 # $xa0 vmovdqa 0x60(%rsp),$xc3 # $xa1 vpaddd 0x180-0x200(%rax),$xc0,$xc0 vpaddd 0x1a0-0x200(%rax),$xc1,$xc1 vpaddd 0x1c0-0x200(%rax),$xc2,$xc2 vpaddd 0x1e0-0x200(%rax),$xc3,$xc3 vpunpckldq $xc1,$xc0,$xt2 vpunpckldq $xc3,$xc2,$xt3 vpunpckhdq $xc1,$xc0,$xc0 vpunpckhdq $xc3,$xc2,$xc2 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" vpunpckhqdq $xt3,$xt2,$xt2 # "c1" vpunpcklqdq $xc2,$xc0,$xc3 # "c2" vpunpckhqdq $xc2,$xc0,$xc0 # "c3" ___ ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); $code.=<<___; vpaddd 0x200-0x200(%rax),$xd0,$xd0 vpaddd 0x220-0x200(%rax),$xd1,$xd1 vpaddd 0x240-0x200(%rax),$xd2,$xd2 vpaddd 0x260-0x200(%rax),$xd3,$xd3 vpunpckldq $xd1,$xd0,$xt2 vpunpckldq $xd3,$xd2,$xt3 vpunpckhdq $xd1,$xd0,$xd0 vpunpckhdq $xd3,$xd2,$xd2 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" vpunpckhqdq $xt3,$xt2,$xt2 # "d1" vpunpcklqdq $xd2,$xd0,$xd3 # "d2" vpunpckhqdq $xd2,$xd0,$xd0 # "d3" ___ ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); $code.=<<___; vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further vperm2i128 \$0x31,$xd0,$xc0,$xd0 vperm2i128 \$0x20,$xd1,$xc1,$xc0 vperm2i128 \$0x31,$xd1,$xc1,$xd1 vperm2i128 \$0x20,$xd2,$xc2,$xc1 vperm2i128 \$0x31,$xd2,$xc2,$xd2 vperm2i128 \$0x20,$xd3,$xc3,$xc2 vperm2i128 \$0x31,$xd3,$xc3,$xd3 ___ ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); ($xa0,$xa1)=($xt2,$xt3); $code.=<<___; vmovdqa 0x00(%rsp),$xa0 # $xaN was offloaded, remember? vmovdqa 0x20(%rsp),$xa1 cmp \$64*8,$len jb .Ltail8x vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 lea 0x80($inp),$inp # size optimization vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) lea 0x80($out),$out # size optimization vpxor 0x00($inp),$xa1,$xa1 vpxor 0x20($inp),$xb1,$xb1 vpxor 0x40($inp),$xc1,$xc1 vpxor 0x60($inp),$xd1,$xd1 lea 0x80($inp),$inp # size optimization vmovdqu $xa1,0x00($out) vmovdqu $xb1,0x20($out) vmovdqu $xc1,0x40($out) vmovdqu $xd1,0x60($out) lea 0x80($out),$out # size optimization vpxor 0x00($inp),$xa2,$xa2 vpxor 0x20($inp),$xb2,$xb2 vpxor 0x40($inp),$xc2,$xc2 vpxor 0x60($inp),$xd2,$xd2 lea 0x80($inp),$inp # size optimization vmovdqu $xa2,0x00($out) vmovdqu $xb2,0x20($out) vmovdqu $xc2,0x40($out) vmovdqu $xd2,0x60($out) lea 0x80($out),$out # size optimization vpxor 0x00($inp),$xa3,$xa3 vpxor 0x20($inp),$xb3,$xb3 vpxor 0x40($inp),$xc3,$xc3 vpxor 0x60($inp),$xd3,$xd3 lea 0x80($inp),$inp # size optimization vmovdqu $xa3,0x00($out) vmovdqu $xb3,0x20($out) vmovdqu $xc3,0x40($out) vmovdqu $xd3,0x60($out) lea 0x80($out),$out # size optimization sub \$64*8,$len jnz .Loop_outer8x jmp .Ldone8x .Ltail8x: cmp \$448,$len jae .L448_or_more8x cmp \$384,$len jae .L384_or_more8x cmp \$320,$len jae .L320_or_more8x cmp \$256,$len jae .L256_or_more8x cmp \$192,$len jae .L192_or_more8x cmp \$128,$len jae .L128_or_more8x cmp \$64,$len jae .L64_or_more8x xor %r10,%r10 vmovdqa $xa0,0x00(%rsp) vmovdqa $xb0,0x20(%rsp) jmp .Loop_tail8x .align 32 .L64_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) je .Ldone8x lea 0x40($inp),$inp # inp+=64*1 xor %r10,%r10 vmovdqa $xc0,0x00(%rsp) lea 0x40($out),$out # 
out+=64*1 sub \$64,$len # len-=64*1 vmovdqa $xd0,0x20(%rsp) jmp .Loop_tail8x .align 32 .L128_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) je .Ldone8x lea 0x80($inp),$inp # inp+=64*2 xor %r10,%r10 vmovdqa $xa1,0x00(%rsp) lea 0x80($out),$out # out+=64*2 sub \$128,$len # len-=64*2 vmovdqa $xb1,0x20(%rsp) jmp .Loop_tail8x .align 32 .L192_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) je .Ldone8x lea 0xc0($inp),$inp # inp+=64*3 xor %r10,%r10 vmovdqa $xc1,0x00(%rsp) lea 0xc0($out),$out # out+=64*3 sub \$192,$len # len-=64*3 vmovdqa $xd1,0x20(%rsp) jmp .Loop_tail8x .align 32 .L256_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vpxor 0xc0($inp),$xc1,$xc1 vpxor 0xe0($inp),$xd1,$xd1 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) vmovdqu $xc1,0xc0($out) vmovdqu $xd1,0xe0($out) je .Ldone8x lea 0x100($inp),$inp # inp+=64*4 xor %r10,%r10 vmovdqa $xa2,0x00(%rsp) lea 0x100($out),$out # out+=64*4 sub \$256,$len # len-=64*4 vmovdqa $xb2,0x20(%rsp) jmp .Loop_tail8x .align 32 .L320_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vpxor 0xc0($inp),$xc1,$xc1 vpxor 0xe0($inp),$xd1,$xd1 vpxor 0x100($inp),$xa2,$xa2 vpxor 0x120($inp),$xb2,$xb2 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) vmovdqu $xc1,0xc0($out) vmovdqu $xd1,0xe0($out) vmovdqu $xa2,0x100($out) vmovdqu $xb2,0x120($out) je .Ldone8x lea 0x140($inp),$inp # inp+=64*5 xor %r10,%r10 vmovdqa $xc2,0x00(%rsp) lea 0x140($out),$out # out+=64*5 sub \$320,$len # len-=64*5 vmovdqa $xd2,0x20(%rsp) jmp .Loop_tail8x .align 32 .L384_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vpxor 0xc0($inp),$xc1,$xc1 vpxor 0xe0($inp),$xd1,$xd1 vpxor 0x100($inp),$xa2,$xa2 vpxor 0x120($inp),$xb2,$xb2 vpxor 0x140($inp),$xc2,$xc2 vpxor 0x160($inp),$xd2,$xd2 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) vmovdqu $xc1,0xc0($out) vmovdqu $xd1,0xe0($out) vmovdqu $xa2,0x100($out) vmovdqu $xb2,0x120($out) vmovdqu $xc2,0x140($out) vmovdqu $xd2,0x160($out) je .Ldone8x lea 0x180($inp),$inp # inp+=64*6 xor %r10,%r10 vmovdqa $xa3,0x00(%rsp) lea 0x180($out),$out # out+=64*6 sub \$384,$len # len-=64*6 vmovdqa $xb3,0x20(%rsp) jmp .Loop_tail8x .align 32 .L448_or_more8x: vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 vpxor 0x80($inp),$xa1,$xa1 vpxor 0xa0($inp),$xb1,$xb1 vpxor 0xc0($inp),$xc1,$xc1 vpxor 0xe0($inp),$xd1,$xd1 vpxor 0x100($inp),$xa2,$xa2 
vpxor 0x120($inp),$xb2,$xb2 vpxor 0x140($inp),$xc2,$xc2 vpxor 0x160($inp),$xd2,$xd2 vpxor 0x180($inp),$xa3,$xa3 vpxor 0x1a0($inp),$xb3,$xb3 vmovdqu $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) vmovdqu $xa1,0x80($out) vmovdqu $xb1,0xa0($out) vmovdqu $xc1,0xc0($out) vmovdqu $xd1,0xe0($out) vmovdqu $xa2,0x100($out) vmovdqu $xb2,0x120($out) vmovdqu $xc2,0x140($out) vmovdqu $xd2,0x160($out) vmovdqu $xa3,0x180($out) vmovdqu $xb3,0x1a0($out) je .Ldone8x lea 0x1c0($inp),$inp # inp+=64*7 xor %r10,%r10 vmovdqa $xc3,0x00(%rsp) lea 0x1c0($out),$out # out+=64*7 sub \$448,$len # len-=64*7 vmovdqa $xd3,0x20(%rsp) .Loop_tail8x: movzb ($inp,%r10),%eax movzb (%rsp,%r10),%ecx lea 1(%r10),%r10 xor %ecx,%eax mov %al,-1($out,%r10) dec $len jnz .Loop_tail8x .Ldone8x: vzeroall ___ $code.=<<___ if ($win64); movaps -0xa8(%r9),%xmm6 movaps -0x98(%r9),%xmm7 movaps -0x88(%r9),%xmm8 movaps -0x78(%r9),%xmm9 movaps -0x68(%r9),%xmm10 movaps -0x58(%r9),%xmm11 movaps -0x48(%r9),%xmm12 movaps -0x38(%r9),%xmm13 movaps -0x28(%r9),%xmm14 movaps -0x18(%r9),%xmm15 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .L8x_epilogue: ret .cfi_endproc .size ChaCha20_8x,.-ChaCha20_8x ___ } ######################################################################## # AVX512 code paths if ($avx>2) { # This one handles shorter inputs... my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); sub vpxord() # size optimization { my $opcode = "vpxor"; # adhere to vpxor when possible foreach (@_) { if (/%([zy])mm([0-9]+)/ && ($1 eq "z" || $2>=16)) { $opcode = "vpxord"; last; } } $code .= "\t$opcode\t".join(',',reverse @_)."\n"; } sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round &vpaddd ($a,$a,$b); &vpxord ($d,$d,$a); &vprold ($d,$d,16); &vpaddd ($c,$c,$d); &vpxord ($b,$b,$c); &vprold ($b,$b,12); &vpaddd ($a,$a,$b); &vpxord ($d,$d,$a); &vprold ($d,$d,8); &vpaddd ($c,$c,$d); &vpxord ($b,$b,$c); &vprold ($b,$b,7); } my $xframe = $win64 ? 
32+8 : 8; $code.=<<___; .type ChaCha20_avx512,\@function,5 .align 32 ChaCha20_avx512: .cfi_startproc .LChaCha20_avx512: mov %rsp,%r9 # frame pointer .cfi_def_cfa_register %r9 cmp \$512,$len ja .LChaCha20_16x sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); movaps %xmm6,-0x28(%r9) movaps %xmm7,-0x18(%r9) .Lavx512_body: ___ $code.=<<___; vbroadcasti32x4 .Lsigma(%rip),$a vbroadcasti32x4 ($key),$b vbroadcasti32x4 16($key),$c vbroadcasti32x4 ($counter),$d vmovdqa32 $a,$a_ vmovdqa32 $b,$b_ vmovdqa32 $c,$c_ vpaddd .Lzeroz(%rip),$d,$d vmovdqa32 .Lfourz(%rip),$fourz mov \$10,$counter # reuse $counter vmovdqa32 $d,$d_ jmp .Loop_avx512 .align 16 .Loop_outer_avx512: vmovdqa32 $a_,$a vmovdqa32 $b_,$b vmovdqa32 $c_,$c vpaddd $fourz,$d_,$d mov \$10,$counter vmovdqa32 $d,$d_ jmp .Loop_avx512 .align 32 .Loop_avx512: ___ &AVX512ROUND(); &vpshufd ($c,$c,0b01001110); &vpshufd ($b,$b,0b00111001); &vpshufd ($d,$d,0b10010011); &AVX512ROUND(); &vpshufd ($c,$c,0b01001110); &vpshufd ($b,$b,0b10010011); &vpshufd ($d,$d,0b00111001); &dec ($counter); &jnz (".Loop_avx512"); $code.=<<___; vpaddd $a_,$a,$a vpaddd $b_,$b,$b vpaddd $c_,$c,$c vpaddd $d_,$d,$d sub \$64,$len jb .Ltail64_avx512 vpxor 0x00($inp),%x#$a,$t0 # xor with input vpxor 0x10($inp),%x#$b,$t1 vpxor 0x20($inp),%x#$c,$t2 vpxor 0x30($inp),%x#$d,$t3 lea 0x40($inp),$inp # inp+=64 vmovdqu $t0,0x00($out) # write output vmovdqu $t1,0x10($out) vmovdqu $t2,0x20($out) vmovdqu $t3,0x30($out) lea 0x40($out),$out # out+=64 jz .Ldone_avx512 vextracti32x4 \$1,$a,$t0 vextracti32x4 \$1,$b,$t1 vextracti32x4 \$1,$c,$t2 vextracti32x4 \$1,$d,$t3 sub \$64,$len jb .Ltail_avx512 vpxor 0x00($inp),$t0,$t0 # xor with input vpxor 0x10($inp),$t1,$t1 vpxor 0x20($inp),$t2,$t2 vpxor 0x30($inp),$t3,$t3 lea 0x40($inp),$inp # inp+=64 vmovdqu $t0,0x00($out) # write output vmovdqu $t1,0x10($out) vmovdqu $t2,0x20($out) vmovdqu $t3,0x30($out) lea 0x40($out),$out # out+=64 jz .Ldone_avx512 vextracti32x4 \$2,$a,$t0 vextracti32x4 \$2,$b,$t1 vextracti32x4 \$2,$c,$t2 vextracti32x4 \$2,$d,$t3 sub \$64,$len jb .Ltail_avx512 vpxor 0x00($inp),$t0,$t0 # xor with input vpxor 0x10($inp),$t1,$t1 vpxor 0x20($inp),$t2,$t2 vpxor 0x30($inp),$t3,$t3 lea 0x40($inp),$inp # inp+=64 vmovdqu $t0,0x00($out) # write output vmovdqu $t1,0x10($out) vmovdqu $t2,0x20($out) vmovdqu $t3,0x30($out) lea 0x40($out),$out # out+=64 jz .Ldone_avx512 vextracti32x4 \$3,$a,$t0 vextracti32x4 \$3,$b,$t1 vextracti32x4 \$3,$c,$t2 vextracti32x4 \$3,$d,$t3 sub \$64,$len jb .Ltail_avx512 vpxor 0x00($inp),$t0,$t0 # xor with input vpxor 0x10($inp),$t1,$t1 vpxor 0x20($inp),$t2,$t2 vpxor 0x30($inp),$t3,$t3 lea 0x40($inp),$inp # inp+=64 vmovdqu $t0,0x00($out) # write output vmovdqu $t1,0x10($out) vmovdqu $t2,0x20($out) vmovdqu $t3,0x30($out) lea 0x40($out),$out # out+=64 jnz .Loop_outer_avx512 jmp .Ldone_avx512 .align 16 .Ltail64_avx512: vmovdqa %x#$a,0x00(%rsp) vmovdqa %x#$b,0x10(%rsp) vmovdqa %x#$c,0x20(%rsp) vmovdqa %x#$d,0x30(%rsp) add \$64,$len jmp .Loop_tail_avx512 .align 16 .Ltail_avx512: vmovdqa $t0,0x00(%rsp) vmovdqa $t1,0x10(%rsp) vmovdqa $t2,0x20(%rsp) vmovdqa $t3,0x30(%rsp) add \$64,$len .Loop_tail_avx512: movzb ($inp,$counter),%eax movzb (%rsp,$counter),%ecx lea 1($counter),$counter xor %ecx,%eax mov %al,-1($out,$counter) dec $len jnz .Loop_tail_avx512 vmovdqu32 $a_,0x00(%rsp) .Ldone_avx512: vzeroall ___ $code.=<<___ if ($win64); movaps -0x28(%r9),%xmm6 movaps -0x18(%r9),%xmm7 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .Lavx512_epilogue: ret .cfi_endproc .size ChaCha20_avx512,.-ChaCha20_avx512 ___ map(s/%z/%y/, 
$a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz); $code.=<<___; .type ChaCha20_avx512vl,\@function,5 .align 32 ChaCha20_avx512vl: .cfi_startproc .LChaCha20_avx512vl: mov %rsp,%r9 # frame pointer .cfi_def_cfa_register %r9 cmp \$128,$len ja .LChaCha20_8xvl sub \$64+$xframe,%rsp ___ $code.=<<___ if ($win64); movaps %xmm6,-0x28(%r9) movaps %xmm7,-0x18(%r9) .Lavx512vl_body: ___ $code.=<<___; vbroadcasti128 .Lsigma(%rip),$a vbroadcasti128 ($key),$b vbroadcasti128 16($key),$c vbroadcasti128 ($counter),$d vmovdqa32 $a,$a_ vmovdqa32 $b,$b_ vmovdqa32 $c,$c_ vpaddd .Lzeroz(%rip),$d,$d vmovdqa32 .Ltwoy(%rip),$fourz mov \$10,$counter # reuse $counter vmovdqa32 $d,$d_ jmp .Loop_avx512vl .align 16 .Loop_outer_avx512vl: vmovdqa32 $c_,$c vpaddd $fourz,$d_,$d mov \$10,$counter vmovdqa32 $d,$d_ jmp .Loop_avx512vl .align 32 .Loop_avx512vl: ___ &AVX512ROUND(); &vpshufd ($c,$c,0b01001110); &vpshufd ($b,$b,0b00111001); &vpshufd ($d,$d,0b10010011); &AVX512ROUND(); &vpshufd ($c,$c,0b01001110); &vpshufd ($b,$b,0b10010011); &vpshufd ($d,$d,0b00111001); &dec ($counter); &jnz (".Loop_avx512vl"); $code.=<<___; vpaddd $a_,$a,$a vpaddd $b_,$b,$b vpaddd $c_,$c,$c vpaddd $d_,$d,$d sub \$64,$len jb .Ltail64_avx512vl vpxor 0x00($inp),%x#$a,$t0 # xor with input vpxor 0x10($inp),%x#$b,$t1 vpxor 0x20($inp),%x#$c,$t2 vpxor 0x30($inp),%x#$d,$t3 lea 0x40($inp),$inp # inp+=64 vmovdqu $t0,0x00($out) # write output vmovdqu $t1,0x10($out) vmovdqu $t2,0x20($out) vmovdqu $t3,0x30($out) lea 0x40($out),$out # out+=64 jz .Ldone_avx512vl vextracti128 \$1,$a,$t0 vextracti128 \$1,$b,$t1 vextracti128 \$1,$c,$t2 vextracti128 \$1,$d,$t3 sub \$64,$len jb .Ltail_avx512vl vpxor 0x00($inp),$t0,$t0 # xor with input vpxor 0x10($inp),$t1,$t1 vpxor 0x20($inp),$t2,$t2 vpxor 0x30($inp),$t3,$t3 lea 0x40($inp),$inp # inp+=64 vmovdqu $t0,0x00($out) # write output vmovdqu $t1,0x10($out) vmovdqu $t2,0x20($out) vmovdqu $t3,0x30($out) lea 0x40($out),$out # out+=64 vmovdqa32 $a_,$a vmovdqa32 $b_,$b jnz .Loop_outer_avx512vl jmp .Ldone_avx512vl .align 16 .Ltail64_avx512vl: vmovdqa %x#$a,0x00(%rsp) vmovdqa %x#$b,0x10(%rsp) vmovdqa %x#$c,0x20(%rsp) vmovdqa %x#$d,0x30(%rsp) add \$64,$len jmp .Loop_tail_avx512vl .align 16 .Ltail_avx512vl: vmovdqa $t0,0x00(%rsp) vmovdqa $t1,0x10(%rsp) vmovdqa $t2,0x20(%rsp) vmovdqa $t3,0x30(%rsp) add \$64,$len .Loop_tail_avx512vl: movzb ($inp,$counter),%eax movzb (%rsp,$counter),%ecx lea 1($counter),$counter xor %ecx,%eax mov %al,-1($out,$counter) dec $len jnz .Loop_tail_avx512vl vmovdqu32 $a_,0x00(%rsp) vmovdqu32 $a_,0x20(%rsp) .Ldone_avx512vl: vzeroall ___ $code.=<<___ if ($win64); movaps -0x28(%r9),%xmm6 movaps -0x18(%r9),%xmm7 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .Lavx512vl_epilogue: ret .cfi_endproc .size ChaCha20_avx512vl,.-ChaCha20_avx512vl ___ } if ($avx>2) { # This one handles longer inputs... 
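# A rough scalar model of the data layout used below (an editorial sketch
# only, not part of the original module, and never called): each of the 16
# ChaCha state words is "smashed by lanes" into one %zmm register holding 16
# copies of that word; only word 12, the block counter, differs per lane
# (.Lincz adds 0..15 and .Lsixteen then advances all lanes by 16 per outer
# iteration), so lane $i produces the keystream block for counter+$i.
my $smash_by_lanes_ref = sub {
	my @state = @_;				# 16 x 32-bit words: constants/key/counter/nonce
	my @lanes;
	for my $i (0..15) {			# one lane per 64-byte block
		my @copy = @state;
		$copy[12] = ($copy[12] + $i) & 0xffffffff;	# per-lane counter
		push @lanes, [ @copy ];
	}
	@lanes;					# lane $i == state for block counter+$i
};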
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15)); my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); my @key=map("%zmm$_",(16..31)); my ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; sub AVX512_lane_ROUND { my ($a0,$b0,$c0,$d0)=@_; my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); my @x=map("\"$_\"",@xx); ( "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", # Q1 "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", # Q2 "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", # Q3 "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", # Q4 "&vpxord (@x[$d0],@x[$d0],@x[$a0])", "&vpxord (@x[$d1],@x[$d1],@x[$a1])", "&vpxord (@x[$d2],@x[$d2],@x[$a2])", "&vpxord (@x[$d3],@x[$d3],@x[$a3])", "&vprold (@x[$d0],@x[$d0],16)", "&vprold (@x[$d1],@x[$d1],16)", "&vprold (@x[$d2],@x[$d2],16)", "&vprold (@x[$d3],@x[$d3],16)", "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", "&vpxord (@x[$b0],@x[$b0],@x[$c0])", "&vpxord (@x[$b1],@x[$b1],@x[$c1])", "&vpxord (@x[$b2],@x[$b2],@x[$c2])", "&vpxord (@x[$b3],@x[$b3],@x[$c3])", "&vprold (@x[$b0],@x[$b0],12)", "&vprold (@x[$b1],@x[$b1],12)", "&vprold (@x[$b2],@x[$b2],12)", "&vprold (@x[$b3],@x[$b3],12)", "&vpaddd (@x[$a0],@x[$a0],@x[$b0])", "&vpaddd (@x[$a1],@x[$a1],@x[$b1])", "&vpaddd (@x[$a2],@x[$a2],@x[$b2])", "&vpaddd (@x[$a3],@x[$a3],@x[$b3])", "&vpxord (@x[$d0],@x[$d0],@x[$a0])", "&vpxord (@x[$d1],@x[$d1],@x[$a1])", "&vpxord (@x[$d2],@x[$d2],@x[$a2])", "&vpxord (@x[$d3],@x[$d3],@x[$a3])", "&vprold (@x[$d0],@x[$d0],8)", "&vprold (@x[$d1],@x[$d1],8)", "&vprold (@x[$d2],@x[$d2],8)", "&vprold (@x[$d3],@x[$d3],8)", "&vpaddd (@x[$c0],@x[$c0],@x[$d0])", "&vpaddd (@x[$c1],@x[$c1],@x[$d1])", "&vpaddd (@x[$c2],@x[$c2],@x[$d2])", "&vpaddd (@x[$c3],@x[$c3],@x[$d3])", "&vpxord (@x[$b0],@x[$b0],@x[$c0])", "&vpxord (@x[$b1],@x[$b1],@x[$c1])", "&vpxord (@x[$b2],@x[$b2],@x[$c2])", "&vpxord (@x[$b3],@x[$b3],@x[$c3])", "&vprold (@x[$b0],@x[$b0],7)", "&vprold (@x[$b1],@x[$b1],7)", "&vprold (@x[$b2],@x[$b2],7)", "&vprold (@x[$b3],@x[$b3],7)" ); } my $xframe = $win64 ? 0xa8 : 8; $code.=<<___; .type ChaCha20_16x,\@function,5 .align 32 ChaCha20_16x: .cfi_startproc .LChaCha20_16x: mov %rsp,%r9 # frame register .cfi_def_cfa_register %r9 sub \$64+$xframe,%rsp and \$-64,%rsp ___ $code.=<<___ if ($win64); movaps %xmm6,-0xa8(%r9) movaps %xmm7,-0x98(%r9) movaps %xmm8,-0x88(%r9) movaps %xmm9,-0x78(%r9) movaps %xmm10,-0x68(%r9) movaps %xmm11,-0x58(%r9) movaps %xmm12,-0x48(%r9) movaps %xmm13,-0x38(%r9) movaps %xmm14,-0x28(%r9) movaps %xmm15,-0x18(%r9) .L16x_body: ___ $code.=<<___; vzeroupper lea .Lsigma(%rip),%r10 vbroadcasti32x4 (%r10),$xa3 # key[0] vbroadcasti32x4 ($key),$xb3 # key[1] vbroadcasti32x4 16($key),$xc3 # key[2] vbroadcasti32x4 ($counter),$xd3 # key[3] vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 
vpshufd \$0x55,$xa3,$xa1 vpshufd \$0xaa,$xa3,$xa2 vpshufd \$0xff,$xa3,$xa3 vmovdqa64 $xa0,@key[0] vmovdqa64 $xa1,@key[1] vmovdqa64 $xa2,@key[2] vmovdqa64 $xa3,@key[3] vpshufd \$0x00,$xb3,$xb0 vpshufd \$0x55,$xb3,$xb1 vpshufd \$0xaa,$xb3,$xb2 vpshufd \$0xff,$xb3,$xb3 vmovdqa64 $xb0,@key[4] vmovdqa64 $xb1,@key[5] vmovdqa64 $xb2,@key[6] vmovdqa64 $xb3,@key[7] vpshufd \$0x00,$xc3,$xc0 vpshufd \$0x55,$xc3,$xc1 vpshufd \$0xaa,$xc3,$xc2 vpshufd \$0xff,$xc3,$xc3 vmovdqa64 $xc0,@key[8] vmovdqa64 $xc1,@key[9] vmovdqa64 $xc2,@key[10] vmovdqa64 $xc3,@key[11] vpshufd \$0x00,$xd3,$xd0 vpshufd \$0x55,$xd3,$xd1 vpshufd \$0xaa,$xd3,$xd2 vpshufd \$0xff,$xd3,$xd3 vpaddd .Lincz(%rip),$xd0,$xd0 # don't save counters yet vmovdqa64 $xd0,@key[12] vmovdqa64 $xd1,@key[13] vmovdqa64 $xd2,@key[14] vmovdqa64 $xd3,@key[15] mov \$10,%eax jmp .Loop16x .align 32 .Loop_outer16x: vpbroadcastd 0(%r10),$xa0 # reload key vpbroadcastd 4(%r10),$xa1 vpbroadcastd 8(%r10),$xa2 vpbroadcastd 12(%r10),$xa3 vpaddd .Lsixteen(%rip),@key[12],@key[12] # next SIMD counters vmovdqa64 @key[4],$xb0 vmovdqa64 @key[5],$xb1 vmovdqa64 @key[6],$xb2 vmovdqa64 @key[7],$xb3 vmovdqa64 @key[8],$xc0 vmovdqa64 @key[9],$xc1 vmovdqa64 @key[10],$xc2 vmovdqa64 @key[11],$xc3 vmovdqa64 @key[12],$xd0 vmovdqa64 @key[13],$xd1 vmovdqa64 @key[14],$xd2 vmovdqa64 @key[15],$xd3 vmovdqa64 $xa0,@key[0] vmovdqa64 $xa1,@key[1] vmovdqa64 $xa2,@key[2] vmovdqa64 $xa3,@key[3] mov \$10,%eax jmp .Loop16x .align 32 .Loop16x: ___ foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } $code.=<<___; dec %eax jnz .Loop16x vpaddd @key[0],$xa0,$xa0 # accumulate key vpaddd @key[1],$xa1,$xa1 vpaddd @key[2],$xa2,$xa2 vpaddd @key[3],$xa3,$xa3 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data vpunpckldq $xa3,$xa2,$xt3 vpunpckhdq $xa1,$xa0,$xa0 vpunpckhdq $xa3,$xa2,$xa2 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" vpunpckhqdq $xt3,$xt2,$xt2 # "a1" vpunpcklqdq $xa2,$xa0,$xa3 # "a2" vpunpckhqdq $xa2,$xa0,$xa0 # "a3" ___ ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); $code.=<<___; vpaddd @key[4],$xb0,$xb0 vpaddd @key[5],$xb1,$xb1 vpaddd @key[6],$xb2,$xb2 vpaddd @key[7],$xb3,$xb3 vpunpckldq $xb1,$xb0,$xt2 vpunpckldq $xb3,$xb2,$xt3 vpunpckhdq $xb1,$xb0,$xb0 vpunpckhdq $xb3,$xb2,$xb2 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" vpunpckhqdq $xt3,$xt2,$xt2 # "b1" vpunpcklqdq $xb2,$xb0,$xb3 # "b2" vpunpckhqdq $xb2,$xb0,$xb0 # "b3" ___ ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); $code.=<<___; vshufi32x4 \$0x44,$xb0,$xa0,$xt3 # "de-interlace" further vshufi32x4 \$0xee,$xb0,$xa0,$xb0 vshufi32x4 \$0x44,$xb1,$xa1,$xa0 vshufi32x4 \$0xee,$xb1,$xa1,$xb1 vshufi32x4 \$0x44,$xb2,$xa2,$xa1 vshufi32x4 \$0xee,$xb2,$xa2,$xb2 vshufi32x4 \$0x44,$xb3,$xa3,$xa2 vshufi32x4 \$0xee,$xb3,$xa3,$xb3 ___ ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); $code.=<<___; vpaddd @key[8],$xc0,$xc0 vpaddd @key[9],$xc1,$xc1 vpaddd @key[10],$xc2,$xc2 vpaddd @key[11],$xc3,$xc3 vpunpckldq $xc1,$xc0,$xt2 vpunpckldq $xc3,$xc2,$xt3 vpunpckhdq $xc1,$xc0,$xc0 vpunpckhdq $xc3,$xc2,$xc2 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" vpunpckhqdq $xt3,$xt2,$xt2 # "c1" vpunpcklqdq $xc2,$xc0,$xc3 # "c2" vpunpckhqdq $xc2,$xc0,$xc0 # "c3" ___ ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); $code.=<<___; vpaddd @key[12],$xd0,$xd0 vpaddd @key[13],$xd1,$xd1 vpaddd @key[14],$xd2,$xd2 vpaddd @key[15],$xd3,$xd3 vpunpckldq $xd1,$xd0,$xt2 vpunpckldq $xd3,$xd2,$xt3 vpunpckhdq $xd1,$xd0,$xd0 vpunpckhdq $xd3,$xd2,$xd2 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" vpunpckhqdq $xt3,$xt2,$xt2 # "d1" vpunpcklqdq $xd2,$xd0,$xd3 # "d2" 
vpunpckhqdq $xd2,$xd0,$xd0 # "d3" ___ ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); $code.=<<___; vshufi32x4 \$0x44,$xd0,$xc0,$xt3 # "de-interlace" further vshufi32x4 \$0xee,$xd0,$xc0,$xd0 vshufi32x4 \$0x44,$xd1,$xc1,$xc0 vshufi32x4 \$0xee,$xd1,$xc1,$xd1 vshufi32x4 \$0x44,$xd2,$xc2,$xc1 vshufi32x4 \$0xee,$xd2,$xc2,$xd2 vshufi32x4 \$0x44,$xd3,$xc3,$xc2 vshufi32x4 \$0xee,$xd3,$xc3,$xd3 ___ ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); $code.=<<___; vshufi32x4 \$0x88,$xc0,$xa0,$xt0 # "de-interlace" further vshufi32x4 \$0xdd,$xc0,$xa0,$xa0 vshufi32x4 \$0x88,$xd0,$xb0,$xc0 vshufi32x4 \$0xdd,$xd0,$xb0,$xd0 vshufi32x4 \$0x88,$xc1,$xa1,$xt1 vshufi32x4 \$0xdd,$xc1,$xa1,$xa1 vshufi32x4 \$0x88,$xd1,$xb1,$xc1 vshufi32x4 \$0xdd,$xd1,$xb1,$xd1 vshufi32x4 \$0x88,$xc2,$xa2,$xt2 vshufi32x4 \$0xdd,$xc2,$xa2,$xa2 vshufi32x4 \$0x88,$xd2,$xb2,$xc2 vshufi32x4 \$0xdd,$xd2,$xb2,$xd2 vshufi32x4 \$0x88,$xc3,$xa3,$xt3 vshufi32x4 \$0xdd,$xc3,$xa3,$xa3 vshufi32x4 \$0x88,$xd3,$xb3,$xc3 vshufi32x4 \$0xdd,$xd3,$xb3,$xd3 ___ ($xa0,$xa1,$xa2,$xa3,$xb0,$xb1,$xb2,$xb3)= ($xt0,$xt1,$xt2,$xt3,$xa0,$xa1,$xa2,$xa3); ($xa0,$xb0,$xc0,$xd0, $xa1,$xb1,$xc1,$xd1, $xa2,$xb2,$xc2,$xd2, $xa3,$xb3,$xc3,$xd3) = ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); $code.=<<___; cmp \$64*16,$len jb .Ltail16x vpxord 0x00($inp),$xa0,$xa0 # xor with input vpxord 0x40($inp),$xb0,$xb0 vpxord 0x80($inp),$xc0,$xc0 vpxord 0xc0($inp),$xd0,$xd0 vmovdqu32 $xa0,0x00($out) vmovdqu32 $xb0,0x40($out) vmovdqu32 $xc0,0x80($out) vmovdqu32 $xd0,0xc0($out) vpxord 0x100($inp),$xa1,$xa1 vpxord 0x140($inp),$xb1,$xb1 vpxord 0x180($inp),$xc1,$xc1 vpxord 0x1c0($inp),$xd1,$xd1 vmovdqu32 $xa1,0x100($out) vmovdqu32 $xb1,0x140($out) vmovdqu32 $xc1,0x180($out) vmovdqu32 $xd1,0x1c0($out) vpxord 0x200($inp),$xa2,$xa2 vpxord 0x240($inp),$xb2,$xb2 vpxord 0x280($inp),$xc2,$xc2 vpxord 0x2c0($inp),$xd2,$xd2 vmovdqu32 $xa2,0x200($out) vmovdqu32 $xb2,0x240($out) vmovdqu32 $xc2,0x280($out) vmovdqu32 $xd2,0x2c0($out) vpxord 0x300($inp),$xa3,$xa3 vpxord 0x340($inp),$xb3,$xb3 vpxord 0x380($inp),$xc3,$xc3 vpxord 0x3c0($inp),$xd3,$xd3 lea 0x400($inp),$inp vmovdqu32 $xa3,0x300($out) vmovdqu32 $xb3,0x340($out) vmovdqu32 $xc3,0x380($out) vmovdqu32 $xd3,0x3c0($out) lea 0x400($out),$out sub \$64*16,$len jnz .Loop_outer16x jmp .Ldone16x .align 32 .Ltail16x: xor %r10,%r10 sub $inp,$out cmp \$64*1,$len jb .Less_than_64_16x vpxord ($inp),$xa0,$xa0 # xor with input vmovdqu32 $xa0,($out,$inp) je .Ldone16x vmovdqa32 $xb0,$xa0 lea 64($inp),$inp cmp \$64*2,$len jb .Less_than_64_16x vpxord ($inp),$xb0,$xb0 vmovdqu32 $xb0,($out,$inp) je .Ldone16x vmovdqa32 $xc0,$xa0 lea 64($inp),$inp cmp \$64*3,$len jb .Less_than_64_16x vpxord ($inp),$xc0,$xc0 vmovdqu32 $xc0,($out,$inp) je .Ldone16x vmovdqa32 $xd0,$xa0 lea 64($inp),$inp cmp \$64*4,$len jb .Less_than_64_16x vpxord ($inp),$xd0,$xd0 vmovdqu32 $xd0,($out,$inp) je .Ldone16x vmovdqa32 $xa1,$xa0 lea 64($inp),$inp cmp \$64*5,$len jb .Less_than_64_16x vpxord ($inp),$xa1,$xa1 vmovdqu32 $xa1,($out,$inp) je .Ldone16x vmovdqa32 $xb1,$xa0 lea 64($inp),$inp cmp \$64*6,$len jb .Less_than_64_16x vpxord ($inp),$xb1,$xb1 vmovdqu32 $xb1,($out,$inp) je .Ldone16x vmovdqa32 $xc1,$xa0 lea 64($inp),$inp cmp \$64*7,$len jb .Less_than_64_16x vpxord ($inp),$xc1,$xc1 vmovdqu32 $xc1,($out,$inp) je .Ldone16x vmovdqa32 $xd1,$xa0 lea 64($inp),$inp cmp \$64*8,$len jb .Less_than_64_16x vpxord ($inp),$xd1,$xd1 vmovdqu32 $xd1,($out,$inp) je .Ldone16x vmovdqa32 $xa2,$xa0 lea 64($inp),$inp cmp \$64*9,$len jb .Less_than_64_16x vpxord 
($inp),$xa2,$xa2 vmovdqu32 $xa2,($out,$inp) je .Ldone16x vmovdqa32 $xb2,$xa0 lea 64($inp),$inp cmp \$64*10,$len jb .Less_than_64_16x vpxord ($inp),$xb2,$xb2 vmovdqu32 $xb2,($out,$inp) je .Ldone16x vmovdqa32 $xc2,$xa0 lea 64($inp),$inp cmp \$64*11,$len jb .Less_than_64_16x vpxord ($inp),$xc2,$xc2 vmovdqu32 $xc2,($out,$inp) je .Ldone16x vmovdqa32 $xd2,$xa0 lea 64($inp),$inp cmp \$64*12,$len jb .Less_than_64_16x vpxord ($inp),$xd2,$xd2 vmovdqu32 $xd2,($out,$inp) je .Ldone16x vmovdqa32 $xa3,$xa0 lea 64($inp),$inp cmp \$64*13,$len jb .Less_than_64_16x vpxord ($inp),$xa3,$xa3 vmovdqu32 $xa3,($out,$inp) je .Ldone16x vmovdqa32 $xb3,$xa0 lea 64($inp),$inp cmp \$64*14,$len jb .Less_than_64_16x vpxord ($inp),$xb3,$xb3 vmovdqu32 $xb3,($out,$inp) je .Ldone16x vmovdqa32 $xc3,$xa0 lea 64($inp),$inp cmp \$64*15,$len jb .Less_than_64_16x vpxord ($inp),$xc3,$xc3 vmovdqu32 $xc3,($out,$inp) je .Ldone16x vmovdqa32 $xd3,$xa0 lea 64($inp),$inp .Less_than_64_16x: vmovdqa32 $xa0,0x00(%rsp) lea ($out,$inp),$out and \$63,$len .Loop_tail16x: movzb ($inp,%r10),%eax movzb (%rsp,%r10),%ecx lea 1(%r10),%r10 xor %ecx,%eax mov %al,-1($out,%r10) dec $len jnz .Loop_tail16x vpxord $xa0,$xa0,$xa0 vmovdqa32 $xa0,0(%rsp) .Ldone16x: vzeroall ___ $code.=<<___ if ($win64); movaps -0xa8(%r9),%xmm6 movaps -0x98(%r9),%xmm7 movaps -0x88(%r9),%xmm8 movaps -0x78(%r9),%xmm9 movaps -0x68(%r9),%xmm10 movaps -0x58(%r9),%xmm11 movaps -0x48(%r9),%xmm12 movaps -0x38(%r9),%xmm13 movaps -0x28(%r9),%xmm14 movaps -0x18(%r9),%xmm15 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .L16x_epilogue: ret .cfi_endproc .size ChaCha20_16x,.-ChaCha20_16x ___ # switch to %ymm domain ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%ymm$_",(0..15)); @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3); @key=map("%ymm$_",(16..31)); ($xt0,$xt1,$xt2,$xt3)=@key[0..3]; $code.=<<___; .type ChaCha20_8xvl,\@function,5 .align 32 ChaCha20_8xvl: .cfi_startproc .LChaCha20_8xvl: mov %rsp,%r9 # frame register .cfi_def_cfa_register %r9 sub \$64+$xframe,%rsp and \$-64,%rsp ___ $code.=<<___ if ($win64); movaps %xmm6,-0xa8(%r9) movaps %xmm7,-0x98(%r9) movaps %xmm8,-0x88(%r9) movaps %xmm9,-0x78(%r9) movaps %xmm10,-0x68(%r9) movaps %xmm11,-0x58(%r9) movaps %xmm12,-0x48(%r9) movaps %xmm13,-0x38(%r9) movaps %xmm14,-0x28(%r9) movaps %xmm15,-0x18(%r9) .L8xvl_body: ___ $code.=<<___; vzeroupper lea .Lsigma(%rip),%r10 vbroadcasti128 (%r10),$xa3 # key[0] vbroadcasti128 ($key),$xb3 # key[1] vbroadcasti128 16($key),$xc3 # key[2] vbroadcasti128 ($counter),$xd3 # key[3] vpshufd \$0x00,$xa3,$xa0 # smash key by lanes... 
vpshufd \$0x55,$xa3,$xa1 vpshufd \$0xaa,$xa3,$xa2 vpshufd \$0xff,$xa3,$xa3 vmovdqa64 $xa0,@key[0] vmovdqa64 $xa1,@key[1] vmovdqa64 $xa2,@key[2] vmovdqa64 $xa3,@key[3] vpshufd \$0x00,$xb3,$xb0 vpshufd \$0x55,$xb3,$xb1 vpshufd \$0xaa,$xb3,$xb2 vpshufd \$0xff,$xb3,$xb3 vmovdqa64 $xb0,@key[4] vmovdqa64 $xb1,@key[5] vmovdqa64 $xb2,@key[6] vmovdqa64 $xb3,@key[7] vpshufd \$0x00,$xc3,$xc0 vpshufd \$0x55,$xc3,$xc1 vpshufd \$0xaa,$xc3,$xc2 vpshufd \$0xff,$xc3,$xc3 vmovdqa64 $xc0,@key[8] vmovdqa64 $xc1,@key[9] vmovdqa64 $xc2,@key[10] vmovdqa64 $xc3,@key[11] vpshufd \$0x00,$xd3,$xd0 vpshufd \$0x55,$xd3,$xd1 vpshufd \$0xaa,$xd3,$xd2 vpshufd \$0xff,$xd3,$xd3 vpaddd .Lincy(%rip),$xd0,$xd0 # don't save counters yet vmovdqa64 $xd0,@key[12] vmovdqa64 $xd1,@key[13] vmovdqa64 $xd2,@key[14] vmovdqa64 $xd3,@key[15] mov \$10,%eax jmp .Loop8xvl .align 32 .Loop_outer8xvl: #vpbroadcastd 0(%r10),$xa0 # reload key #vpbroadcastd 4(%r10),$xa1 vpbroadcastd 8(%r10),$xa2 vpbroadcastd 12(%r10),$xa3 vpaddd .Leight(%rip),@key[12],@key[12] # next SIMD counters vmovdqa64 @key[4],$xb0 vmovdqa64 @key[5],$xb1 vmovdqa64 @key[6],$xb2 vmovdqa64 @key[7],$xb3 vmovdqa64 @key[8],$xc0 vmovdqa64 @key[9],$xc1 vmovdqa64 @key[10],$xc2 vmovdqa64 @key[11],$xc3 vmovdqa64 @key[12],$xd0 vmovdqa64 @key[13],$xd1 vmovdqa64 @key[14],$xd2 vmovdqa64 @key[15],$xd3 vmovdqa64 $xa0,@key[0] vmovdqa64 $xa1,@key[1] vmovdqa64 $xa2,@key[2] vmovdqa64 $xa3,@key[3] mov \$10,%eax jmp .Loop8xvl .align 32 .Loop8xvl: ___ foreach (&AVX512_lane_ROUND(0, 4, 8,12)) { eval; } foreach (&AVX512_lane_ROUND(0, 5,10,15)) { eval; } $code.=<<___; dec %eax jnz .Loop8xvl vpaddd @key[0],$xa0,$xa0 # accumulate key vpaddd @key[1],$xa1,$xa1 vpaddd @key[2],$xa2,$xa2 vpaddd @key[3],$xa3,$xa3 vpunpckldq $xa1,$xa0,$xt2 # "de-interlace" data vpunpckldq $xa3,$xa2,$xt3 vpunpckhdq $xa1,$xa0,$xa0 vpunpckhdq $xa3,$xa2,$xa2 vpunpcklqdq $xt3,$xt2,$xa1 # "a0" vpunpckhqdq $xt3,$xt2,$xt2 # "a1" vpunpcklqdq $xa2,$xa0,$xa3 # "a2" vpunpckhqdq $xa2,$xa0,$xa0 # "a3" ___ ($xa0,$xa1,$xa2,$xa3,$xt2)=($xa1,$xt2,$xa3,$xa0,$xa2); $code.=<<___; vpaddd @key[4],$xb0,$xb0 vpaddd @key[5],$xb1,$xb1 vpaddd @key[6],$xb2,$xb2 vpaddd @key[7],$xb3,$xb3 vpunpckldq $xb1,$xb0,$xt2 vpunpckldq $xb3,$xb2,$xt3 vpunpckhdq $xb1,$xb0,$xb0 vpunpckhdq $xb3,$xb2,$xb2 vpunpcklqdq $xt3,$xt2,$xb1 # "b0" vpunpckhqdq $xt3,$xt2,$xt2 # "b1" vpunpcklqdq $xb2,$xb0,$xb3 # "b2" vpunpckhqdq $xb2,$xb0,$xb0 # "b3" ___ ($xb0,$xb1,$xb2,$xb3,$xt2)=($xb1,$xt2,$xb3,$xb0,$xb2); $code.=<<___; vshufi32x4 \$0,$xb0,$xa0,$xt3 # "de-interlace" further vshufi32x4 \$3,$xb0,$xa0,$xb0 vshufi32x4 \$0,$xb1,$xa1,$xa0 vshufi32x4 \$3,$xb1,$xa1,$xb1 vshufi32x4 \$0,$xb2,$xa2,$xa1 vshufi32x4 \$3,$xb2,$xa2,$xb2 vshufi32x4 \$0,$xb3,$xa3,$xa2 vshufi32x4 \$3,$xb3,$xa3,$xb3 ___ ($xa0,$xa1,$xa2,$xa3,$xt3)=($xt3,$xa0,$xa1,$xa2,$xa3); $code.=<<___; vpaddd @key[8],$xc0,$xc0 vpaddd @key[9],$xc1,$xc1 vpaddd @key[10],$xc2,$xc2 vpaddd @key[11],$xc3,$xc3 vpunpckldq $xc1,$xc0,$xt2 vpunpckldq $xc3,$xc2,$xt3 vpunpckhdq $xc1,$xc0,$xc0 vpunpckhdq $xc3,$xc2,$xc2 vpunpcklqdq $xt3,$xt2,$xc1 # "c0" vpunpckhqdq $xt3,$xt2,$xt2 # "c1" vpunpcklqdq $xc2,$xc0,$xc3 # "c2" vpunpckhqdq $xc2,$xc0,$xc0 # "c3" ___ ($xc0,$xc1,$xc2,$xc3,$xt2)=($xc1,$xt2,$xc3,$xc0,$xc2); $code.=<<___; vpaddd @key[12],$xd0,$xd0 vpaddd @key[13],$xd1,$xd1 vpaddd @key[14],$xd2,$xd2 vpaddd @key[15],$xd3,$xd3 vpunpckldq $xd1,$xd0,$xt2 vpunpckldq $xd3,$xd2,$xt3 vpunpckhdq $xd1,$xd0,$xd0 vpunpckhdq $xd3,$xd2,$xd2 vpunpcklqdq $xt3,$xt2,$xd1 # "d0" vpunpckhqdq $xt3,$xt2,$xt2 # "d1" vpunpcklqdq $xd2,$xd0,$xd3 # "d2" vpunpckhqdq 
$xd2,$xd0,$xd0 # "d3" ___ ($xd0,$xd1,$xd2,$xd3,$xt2)=($xd1,$xt2,$xd3,$xd0,$xd2); $code.=<<___; vperm2i128 \$0x20,$xd0,$xc0,$xt3 # "de-interlace" further vperm2i128 \$0x31,$xd0,$xc0,$xd0 vperm2i128 \$0x20,$xd1,$xc1,$xc0 vperm2i128 \$0x31,$xd1,$xc1,$xd1 vperm2i128 \$0x20,$xd2,$xc2,$xc1 vperm2i128 \$0x31,$xd2,$xc2,$xd2 vperm2i128 \$0x20,$xd3,$xc3,$xc2 vperm2i128 \$0x31,$xd3,$xc3,$xd3 ___ ($xc0,$xc1,$xc2,$xc3,$xt3)=($xt3,$xc0,$xc1,$xc2,$xc3); ($xb0,$xb1,$xb2,$xb3,$xc0,$xc1,$xc2,$xc3)= ($xc0,$xc1,$xc2,$xc3,$xb0,$xb1,$xb2,$xb3); $code.=<<___; cmp \$64*8,$len jb .Ltail8xvl mov \$0x80,%eax # size optimization vpxord 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vpxor 0x40($inp),$xc0,$xc0 vpxor 0x60($inp),$xd0,$xd0 lea ($inp,%rax),$inp # size optimization vmovdqu32 $xa0,0x00($out) vmovdqu $xb0,0x20($out) vmovdqu $xc0,0x40($out) vmovdqu $xd0,0x60($out) lea ($out,%rax),$out # size optimization vpxor 0x00($inp),$xa1,$xa1 vpxor 0x20($inp),$xb1,$xb1 vpxor 0x40($inp),$xc1,$xc1 vpxor 0x60($inp),$xd1,$xd1 lea ($inp,%rax),$inp # size optimization vmovdqu $xa1,0x00($out) vmovdqu $xb1,0x20($out) vmovdqu $xc1,0x40($out) vmovdqu $xd1,0x60($out) lea ($out,%rax),$out # size optimization vpxord 0x00($inp),$xa2,$xa2 vpxor 0x20($inp),$xb2,$xb2 vpxor 0x40($inp),$xc2,$xc2 vpxor 0x60($inp),$xd2,$xd2 lea ($inp,%rax),$inp # size optimization vmovdqu32 $xa2,0x00($out) vmovdqu $xb2,0x20($out) vmovdqu $xc2,0x40($out) vmovdqu $xd2,0x60($out) lea ($out,%rax),$out # size optimization vpxor 0x00($inp),$xa3,$xa3 vpxor 0x20($inp),$xb3,$xb3 vpxor 0x40($inp),$xc3,$xc3 vpxor 0x60($inp),$xd3,$xd3 lea ($inp,%rax),$inp # size optimization vmovdqu $xa3,0x00($out) vmovdqu $xb3,0x20($out) vmovdqu $xc3,0x40($out) vmovdqu $xd3,0x60($out) lea ($out,%rax),$out # size optimization vpbroadcastd 0(%r10),%ymm0 # reload key vpbroadcastd 4(%r10),%ymm1 sub \$64*8,$len jnz .Loop_outer8xvl jmp .Ldone8xvl .align 32 .Ltail8xvl: vmovdqa64 $xa0,%ymm8 # size optimization ___ $xa0 = "%ymm8"; $code.=<<___; xor %r10,%r10 sub $inp,$out cmp \$64*1,$len jb .Less_than_64_8xvl vpxor 0x00($inp),$xa0,$xa0 # xor with input vpxor 0x20($inp),$xb0,$xb0 vmovdqu $xa0,0x00($out,$inp) vmovdqu $xb0,0x20($out,$inp) je .Ldone8xvl vmovdqa $xc0,$xa0 vmovdqa $xd0,$xb0 lea 64($inp),$inp cmp \$64*2,$len jb .Less_than_64_8xvl vpxor 0x00($inp),$xc0,$xc0 vpxor 0x20($inp),$xd0,$xd0 vmovdqu $xc0,0x00($out,$inp) vmovdqu $xd0,0x20($out,$inp) je .Ldone8xvl vmovdqa $xa1,$xa0 vmovdqa $xb1,$xb0 lea 64($inp),$inp cmp \$64*3,$len jb .Less_than_64_8xvl vpxor 0x00($inp),$xa1,$xa1 vpxor 0x20($inp),$xb1,$xb1 vmovdqu $xa1,0x00($out,$inp) vmovdqu $xb1,0x20($out,$inp) je .Ldone8xvl vmovdqa $xc1,$xa0 vmovdqa $xd1,$xb0 lea 64($inp),$inp cmp \$64*4,$len jb .Less_than_64_8xvl vpxor 0x00($inp),$xc1,$xc1 vpxor 0x20($inp),$xd1,$xd1 vmovdqu $xc1,0x00($out,$inp) vmovdqu $xd1,0x20($out,$inp) je .Ldone8xvl vmovdqa32 $xa2,$xa0 vmovdqa $xb2,$xb0 lea 64($inp),$inp cmp \$64*5,$len jb .Less_than_64_8xvl vpxord 0x00($inp),$xa2,$xa2 vpxor 0x20($inp),$xb2,$xb2 vmovdqu32 $xa2,0x00($out,$inp) vmovdqu $xb2,0x20($out,$inp) je .Ldone8xvl vmovdqa $xc2,$xa0 vmovdqa $xd2,$xb0 lea 64($inp),$inp cmp \$64*6,$len jb .Less_than_64_8xvl vpxor 0x00($inp),$xc2,$xc2 vpxor 0x20($inp),$xd2,$xd2 vmovdqu $xc2,0x00($out,$inp) vmovdqu $xd2,0x20($out,$inp) je .Ldone8xvl vmovdqa $xa3,$xa0 vmovdqa $xb3,$xb0 lea 64($inp),$inp cmp \$64*7,$len jb .Less_than_64_8xvl vpxor 0x00($inp),$xa3,$xa3 vpxor 0x20($inp),$xb3,$xb3 vmovdqu $xa3,0x00($out,$inp) vmovdqu $xb3,0x20($out,$inp) je .Ldone8xvl vmovdqa $xc3,$xa0 vmovdqa $xd3,$xb0 lea 
64($inp),$inp .Less_than_64_8xvl: vmovdqa $xa0,0x00(%rsp) vmovdqa $xb0,0x20(%rsp) lea ($out,$inp),$out and \$63,$len .Loop_tail8xvl: movzb ($inp,%r10),%eax movzb (%rsp,%r10),%ecx lea 1(%r10),%r10 xor %ecx,%eax mov %al,-1($out,%r10) dec $len jnz .Loop_tail8xvl vpxor $xa0,$xa0,$xa0 vmovdqa $xa0,0x00(%rsp) vmovdqa $xa0,0x20(%rsp) .Ldone8xvl: vzeroall ___ $code.=<<___ if ($win64); movaps -0xa8(%r9),%xmm6 movaps -0x98(%r9),%xmm7 movaps -0x88(%r9),%xmm8 movaps -0x78(%r9),%xmm9 movaps -0x68(%r9),%xmm10 movaps -0x58(%r9),%xmm11 movaps -0x48(%r9),%xmm12 movaps -0x38(%r9),%xmm13 movaps -0x28(%r9),%xmm14 movaps -0x18(%r9),%xmm15 ___ $code.=<<___; lea (%r9),%rsp .cfi_def_cfa_register %rsp .L8xvl_epilogue: ret .cfi_endproc .size ChaCha20_8xvl,.-ChaCha20_8xvl ___ } # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData lea .Lctr32_body(%rip),%r10 cmp %r10,%rbx # context->Rip<.Lprologue jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp lea .Lno_data(%rip),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lcommon_seh_tail lea 64+24+48(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R14 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler .type simd_handler,\@abi-omnipotent .align 16 simd_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipR9 mov 4(%r11),%r10d # HandlerData[1] mov 8(%r11),%ecx # HandlerData[2] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail neg %rcx lea -8(%rax,%rcx),%rsi lea 
512($context),%rdi # &context.Xmm6 neg %ecx shr \$3,%ecx .long 0xa548f3fc # cld; rep movsq jmp .Lcommon_seh_tail .size simd_handler,.-simd_handler .section .pdata .align 4 .rva .LSEH_begin_ChaCha20_ctr32 .rva .LSEH_end_ChaCha20_ctr32 .rva .LSEH_info_ChaCha20_ctr32 .rva .LSEH_begin_ChaCha20_ssse3 .rva .LSEH_end_ChaCha20_ssse3 .rva .LSEH_info_ChaCha20_ssse3 .rva .LSEH_begin_ChaCha20_128 .rva .LSEH_end_ChaCha20_128 .rva .LSEH_info_ChaCha20_128 .rva .LSEH_begin_ChaCha20_4x .rva .LSEH_end_ChaCha20_4x .rva .LSEH_info_ChaCha20_4x ___ $code.=<<___ if ($avx); .rva .LSEH_begin_ChaCha20_4xop .rva .LSEH_end_ChaCha20_4xop .rva .LSEH_info_ChaCha20_4xop ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_ChaCha20_8x .rva .LSEH_end_ChaCha20_8x .rva .LSEH_info_ChaCha20_8x ___ $code.=<<___ if ($avx>2); .rva .LSEH_begin_ChaCha20_avx512 .rva .LSEH_end_ChaCha20_avx512 .rva .LSEH_info_ChaCha20_avx512 .rva .LSEH_begin_ChaCha20_avx512vl .rva .LSEH_end_ChaCha20_avx512vl .rva .LSEH_info_ChaCha20_avx512vl .rva .LSEH_begin_ChaCha20_16x .rva .LSEH_end_ChaCha20_16x .rva .LSEH_info_ChaCha20_16x .rva .LSEH_begin_ChaCha20_8xvl .rva .LSEH_end_ChaCha20_8xvl .rva .LSEH_info_ChaCha20_8xvl ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_ChaCha20_ctr32: .byte 9,0,0,0 .rva se_handler .LSEH_info_ChaCha20_ssse3: .byte 9,0,0,0 .rva simd_handler .rva .Lssse3_body,.Lssse3_epilogue .long 0x20,0 .LSEH_info_ChaCha20_128: .byte 9,0,0,0 .rva simd_handler .rva .L128_body,.L128_epilogue .long 0x60,0 .LSEH_info_ChaCha20_4x: .byte 9,0,0,0 .rva simd_handler .rva .L4x_body,.L4x_epilogue .long 0xa0,0 ___ $code.=<<___ if ($avx); .LSEH_info_ChaCha20_4xop: .byte 9,0,0,0 .rva simd_handler .rva .L4xop_body,.L4xop_epilogue # HandlerData[] .long 0xa0,0 ___ $code.=<<___ if ($avx>1); .LSEH_info_ChaCha20_8x: .byte 9,0,0,0 .rva simd_handler .rva .L8x_body,.L8x_epilogue # HandlerData[] .long 0xa0,0 ___ $code.=<<___ if ($avx>2); .LSEH_info_ChaCha20_avx512: .byte 9,0,0,0 .rva simd_handler .rva .Lavx512_body,.Lavx512_epilogue # HandlerData[] .long 0x20,0 .LSEH_info_ChaCha20_avx512vl: .byte 9,0,0,0 .rva simd_handler .rva .Lavx512vl_body,.Lavx512vl_epilogue # HandlerData[] .long 0x20,0 .LSEH_info_ChaCha20_16x: .byte 9,0,0,0 .rva simd_handler .rva .L16x_body,.L16x_epilogue # HandlerData[] .long 0xa0,0 .LSEH_info_ChaCha20_8xvl: .byte 9,0,0,0 .rva simd_handler .rva .L8xvl_body,.L8xvl_epilogue # HandlerData[] .long 0xa0,0 ___ } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/ge; s/%x#%[yz]/%x/g; # "down-shift" print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/ec/asm/ecp_nistz256-avx2.pl =================================================================== --- stable/12/crypto/openssl/crypto/ec/asm/ecp_nistz256-avx2.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/ec/asm/ecp_nistz256-avx2.pl (revision 364963) @@ -1,2080 +1,2080 @@ #! /usr/bin/env perl # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. # Copyright (c) 2014, Intel Corporation. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. 
You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) # (1) Intel Corporation, Israel Development Center, Haifa, Israel # (2) University of Haifa, Israel # # Reference: # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with # 256 Bit Primes" $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" $xlate $flavour $output"; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); $addx = ($1>=2.23); } if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); $addx = ($1>=2.10); } if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $avx = ($ver>=3.0) + ($ver>=3.01); $addx = ($ver>=3.03); } if ($avx>=2) {{ $digit_size = "\$29"; $n_digits = "\$9"; $code.=<<___; .text .align 64 .LAVX2_AND_MASK: .LAVX2_POLY: .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x000001ff, 0x000001ff, 0x000001ff, 0x000001ff .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x00040000, 0x00040000, 0x00040000, 0x00040000 .quad 0x1fe00000, 0x1fe00000, 0x1fe00000, 0x1fe00000 .quad 0x00ffffff, 0x00ffffff, 0x00ffffff, 0x00ffffff .LAVX2_POLY_x2: .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC .quad 0x400007FC, 0x400007FC, 0x400007FC, 0x400007FC .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE .quad 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE, 0x3FFFFFFE .quad 0x400FFFFE, 0x400FFFFE, 0x400FFFFE, 0x400FFFFE .quad 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE, 0x7F7FFFFE .quad 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC, 0x03FFFFFC .LAVX2_POLY_x8: .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8 .quad 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8, 0xFFFFFFF8 .quad 0x80000FF8, 0x80000FF8, 0x80000FF8, 0x80000FF8 .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC .quad 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC, 0x7FFFFFFC .quad 0x801FFFFC, 0x801FFFFC, 0x801FFFFC, 0x801FFFFC .quad 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC, 0xFEFFFFFC .quad 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8, 0x07FFFFF8 .LONE: .quad 0x00000020, 0x00000020, 0x00000020, 0x00000020 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x1fffc000, 0x1fffc000, 0x1fffc000, 0x1fffc000 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 
0x1f7fffff, 0x1f7fffff, 0x1f7fffff, 0x1f7fffff .quad 0x03ffffff, 0x03ffffff, 0x03ffffff, 0x03ffffff .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 # RR = 2^266 mod p in AVX2 format, to transform from the native OpenSSL # Montgomery form (*2^256) to our format (*2^261) .LTO_MONT_AVX2: .quad 0x00000400, 0x00000400, 0x00000400, 0x00000400 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x1ff80000, 0x1ff80000, 0x1ff80000, 0x1ff80000 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x0fffffff, 0x0fffffff, 0x0fffffff, 0x0fffffff .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x00000003, 0x00000003, 0x00000003, 0x00000003 .LFROM_MONT_AVX2: .quad 0x00000001, 0x00000001, 0x00000001, 0x00000001 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .quad 0x1ffffe00, 0x1ffffe00, 0x1ffffe00, 0x1ffffe00 .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1fffffff, 0x1fffffff, 0x1fffffff, 0x1fffffff .quad 0x1ffbffff, 0x1ffbffff, 0x1ffbffff, 0x1ffbffff .quad 0x001fffff, 0x001fffff, 0x001fffff, 0x001fffff .quad 0x00000000, 0x00000000, 0x00000000, 0x00000000 .LIntOne: .long 1,1,1,1,1,1,1,1 ___ { # This function receives a pointer to an array of four affine points # (X, Y, <1>) and rearranges the data for AVX2 execution, while # converting it to 2^29 radix redundant form my ($X0,$X1,$X2,$X3, $Y0,$Y1,$Y2,$Y3, $T0,$T1,$T2,$T3, $T4,$T5,$T6,$T7)=map("%ymm$_",(0..15)); $code.=<<___; .globl ecp_nistz256_avx2_transpose_convert .type ecp_nistz256_avx2_transpose_convert,\@function,2 .align 64 ecp_nistz256_avx2_transpose_convert: vzeroupper ___ $code.=<<___ if ($win64); lea -8-16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; # Load the data vmovdqa 32*0(%rsi), $X0 lea 112(%rsi), %rax # size optimization vmovdqa 32*1(%rsi), $Y0 lea .LAVX2_AND_MASK(%rip), %rdx vmovdqa 32*2(%rsi), $X1 vmovdqa 32*3(%rsi), $Y1 vmovdqa 32*4-112(%rax), $X2 vmovdqa 32*5-112(%rax), $Y2 vmovdqa 32*6-112(%rax), $X3 vmovdqa 32*7-112(%rax), $Y3 # Transpose X and Y independently vpunpcklqdq $X1, $X0, $T0 # T0 = [B2 A2 B0 A0] vpunpcklqdq $X3, $X2, $T1 # T1 = [D2 C2 D0 C0] vpunpckhqdq $X1, $X0, $T2 # T2 = [B3 A3 B1 A1] vpunpckhqdq $X3, $X2, $T3 # T3 = [D3 C3 D1 C1] vpunpcklqdq $Y1, $Y0, $T4 vpunpcklqdq $Y3, $Y2, $T5 vpunpckhqdq $Y1, $Y0, $T6 vpunpckhqdq $Y3, $Y2, $T7 vperm2i128 \$0x20, $T1, $T0, $X0 # X0 = [D0 C0 B0 A0] vperm2i128 \$0x20, $T3, $T2, $X1 # X1 = [D1 C1 B1 A1] vperm2i128 \$0x31, $T1, $T0, $X2 # X2 = [D2 C2 B2 A2] vperm2i128 \$0x31, $T3, $T2, $X3 # X3 = [D3 C3 B3 A3] vperm2i128 \$0x20, $T5, $T4, $Y0 vperm2i128 \$0x20, $T7, $T6, $Y1 vperm2i128 \$0x31, $T5, $T4, $Y2 vperm2i128 \$0x31, $T7, $T6, $Y3 vmovdqa (%rdx), $T7 vpand (%rdx), $X0, $T0 # out[0] = in[0] & mask; vpsrlq \$29, $X0, $X0 vpand $T7, $X0, $T1 # out[1] = (in[0] >> shift) & mask; vpsrlq \$29, $X0, $X0 vpsllq \$6, $X1, $T2 vpxor $X0, $T2, $T2 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask; vpsrlq \$23, $X1, $X1 vpand $T7, $X1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask; vpsrlq \$29, $X1, $X1 vpsllq \$12, $X2, $T4 vpxor $X1, $T4, $T4 vpand $T7, $T4, $T4 # 
out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask; vpsrlq \$17, $X2, $X2 vpand $T7, $X2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask; vpsrlq \$29, $X2, $X2 vpsllq \$18, $X3, $T6 vpxor $X2, $T6, $T6 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask; vpsrlq \$11, $X3, $X3 vmovdqa $T0, 32*0(%rdi) lea 112(%rdi), %rax # size optimization vpand $T7, $X3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask; vpsrlq \$29, $X3, $X3 # out[8] = (in[3] >> ((shift*8)%64)) & mask; vmovdqa $T1, 32*1(%rdi) vmovdqa $T2, 32*2(%rdi) vmovdqa $T3, 32*3(%rdi) vmovdqa $T4, 32*4-112(%rax) vmovdqa $T5, 32*5-112(%rax) vmovdqa $T6, 32*6-112(%rax) vmovdqa $T0, 32*7-112(%rax) vmovdqa $X3, 32*8-112(%rax) lea 448(%rdi), %rax # size optimization vpand $T7, $Y0, $T0 # out[0] = in[0] & mask; vpsrlq \$29, $Y0, $Y0 vpand $T7, $Y0, $T1 # out[1] = (in[0] >> shift) & mask; vpsrlq \$29, $Y0, $Y0 vpsllq \$6, $Y1, $T2 vpxor $Y0, $T2, $T2 vpand $T7, $T2, $T2 # out[2] = ((in[0] >> (shift*2)) ^ (in[1] << (64-shift*2))) & mask; vpsrlq \$23, $Y1, $Y1 vpand $T7, $Y1, $T3 # out[3] = (in[1] >> ((shift*3)%64)) & mask; vpsrlq \$29, $Y1, $Y1 vpsllq \$12, $Y2, $T4 vpxor $Y1, $T4, $T4 vpand $T7, $T4, $T4 # out[4] = ((in[1] >> ((shift*4)%64)) ^ (in[2] << (64*2-shift*4))) & mask; vpsrlq \$17, $Y2, $Y2 vpand $T7, $Y2, $T5 # out[5] = (in[2] >> ((shift*5)%64)) & mask; vpsrlq \$29, $Y2, $Y2 vpsllq \$18, $Y3, $T6 vpxor $Y2, $T6, $T6 vpand $T7, $T6, $T6 # out[6] = ((in[2] >> ((shift*6)%64)) ^ (in[3] << (64*3-shift*6))) & mask; vpsrlq \$11, $Y3, $Y3 vmovdqa $T0, 32*9-448(%rax) vpand $T7, $Y3, $T0 # out[7] = (in[3] >> ((shift*7)%64)) & mask; vpsrlq \$29, $Y3, $Y3 # out[8] = (in[3] >> ((shift*8)%64)) & mask; vmovdqa $T1, 32*10-448(%rax) vmovdqa $T2, 32*11-448(%rax) vmovdqa $T3, 32*12-448(%rax) vmovdqa $T4, 32*13-448(%rax) vmovdqa $T5, 32*14-448(%rax) vmovdqa $T6, 32*15-448(%rax) vmovdqa $T0, 32*16-448(%rax) vmovdqa $Y3, 32*17-448(%rax) vzeroupper ___ $code.=<<___ if ($win64); movaps 16*0(%rsp), %xmm6 movaps 16*1(%rsp), %xmm7 movaps 16*2(%rsp), %xmm8 movaps 16*3(%rsp), %xmm9 movaps 16*4(%rsp), %xmm10 movaps 16*5(%rsp), %xmm11 movaps 16*6(%rsp), %xmm12 movaps 16*7(%rsp), %xmm13 movaps 16*8(%rsp), %xmm14 movaps 16*9(%rsp), %xmm15 lea 8+16*10(%rsp), %rsp ___ $code.=<<___; ret .size ecp_nistz256_avx2_transpose_convert,.-ecp_nistz256_avx2_transpose_convert ___ } { ################################################################################ # This function receives a pointer to an array of four AVX2 formatted points # (X, Y, Z) convert the data to normal representation, and rearranges the data my ($D0,$D1,$D2,$D3, $D4,$D5,$D6,$D7, $D8)=map("%ymm$_",(0..8)); my ($T0,$T1,$T2,$T3, $T4,$T5,$T6)=map("%ymm$_",(9..15)); $code.=<<___; .globl ecp_nistz256_avx2_convert_transpose_back .type ecp_nistz256_avx2_convert_transpose_back,\@function,2 .align 32 ecp_nistz256_avx2_convert_transpose_back: vzeroupper ___ $code.=<<___ if ($win64); lea -8-16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; mov \$3, %ecx .Lconv_loop: vmovdqa 32*0(%rsi), $D0 lea 160(%rsi), %rax # size optimization vmovdqa 32*1(%rsi), $D1 vmovdqa 32*2(%rsi), $D2 vmovdqa 32*3(%rsi), $D3 vmovdqa 32*4-160(%rax), $D4 vmovdqa 32*5-160(%rax), $D5 vmovdqa 
32*6-160(%rax), $D6 vmovdqa 32*7-160(%rax), $D7 vmovdqa 32*8-160(%rax), $D8 vpsllq \$29, $D1, $D1 vpsllq \$58, $D2, $T0 vpaddq $D1, $D0, $D0 vpaddq $T0, $D0, $D0 # out[0] = (in[0]) ^ (in[1] << shift*1) ^ (in[2] << shift*2); vpsrlq \$6, $D2, $D2 vpsllq \$23, $D3, $D3 vpsllq \$52, $D4, $T1 vpaddq $D2, $D3, $D3 vpaddq $D3, $T1, $D1 # out[1] = (in[2] >> (64*1-shift*2)) ^ (in[3] << shift*3%64) ^ (in[4] << shift*4%64); vpsrlq \$12, $D4, $D4 vpsllq \$17, $D5, $D5 vpsllq \$46, $D6, $T2 vpaddq $D4, $D5, $D5 vpaddq $D5, $T2, $D2 # out[2] = (in[4] >> (64*2-shift*4)) ^ (in[5] << shift*5%64) ^ (in[6] << shift*6%64); vpsrlq \$18, $D6, $D6 vpsllq \$11, $D7, $D7 vpsllq \$40, $D8, $T3 vpaddq $D6, $D7, $D7 vpaddq $D7, $T3, $D3 # out[3] = (in[6] >> (64*3-shift*6)) ^ (in[7] << shift*7%64) ^ (in[8] << shift*8%64); vpunpcklqdq $D1, $D0, $T0 # T0 = [B2 A2 B0 A0] vpunpcklqdq $D3, $D2, $T1 # T1 = [D2 C2 D0 C0] vpunpckhqdq $D1, $D0, $T2 # T2 = [B3 A3 B1 A1] vpunpckhqdq $D3, $D2, $T3 # T3 = [D3 C3 D1 C1] vperm2i128 \$0x20, $T1, $T0, $D0 # X0 = [D0 C0 B0 A0] vperm2i128 \$0x20, $T3, $T2, $D1 # X1 = [D1 C1 B1 A1] vperm2i128 \$0x31, $T1, $T0, $D2 # X2 = [D2 C2 B2 A2] vperm2i128 \$0x31, $T3, $T2, $D3 # X3 = [D3 C3 B3 A3] vmovdqa $D0, 32*0(%rdi) vmovdqa $D1, 32*3(%rdi) vmovdqa $D2, 32*6(%rdi) vmovdqa $D3, 32*9(%rdi) lea 32*9(%rsi), %rsi lea 32*1(%rdi), %rdi dec %ecx jnz .Lconv_loop vzeroupper ___ $code.=<<___ if ($win64); movaps 16*0(%rsp), %xmm6 movaps 16*1(%rsp), %xmm7 movaps 16*2(%rsp), %xmm8 movaps 16*3(%rsp), %xmm9 movaps 16*4(%rsp), %xmm10 movaps 16*5(%rsp), %xmm11 movaps 16*6(%rsp), %xmm12 movaps 16*7(%rsp), %xmm13 movaps 16*8(%rsp), %xmm14 movaps 16*9(%rsp), %xmm15 lea 8+16*10(%rsp), %rsp ___ $code.=<<___; ret .size ecp_nistz256_avx2_convert_transpose_back,.-ecp_nistz256_avx2_convert_transpose_back ___ } { my ($r_ptr,$a_ptr,$b_ptr,$itr)=("%rdi","%rsi","%rdx","%ecx"); my ($ACC0,$ACC1,$ACC2,$ACC3,$ACC4,$ACC5,$ACC6,$ACC7,$ACC8)=map("%ymm$_",(0..8)); my ($B,$Y,$T0,$AND_MASK,$OVERFLOW)=map("%ymm$_",(9..13)); sub NORMALIZE { my $ret=<<___; vpsrlq $digit_size, $ACC0, $T0 vpand $AND_MASK, $ACC0, $ACC0 vpaddq $T0, $ACC1, $ACC1 vpsrlq $digit_size, $ACC1, $T0 vpand $AND_MASK, $ACC1, $ACC1 vpaddq $T0, $ACC2, $ACC2 vpsrlq $digit_size, $ACC2, $T0 vpand $AND_MASK, $ACC2, $ACC2 vpaddq $T0, $ACC3, $ACC3 vpsrlq $digit_size, $ACC3, $T0 vpand $AND_MASK, $ACC3, $ACC3 vpaddq $T0, $ACC4, $ACC4 vpsrlq $digit_size, $ACC4, $T0 vpand $AND_MASK, $ACC4, $ACC4 vpaddq $T0, $ACC5, $ACC5 vpsrlq $digit_size, $ACC5, $T0 vpand $AND_MASK, $ACC5, $ACC5 vpaddq $T0, $ACC6, $ACC6 vpsrlq $digit_size, $ACC6, $T0 vpand $AND_MASK, $ACC6, $ACC6 vpaddq $T0, $ACC7, $ACC7 vpsrlq $digit_size, $ACC7, $T0 vpand $AND_MASK, $ACC7, $ACC7 vpaddq $T0, $ACC8, $ACC8 #vpand $AND_MASK, $ACC8, $ACC8 ___ $ret; } sub STORE { my $ret=<<___; vmovdqa $ACC0, 32*0(%rdi) lea 160(%rdi), %rax # size optimization vmovdqa $ACC1, 32*1(%rdi) vmovdqa $ACC2, 32*2(%rdi) vmovdqa $ACC3, 32*3(%rdi) vmovdqa $ACC4, 32*4-160(%rax) vmovdqa $ACC5, 32*5-160(%rax) vmovdqa $ACC6, 32*6-160(%rax) vmovdqa $ACC7, 32*7-160(%rax) vmovdqa $ACC8, 32*8-160(%rax) ___ $ret; } $code.=<<___; .type avx2_normalize,\@abi-omnipotent .align 32 avx2_normalize: vpsrlq $digit_size, $ACC0, $T0 vpand $AND_MASK, $ACC0, $ACC0 vpaddq $T0, $ACC1, $ACC1 vpsrlq $digit_size, $ACC1, $T0 vpand $AND_MASK, $ACC1, $ACC1 vpaddq $T0, $ACC2, $ACC2 vpsrlq $digit_size, $ACC2, $T0 vpand $AND_MASK, $ACC2, $ACC2 vpaddq $T0, $ACC3, $ACC3 vpsrlq $digit_size, $ACC3, $T0 vpand $AND_MASK, $ACC3, $ACC3 vpaddq $T0, $ACC4, $ACC4 vpsrlq $digit_size, 
$ACC4, $T0 vpand $AND_MASK, $ACC4, $ACC4 vpaddq $T0, $ACC5, $ACC5 vpsrlq $digit_size, $ACC5, $T0 vpand $AND_MASK, $ACC5, $ACC5 vpaddq $T0, $ACC6, $ACC6 vpsrlq $digit_size, $ACC6, $T0 vpand $AND_MASK, $ACC6, $ACC6 vpaddq $T0, $ACC7, $ACC7 vpsrlq $digit_size, $ACC7, $T0 vpand $AND_MASK, $ACC7, $ACC7 vpaddq $T0, $ACC8, $ACC8 #vpand $AND_MASK, $ACC8, $ACC8 ret .size avx2_normalize,.-avx2_normalize .type avx2_normalize_n_store,\@abi-omnipotent .align 32 avx2_normalize_n_store: vpsrlq $digit_size, $ACC0, $T0 vpand $AND_MASK, $ACC0, $ACC0 vpaddq $T0, $ACC1, $ACC1 vpsrlq $digit_size, $ACC1, $T0 vpand $AND_MASK, $ACC1, $ACC1 vmovdqa $ACC0, 32*0(%rdi) lea 160(%rdi), %rax # size optimization vpaddq $T0, $ACC2, $ACC2 vpsrlq $digit_size, $ACC2, $T0 vpand $AND_MASK, $ACC2, $ACC2 vmovdqa $ACC1, 32*1(%rdi) vpaddq $T0, $ACC3, $ACC3 vpsrlq $digit_size, $ACC3, $T0 vpand $AND_MASK, $ACC3, $ACC3 vmovdqa $ACC2, 32*2(%rdi) vpaddq $T0, $ACC4, $ACC4 vpsrlq $digit_size, $ACC4, $T0 vpand $AND_MASK, $ACC4, $ACC4 vmovdqa $ACC3, 32*3(%rdi) vpaddq $T0, $ACC5, $ACC5 vpsrlq $digit_size, $ACC5, $T0 vpand $AND_MASK, $ACC5, $ACC5 vmovdqa $ACC4, 32*4-160(%rax) vpaddq $T0, $ACC6, $ACC6 vpsrlq $digit_size, $ACC6, $T0 vpand $AND_MASK, $ACC6, $ACC6 vmovdqa $ACC5, 32*5-160(%rax) vpaddq $T0, $ACC7, $ACC7 vpsrlq $digit_size, $ACC7, $T0 vpand $AND_MASK, $ACC7, $ACC7 vmovdqa $ACC6, 32*6-160(%rax) vpaddq $T0, $ACC8, $ACC8 #vpand $AND_MASK, $ACC8, $ACC8 vmovdqa $ACC7, 32*7-160(%rax) vmovdqa $ACC8, 32*8-160(%rax) ret .size avx2_normalize_n_store,.-avx2_normalize_n_store ################################################################################ # void avx2_mul_x4(void* RESULTx4, void *Ax4, void *Bx4); .type avx2_mul_x4,\@abi-omnipotent .align 32 avx2_mul_x4: lea .LAVX2_POLY(%rip), %rax vpxor $ACC0, $ACC0, $ACC0 vpxor $ACC1, $ACC1, $ACC1 vpxor $ACC2, $ACC2, $ACC2 vpxor $ACC3, $ACC3, $ACC3 vpxor $ACC4, $ACC4, $ACC4 vpxor $ACC5, $ACC5, $ACC5 vpxor $ACC6, $ACC6, $ACC6 vpxor $ACC7, $ACC7, $ACC7 vmovdqa 32*7(%rax), %ymm14 vmovdqa 32*8(%rax), %ymm15 mov $n_digits, $itr lea -512($a_ptr), $a_ptr # strategic bias to control u-op density jmp .Lavx2_mul_x4_loop .align 32 .Lavx2_mul_x4_loop: vmovdqa 32*0($b_ptr), $B lea 32*1($b_ptr), $b_ptr vpmuludq 32*0+512($a_ptr), $B, $T0 vpmuludq 32*1+512($a_ptr), $B, $OVERFLOW # borrow $OVERFLOW vpaddq $T0, $ACC0, $ACC0 vpmuludq 32*2+512($a_ptr), $B, $T0 vpaddq $OVERFLOW, $ACC1, $ACC1 vpand $AND_MASK, $ACC0, $Y vpmuludq 32*3+512($a_ptr), $B, $OVERFLOW vpaddq $T0, $ACC2, $ACC2 vpmuludq 32*4+512($a_ptr), $B, $T0 vpaddq $OVERFLOW, $ACC3, $ACC3 vpmuludq 32*5+512($a_ptr), $B, $OVERFLOW vpaddq $T0, $ACC4, $ACC4 vpmuludq 32*6+512($a_ptr), $B, $T0 vpaddq $OVERFLOW, $ACC5, $ACC5 vpmuludq 32*7+512($a_ptr), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 # Skip some multiplications, optimizing for the constant poly vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*8+512($a_ptr), $B, $ACC8 vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 .byte 0x67 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $OVERFLOW .byte 0x67 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $T0 vpaddq $OVERFLOW, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $OVERFLOW vpaddq $T0, $ACC7, $ACC6 vpaddq $OVERFLOW, $ACC8, $ACC7 dec $itr jnz .Lavx2_mul_x4_loop vpxor $ACC8, $ACC8, $ACC8 ret .size avx2_mul_x4,.-avx2_mul_x4 # Function optimized for the constant 1 
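The conversion and arithmetic routines above all work on the same redundant representation: each 256-bit field element is held as nine 29-bit digits (mask 0x1fffffff), one 64-bit lane per point, so every ymm register carries one digit of four independent points. As a scalar reference, here is a minimal Perl sketch of that split and its inverse (illustrative only, not part of the module; the helper names are made up, and from_digits29 assumes the digits have already been carry-normalized):

    use Math::BigInt;

    sub to_digits29 {                       # 4 x 64-bit words -> 9 x 29-bit digits
        my @w = @_;                         # little-endian 64-bit words
        my $n = Math::BigInt->bzero();
        $n += Math::BigInt->new($w[$_]) << (64 * $_) for 0 .. 3;
        return map { ($n >> (29 * $_)) & 0x1fffffff } 0 .. 8;
    }

    sub from_digits29 {                     # inverse, digits assumed normalized
        my @d = @_;
        my $n = Math::BigInt->bzero();
        $n += Math::BigInt->new($d[$_]) << (29 * $_) for 0 .. 8;
        return $n;                          # the recombined 256-bit value
    }

The hard-coded shift constants in the transpose/convert routines (6, 23, 12, 17, 18, 11, ...) all follow from 29*i mod 64, and avx2_normalize above simply pushes each digit's excess bits into the next digit to bring the representation back into range. The function that follows is the constant-1 specialization announced by the comment just above.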
################################################################################ # void avx2_mul_by1_x4(void* RESULTx4, void *Ax4); .type avx2_mul_by1_x4,\@abi-omnipotent .align 32 avx2_mul_by1_x4: lea .LAVX2_POLY(%rip), %rax vpxor $ACC0, $ACC0, $ACC0 vpxor $ACC1, $ACC1, $ACC1 vpxor $ACC2, $ACC2, $ACC2 vpxor $ACC3, $ACC3, $ACC3 vpxor $ACC4, $ACC4, $ACC4 vpxor $ACC5, $ACC5, $ACC5 vpxor $ACC6, $ACC6, $ACC6 vpxor $ACC7, $ACC7, $ACC7 vpxor $ACC8, $ACC8, $ACC8 vmovdqa 32*3+.LONE(%rip), %ymm14 vmovdqa 32*7+.LONE(%rip), %ymm15 mov $n_digits, $itr jmp .Lavx2_mul_by1_x4_loop .align 32 .Lavx2_mul_by1_x4_loop: vmovdqa 32*0($a_ptr), $B .byte 0x48,0x8d,0xb6,0x20,0,0,0 # lea 32*1($a_ptr), $a_ptr vpsllq \$5, $B, $OVERFLOW vpmuludq %ymm14, $B, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC3 .byte 0x67 vpmuludq $AND_MASK, $B, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $T0, $ACC4, $ACC4 vpaddq $T0, $ACC5, $ACC5 vpaddq $T0, $ACC6, $ACC6 vpsllq \$23, $B, $T0 .byte 0x67,0x67 vpmuludq %ymm15, $B, $OVERFLOW vpsubq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 .byte 0x67,0x67 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $OVERFLOW vmovdqa $ACC5, $ACC4 vpmuludq 32*7(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC6, $ACC5 vpaddq $T0, $ACC7, $ACC6 vpmuludq 32*8(%rax), $Y, $ACC7 dec $itr jnz .Lavx2_mul_by1_x4_loop ret .size avx2_mul_by1_x4,.-avx2_mul_by1_x4 ################################################################################ # void avx2_sqr_x4(void* RESULTx4, void *Ax4, void *Bx4); .type avx2_sqr_x4,\@abi-omnipotent .align 32 avx2_sqr_x4: lea .LAVX2_POLY(%rip), %rax vmovdqa 32*7(%rax), %ymm14 vmovdqa 32*8(%rax), %ymm15 vmovdqa 32*0($a_ptr), $B vmovdqa 32*1($a_ptr), $ACC1 vmovdqa 32*2($a_ptr), $ACC2 vmovdqa 32*3($a_ptr), $ACC3 vmovdqa 32*4($a_ptr), $ACC4 vmovdqa 32*5($a_ptr), $ACC5 vmovdqa 32*6($a_ptr), $ACC6 vmovdqa 32*7($a_ptr), $ACC7 vpaddq $ACC1, $ACC1, $ACC1 # 2*$ACC0..7 vmovdqa 32*8($a_ptr), $ACC8 vpaddq $ACC2, $ACC2, $ACC2 vmovdqa $ACC1, 32*0(%rcx) vpaddq $ACC3, $ACC3, $ACC3 vmovdqa $ACC2, 32*1(%rcx) vpaddq $ACC4, $ACC4, $ACC4 vmovdqa $ACC3, 32*2(%rcx) vpaddq $ACC5, $ACC5, $ACC5 vmovdqa $ACC4, 32*3(%rcx) vpaddq $ACC6, $ACC6, $ACC6 vmovdqa $ACC5, 32*4(%rcx) vpaddq $ACC7, $ACC7, $ACC7 vmovdqa $ACC6, 32*5(%rcx) vpaddq $ACC8, $ACC8, $ACC8 vmovdqa $ACC7, 32*6(%rcx) vmovdqa $ACC8, 32*7(%rcx) #itr 1 vpmuludq $B, $B, $ACC0 vpmuludq $B, $ACC1, $ACC1 vpand $AND_MASK, $ACC0, $Y vpmuludq $B, $ACC2, $ACC2 vpmuludq $B, $ACC3, $ACC3 vpmuludq $B, $ACC4, $ACC4 vpmuludq $B, $ACC5, $ACC5 vpmuludq $B, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpmuludq $B, $ACC7, $ACC7 vpmuludq $B, $ACC8, $ACC8 vmovdqa 32*1($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 2 vpmuludq $B, $B, $OVERFLOW vpand $AND_MASK, $ACC0, $Y vpmuludq 32*1(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC1, $ACC1 vpmuludq 32*2(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC2, $ACC2 vpmuludq 32*3(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC3, $ACC3 vpmuludq 32*4(%rcx), $B, $OVERFLOW vpaddq $T0, 
$ACC4, $ACC4 vpmuludq 32*5(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC5, $ACC5 vpmuludq 32*6(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*2($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 3 vpmuludq $B, $B, $T0 vpand $AND_MASK, $ACC0, $Y vpmuludq 32*2(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC2, $ACC2 vpmuludq 32*3(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC3, $ACC3 vpmuludq 32*4(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC4, $ACC4 vpmuludq 32*5(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC5, $ACC5 vpmuludq 32*6(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*3($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 4 vpmuludq $B, $B, $OVERFLOW vpmuludq 32*3(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC3, $ACC3 vpmuludq 32*4(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC4, $ACC4 vpmuludq 32*5(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC5, $ACC5 vpmuludq 32*6(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*4($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 5 vpmuludq $B, $B, $T0 vpmuludq 32*4(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC4, $ACC4 vpmuludq 32*5(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC5, $ACC5 vpmuludq 32*6(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*5($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3+.LAVX2_POLY(%rip), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 6 vpmuludq $B, $B, $OVERFLOW vpmuludq 32*5(%rcx), $B, $T0 vpaddq $OVERFLOW, $ACC5, $ACC5 vpmuludq 32*6(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*6($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), 
$Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 7 vpmuludq $B, $B, $T0 vpmuludq 32*6(%rcx), $B, $OVERFLOW vpaddq $T0, $ACC6, $ACC6 vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*7($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 8 vpmuludq $B, $B, $OVERFLOW vpmuludq $AND_MASK, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC7 vpmuludq 32*7(%rcx), $B, $ACC8 vmovdqa 32*8($a_ptr), $B vpaddq $T0, $ACC0, $OVERFLOW vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpand $AND_MASK, $ACC0, $Y vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 #itr 9 vpmuludq $B, $B, $ACC8 vpmuludq $AND_MASK, $Y, $T0 vpaddq $T0, $ACC0, $OVERFLOW vpsrlq $digit_size, $OVERFLOW, $OVERFLOW vpaddq $T0, $ACC1, $ACC0 vpaddq $T0, $ACC2, $ACC1 vpmuludq 32*3(%rax), $Y, $T0 vpaddq $OVERFLOW, $ACC0, $ACC0 vpaddq $T0, $ACC3, $ACC2 vmovdqa $ACC4, $ACC3 vpsllq \$18, $Y, $T0 vmovdqa $ACC5, $ACC4 vpmuludq %ymm14, $Y, $OVERFLOW vpaddq $T0, $ACC6, $ACC5 vpmuludq %ymm15, $Y, $T0 vpaddq $OVERFLOW, $ACC7, $ACC6 vpaddq $T0, $ACC8, $ACC7 vpxor $ACC8, $ACC8, $ACC8 ret .size avx2_sqr_x4,.-avx2_sqr_x4 ################################################################################ # void avx2_sub_x4(void* RESULTx4, void *Ax4, void *Bx4); .type avx2_sub_x4,\@abi-omnipotent .align 32 avx2_sub_x4: vmovdqa 32*0($a_ptr), $ACC0 lea 160($a_ptr), $a_ptr lea .LAVX2_POLY_x8+128(%rip), %rax lea 128($b_ptr), $b_ptr vmovdqa 32*1-160($a_ptr), $ACC1 vmovdqa 32*2-160($a_ptr), $ACC2 vmovdqa 32*3-160($a_ptr), $ACC3 vmovdqa 32*4-160($a_ptr), $ACC4 vmovdqa 32*5-160($a_ptr), $ACC5 vmovdqa 32*6-160($a_ptr), $ACC6 vmovdqa 32*7-160($a_ptr), $ACC7 vmovdqa 32*8-160($a_ptr), $ACC8 vpaddq 32*0-128(%rax), $ACC0, $ACC0 vpaddq 32*1-128(%rax), $ACC1, $ACC1 vpaddq 32*2-128(%rax), $ACC2, $ACC2 vpaddq 32*3-128(%rax), $ACC3, $ACC3 vpaddq 32*4-128(%rax), $ACC4, $ACC4 vpaddq 32*5-128(%rax), $ACC5, $ACC5 vpaddq 32*6-128(%rax), $ACC6, $ACC6 vpaddq 32*7-128(%rax), $ACC7, $ACC7 vpaddq 32*8-128(%rax), $ACC8, $ACC8 vpsubq 32*0-128($b_ptr), $ACC0, $ACC0 vpsubq 32*1-128($b_ptr), $ACC1, $ACC1 vpsubq 32*2-128($b_ptr), $ACC2, $ACC2 vpsubq 32*3-128($b_ptr), $ACC3, $ACC3 vpsubq 32*4-128($b_ptr), $ACC4, $ACC4 vpsubq 32*5-128($b_ptr), $ACC5, $ACC5 vpsubq 32*6-128($b_ptr), $ACC6, $ACC6 vpsubq 32*7-128($b_ptr), $ACC7, $ACC7 vpsubq 32*8-128($b_ptr), $ACC8, $ACC8 ret .size avx2_sub_x4,.-avx2_sub_x4 .type avx2_select_n_store,\@abi-omnipotent .align 32 avx2_select_n_store: vmovdqa `8+32*9*8`(%rsp), $Y vpor `8+32*9*8+32`(%rsp), $Y, $Y vpandn $ACC0, $Y, $ACC0 vpandn $ACC1, $Y, $ACC1 vpandn $ACC2, $Y, $ACC2 vpandn $ACC3, $Y, $ACC3 vpandn $ACC4, $Y, $ACC4 
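	# ($Y holds the OR of the two per-point infinity masks the caller stored on
	#  the stack; this vpandn run clears the freshly computed result for any
	#  point whose input was encoded as the point at infinity, and the masked
	#  loads below substitute the %rsi operand where the first input point was
	#  at infinity and the %rdx operand where the second was, so the selection
	#  stays constant-time)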
vpandn $ACC5, $Y, $ACC5 vpandn $ACC6, $Y, $ACC6 vmovdqa `8+32*9*8+32`(%rsp), $B vpandn $ACC7, $Y, $ACC7 vpandn `8+32*9*8`(%rsp), $B, $B vpandn $ACC8, $Y, $ACC8 vpand 32*0(%rsi), $B, $T0 lea 160(%rsi), %rax vpand 32*1(%rsi), $B, $Y vpxor $T0, $ACC0, $ACC0 vpand 32*2(%rsi), $B, $T0 vpxor $Y, $ACC1, $ACC1 vpand 32*3(%rsi), $B, $Y vpxor $T0, $ACC2, $ACC2 vpand 32*4-160(%rax), $B, $T0 vpxor $Y, $ACC3, $ACC3 vpand 32*5-160(%rax), $B, $Y vpxor $T0, $ACC4, $ACC4 vpand 32*6-160(%rax), $B, $T0 vpxor $Y, $ACC5, $ACC5 vpand 32*7-160(%rax), $B, $Y vpxor $T0, $ACC6, $ACC6 vpand 32*8-160(%rax), $B, $T0 vmovdqa `8+32*9*8+32`(%rsp), $B vpxor $Y, $ACC7, $ACC7 vpand 32*0(%rdx), $B, $Y lea 160(%rdx), %rax vpxor $T0, $ACC8, $ACC8 vpand 32*1(%rdx), $B, $T0 vpxor $Y, $ACC0, $ACC0 vpand 32*2(%rdx), $B, $Y vpxor $T0, $ACC1, $ACC1 vpand 32*3(%rdx), $B, $T0 vpxor $Y, $ACC2, $ACC2 vpand 32*4-160(%rax), $B, $Y vpxor $T0, $ACC3, $ACC3 vpand 32*5-160(%rax), $B, $T0 vpxor $Y, $ACC4, $ACC4 vpand 32*6-160(%rax), $B, $Y vpxor $T0, $ACC5, $ACC5 vpand 32*7-160(%rax), $B, $T0 vpxor $Y, $ACC6, $ACC6 vpand 32*8-160(%rax), $B, $Y vpxor $T0, $ACC7, $ACC7 vpxor $Y, $ACC8, $ACC8 `&STORE` ret .size avx2_select_n_store,.-avx2_select_n_store ___ $code.=<<___ if (0); # inlined ################################################################################ # void avx2_mul_by2_x4(void* RESULTx4, void *Ax4); .type avx2_mul_by2_x4,\@abi-omnipotent .align 32 avx2_mul_by2_x4: vmovdqa 32*0($a_ptr), $ACC0 lea 160($a_ptr), %rax vmovdqa 32*1($a_ptr), $ACC1 vmovdqa 32*2($a_ptr), $ACC2 vmovdqa 32*3($a_ptr), $ACC3 vmovdqa 32*4-160(%rax), $ACC4 vmovdqa 32*5-160(%rax), $ACC5 vmovdqa 32*6-160(%rax), $ACC6 vmovdqa 32*7-160(%rax), $ACC7 vmovdqa 32*8-160(%rax), $ACC8 vpaddq $ACC0, $ACC0, $ACC0 vpaddq $ACC1, $ACC1, $ACC1 vpaddq $ACC2, $ACC2, $ACC2 vpaddq $ACC3, $ACC3, $ACC3 vpaddq $ACC4, $ACC4, $ACC4 vpaddq $ACC5, $ACC5, $ACC5 vpaddq $ACC6, $ACC6, $ACC6 vpaddq $ACC7, $ACC7, $ACC7 vpaddq $ACC8, $ACC8, $ACC8 ret .size avx2_mul_by2_x4,.-avx2_mul_by2_x4 ___ my ($r_ptr_in,$a_ptr_in,$b_ptr_in)=("%rdi","%rsi","%rdx"); my ($r_ptr,$a_ptr,$b_ptr)=("%r8","%r9","%r10"); $code.=<<___; ################################################################################ # void ecp_nistz256_avx2_point_add_affine_x4(void* RESULTx4, void *Ax4, void *Bx4); .globl ecp_nistz256_avx2_point_add_affine_x4 .type ecp_nistz256_avx2_point_add_affine_x4,\@function,3 .align 32 ecp_nistz256_avx2_point_add_affine_x4: mov %rsp, %rax push %rbp vzeroupper ___ $code.=<<___ if ($win64); lea -16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; lea -8(%rax), %rbp # Result + 32*0 = Result.X # Result + 32*9 = Result.Y # Result + 32*18 = Result.Z # A + 32*0 = A.X # A + 32*9 = A.Y # A + 32*18 = A.Z # B + 32*0 = B.X # B + 32*9 = B.Y sub \$`32*9*8+32*2+32*8`, %rsp and \$-64, %rsp mov $r_ptr_in, $r_ptr mov $a_ptr_in, $a_ptr mov $b_ptr_in, $b_ptr vmovdqa 32*0($a_ptr_in), %ymm0 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK vpxor %ymm1, %ymm1, %ymm1 lea 256($a_ptr_in), %rax # size optimization vpor 32*1($a_ptr_in), %ymm0, %ymm0 vpor 32*2($a_ptr_in), %ymm0, %ymm0 vpor 32*3($a_ptr_in), %ymm0, %ymm0 vpor 32*4-256(%rax), %ymm0, %ymm0 lea 256(%rax), %rcx # size optimization vpor 32*5-256(%rax), %ymm0, %ymm0 vpor 32*6-256(%rax), %ymm0, %ymm0 
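	# (this vpor chain ORs together all 18 digit vectors of the first input
	#  point; vpcmpeqq below turns any lane that stayed all-zero, i.e. a point
	#  stored as (0,0), into an all-ones mask that is saved on the stack and
	#  later consumed by avx2_select_n_store)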
vpor 32*7-256(%rax), %ymm0, %ymm0 vpor 32*8-256(%rax), %ymm0, %ymm0 vpor 32*9-256(%rax), %ymm0, %ymm0 vpor 32*10-256(%rax), %ymm0, %ymm0 vpor 32*11-256(%rax), %ymm0, %ymm0 vpor 32*12-512(%rcx), %ymm0, %ymm0 vpor 32*13-512(%rcx), %ymm0, %ymm0 vpor 32*14-512(%rcx), %ymm0, %ymm0 vpor 32*15-512(%rcx), %ymm0, %ymm0 vpor 32*16-512(%rcx), %ymm0, %ymm0 vpor 32*17-512(%rcx), %ymm0, %ymm0 vpcmpeqq %ymm1, %ymm0, %ymm0 vmovdqa %ymm0, `32*9*8`(%rsp) vpxor %ymm1, %ymm1, %ymm1 vmovdqa 32*0($b_ptr), %ymm0 lea 256($b_ptr), %rax # size optimization vpor 32*1($b_ptr), %ymm0, %ymm0 vpor 32*2($b_ptr), %ymm0, %ymm0 vpor 32*3($b_ptr), %ymm0, %ymm0 vpor 32*4-256(%rax), %ymm0, %ymm0 lea 256(%rax), %rcx # size optimization vpor 32*5-256(%rax), %ymm0, %ymm0 vpor 32*6-256(%rax), %ymm0, %ymm0 vpor 32*7-256(%rax), %ymm0, %ymm0 vpor 32*8-256(%rax), %ymm0, %ymm0 vpor 32*9-256(%rax), %ymm0, %ymm0 vpor 32*10-256(%rax), %ymm0, %ymm0 vpor 32*11-256(%rax), %ymm0, %ymm0 vpor 32*12-512(%rcx), %ymm0, %ymm0 vpor 32*13-512(%rcx), %ymm0, %ymm0 vpor 32*14-512(%rcx), %ymm0, %ymm0 vpor 32*15-512(%rcx), %ymm0, %ymm0 vpor 32*16-512(%rcx), %ymm0, %ymm0 vpor 32*17-512(%rcx), %ymm0, %ymm0 vpcmpeqq %ymm1, %ymm0, %ymm0 vmovdqa %ymm0, `32*9*8+32`(%rsp) # Z1^2 = Z1*Z1 lea `32*9*2`($a_ptr), %rsi lea `32*9*2`(%rsp), %rdi lea `32*9*8+32*2`(%rsp), %rcx # temporary vector call avx2_sqr_x4 call avx2_normalize_n_store # U2 = X2*Z1^2 lea `32*9*0`($b_ptr), %rsi lea `32*9*2`(%rsp), %rdx lea `32*9*0`(%rsp), %rdi call avx2_mul_x4 #call avx2_normalize `&STORE` # S2 = Z1*Z1^2 = Z1^3 lea `32*9*2`($a_ptr), %rsi lea `32*9*2`(%rsp), %rdx lea `32*9*1`(%rsp), %rdi call avx2_mul_x4 call avx2_normalize_n_store # S2 = S2*Y2 = Y2*Z1^3 lea `32*9*1`($b_ptr), %rsi lea `32*9*1`(%rsp), %rdx lea `32*9*1`(%rsp), %rdi call avx2_mul_x4 call avx2_normalize_n_store # H = U2 - U1 = U2 - X1 lea `32*9*0`(%rsp), %rsi lea `32*9*0`($a_ptr), %rdx lea `32*9*3`(%rsp), %rdi call avx2_sub_x4 call avx2_normalize_n_store # R = S2 - S1 = S2 - Y1 lea `32*9*1`(%rsp), %rsi lea `32*9*1`($a_ptr), %rdx lea `32*9*4`(%rsp), %rdi call avx2_sub_x4 call avx2_normalize_n_store # Z3 = H*Z1*Z2 lea `32*9*3`(%rsp), %rsi lea `32*9*2`($a_ptr), %rdx lea `32*9*2`($r_ptr), %rdi call avx2_mul_x4 call avx2_normalize lea .LONE(%rip), %rsi lea `32*9*2`($a_ptr), %rdx call avx2_select_n_store # R^2 = R^2 lea `32*9*4`(%rsp), %rsi lea `32*9*6`(%rsp), %rdi lea `32*9*8+32*2`(%rsp), %rcx # temporary vector call avx2_sqr_x4 call avx2_normalize_n_store # H^2 = H^2 lea `32*9*3`(%rsp), %rsi lea `32*9*5`(%rsp), %rdi call avx2_sqr_x4 call avx2_normalize_n_store # H^3 = H^2*H lea `32*9*3`(%rsp), %rsi lea `32*9*5`(%rsp), %rdx lea `32*9*7`(%rsp), %rdi call avx2_mul_x4 call avx2_normalize_n_store # U2 = U1*H^2 lea `32*9*0`($a_ptr), %rsi lea `32*9*5`(%rsp), %rdx lea `32*9*0`(%rsp), %rdi call avx2_mul_x4 #call avx2_normalize `&STORE` # Hsqr = U2*2 #lea 32*9*0(%rsp), %rsi #lea 32*9*5(%rsp), %rdi #call avx2_mul_by2_x4 vpaddq $ACC0, $ACC0, $ACC0 # inlined avx2_mul_by2_x4 lea `32*9*5`(%rsp), %rdi vpaddq $ACC1, $ACC1, $ACC1 vpaddq $ACC2, $ACC2, $ACC2 vpaddq $ACC3, $ACC3, $ACC3 vpaddq $ACC4, $ACC4, $ACC4 vpaddq $ACC5, $ACC5, $ACC5 vpaddq $ACC6, $ACC6, $ACC6 vpaddq $ACC7, $ACC7, $ACC7 vpaddq $ACC8, $ACC8, $ACC8 call avx2_normalize_n_store # X3 = R^2 - H^3 #lea 32*9*6(%rsp), %rsi #lea 32*9*7(%rsp), %rdx #lea 32*9*5(%rsp), %rcx #lea 32*9*0($r_ptr), %rdi #call avx2_sub_x4 #NORMALIZE #STORE # X3 = X3 - U2*2 #lea 32*9*0($r_ptr), %rsi #lea 32*9*0($r_ptr), %rdi #call avx2_sub_x4 #NORMALIZE #STORE lea `32*9*6+128`(%rsp), %rsi lea 
.LAVX2_POLY_x2+128(%rip), %rax lea `32*9*7+128`(%rsp), %rdx lea `32*9*5+128`(%rsp), %rcx lea `32*9*0`($r_ptr), %rdi vmovdqa 32*0-128(%rsi), $ACC0 vmovdqa 32*1-128(%rsi), $ACC1 vmovdqa 32*2-128(%rsi), $ACC2 vmovdqa 32*3-128(%rsi), $ACC3 vmovdqa 32*4-128(%rsi), $ACC4 vmovdqa 32*5-128(%rsi), $ACC5 vmovdqa 32*6-128(%rsi), $ACC6 vmovdqa 32*7-128(%rsi), $ACC7 vmovdqa 32*8-128(%rsi), $ACC8 vpaddq 32*0-128(%rax), $ACC0, $ACC0 vpaddq 32*1-128(%rax), $ACC1, $ACC1 vpaddq 32*2-128(%rax), $ACC2, $ACC2 vpaddq 32*3-128(%rax), $ACC3, $ACC3 vpaddq 32*4-128(%rax), $ACC4, $ACC4 vpaddq 32*5-128(%rax), $ACC5, $ACC5 vpaddq 32*6-128(%rax), $ACC6, $ACC6 vpaddq 32*7-128(%rax), $ACC7, $ACC7 vpaddq 32*8-128(%rax), $ACC8, $ACC8 vpsubq 32*0-128(%rdx), $ACC0, $ACC0 vpsubq 32*1-128(%rdx), $ACC1, $ACC1 vpsubq 32*2-128(%rdx), $ACC2, $ACC2 vpsubq 32*3-128(%rdx), $ACC3, $ACC3 vpsubq 32*4-128(%rdx), $ACC4, $ACC4 vpsubq 32*5-128(%rdx), $ACC5, $ACC5 vpsubq 32*6-128(%rdx), $ACC6, $ACC6 vpsubq 32*7-128(%rdx), $ACC7, $ACC7 vpsubq 32*8-128(%rdx), $ACC8, $ACC8 vpsubq 32*0-128(%rcx), $ACC0, $ACC0 vpsubq 32*1-128(%rcx), $ACC1, $ACC1 vpsubq 32*2-128(%rcx), $ACC2, $ACC2 vpsubq 32*3-128(%rcx), $ACC3, $ACC3 vpsubq 32*4-128(%rcx), $ACC4, $ACC4 vpsubq 32*5-128(%rcx), $ACC5, $ACC5 vpsubq 32*6-128(%rcx), $ACC6, $ACC6 vpsubq 32*7-128(%rcx), $ACC7, $ACC7 vpsubq 32*8-128(%rcx), $ACC8, $ACC8 call avx2_normalize lea 32*0($b_ptr), %rsi lea 32*0($a_ptr), %rdx call avx2_select_n_store # H = U2 - X3 lea `32*9*0`(%rsp), %rsi lea `32*9*0`($r_ptr), %rdx lea `32*9*3`(%rsp), %rdi call avx2_sub_x4 call avx2_normalize_n_store # lea `32*9*3`(%rsp), %rsi lea `32*9*4`(%rsp), %rdx lea `32*9*3`(%rsp), %rdi call avx2_mul_x4 call avx2_normalize_n_store # lea `32*9*7`(%rsp), %rsi lea `32*9*1`($a_ptr), %rdx lea `32*9*1`(%rsp), %rdi call avx2_mul_x4 call avx2_normalize_n_store # lea `32*9*3`(%rsp), %rsi lea `32*9*1`(%rsp), %rdx lea `32*9*1`($r_ptr), %rdi call avx2_sub_x4 call avx2_normalize lea 32*9($b_ptr), %rsi lea 32*9($a_ptr), %rdx call avx2_select_n_store #lea 32*9*0($r_ptr), %rsi #lea 32*9*0($r_ptr), %rdi #call avx2_mul_by1_x4 #NORMALIZE #STORE lea `32*9*1`($r_ptr), %rsi lea `32*9*1`($r_ptr), %rdi call avx2_mul_by1_x4 call avx2_normalize_n_store vzeroupper ___ $code.=<<___ if ($win64); movaps %xmm6, -16*10(%rbp) movaps %xmm7, -16*9(%rbp) movaps %xmm8, -16*8(%rbp) movaps %xmm9, -16*7(%rbp) movaps %xmm10, -16*6(%rbp) movaps %xmm11, -16*5(%rbp) movaps %xmm12, -16*4(%rbp) movaps %xmm13, -16*3(%rbp) movaps %xmm14, -16*2(%rbp) movaps %xmm15, -16*1(%rbp) ___ $code.=<<___; mov %rbp, %rsp pop %rbp ret .size ecp_nistz256_avx2_point_add_affine_x4,.-ecp_nistz256_avx2_point_add_affine_x4 ################################################################################ # void ecp_nistz256_avx2_point_add_affines_x4(void* RESULTx4, void *Ax4, void *Bx4); .globl ecp_nistz256_avx2_point_add_affines_x4 .type ecp_nistz256_avx2_point_add_affines_x4,\@function,3 .align 32 ecp_nistz256_avx2_point_add_affines_x4: mov %rsp, %rax push %rbp vzeroupper ___ $code.=<<___ if ($win64); lea -16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; lea -8(%rax), %rbp # Result + 32*0 = Result.X # Result + 32*9 = Result.Y # Result + 32*18 = Result.Z # A + 32*0 = A.X # A + 32*9 = A.Y # B + 32*0 = B.X # B + 32*9 = B.Y sub 
\$`32*9*8+32*2+32*8`, %rsp and \$-64, %rsp mov $r_ptr_in, $r_ptr mov $a_ptr_in, $a_ptr mov $b_ptr_in, $b_ptr vmovdqa 32*0($a_ptr_in), %ymm0 vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK vpxor %ymm1, %ymm1, %ymm1 lea 256($a_ptr_in), %rax # size optimization vpor 32*1($a_ptr_in), %ymm0, %ymm0 vpor 32*2($a_ptr_in), %ymm0, %ymm0 vpor 32*3($a_ptr_in), %ymm0, %ymm0 vpor 32*4-256(%rax), %ymm0, %ymm0 lea 256(%rax), %rcx # size optimization vpor 32*5-256(%rax), %ymm0, %ymm0 vpor 32*6-256(%rax), %ymm0, %ymm0 vpor 32*7-256(%rax), %ymm0, %ymm0 vpor 32*8-256(%rax), %ymm0, %ymm0 vpor 32*9-256(%rax), %ymm0, %ymm0 vpor 32*10-256(%rax), %ymm0, %ymm0 vpor 32*11-256(%rax), %ymm0, %ymm0 vpor 32*12-512(%rcx), %ymm0, %ymm0 vpor 32*13-512(%rcx), %ymm0, %ymm0 vpor 32*14-512(%rcx), %ymm0, %ymm0 vpor 32*15-512(%rcx), %ymm0, %ymm0 vpor 32*16-512(%rcx), %ymm0, %ymm0 vpor 32*17-512(%rcx), %ymm0, %ymm0 vpcmpeqq %ymm1, %ymm0, %ymm0 vmovdqa %ymm0, `32*9*8`(%rsp) vpxor %ymm1, %ymm1, %ymm1 vmovdqa 32*0($b_ptr), %ymm0 lea 256($b_ptr), %rax # size optimization vpor 32*1($b_ptr), %ymm0, %ymm0 vpor 32*2($b_ptr), %ymm0, %ymm0 vpor 32*3($b_ptr), %ymm0, %ymm0 vpor 32*4-256(%rax), %ymm0, %ymm0 lea 256(%rax), %rcx # size optimization vpor 32*5-256(%rax), %ymm0, %ymm0 vpor 32*6-256(%rax), %ymm0, %ymm0 vpor 32*7-256(%rax), %ymm0, %ymm0 vpor 32*8-256(%rax), %ymm0, %ymm0 vpor 32*9-256(%rax), %ymm0, %ymm0 vpor 32*10-256(%rax), %ymm0, %ymm0 vpor 32*11-256(%rax), %ymm0, %ymm0 vpor 32*12-512(%rcx), %ymm0, %ymm0 vpor 32*13-512(%rcx), %ymm0, %ymm0 vpor 32*14-512(%rcx), %ymm0, %ymm0 vpor 32*15-512(%rcx), %ymm0, %ymm0 vpor 32*16-512(%rcx), %ymm0, %ymm0 vpor 32*17-512(%rcx), %ymm0, %ymm0 vpcmpeqq %ymm1, %ymm0, %ymm0 vmovdqa %ymm0, `32*9*8+32`(%rsp) # H = U2 - U1 = X2 - X1 lea `32*9*0`($b_ptr), %rsi lea `32*9*0`($a_ptr), %rdx lea `32*9*3`(%rsp), %rdi call avx2_sub_x4 call avx2_normalize_n_store # R = S2 - S1 = Y2 - Y1 lea `32*9*1`($b_ptr), %rsi lea `32*9*1`($a_ptr), %rdx lea `32*9*4`(%rsp), %rdi call avx2_sub_x4 call avx2_normalize_n_store # Z3 = H*Z1*Z2 = H lea `32*9*3`(%rsp), %rsi lea `32*9*2`($r_ptr), %rdi call avx2_mul_by1_x4 call avx2_normalize vmovdqa `32*9*8`(%rsp), $B vpor `32*9*8+32`(%rsp), $B, $B vpandn $ACC0, $B, $ACC0 lea .LONE+128(%rip), %rax vpandn $ACC1, $B, $ACC1 vpandn $ACC2, $B, $ACC2 vpandn $ACC3, $B, $ACC3 vpandn $ACC4, $B, $ACC4 vpandn $ACC5, $B, $ACC5 vpandn $ACC6, $B, $ACC6 vpandn $ACC7, $B, $ACC7 vpand 32*0-128(%rax), $B, $T0 vpandn $ACC8, $B, $ACC8 vpand 32*1-128(%rax), $B, $Y vpxor $T0, $ACC0, $ACC0 vpand 32*2-128(%rax), $B, $T0 vpxor $Y, $ACC1, $ACC1 vpand 32*3-128(%rax), $B, $Y vpxor $T0, $ACC2, $ACC2 vpand 32*4-128(%rax), $B, $T0 vpxor $Y, $ACC3, $ACC3 vpand 32*5-128(%rax), $B, $Y vpxor $T0, $ACC4, $ACC4 vpand 32*6-128(%rax), $B, $T0 vpxor $Y, $ACC5, $ACC5 vpand 32*7-128(%rax), $B, $Y vpxor $T0, $ACC6, $ACC6 vpand 32*8-128(%rax), $B, $T0 vpxor $Y, $ACC7, $ACC7 vpxor $T0, $ACC8, $ACC8 `&STORE` # R^2 = R^2 lea `32*9*4`(%rsp), %rsi lea `32*9*6`(%rsp), %rdi lea `32*9*8+32*2`(%rsp), %rcx # temporary vector call avx2_sqr_x4 call avx2_normalize_n_store # H^2 = H^2 lea `32*9*3`(%rsp), %rsi lea `32*9*5`(%rsp), %rdi call avx2_sqr_x4 call avx2_normalize_n_store # H^3 = H^2*H lea `32*9*3`(%rsp), %rsi lea `32*9*5`(%rsp), %rdx lea `32*9*7`(%rsp), %rdi call avx2_mul_x4 call avx2_normalize_n_store # U2 = U1*H^2 lea `32*9*0`($a_ptr), %rsi lea `32*9*5`(%rsp), %rdx lea `32*9*0`(%rsp), %rdi call avx2_mul_x4 #call avx2_normalize `&STORE` # Hsqr = U2*2 #lea 32*9*0(%rsp), %rsi #lea 32*9*5(%rsp), %rdi #call avx2_mul_by2_x4 vpaddq $ACC0, $ACC0, 
$ACC0 # inlined avx2_mul_by2_x4 lea `32*9*5`(%rsp), %rdi vpaddq $ACC1, $ACC1, $ACC1 vpaddq $ACC2, $ACC2, $ACC2 vpaddq $ACC3, $ACC3, $ACC3 vpaddq $ACC4, $ACC4, $ACC4 vpaddq $ACC5, $ACC5, $ACC5 vpaddq $ACC6, $ACC6, $ACC6 vpaddq $ACC7, $ACC7, $ACC7 vpaddq $ACC8, $ACC8, $ACC8 call avx2_normalize_n_store # X3 = R^2 - H^3 #lea 32*9*6(%rsp), %rsi #lea 32*9*7(%rsp), %rdx #lea 32*9*5(%rsp), %rcx #lea 32*9*0($r_ptr), %rdi #call avx2_sub_x4 #NORMALIZE #STORE # X3 = X3 - U2*2 #lea 32*9*0($r_ptr), %rsi #lea 32*9*0($r_ptr), %rdi #call avx2_sub_x4 #NORMALIZE #STORE lea `32*9*6+128`(%rsp), %rsi lea .LAVX2_POLY_x2+128(%rip), %rax lea `32*9*7+128`(%rsp), %rdx lea `32*9*5+128`(%rsp), %rcx lea `32*9*0`($r_ptr), %rdi vmovdqa 32*0-128(%rsi), $ACC0 vmovdqa 32*1-128(%rsi), $ACC1 vmovdqa 32*2-128(%rsi), $ACC2 vmovdqa 32*3-128(%rsi), $ACC3 vmovdqa 32*4-128(%rsi), $ACC4 vmovdqa 32*5-128(%rsi), $ACC5 vmovdqa 32*6-128(%rsi), $ACC6 vmovdqa 32*7-128(%rsi), $ACC7 vmovdqa 32*8-128(%rsi), $ACC8 vpaddq 32*0-128(%rax), $ACC0, $ACC0 vpaddq 32*1-128(%rax), $ACC1, $ACC1 vpaddq 32*2-128(%rax), $ACC2, $ACC2 vpaddq 32*3-128(%rax), $ACC3, $ACC3 vpaddq 32*4-128(%rax), $ACC4, $ACC4 vpaddq 32*5-128(%rax), $ACC5, $ACC5 vpaddq 32*6-128(%rax), $ACC6, $ACC6 vpaddq 32*7-128(%rax), $ACC7, $ACC7 vpaddq 32*8-128(%rax), $ACC8, $ACC8 vpsubq 32*0-128(%rdx), $ACC0, $ACC0 vpsubq 32*1-128(%rdx), $ACC1, $ACC1 vpsubq 32*2-128(%rdx), $ACC2, $ACC2 vpsubq 32*3-128(%rdx), $ACC3, $ACC3 vpsubq 32*4-128(%rdx), $ACC4, $ACC4 vpsubq 32*5-128(%rdx), $ACC5, $ACC5 vpsubq 32*6-128(%rdx), $ACC6, $ACC6 vpsubq 32*7-128(%rdx), $ACC7, $ACC7 vpsubq 32*8-128(%rdx), $ACC8, $ACC8 vpsubq 32*0-128(%rcx), $ACC0, $ACC0 vpsubq 32*1-128(%rcx), $ACC1, $ACC1 vpsubq 32*2-128(%rcx), $ACC2, $ACC2 vpsubq 32*3-128(%rcx), $ACC3, $ACC3 vpsubq 32*4-128(%rcx), $ACC4, $ACC4 vpsubq 32*5-128(%rcx), $ACC5, $ACC5 vpsubq 32*6-128(%rcx), $ACC6, $ACC6 vpsubq 32*7-128(%rcx), $ACC7, $ACC7 vpsubq 32*8-128(%rcx), $ACC8, $ACC8 call avx2_normalize lea 32*0($b_ptr), %rsi lea 32*0($a_ptr), %rdx call avx2_select_n_store # H = U2 - X3 lea `32*9*0`(%rsp), %rsi lea `32*9*0`($r_ptr), %rdx lea `32*9*3`(%rsp), %rdi call avx2_sub_x4 call avx2_normalize_n_store # H = H*R lea `32*9*3`(%rsp), %rsi lea `32*9*4`(%rsp), %rdx lea `32*9*3`(%rsp), %rdi call avx2_mul_x4 call avx2_normalize_n_store # S2 = S1 * H^3 lea `32*9*7`(%rsp), %rsi lea `32*9*1`($a_ptr), %rdx lea `32*9*1`(%rsp), %rdi call avx2_mul_x4 call avx2_normalize_n_store # lea `32*9*3`(%rsp), %rsi lea `32*9*1`(%rsp), %rdx lea `32*9*1`($r_ptr), %rdi call avx2_sub_x4 call avx2_normalize lea 32*9($b_ptr), %rsi lea 32*9($a_ptr), %rdx call avx2_select_n_store #lea 32*9*0($r_ptr), %rsi #lea 32*9*0($r_ptr), %rdi #call avx2_mul_by1_x4 #NORMALIZE #STORE lea `32*9*1`($r_ptr), %rsi lea `32*9*1`($r_ptr), %rdi call avx2_mul_by1_x4 call avx2_normalize_n_store vzeroupper ___ $code.=<<___ if ($win64); movaps %xmm6, -16*10(%rbp) movaps %xmm7, -16*9(%rbp) movaps %xmm8, -16*8(%rbp) movaps %xmm9, -16*7(%rbp) movaps %xmm10, -16*6(%rbp) movaps %xmm11, -16*5(%rbp) movaps %xmm12, -16*4(%rbp) movaps %xmm13, -16*3(%rbp) movaps %xmm14, -16*2(%rbp) movaps %xmm15, -16*1(%rbp) ___ $code.=<<___; mov %rbp, %rsp pop %rbp ret .size ecp_nistz256_avx2_point_add_affines_x4,.-ecp_nistz256_avx2_point_add_affines_x4 ################################################################################ # void ecp_nistz256_avx2_to_mont(void* RESULTx4, void *Ax4); .globl ecp_nistz256_avx2_to_mont .type ecp_nistz256_avx2_to_mont,\@function,2 .align 32 ecp_nistz256_avx2_to_mont: vzeroupper ___ $code.=<<___ if 
($win64); lea -8-16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK lea .LTO_MONT_AVX2(%rip), %rdx call avx2_mul_x4 call avx2_normalize_n_store vzeroupper ___ $code.=<<___ if ($win64); movaps 16*0(%rsp), %xmm6 movaps 16*1(%rsp), %xmm7 movaps 16*2(%rsp), %xmm8 movaps 16*3(%rsp), %xmm9 movaps 16*4(%rsp), %xmm10 movaps 16*5(%rsp), %xmm11 movaps 16*6(%rsp), %xmm12 movaps 16*7(%rsp), %xmm13 movaps 16*8(%rsp), %xmm14 movaps 16*9(%rsp), %xmm15 lea 8+16*10(%rsp), %rsp ___ $code.=<<___; ret .size ecp_nistz256_avx2_to_mont,.-ecp_nistz256_avx2_to_mont ################################################################################ # void ecp_nistz256_avx2_from_mont(void* RESULTx4, void *Ax4); .globl ecp_nistz256_avx2_from_mont .type ecp_nistz256_avx2_from_mont,\@function,2 .align 32 ecp_nistz256_avx2_from_mont: vzeroupper ___ $code.=<<___ if ($win64); lea -8-16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; vmovdqa .LAVX2_AND_MASK(%rip), $AND_MASK lea .LFROM_MONT_AVX2(%rip), %rdx call avx2_mul_x4 call avx2_normalize_n_store vzeroupper ___ $code.=<<___ if ($win64); movaps 16*0(%rsp), %xmm6 movaps 16*1(%rsp), %xmm7 movaps 16*2(%rsp), %xmm8 movaps 16*3(%rsp), %xmm9 movaps 16*4(%rsp), %xmm10 movaps 16*5(%rsp), %xmm11 movaps 16*6(%rsp), %xmm12 movaps 16*7(%rsp), %xmm13 movaps 16*8(%rsp), %xmm14 movaps 16*9(%rsp), %xmm15 lea 8+16*10(%rsp), %rsp ___ $code.=<<___; ret .size ecp_nistz256_avx2_from_mont,.-ecp_nistz256_avx2_from_mont ################################################################################ # void ecp_nistz256_avx2_set1(void* RESULTx4); .globl ecp_nistz256_avx2_set1 .type ecp_nistz256_avx2_set1,\@function,1 .align 32 ecp_nistz256_avx2_set1: lea .LONE+128(%rip), %rax lea 128(%rdi), %rdi vzeroupper vmovdqa 32*0-128(%rax), %ymm0 vmovdqa 32*1-128(%rax), %ymm1 vmovdqa 32*2-128(%rax), %ymm2 vmovdqa 32*3-128(%rax), %ymm3 vmovdqa 32*4-128(%rax), %ymm4 vmovdqa 32*5-128(%rax), %ymm5 vmovdqa %ymm0, 32*0-128(%rdi) vmovdqa 32*6-128(%rax), %ymm0 vmovdqa %ymm1, 32*1-128(%rdi) vmovdqa 32*7-128(%rax), %ymm1 vmovdqa %ymm2, 32*2-128(%rdi) vmovdqa 32*8-128(%rax), %ymm2 vmovdqa %ymm3, 32*3-128(%rdi) vmovdqa %ymm4, 32*4-128(%rdi) vmovdqa %ymm5, 32*5-128(%rdi) vmovdqa %ymm0, 32*6-128(%rdi) vmovdqa %ymm1, 32*7-128(%rdi) vmovdqa %ymm2, 32*8-128(%rdi) vzeroupper ret .size ecp_nistz256_avx2_set1,.-ecp_nistz256_avx2_set1 ___ } { ################################################################################ # void ecp_nistz256_avx2_multi_gather_w7(void* RESULT, void *in, # int index0, int index1, int index2, int index3); ################################################################################ my ($val,$in_t,$index0,$index1,$index2,$index3)=("%rdi","%rsi","%edx","%ecx","%r8d","%r9d"); my ($INDEX0,$INDEX1,$INDEX2,$INDEX3)=map("%ymm$_",(0..3)); my ($R0a,$R0b,$R1a,$R1b,$R2a,$R2b,$R3a,$R3b)=map("%ymm$_",(4..11)); my ($M0,$T0,$T1,$TMP0)=map("%ymm$_",(12..15)); $code.=<<___; .globl ecp_nistz256_avx2_multi_gather_w7 
.type ecp_nistz256_avx2_multi_gather_w7,\@function,6 .align 32 ecp_nistz256_avx2_multi_gather_w7: vzeroupper ___ $code.=<<___ if ($win64); lea -8-16*10(%rsp), %rsp vmovaps %xmm6, -8-16*10(%rax) vmovaps %xmm7, -8-16*9(%rax) vmovaps %xmm8, -8-16*8(%rax) vmovaps %xmm9, -8-16*7(%rax) vmovaps %xmm10, -8-16*6(%rax) vmovaps %xmm11, -8-16*5(%rax) vmovaps %xmm12, -8-16*4(%rax) vmovaps %xmm13, -8-16*3(%rax) vmovaps %xmm14, -8-16*2(%rax) vmovaps %xmm15, -8-16*1(%rax) ___ $code.=<<___; lea .LIntOne(%rip), %rax vmovd $index0, %xmm0 vmovd $index1, %xmm1 vmovd $index2, %xmm2 vmovd $index3, %xmm3 vpxor $R0a, $R0a, $R0a vpxor $R0b, $R0b, $R0b vpxor $R1a, $R1a, $R1a vpxor $R1b, $R1b, $R1b vpxor $R2a, $R2a, $R2a vpxor $R2b, $R2b, $R2b vpxor $R3a, $R3a, $R3a vpxor $R3b, $R3b, $R3b vmovdqa (%rax), $M0 vpermd $INDEX0, $R0a, $INDEX0 vpermd $INDEX1, $R0a, $INDEX1 vpermd $INDEX2, $R0a, $INDEX2 vpermd $INDEX3, $R0a, $INDEX3 mov \$64, %ecx lea 112($val), $val # size optimization jmp .Lmulti_select_loop_avx2 # INDEX=0, corresponds to the point at infty (0,0) .align 32 .Lmulti_select_loop_avx2: vpcmpeqd $INDEX0, $M0, $TMP0 vmovdqa `32*0+32*64*2*0`($in_t), $T0 vmovdqa `32*1+32*64*2*0`($in_t), $T1 vpand $TMP0, $T0, $T0 vpand $TMP0, $T1, $T1 vpxor $T0, $R0a, $R0a vpxor $T1, $R0b, $R0b vpcmpeqd $INDEX1, $M0, $TMP0 vmovdqa `32*0+32*64*2*1`($in_t), $T0 vmovdqa `32*1+32*64*2*1`($in_t), $T1 vpand $TMP0, $T0, $T0 vpand $TMP0, $T1, $T1 vpxor $T0, $R1a, $R1a vpxor $T1, $R1b, $R1b vpcmpeqd $INDEX2, $M0, $TMP0 vmovdqa `32*0+32*64*2*2`($in_t), $T0 vmovdqa `32*1+32*64*2*2`($in_t), $T1 vpand $TMP0, $T0, $T0 vpand $TMP0, $T1, $T1 vpxor $T0, $R2a, $R2a vpxor $T1, $R2b, $R2b vpcmpeqd $INDEX3, $M0, $TMP0 vmovdqa `32*0+32*64*2*3`($in_t), $T0 vmovdqa `32*1+32*64*2*3`($in_t), $T1 vpand $TMP0, $T0, $T0 vpand $TMP0, $T1, $T1 vpxor $T0, $R3a, $R3a vpxor $T1, $R3b, $R3b vpaddd (%rax), $M0, $M0 # increment lea 32*2($in_t), $in_t dec %ecx jnz .Lmulti_select_loop_avx2 vmovdqu $R0a, 32*0-112($val) vmovdqu $R0b, 32*1-112($val) vmovdqu $R1a, 32*2-112($val) vmovdqu $R1b, 32*3-112($val) vmovdqu $R2a, 32*4-112($val) vmovdqu $R2b, 32*5-112($val) vmovdqu $R3a, 32*6-112($val) vmovdqu $R3b, 32*7-112($val) vzeroupper ___ $code.=<<___ if ($win64); movaps 16*0(%rsp), %xmm6 movaps 16*1(%rsp), %xmm7 movaps 16*2(%rsp), %xmm8 movaps 16*3(%rsp), %xmm9 movaps 16*4(%rsp), %xmm10 movaps 16*5(%rsp), %xmm11 movaps 16*6(%rsp), %xmm12 movaps 16*7(%rsp), %xmm13 movaps 16*8(%rsp), %xmm14 movaps 16*9(%rsp), %xmm15 lea 8+16*10(%rsp), %rsp ___ $code.=<<___; ret .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7 .extern OPENSSL_ia32cap_P .globl ecp_nistz_avx2_eligible .type ecp_nistz_avx2_eligible,\@abi-omnipotent .align 32 ecp_nistz_avx2_eligible: mov OPENSSL_ia32cap_P+8(%rip),%eax shr \$5,%eax and \$1,%eax ret .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible ___ } }} else {{ # assembler is too old $code.=<<___; .text .globl ecp_nistz256_avx2_transpose_convert .globl ecp_nistz256_avx2_convert_transpose_back .globl ecp_nistz256_avx2_point_add_affine_x4 .globl ecp_nistz256_avx2_point_add_affines_x4 .globl ecp_nistz256_avx2_to_mont .globl ecp_nistz256_avx2_from_mont .globl ecp_nistz256_avx2_set1 .globl ecp_nistz256_avx2_multi_gather_w7 .type ecp_nistz256_avx2_multi_gather_w7,\@abi-omnipotent ecp_nistz256_avx2_transpose_convert: ecp_nistz256_avx2_convert_transpose_back: ecp_nistz256_avx2_point_add_affine_x4: ecp_nistz256_avx2_point_add_affines_x4: ecp_nistz256_avx2_to_mont: ecp_nistz256_avx2_from_mont: ecp_nistz256_avx2_set1: 
ecp_nistz256_avx2_multi_gather_w7: .byte 0x0f,0x0b # ud2 ret .size ecp_nistz256_avx2_multi_gather_w7,.-ecp_nistz256_avx2_multi_gather_w7 .globl ecp_nistz_avx2_eligible .type ecp_nistz_avx2_eligible,\@abi-omnipotent ecp_nistz_avx2_eligible: xor %eax,%eax ret .size ecp_nistz_avx2_eligible,.-ecp_nistz_avx2_eligible ___ }} foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/geo; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/ec/asm/ecp_nistz256-x86_64.pl (revision 364963) @@ -1,4739 +1,4739 @@ #! /usr/bin/env perl # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. # Copyright (c) 2014, Intel Corporation. All Rights Reserved. # Copyright (c) 2015 CloudFlare, Inc. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1, 3) # (1) Intel Corporation, Israel Development Center, Haifa, Israel # (2) University of Haifa, Israel # (3) CloudFlare, Inc. # # Reference: # S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with # 256 Bit Primes" # Further optimization by : # # this/original with/without -DECP_NISTZ256_ASM(*) # Opteron +15-49% +150-195% # Bulldozer +18-45% +175-240% # P4 +24-46% +100-150% # Westmere +18-34% +87-160% # Sandy Bridge +14-35% +120-185% # Ivy Bridge +11-35% +125-180% # Haswell +10-37% +160-200% # Broadwell +24-58% +210-270% # Atom +20-50% +180-240% # VIA Nano +50-160% +480-480% # # (*) "without -DECP_NISTZ256_ASM" refers to build with # "enable-ec_nistp_64_gcc_128"; # # Ranges denote minimum and maximum improvement coefficients depending # on benchmark. In "this/original" column lower coefficient is for # ECDSA sign, while in "with/without" - for ECDH key agreement, and # higher - for ECDSA sign, relatively fastest server-side operation. # Keep in mind that +100% means 2x improvement. 
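The hunk a little further below drops the ^ anchor from the clang/LLVM compiler-version probe. A minimal sketch of why that matters, assuming a vendor-prefixed cc -v banner such as the one FreeBSD's clang prints (the exact banner string here is only an example):

    my $banner = "FreeBSD clang version 10.0.1";                         # sample cc -v line
    printf "old: %d\n", ($banner =~ /(?:^clang|LLVM) version/) ? 1 : 0;  # 0, anchored pattern misses
    printf "new: %d\n", ($banner =~ /(?:clang|LLVM) version/)  ? 1 : 0;  # 1, unanchored pattern matches

With the anchored pattern the probe never sees the compiler version, so $avx and $addx stay 0 and the AVX/ADX code paths in this file are never emitted; dropping the anchor restores them.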
$flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); $addx = ($1>=2.23); } if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); $addx = ($1>=2.10); } if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $avx = ($ver>=3.0) + ($ver>=3.01); $addx = ($ver>=3.03); } $code.=<<___; .text .extern OPENSSL_ia32cap_P # The polynomial .align 64 .Lpoly: .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 # 2^512 mod P precomputed for NIST P256 polynomial .LRR: .quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd .LOne: .long 1,1,1,1,1,1,1,1 .LTwo: .long 2,2,2,2,2,2,2,2 .LThree: .long 3,3,3,3,3,3,3,3 .LONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe # Constants for computations modulo ord(p256) .Lord: .quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 .LordK: .quad 0xccd1c8aaee00bc4f ___ { ################################################################################ # void ecp_nistz256_mul_by_2(uint64_t res[4], uint64_t a[4]); my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11)); my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13"); my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx"); $code.=<<___; .globl ecp_nistz256_mul_by_2 .type ecp_nistz256_mul_by_2,\@function,2 .align 64 ecp_nistz256_mul_by_2: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Lmul_by_2_body: mov 8*0($a_ptr), $a0 xor $t4,$t4 mov 8*1($a_ptr), $a1 add $a0, $a0 # a0:a3+a0:a3 mov 8*2($a_ptr), $a2 adc $a1, $a1 mov 8*3($a_ptr), $a3 lea .Lpoly(%rip), $a_ptr mov $a0, $t0 adc $a2, $a2 adc $a3, $a3 mov $a1, $t1 adc \$0, $t4 sub 8*0($a_ptr), $a0 mov $a2, $t2 sbb 8*1($a_ptr), $a1 sbb 8*2($a_ptr), $a2 mov $a3, $t3 sbb 8*3($a_ptr), $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lmul_by_2_epilogue: ret .cfi_endproc .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 ################################################################################ # void ecp_nistz256_div_by_2(uint64_t res[4], uint64_t a[4]); .globl ecp_nistz256_div_by_2 .type ecp_nistz256_div_by_2,\@function,2 .align 32 ecp_nistz256_div_by_2: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Ldiv_by_2_body: mov 8*0($a_ptr), $a0 mov 8*1($a_ptr), $a1 mov 8*2($a_ptr), $a2 mov $a0, $t0 mov 8*3($a_ptr), $a3 lea 
.Lpoly(%rip), $a_ptr mov $a1, $t1 xor $t4, $t4 add 8*0($a_ptr), $a0 mov $a2, $t2 adc 8*1($a_ptr), $a1 adc 8*2($a_ptr), $a2 mov $a3, $t3 adc 8*3($a_ptr), $a3 adc \$0, $t4 xor $a_ptr, $a_ptr # borrow $a_ptr test \$1, $t0 cmovz $t0, $a0 cmovz $t1, $a1 cmovz $t2, $a2 cmovz $t3, $a3 cmovz $a_ptr, $t4 mov $a1, $t0 # a0:a3>>1 shr \$1, $a0 shl \$63, $t0 mov $a2, $t1 shr \$1, $a1 or $t0, $a0 shl \$63, $t1 mov $a3, $t2 shr \$1, $a2 or $t1, $a1 shl \$63, $t2 shr \$1, $a3 shl \$63, $t4 or $t2, $a2 or $t4, $a3 mov $a0, 8*0($r_ptr) mov $a1, 8*1($r_ptr) mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Ldiv_by_2_epilogue: ret .cfi_endproc .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 ################################################################################ # void ecp_nistz256_mul_by_3(uint64_t res[4], uint64_t a[4]); .globl ecp_nistz256_mul_by_3 .type ecp_nistz256_mul_by_3,\@function,2 .align 32 ecp_nistz256_mul_by_3: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Lmul_by_3_body: mov 8*0($a_ptr), $a0 xor $t4, $t4 mov 8*1($a_ptr), $a1 add $a0, $a0 # a0:a3+a0:a3 mov 8*2($a_ptr), $a2 adc $a1, $a1 mov 8*3($a_ptr), $a3 mov $a0, $t0 adc $a2, $a2 adc $a3, $a3 mov $a1, $t1 adc \$0, $t4 sub \$-1, $a0 mov $a2, $t2 sbb .Lpoly+8*1(%rip), $a1 sbb \$0, $a2 mov $a3, $t3 sbb .Lpoly+8*3(%rip), $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 cmovc $t2, $a2 cmovc $t3, $a3 xor $t4, $t4 add 8*0($a_ptr), $a0 # a0:a3+=a_ptr[0:3] adc 8*1($a_ptr), $a1 mov $a0, $t0 adc 8*2($a_ptr), $a2 adc 8*3($a_ptr), $a3 mov $a1, $t1 adc \$0, $t4 sub \$-1, $a0 mov $a2, $t2 sbb .Lpoly+8*1(%rip), $a1 sbb \$0, $a2 mov $a3, $t3 sbb .Lpoly+8*3(%rip), $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lmul_by_3_epilogue: ret .cfi_endproc .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 ################################################################################ # void ecp_nistz256_add(uint64_t res[4], uint64_t a[4], uint64_t b[4]); .globl ecp_nistz256_add .type ecp_nistz256_add,\@function,3 .align 32 ecp_nistz256_add: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Ladd_body: mov 8*0($a_ptr), $a0 xor $t4, $t4 mov 8*1($a_ptr), $a1 mov 8*2($a_ptr), $a2 mov 8*3($a_ptr), $a3 lea .Lpoly(%rip), $a_ptr add 8*0($b_ptr), $a0 adc 8*1($b_ptr), $a1 mov $a0, $t0 adc 8*2($b_ptr), $a2 adc 8*3($b_ptr), $a3 mov $a1, $t1 adc \$0, $t4 sub 8*0($a_ptr), $a0 mov $a2, $t2 sbb 8*1($a_ptr), $a1 sbb 8*2($a_ptr), $a2 mov $a3, $t3 sbb 8*3($a_ptr), $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Ladd_epilogue: ret .cfi_endproc .size ecp_nistz256_add,.-ecp_nistz256_add ################################################################################ # void ecp_nistz256_sub(uint64_t res[4], uint64_t a[4], uint64_t b[4]); .globl ecp_nistz256_sub .type ecp_nistz256_sub,\@function,3 .align 32 ecp_nistz256_sub: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Lsub_body: mov 8*0($a_ptr), $a0 xor $t4, $t4 mov 8*1($a_ptr), $a1 mov 8*2($a_ptr), $a2 mov 8*3($a_ptr), 
$a3 lea .Lpoly(%rip), $a_ptr sub 8*0($b_ptr), $a0 sbb 8*1($b_ptr), $a1 mov $a0, $t0 sbb 8*2($b_ptr), $a2 sbb 8*3($b_ptr), $a3 mov $a1, $t1 sbb \$0, $t4 add 8*0($a_ptr), $a0 mov $a2, $t2 adc 8*1($a_ptr), $a1 adc 8*2($a_ptr), $a2 mov $a3, $t3 adc 8*3($a_ptr), $a3 test $t4, $t4 cmovz $t0, $a0 cmovz $t1, $a1 mov $a0, 8*0($r_ptr) cmovz $t2, $a2 mov $a1, 8*1($r_ptr) cmovz $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lsub_epilogue: ret .cfi_endproc .size ecp_nistz256_sub,.-ecp_nistz256_sub ################################################################################ # void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]); .globl ecp_nistz256_neg .type ecp_nistz256_neg,\@function,2 .align 32 ecp_nistz256_neg: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Lneg_body: xor $a0, $a0 xor $a1, $a1 xor $a2, $a2 xor $a3, $a3 xor $t4, $t4 sub 8*0($a_ptr), $a0 sbb 8*1($a_ptr), $a1 sbb 8*2($a_ptr), $a2 mov $a0, $t0 sbb 8*3($a_ptr), $a3 lea .Lpoly(%rip), $a_ptr mov $a1, $t1 sbb \$0, $t4 add 8*0($a_ptr), $a0 mov $a2, $t2 adc 8*1($a_ptr), $a1 adc 8*2($a_ptr), $a2 mov $a3, $t3 adc 8*3($a_ptr), $a3 test $t4, $t4 cmovz $t0, $a0 cmovz $t1, $a1 mov $a0, 8*0($r_ptr) cmovz $t2, $a2 mov $a1, 8*1($r_ptr) cmovz $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lneg_epilogue: ret .cfi_endproc .size ecp_nistz256_neg,.-ecp_nistz256_neg ___ } { my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax"); my ($poly1,$poly3)=($acc6,$acc7); $code.=<<___; ################################################################################ # void ecp_nistz256_ord_mul_mont( # uint64_t res[4], # uint64_t a[4], # uint64_t b[4]); .globl ecp_nistz256_ord_mul_mont .type ecp_nistz256_ord_mul_mont,\@function,3 .align 32 ecp_nistz256_ord_mul_mont: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx cmp \$0x80100, %ecx je .Lecp_nistz256_ord_mul_montx ___ $code.=<<___; push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lord_mul_body: mov 8*0($b_org), %rax mov $b_org, $b_ptr lea .Lord(%rip), %r14 mov .LordK(%rip), %r15 ################################# * b[0] mov %rax, $t0 mulq 8*0($a_ptr) mov %rax, $acc0 mov $t0, %rax mov %rdx, $acc1 mulq 8*1($a_ptr) add %rax, $acc1 mov $t0, %rax adc \$0, %rdx mov %rdx, $acc2 mulq 8*2($a_ptr) add %rax, $acc2 mov $t0, %rax adc \$0, %rdx mov $acc0, $acc5 imulq %r15,$acc0 mov %rdx, $acc3 mulq 8*3($a_ptr) add %rax, $acc3 mov $acc0, %rax adc \$0, %rdx mov %rdx, $acc4 ################################# First reduction step mulq 8*0(%r14) mov $acc0, $t1 add %rax, $acc5 # guaranteed to be zero mov $acc0, %rax adc \$0, %rdx mov %rdx, $t0 sub $acc0, $acc2 sbb \$0, $acc0 # can't borrow mulq 8*1(%r14) add $t0, $acc1 adc \$0, %rdx add %rax, $acc1 mov $t1, %rax adc %rdx, $acc2 mov $t1, %rdx adc \$0, $acc0 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc3 mov 8*1($b_ptr), %rax sbb %rdx, $t1 # can't borrow add $acc0, $acc3 adc $t1, $acc4 adc \$0, $acc5 ################################# * b[1] mov %rax, $t0 mulq 8*0($a_ptr) add %rax, $acc1 mov $t0, %rax adc \$0, %rdx mov %rdx, 
$t1 mulq 8*1($a_ptr) add $t1, $acc2 adc \$0, %rdx add %rax, $acc2 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*2($a_ptr) add $t1, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t0, %rax adc \$0, %rdx mov $acc1, $t0 imulq %r15, $acc1 mov %rdx, $t1 mulq 8*3($a_ptr) add $t1, $acc4 adc \$0, %rdx xor $acc0, $acc0 add %rax, $acc4 mov $acc1, %rax adc %rdx, $acc5 adc \$0, $acc0 ################################# Second reduction step mulq 8*0(%r14) mov $acc1, $t1 add %rax, $t0 # guaranteed to be zero mov $acc1, %rax adc %rdx, $t0 sub $acc1, $acc3 sbb \$0, $acc1 # can't borrow mulq 8*1(%r14) add $t0, $acc2 adc \$0, %rdx add %rax, $acc2 mov $t1, %rax adc %rdx, $acc3 mov $t1, %rdx adc \$0, $acc1 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc4 mov 8*2($b_ptr), %rax sbb %rdx, $t1 # can't borrow add $acc1, $acc4 adc $t1, $acc5 adc \$0, $acc0 ################################## * b[2] mov %rax, $t0 mulq 8*0($a_ptr) add %rax, $acc2 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*1($a_ptr) add $t1, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*2($a_ptr) add $t1, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t0, %rax adc \$0, %rdx mov $acc2, $t0 imulq %r15, $acc2 mov %rdx, $t1 mulq 8*3($a_ptr) add $t1, $acc5 adc \$0, %rdx xor $acc1, $acc1 add %rax, $acc5 mov $acc2, %rax adc %rdx, $acc0 adc \$0, $acc1 ################################# Third reduction step mulq 8*0(%r14) mov $acc2, $t1 add %rax, $t0 # guaranteed to be zero mov $acc2, %rax adc %rdx, $t0 sub $acc2, $acc4 sbb \$0, $acc2 # can't borrow mulq 8*1(%r14) add $t0, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t1, %rax adc %rdx, $acc4 mov $t1, %rdx adc \$0, $acc2 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc5 mov 8*3($b_ptr), %rax sbb %rdx, $t1 # can't borrow add $acc2, $acc5 adc $t1, $acc0 adc \$0, $acc1 ################################# * b[3] mov %rax, $t0 mulq 8*0($a_ptr) add %rax, $acc3 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*1($a_ptr) add $t1, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t0, %rax adc \$0, %rdx mov %rdx, $t1 mulq 8*2($a_ptr) add $t1, $acc5 adc \$0, %rdx add %rax, $acc5 mov $t0, %rax adc \$0, %rdx mov $acc3, $t0 imulq %r15, $acc3 mov %rdx, $t1 mulq 8*3($a_ptr) add $t1, $acc0 adc \$0, %rdx xor $acc2, $acc2 add %rax, $acc0 mov $acc3, %rax adc %rdx, $acc1 adc \$0, $acc2 ################################# Last reduction step mulq 8*0(%r14) mov $acc3, $t1 add %rax, $t0 # guaranteed to be zero mov $acc3, %rax adc %rdx, $t0 sub $acc3, $acc5 sbb \$0, $acc3 # can't borrow mulq 8*1(%r14) add $t0, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t1, %rax adc %rdx, $acc5 mov $t1, %rdx adc \$0, $acc3 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc0 sbb %rdx, $t1 # can't borrow add $acc3, $acc0 adc $t1, $acc1 adc \$0, $acc2 ################################# Subtract ord mov $acc4, $a_ptr sub 8*0(%r14), $acc4 mov $acc5, $acc3 sbb 8*1(%r14), $acc5 mov $acc0, $t0 sbb 8*2(%r14), $acc0 mov $acc1, $t1 sbb 8*3(%r14), $acc1 sbb \$0, $acc2 cmovc $a_ptr, $acc4 cmovc $acc3, $acc5 cmovc $t0, $acc0 cmovc $t1, $acc1 mov $acc4, 8*0($r_ptr) mov $acc5, 8*1($r_ptr) mov $acc0, 8*2($r_ptr) mov $acc1, 8*3($r_ptr) mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_mul_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont 
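# Note on the ord_* reduction used above and below: the register loaded from .LordK holds the Montgomery constant -ord^-1 mod 2^64 for the group order, so multiplying the low accumulator limb by it yields the factor whose multiple of ord cancels that limb modulo 2^64 (hence the 'guaranteed to be zero' additions in each reduction step).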
################################################################################ # void ecp_nistz256_ord_sqr_mont( # uint64_t res[4], # uint64_t a[4], # int rep); .globl ecp_nistz256_ord_sqr_mont .type ecp_nistz256_ord_sqr_mont,\@function,3 .align 32 ecp_nistz256_ord_sqr_mont: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx cmp \$0x80100, %ecx je .Lecp_nistz256_ord_sqr_montx ___ $code.=<<___; push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lord_sqr_body: mov 8*0($a_ptr), $acc0 mov 8*1($a_ptr), %rax mov 8*2($a_ptr), $acc6 mov 8*3($a_ptr), $acc7 lea .Lord(%rip), $a_ptr # pointer to modulus mov $b_org, $b_ptr jmp .Loop_ord_sqr .align 32 .Loop_ord_sqr: ################################# a[1:] * a[0] mov %rax, $t1 # put aside a[1] mul $acc0 # a[1] * a[0] mov %rax, $acc1 movq $t1, %xmm1 # offload a[1] mov $acc6, %rax mov %rdx, $acc2 mul $acc0 # a[2] * a[0] add %rax, $acc2 mov $acc7, %rax movq $acc6, %xmm2 # offload a[2] adc \$0, %rdx mov %rdx, $acc3 mul $acc0 # a[3] * a[0] add %rax, $acc3 mov $acc7, %rax movq $acc7, %xmm3 # offload a[3] adc \$0, %rdx mov %rdx, $acc4 ################################# a[3] * a[2] mul $acc6 # a[3] * a[2] mov %rax, $acc5 mov $acc6, %rax mov %rdx, $acc6 ################################# a[2:] * a[1] mul $t1 # a[2] * a[1] add %rax, $acc3 mov $acc7, %rax adc \$0, %rdx mov %rdx, $acc7 mul $t1 # a[3] * a[1] add %rax, $acc4 adc \$0, %rdx add $acc7, $acc4 adc %rdx, $acc5 adc \$0, $acc6 # can't overflow ################################# *2 xor $acc7, $acc7 mov $acc0, %rax add $acc1, $acc1 adc $acc2, $acc2 adc $acc3, $acc3 adc $acc4, $acc4 adc $acc5, $acc5 adc $acc6, $acc6 adc \$0, $acc7 ################################# Missing products mul %rax # a[0] * a[0] mov %rax, $acc0 movq %xmm1, %rax mov %rdx, $t1 mul %rax # a[1] * a[1] add $t1, $acc1 adc %rax, $acc2 movq %xmm2, %rax adc \$0, %rdx mov %rdx, $t1 mul %rax # a[2] * a[2] add $t1, $acc3 adc %rax, $acc4 movq %xmm3, %rax adc \$0, %rdx mov %rdx, $t1 mov $acc0, $t0 imulq 8*4($a_ptr), $acc0 # *= .LordK mul %rax # a[3] * a[3] add $t1, $acc5 adc %rax, $acc6 mov 8*0($a_ptr), %rax # modulus[0] adc %rdx, $acc7 # can't overflow ################################# First reduction step mul $acc0 mov $acc0, $t1 add %rax, $t0 # guaranteed to be zero mov 8*1($a_ptr), %rax # modulus[1] adc %rdx, $t0 sub $acc0, $acc2 sbb \$0, $t1 # can't borrow mul $acc0 add $t0, $acc1 adc \$0, %rdx add %rax, $acc1 mov $acc0, %rax adc %rdx, $acc2 mov $acc0, %rdx adc \$0, $t1 # can't overflow mov $acc1, $t0 imulq 8*4($a_ptr), $acc1 # *= .LordK shl \$32, %rax shr \$32, %rdx sub %rax, $acc3 mov 8*0($a_ptr), %rax sbb %rdx, $acc0 # can't borrow add $t1, $acc3 adc \$0, $acc0 # can't overflow ################################# Second reduction step mul $acc1 mov $acc1, $t1 add %rax, $t0 # guaranteed to be zero mov 8*1($a_ptr), %rax adc %rdx, $t0 sub $acc1, $acc3 sbb \$0, $t1 # can't borrow mul $acc1 add $t0, $acc2 adc \$0, %rdx add %rax, $acc2 mov $acc1, %rax adc %rdx, $acc3 mov $acc1, %rdx adc \$0, $t1 # can't overflow mov $acc2, $t0 imulq 8*4($a_ptr), $acc2 # *= .LordK shl \$32, %rax shr \$32, %rdx sub %rax, $acc0 mov 8*0($a_ptr), %rax sbb %rdx, $acc1 # can't borrow add $t1, $acc0 adc \$0, $acc1 # can't overflow ################################# Third reduction step mul $acc2 mov $acc2, $t1 add %rax, $t0 # guaranteed to be zero mov 8*1($a_ptr), %rax adc %rdx, $t0 sub $acc2, $acc0 sbb \$0, $t1 # can't borrow mul 
$acc2 add $t0, $acc3 adc \$0, %rdx add %rax, $acc3 mov $acc2, %rax adc %rdx, $acc0 mov $acc2, %rdx adc \$0, $t1 # can't overflow mov $acc3, $t0 imulq 8*4($a_ptr), $acc3 # *= .LordK shl \$32, %rax shr \$32, %rdx sub %rax, $acc1 mov 8*0($a_ptr), %rax sbb %rdx, $acc2 # can't borrow add $t1, $acc1 adc \$0, $acc2 # can't overflow ################################# Last reduction step mul $acc3 mov $acc3, $t1 add %rax, $t0 # guaranteed to be zero mov 8*1($a_ptr), %rax adc %rdx, $t0 sub $acc3, $acc1 sbb \$0, $t1 # can't borrow mul $acc3 add $t0, $acc0 adc \$0, %rdx add %rax, $acc0 mov $acc3, %rax adc %rdx, $acc1 mov $acc3, %rdx adc \$0, $t1 # can't overflow shl \$32, %rax shr \$32, %rdx sub %rax, $acc2 sbb %rdx, $acc3 # can't borrow add $t1, $acc2 adc \$0, $acc3 # can't overflow ################################# Add bits [511:256] of the sqr result xor %rdx, %rdx add $acc4, $acc0 adc $acc5, $acc1 mov $acc0, $acc4 adc $acc6, $acc2 adc $acc7, $acc3 mov $acc1, %rax adc \$0, %rdx ################################# Compare to modulus sub 8*0($a_ptr), $acc0 mov $acc2, $acc6 sbb 8*1($a_ptr), $acc1 sbb 8*2($a_ptr), $acc2 mov $acc3, $acc7 sbb 8*3($a_ptr), $acc3 sbb \$0, %rdx cmovc $acc4, $acc0 cmovnc $acc1, %rax cmovnc $acc2, $acc6 cmovnc $acc3, $acc7 dec $b_ptr jnz .Loop_ord_sqr mov $acc0, 8*0($r_ptr) mov %rax, 8*1($r_ptr) pxor %xmm1, %xmm1 mov $acc6, 8*2($r_ptr) pxor %xmm2, %xmm2 mov $acc7, 8*3($r_ptr) pxor %xmm3, %xmm3 mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_sqr_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont ___ $code.=<<___ if ($addx); ################################################################################ .type ecp_nistz256_ord_mul_montx,\@function,3 .align 32 ecp_nistz256_ord_mul_montx: .cfi_startproc .Lecp_nistz256_ord_mul_montx: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lord_mulx_body: mov $b_org, $b_ptr mov 8*0($b_org), %rdx mov 8*0($a_ptr), $acc1 mov 8*1($a_ptr), $acc2 mov 8*2($a_ptr), $acc3 mov 8*3($a_ptr), $acc4 lea -128($a_ptr), $a_ptr # control u-op density lea .Lord-128(%rip), %r14 mov .LordK(%rip), %r15 ################################# Multiply by b[0] mulx $acc1, $acc0, $acc1 mulx $acc2, $t0, $acc2 mulx $acc3, $t1, $acc3 add $t0, $acc1 mulx $acc4, $t0, $acc4 mov $acc0, %rdx mulx %r15, %rdx, %rax adc $t1, $acc2 adc $t0, $acc3 adc \$0, $acc4 ################################# reduction xor $acc5, $acc5 # $acc5=0, cf=0, of=0 mulx 8*0+128(%r14), $t0, $t1 adcx $t0, $acc0 # guaranteed to be zero adox $t1, $acc1 mulx 8*1+128(%r14), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 mulx 8*2+128(%r14), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*3+128(%r14), $t0, $t1 mov 8*1($b_ptr), %rdx adcx $t0, $acc3 adox $t1, $acc4 adcx $acc0, $acc4 adox $acc0, $acc5 adc \$0, $acc5 # cf=0, of=0 ################################# Multiply by b[1] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc1, %rdx mulx %r15, %rdx, %rax adcx $t0, $acc4 adox $t1, $acc5 adcx $acc0, $acc5 adox $acc0, $acc0 adc \$0, $acc0 # cf=0, of=0 ################################# reduction mulx 
8*0+128(%r14), $t0, $t1 adcx $t0, $acc1 # guaranteed to be zero adox $t1, $acc2 mulx 8*1+128(%r14), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*2+128(%r14), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*3+128(%r14), $t0, $t1 mov 8*2($b_ptr), %rdx adcx $t0, $acc4 adox $t1, $acc5 adcx $acc1, $acc5 adox $acc1, $acc0 adc \$0, $acc0 # cf=0, of=0 ################################# Multiply by b[2] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc2, %rdx mulx %r15, %rdx, %rax adcx $t0, $acc5 adox $t1, $acc0 adcx $acc1, $acc0 adox $acc1, $acc1 adc \$0, $acc1 # cf=0, of=0 ################################# reduction mulx 8*0+128(%r14), $t0, $t1 adcx $t0, $acc2 # guaranteed to be zero adox $t1, $acc3 mulx 8*1+128(%r14), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*2+128(%r14), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*3+128(%r14), $t0, $t1 mov 8*3($b_ptr), %rdx adcx $t0, $acc5 adox $t1, $acc0 adcx $acc2, $acc0 adox $acc2, $acc1 adc \$0, $acc1 # cf=0, of=0 ################################# Multiply by b[3] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc5 adox $t1, $acc0 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc3, %rdx mulx %r15, %rdx, %rax adcx $t0, $acc0 adox $t1, $acc1 adcx $acc2, $acc1 adox $acc2, $acc2 adc \$0, $acc2 # cf=0, of=0 ################################# reduction mulx 8*0+128(%r14), $t0, $t1 adcx $t0, $acc3 # guaranteed to be zero adox $t1, $acc4 mulx 8*1+128(%r14), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*2+128(%r14), $t0, $t1 adcx $t0, $acc5 adox $t1, $acc0 mulx 8*3+128(%r14), $t0, $t1 lea 128(%r14),%r14 mov $acc4, $t2 adcx $t0, $acc0 adox $t1, $acc1 mov $acc5, $t3 adcx $acc3, $acc1 adox $acc3, $acc2 adc \$0, $acc2 ################################# # Branch-less conditional subtraction of P mov $acc0, $t0 sub 8*0(%r14), $acc4 sbb 8*1(%r14), $acc5 sbb 8*2(%r14), $acc0 mov $acc1, $t1 sbb 8*3(%r14), $acc1 sbb \$0, $acc2 cmovc $t2, $acc4 cmovc $t3, $acc5 cmovc $t0, $acc0 cmovc $t1, $acc1 mov $acc4, 8*0($r_ptr) mov $acc5, 8*1($r_ptr) mov $acc0, 8*2($r_ptr) mov $acc1, 8*3($r_ptr) mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_mulx_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx .type ecp_nistz256_ord_sqr_montx,\@function,3 .align 32 ecp_nistz256_ord_sqr_montx: .cfi_startproc .Lecp_nistz256_ord_sqr_montx: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lord_sqrx_body: mov $b_org, $b_ptr mov 8*0($a_ptr), %rdx mov 8*1($a_ptr), $acc6 mov 8*2($a_ptr), $acc7 mov 8*3($a_ptr), $acc0 lea .Lord(%rip), $a_ptr jmp .Loop_ord_sqrx .align 32 .Loop_ord_sqrx: mulx $acc6, $acc1, $acc2 # a[0]*a[1] mulx $acc7, $t0, $acc3 # a[0]*a[2] mov %rdx, %rax # offload a[0] movq $acc6, %xmm1 # offload a[1] mulx $acc0, $t1, $acc4 # a[0]*a[3] mov $acc6, %rdx add $t0, $acc2 movq $acc7, %xmm2 # offload a[2] adc $t1, $acc3 adc \$0, $acc4 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 ################################# mulx $acc7, $t0, $t1 # a[1]*a[2] adcx $t0, $acc3 
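# (adcx and adox use CF and OF as two independent carry chains, so the low and high halves of each mulx product feed separate accumulator columns without serializing on a single flag)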
adox $t1, $acc4 mulx $acc0, $t0, $t1 # a[1]*a[3] mov $acc7, %rdx adcx $t0, $acc4 adox $t1, $acc5 adc \$0, $acc5 ################################# mulx $acc0, $t0, $acc6 # a[2]*a[3] mov %rax, %rdx movq $acc0, %xmm3 # offload a[3] xor $acc7, $acc7 # $acc7=0,cf=0,of=0 adcx $acc1, $acc1 # acc1:6<<1 adox $t0, $acc5 adcx $acc2, $acc2 adox $acc7, $acc6 # of=0 ################################# a[i]*a[i] mulx %rdx, $acc0, $t1 movq %xmm1, %rdx adcx $acc3, $acc3 adox $t1, $acc1 adcx $acc4, $acc4 mulx %rdx, $t0, $t4 movq %xmm2, %rdx adcx $acc5, $acc5 adox $t0, $acc2 adcx $acc6, $acc6 mulx %rdx, $t0, $t1 .byte 0x67 movq %xmm3, %rdx adox $t4, $acc3 adcx $acc7, $acc7 adox $t0, $acc4 adox $t1, $acc5 mulx %rdx, $t0, $t4 adox $t0, $acc6 adox $t4, $acc7 ################################# reduction mov $acc0, %rdx mulx 8*4($a_ptr), %rdx, $t0 xor %rax, %rax # cf=0, of=0 mulx 8*0($a_ptr), $t0, $t1 adcx $t0, $acc0 # guaranteed to be zero adox $t1, $acc1 mulx 8*1($a_ptr), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 mulx 8*2($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*3($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc0 # of=0 adcx %rax, $acc0 # cf=0 ################################# mov $acc1, %rdx mulx 8*4($a_ptr), %rdx, $t0 mulx 8*0($a_ptr), $t0, $t1 adox $t0, $acc1 # guaranteed to be zero adcx $t1, $acc2 mulx 8*1($a_ptr), $t0, $t1 adox $t0, $acc2 adcx $t1, $acc3 mulx 8*2($a_ptr), $t0, $t1 adox $t0, $acc3 adcx $t1, $acc0 mulx 8*3($a_ptr), $t0, $t1 adox $t0, $acc0 adcx $t1, $acc1 # cf=0 adox %rax, $acc1 # of=0 ################################# mov $acc2, %rdx mulx 8*4($a_ptr), %rdx, $t0 mulx 8*0($a_ptr), $t0, $t1 adcx $t0, $acc2 # guaranteed to be zero adox $t1, $acc3 mulx 8*1($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc0 mulx 8*2($a_ptr), $t0, $t1 adcx $t0, $acc0 adox $t1, $acc1 mulx 8*3($a_ptr), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 # of=0 adcx %rax, $acc2 # cf=0 ################################# mov $acc3, %rdx mulx 8*4($a_ptr), %rdx, $t0 mulx 8*0($a_ptr), $t0, $t1 adox $t0, $acc3 # guaranteed to be zero adcx $t1, $acc0 mulx 8*1($a_ptr), $t0, $t1 adox $t0, $acc0 adcx $t1, $acc1 mulx 8*2($a_ptr), $t0, $t1 adox $t0, $acc1 adcx $t1, $acc2 mulx 8*3($a_ptr), $t0, $t1 adox $t0, $acc2 adcx $t1, $acc3 adox %rax, $acc3 ################################# accumulate upper half add $acc0, $acc4 # add $acc4, $acc0 adc $acc5, $acc1 mov $acc4, %rdx adc $acc6, $acc2 adc $acc7, $acc3 mov $acc1, $acc6 adc \$0, %rax ################################# compare to modulus sub 8*0($a_ptr), $acc4 mov $acc2, $acc7 sbb 8*1($a_ptr), $acc1 sbb 8*2($a_ptr), $acc2 mov $acc3, $acc0 sbb 8*3($a_ptr), $acc3 sbb \$0, %rax cmovnc $acc4, %rdx cmovnc $acc1, $acc6 cmovnc $acc2, $acc7 cmovnc $acc3, $acc0 dec $b_ptr jnz .Loop_ord_sqrx mov %rdx, 8*0($r_ptr) mov $acc6, 8*1($r_ptr) pxor %xmm1, %xmm1 mov $acc7, 8*2($r_ptr) pxor %xmm2, %xmm2 mov $acc0, 8*3($r_ptr) pxor %xmm3, %xmm3 mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_sqrx_epilogue: ret .cfi_endproc .size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx ___ $code.=<<___; ################################################################################ # void ecp_nistz256_to_mont( # uint64_t res[4], # uint64_t in[4]); .globl ecp_nistz256_to_mont .type ecp_nistz256_to_mont,\@function,2 .align 32 ecp_nistz256_to_mont: .cfi_startproc ___ $code.=<<___ if ($addx); mov 
\$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx ___ $code.=<<___; lea .LRR(%rip), $b_org jmp .Lmul_mont .cfi_endproc .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont ################################################################################ # void ecp_nistz256_mul_mont( # uint64_t res[4], # uint64_t a[4], # uint64_t b[4]); .globl ecp_nistz256_mul_mont .type ecp_nistz256_mul_mont,\@function,3 .align 32 ecp_nistz256_mul_mont: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx ___ $code.=<<___; .Lmul_mont: push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lmul_body: ___ $code.=<<___ if ($addx); cmp \$0x80100, %ecx je .Lmul_montx ___ $code.=<<___; mov $b_org, $b_ptr mov 8*0($b_org), %rax mov 8*0($a_ptr), $acc1 mov 8*1($a_ptr), $acc2 mov 8*2($a_ptr), $acc3 mov 8*3($a_ptr), $acc4 call __ecp_nistz256_mul_montq ___ $code.=<<___ if ($addx); jmp .Lmul_mont_done .align 32 .Lmul_montx: mov $b_org, $b_ptr mov 8*0($b_org), %rdx mov 8*0($a_ptr), $acc1 mov 8*1($a_ptr), $acc2 mov 8*2($a_ptr), $acc3 mov 8*3($a_ptr), $acc4 lea -128($a_ptr), $a_ptr # control u-op density call __ecp_nistz256_mul_montx ___ $code.=<<___; .Lmul_mont_done: mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lmul_epilogue: ret .cfi_endproc .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont .type __ecp_nistz256_mul_montq,\@abi-omnipotent .align 32 __ecp_nistz256_mul_montq: .cfi_startproc ######################################################################## # Multiply a by b[0] mov %rax, $t1 mulq $acc1 mov .Lpoly+8*1(%rip),$poly1 mov %rax, $acc0 mov $t1, %rax mov %rdx, $acc1 mulq $acc2 mov .Lpoly+8*3(%rip),$poly3 add %rax, $acc1 mov $t1, %rax adc \$0, %rdx mov %rdx, $acc2 mulq $acc3 add %rax, $acc2 mov $t1, %rax adc \$0, %rdx mov %rdx, $acc3 mulq $acc4 add %rax, $acc3 mov $acc0, %rax adc \$0, %rdx xor $acc5, $acc5 mov %rdx, $acc4 ######################################################################## # First reduction step # Basically now we want to multiply acc[0] by p256, # and add the result to the acc. 
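# (p256 = 2^256 - 2^224 + 2^192 + 2^96 - 1, so its two low limbs together are 2^96 - 1, limb 2 is zero, and -p256^-1 mod 2^64 equals 1, i.e. acc[0] itself serves as the Montgomery multiplier)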
# Due to the special form of p256 we do some optimizations # # acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0] # then we add acc[0] and get acc[0] x 2^96 mov $acc0, $t1 shl \$32, $acc0 mulq $poly3 shr \$32, $t1 add $acc0, $acc1 # +=acc[0]<<96 adc $t1, $acc2 adc %rax, $acc3 mov 8*1($b_ptr), %rax adc %rdx, $acc4 adc \$0, $acc5 xor $acc0, $acc0 ######################################################################## # Multiply by b[1] mov %rax, $t1 mulq 8*0($a_ptr) add %rax, $acc1 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*1($a_ptr) add $t0, $acc2 adc \$0, %rdx add %rax, $acc2 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*2($a_ptr) add $t0, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*3($a_ptr) add $t0, $acc4 adc \$0, %rdx add %rax, $acc4 mov $acc1, %rax adc %rdx, $acc5 adc \$0, $acc0 ######################################################################## # Second reduction step mov $acc1, $t1 shl \$32, $acc1 mulq $poly3 shr \$32, $t1 add $acc1, $acc2 adc $t1, $acc3 adc %rax, $acc4 mov 8*2($b_ptr), %rax adc %rdx, $acc5 adc \$0, $acc0 xor $acc1, $acc1 ######################################################################## # Multiply by b[2] mov %rax, $t1 mulq 8*0($a_ptr) add %rax, $acc2 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*1($a_ptr) add $t0, $acc3 adc \$0, %rdx add %rax, $acc3 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*2($a_ptr) add $t0, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*3($a_ptr) add $t0, $acc5 adc \$0, %rdx add %rax, $acc5 mov $acc2, %rax adc %rdx, $acc0 adc \$0, $acc1 ######################################################################## # Third reduction step mov $acc2, $t1 shl \$32, $acc2 mulq $poly3 shr \$32, $t1 add $acc2, $acc3 adc $t1, $acc4 adc %rax, $acc5 mov 8*3($b_ptr), %rax adc %rdx, $acc0 adc \$0, $acc1 xor $acc2, $acc2 ######################################################################## # Multiply by b[3] mov %rax, $t1 mulq 8*0($a_ptr) add %rax, $acc3 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*1($a_ptr) add $t0, $acc4 adc \$0, %rdx add %rax, $acc4 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*2($a_ptr) add $t0, $acc5 adc \$0, %rdx add %rax, $acc5 mov $t1, %rax adc \$0, %rdx mov %rdx, $t0 mulq 8*3($a_ptr) add $t0, $acc0 adc \$0, %rdx add %rax, $acc0 mov $acc3, %rax adc %rdx, $acc1 adc \$0, $acc2 ######################################################################## # Final reduction step mov $acc3, $t1 shl \$32, $acc3 mulq $poly3 shr \$32, $t1 add $acc3, $acc4 adc $t1, $acc5 mov $acc4, $t0 adc %rax, $acc0 adc %rdx, $acc1 mov $acc5, $t1 adc \$0, $acc2 ######################################################################## # Branch-less conditional subtraction of P sub \$-1, $acc4 # .Lpoly[0] mov $acc0, $t2 sbb $poly1, $acc5 # .Lpoly[1] sbb \$0, $acc0 # .Lpoly[2] mov $acc1, $t3 sbb $poly3, $acc1 # .Lpoly[3] sbb \$0, $acc2 cmovc $t0, $acc4 cmovc $t1, $acc5 mov $acc4, 8*0($r_ptr) cmovc $t2, $acc0 mov $acc5, 8*1($r_ptr) cmovc $t3, $acc1 mov $acc0, 8*2($r_ptr) mov $acc1, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq ################################################################################ # void ecp_nistz256_sqr_mont( # uint64_t res[4], # uint64_t a[4]); # we optimize the square according to S.Gueron and V.Krasnov, # "Speeding up Big-Number Squaring" .globl ecp_nistz256_sqr_mont .type ecp_nistz256_sqr_mont,\@function,2 .align 32 ecp_nistz256_sqr_mont: .cfi_startproc ___ $code.=<<___ if ($addx); mov 
\$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx ___ $code.=<<___; push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lsqr_body: ___ $code.=<<___ if ($addx); cmp \$0x80100, %ecx je .Lsqr_montx ___ $code.=<<___; mov 8*0($a_ptr), %rax mov 8*1($a_ptr), $acc6 mov 8*2($a_ptr), $acc7 mov 8*3($a_ptr), $acc0 call __ecp_nistz256_sqr_montq ___ $code.=<<___ if ($addx); jmp .Lsqr_mont_done .align 32 .Lsqr_montx: mov 8*0($a_ptr), %rdx mov 8*1($a_ptr), $acc6 mov 8*2($a_ptr), $acc7 mov 8*3($a_ptr), $acc0 lea -128($a_ptr), $a_ptr # control u-op density call __ecp_nistz256_sqr_montx ___ $code.=<<___; .Lsqr_mont_done: mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbx .cfi_restore %rbx mov 40(%rsp),%rbp .cfi_restore %rbp lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lsqr_epilogue: ret .cfi_endproc .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont .type __ecp_nistz256_sqr_montq,\@abi-omnipotent .align 32 __ecp_nistz256_sqr_montq: .cfi_startproc mov %rax, $acc5 mulq $acc6 # a[1]*a[0] mov %rax, $acc1 mov $acc7, %rax mov %rdx, $acc2 mulq $acc5 # a[0]*a[2] add %rax, $acc2 mov $acc0, %rax adc \$0, %rdx mov %rdx, $acc3 mulq $acc5 # a[0]*a[3] add %rax, $acc3 mov $acc7, %rax adc \$0, %rdx mov %rdx, $acc4 ################################# mulq $acc6 # a[1]*a[2] add %rax, $acc3 mov $acc0, %rax adc \$0, %rdx mov %rdx, $t1 mulq $acc6 # a[1]*a[3] add %rax, $acc4 mov $acc0, %rax adc \$0, %rdx add $t1, $acc4 mov %rdx, $acc5 adc \$0, $acc5 ################################# mulq $acc7 # a[2]*a[3] xor $acc7, $acc7 add %rax, $acc5 mov 8*0($a_ptr), %rax mov %rdx, $acc6 adc \$0, $acc6 add $acc1, $acc1 # acc1:6<<1 adc $acc2, $acc2 adc $acc3, $acc3 adc $acc4, $acc4 adc $acc5, $acc5 adc $acc6, $acc6 adc \$0, $acc7 mulq %rax mov %rax, $acc0 mov 8*1($a_ptr), %rax mov %rdx, $t0 mulq %rax add $t0, $acc1 adc %rax, $acc2 mov 8*2($a_ptr), %rax adc \$0, %rdx mov %rdx, $t0 mulq %rax add $t0, $acc3 adc %rax, $acc4 mov 8*3($a_ptr), %rax adc \$0, %rdx mov %rdx, $t0 mulq %rax add $t0, $acc5 adc %rax, $acc6 mov $acc0, %rax adc %rdx, $acc7 mov .Lpoly+8*1(%rip), $a_ptr mov .Lpoly+8*3(%rip), $t1 ########################################## # Now the reduction # First iteration mov $acc0, $t0 shl \$32, $acc0 mulq $t1 shr \$32, $t0 add $acc0, $acc1 # +=acc[0]<<96 adc $t0, $acc2 adc %rax, $acc3 mov $acc1, %rax adc \$0, %rdx ########################################## # Second iteration mov $acc1, $t0 shl \$32, $acc1 mov %rdx, $acc0 mulq $t1 shr \$32, $t0 add $acc1, $acc2 adc $t0, $acc3 adc %rax, $acc0 mov $acc2, %rax adc \$0, %rdx ########################################## # Third iteration mov $acc2, $t0 shl \$32, $acc2 mov %rdx, $acc1 mulq $t1 shr \$32, $t0 add $acc2, $acc3 adc $t0, $acc0 adc %rax, $acc1 mov $acc3, %rax adc \$0, %rdx ########################################### # Last iteration mov $acc3, $t0 shl \$32, $acc3 mov %rdx, $acc2 mulq $t1 shr \$32, $t0 add $acc3, $acc0 adc $t0, $acc1 adc %rax, $acc2 adc \$0, %rdx xor $acc3, $acc3 ############################################ # Add the rest of the acc add $acc0, $acc4 adc $acc1, $acc5 mov $acc4, $acc0 adc $acc2, $acc6 adc %rdx, $acc7 mov $acc5, $acc1 adc \$0, $acc3 sub \$-1, $acc4 # .Lpoly[0] mov $acc6, $acc2 sbb $a_ptr, $acc5 # .Lpoly[1] sbb \$0, $acc6 # .Lpoly[2] mov $acc7, $t0 sbb $t1, $acc7 # .Lpoly[3] sbb \$0, $acc3 cmovc $acc0, $acc4 cmovc $acc1, $acc5 mov $acc4, 8*0($r_ptr) 
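# (the cmovc's interleaved with the stores select the saved pre-subtraction limbs whenever subtracting .Lpoly borrowed, i.e. the value was already fully reduced, keeping this final reduction branch-free)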
cmovc $acc2, $acc6 mov $acc5, 8*1($r_ptr) cmovc $t0, $acc7 mov $acc6, 8*2($r_ptr) mov $acc7, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq ___ if ($addx) { $code.=<<___; .type __ecp_nistz256_mul_montx,\@abi-omnipotent .align 32 __ecp_nistz256_mul_montx: .cfi_startproc ######################################################################## # Multiply by b[0] mulx $acc1, $acc0, $acc1 mulx $acc2, $t0, $acc2 mov \$32, $poly1 xor $acc5, $acc5 # cf=0 mulx $acc3, $t1, $acc3 mov .Lpoly+8*3(%rip), $poly3 adc $t0, $acc1 mulx $acc4, $t0, $acc4 mov $acc0, %rdx adc $t1, $acc2 shlx $poly1,$acc0,$t1 adc $t0, $acc3 shrx $poly1,$acc0,$t0 adc \$0, $acc4 ######################################################################## # First reduction step add $t1, $acc1 adc $t0, $acc2 mulx $poly3, $t0, $t1 mov 8*1($b_ptr), %rdx adc $t0, $acc3 adc $t1, $acc4 adc \$0, $acc5 xor $acc0, $acc0 # $acc0=0,cf=0,of=0 ######################################################################## # Multiply by b[1] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc1 adox $t1, $acc2 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc1, %rdx adcx $t0, $acc4 shlx $poly1, $acc1, $t0 adox $t1, $acc5 shrx $poly1, $acc1, $t1 adcx $acc0, $acc5 adox $acc0, $acc0 adc \$0, $acc0 ######################################################################## # Second reduction step add $t0, $acc2 adc $t1, $acc3 mulx $poly3, $t0, $t1 mov 8*2($b_ptr), %rdx adc $t0, $acc4 adc $t1, $acc5 adc \$0, $acc0 xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0 ######################################################################## # Multiply by b[2] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc2 adox $t1, $acc3 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc2, %rdx adcx $t0, $acc5 shlx $poly1, $acc2, $t0 adox $t1, $acc0 shrx $poly1, $acc2, $t1 adcx $acc1, $acc0 adox $acc1, $acc1 adc \$0, $acc1 ######################################################################## # Third reduction step add $t0, $acc3 adc $t1, $acc4 mulx $poly3, $t0, $t1 mov 8*3($b_ptr), %rdx adc $t0, $acc5 adc $t1, $acc0 adc \$0, $acc1 xor $acc2, $acc2 # $acc2=0,cf=0,of=0 ######################################################################## # Multiply by b[3] mulx 8*0+128($a_ptr), $t0, $t1 adcx $t0, $acc3 adox $t1, $acc4 mulx 8*1+128($a_ptr), $t0, $t1 adcx $t0, $acc4 adox $t1, $acc5 mulx 8*2+128($a_ptr), $t0, $t1 adcx $t0, $acc5 adox $t1, $acc0 mulx 8*3+128($a_ptr), $t0, $t1 mov $acc3, %rdx adcx $t0, $acc0 shlx $poly1, $acc3, $t0 adox $t1, $acc1 shrx $poly1, $acc3, $t1 adcx $acc2, $acc1 adox $acc2, $acc2 adc \$0, $acc2 ######################################################################## # Fourth reduction step add $t0, $acc4 adc $t1, $acc5 mulx $poly3, $t0, $t1 mov $acc4, $t2 mov .Lpoly+8*1(%rip), $poly1 adc $t0, $acc0 mov $acc5, $t3 adc $t1, $acc1 adc \$0, $acc2 ######################################################################## # Branch-less conditional subtraction of P xor %eax, %eax mov $acc0, $t0 sbb \$-1, $acc4 # .Lpoly[0] sbb $poly1, $acc5 # .Lpoly[1] sbb \$0, $acc0 # .Lpoly[2] mov $acc1, $t1 sbb $poly3, $acc1 # .Lpoly[3] sbb \$0, $acc2 cmovc $t2, $acc4 cmovc $t3, $acc5 mov $acc4, 8*0($r_ptr) cmovc $t0, $acc0 mov $acc5, 8*1($r_ptr) cmovc $t1, $acc1 mov $acc0, 8*2($r_ptr) mov $acc1, 8*3($r_ptr) ret .cfi_endproc .size 
__ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx .type __ecp_nistz256_sqr_montx,\@abi-omnipotent .align 32 __ecp_nistz256_sqr_montx: .cfi_startproc mulx $acc6, $acc1, $acc2 # a[0]*a[1] mulx $acc7, $t0, $acc3 # a[0]*a[2] xor %eax, %eax adc $t0, $acc2 mulx $acc0, $t1, $acc4 # a[0]*a[3] mov $acc6, %rdx adc $t1, $acc3 adc \$0, $acc4 xor $acc5, $acc5 # $acc5=0,cf=0,of=0 ################################# mulx $acc7, $t0, $t1 # a[1]*a[2] adcx $t0, $acc3 adox $t1, $acc4 mulx $acc0, $t0, $t1 # a[1]*a[3] mov $acc7, %rdx adcx $t0, $acc4 adox $t1, $acc5 adc \$0, $acc5 ################################# mulx $acc0, $t0, $acc6 # a[2]*a[3] mov 8*0+128($a_ptr), %rdx xor $acc7, $acc7 # $acc7=0,cf=0,of=0 adcx $acc1, $acc1 # acc1:6<<1 adox $t0, $acc5 adcx $acc2, $acc2 adox $acc7, $acc6 # of=0 mulx %rdx, $acc0, $t1 mov 8*1+128($a_ptr), %rdx adcx $acc3, $acc3 adox $t1, $acc1 adcx $acc4, $acc4 mulx %rdx, $t0, $t4 mov 8*2+128($a_ptr), %rdx adcx $acc5, $acc5 adox $t0, $acc2 adcx $acc6, $acc6 .byte 0x67 mulx %rdx, $t0, $t1 mov 8*3+128($a_ptr), %rdx adox $t4, $acc3 adcx $acc7, $acc7 adox $t0, $acc4 mov \$32, $a_ptr adox $t1, $acc5 .byte 0x67,0x67 mulx %rdx, $t0, $t4 mov .Lpoly+8*3(%rip), %rdx adox $t0, $acc6 shlx $a_ptr, $acc0, $t0 adox $t4, $acc7 shrx $a_ptr, $acc0, $t4 mov %rdx,$t1 # reduction step 1 add $t0, $acc1 adc $t4, $acc2 mulx $acc0, $t0, $acc0 adc $t0, $acc3 shlx $a_ptr, $acc1, $t0 adc \$0, $acc0 shrx $a_ptr, $acc1, $t4 # reduction step 2 add $t0, $acc2 adc $t4, $acc3 mulx $acc1, $t0, $acc1 adc $t0, $acc0 shlx $a_ptr, $acc2, $t0 adc \$0, $acc1 shrx $a_ptr, $acc2, $t4 # reduction step 3 add $t0, $acc3 adc $t4, $acc0 mulx $acc2, $t0, $acc2 adc $t0, $acc1 shlx $a_ptr, $acc3, $t0 adc \$0, $acc2 shrx $a_ptr, $acc3, $t4 # reduction step 4 add $t0, $acc0 adc $t4, $acc1 mulx $acc3, $t0, $acc3 adc $t0, $acc2 adc \$0, $acc3 xor $t3, $t3 add $acc0, $acc4 # accumulate upper half mov .Lpoly+8*1(%rip), $a_ptr adc $acc1, $acc5 mov $acc4, $acc0 adc $acc2, $acc6 adc $acc3, $acc7 mov $acc5, $acc1 adc \$0, $t3 sub \$-1, $acc4 # .Lpoly[0] mov $acc6, $acc2 sbb $a_ptr, $acc5 # .Lpoly[1] sbb \$0, $acc6 # .Lpoly[2] mov $acc7, $acc3 sbb $t1, $acc7 # .Lpoly[3] sbb \$0, $t3 cmovc $acc0, $acc4 cmovc $acc1, $acc5 mov $acc4, 8*0($r_ptr) cmovc $acc2, $acc6 mov $acc5, 8*1($r_ptr) cmovc $acc3, $acc7 mov $acc6, 8*2($r_ptr) mov $acc7, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx ___ } } { my ($r_ptr,$in_ptr)=("%rdi","%rsi"); my ($acc0,$acc1,$acc2,$acc3)=map("%r$_",(8..11)); my ($t0,$t1,$t2)=("%rcx","%r12","%r13"); $code.=<<___; ################################################################################ # void ecp_nistz256_from_mont( # uint64_t res[4], # uint64_t in[4]); # This one performs Montgomery multiplication by 1, so we only need the reduction .globl ecp_nistz256_from_mont .type ecp_nistz256_from_mont,\@function,2 .align 32 ecp_nistz256_from_mont: .cfi_startproc push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 .Lfrom_body: mov 8*0($in_ptr), %rax mov .Lpoly+8*3(%rip), $t2 mov 8*1($in_ptr), $acc1 mov 8*2($in_ptr), $acc2 mov 8*3($in_ptr), $acc3 mov %rax, $acc0 mov .Lpoly+8*1(%rip), $t1 ######################################### # First iteration mov %rax, $t0 shl \$32, $acc0 mulq $t2 shr \$32, $t0 add $acc0, $acc1 adc $t0, $acc2 adc %rax, $acc3 mov $acc1, %rax adc \$0, %rdx ######################################### # Second iteration mov $acc1, $t0 shl \$32, $acc1 mov %rdx, $acc0 mulq $t2 shr \$32, $t0 add $acc1, $acc2 adc $t0, $acc3 adc %rax, $acc0 mov $acc2, %rax adc \$0, %rdx 
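# (each iteration folds the current low limb exactly as in the mul_mont reduction above: its multiple of 2^96 enters via the shl/shr-by-32 pair and its multiple of .Lpoly[3] via the mulq, after which that limb cancels and the reduction window moves up by one word)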
########################################## # Third iteration mov $acc2, $t0 shl \$32, $acc2 mov %rdx, $acc1 mulq $t2 shr \$32, $t0 add $acc2, $acc3 adc $t0, $acc0 adc %rax, $acc1 mov $acc3, %rax adc \$0, %rdx ########################################### # Last iteration mov $acc3, $t0 shl \$32, $acc3 mov %rdx, $acc2 mulq $t2 shr \$32, $t0 add $acc3, $acc0 adc $t0, $acc1 mov $acc0, $t0 adc %rax, $acc2 mov $acc1, $in_ptr adc \$0, %rdx ########################################### # Branch-less conditional subtraction sub \$-1, $acc0 mov $acc2, %rax sbb $t1, $acc1 sbb \$0, $acc2 mov %rdx, $acc3 sbb $t2, %rdx sbb $t2, $t2 cmovnz $t0, $acc0 cmovnz $in_ptr, $acc1 mov $acc0, 8*0($r_ptr) cmovnz %rax, $acc2 mov $acc1, 8*1($r_ptr) cmovz %rdx, $acc3 mov $acc2, 8*2($r_ptr) mov $acc3, 8*3($r_ptr) mov 0(%rsp),%r13 .cfi_restore %r13 mov 8(%rsp),%r12 .cfi_restore %r12 lea 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lfrom_epilogue: ret .cfi_endproc .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont ___ } { my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7)); my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15)); my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15)); $code.=<<___; ################################################################################ # void ecp_nistz256_scatter_w5(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_scatter_w5 .type ecp_nistz256_scatter_w5,\@abi-omnipotent .align 32 ecp_nistz256_scatter_w5: .cfi_startproc lea -3($index,$index,2), $index movdqa 0x00($in_t), %xmm0 shl \$5, $index movdqa 0x10($in_t), %xmm1 movdqa 0x20($in_t), %xmm2 movdqa 0x30($in_t), %xmm3 movdqa 0x40($in_t), %xmm4 movdqa 0x50($in_t), %xmm5 movdqa %xmm0, 0x00($val,$index) movdqa %xmm1, 0x10($val,$index) movdqa %xmm2, 0x20($val,$index) movdqa %xmm3, 0x30($val,$index) movdqa %xmm4, 0x40($val,$index) movdqa %xmm5, 0x50($val,$index) ret .cfi_endproc .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 ################################################################################ # void ecp_nistz256_gather_w5(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_gather_w5 .type ecp_nistz256_gather_w5,\@abi-omnipotent .align 32 ecp_nistz256_gather_w5: .cfi_startproc ___ $code.=<<___ if ($avx>1); mov OPENSSL_ia32cap_P+8(%rip), %eax test \$`1<<5`, %eax jnz .Lavx2_gather_w5 ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax .LSEH_begin_ecp_nistz256_gather_w5: .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) ___ $code.=<<___; movdqa .LOne(%rip), $ONE movd $index, $INDEX pxor $Ra, $Ra pxor $Rb, $Rb pxor $Rc, $Rc pxor $Rd, $Rd pxor $Re, $Re pxor $Rf, $Rf movdqa $ONE, $M0 pshufd \$0, $INDEX, $INDEX mov \$16, %rax .Lselect_loop_sse_w5: movdqa $M0, $TMP0 paddd $ONE, $M0 pcmpeqd $INDEX, $TMP0 movdqa 16*0($in_t), $T0a movdqa 16*1($in_t), $T0b movdqa 16*2($in_t), $T0c movdqa 16*3($in_t), $T0d movdqa 16*4($in_t), $T0e movdqa 
16*5($in_t), $T0f lea 16*6($in_t), $in_t pand $TMP0, $T0a pand $TMP0, $T0b por $T0a, $Ra pand $TMP0, $T0c por $T0b, $Rb pand $TMP0, $T0d por $T0c, $Rc pand $TMP0, $T0e por $T0d, $Rd pand $TMP0, $T0f por $T0e, $Re por $T0f, $Rf dec %rax jnz .Lselect_loop_sse_w5 movdqu $Ra, 16*0($val) movdqu $Rb, 16*1($val) movdqu $Rc, 16*2($val) movdqu $Rd, 16*3($val) movdqu $Re, 16*4($val) movdqu $Rf, 16*5($val) ___ $code.=<<___ if ($win64); movaps (%rsp), %xmm6 movaps 0x10(%rsp), %xmm7 movaps 0x20(%rsp), %xmm8 movaps 0x30(%rsp), %xmm9 movaps 0x40(%rsp), %xmm10 movaps 0x50(%rsp), %xmm11 movaps 0x60(%rsp), %xmm12 movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp ___ $code.=<<___; ret .cfi_endproc .LSEH_end_ecp_nistz256_gather_w5: .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 ################################################################################ # void ecp_nistz256_scatter_w7(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_scatter_w7 .type ecp_nistz256_scatter_w7,\@abi-omnipotent .align 32 ecp_nistz256_scatter_w7: .cfi_startproc movdqu 0x00($in_t), %xmm0 shl \$6, $index movdqu 0x10($in_t), %xmm1 movdqu 0x20($in_t), %xmm2 movdqu 0x30($in_t), %xmm3 movdqa %xmm0, 0x00($val,$index) movdqa %xmm1, 0x10($val,$index) movdqa %xmm2, 0x20($val,$index) movdqa %xmm3, 0x30($val,$index) ret .cfi_endproc .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 ################################################################################ # void ecp_nistz256_gather_w7(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_gather_w7 .type ecp_nistz256_gather_w7,\@abi-omnipotent .align 32 ecp_nistz256_gather_w7: .cfi_startproc ___ $code.=<<___ if ($avx>1); mov OPENSSL_ia32cap_P+8(%rip), %eax test \$`1<<5`, %eax jnz .Lavx2_gather_w7 ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax .LSEH_begin_ecp_nistz256_gather_w7: .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax) .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax) .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax) .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax) .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax) .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax) .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax) .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax) .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax) ___ $code.=<<___; movdqa .LOne(%rip), $M0 movd $index, $INDEX pxor $Ra, $Ra pxor $Rb, $Rb pxor $Rc, $Rc pxor $Rd, $Rd movdqa $M0, $ONE pshufd \$0, $INDEX, $INDEX mov \$64, %rax .Lselect_loop_sse_w7: movdqa $M0, $TMP0 paddd $ONE, $M0 movdqa 16*0($in_t), $T0a movdqa 16*1($in_t), $T0b pcmpeqd $INDEX, $TMP0 movdqa 16*2($in_t), $T0c movdqa 16*3($in_t), $T0d lea 16*4($in_t), $in_t pand $TMP0, $T0a pand $TMP0, $T0b por $T0a, $Ra pand $TMP0, $T0c por $T0b, $Rb pand $TMP0, $T0d por $T0c, $Rc prefetcht0 255($in_t) por $T0d, $Rd dec %rax jnz .Lselect_loop_sse_w7 movdqu $Ra, 16*0($val) movdqu $Rb, 16*1($val) movdqu $Rc, 16*2($val) movdqu $Rd, 16*3($val) ___ $code.=<<___ if ($win64); movaps (%rsp), %xmm6 movaps 0x10(%rsp), %xmm7 movaps 0x20(%rsp), %xmm8 movaps 0x30(%rsp), %xmm9 movaps 0x40(%rsp), %xmm10 movaps 0x50(%rsp), %xmm11 movaps 0x60(%rsp), %xmm12 movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea 0xa8(%rsp), %rsp ___ $code.=<<___; ret .cfi_endproc .LSEH_end_ecp_nistz256_gather_w7: .size 
ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 ___ } if ($avx>1) { my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4)); my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9)); my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14)); $code.=<<___; ################################################################################ # void ecp_nistz256_avx2_gather_w5(uint64_t *val, uint64_t *in_t, int index); .type ecp_nistz256_avx2_gather_w5,\@abi-omnipotent .align 32 ecp_nistz256_avx2_gather_w5: .cfi_startproc .Lavx2_gather_w5: vzeroupper ___ $code.=<<___ if ($win64); lea -0x88(%rsp), %rax mov %rsp,%r11 .LSEH_begin_ecp_nistz256_avx2_gather_w5: .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) ___ $code.=<<___; vmovdqa .LTwo(%rip), $TWO vpxor $Ra, $Ra, $Ra vpxor $Rb, $Rb, $Rb vpxor $Rc, $Rc, $Rc vmovdqa .LOne(%rip), $M0 vmovdqa .LTwo(%rip), $M1 vmovd $index, %xmm1 vpermd $INDEX, $Ra, $INDEX mov \$8, %rax .Lselect_loop_avx2_w5: vmovdqa 32*0($in_t), $T0a vmovdqa 32*1($in_t), $T0b vmovdqa 32*2($in_t), $T0c vmovdqa 32*3($in_t), $T1a vmovdqa 32*4($in_t), $T1b vmovdqa 32*5($in_t), $T1c vpcmpeqd $INDEX, $M0, $TMP0 vpcmpeqd $INDEX, $M1, $TMP1 vpaddd $TWO, $M0, $M0 vpaddd $TWO, $M1, $M1 lea 32*6($in_t), $in_t vpand $TMP0, $T0a, $T0a vpand $TMP0, $T0b, $T0b vpand $TMP0, $T0c, $T0c vpand $TMP1, $T1a, $T1a vpand $TMP1, $T1b, $T1b vpand $TMP1, $T1c, $T1c vpxor $T0a, $Ra, $Ra vpxor $T0b, $Rb, $Rb vpxor $T0c, $Rc, $Rc vpxor $T1a, $Ra, $Ra vpxor $T1b, $Rb, $Rb vpxor $T1c, $Rc, $Rc dec %rax jnz .Lselect_loop_avx2_w5 vmovdqu $Ra, 32*0($val) vmovdqu $Rb, 32*1($val) vmovdqu $Rc, 32*2($val) vzeroupper ___ $code.=<<___ if ($win64); movaps (%rsp), %xmm6 movaps 0x10(%rsp), %xmm7 movaps 0x20(%rsp), %xmm8 movaps 0x30(%rsp), %xmm9 movaps 0x40(%rsp), %xmm10 movaps 0x50(%rsp), %xmm11 movaps 0x60(%rsp), %xmm12 movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea (%r11), %rsp ___ $code.=<<___; ret .cfi_endproc .LSEH_end_ecp_nistz256_avx2_gather_w5: .size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5 ___ } if ($avx>1) { my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx"); my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3)); my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7)); my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11)); my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15)); $code.=<<___; ################################################################################ # void ecp_nistz256_avx2_gather_w7(uint64_t *val, uint64_t *in_t, int index); .globl ecp_nistz256_avx2_gather_w7 .type ecp_nistz256_avx2_gather_w7,\@abi-omnipotent .align 32 ecp_nistz256_avx2_gather_w7: .cfi_startproc .Lavx2_gather_w7: vzeroupper ___ $code.=<<___ if ($win64); mov %rsp,%r11 lea -0x88(%rsp), %rax .LSEH_begin_ecp_nistz256_avx2_gather_w7: .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax), %rsp .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6, -0x20(%rax) 
.byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7, -0x10(%rax) .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8, 8(%rax) .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9, 0x10(%rax) .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10, 0x20(%rax) .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11, 0x30(%rax) .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12, 0x40(%rax) .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13, 0x50(%rax) .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14, 0x60(%rax) .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15, 0x70(%rax) ___ $code.=<<___; vmovdqa .LThree(%rip), $THREE vpxor $Ra, $Ra, $Ra vpxor $Rb, $Rb, $Rb vmovdqa .LOne(%rip), $M0 vmovdqa .LTwo(%rip), $M1 vmovdqa .LThree(%rip), $M2 vmovd $index, %xmm1 vpermd $INDEX, $Ra, $INDEX # Skip index = 0, because it is implicitly the point at infinity mov \$21, %rax .Lselect_loop_avx2_w7: vmovdqa 32*0($in_t), $T0a vmovdqa 32*1($in_t), $T0b vmovdqa 32*2($in_t), $T1a vmovdqa 32*3($in_t), $T1b vmovdqa 32*4($in_t), $T2a vmovdqa 32*5($in_t), $T2b vpcmpeqd $INDEX, $M0, $TMP0 vpcmpeqd $INDEX, $M1, $TMP1 vpcmpeqd $INDEX, $M2, $TMP2 vpaddd $THREE, $M0, $M0 vpaddd $THREE, $M1, $M1 vpaddd $THREE, $M2, $M2 lea 32*6($in_t), $in_t vpand $TMP0, $T0a, $T0a vpand $TMP0, $T0b, $T0b vpand $TMP1, $T1a, $T1a vpand $TMP1, $T1b, $T1b vpand $TMP2, $T2a, $T2a vpand $TMP2, $T2b, $T2b vpxor $T0a, $Ra, $Ra vpxor $T0b, $Rb, $Rb vpxor $T1a, $Ra, $Ra vpxor $T1b, $Rb, $Rb vpxor $T2a, $Ra, $Ra vpxor $T2b, $Rb, $Rb dec %rax jnz .Lselect_loop_avx2_w7 vmovdqa 32*0($in_t), $T0a vmovdqa 32*1($in_t), $T0b vpcmpeqd $INDEX, $M0, $TMP0 vpand $TMP0, $T0a, $T0a vpand $TMP0, $T0b, $T0b vpxor $T0a, $Ra, $Ra vpxor $T0b, $Rb, $Rb vmovdqu $Ra, 32*0($val) vmovdqu $Rb, 32*1($val) vzeroupper ___ $code.=<<___ if ($win64); movaps (%rsp), %xmm6 movaps 0x10(%rsp), %xmm7 movaps 0x20(%rsp), %xmm8 movaps 0x30(%rsp), %xmm9 movaps 0x40(%rsp), %xmm10 movaps 0x50(%rsp), %xmm11 movaps 0x60(%rsp), %xmm12 movaps 0x70(%rsp), %xmm13 movaps 0x80(%rsp), %xmm14 movaps 0x90(%rsp), %xmm15 lea (%r11), %rsp ___ $code.=<<___; ret .cfi_endproc .LSEH_end_ecp_nistz256_avx2_gather_w7: .size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 ___ } else { $code.=<<___; .globl ecp_nistz256_avx2_gather_w7 .type ecp_nistz256_avx2_gather_w7,\@function,3 .align 32 ecp_nistz256_avx2_gather_w7: .cfi_startproc .byte 0x0f,0x0b # ud2 ret .cfi_endproc .size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 ___ } {{{ ######################################################################## # This block implements higher level point_double, point_add and # point_add_affine. The key to performance in this case is to allow # out-of-order execution logic to overlap computations from next step # with tail processing from current step. By using tailored calling # sequence we minimize inter-step overhead to give processor better # shot at overlapping operations... # # You will notice that input data is copied to stack. Trouble is that # there are no registers to spare for holding original pointers and # reloading them, pointers, would create undesired dependencies on # effective addresses calculation paths. In other words it's too done # to favour out-of-order execution logic. # my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx"); my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15)); my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4); my ($poly1,$poly3)=($acc6,$acc7); sub load_for_mul () { my ($a,$b,$src0) = @_; my $bias = $src0 eq "%rax" ? 
0 : -128; " mov $b, $src0 lea $b, $b_ptr mov 8*0+$a, $acc1 mov 8*1+$a, $acc2 lea $bias+$a, $a_ptr mov 8*2+$a, $acc3 mov 8*3+$a, $acc4" } sub load_for_sqr () { my ($a,$src0) = @_; my $bias = $src0 eq "%rax" ? 0 : -128; " mov 8*0+$a, $src0 mov 8*1+$a, $acc6 lea $bias+$a, $a_ptr mov 8*2+$a, $acc7 mov 8*3+$a, $acc0" } { ######################################################################## # operate in 4-5-0-1 "name space" that matches multiplication output # my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); $code.=<<___; .type __ecp_nistz256_add_toq,\@abi-omnipotent .align 32 __ecp_nistz256_add_toq: .cfi_startproc xor $t4,$t4 add 8*0($b_ptr), $a0 adc 8*1($b_ptr), $a1 mov $a0, $t0 adc 8*2($b_ptr), $a2 adc 8*3($b_ptr), $a3 mov $a1, $t1 adc \$0, $t4 sub \$-1, $a0 mov $a2, $t2 sbb $poly1, $a1 sbb \$0, $a2 mov $a3, $t3 sbb $poly3, $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq .type __ecp_nistz256_sub_fromq,\@abi-omnipotent .align 32 __ecp_nistz256_sub_fromq: .cfi_startproc sub 8*0($b_ptr), $a0 sbb 8*1($b_ptr), $a1 mov $a0, $t0 sbb 8*2($b_ptr), $a2 sbb 8*3($b_ptr), $a3 mov $a1, $t1 sbb $t4, $t4 add \$-1, $a0 mov $a2, $t2 adc $poly1, $a1 adc \$0, $a2 mov $a3, $t3 adc $poly3, $a3 test $t4, $t4 cmovz $t0, $a0 cmovz $t1, $a1 mov $a0, 8*0($r_ptr) cmovz $t2, $a2 mov $a1, 8*1($r_ptr) cmovz $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq .type __ecp_nistz256_subq,\@abi-omnipotent .align 32 __ecp_nistz256_subq: .cfi_startproc sub $a0, $t0 sbb $a1, $t1 mov $t0, $a0 sbb $a2, $t2 sbb $a3, $t3 mov $t1, $a1 sbb $t4, $t4 add \$-1, $t0 mov $t2, $a2 adc $poly1, $t1 adc \$0, $t2 mov $t3, $a3 adc $poly3, $t3 test $t4, $t4 cmovnz $t0, $a0 cmovnz $t1, $a1 cmovnz $t2, $a2 cmovnz $t3, $a3 ret .cfi_endproc .size __ecp_nistz256_subq,.-__ecp_nistz256_subq .type __ecp_nistz256_mul_by_2q,\@abi-omnipotent .align 32 __ecp_nistz256_mul_by_2q: .cfi_startproc xor $t4, $t4 add $a0, $a0 # a0:a3+a0:a3 adc $a1, $a1 mov $a0, $t0 adc $a2, $a2 adc $a3, $a3 mov $a1, $t1 adc \$0, $t4 sub \$-1, $a0 mov $a2, $t2 sbb $poly1, $a1 sbb \$0, $a2 mov $a3, $t3 sbb $poly3, $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q ___ } sub gen_double () { my $x = shift; my ($src0,$sfx,$bias); my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4)); if ($x ne "x") { $src0 = "%rax"; $sfx = ""; $bias = 0; $code.=<<___; .globl ecp_nistz256_point_double .type ecp_nistz256_point_double,\@function,2 .align 32 ecp_nistz256_point_double: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx cmp \$0x80100, %ecx je .Lpoint_doublex ___ } else { $src0 = "%rdx"; $sfx = "x"; $bias = 128; $code.=<<___; .type ecp_nistz256_point_doublex,\@function,2 .align 32 ecp_nistz256_point_doublex: .cfi_startproc .Lpoint_doublex: ___ } $code.=<<___; push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$32*5+8, %rsp .cfi_adjust_cfa_offset 32*5+8 .Lpoint_double${x}_body: .Lpoint_double_shortcut$x: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x mov $a_ptr, $b_ptr # backup copy 
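# The calls below follow the standard Jacobian doubling formulas (see the per-call comments): S = 4*X*Y^2, M = 3*(X - Z^2)*(X + Z^2), res_x = M^2 - 2*S, res_y = M*(S - res_x) - 8*Y^4, res_z = 2*Y*Z.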
movdqu 0x10($a_ptr), %xmm1 mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order mov 0x20+8*1($a_ptr), $acc5 mov 0x20+8*2($a_ptr), $acc0 mov 0x20+8*3($a_ptr), $acc1 mov .Lpoly+8*1(%rip), $poly1 mov .Lpoly+8*3(%rip), $poly3 movdqa %xmm0, $in_x(%rsp) movdqa %xmm1, $in_x+0x10(%rsp) lea 0x20($r_ptr), $acc2 lea 0x40($r_ptr), $acc3 movq $r_ptr, %xmm0 movq $acc2, %xmm1 movq $acc3, %xmm2 lea $S(%rsp), $r_ptr call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y); mov 0x40+8*0($a_ptr), $src0 mov 0x40+8*1($a_ptr), $acc6 mov 0x40+8*2($a_ptr), $acc7 mov 0x40+8*3($a_ptr), $acc0 lea 0x40-$bias($a_ptr), $a_ptr lea $Zsqr(%rsp), $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z); `&load_for_sqr("$S(%rsp)", "$src0")` lea $S(%rsp), $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S); mov 0x20($b_ptr), $src0 # $b_ptr is still valid mov 0x40+8*0($b_ptr), $acc1 mov 0x40+8*1($b_ptr), $acc2 mov 0x40+8*2($b_ptr), $acc3 mov 0x40+8*3($b_ptr), $acc4 lea 0x40-$bias($b_ptr), $a_ptr lea 0x20($b_ptr), $b_ptr movq %xmm2, $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y); call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z); mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order mov $in_x+8*1(%rsp), $acc5 lea $Zsqr(%rsp), $b_ptr mov $in_x+8*2(%rsp), $acc0 mov $in_x+8*3(%rsp), $acc1 lea $M(%rsp), $r_ptr call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr); mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order mov $in_x+8*1(%rsp), $acc5 lea $Zsqr(%rsp), $b_ptr mov $in_x+8*2(%rsp), $acc0 mov $in_x+8*3(%rsp), $acc1 lea $Zsqr(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr); `&load_for_sqr("$S(%rsp)", "$src0")` movq %xmm1, $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S); ___ { ######## ecp_nistz256_div_by_2(res_y, res_y); ########################## # operate in 4-5-6-7 "name space" that matches squaring output # my ($poly1,$poly3)=($a_ptr,$t1); my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2); $code.=<<___; xor $t4, $t4 mov $a0, $t0 add \$-1, $a0 mov $a1, $t1 adc $poly1, $a1 mov $a2, $t2 adc \$0, $a2 mov $a3, $t3 adc $poly3, $a3 adc \$0, $t4 xor $a_ptr, $a_ptr # borrow $a_ptr test \$1, $t0 cmovz $t0, $a0 cmovz $t1, $a1 cmovz $t2, $a2 cmovz $t3, $a3 cmovz $a_ptr, $t4 mov $a1, $t0 # a0:a3>>1 shr \$1, $a0 shl \$63, $t0 mov $a2, $t1 shr \$1, $a1 or $t0, $a0 shl \$63, $t1 mov $a3, $t2 shr \$1, $a2 or $t1, $a1 shl \$63, $t2 mov $a0, 8*0($r_ptr) shr \$1, $a3 mov $a1, 8*1($r_ptr) shl \$63, $t4 or $t2, $a2 or $t4, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ___ } $code.=<<___; `&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")` lea $M(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr); lea $tmp0(%rsp), $r_ptr call __ecp_nistz256_mul_by_2$x lea $M(%rsp), $b_ptr lea $M(%rsp), $r_ptr call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M); `&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")` lea $S(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x); lea $tmp0(%rsp), $r_ptr call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S); `&load_for_sqr("$M(%rsp)", "$src0")` movq %xmm0, $r_ptr call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M); lea $tmp0(%rsp), $b_ptr mov $acc6, $acc0 # harmonize sqr output and sub input mov $acc7, $acc1 mov $a_ptr, $poly1 mov $t1, $poly3 call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0); mov $S+8*0(%rsp), $t0 mov $S+8*1(%rsp), $t1 mov $S+8*2(%rsp), $t2 mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order lea $S(%rsp), $r_ptr call __ecp_nistz256_sub$x # 
p256_sub(S, S, res_x); mov $M(%rsp), $src0 lea $M(%rsp), $b_ptr mov $acc4, $acc6 # harmonize sub output and mul input xor %ecx, %ecx mov $acc4, $S+8*0(%rsp) # have to save:-( mov $acc5, $acc2 mov $acc5, $S+8*1(%rsp) cmovz $acc0, $acc3 mov $acc0, $S+8*2(%rsp) lea $S-$bias(%rsp), $a_ptr cmovz $acc1, $acc4 mov $acc1, $S+8*3(%rsp) mov $acc6, $acc1 lea $S(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M); movq %xmm1, $b_ptr movq %xmm1, $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y); lea 32*5+56(%rsp), %rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbx .cfi_restore %rbx mov -8(%rsi),%rbp .cfi_restore %rbp lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpoint_double${x}_epilogue: ret .cfi_endproc .size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx ___ } &gen_double("q"); sub gen_add () { my $x = shift; my ($src0,$sfx,$bias); my ($H,$Hsqr,$R,$Rsqr,$Hcub, $U1,$U2,$S1,$S2, $res_x,$res_y,$res_z, $in1_x,$in1_y,$in1_z, $in2_x,$in2_y,$in2_z)=map(32*$_,(0..17)); my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr); if ($x ne "x") { $src0 = "%rax"; $sfx = ""; $bias = 0; $code.=<<___; .globl ecp_nistz256_point_add .type ecp_nistz256_point_add,\@function,3 .align 32 ecp_nistz256_point_add: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx cmp \$0x80100, %ecx je .Lpoint_addx ___ } else { $src0 = "%rdx"; $sfx = "x"; $bias = 128; $code.=<<___; .type ecp_nistz256_point_addx,\@function,3 .align 32 ecp_nistz256_point_addx: .cfi_startproc .Lpoint_addx: ___ } $code.=<<___; push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$32*18+8, %rsp .cfi_adjust_cfa_offset 32*18+8 .Lpoint_add${x}_body: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr movdqu 0x10($a_ptr), %xmm1 movdqu 0x20($a_ptr), %xmm2 movdqu 0x30($a_ptr), %xmm3 movdqu 0x40($a_ptr), %xmm4 movdqu 0x50($a_ptr), %xmm5 mov $a_ptr, $b_ptr # reassign mov $b_org, $a_ptr # reassign movdqa %xmm0, $in1_x(%rsp) movdqa %xmm1, $in1_x+0x10(%rsp) movdqa %xmm2, $in1_y(%rsp) movdqa %xmm3, $in1_y+0x10(%rsp) movdqa %xmm4, $in1_z(%rsp) movdqa %xmm5, $in1_z+0x10(%rsp) por %xmm4, %xmm5 movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr pshufd \$0xb1, %xmm5, %xmm3 movdqu 0x10($a_ptr), %xmm1 movdqu 0x20($a_ptr), %xmm2 por %xmm3, %xmm5 movdqu 0x30($a_ptr), %xmm3 mov 0x40+8*0($a_ptr), $src0 # load original in2_z mov 0x40+8*1($a_ptr), $acc6 mov 0x40+8*2($a_ptr), $acc7 mov 0x40+8*3($a_ptr), $acc0 movdqa %xmm0, $in2_x(%rsp) pshufd \$0x1e, %xmm5, %xmm4 movdqa %xmm1, $in2_x+0x10(%rsp) movdqu 0x40($a_ptr),%xmm0 # in2_z again movdqu 0x50($a_ptr),%xmm1 movdqa %xmm2, $in2_y(%rsp) movdqa %xmm3, $in2_y+0x10(%rsp) por %xmm4, %xmm5 pxor %xmm4, %xmm4 por %xmm0, %xmm1 movq $r_ptr, %xmm0 # save $r_ptr lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid mov $src0, $in2_z+8*0(%rsp) # make in2_z copy mov $acc6, $in2_z+8*1(%rsp) mov $acc7, $in2_z+8*2(%rsp) mov $acc0, $in2_z+8*3(%rsp) lea $Z2sqr(%rsp), $r_ptr # Z2^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z); pcmpeqd %xmm4, %xmm5 pshufd \$0xb1, %xmm1, %xmm4 por %xmm1, %xmm4 pshufd \$0, %xmm5, %xmm5 # in1infty pshufd \$0x1e, %xmm4, %xmm3 por %xmm3, %xmm4 pxor %xmm3, %xmm3 pcmpeqd %xmm3, %xmm4 pshufd \$0, %xmm4, %xmm4 # in2infty mov 0x40+8*0($b_ptr), $src0 # load original in1_z mov 0x40+8*1($b_ptr), 
$acc6 mov 0x40+8*2($b_ptr), $acc7 mov 0x40+8*3($b_ptr), $acc0 movq $b_ptr, %xmm1 lea 0x40-$bias($b_ptr), $a_ptr lea $Z1sqr(%rsp), $r_ptr # Z1^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); `&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")` lea $S1(%rsp), $r_ptr # S1 = Z2^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z); `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr # S2 = Z1^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); `&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")` lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y); `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); lea $S1(%rsp), $b_ptr lea $R(%rsp), $r_ptr # R = S2 - S1 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1); or $acc5, $acc4 # see if result is zero movdqa %xmm4, %xmm2 or $acc0, $acc4 or $acc1, $acc4 por %xmm5, %xmm2 # in1infty || in2infty movq $acc4, %xmm3 `&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")` lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr); `&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")` lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr); lea $U1(%rsp), $b_ptr lea $H(%rsp), $r_ptr # H = U2 - U1 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1); or $acc5, $acc4 # see if result is zero or $acc0, $acc4 or $acc1, $acc4 # !is_equal(U1, U2) movq %xmm2, $acc0 # in1infty | in2infty movq %xmm3, $acc1 # !is_equal(S1, S2) or $acc0, $acc4 or $acc1, $acc4 # if (!is_equal(U1, U2) | in1infty | in2infty | !is_equal(S1, S2)) .byte 0x3e # predict taken jnz .Ladd_proceed$x .Ladd_double$x: movq %xmm1, $a_ptr # restore $a_ptr movq %xmm0, $r_ptr # restore $r_ptr add \$`32*(18-5)`, %rsp # difference in frame sizes .cfi_adjust_cfa_offset `-32*(18-5)` jmp .Lpoint_double_shortcut$x .cfi_adjust_cfa_offset `32*(18-5)` .align 32 .Ladd_proceed$x: `&load_for_sqr("$R(%rsp)", "$src0")` lea $Rsqr(%rsp), $r_ptr # R^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); `&load_for_sqr("$H(%rsp)", "$src0")` lea $Hsqr(%rsp), $r_ptr # H^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); `&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")` lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z); `&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")` lea $Hcub(%rsp), $r_ptr # H^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); `&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")` lea $U2(%rsp), $r_ptr # U1*H^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr); ___ { ####################################################################### # operate in 4-5-0-1 "name space" that matches multiplication output # my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); my ($poly1, $poly3)=($acc6,$acc7); $code.=<<___; #lea $U2(%rsp), $a_ptr #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); xor $t4, $t4 add $acc0, $acc0 # a0:a3+a0:a3 lea $Rsqr(%rsp), $a_ptr adc $acc1, $acc1 mov $acc0, $t0 adc $acc2, $acc2 adc $acc3, $acc3 mov $acc1, $t1 adc \$0, $t4 sub \$-1, $acc0 mov $acc2, 
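# A small sketch of the branch decision above (helper names mine): after
# H = U2 - U1 and R = S2 - S1 the code ORs the result limbs together with the
# two infinity masks; only when both differences are zero and neither input is
# the point at infinity does it fall through to .Lpoint_double_shortcut.
use strict; use warnings;

sub limbs_nonzero { my $acc = 0; $acc |= $_ for @_; return $acc ? 1 : 0; }

for my $case ([ [0,0,0,0], [0,0,0,0], 0, 0 ],       # same finite point
              [ [1,0,0,0], [0,0,0,0], 0, 0 ]) {     # distinct x coordinates
    my ($H, $R, $in1infty, $in2infty) = @$case;
    my $proceed = limbs_nonzero(@$H) | limbs_nonzero(@$R) | $in1infty | $in2infty;
    print $proceed ? ".Ladd_proceed: general addition\n"
                   : ".Lpoint_double_shortcut: same finite point, double instead\n";
}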
$t2 sbb $poly1, $acc1 sbb \$0, $acc2 mov $acc3, $t3 sbb $poly3, $acc3 sbb \$0, $t4 cmovc $t0, $acc0 mov 8*0($a_ptr), $t0 cmovc $t1, $acc1 mov 8*1($a_ptr), $t1 cmovc $t2, $acc2 mov 8*2($a_ptr), $t2 cmovc $t3, $acc3 mov 8*3($a_ptr), $t3 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); lea $Hcub(%rsp), $b_ptr lea $res_x(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); mov $U2+8*0(%rsp), $t0 mov $U2+8*1(%rsp), $t1 mov $U2+8*2(%rsp), $t2 mov $U2+8*3(%rsp), $t3 lea $res_y(%rsp), $r_ptr call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x); mov $acc0, 8*0($r_ptr) # save the result, as mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't mov $acc2, 8*2($r_ptr) mov $acc3, 8*3($r_ptr) ___ } $code.=<<___; `&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub); `&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")` lea $res_y(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y); lea $S2(%rsp), $b_ptr lea $res_y(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2); movq %xmm0, $r_ptr # restore $r_ptr movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty); movdqa %xmm5, %xmm1 pandn $res_z(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_z+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_z(%rsp), %xmm2 pand $in2_z+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_z(%rsp), %xmm2 pand $in1_z+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x40($r_ptr) movdqu %xmm3, 0x50($r_ptr) movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); movdqa %xmm5, %xmm1 pandn $res_x(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_x+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_x(%rsp), %xmm2 pand $in2_x+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_x(%rsp), %xmm2 pand $in1_x+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x00($r_ptr) movdqu %xmm3, 0x10($r_ptr) movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); movdqa %xmm5, %xmm1 pandn $res_y(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_y+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_y(%rsp), %xmm2 pand $in2_y+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_y(%rsp), %xmm2 pand $in1_y+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x20($r_ptr) movdqu %xmm3, 0x30($r_ptr) .Ladd_done$x: lea 32*18+56(%rsp), %rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbx .cfi_restore %rbx mov -8(%rsi),%rbp .cfi_restore %rbp lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpoint_add${x}_epilogue: ret .cfi_endproc .size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx ___ } &gen_add("q"); sub gen_add_affine () { my $x = shift; my ($src0,$sfx,$bias); my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr, $res_x,$res_y,$res_z, $in1_x,$in1_y,$in1_z, $in2_x,$in2_y)=map(32*$_,(0..14)); my $Z1sqr = $S2; if ($x ne "x") { 
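# A small sketch of the pand/pandn/por selection used in the copy_conditional
# sequences above: with an all-ones/all-zeros mask, (mask & a) | (~mask & b)
# picks a when the mask is set and b otherwise, with no branch.  The 32-bit
# constants below are illustrative only.
use strict; use warnings;

sub copy_conditional {
    my ($b, $a, $mask) = @_;                 # returns $a if mask is all-ones, else $b
    return ($mask & $a) | ((~$mask & 0xffffffff) & $b);
}

my $in1infty = 0xffffffff;                   # pcmpeqd-style mask: in1_z was zero
my $res_z    = 0x11111111;
my $in2_z    = 0x22222222;
printf "res_z := %08x (in1 at infinity)\n", copy_conditional($res_z, $in2_z, $in1infty);
printf "res_z := %08x (in1 finite)\n",      copy_conditional($res_z, $in2_z, 0);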
$src0 = "%rax"; $sfx = ""; $bias = 0; $code.=<<___; .globl ecp_nistz256_point_add_affine .type ecp_nistz256_point_add_affine,\@function,3 .align 32 ecp_nistz256_point_add_affine: .cfi_startproc ___ $code.=<<___ if ($addx); mov \$0x80100, %ecx and OPENSSL_ia32cap_P+8(%rip), %ecx cmp \$0x80100, %ecx je .Lpoint_add_affinex ___ } else { $src0 = "%rdx"; $sfx = "x"; $bias = 128; $code.=<<___; .type ecp_nistz256_point_add_affinex,\@function,3 .align 32 ecp_nistz256_point_add_affinex: .cfi_startproc .Lpoint_add_affinex: ___ } $code.=<<___; push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$32*15+8, %rsp .cfi_adjust_cfa_offset 32*15+8 .Ladd_affine${x}_body: movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr mov $b_org, $b_ptr # reassign movdqu 0x10($a_ptr), %xmm1 movdqu 0x20($a_ptr), %xmm2 movdqu 0x30($a_ptr), %xmm3 movdqu 0x40($a_ptr), %xmm4 movdqu 0x50($a_ptr), %xmm5 mov 0x40+8*0($a_ptr), $src0 # load original in1_z mov 0x40+8*1($a_ptr), $acc6 mov 0x40+8*2($a_ptr), $acc7 mov 0x40+8*3($a_ptr), $acc0 movdqa %xmm0, $in1_x(%rsp) movdqa %xmm1, $in1_x+0x10(%rsp) movdqa %xmm2, $in1_y(%rsp) movdqa %xmm3, $in1_y+0x10(%rsp) movdqa %xmm4, $in1_z(%rsp) movdqa %xmm5, $in1_z+0x10(%rsp) por %xmm4, %xmm5 movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr pshufd \$0xb1, %xmm5, %xmm3 movdqu 0x10($b_ptr), %xmm1 movdqu 0x20($b_ptr), %xmm2 por %xmm3, %xmm5 movdqu 0x30($b_ptr), %xmm3 movdqa %xmm0, $in2_x(%rsp) pshufd \$0x1e, %xmm5, %xmm4 movdqa %xmm1, $in2_x+0x10(%rsp) por %xmm0, %xmm1 movq $r_ptr, %xmm0 # save $r_ptr movdqa %xmm2, $in2_y(%rsp) movdqa %xmm3, $in2_y+0x10(%rsp) por %xmm2, %xmm3 por %xmm4, %xmm5 pxor %xmm4, %xmm4 por %xmm1, %xmm3 lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid lea $Z1sqr(%rsp), $r_ptr # Z1^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z); pcmpeqd %xmm4, %xmm5 pshufd \$0xb1, %xmm3, %xmm4 mov 0x00($b_ptr), $src0 # $b_ptr is still valid #lea 0x00($b_ptr), $b_ptr mov $acc4, $acc1 # harmonize sqr output and mul input por %xmm3, %xmm4 pshufd \$0, %xmm5, %xmm5 # in1infty pshufd \$0x1e, %xmm4, %xmm3 mov $acc5, $acc2 por %xmm3, %xmm4 pxor %xmm3, %xmm3 mov $acc6, $acc3 pcmpeqd %xmm3, %xmm4 pshufd \$0, %xmm4, %xmm4 # in2infty lea $Z1sqr-$bias(%rsp), $a_ptr mov $acc7, $acc4 lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x); lea $in1_x(%rsp), $b_ptr lea $H(%rsp), $r_ptr # H = U2 - U1 call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x); `&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr # S2 = Z1^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z); `&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")` lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z); `&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y); lea $in1_y(%rsp), $b_ptr lea $R(%rsp), $r_ptr # R = S2 - S1 call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y); `&load_for_sqr("$H(%rsp)", "$src0")` lea $Hsqr(%rsp), $r_ptr # H^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H); `&load_for_sqr("$R(%rsp)", "$src0")` lea $Rsqr(%rsp), $r_ptr # R^2 call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R); `&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")` lea $Hcub(%rsp), $r_ptr # H^3 call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H); 
`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")` lea $U2(%rsp), $r_ptr # U1*H^2 call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr); ___ { ####################################################################### # operate in 4-5-0-1 "name space" that matches multiplication output # my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); my ($poly1, $poly3)=($acc6,$acc7); $code.=<<___; #lea $U2(%rsp), $a_ptr #lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2 #call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2); xor $t4, $t4 add $acc0, $acc0 # a0:a3+a0:a3 lea $Rsqr(%rsp), $a_ptr adc $acc1, $acc1 mov $acc0, $t0 adc $acc2, $acc2 adc $acc3, $acc3 mov $acc1, $t1 adc \$0, $t4 sub \$-1, $acc0 mov $acc2, $t2 sbb $poly1, $acc1 sbb \$0, $acc2 mov $acc3, $t3 sbb $poly3, $acc3 sbb \$0, $t4 cmovc $t0, $acc0 mov 8*0($a_ptr), $t0 cmovc $t1, $acc1 mov 8*1($a_ptr), $t1 cmovc $t2, $acc2 mov 8*2($a_ptr), $t2 cmovc $t3, $acc3 mov 8*3($a_ptr), $t3 call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr); lea $Hcub(%rsp), $b_ptr lea $res_x(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub); mov $U2+8*0(%rsp), $t0 mov $U2+8*1(%rsp), $t1 mov $U2+8*2(%rsp), $t2 mov $U2+8*3(%rsp), $t3 lea $H(%rsp), $r_ptr call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x); mov $acc0, 8*0($r_ptr) # save the result, as mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't mov $acc2, 8*2($r_ptr) mov $acc3, 8*3($r_ptr) ___ } $code.=<<___; `&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")` lea $S2(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y); `&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")` lea $H(%rsp), $r_ptr call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R); lea $S2(%rsp), $b_ptr lea $res_y(%rsp), $r_ptr call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2); movq %xmm0, $r_ptr # restore $r_ptr movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty); movdqa %xmm5, %xmm1 pandn $res_z(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_z+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand .LONE_mont(%rip), %xmm2 pand .LONE_mont+0x10(%rip), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_z(%rsp), %xmm2 pand $in1_z+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x40($r_ptr) movdqu %xmm3, 0x50($r_ptr) movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty); movdqa %xmm5, %xmm1 pandn $res_x(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_x+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_x(%rsp), %xmm2 pand $in2_x+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_x(%rsp), %xmm2 pand $in1_x+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqu %xmm2, 0x00($r_ptr) movdqu %xmm3, 0x10($r_ptr) movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty); movdqa %xmm5, %xmm1 pandn $res_y(%rsp), %xmm0 movdqa %xmm5, %xmm2 pandn $res_y+0x10(%rsp), %xmm1 movdqa %xmm5, %xmm3 pand $in2_y(%rsp), %xmm2 pand $in2_y+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, %xmm3 movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty); movdqa %xmm4, %xmm1 pandn %xmm2, %xmm0 movdqa %xmm4, %xmm2 pandn %xmm3, %xmm1 movdqa %xmm4, %xmm3 pand $in1_y(%rsp), %xmm2 pand $in1_y+0x10(%rsp), %xmm3 por %xmm0, %xmm2 por %xmm1, 
%xmm3 movdqu %xmm2, 0x20($r_ptr) movdqu %xmm3, 0x30($r_ptr) lea 32*15+56(%rsp), %rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbx .cfi_restore %rbx mov -8(%rsi),%rbp .cfi_restore %rbp lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Ladd_affine${x}_epilogue: ret .cfi_endproc .size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx ___ } &gen_add_affine("q"); ######################################################################## # AD*X magic # if ($addx) { { ######################################################################## # operate in 4-5-0-1 "name space" that matches multiplication output # my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3); $code.=<<___; .type __ecp_nistz256_add_tox,\@abi-omnipotent .align 32 __ecp_nistz256_add_tox: .cfi_startproc xor $t4, $t4 adc 8*0($b_ptr), $a0 adc 8*1($b_ptr), $a1 mov $a0, $t0 adc 8*2($b_ptr), $a2 adc 8*3($b_ptr), $a3 mov $a1, $t1 adc \$0, $t4 xor $t3, $t3 sbb \$-1, $a0 mov $a2, $t2 sbb $poly1, $a1 sbb \$0, $a2 mov $a3, $t3 sbb $poly3, $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox .type __ecp_nistz256_sub_fromx,\@abi-omnipotent .align 32 __ecp_nistz256_sub_fromx: .cfi_startproc xor $t4, $t4 sbb 8*0($b_ptr), $a0 sbb 8*1($b_ptr), $a1 mov $a0, $t0 sbb 8*2($b_ptr), $a2 sbb 8*3($b_ptr), $a3 mov $a1, $t1 sbb \$0, $t4 xor $t3, $t3 adc \$-1, $a0 mov $a2, $t2 adc $poly1, $a1 adc \$0, $a2 mov $a3, $t3 adc $poly3, $a3 bt \$0, $t4 cmovnc $t0, $a0 cmovnc $t1, $a1 mov $a0, 8*0($r_ptr) cmovnc $t2, $a2 mov $a1, 8*1($r_ptr) cmovnc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx .type __ecp_nistz256_subx,\@abi-omnipotent .align 32 __ecp_nistz256_subx: .cfi_startproc xor $t4, $t4 sbb $a0, $t0 sbb $a1, $t1 mov $t0, $a0 sbb $a2, $t2 sbb $a3, $t3 mov $t1, $a1 sbb \$0, $t4 xor $a3 ,$a3 adc \$-1, $t0 mov $t2, $a2 adc $poly1, $t1 adc \$0, $t2 mov $t3, $a3 adc $poly3, $t3 bt \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 cmovc $t2, $a2 cmovc $t3, $a3 ret .cfi_endproc .size __ecp_nistz256_subx,.-__ecp_nistz256_subx .type __ecp_nistz256_mul_by_2x,\@abi-omnipotent .align 32 __ecp_nistz256_mul_by_2x: .cfi_startproc xor $t4, $t4 adc $a0, $a0 # a0:a3+a0:a3 adc $a1, $a1 mov $a0, $t0 adc $a2, $a2 adc $a3, $a3 mov $a1, $t1 adc \$0, $t4 xor $t3, $t3 sbb \$-1, $a0 mov $a2, $t2 sbb $poly1, $a1 sbb \$0, $a2 mov $a3, $t3 sbb $poly3, $a3 sbb \$0, $t4 cmovc $t0, $a0 cmovc $t1, $a1 mov $a0, 8*0($r_ptr) cmovc $t2, $a2 mov $a1, 8*1($r_ptr) cmovc $t3, $a3 mov $a2, 8*2($r_ptr) mov $a3, 8*3($r_ptr) ret .cfi_endproc .size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x ___ } &gen_double("x"); &gen_add("x"); &gen_add_affine("x"); } }}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type short_handler,\@abi-omnipotent .align 16 short_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # 
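# A reference sketch of the reduction pattern in __ecp_nistz256_add_tox and
# __ecp_nistz256_sub_fromx above (helper names mine, inputs assumed already
# reduced below p): addition provisionally subtracts p and keeps that result
# unless it borrowed; subtraction adds p back only when the first step borrowed.
use strict; use warnings;
use Math::BigInt;

my $p = Math::BigInt->new("0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");

sub fe_add {                                 # (a + b) mod p
    my ($a, $b) = @_;
    my $t = $a + $b;
    return $t >= $p ? $t - $p : $t;          # asm: sbb chain, then cmovc the old value back
}

sub fe_sub {                                 # (a - b) mod p
    my ($a, $b) = @_;
    my $t = $a - $b;
    return $t < 0 ? $t + $p : $t;            # asm: adc chain, kept only if we borrowed
}

my $x = Math::BigInt->new("0x123456789abcdef");
die "add wrap broken" unless fe_add($p - 1, $x) == $x - 1;
die "sub wrap broken" unless fe_sub(Math::BigInt->new(0), $x) == $p - $x;
print "add/sub sketch OK\n";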
disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea 16(%rax),%rax mov -8(%rax),%r12 mov -16(%rax),%r13 mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 jmp .Lcommon_seh_tail .size short_handler,.-short_handler .type full_handler,\@abi-omnipotent .align 16 full_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 8(%r11),%r10d # HandlerData[2] lea (%rax,%r10),%rax mov -8(%rax),%rbp mov -16(%rax),%rbx mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size full_handler,.-full_handler .section .pdata .align 4 .rva .LSEH_begin_ecp_nistz256_mul_by_2 .rva .LSEH_end_ecp_nistz256_mul_by_2 .rva .LSEH_info_ecp_nistz256_mul_by_2 .rva .LSEH_begin_ecp_nistz256_div_by_2 .rva .LSEH_end_ecp_nistz256_div_by_2 .rva .LSEH_info_ecp_nistz256_div_by_2 .rva .LSEH_begin_ecp_nistz256_mul_by_3 .rva .LSEH_end_ecp_nistz256_mul_by_3 .rva .LSEH_info_ecp_nistz256_mul_by_3 .rva .LSEH_begin_ecp_nistz256_add .rva .LSEH_end_ecp_nistz256_add .rva .LSEH_info_ecp_nistz256_add .rva .LSEH_begin_ecp_nistz256_sub .rva .LSEH_end_ecp_nistz256_sub .rva .LSEH_info_ecp_nistz256_sub .rva .LSEH_begin_ecp_nistz256_neg .rva .LSEH_end_ecp_nistz256_neg .rva .LSEH_info_ecp_nistz256_neg .rva .LSEH_begin_ecp_nistz256_ord_mul_mont .rva .LSEH_end_ecp_nistz256_ord_mul_mont .rva .LSEH_info_ecp_nistz256_ord_mul_mont .rva .LSEH_begin_ecp_nistz256_ord_sqr_mont .rva .LSEH_end_ecp_nistz256_ord_sqr_mont .rva .LSEH_info_ecp_nistz256_ord_sqr_mont ___ $code.=<<___ if ($addx); .rva .LSEH_begin_ecp_nistz256_ord_mul_montx .rva .LSEH_end_ecp_nistz256_ord_mul_montx .rva .LSEH_info_ecp_nistz256_ord_mul_montx .rva 
.LSEH_begin_ecp_nistz256_ord_sqr_montx .rva .LSEH_end_ecp_nistz256_ord_sqr_montx .rva .LSEH_info_ecp_nistz256_ord_sqr_montx ___ $code.=<<___; .rva .LSEH_begin_ecp_nistz256_to_mont .rva .LSEH_end_ecp_nistz256_to_mont .rva .LSEH_info_ecp_nistz256_to_mont .rva .LSEH_begin_ecp_nistz256_mul_mont .rva .LSEH_end_ecp_nistz256_mul_mont .rva .LSEH_info_ecp_nistz256_mul_mont .rva .LSEH_begin_ecp_nistz256_sqr_mont .rva .LSEH_end_ecp_nistz256_sqr_mont .rva .LSEH_info_ecp_nistz256_sqr_mont .rva .LSEH_begin_ecp_nistz256_from_mont .rva .LSEH_end_ecp_nistz256_from_mont .rva .LSEH_info_ecp_nistz256_from_mont .rva .LSEH_begin_ecp_nistz256_gather_w5 .rva .LSEH_end_ecp_nistz256_gather_w5 .rva .LSEH_info_ecp_nistz256_gather_wX .rva .LSEH_begin_ecp_nistz256_gather_w7 .rva .LSEH_end_ecp_nistz256_gather_w7 .rva .LSEH_info_ecp_nistz256_gather_wX ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_ecp_nistz256_avx2_gather_w5 .rva .LSEH_end_ecp_nistz256_avx2_gather_w5 .rva .LSEH_info_ecp_nistz256_avx2_gather_wX .rva .LSEH_begin_ecp_nistz256_avx2_gather_w7 .rva .LSEH_end_ecp_nistz256_avx2_gather_w7 .rva .LSEH_info_ecp_nistz256_avx2_gather_wX ___ $code.=<<___; .rva .LSEH_begin_ecp_nistz256_point_double .rva .LSEH_end_ecp_nistz256_point_double .rva .LSEH_info_ecp_nistz256_point_double .rva .LSEH_begin_ecp_nistz256_point_add .rva .LSEH_end_ecp_nistz256_point_add .rva .LSEH_info_ecp_nistz256_point_add .rva .LSEH_begin_ecp_nistz256_point_add_affine .rva .LSEH_end_ecp_nistz256_point_add_affine .rva .LSEH_info_ecp_nistz256_point_add_affine ___ $code.=<<___ if ($addx); .rva .LSEH_begin_ecp_nistz256_point_doublex .rva .LSEH_end_ecp_nistz256_point_doublex .rva .LSEH_info_ecp_nistz256_point_doublex .rva .LSEH_begin_ecp_nistz256_point_addx .rva .LSEH_end_ecp_nistz256_point_addx .rva .LSEH_info_ecp_nistz256_point_addx .rva .LSEH_begin_ecp_nistz256_point_add_affinex .rva .LSEH_end_ecp_nistz256_point_add_affinex .rva .LSEH_info_ecp_nistz256_point_add_affinex ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_ecp_nistz256_mul_by_2: .byte 9,0,0,0 .rva short_handler .rva .Lmul_by_2_body,.Lmul_by_2_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_div_by_2: .byte 9,0,0,0 .rva short_handler .rva .Ldiv_by_2_body,.Ldiv_by_2_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_mul_by_3: .byte 9,0,0,0 .rva short_handler .rva .Lmul_by_3_body,.Lmul_by_3_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_add: .byte 9,0,0,0 .rva short_handler .rva .Ladd_body,.Ladd_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_sub: .byte 9,0,0,0 .rva short_handler .rva .Lsub_body,.Lsub_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_neg: .byte 9,0,0,0 .rva short_handler .rva .Lneg_body,.Lneg_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_ord_mul_mont: .byte 9,0,0,0 .rva full_handler .rva .Lord_mul_body,.Lord_mul_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_ord_sqr_mont: .byte 9,0,0,0 .rva full_handler .rva .Lord_sqr_body,.Lord_sqr_epilogue # HandlerData[] .long 48,0 ___ $code.=<<___ if ($addx); .LSEH_info_ecp_nistz256_ord_mul_montx: .byte 9,0,0,0 .rva full_handler .rva .Lord_mulx_body,.Lord_mulx_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_ord_sqr_montx: .byte 9,0,0,0 .rva full_handler .rva .Lord_sqrx_body,.Lord_sqrx_epilogue # HandlerData[] .long 48,0 ___ $code.=<<___; .LSEH_info_ecp_nistz256_to_mont: .byte 9,0,0,0 .rva full_handler .rva .Lmul_body,.Lmul_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_mul_mont: .byte 9,0,0,0 .rva full_handler .rva .Lmul_body,.Lmul_epilogue # HandlerData[] .long 48,0 
.LSEH_info_ecp_nistz256_sqr_mont: .byte 9,0,0,0 .rva full_handler .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[] .long 48,0 .LSEH_info_ecp_nistz256_from_mont: .byte 9,0,0,0 .rva short_handler .rva .Lfrom_body,.Lfrom_epilogue # HandlerData[] .LSEH_info_ecp_nistz256_gather_wX: .byte 0x01,0x33,0x16,0x00 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 .align 8 ___ $code.=<<___ if ($avx>1); .LSEH_info_ecp_nistz256_avx2_gather_wX: .byte 0x01,0x36,0x17,0x0b .byte 0x36,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15 .byte 0x31,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14 .byte 0x2c,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13 .byte 0x27,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12 .byte 0x22,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11 .byte 0x1d,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10 .byte 0x18,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9 .byte 0x13,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8 .byte 0x0e,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7 .byte 0x09,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6 .byte 0x04,0x01,0x15,0x00 # sub rsp,0xa8 .byte 0x00,0xb3,0x00,0x00 # set_frame r11 .align 8 ___ $code.=<<___; .LSEH_info_ecp_nistz256_point_double: .byte 9,0,0,0 .rva full_handler .rva .Lpoint_doubleq_body,.Lpoint_doubleq_epilogue # HandlerData[] .long 32*5+56,0 .LSEH_info_ecp_nistz256_point_add: .byte 9,0,0,0 .rva full_handler .rva .Lpoint_addq_body,.Lpoint_addq_epilogue # HandlerData[] .long 32*18+56,0 .LSEH_info_ecp_nistz256_point_add_affine: .byte 9,0,0,0 .rva full_handler .rva .Ladd_affineq_body,.Ladd_affineq_epilogue # HandlerData[] .long 32*15+56,0 ___ $code.=<<___ if ($addx); .align 8 .LSEH_info_ecp_nistz256_point_doublex: .byte 9,0,0,0 .rva full_handler .rva .Lpoint_doublex_body,.Lpoint_doublex_epilogue # HandlerData[] .long 32*5+56,0 .LSEH_info_ecp_nistz256_point_addx: .byte 9,0,0,0 .rva full_handler .rva .Lpoint_addx_body,.Lpoint_addx_epilogue # HandlerData[] .long 32*18+56,0 .LSEH_info_ecp_nistz256_point_add_affinex: .byte 9,0,0,0 .rva full_handler .rva .Ladd_affinex_body,.Ladd_affinex_epilogue # HandlerData[] .long 32*15+56,0 ___ } ######################################################################## # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7 # open TABLE,") { s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo; } close TABLE; die "insane number of elements" if ($#arr != 64*16*37-1); print <<___; .text .globl ecp_nistz256_precomputed .type ecp_nistz256_precomputed,\@object .align 4096 ecp_nistz256_precomputed: ___ while (@line=splice(@arr,0,16)) { print ".long\t",join(',',map { sprintf "0x%08x",$_} @line),"\n"; } print <<___; .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/ec/asm/x25519-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/ec/asm/x25519-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/ec/asm/x25519-x86_64.pl (revision 364963) @@ -1,1131 +1,1131 @@ 
#!/usr/bin/env perl # Copyright 2018-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # X25519 lower-level primitives for x86_64. # # February 2018. # # This module implements radix 2^51 multiplication and squaring, and # radix 2^64 multiplication, squaring, addition, subtraction and final # reduction. Latter radix is used on ADCX/ADOX-capable processors such # as Broadwell. On related note one should mention that there are # vector implementations that provide significantly better performance # on some processors(*), but they are large and overly complex. Which # in combination with them being effectively processor-specific makes # the undertaking hard to justify. The goal for this implementation # is rather versatility and simplicity [and ultimately formal # verification]. # # (*) For example sandy2x should provide ~30% improvement on Sandy # Bridge, but only nominal ~5% on Haswell [and big loss on # Broadwell and successors]. # ###################################################################### # Improvement coefficients: # # amd64-51(*) gcc-5.x(**) # # P4 +22% +40% # Sandy Bridge -3% +11% # Haswell -1% +13% # Broadwell(***) +30% +35% # Skylake(***) +33% +47% # Silvermont +20% +26% # Goldmont +40% +50% # Bulldozer +20% +9% # Ryzen(***) +43% +40% # VIA +170% +120% # # (*) amd64-51 is popular assembly implementation with 2^51 radix, # only multiplication and squaring subroutines were linked # for comparison, but not complete ladder step; gain on most # processors is because this module refrains from shld, and # minor regression on others is because this does result in # higher instruction count; # (**) compiler is free to inline functions, in assembly one would # need to implement ladder step to do that, and it will improve # performance by several percent; # (***) ADCX/ADOX result for 2^64 radix, there is no corresponding # C implementation, so that comparison is always against # 2^51 radix; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.23); } if (!$addx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $addx = ($1>=2.10); } if (!$addx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $addx = ($1>=12); } -if (!$addx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { +if (!$addx && `$ENV{CC} -v 2>&1` =~ 
/((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)/) { my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10 $addx = ($ver>=3.03); } $code.=<<___; .text .globl x25519_fe51_mul .type x25519_fe51_mul,\@function,3 .align 32 x25519_fe51_mul: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea -8*5(%rsp),%rsp .cfi_adjust_cfa_offset 40 .Lfe51_mul_body: mov 8*0(%rsi),%rax # f[0] mov 8*0(%rdx),%r11 # load g[0-4] mov 8*1(%rdx),%r12 mov 8*2(%rdx),%r13 mov 8*3(%rdx),%rbp mov 8*4(%rdx),%r14 mov %rdi,8*4(%rsp) # offload 1st argument mov %rax,%rdi mulq %r11 # f[0]*g[0] mov %r11,8*0(%rsp) # offload g[0] mov %rax,%rbx # %rbx:%rcx = h0 mov %rdi,%rax mov %rdx,%rcx mulq %r12 # f[0]*g[1] mov %r12,8*1(%rsp) # offload g[1] mov %rax,%r8 # %r8:%r9 = h1 mov %rdi,%rax lea (%r14,%r14,8),%r15 mov %rdx,%r9 mulq %r13 # f[0]*g[2] mov %r13,8*2(%rsp) # offload g[2] mov %rax,%r10 # %r10:%r11 = h2 mov %rdi,%rax lea (%r14,%r15,2),%rdi # g[4]*19 mov %rdx,%r11 mulq %rbp # f[0]*g[3] mov %rax,%r12 # %r12:%r13 = h3 mov 8*0(%rsi),%rax # f[0] mov %rdx,%r13 mulq %r14 # f[0]*g[4] mov %rax,%r14 # %r14:%r15 = h4 mov 8*1(%rsi),%rax # f[1] mov %rdx,%r15 mulq %rdi # f[1]*g[4]*19 add %rax,%rbx mov 8*2(%rsi),%rax # f[2] adc %rdx,%rcx mulq %rdi # f[2]*g[4]*19 add %rax,%r8 mov 8*3(%rsi),%rax # f[3] adc %rdx,%r9 mulq %rdi # f[3]*g[4]*19 add %rax,%r10 mov 8*4(%rsi),%rax # f[4] adc %rdx,%r11 mulq %rdi # f[4]*g[4]*19 imulq \$19,%rbp,%rdi # g[3]*19 add %rax,%r12 mov 8*1(%rsi),%rax # f[1] adc %rdx,%r13 mulq %rbp # f[1]*g[3] mov 8*2(%rsp),%rbp # g[2] add %rax,%r14 mov 8*2(%rsi),%rax # f[2] adc %rdx,%r15 mulq %rdi # f[2]*g[3]*19 add %rax,%rbx mov 8*3(%rsi),%rax # f[3] adc %rdx,%rcx mulq %rdi # f[3]*g[3]*19 add %rax,%r8 mov 8*4(%rsi),%rax # f[4] adc %rdx,%r9 mulq %rdi # f[4]*g[3]*19 imulq \$19,%rbp,%rdi # g[2]*19 add %rax,%r10 mov 8*1(%rsi),%rax # f[1] adc %rdx,%r11 mulq %rbp # f[1]*g[2] add %rax,%r12 mov 8*2(%rsi),%rax # f[2] adc %rdx,%r13 mulq %rbp # f[2]*g[2] mov 8*1(%rsp),%rbp # g[1] add %rax,%r14 mov 8*3(%rsi),%rax # f[3] adc %rdx,%r15 mulq %rdi # f[3]*g[2]*19 add %rax,%rbx mov 8*4(%rsi),%rax # f[3] adc %rdx,%rcx mulq %rdi # f[4]*g[2]*19 add %rax,%r8 mov 8*1(%rsi),%rax # f[1] adc %rdx,%r9 mulq %rbp # f[1]*g[1] imulq \$19,%rbp,%rdi add %rax,%r10 mov 8*2(%rsi),%rax # f[2] adc %rdx,%r11 mulq %rbp # f[2]*g[1] add %rax,%r12 mov 8*3(%rsi),%rax # f[3] adc %rdx,%r13 mulq %rbp # f[3]*g[1] mov 8*0(%rsp),%rbp # g[0] add %rax,%r14 mov 8*4(%rsi),%rax # f[4] adc %rdx,%r15 mulq %rdi # f[4]*g[1]*19 add %rax,%rbx mov 8*1(%rsi),%rax # f[1] adc %rdx,%rcx mul %rbp # f[1]*g[0] add %rax,%r8 mov 8*2(%rsi),%rax # f[2] adc %rdx,%r9 mul %rbp # f[2]*g[0] add %rax,%r10 mov 8*3(%rsi),%rax # f[3] adc %rdx,%r11 mul %rbp # f[3]*g[0] add %rax,%r12 mov 8*4(%rsi),%rax # f[4] adc %rdx,%r13 mulq %rbp # f[4]*g[0] add %rax,%r14 adc %rdx,%r15 mov 8*4(%rsp),%rdi # restore 1st argument jmp .Lreduce51 .Lfe51_mul_epilogue: .cfi_endproc .size x25519_fe51_mul,.-x25519_fe51_mul .globl x25519_fe51_sqr .type x25519_fe51_sqr,\@function,2 .align 32 x25519_fe51_sqr: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea -8*5(%rsp),%rsp .cfi_adjust_cfa_offset 40 .Lfe51_sqr_body: mov 8*0(%rsi),%rax # g[0] mov 8*2(%rsi),%r15 # g[2] mov 8*4(%rsi),%rbp # g[4] mov %rdi,8*4(%rsp) # offload 1st argument lea (%rax,%rax),%r14 mulq %rax # g[0]*g[0] mov %rax,%rbx mov 
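# A small sketch of why the compiler probe above was relaxed in this revision
# (the '^' anchor dropped from '^clang'): version banners that carry a vendor
# prefix, such as FreeBSD's "FreeBSD clang version ...", never match the
# anchored form, so $addx/$avx stay 0 and the faster code paths are skipped.
# The banner strings below are illustrative.
use strict; use warnings;

for my $banner ("clang version 10.0.1",
                "FreeBSD clang version 10.0.1",
                "Apple LLVM version 9.1.0 (clang-902.0.39.2)") {
    my $old = ($banner =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) ? $2 : "miss";
    my $new = ($banner =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/)  ? $2 : "miss";
    printf "%-45.45s anchored=%-5s relaxed=%s\n", $banner, $old, $new;
}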
8*1(%rsi),%rax # g[1] mov %rdx,%rcx mulq %r14 # 2*g[0]*g[1] mov %rax,%r8 mov %r15,%rax mov %r15,8*0(%rsp) # offload g[2] mov %rdx,%r9 mulq %r14 # 2*g[0]*g[2] mov %rax,%r10 mov 8*3(%rsi),%rax mov %rdx,%r11 imulq \$19,%rbp,%rdi # g[4]*19 mulq %r14 # 2*g[0]*g[3] mov %rax,%r12 mov %rbp,%rax mov %rdx,%r13 mulq %r14 # 2*g[0]*g[4] mov %rax,%r14 mov %rbp,%rax mov %rdx,%r15 mulq %rdi # g[4]*g[4]*19 add %rax,%r12 mov 8*1(%rsi),%rax # g[1] adc %rdx,%r13 mov 8*3(%rsi),%rsi # g[3] lea (%rax,%rax),%rbp mulq %rax # g[1]*g[1] add %rax,%r10 mov 8*0(%rsp),%rax # g[2] adc %rdx,%r11 mulq %rbp # 2*g[1]*g[2] add %rax,%r12 mov %rbp,%rax adc %rdx,%r13 mulq %rsi # 2*g[1]*g[3] add %rax,%r14 mov %rbp,%rax adc %rdx,%r15 imulq \$19,%rsi,%rbp # g[3]*19 mulq %rdi # 2*g[1]*g[4]*19 add %rax,%rbx lea (%rsi,%rsi),%rax adc %rdx,%rcx mulq %rdi # 2*g[3]*g[4]*19 add %rax,%r10 mov %rsi,%rax adc %rdx,%r11 mulq %rbp # g[3]*g[3]*19 add %rax,%r8 mov 8*0(%rsp),%rax # g[2] adc %rdx,%r9 lea (%rax,%rax),%rsi mulq %rax # g[2]*g[2] add %rax,%r14 mov %rbp,%rax adc %rdx,%r15 mulq %rsi # 2*g[2]*g[3]*19 add %rax,%rbx mov %rsi,%rax adc %rdx,%rcx mulq %rdi # 2*g[2]*g[4]*19 add %rax,%r8 adc %rdx,%r9 mov 8*4(%rsp),%rdi # restore 1st argument jmp .Lreduce51 .align 32 .Lreduce51: mov \$0x7ffffffffffff,%rbp mov %r10,%rdx shr \$51,%r10 shl \$13,%r11 and %rbp,%rdx # %rdx = g2 = h2 & mask or %r10,%r11 # h2>>51 add %r11,%r12 adc \$0,%r13 # h3 += h2>>51 mov %rbx,%rax shr \$51,%rbx shl \$13,%rcx and %rbp,%rax # %rax = g0 = h0 & mask or %rbx,%rcx # h0>>51 add %rcx,%r8 # h1 += h0>>51 adc \$0,%r9 mov %r12,%rbx shr \$51,%r12 shl \$13,%r13 and %rbp,%rbx # %rbx = g3 = h3 & mask or %r12,%r13 # h3>>51 add %r13,%r14 # h4 += h3>>51 adc \$0,%r15 mov %r8,%rcx shr \$51,%r8 shl \$13,%r9 and %rbp,%rcx # %rcx = g1 = h1 & mask or %r8,%r9 add %r9,%rdx # g2 += h1>>51 mov %r14,%r10 shr \$51,%r14 shl \$13,%r15 and %rbp,%r10 # %r10 = g4 = h0 & mask or %r14,%r15 # h0>>51 lea (%r15,%r15,8),%r14 lea (%r15,%r14,2),%r15 add %r15,%rax # g0 += (h0>>51)*19 mov %rdx,%r8 and %rbp,%rdx # g2 &= mask shr \$51,%r8 add %r8,%rbx # g3 += g2>>51 mov %rax,%r9 and %rbp,%rax # g0 &= mask shr \$51,%r9 add %r9,%rcx # g1 += g0>>51 mov %rax,8*0(%rdi) # save the result mov %rcx,8*1(%rdi) mov %rdx,8*2(%rdi) mov %rbx,8*3(%rdi) mov %r10,8*4(%rdi) mov 8*5(%rsp),%r15 .cfi_restore %r15 mov 8*6(%rsp),%r14 .cfi_restore %r14 mov 8*7(%rsp),%r13 .cfi_restore %r13 mov 8*8(%rsp),%r12 .cfi_restore %r12 mov 8*9(%rsp),%rbx .cfi_restore %rbx mov 8*10(%rsp),%rbp .cfi_restore %rbp lea 8*11(%rsp),%rsp .cfi_adjust_cfa_offset 88 .Lfe51_sqr_epilogue: ret .cfi_endproc .size x25519_fe51_sqr,.-x25519_fe51_sqr .globl x25519_fe51_mul121666 .type x25519_fe51_mul121666,\@function,2 .align 32 x25519_fe51_mul121666: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 lea -8*5(%rsp),%rsp .cfi_adjust_cfa_offset 40 .Lfe51_mul121666_body: mov \$121666,%eax mulq 8*0(%rsi) mov %rax,%rbx # %rbx:%rcx = h0 mov \$121666,%eax mov %rdx,%rcx mulq 8*1(%rsi) mov %rax,%r8 # %r8:%r9 = h1 mov \$121666,%eax mov %rdx,%r9 mulq 8*2(%rsi) mov %rax,%r10 # %r10:%r11 = h2 mov \$121666,%eax mov %rdx,%r11 mulq 8*3(%rsi) mov %rax,%r12 # %r12:%r13 = h3 mov \$121666,%eax # f[0] mov %rdx,%r13 mulq 8*4(%rsi) mov %rax,%r14 # %r14:%r15 = h4 mov %rdx,%r15 jmp .Lreduce51 .Lfe51_mul121666_epilogue: .cfi_endproc .size x25519_fe51_mul121666,.-x25519_fe51_mul121666 ___ ######################################################################## # Base 2^64 
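# A reference sketch of what .Lreduce51 above achieves: a value held as five
# radix-2^51 limbs (oversized after a multiply) is brought back to roughly
# 51-bit limbs by carrying each limb into the next and folding the carry out
# of the top limb back into limb 0 times 19, since 2^255 = 19 (mod 2^255-19).
# The asm interleaves the carries differently and does one extra partial pass,
# but the invariant is the same: the value modulo 2^255-19 is unchanged.
use strict; use warnings;
use Math::BigInt;

my $q    = (Math::BigInt->new(1) << 255) - 19;
my $mask = (Math::BigInt->new(1) << 51) - 1;

sub limbs_to_int { my $v = Math::BigInt->new(0); $v += $_[$_] << (51 * $_) for 0 .. 4; return $v; }

sub reduce51 {
    my @h = @_;
    for my $i (0 .. 3) {                     # carry limb i into limb i+1
        $h[$i + 1] += $h[$i] >> 51;
        $h[$i]      = $h[$i] & $mask;
    }
    $h[0] += ($h[4] >> 51) * 19;             # fold the top carry back, times 19
    $h[4]  = $h[4] & $mask;
    return @h;
}

my @h = map { Math::BigInt->new($_) }
        ("0xfedcba9876543210", "0x0123456789abcdef", "0xdeadbeefcafebabe",
         "0x0f1e2d3c4b5a6978", "0x8877665544332211");
my $before = limbs_to_int(@h) % $q;
my @r      = reduce51(@h);
my $after  = limbs_to_int(@r) % $q;
die "reduce51 sketch broken" unless $before == $after;
print "reduce51 sketch OK\n";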
subroutines modulo 2*(2^255-19) # if ($addx) { my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7) = map("%r$_",(8..15)); $code.=<<___; .extern OPENSSL_ia32cap_P .globl x25519_fe64_eligible .type x25519_fe64_eligible,\@abi-omnipotent .align 32 x25519_fe64_eligible: .cfi_startproc mov OPENSSL_ia32cap_P+8(%rip),%ecx xor %eax,%eax and \$0x80100,%ecx cmp \$0x80100,%ecx cmove %ecx,%eax ret .cfi_endproc .size x25519_fe64_eligible,.-x25519_fe64_eligible .globl x25519_fe64_mul .type x25519_fe64_mul,\@function,3 .align 32 x25519_fe64_mul: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push %rdi # offload dst .cfi_push %rdi lea -8*2(%rsp),%rsp .cfi_adjust_cfa_offset 16 .Lfe64_mul_body: mov %rdx,%rax mov 8*0(%rdx),%rbp # b[0] mov 8*0(%rsi),%rdx # a[0] mov 8*1(%rax),%rcx # b[1] mov 8*2(%rax),$acc6 # b[2] mov 8*3(%rax),$acc7 # b[3] mulx %rbp,$acc0,%rax # a[0]*b[0] xor %edi,%edi # cf=0,of=0 mulx %rcx,$acc1,%rbx # a[0]*b[1] adcx %rax,$acc1 mulx $acc6,$acc2,%rax # a[0]*b[2] adcx %rbx,$acc2 mulx $acc7,$acc3,$acc4 # a[0]*b[3] mov 8*1(%rsi),%rdx # a[1] adcx %rax,$acc3 mov $acc6,(%rsp) # offload b[2] adcx %rdi,$acc4 # cf=0 mulx %rbp,%rax,%rbx # a[1]*b[0] adox %rax,$acc1 adcx %rbx,$acc2 mulx %rcx,%rax,%rbx # a[1]*b[1] adox %rax,$acc2 adcx %rbx,$acc3 mulx $acc6,%rax,%rbx # a[1]*b[2] adox %rax,$acc3 adcx %rbx,$acc4 mulx $acc7,%rax,$acc5 # a[1]*b[3] mov 8*2(%rsi),%rdx # a[2] adox %rax,$acc4 adcx %rdi,$acc5 # cf=0 adox %rdi,$acc5 # of=0 mulx %rbp,%rax,%rbx # a[2]*b[0] adcx %rax,$acc2 adox %rbx,$acc3 mulx %rcx,%rax,%rbx # a[2]*b[1] adcx %rax,$acc3 adox %rbx,$acc4 mulx $acc6,%rax,%rbx # a[2]*b[2] adcx %rax,$acc4 adox %rbx,$acc5 mulx $acc7,%rax,$acc6 # a[2]*b[3] mov 8*3(%rsi),%rdx # a[3] adcx %rax,$acc5 adox %rdi,$acc6 # of=0 adcx %rdi,$acc6 # cf=0 mulx %rbp,%rax,%rbx # a[3]*b[0] adox %rax,$acc3 adcx %rbx,$acc4 mulx %rcx,%rax,%rbx # a[3]*b[1] adox %rax,$acc4 adcx %rbx,$acc5 mulx (%rsp),%rax,%rbx # a[3]*b[2] adox %rax,$acc5 adcx %rbx,$acc6 mulx $acc7,%rax,$acc7 # a[3]*b[3] mov \$38,%edx adox %rax,$acc6 adcx %rdi,$acc7 # cf=0 adox %rdi,$acc7 # of=0 jmp .Lreduce64 .Lfe64_mul_epilogue: .cfi_endproc .size x25519_fe64_mul,.-x25519_fe64_mul .globl x25519_fe64_sqr .type x25519_fe64_sqr,\@function,2 .align 32 x25519_fe64_sqr: .cfi_startproc push %rbp .cfi_push %rbp push %rbx .cfi_push %rbx push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 push %rdi # offload dst .cfi_push %rdi lea -8*2(%rsp),%rsp .cfi_adjust_cfa_offset 16 .Lfe64_sqr_body: mov 8*0(%rsi),%rdx # a[0] mov 8*1(%rsi),%rcx # a[1] mov 8*2(%rsi),%rbp # a[2] mov 8*3(%rsi),%rsi # a[3] ################################################################ mulx %rdx,$acc0,$acc7 # a[0]*a[0] mulx %rcx,$acc1,%rax # a[0]*a[1] xor %edi,%edi # cf=0,of=0 mulx %rbp,$acc2,%rbx # a[0]*a[2] adcx %rax,$acc2 mulx %rsi,$acc3,$acc4 # a[0]*a[3] mov %rcx,%rdx # a[1] adcx %rbx,$acc3 adcx %rdi,$acc4 # cf=0 ################################################################ mulx %rbp,%rax,%rbx # a[1]*a[2] adox %rax,$acc3 adcx %rbx,$acc4 mulx %rsi,%rax,$acc5 # a[1]*a[3] mov %rbp,%rdx # a[2] adox %rax,$acc4 adcx %rdi,$acc5 ################################################################ mulx %rsi,%rax,$acc6 # a[2]*a[3] mov %rcx,%rdx # a[1] adox %rax,$acc5 adcx %rdi,$acc6 # cf=0 adox %rdi,$acc6 # of=0 adcx $acc1,$acc1 # acc1:6<<1 adox $acc7,$acc1 adcx $acc2,$acc2 mulx %rdx,%rax,%rbx # a[1]*a[1] mov %rbp,%rdx # a[2] adcx $acc3,$acc3 adox 
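# A reference sketch of the "mov \$38,%edx ... jmp .Lreduce64" folding above:
# because 2^256 = 2*(2^255-19) + 38, the high 256 bits of a 512-bit product can
# be folded into the low half multiplied by 38, and one more tiny fold absorbs
# the remaining carry.  As the header says, values stay only partially reduced
# (modulo 2*(2^255-19)); the sketch checks congruence modulo 2^255-19.
use strict; use warnings;
use Math::BigInt;

my $q    = (Math::BigInt->new(1) << 255) - 19;
my $mask = (Math::BigInt->new(1) << 256) - 1;

sub fold38 {
    my ($wide) = @_;                           # up to 512 bits
    my $lo = $wide & $mask;
    my $hi = $wide >> 256;
    my $t  = $lo + $hi * 38;                   # may still spill past 2^256
    return ($t & $mask) + ($t >> 256) * 38;    # second, much smaller fold
}

my $a    = $q - 1;
my $b    = Math::BigInt->new("0x5555555555555555555555555555555555555555555555555555555555555555");
my $wide = $a * $b;
die "fold38 sketch broken" unless fold38($wide) % $q == $wide % $q;
printf "fold38: %d-bit product folded to %d bits, residue mod 2^255-19 preserved\n",
       length($wide->as_bin()) - 2, length(fold38($wide)->as_bin()) - 2;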
%rax,$acc2 adcx $acc4,$acc4 adox %rbx,$acc3 mulx %rdx,%rax,%rbx # a[2]*a[2] mov %rsi,%rdx # a[3] adcx $acc5,$acc5 adox %rax,$acc4 adcx $acc6,$acc6 adox %rbx,$acc5 mulx %rdx,%rax,$acc7 # a[3]*a[3] mov \$38,%edx adox %rax,$acc6 adcx %rdi,$acc7 # cf=0 adox %rdi,$acc7 # of=0 jmp .Lreduce64 .align 32 .Lreduce64: mulx $acc4,%rax,%rbx adcx %rax,$acc0 adox %rbx,$acc1 mulx $acc5,%rax,%rbx adcx %rax,$acc1 adox %rbx,$acc2 mulx $acc6,%rax,%rbx adcx %rax,$acc2 adox %rbx,$acc3 mulx $acc7,%rax,$acc4 adcx %rax,$acc3 adox %rdi,$acc4 adcx %rdi,$acc4 mov 8*2(%rsp),%rdi # restore dst imulq %rdx,$acc4 add $acc4,$acc0 adc \$0,$acc1 adc \$0,$acc2 adc \$0,$acc3 sbb %rax,%rax # cf -> mask and \$38,%rax add %rax,$acc0 mov $acc1,8*1(%rdi) mov $acc2,8*2(%rdi) mov $acc3,8*3(%rdi) mov $acc0,8*0(%rdi) mov 8*3(%rsp),%r15 .cfi_restore %r15 mov 8*4(%rsp),%r14 .cfi_restore %r14 mov 8*5(%rsp),%r13 .cfi_restore %r13 mov 8*6(%rsp),%r12 .cfi_restore %r12 mov 8*7(%rsp),%rbx .cfi_restore %rbx mov 8*8(%rsp),%rbp .cfi_restore %rbp lea 8*9(%rsp),%rsp .cfi_adjust_cfa_offset 88 .Lfe64_sqr_epilogue: ret .cfi_endproc .size x25519_fe64_sqr,.-x25519_fe64_sqr .globl x25519_fe64_mul121666 .type x25519_fe64_mul121666,\@function,2 .align 32 x25519_fe64_mul121666: .Lfe64_mul121666_body: .cfi_startproc mov \$121666,%edx mulx 8*0(%rsi),$acc0,%rcx mulx 8*1(%rsi),$acc1,%rax add %rcx,$acc1 mulx 8*2(%rsi),$acc2,%rcx adc %rax,$acc2 mulx 8*3(%rsi),$acc3,%rax adc %rcx,$acc3 adc \$0,%rax imulq \$38,%rax,%rax add %rax,$acc0 adc \$0,$acc1 adc \$0,$acc2 adc \$0,$acc3 sbb %rax,%rax # cf -> mask and \$38,%rax add %rax,$acc0 mov $acc1,8*1(%rdi) mov $acc2,8*2(%rdi) mov $acc3,8*3(%rdi) mov $acc0,8*0(%rdi) .Lfe64_mul121666_epilogue: ret .cfi_endproc .size x25519_fe64_mul121666,.-x25519_fe64_mul121666 .globl x25519_fe64_add .type x25519_fe64_add,\@function,3 .align 32 x25519_fe64_add: .Lfe64_add_body: .cfi_startproc mov 8*0(%rsi),$acc0 mov 8*1(%rsi),$acc1 mov 8*2(%rsi),$acc2 mov 8*3(%rsi),$acc3 add 8*0(%rdx),$acc0 adc 8*1(%rdx),$acc1 adc 8*2(%rdx),$acc2 adc 8*3(%rdx),$acc3 sbb %rax,%rax # cf -> mask and \$38,%rax add %rax,$acc0 adc \$0,$acc1 adc \$0,$acc2 mov $acc1,8*1(%rdi) adc \$0,$acc3 mov $acc2,8*2(%rdi) sbb %rax,%rax # cf -> mask mov $acc3,8*3(%rdi) and \$38,%rax add %rax,$acc0 mov $acc0,8*0(%rdi) .Lfe64_add_epilogue: ret .cfi_endproc .size x25519_fe64_add,.-x25519_fe64_add .globl x25519_fe64_sub .type x25519_fe64_sub,\@function,3 .align 32 x25519_fe64_sub: .Lfe64_sub_body: .cfi_startproc mov 8*0(%rsi),$acc0 mov 8*1(%rsi),$acc1 mov 8*2(%rsi),$acc2 mov 8*3(%rsi),$acc3 sub 8*0(%rdx),$acc0 sbb 8*1(%rdx),$acc1 sbb 8*2(%rdx),$acc2 sbb 8*3(%rdx),$acc3 sbb %rax,%rax # cf -> mask and \$38,%rax sub %rax,$acc0 sbb \$0,$acc1 sbb \$0,$acc2 mov $acc1,8*1(%rdi) sbb \$0,$acc3 mov $acc2,8*2(%rdi) sbb %rax,%rax # cf -> mask mov $acc3,8*3(%rdi) and \$38,%rax sub %rax,$acc0 mov $acc0,8*0(%rdi) .Lfe64_sub_epilogue: ret .cfi_endproc .size x25519_fe64_sub,.-x25519_fe64_sub .globl x25519_fe64_tobytes .type x25519_fe64_tobytes,\@function,2 .align 32 x25519_fe64_tobytes: .Lfe64_to_body: .cfi_startproc mov 8*0(%rsi),$acc0 mov 8*1(%rsi),$acc1 mov 8*2(%rsi),$acc2 mov 8*3(%rsi),$acc3 ################################# reduction modulo 2^255-19 lea ($acc3,$acc3),%rax sar \$63,$acc3 # most significant bit -> mask shr \$1,%rax # most significant bit cleared and \$19,$acc3 add \$19,$acc3 # compare to modulus in the same go add $acc3,$acc0 adc \$0,$acc1 adc \$0,$acc2 adc \$0,%rax lea (%rax,%rax),$acc3 sar \$63,%rax # most significant bit -> mask shr \$1,$acc3 # most significant bit cleared 
not %rax and \$19,%rax sub %rax,$acc0 sbb \$0,$acc1 sbb \$0,$acc2 sbb \$0,$acc3 mov $acc0,8*0(%rdi) mov $acc1,8*1(%rdi) mov $acc2,8*2(%rdi) mov $acc3,8*3(%rdi) .Lfe64_to_epilogue: ret .cfi_endproc .size x25519_fe64_tobytes,.-x25519_fe64_tobytes ___ } else { $code.=<<___; .globl x25519_fe64_eligible .type x25519_fe64_eligible,\@abi-omnipotent .align 32 x25519_fe64_eligible: .cfi_startproc xor %eax,%eax ret .cfi_endproc .size x25519_fe64_eligible,.-x25519_fe64_eligible .globl x25519_fe64_mul .type x25519_fe64_mul,\@abi-omnipotent .globl x25519_fe64_sqr .globl x25519_fe64_mul121666 .globl x25519_fe64_add .globl x25519_fe64_sub .globl x25519_fe64_tobytes x25519_fe64_mul: x25519_fe64_sqr: x25519_fe64_mul121666: x25519_fe64_add: x25519_fe64_sub: x25519_fe64_tobytes: .cfi_startproc .byte 0x0f,0x0b # ud2 ret .cfi_endproc .size x25519_fe64_mul,.-x25519_fe64_mul ___ } $code.=<<___; .asciz "X25519 primitives for x86_64, CRYPTOGAMS by " ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type short_handler,\@abi-omnipotent .align 16 short_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRsp jmp .Lcommon_seh_tail .size short_handler,.-short_handler .type full_handler,\@abi-omnipotent .align 16 full_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 8(%r11),%r10d # HandlerData[2] lea (%rax,%r10),%rax mov -8(%rax),%rbp mov -16(%rax),%rbx mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop 
%r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size full_handler,.-full_handler .section .pdata .align 4 .rva .LSEH_begin_x25519_fe51_mul .rva .LSEH_end_x25519_fe51_mul .rva .LSEH_info_x25519_fe51_mul .rva .LSEH_begin_x25519_fe51_sqr .rva .LSEH_end_x25519_fe51_sqr .rva .LSEH_info_x25519_fe51_sqr .rva .LSEH_begin_x25519_fe51_mul121666 .rva .LSEH_end_x25519_fe51_mul121666 .rva .LSEH_info_x25519_fe51_mul121666 ___ $code.=<<___ if ($addx); .rva .LSEH_begin_x25519_fe64_mul .rva .LSEH_end_x25519_fe64_mul .rva .LSEH_info_x25519_fe64_mul .rva .LSEH_begin_x25519_fe64_sqr .rva .LSEH_end_x25519_fe64_sqr .rva .LSEH_info_x25519_fe64_sqr .rva .LSEH_begin_x25519_fe64_mul121666 .rva .LSEH_end_x25519_fe64_mul121666 .rva .LSEH_info_x25519_fe64_mul121666 .rva .LSEH_begin_x25519_fe64_add .rva .LSEH_end_x25519_fe64_add .rva .LSEH_info_x25519_fe64_add .rva .LSEH_begin_x25519_fe64_sub .rva .LSEH_end_x25519_fe64_sub .rva .LSEH_info_x25519_fe64_sub .rva .LSEH_begin_x25519_fe64_tobytes .rva .LSEH_end_x25519_fe64_tobytes .rva .LSEH_info_x25519_fe64_tobytes ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_x25519_fe51_mul: .byte 9,0,0,0 .rva full_handler .rva .Lfe51_mul_body,.Lfe51_mul_epilogue # HandlerData[] .long 88,0 .LSEH_info_x25519_fe51_sqr: .byte 9,0,0,0 .rva full_handler .rva .Lfe51_sqr_body,.Lfe51_sqr_epilogue # HandlerData[] .long 88,0 .LSEH_info_x25519_fe51_mul121666: .byte 9,0,0,0 .rva full_handler .rva .Lfe51_mul121666_body,.Lfe51_mul121666_epilogue # HandlerData[] .long 88,0 ___ $code.=<<___ if ($addx); .LSEH_info_x25519_fe64_mul: .byte 9,0,0,0 .rva full_handler .rva .Lfe64_mul_body,.Lfe64_mul_epilogue # HandlerData[] .long 72,0 .LSEH_info_x25519_fe64_sqr: .byte 9,0,0,0 .rva full_handler .rva .Lfe64_sqr_body,.Lfe64_sqr_epilogue # HandlerData[] .long 72,0 .LSEH_info_x25519_fe64_mul121666: .byte 9,0,0,0 .rva short_handler .rva .Lfe64_mul121666_body,.Lfe64_mul121666_epilogue # HandlerData[] .LSEH_info_x25519_fe64_add: .byte 9,0,0,0 .rva short_handler .rva .Lfe64_add_body,.Lfe64_add_epilogue # HandlerData[] .LSEH_info_x25519_fe64_sub: .byte 9,0,0,0 .rva short_handler .rva .Lfe64_sub_body,.Lfe64_sub_epilogue # HandlerData[] .LSEH_info_x25519_fe64_tobytes: .byte 9,0,0,0 .rva short_handler .rva .Lfe64_to_body,.Lfe64_to_epilogue # HandlerData[] ___ } $code =~ s/\`([^\`]*)\`/eval $1/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/modes/asm/aesni-gcm-x86_64.pl (revision 364963) @@ -1,1107 +1,1107 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # # AES-NI-CTR+GHASH stitch. 
# # February 2013 # # OpenSSL GCM implementation is organized in such way that its # performance is rather close to the sum of its streamed components, # in the context parallelized AES-NI CTR and modulo-scheduled # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation # was observed to perform significantly better than the sum of the # components on contemporary CPUs, the effort was deemed impossible to # justify. This module is based on combination of Intel submissions, # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max # Locktyukhin of Intel Corp. who verified that it reduces shuffles # pressure with notable relative improvement, achieving 1.0 cycle per # byte processed with 128-bit key on Haswell processor, 0.74 - on # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled # measurements for favourable packet size, one divisible by 96. # Applications using the EVP interface will observe a few percent # worse performance.] # # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). # # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.20) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if ($avx>1) {{{ ($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); ($Ii,$T1,$T2,$Hkey, $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8)); ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15)); ($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15"); $code=<<___; .text .type _aesni_ctr32_ghash_6x,\@abi-omnipotent .align 32 _aesni_ctr32_ghash_6x: .cfi_startproc vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb sub \$6,$len vpxor $Z0,$Z0,$Z0 # $Z0 = 0 vmovdqu 0x00-0x80($key),$rndkey vpaddb $T2,$T1,$inout1 vpaddb $T2,$inout1,$inout2 vpaddb $T2,$inout2,$inout3 vpaddb $T2,$inout3,$inout4 vpaddb $T2,$inout4,$inout5 vpxor $rndkey,$T1,$inout0 vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0 jmp .Loop6x .align 32 .Loop6x: add \$`6<<24`,$counter jc .Lhandle_ctr32 # discard $inout[1-5]? 
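# A small sketch of the "add \$`6<<24`,$counter / jc .Lhandle_ctr32" test at
# the top of the loop above, as I read it: the 32-bit counter word is kept as
# loaded (byte-swapped relative to the big-endian CTR value), so the counter's
# least-significant byte sits in the top byte of the register.  Adding 6<<24
# therefore carries out of bit 31 exactly when stepping the counter by 6 would
# wrap that byte -- the one case the cheap per-byte vpaddb increments in the
# main path cannot handle, hence the detour through .Lhandle_ctr32.
use strict; use warnings;

for my $last_byte (0 .. 255) {
    my $raw   = $last_byte << 24;                  # the low three bytes cannot
    my $carry = ($raw + (6 << 24)) > 0xffffffff;   # contribute a carry when adding 6<<24
    my $wraps = ($last_byte + 6) > 255;
    die "mismatch at byte $last_byte" if (!!$carry) != (!!$wraps);
}
print "counter-wrap shortcut agrees with a real byte-wise increment\n";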
vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 vpaddb $T2,$inout5,$T1 # next counter value vpxor $rndkey,$inout1,$inout1 vpxor $rndkey,$inout2,$inout2 .Lresume_ctr32: vmovdqu $T1,($ivp) # save next counter value vpclmulqdq \$0x10,$Hkey,$Z3,$Z1 vpxor $rndkey,$inout3,$inout3 vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey vpclmulqdq \$0x01,$Hkey,$Z3,$Z2 xor %r12,%r12 cmp $in0,$end0 vaesenc $T2,$inout0,$inout0 vmovdqu 0x30+8(%rsp),$Ii # I[4] vpxor $rndkey,$inout4,$inout4 vpclmulqdq \$0x00,$Hkey,$Z3,$T1 vaesenc $T2,$inout1,$inout1 vpxor $rndkey,$inout5,$inout5 setnc %r12b vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 vaesenc $T2,$inout2,$inout2 vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2 neg %r12 vaesenc $T2,$inout3,$inout3 vpxor $Z1,$Z2,$Z2 vpclmulqdq \$0x00,$Hkey,$Ii,$Z1 vpxor $Z0,$Xi,$Xi # modulo-scheduled vaesenc $T2,$inout4,$inout4 vpxor $Z1,$T1,$Z0 and \$0x60,%r12 vmovups 0x20-0x80($key),$rndkey vpclmulqdq \$0x10,$Hkey,$Ii,$T1 vaesenc $T2,$inout5,$inout5 vpclmulqdq \$0x01,$Hkey,$Ii,$T2 lea ($in0,%r12),$in0 vaesenc $rndkey,$inout0,$inout0 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi] vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey vmovdqu 0x40+8(%rsp),$Ii # I[3] vaesenc $rndkey,$inout1,$inout1 movbe 0x58($in0),%r13 vaesenc $rndkey,$inout2,$inout2 movbe 0x50($in0),%r12 vaesenc $rndkey,$inout3,$inout3 mov %r13,0x20+8(%rsp) vaesenc $rndkey,$inout4,$inout4 mov %r12,0x28+8(%rsp) vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3 vaesenc $rndkey,$inout5,$inout5 vmovups 0x30-0x80($key),$rndkey vpxor $T1,$Z2,$Z2 vpclmulqdq \$0x00,$Z1,$Ii,$T1 vaesenc $rndkey,$inout0,$inout0 vpxor $T2,$Z2,$Z2 vpclmulqdq \$0x10,$Z1,$Ii,$T2 vaesenc $rndkey,$inout1,$inout1 vpxor $Hkey,$Z3,$Z3 vpclmulqdq \$0x01,$Z1,$Ii,$Hkey vaesenc $rndkey,$inout2,$inout2 vpclmulqdq \$0x11,$Z1,$Ii,$Z1 vmovdqu 0x50+8(%rsp),$Ii # I[2] vaesenc $rndkey,$inout3,$inout3 vaesenc $rndkey,$inout4,$inout4 vpxor $T1,$Z0,$Z0 vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4 vaesenc $rndkey,$inout5,$inout5 vmovups 0x40-0x80($key),$rndkey vpxor $T2,$Z2,$Z2 vpclmulqdq \$0x00,$T1,$Ii,$T2 vaesenc $rndkey,$inout0,$inout0 vpxor $Hkey,$Z2,$Z2 vpclmulqdq \$0x10,$T1,$Ii,$Hkey vaesenc $rndkey,$inout1,$inout1 movbe 0x48($in0),%r13 vpxor $Z1,$Z3,$Z3 vpclmulqdq \$0x01,$T1,$Ii,$Z1 vaesenc $rndkey,$inout2,$inout2 movbe 0x40($in0),%r12 vpclmulqdq \$0x11,$T1,$Ii,$T1 vmovdqu 0x60+8(%rsp),$Ii # I[1] vaesenc $rndkey,$inout3,$inout3 mov %r13,0x30+8(%rsp) vaesenc $rndkey,$inout4,$inout4 mov %r12,0x38+8(%rsp) vpxor $T2,$Z0,$Z0 vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5 vaesenc $rndkey,$inout5,$inout5 vmovups 0x50-0x80($key),$rndkey vpxor $Hkey,$Z2,$Z2 vpclmulqdq \$0x00,$T2,$Ii,$Hkey vaesenc $rndkey,$inout0,$inout0 vpxor $Z1,$Z2,$Z2 vpclmulqdq \$0x10,$T2,$Ii,$Z1 vaesenc $rndkey,$inout1,$inout1 movbe 0x38($in0),%r13 vpxor $T1,$Z3,$Z3 vpclmulqdq \$0x01,$T2,$Ii,$T1 vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0] vaesenc $rndkey,$inout2,$inout2 movbe 0x30($in0),%r12 vpclmulqdq \$0x11,$T2,$Ii,$T2 vaesenc $rndkey,$inout3,$inout3 mov %r13,0x40+8(%rsp) vaesenc $rndkey,$inout4,$inout4 mov %r12,0x48+8(%rsp) vpxor $Hkey,$Z0,$Z0 vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6 vaesenc $rndkey,$inout5,$inout5 vmovups 0x60-0x80($key),$rndkey vpxor $Z1,$Z2,$Z2 vpclmulqdq \$0x10,$Hkey,$Xi,$Z1 vaesenc $rndkey,$inout0,$inout0 vpxor $T1,$Z2,$Z2 vpclmulqdq \$0x01,$Hkey,$Xi,$T1 vaesenc $rndkey,$inout1,$inout1 movbe 0x28($in0),%r13 vpxor $T2,$Z3,$Z3 vpclmulqdq \$0x00,$Hkey,$Xi,$T2 vaesenc $rndkey,$inout2,$inout2 movbe 0x20($in0),%r12 vpclmulqdq \$0x11,$Hkey,$Xi,$Xi vaesenc $rndkey,$inout3,$inout3 mov 
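# An analogy sketch of the GHASH restructuring visible above (ordinary modular
# arithmetic standing in for the GF(2^128) multiply done by vpclmulqdq): the
# loop does not apply Horner's rule one block at a time, it folds six blocks at
# once against the precomputed powers $Hkey^1 .. $Hkey^6 loaded from ($Xip),
# which is what allows six independent carry-less multiplies to be interleaved
# with the AES rounds.  Constants and the stand-in modulus are illustrative.
use strict; use warnings;
use Math::BigInt;

my $q  = (Math::BigInt->new(1) << 127) - 1;                 # stand-in modulus
my @C  = map { Math::BigInt->new(100 + $_) } 0 .. 5;        # six "ciphertext blocks"
my $H  = Math::BigInt->new("0x1234567890abcdef");
my $Xi = Math::BigInt->new(42);

# Horner form: Xi = (Xi + C[i]) * H, applied block by block
my $horner = $Xi->copy();
$horner = (($horner + $_) * $H) % $q for @C;

# aggregated form: one pass over precomputed powers H^1 .. H^6
my @Hpow = map { $H->copy()->bmodpow($_, $q) } 1 .. 6;
my $agg  = (($Xi + $C[0]) * $Hpow[5]) % $q;                 # (Xi + C0) * H^6
$agg     = ($agg + $C[$_] * $Hpow[5 - $_]) % $q for 1 .. 5; # + C1*H^5 + ... + C5*H^1
die "aggregation identity broken" unless $horner == $agg;
print "aggregated folding matches the block-by-block Horner form\n";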
%r13,0x50+8(%rsp) vaesenc $rndkey,$inout4,$inout4 mov %r12,0x58+8(%rsp) vpxor $Z1,$Z2,$Z2 vaesenc $rndkey,$inout5,$inout5 vpxor $T1,$Z2,$Z2 vmovups 0x70-0x80($key),$rndkey vpslldq \$8,$Z2,$Z1 vpxor $T2,$Z0,$Z0 vmovdqu 0x10($const),$Hkey # .Lpoly vaesenc $rndkey,$inout0,$inout0 vpxor $Xi,$Z3,$Z3 vaesenc $rndkey,$inout1,$inout1 vpxor $Z1,$Z0,$Z0 movbe 0x18($in0),%r13 vaesenc $rndkey,$inout2,$inout2 movbe 0x10($in0),%r12 vpalignr \$8,$Z0,$Z0,$Ii # 1st phase vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 mov %r13,0x60+8(%rsp) vaesenc $rndkey,$inout3,$inout3 mov %r12,0x68+8(%rsp) vaesenc $rndkey,$inout4,$inout4 vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey vaesenc $rndkey,$inout5,$inout5 vaesenc $T1,$inout0,$inout0 vmovups 0x90-0x80($key),$rndkey vaesenc $T1,$inout1,$inout1 vpsrldq \$8,$Z2,$Z2 vaesenc $T1,$inout2,$inout2 vpxor $Z2,$Z3,$Z3 vaesenc $T1,$inout3,$inout3 vpxor $Ii,$Z0,$Z0 movbe 0x08($in0),%r13 vaesenc $T1,$inout4,$inout4 movbe 0x00($in0),%r12 vaesenc $T1,$inout5,$inout5 vmovups 0xa0-0x80($key),$T1 cmp \$11,$rounds jb .Lenc_tail # 128-bit key vaesenc $rndkey,$inout0,$inout0 vaesenc $rndkey,$inout1,$inout1 vaesenc $rndkey,$inout2,$inout2 vaesenc $rndkey,$inout3,$inout3 vaesenc $rndkey,$inout4,$inout4 vaesenc $rndkey,$inout5,$inout5 vaesenc $T1,$inout0,$inout0 vaesenc $T1,$inout1,$inout1 vaesenc $T1,$inout2,$inout2 vaesenc $T1,$inout3,$inout3 vaesenc $T1,$inout4,$inout4 vmovups 0xb0-0x80($key),$rndkey vaesenc $T1,$inout5,$inout5 vmovups 0xc0-0x80($key),$T1 je .Lenc_tail # 192-bit key vaesenc $rndkey,$inout0,$inout0 vaesenc $rndkey,$inout1,$inout1 vaesenc $rndkey,$inout2,$inout2 vaesenc $rndkey,$inout3,$inout3 vaesenc $rndkey,$inout4,$inout4 vaesenc $rndkey,$inout5,$inout5 vaesenc $T1,$inout0,$inout0 vaesenc $T1,$inout1,$inout1 vaesenc $T1,$inout2,$inout2 vaesenc $T1,$inout3,$inout3 vaesenc $T1,$inout4,$inout4 vmovups 0xd0-0x80($key),$rndkey vaesenc $T1,$inout5,$inout5 vmovups 0xe0-0x80($key),$T1 jmp .Lenc_tail # 256-bit key .align 32 .Lhandle_ctr32: vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask vpshufb $Ii,$T1,$Z2 # byte-swap counter vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb vpaddd $Z1,$Z2,$inout2 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 vpaddd $Z1,$inout1,$inout3 vpshufb $Ii,$inout1,$inout1 vpaddd $Z1,$inout2,$inout4 vpshufb $Ii,$inout2,$inout2 vpxor $rndkey,$inout1,$inout1 vpaddd $Z1,$inout3,$inout5 vpshufb $Ii,$inout3,$inout3 vpxor $rndkey,$inout2,$inout2 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value vpshufb $Ii,$inout4,$inout4 vpshufb $Ii,$inout5,$inout5 vpshufb $Ii,$T1,$T1 # next counter value jmp .Lresume_ctr32 .align 32 .Lenc_tail: vaesenc $rndkey,$inout0,$inout0 vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase vaesenc $rndkey,$inout1,$inout1 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0 vpxor 0x00($inp),$T1,$T2 vaesenc $rndkey,$inout2,$inout2 vpxor 0x10($inp),$T1,$Ii vaesenc $rndkey,$inout3,$inout3 vpxor 0x20($inp),$T1,$Z1 vaesenc $rndkey,$inout4,$inout4 vpxor 0x30($inp),$T1,$Z2 vaesenc $rndkey,$inout5,$inout5 vpxor 0x40($inp),$T1,$Z3 vpxor 0x50($inp),$T1,$Hkey vmovdqu ($ivp),$T1 # load next counter value vaesenclast $T2,$inout0,$inout0 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb vaesenclast $Ii,$inout1,$inout1 vpaddb $T2,$T1,$Ii mov %r13,0x70+8(%rsp) lea 0x60($inp),$inp vaesenclast $Z1,$inout2,$inout2 vpaddb $T2,$Ii,$Z1 mov %r12,0x78+8(%rsp) lea 0x60($out),$out vmovdqu 0x00-0x80($key),$rndkey vaesenclast $Z2,$inout3,$inout3 vpaddb $T2,$Z1,$Z2 vaesenclast $Z3, $inout4,$inout4 vpaddb 
$T2,$Z2,$Z3 vaesenclast $Hkey,$inout5,$inout5 vpaddb $T2,$Z3,$Hkey add \$0x60,$ret sub \$0x6,$len jc .L6x_done vmovups $inout0,-0x60($out) # save output vpxor $rndkey,$T1,$inout0 vmovups $inout1,-0x50($out) vmovdqa $Ii,$inout1 # 0 latency vmovups $inout2,-0x40($out) vmovdqa $Z1,$inout2 # 0 latency vmovups $inout3,-0x30($out) vmovdqa $Z2,$inout3 # 0 latency vmovups $inout4,-0x20($out) vmovdqa $Z3,$inout4 # 0 latency vmovups $inout5,-0x10($out) vmovdqa $Hkey,$inout5 # 0 latency vmovdqu 0x20+8(%rsp),$Z3 # I[5] jmp .Loop6x .L6x_done: vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled vpxor $Z0,$Xi,$Xi # modulo-scheduled ret .cfi_endproc .size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x ___ ###################################################################### # # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len, # const AES_KEY *key, unsigned char iv[16], # struct { u128 Xi,H,Htbl[9]; } *Xip); $code.=<<___; .globl aesni_gcm_decrypt .type aesni_gcm_decrypt,\@function,6 .align 32 aesni_gcm_decrypt: .cfi_startproc xor $ret,$ret cmp \$0x60,$len # minimal accepted length jb .Lgcm_dec_abort lea (%rsp),%rax # save stack pointer .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,-0xd8(%rax) movaps %xmm7,-0xc8(%rax) movaps %xmm8,-0xb8(%rax) movaps %xmm9,-0xa8(%rax) movaps %xmm10,-0x98(%rax) movaps %xmm11,-0x88(%rax) movaps %xmm12,-0x78(%rax) movaps %xmm13,-0x68(%rax) movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) .Lgcm_dec_body: ___ $code.=<<___; vzeroupper vmovdqu ($ivp),$T1 # input counter value add \$-128,%rsp mov 12($ivp),$counter lea .Lbswap_mask(%rip),$const lea -0x80($key),$in0 # borrow $in0 mov \$0xf80,$end0 # borrow $end0 vmovdqu ($Xip),$Xi # load Xi and \$-128,%rsp # ensure stack alignment vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask lea 0x80($key),$key # size optimization lea 0x20+0x20($Xip),$Xip # size optimization mov 0xf0-0x80($key),$rounds vpshufb $Ii,$Xi,$Xi and $end0,$in0 and %rsp,$end0 sub $in0,$end0 jc .Ldec_no_key_aliasing cmp \$768,$end0 jnc .Ldec_no_key_aliasing sub $end0,%rsp # avoid aliasing with key .Ldec_no_key_aliasing: vmovdqu 0x50($inp),$Z3 # I[5] lea ($inp),$in0 vmovdqu 0x40($inp),$Z0 lea -0xc0($inp,$len),$end0 vmovdqu 0x30($inp),$Z1 shr \$4,$len xor $ret,$ret vmovdqu 0x20($inp),$Z2 vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x vmovdqu 0x10($inp),$T2 vpshufb $Ii,$Z0,$Z0 vmovdqu ($inp),$Hkey vpshufb $Ii,$Z1,$Z1 vmovdqu $Z0,0x30(%rsp) vpshufb $Ii,$Z2,$Z2 vmovdqu $Z1,0x40(%rsp) vpshufb $Ii,$T2,$T2 vmovdqu $Z2,0x50(%rsp) vpshufb $Ii,$Hkey,$Hkey vmovdqu $T2,0x60(%rsp) vmovdqu $Hkey,0x70(%rsp) call _aesni_ctr32_ghash_6x vmovups $inout0,-0x60($out) # save output vmovups $inout1,-0x50($out) vmovups $inout2,-0x40($out) vmovups $inout3,-0x30($out) vmovups $inout4,-0x20($out) vmovups $inout5,-0x10($out) vpshufb ($const),$Xi,$Xi # .Lbswap_mask vmovdqu $Xi,-0x40($Xip) # output Xi vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore 
%rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp # restore %rsp .cfi_def_cfa_register %rsp .Lgcm_dec_abort: mov $ret,%rax # return value ret .cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt ___ $code.=<<___; .type _aesni_ctr32_6x,\@abi-omnipotent .align 32 _aesni_ctr32_6x: .cfi_startproc vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb lea -1($rounds),%r13 vmovups 0x10-0x80($key),$rndkey lea 0x20-0x80($key),%r12 vpxor $Z0,$T1,$inout0 add \$`6<<24`,$counter jc .Lhandle_ctr32_2 vpaddb $T2,$T1,$inout1 vpaddb $T2,$inout1,$inout2 vpxor $Z0,$inout1,$inout1 vpaddb $T2,$inout2,$inout3 vpxor $Z0,$inout2,$inout2 vpaddb $T2,$inout3,$inout4 vpxor $Z0,$inout3,$inout3 vpaddb $T2,$inout4,$inout5 vpxor $Z0,$inout4,$inout4 vpaddb $T2,$inout5,$T1 vpxor $Z0,$inout5,$inout5 jmp .Loop_ctr32 .align 16 .Loop_ctr32: vaesenc $rndkey,$inout0,$inout0 vaesenc $rndkey,$inout1,$inout1 vaesenc $rndkey,$inout2,$inout2 vaesenc $rndkey,$inout3,$inout3 vaesenc $rndkey,$inout4,$inout4 vaesenc $rndkey,$inout5,$inout5 vmovups (%r12),$rndkey lea 0x10(%r12),%r12 dec %r13d jnz .Loop_ctr32 vmovdqu (%r12),$Hkey # last round key vaesenc $rndkey,$inout0,$inout0 vpxor 0x00($inp),$Hkey,$Z0 vaesenc $rndkey,$inout1,$inout1 vpxor 0x10($inp),$Hkey,$Z1 vaesenc $rndkey,$inout2,$inout2 vpxor 0x20($inp),$Hkey,$Z2 vaesenc $rndkey,$inout3,$inout3 vpxor 0x30($inp),$Hkey,$Xi vaesenc $rndkey,$inout4,$inout4 vpxor 0x40($inp),$Hkey,$T2 vaesenc $rndkey,$inout5,$inout5 vpxor 0x50($inp),$Hkey,$Hkey lea 0x60($inp),$inp vaesenclast $Z0,$inout0,$inout0 vaesenclast $Z1,$inout1,$inout1 vaesenclast $Z2,$inout2,$inout2 vaesenclast $Xi,$inout3,$inout3 vaesenclast $T2,$inout4,$inout4 vaesenclast $Hkey,$inout5,$inout5 vmovups $inout0,0x00($out) vmovups $inout1,0x10($out) vmovups $inout2,0x20($out) vmovups $inout3,0x30($out) vmovups $inout4,0x40($out) vmovups $inout5,0x50($out) lea 0x60($out),$out ret .align 32 .Lhandle_ctr32_2: vpshufb $Ii,$T1,$Z2 # byte-swap counter vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb vpaddd $Z1,$Z2,$inout2 vpaddd $Z1,$inout1,$inout3 vpshufb $Ii,$inout1,$inout1 vpaddd $Z1,$inout2,$inout4 vpshufb $Ii,$inout2,$inout2 vpxor $Z0,$inout1,$inout1 vpaddd $Z1,$inout3,$inout5 vpshufb $Ii,$inout3,$inout3 vpxor $Z0,$inout2,$inout2 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value vpshufb $Ii,$inout4,$inout4 vpxor $Z0,$inout3,$inout3 vpshufb $Ii,$inout5,$inout5 vpxor $Z0,$inout4,$inout4 vpshufb $Ii,$T1,$T1 # next counter value vpxor $Z0,$inout5,$inout5 jmp .Loop_ctr32 .cfi_endproc .size _aesni_ctr32_6x,.-_aesni_ctr32_6x .globl aesni_gcm_encrypt .type aesni_gcm_encrypt,\@function,6 .align 32 aesni_gcm_encrypt: .cfi_startproc xor $ret,$ret cmp \$0x60*3,$len # minimal accepted length jb .Lgcm_enc_abort lea (%rsp),%rax # save stack pointer .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,-0xd8(%rax) movaps %xmm7,-0xc8(%rax) movaps %xmm8,-0xb8(%rax) movaps %xmm9,-0xa8(%rax) movaps %xmm10,-0x98(%rax) movaps %xmm11,-0x88(%rax) movaps %xmm12,-0x78(%rax) movaps %xmm13,-0x68(%rax) movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) .Lgcm_enc_body: ___ $code.=<<___; vzeroupper vmovdqu ($ivp),$T1 # input counter value add \$-128,%rsp mov 12($ivp),$counter lea .Lbswap_mask(%rip),$const lea -0x80($key),$in0 # borrow $in0 mov \$0xf80,$end0 # borrow 
$end0 lea 0x80($key),$key # size optimization vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask and \$-128,%rsp # ensure stack alignment mov 0xf0-0x80($key),$rounds and $end0,$in0 and %rsp,$end0 sub $in0,$end0 jc .Lenc_no_key_aliasing cmp \$768,$end0 jnc .Lenc_no_key_aliasing sub $end0,%rsp # avoid aliasing with key .Lenc_no_key_aliasing: lea ($out),$in0 lea -0xc0($out,$len),$end0 shr \$4,$len call _aesni_ctr32_6x vpshufb $Ii,$inout0,$Xi # save bswapped output on stack vpshufb $Ii,$inout1,$T2 vmovdqu $Xi,0x70(%rsp) vpshufb $Ii,$inout2,$Z0 vmovdqu $T2,0x60(%rsp) vpshufb $Ii,$inout3,$Z1 vmovdqu $Z0,0x50(%rsp) vpshufb $Ii,$inout4,$Z2 vmovdqu $Z1,0x40(%rsp) vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x vmovdqu $Z2,0x30(%rsp) call _aesni_ctr32_6x vmovdqu ($Xip),$Xi # load Xi lea 0x20+0x20($Xip),$Xip # size optimization sub \$12,$len mov \$0x60*2,$ret vpshufb $Ii,$Xi,$Xi call _aesni_ctr32_ghash_6x vmovdqu 0x20(%rsp),$Z3 # I[5] vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 vpunpckhqdq $Z3,$Z3,$T1 vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK vmovups $inout0,-0x60($out) # save output vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy vpxor $Z3,$T1,$T1 vmovups $inout1,-0x50($out) vpshufb $Ii,$inout1,$inout1 vmovups $inout2,-0x40($out) vpshufb $Ii,$inout2,$inout2 vmovups $inout3,-0x30($out) vpshufb $Ii,$inout3,$inout3 vmovups $inout4,-0x20($out) vpshufb $Ii,$inout4,$inout4 vmovups $inout5,-0x10($out) vpshufb $Ii,$inout5,$inout5 vmovdqu $inout0,0x10(%rsp) # free $inout0 ___ { my ($HK,$T3)=($rndkey,$inout0); $code.=<<___; vmovdqu 0x30(%rsp),$Z2 # I[4] vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2 vpunpckhqdq $Z2,$Z2,$T2 vpclmulqdq \$0x00,$Hkey,$Z3,$Z1 vpxor $Z2,$T2,$T2 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3 vpclmulqdq \$0x00,$HK,$T1,$T1 vmovdqu 0x40(%rsp),$T3 # I[3] vpclmulqdq \$0x00,$Ii,$Z2,$Z0 vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3 vpxor $Z1,$Z0,$Z0 vpunpckhqdq $T3,$T3,$Z1 vpclmulqdq \$0x11,$Ii,$Z2,$Z2 vpxor $T3,$Z1,$Z1 vpxor $Z3,$Z2,$Z2 vpclmulqdq \$0x10,$HK,$T2,$T2 vmovdqu 0x50-0x20($Xip),$HK vpxor $T1,$T2,$T2 vmovdqu 0x50(%rsp),$T1 # I[2] vpclmulqdq \$0x00,$Hkey,$T3,$Z3 vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4 vpxor $Z0,$Z3,$Z3 vpunpckhqdq $T1,$T1,$Z0 vpclmulqdq \$0x11,$Hkey,$T3,$T3 vpxor $T1,$Z0,$Z0 vpxor $Z2,$T3,$T3 vpclmulqdq \$0x00,$HK,$Z1,$Z1 vpxor $T2,$Z1,$Z1 vmovdqu 0x60(%rsp),$T2 # I[1] vpclmulqdq \$0x00,$Ii,$T1,$Z2 vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5 vpxor $Z3,$Z2,$Z2 vpunpckhqdq $T2,$T2,$Z3 vpclmulqdq \$0x11,$Ii,$T1,$T1 vpxor $T2,$Z3,$Z3 vpxor $T3,$T1,$T1 vpclmulqdq \$0x10,$HK,$Z0,$Z0 vmovdqu 0x80-0x20($Xip),$HK vpxor $Z1,$Z0,$Z0 vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0] vpclmulqdq \$0x00,$Hkey,$T2,$Z1 vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6 vpunpckhqdq $Xi,$Xi,$T3 vpxor $Z2,$Z1,$Z1 vpclmulqdq \$0x11,$Hkey,$T2,$T2 vpxor $Xi,$T3,$T3 vpxor $T1,$T2,$T2 vpclmulqdq \$0x00,$HK,$Z3,$Z3 vpxor $Z0,$Z3,$Z0 vpclmulqdq \$0x00,$Ii,$Xi,$Z2 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1 vpunpckhqdq $inout5,$inout5,$T1 vpclmulqdq \$0x11,$Ii,$Xi,$Xi vpxor $inout5,$T1,$T1 vpxor $Z1,$Z2,$Z1 vpclmulqdq \$0x10,$HK,$T3,$T3 vmovdqu 0x20-0x20($Xip),$HK vpxor $T2,$Xi,$Z3 vpxor $Z0,$T3,$Z2 vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2 vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing vpclmulqdq \$0x00,$Hkey,$inout5,$Z0 vpxor $T3,$Z2,$Z2 vpunpckhqdq $inout4,$inout4,$T2 vpclmulqdq \$0x11,$Hkey,$inout5,$inout5 vpxor $inout4,$T2,$T2 vpslldq \$8,$Z2,$T3 vpclmulqdq \$0x00,$HK,$T1,$T1 vpxor $T3,$Z1,$Xi vpsrldq 
\$8,$Z2,$Z2 vpxor $Z2,$Z3,$Z3 vpclmulqdq \$0x00,$Ii,$inout4,$Z1 vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3 vpxor $Z0,$Z1,$Z1 vpunpckhqdq $inout3,$inout3,$T3 vpclmulqdq \$0x11,$Ii,$inout4,$inout4 vpxor $inout3,$T3,$T3 vpxor $inout5,$inout4,$inout4 vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase vpclmulqdq \$0x10,$HK,$T2,$T2 vmovdqu 0x50-0x20($Xip),$HK vpxor $T1,$T2,$T2 vpclmulqdq \$0x00,$Hkey,$inout3,$Z0 vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4 vpxor $Z1,$Z0,$Z0 vpunpckhqdq $inout2,$inout2,$T1 vpclmulqdq \$0x11,$Hkey,$inout3,$inout3 vpxor $inout2,$T1,$T1 vpxor $inout4,$inout3,$inout3 vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0 vpclmulqdq \$0x00,$HK,$T3,$T3 vpxor $T2,$T3,$T3 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi vxorps $inout5,$Xi,$Xi vpclmulqdq \$0x00,$Ii,$inout2,$Z1 vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5 vpxor $Z0,$Z1,$Z1 vpunpckhqdq $inout1,$inout1,$T2 vpclmulqdq \$0x11,$Ii,$inout2,$inout2 vpxor $inout1,$T2,$T2 vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase vpxor $inout3,$inout2,$inout2 vpclmulqdq \$0x10,$HK,$T1,$T1 vmovdqu 0x80-0x20($Xip),$HK vpxor $T3,$T1,$T1 vxorps $Z3,$inout5,$inout5 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi vxorps $inout5,$Xi,$Xi vpclmulqdq \$0x00,$Hkey,$inout1,$Z0 vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6 vpxor $Z1,$Z0,$Z0 vpunpckhqdq $Xi,$Xi,$T3 vpclmulqdq \$0x11,$Hkey,$inout1,$inout1 vpxor $Xi,$T3,$T3 vpxor $inout2,$inout1,$inout1 vpclmulqdq \$0x00,$HK,$T2,$T2 vpxor $T1,$T2,$T2 vpclmulqdq \$0x00,$Ii,$Xi,$Z1 vpclmulqdq \$0x11,$Ii,$Xi,$Z3 vpxor $Z0,$Z1,$Z1 vpclmulqdq \$0x10,$HK,$T3,$Z2 vpxor $inout1,$Z3,$Z3 vpxor $T2,$Z2,$Z2 vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing vpxor $Z0,$Z2,$Z2 vpslldq \$8,$Z2,$T1 vmovdqu 0x10($const),$Hkey # .Lpoly vpsrldq \$8,$Z2,$Z2 vpxor $T1,$Z1,$Xi vpxor $Z2,$Z3,$Z3 vpalignr \$8,$Xi,$Xi,$T2 # 1st phase vpclmulqdq \$0x10,$Hkey,$Xi,$Xi vpxor $T2,$Xi,$Xi vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase vpclmulqdq \$0x10,$Hkey,$Xi,$Xi vpxor $Z3,$T2,$T2 vpxor $T2,$Xi,$Xi ___ } $code.=<<___; vpshufb ($const),$Xi,$Xi # .Lbswap_mask vmovdqu $Xi,-0x40($Xip) # output Xi vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp # restore %rsp .cfi_def_cfa_register %rsp .Lgcm_enc_abort: mov $ret,%rax # return value ret .cfi_endproc .size aesni_gcm_encrypt,.-aesni_gcm_encrypt ___ $code.=<<___; .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .Lpoly: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .Lone_msb: .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 .Ltwo_lsb: .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .Lone_lsb: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 .asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by " .align 64 ___ if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___ .extern __imp_RtlVirtualUnwind .type gcm_se_handler,\@abi-omnipotent .align 16 gcm_se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 
0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 120($context),%rax # pull context->Rax mov -48(%rax),%r15 mov -40(%rax),%r14 mov -32(%rax),%r13 mov -24(%rax),%r12 mov -16(%rax),%rbp mov -8(%rax),%rbx mov %r15,240($context) mov %r14,232($context) mov %r13,224($context) mov %r12,216($context) mov %rbp,160($context) mov %rbx,144($context) lea -0xd8(%rax),%rsi # %xmm save area lea 512($context),%rdi # & context.Xmm6 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax) .long 0xa548f3fc # cld; rep movsq .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size gcm_se_handler,.-gcm_se_handler .section .pdata .align 4 .rva .LSEH_begin_aesni_gcm_decrypt .rva .LSEH_end_aesni_gcm_decrypt .rva .LSEH_gcm_dec_info .rva .LSEH_begin_aesni_gcm_encrypt .rva .LSEH_end_aesni_gcm_encrypt .rva .LSEH_gcm_enc_info .section .xdata .align 8 .LSEH_gcm_dec_info: .byte 9,0,0,0 .rva gcm_se_handler .rva .Lgcm_dec_body,.Lgcm_dec_abort .LSEH_gcm_enc_info: .byte 9,0,0,0 .rva gcm_se_handler .rva .Lgcm_enc_body,.Lgcm_enc_abort ___ } }}} else {{{ $code=<<___; # assembler is too old .text .globl aesni_gcm_encrypt .type aesni_gcm_encrypt,\@abi-omnipotent aesni_gcm_encrypt: .cfi_startproc xor %eax,%eax ret .cfi_endproc .size aesni_gcm_encrypt,.-aesni_gcm_encrypt .globl aesni_gcm_decrypt .type aesni_gcm_decrypt,\@abi-omnipotent aesni_gcm_decrypt: .cfi_startproc xor %eax,%eax ret .cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt ___ }}} $code =~ s/\`([^\`]*)\`/eval($1)/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/modes/asm/ghash-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/modes/asm/ghash-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/modes/asm/ghash-x86_64.pl (revision 364963) @@ -1,1818 +1,1818 @@ #! /usr/bin/env perl # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. 
# ==================================================================== # # March, June 2010 # # The module implements "4-bit" GCM GHASH function and underlying # single multiplication operation in GF(2^128). "4-bit" means that # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH # function features so called "528B" variant utilizing additional # 256+16 bytes of per-key storage [+512 bytes shared table]. # Performance results are for this streamed GHASH subroutine and are # expressed in cycles per processed byte, less is better: # # gcc 3.4.x(*) assembler # # P4 28.6 14.0 +100% # Opteron 19.3 7.7 +150% # Core2 17.8 8.1(**) +120% # Atom 31.6 16.8 +88% # VIA Nano 21.8 10.1 +115% # # (*) comparison is not completely fair, because C results are # for vanilla "256B" implementation, while assembler results # are for "528B";-) # (**) it's mystery [to me] why Core2 result is not same as for # Opteron; # May 2010 # # Add PCLMULQDQ version performing at 2.02 cycles per processed byte. # See ghash-x86.pl for background information and details about coding # techniques. # # Special thanks to David Woodhouse for providing access to a # Westmere-based system on behalf of Intel Open Source Technology Centre. # December 2012 # # Overhaul: aggregate Karatsuba post-processing, improve ILP in # reduction_alg9, increase reduction aggregate factor to 4x. As for # the latter. ghash-x86.pl discusses that it makes lesser sense to # increase aggregate factor. Then why increase here? Critical path # consists of 3 independent pclmulqdq instructions, Karatsuba post- # processing and reduction. "On top" of this we lay down aggregated # multiplication operations, triplets of independent pclmulqdq's. As # issue rate for pclmulqdq is limited, it makes lesser sense to # aggregate more multiplications than it takes to perform remaining # non-multiplication operations. 2x is near-optimal coefficient for # contemporary Intel CPUs (therefore modest improvement coefficient), # but not for Bulldozer. Latter is because logical SIMD operations # are twice as slow in comparison to Intel, so that critical path is # longer. A CPU with higher pclmulqdq issue rate would also benefit # from higher aggregate factor... # # Westmere 1.78(+13%) # Sandy Bridge 1.80(+8%) # Ivy Bridge 1.80(+7%) # Haswell 0.55(+93%) (if system doesn't support AVX) # Broadwell 0.45(+110%)(if system doesn't support AVX) # Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) # Knights L 2.12(-) (if system doesn't support AVX) # Goldmont 1.08(+24%) # March 2013 # # ... 8x aggregate factor AVX code path is using reduction algorithm # suggested by Shay Gueron[1]. Even though contemporary AVX-capable # CPUs such as Sandy and Ivy Bridge can execute it, the code performs # sub-optimally in comparison to above mentioned version. But thanks # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that # it performs in 0.41 cycles per byte on Haswell processor, in # 0.29 on Broadwell, and in 0.36 on Skylake. # # Knights Landing achieves 1.09 cpb. 
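#
# For reference, every code path below computes the same operation: a
# carry-less multiplication in GF(2^128) followed by reduction modulo
# x^128+x^7+x^2+x+1. The plain-Perl sketch below is illustration only --
# it ignores GHASH's bit-reflected element representation, and the helper
# names (clmul, gf128_reduce) are local to this sketch and not used by the
# generated code. The assembler splits each 128x128 carry-less multiply
# into three 64x64 pclmulqdq's via Karatsuba and, as discussed above,
# aggregates several such products before paying for a single reduction.

use Math::BigInt;

sub clmul {				# carry-less (GF(2)) multiplication
    my ($a,$b) = map { Math::BigInt->new($_) } @_;
    my $r = Math::BigInt->bzero();
    for (my $i=0; $i<128; $i++) {	# operands are at most 128 bits
	$r->bxor($b->copy()->blsft($i))
	    if ($a->copy()->brsft($i)->band(Math::BigInt->new(1))->is_one());
    }
    return $r;
}

sub gf128_reduce {			# reduce <=256-bit product mod x^128+x^7+x^2+x+1
    my $z = shift;
    my $poly = Math::BigInt->new(1)->blsft(128)->bxor(Math::BigInt->new(0x87));
    for (my $i=255; $i>=128; $i--) {
	$z->bxor($poly->copy()->blsft($i-128))
	    if ($z->copy()->brsft($i)->band(Math::BigInt->new(1))->is_one());
    }
    return $z;
}

# One GHASH iteration is then, up to bit order,
# $Xi = gf128_reduce(clmul($Xi->bxor($block), $H));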
# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.20) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $do4xaggr=1; # common register layout $nlo="%rax"; $nhi="%rbx"; $Zlo="%r8"; $Zhi="%r9"; $tmp="%r10"; $rem_4bit = "%r11"; $Xi="%rdi"; $Htbl="%rsi"; # per-function register layout $cnt="%rcx"; $rem="%rdx"; sub LB() { my $r=shift; $r =~ s/%[er]([a-d])x/%\1l/ or $r =~ s/%[er]([sd]i)/%\1l/ or $r =~ s/%[er](bp)/%\1l/ or $r =~ s/%(r[0-9]+)[d]?/%\1b/; $r; } sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop; $arg = "\$$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } { my $N; sub loop() { my $inp = shift; $N++; $code.=<<___; xor $nlo,$nlo xor $nhi,$nhi mov `&LB("$Zlo")`,`&LB("$nlo")` mov `&LB("$Zlo")`,`&LB("$nhi")` shl \$4,`&LB("$nlo")` mov \$14,$cnt mov 8($Htbl,$nlo),$Zlo mov ($Htbl,$nlo),$Zhi and \$0xf0,`&LB("$nhi")` mov $Zlo,$rem jmp .Loop$N .align 16 .Loop$N: shr \$4,$Zlo and \$0xf,$rem mov $Zhi,$tmp mov ($inp,$cnt),`&LB("$nlo")` shr \$4,$Zhi xor 8($Htbl,$nhi),$Zlo shl \$60,$tmp xor ($Htbl,$nhi),$Zhi mov `&LB("$nlo")`,`&LB("$nhi")` xor ($rem_4bit,$rem,8),$Zhi mov $Zlo,$rem shl \$4,`&LB("$nlo")` xor $tmp,$Zlo dec $cnt js .Lbreak$N shr \$4,$Zlo and \$0xf,$rem mov $Zhi,$tmp shr \$4,$Zhi xor 8($Htbl,$nlo),$Zlo shl \$60,$tmp xor ($Htbl,$nlo),$Zhi and \$0xf0,`&LB("$nhi")` xor ($rem_4bit,$rem,8),$Zhi mov $Zlo,$rem xor $tmp,$Zlo jmp .Loop$N .align 16 .Lbreak$N: shr \$4,$Zlo and \$0xf,$rem mov $Zhi,$tmp shr \$4,$Zhi xor 8($Htbl,$nlo),$Zlo shl \$60,$tmp xor ($Htbl,$nlo),$Zhi and \$0xf0,`&LB("$nhi")` xor ($rem_4bit,$rem,8),$Zhi mov $Zlo,$rem xor $tmp,$Zlo shr \$4,$Zlo and \$0xf,$rem mov $Zhi,$tmp shr \$4,$Zhi xor 8($Htbl,$nhi),$Zlo shl \$60,$tmp xor ($Htbl,$nhi),$Zhi xor $tmp,$Zlo xor ($rem_4bit,$rem,8),$Zhi bswap $Zlo bswap $Zhi ___ }} $code=<<___; .text .extern OPENSSL_ia32cap_P .globl gcm_gmult_4bit .type gcm_gmult_4bit,\@function,2 .align 16 gcm_gmult_4bit: .cfi_startproc push %rbx .cfi_push %rbx push %rbp # %rbp and others are pushed exclusively in .cfi_push %rbp push %r12 # order to reuse Win64 exception handler... 
.cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$280,%rsp .cfi_adjust_cfa_offset 280 .Lgmult_prologue: movzb 15($Xi),$Zlo lea .Lrem_4bit(%rip),$rem_4bit ___ &loop ($Xi); $code.=<<___; mov $Zlo,8($Xi) mov $Zhi,($Xi) lea 280+48(%rsp),%rsi .cfi_def_cfa %rsi,8 mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lgmult_epilogue: ret .cfi_endproc .size gcm_gmult_4bit,.-gcm_gmult_4bit ___ # per-function register layout $inp="%rdx"; $len="%rcx"; $rem_8bit=$rem_4bit; $code.=<<___; .globl gcm_ghash_4bit .type gcm_ghash_4bit,\@function,4 .align 16 gcm_ghash_4bit: .cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$280,%rsp .cfi_adjust_cfa_offset 280 .Lghash_prologue: mov $inp,%r14 # reassign couple of args mov $len,%r15 ___ { my $inp="%r14"; my $dat="%edx"; my $len="%r15"; my @nhi=("%ebx","%ecx"); my @rem=("%r12","%r13"); my $Hshr4="%rbp"; &sub ($Htbl,-128); # size optimization &lea ($Hshr4,"16+128(%rsp)"); { my @lo =($nlo,$nhi); my @hi =($Zlo,$Zhi); &xor ($dat,$dat); for ($i=0,$j=-2;$i<18;$i++,$j++) { &mov ("$j(%rsp)",&LB($dat)) if ($i>1); &or ($lo[0],$tmp) if ($i>1); &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17); &shr ($lo[1],4) if ($i>0 && $i<17); &mov ($tmp,$hi[1]) if ($i>0 && $i<17); &shr ($hi[1],4) if ($i>0 && $i<17); &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1); &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16); &shl (&LB($dat),4) if ($i>0 && $i<17); &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1); &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16); &shl ($tmp,60) if ($i>0 && $i<17); push (@lo,shift(@lo)); push (@hi,shift(@hi)); } } &add ($Htbl,-128); &mov ($Zlo,"8($Xi)"); &mov ($Zhi,"0($Xi)"); &add ($len,$inp); # pointer to the end of data &lea ($rem_8bit,".Lrem_8bit(%rip)"); &jmp (".Louter_loop"); $code.=".align 16\n.Louter_loop:\n"; &xor ($Zhi,"($inp)"); &mov ("%rdx","8($inp)"); &lea ($inp,"16($inp)"); &xor ("%rdx",$Zlo); &mov ("($Xi)",$Zhi); &mov ("8($Xi)","%rdx"); &shr ("%rdx",32); &xor ($nlo,$nlo); &rol ($dat,8); &mov (&LB($nlo),&LB($dat)); &movz ($nhi[0],&LB($dat)); &shl (&LB($nlo),4); &shr ($nhi[0],4); for ($j=11,$i=0;$i<15;$i++) { &rol ($dat,8); &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0); &xor ($Zhi,"($Htbl,$nlo)") if ($i>0); &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0); &mov ($Zhi,"($Htbl,$nlo)") if ($i==0); &mov (&LB($nlo),&LB($dat)); &xor ($Zlo,$tmp) if ($i>0); &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0); &movz ($nhi[1],&LB($dat)); &shl (&LB($nlo),4); &movzb ($rem[0],"(%rsp,$nhi[0])"); &shr ($nhi[1],4) if ($i<14); &and ($nhi[1],0xf0) if ($i==14); &shl ($rem[1],48) if ($i>0); &xor ($rem[0],$Zlo); &mov ($tmp,$Zhi); &xor ($Zhi,$rem[1]) if ($i>0); &shr ($Zlo,8); &movz ($rem[0],&LB($rem[0])); &mov ($dat,"$j($Xi)") if (--$j%4==0); &shr ($Zhi,8); &xor ($Zlo,"-128($Hshr4,$nhi[0],8)"); &shl ($tmp,56); &xor ($Zhi,"($Hshr4,$nhi[0],8)"); unshift (@nhi,pop(@nhi)); # "rotate" registers unshift (@rem,pop(@rem)); } &movzw ($rem[1],"($rem_8bit,$rem[1],2)"); &xor ($Zlo,"8($Htbl,$nlo)"); &xor ($Zhi,"($Htbl,$nlo)"); &shl ($rem[1],48); &xor ($Zlo,$tmp); &xor ($Zhi,$rem[1]); &movz ($rem[0],&LB($Zlo)); &shr ($Zlo,4); &mov ($tmp,$Zhi); &shl (&LB($rem[0]),4); &shr ($Zhi,4); &xor ($Zlo,"8($Htbl,$nhi[0])"); &movzw ($rem[0],"($rem_8bit,$rem[0],2)"); &shl ($tmp,60); &xor ($Zhi,"($Htbl,$nhi[0])"); &xor ($Zlo,$tmp); &shl ($rem[0],48); &bswap ($Zlo); &xor ($Zhi,$rem[0]); &bswap ($Zhi); &cmp ($inp,$len); &jb (".Louter_loop"); } $code.=<<___; mov 
$Zlo,8($Xi) mov $Zhi,($Xi) lea 280+48(%rsp),%rsi .cfi_def_cfa %rsi,8 mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea 0(%rsi),%rsp .cfi_def_cfa_register %rsp .Lghash_epilogue: ret .cfi_endproc .size gcm_ghash_4bit,.-gcm_ghash_4bit ___ ###################################################################### # PCLMULQDQ version. @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order ("%rdi","%rsi","%rdx","%rcx"); # Unix order ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2"; ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5"); sub clmul64x64_T2 { # minimal register pressure my ($Xhi,$Xi,$Hkey,$HK)=@_; if (!defined($HK)) { $HK = $T2; $code.=<<___; movdqa $Xi,$Xhi # pshufd \$0b01001110,$Xi,$T1 pshufd \$0b01001110,$Hkey,$T2 pxor $Xi,$T1 # pxor $Hkey,$T2 ___ } else { $code.=<<___; movdqa $Xi,$Xhi # pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 # ___ } $code.=<<___; pclmulqdq \$0x00,$Hkey,$Xi ####### pclmulqdq \$0x11,$Hkey,$Xhi ####### pclmulqdq \$0x00,$HK,$T1 ####### pxor $Xi,$T1 # pxor $Xhi,$T1 # movdqa $T1,$T2 # psrldq \$8,$T1 pslldq \$8,$T2 # pxor $T1,$Xhi pxor $T2,$Xi # ___ } sub reduction_alg9 { # 17/11 times faster than Intel version my ($Xhi,$Xi) = @_; $code.=<<___; # 1st phase movdqa $Xi,$T2 # movdqa $Xi,$T1 psllq \$5,$Xi pxor $Xi,$T1 # psllq \$1,$Xi pxor $T1,$Xi # psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # # 2nd phase movdqa $Xi,$T2 psrlq \$1,$Xi pxor $T2,$Xhi # pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # psrlq \$1,$Xi # pxor $Xhi,$Xi # ___ } { my ($Htbl,$Xip)=@_4args; my $HK="%xmm6"; $code.=<<___; .globl gcm_init_clmul .type gcm_init_clmul,\@abi-omnipotent .align 16 gcm_init_clmul: .cfi_startproc .L_init_clmul: ___ $code.=<<___ if ($win64); .LSEH_begin_gcm_init_clmul: # I can't trust assembler to use specific encoding:-( .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) ___ $code.=<<___; movdqu ($Xip),$Hkey pshufd \$0b01001110,$Hkey,$Hkey # dword swap # <<1 twist pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword movdqa $Hkey,$T1 psllq \$1,$Hkey pxor $T3,$T3 # psrlq \$63,$T1 pcmpgtd $T2,$T3 # broadcast carry bit pslldq \$8,$T1 por $T1,$Hkey # H<<=1 # magic reduction pand .L0x1c2_polynomial(%rip),$T3 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial # calculate H^2 pshufd \$0b01001110,$Hkey,$HK movdqa $Hkey,$Xi pxor $Hkey,$HK ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); &reduction_alg9 ($Xhi,$Xi); $code.=<<___; pshufd \$0b01001110,$Hkey,$T1 pshufd \$0b01001110,$Xi,$T2 pxor $Hkey,$T1 # Karatsuba pre-processing movdqu $Hkey,0x00($Htbl) # save H pxor $Xi,$T2 # Karatsuba pre-processing movdqu $Xi,0x10($Htbl) # save H^2 palignr \$8,$T1,$T2 # low part is H.lo^H.hi... movdqu $T2,0x20($Htbl) # save Karatsuba "salt" ___ if ($do4xaggr) { &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3 &reduction_alg9 ($Xhi,$Xi); $code.=<<___; movdqa $Xi,$T3 ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4 &reduction_alg9 ($Xhi,$Xi); $code.=<<___; pshufd \$0b01001110,$T3,$T1 pshufd \$0b01001110,$Xi,$T2 pxor $T3,$T1 # Karatsuba pre-processing movdqu $T3,0x30($Htbl) # save H^3 pxor $Xi,$T2 # Karatsuba pre-processing movdqu $Xi,0x40($Htbl) # save H^4 palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi... 
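	# $T2 now carries H^3.lo^H^3.hi in its low half and H^4.lo^H^4.hi in
	# its high half; gcm_ghash_clmul reads this "salt" back and feeds it
	# straight to the middle Karatsuba pclmulqdq (immediate 0x00 or 0x10
	# selecting the half), so the lo^hi halves of the key powers never
	# have to be recomputed at run time.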
movdqu $T2,0x50($Htbl) # save Karatsuba "salt" ___ } $code.=<<___ if ($win64); movaps (%rsp),%xmm6 lea 0x18(%rsp),%rsp .LSEH_end_gcm_init_clmul: ___ $code.=<<___; ret .cfi_endproc .size gcm_init_clmul,.-gcm_init_clmul ___ } { my ($Xip,$Htbl)=@_4args; $code.=<<___; .globl gcm_gmult_clmul .type gcm_gmult_clmul,\@abi-omnipotent .align 16 gcm_gmult_clmul: .cfi_startproc .L_gmult_clmul: movdqu ($Xip),$Xi movdqa .Lbswap_mask(%rip),$T3 movdqu ($Htbl),$Hkey movdqu 0x20($Htbl),$T2 pshufb $T3,$Xi ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2); $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0)); # experimental alternative. special thing about is that there # no dependency between the two multiplications... mov \$`0xE1<<1`,%eax mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff mov \$0x07,%r11d movq %rax,$T1 movq %r10,$T2 movq %r11,$T3 # borrow $T3 pand $Xi,$T3 pshufb $T3,$T2 # ($Xi&7)·0xE0 movq %rax,$T3 pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1) pxor $Xi,$T2 pslldq \$15,$T2 paddd $T2,$T2 # <<(64+56+1) pxor $T2,$Xi pclmulqdq \$0x01,$T3,$Xi movdqa .Lbswap_mask(%rip),$T3 # reload $T3 psrldq \$1,$T1 pxor $T1,$Xhi pslldq \$7,$Xi pxor $Xhi,$Xi ___ $code.=<<___; pshufb $T3,$Xi movdqu $Xi,($Xip) ret .cfi_endproc .size gcm_gmult_clmul,.-gcm_gmult_clmul ___ } { my ($Xip,$Htbl,$inp,$len)=@_4args; my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7)); my ($T1,$T2,$T3)=map("%xmm$_",(8..10)); $code.=<<___; .globl gcm_ghash_clmul .type gcm_ghash_clmul,\@abi-omnipotent .align 32 gcm_ghash_clmul: .cfi_startproc .L_ghash_clmul: ___ $code.=<<___ if ($win64); lea -0x88(%rsp),%rax .LSEH_begin_gcm_ghash_clmul: # I can't trust assembler to use specific encoding:-( .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) ___ $code.=<<___; movdqa .Lbswap_mask(%rip),$T3 movdqu ($Xip),$Xi movdqu ($Htbl),$Hkey movdqu 0x20($Htbl),$HK pshufb $T3,$Xi sub \$0x10,$len jz .Lodd_tail movdqu 0x10($Htbl),$Hkey2 ___ if ($do4xaggr) { my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15)); $code.=<<___; mov OPENSSL_ia32cap_P+4(%rip),%eax cmp \$0x30,$len jb .Lskip4x and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE cmp \$`1<<22`,%eax # check for MOVBE without XSAVE je .Lskip4x sub \$0x30,$len mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff movdqu 0x30($Htbl),$Hkey3 movdqu 0x40($Htbl),$Hkey4 ####### # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P # movdqu 0x30($inp),$Xln movdqu 0x20($inp),$Xl pshufb $T3,$Xln pshufb $T3,$Xl movdqa $Xln,$Xhn pshufd \$0b01001110,$Xln,$Xmn pxor $Xln,$Xmn pclmulqdq \$0x00,$Hkey,$Xln pclmulqdq \$0x11,$Hkey,$Xhn pclmulqdq \$0x00,$HK,$Xmn movdqa $Xl,$Xh pshufd \$0b01001110,$Xl,$Xm pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey2,$Xl pclmulqdq \$0x11,$Hkey2,$Xh pclmulqdq \$0x10,$HK,$Xm xorps $Xl,$Xln xorps $Xh,$Xhn movups 0x50($Htbl),$HK xorps $Xm,$Xmn movdqu 0x10($inp),$Xl movdqu 0($inp),$T1 pshufb $T3,$Xl pshufb $T3,$T1 movdqa $Xl,$Xh pshufd \$0b01001110,$Xl,$Xm pxor $T1,$Xi pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey3,$Xl movdqa $Xi,$Xhi pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 pclmulqdq 
\$0x11,$Hkey3,$Xh pclmulqdq \$0x00,$HK,$Xm xorps $Xl,$Xln xorps $Xh,$Xhn lea 0x40($inp),$inp sub \$0x40,$len jc .Ltail4x jmp .Lmod4_loop .align 32 .Lmod4_loop: pclmulqdq \$0x00,$Hkey4,$Xi xorps $Xm,$Xmn movdqu 0x30($inp),$Xl pshufb $T3,$Xl pclmulqdq \$0x11,$Hkey4,$Xhi xorps $Xln,$Xi movdqu 0x20($inp),$Xln movdqa $Xl,$Xh pclmulqdq \$0x10,$HK,$T1 pshufd \$0b01001110,$Xl,$Xm xorps $Xhn,$Xhi pxor $Xl,$Xm pshufb $T3,$Xln movups 0x20($Htbl),$HK xorps $Xmn,$T1 pclmulqdq \$0x00,$Hkey,$Xl pshufd \$0b01001110,$Xln,$Xmn pxor $Xi,$T1 # aggregated Karatsuba post-processing movdqa $Xln,$Xhn pxor $Xhi,$T1 # pxor $Xln,$Xmn movdqa $T1,$T2 # pclmulqdq \$0x11,$Hkey,$Xh pslldq \$8,$T1 psrldq \$8,$T2 # pxor $T1,$Xi movdqa .L7_mask(%rip),$T1 pxor $T2,$Xhi # movq %rax,$T2 pand $Xi,$T1 # 1st phase pshufb $T1,$T2 # pxor $Xi,$T2 # pclmulqdq \$0x00,$HK,$Xm psllq \$57,$T2 # movdqa $T2,$T1 # pslldq \$8,$T2 pclmulqdq \$0x00,$Hkey2,$Xln psrldq \$8,$T1 # pxor $T2,$Xi pxor $T1,$Xhi # movdqu 0($inp),$T1 movdqa $Xi,$T2 # 2nd phase psrlq \$1,$Xi pclmulqdq \$0x11,$Hkey2,$Xhn xorps $Xl,$Xln movdqu 0x10($inp),$Xl pshufb $T3,$Xl pclmulqdq \$0x10,$HK,$Xmn xorps $Xh,$Xhn movups 0x50($Htbl),$HK pshufb $T3,$T1 pxor $T2,$Xhi # pxor $Xi,$T2 psrlq \$5,$Xi movdqa $Xl,$Xh pxor $Xm,$Xmn pshufd \$0b01001110,$Xl,$Xm pxor $T2,$Xi # pxor $T1,$Xhi pxor $Xl,$Xm pclmulqdq \$0x00,$Hkey3,$Xl psrlq \$1,$Xi # pxor $Xhi,$Xi # movdqa $Xi,$Xhi pclmulqdq \$0x11,$Hkey3,$Xh xorps $Xl,$Xln pshufd \$0b01001110,$Xi,$T1 pxor $Xi,$T1 pclmulqdq \$0x00,$HK,$Xm xorps $Xh,$Xhn lea 0x40($inp),$inp sub \$0x40,$len jnc .Lmod4_loop .Ltail4x: pclmulqdq \$0x00,$Hkey4,$Xi pclmulqdq \$0x11,$Hkey4,$Xhi pclmulqdq \$0x10,$HK,$T1 xorps $Xm,$Xmn xorps $Xln,$Xi xorps $Xhn,$Xhi pxor $Xi,$Xhi # aggregated Karatsuba post-processing pxor $Xmn,$T1 pxor $Xhi,$T1 # pxor $Xi,$Xhi movdqa $T1,$T2 # psrldq \$8,$T1 pslldq \$8,$T2 # pxor $T1,$Xhi pxor $T2,$Xi # ___ &reduction_alg9($Xhi,$Xi); $code.=<<___; add \$0x40,$len jz .Ldone movdqu 0x20($Htbl),$HK sub \$0x10,$len jz .Lodd_tail .Lskip4x: ___ } $code.=<<___; ####### # Xi+2 =[H*(Ii+1 + Xi+1)] mod P = # [(H*Ii+1) + (H*Xi+1)] mod P = # [(H*Ii+1) + H^2*(Ii+Xi)] mod P # movdqu ($inp),$T1 # Ii movdqu 16($inp),$Xln # Ii+1 pshufb $T3,$T1 pshufb $T3,$Xln pxor $T1,$Xi # Ii+Xi movdqa $Xln,$Xhn pshufd \$0b01001110,$Xln,$Xmn pxor $Xln,$Xmn pclmulqdq \$0x00,$Hkey,$Xln pclmulqdq \$0x11,$Hkey,$Xhn pclmulqdq \$0x00,$HK,$Xmn lea 32($inp),$inp # i+=2 nop sub \$0x20,$len jbe .Leven_tail nop jmp .Lmod_loop .align 32 .Lmod_loop: movdqa $Xi,$Xhi movdqa $Xmn,$T1 pshufd \$0b01001110,$Xi,$Xmn # pxor $Xi,$Xmn # pclmulqdq \$0x00,$Hkey2,$Xi pclmulqdq \$0x11,$Hkey2,$Xhi pclmulqdq \$0x10,$HK,$Xmn pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi movdqu ($inp),$T2 # Ii pxor $Xi,$T1 # aggregated Karatsuba post-processing pshufb $T3,$T2 movdqu 16($inp),$Xln # Ii+1 pxor $Xhi,$T1 pxor $T2,$Xhi # "Ii+Xi", consume early pxor $T1,$Xmn pshufb $T3,$Xln movdqa $Xmn,$T1 # psrldq \$8,$T1 pslldq \$8,$Xmn # pxor $T1,$Xhi pxor $Xmn,$Xi # movdqa $Xln,$Xhn # movdqa $Xi,$T2 # 1st phase movdqa $Xi,$T1 psllq \$5,$Xi pxor $Xi,$T1 # pclmulqdq \$0x00,$Hkey,$Xln ####### psllq \$1,$Xi pxor $T1,$Xi # psllq \$57,$Xi # movdqa $Xi,$T1 # pslldq \$8,$Xi psrldq \$8,$T1 # pxor $T2,$Xi pshufd \$0b01001110,$Xhn,$Xmn pxor $T1,$Xhi # pxor $Xhn,$Xmn # movdqa $Xi,$T2 # 2nd phase psrlq \$1,$Xi pclmulqdq \$0x11,$Hkey,$Xhn ####### pxor $T2,$Xhi # pxor $Xi,$T2 psrlq \$5,$Xi pxor $T2,$Xi # lea 32($inp),$inp psrlq \$1,$Xi # pclmulqdq \$0x00,$HK,$Xmn ####### pxor $Xhi,$Xi # sub \$0x20,$len ja .Lmod_loop 
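	# Tail handling: $len was biased down by one block on entry (the
	# initial sub \$0x10), so when the loop above falls through,
	# .Leven_tail finishes the pair already in flight; $len==0 at that
	# point means exactly one block is still pending and execution falls
	# into .Lodd_tail, while a negative $len means everything has been
	# folded and only the final byte swap and store in .Ldone remain.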
.Leven_tail: movdqa $Xi,$Xhi movdqa $Xmn,$T1 pshufd \$0b01001110,$Xi,$Xmn # pxor $Xi,$Xmn # pclmulqdq \$0x00,$Hkey2,$Xi pclmulqdq \$0x11,$Hkey2,$Xhi pclmulqdq \$0x10,$HK,$Xmn pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi) pxor $Xhn,$Xhi pxor $Xi,$T1 pxor $Xhi,$T1 pxor $T1,$Xmn movdqa $Xmn,$T1 # psrldq \$8,$T1 pslldq \$8,$Xmn # pxor $T1,$Xhi pxor $Xmn,$Xi # ___ &reduction_alg9 ($Xhi,$Xi); $code.=<<___; test $len,$len jnz .Ldone .Lodd_tail: movdqu ($inp),$T1 # Ii pshufb $T3,$T1 pxor $T1,$Xi # Ii+Xi ___ &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi) &reduction_alg9 ($Xhi,$Xi); $code.=<<___; .Ldone: pshufb $T3,$Xi movdqu $Xi,($Xip) ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 movaps 0x10(%rsp),%xmm7 movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 0x40(%rsp),%xmm10 movaps 0x50(%rsp),%xmm11 movaps 0x60(%rsp),%xmm12 movaps 0x70(%rsp),%xmm13 movaps 0x80(%rsp),%xmm14 movaps 0x90(%rsp),%xmm15 lea 0xa8(%rsp),%rsp .LSEH_end_gcm_ghash_clmul: ___ $code.=<<___; ret .cfi_endproc .size gcm_ghash_clmul,.-gcm_ghash_clmul ___ } $code.=<<___; .globl gcm_init_avx .type gcm_init_avx,\@abi-omnipotent .align 32 gcm_init_avx: .cfi_startproc ___ if ($avx) { my ($Htbl,$Xip)=@_4args; my $HK="%xmm6"; $code.=<<___ if ($win64); .LSEH_begin_gcm_init_avx: # I can't trust assembler to use specific encoding:-( .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp) ___ $code.=<<___; vzeroupper vmovdqu ($Xip),$Hkey vpshufd \$0b01001110,$Hkey,$Hkey # dword swap # <<1 twist vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword vpsrlq \$63,$Hkey,$T1 vpsllq \$1,$Hkey,$Hkey vpxor $T3,$T3,$T3 # vpcmpgtd $T2,$T3,$T3 # broadcast carry bit vpslldq \$8,$T1,$T1 vpor $T1,$Hkey,$Hkey # H<<=1 # magic reduction vpand .L0x1c2_polynomial(%rip),$T3,$T3 vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial vpunpckhqdq $Hkey,$Hkey,$HK vmovdqa $Hkey,$Xi vpxor $Hkey,$HK,$HK mov \$4,%r10 # up to H^8 jmp .Linit_start_avx ___ sub clmul64x64_avx { my ($Xhi,$Xi,$Hkey,$HK)=@_; if (!defined($HK)) { $HK = $T2; $code.=<<___; vpunpckhqdq $Xi,$Xi,$T1 vpunpckhqdq $Hkey,$Hkey,$T2 vpxor $Xi,$T1,$T1 # vpxor $Hkey,$T2,$T2 ___ } else { $code.=<<___; vpunpckhqdq $Xi,$Xi,$T1 vpxor $Xi,$T1,$T1 # ___ } $code.=<<___; vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi ####### vpclmulqdq \$0x00,$Hkey,$Xi,$Xi ####### vpclmulqdq \$0x00,$HK,$T1,$T1 ####### vpxor $Xi,$Xhi,$T2 # vpxor $T2,$T1,$T1 # vpslldq \$8,$T1,$T2 # vpsrldq \$8,$T1,$T1 vpxor $T2,$Xi,$Xi # vpxor $T1,$Xhi,$Xhi ___ } sub reduction_avx { my ($Xhi,$Xi) = @_; $code.=<<___; vpsllq \$57,$Xi,$T1 # 1st phase vpsllq \$62,$Xi,$T2 vpxor $T1,$T2,$T2 # vpsllq \$63,$Xi,$T1 vpxor $T1,$T2,$T2 # vpslldq \$8,$T2,$T1 # vpsrldq \$8,$T2,$T2 vpxor $T1,$Xi,$Xi # vpxor $T2,$Xhi,$Xhi vpsrlq \$1,$Xi,$T2 # 2nd phase vpxor $Xi,$Xhi,$Xhi vpxor $T2,$Xi,$Xi # vpsrlq \$5,$T2,$T2 vpxor $T2,$Xi,$Xi # vpsrlq \$1,$Xi,$Xi # vpxor $Xhi,$Xi,$Xi # ___ } $code.=<<___; .align 32 .Linit_loop_avx: vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi... 
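	# The Htbl layout produced here repeats in 0x30-byte groups: an odd
	# power H^(2i-1), the even power H^(2i), and their packed Karatsuba
	# "salt" (lo^hi halves of both); %r10 counts the four such pairs, up
	# to H^8, that the 8x-aggregated gcm_ghash_avx below consumes.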
vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt" ___ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7 &reduction_avx ($Xhi,$Xi); $code.=<<___; .Linit_start_avx: vmovdqa $Xi,$T3 ___ &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8 &reduction_avx ($Xhi,$Xi); $code.=<<___; vpshufd \$0b01001110,$T3,$T1 vpshufd \$0b01001110,$Xi,$T2 vpxor $T3,$T1,$T1 # Karatsuba pre-processing vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7 vpxor $Xi,$T2,$T2 # Karatsuba pre-processing vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8 lea 0x30($Htbl),$Htbl sub \$1,%r10 jnz .Linit_loop_avx vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped vmovdqu $T3,-0x10($Htbl) vzeroupper ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 lea 0x18(%rsp),%rsp .LSEH_end_gcm_init_avx: ___ $code.=<<___; ret .cfi_endproc .size gcm_init_avx,.-gcm_init_avx ___ } else { $code.=<<___; jmp .L_init_clmul .cfi_endproc .size gcm_init_avx,.-gcm_init_avx ___ } $code.=<<___; .globl gcm_gmult_avx .type gcm_gmult_avx,\@abi-omnipotent .align 32 gcm_gmult_avx: .cfi_startproc jmp .L_gmult_clmul .cfi_endproc .size gcm_gmult_avx,.-gcm_gmult_avx ___ $code.=<<___; .globl gcm_ghash_avx .type gcm_ghash_avx,\@abi-omnipotent .align 32 gcm_ghash_avx: .cfi_startproc ___ if ($avx) { my ($Xip,$Htbl,$inp,$len)=@_4args; my ($Xlo,$Xhi,$Xmi, $Zlo,$Zhi,$Zmi, $Hkey,$HK,$T1,$T2, $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15)); $code.=<<___ if ($win64); lea -0x88(%rsp),%rax .LSEH_begin_gcm_ghash_avx: # I can't trust assembler to use specific encoding:-( .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax) .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax) .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax) .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax) .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax) .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax) .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax) .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax) .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax) .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax) ___ $code.=<<___; vzeroupper vmovdqu ($Xip),$Xi # load $Xi lea .L0x1c2_polynomial(%rip),%r10 lea 0x40($Htbl),$Htbl # size optimization vmovdqu .Lbswap_mask(%rip),$bswap vpshufb $bswap,$Xi,$Xi cmp \$0x80,$len jb .Lshort_avx sub \$0x80,$len vmovdqu 0x70($inp),$Ii # I[7] vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 vpshufb $bswap,$Ii,$Ii vmovdqu 0x20-0x40($Htbl),$HK vpunpckhqdq $Ii,$Ii,$T2 vmovdqu 0x60($inp),$Ij # I[6] vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Ii,$T2,$T2 vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 vpunpckhqdq $Ij,$Ij,$T1 vmovdqu 0x50($inp),$Ii # I[5] vpclmulqdq \$0x00,$HK,$T2,$Xmi vpxor $Ij,$T1,$T1 vpshufb $bswap,$Ii,$Ii vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpunpckhqdq $Ii,$Ii,$T2 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 vpxor $Ii,$T2,$T2 vmovdqu 0x40($inp),$Ij # I[4] vpclmulqdq \$0x10,$HK,$T1,$Zmi vmovdqu 0x50-0x40($Htbl),$HK vpshufb $bswap,$Ij,$Ij vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Xhi,$Zhi,$Zhi vpunpckhqdq $Ij,$Ij,$T1 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T2,$Xmi vpxor $Ij,$T1,$T1 vmovdqu 0x30($inp),$Ii # I[3] vpxor $Zlo,$Xlo,$Xlo vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpxor $Zhi,$Xhi,$Xhi vpshufb $bswap,$Ii,$Ii vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 vpxor $Zmi,$Xmi,$Xmi vpunpckhqdq $Ii,$Ii,$T2 
vpclmulqdq \$0x10,$HK,$T1,$Zmi vmovdqu 0x80-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vmovdqu 0x20($inp),$Ij # I[2] vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Xhi,$Zhi,$Zhi vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 vpxor $Xmi,$Zmi,$Zmi vpunpckhqdq $Ij,$Ij,$T1 vpclmulqdq \$0x00,$HK,$T2,$Xmi vpxor $Ij,$T1,$T1 vmovdqu 0x10($inp),$Ii # I[1] vpxor $Zlo,$Xlo,$Xlo vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpxor $Zhi,$Xhi,$Xhi vpshufb $bswap,$Ii,$Ii vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 vpxor $Zmi,$Xmi,$Xmi vpunpckhqdq $Ii,$Ii,$T2 vpclmulqdq \$0x10,$HK,$T1,$Zmi vmovdqu 0xb0-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vmovdqu ($inp),$Ij # I[0] vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Xhi,$Zhi,$Zhi vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x10,$HK,$T2,$Xmi lea 0x80($inp),$inp cmp \$0x80,$len jb .Ltail_avx vpxor $Xi,$Ij,$Ij # accumulate $Xi sub \$0x80,$len jmp .Loop8x_avx .align 32 .Loop8x_avx: vpunpckhqdq $Ij,$Ij,$T1 vmovdqu 0x70($inp),$Ii # I[7] vpxor $Xlo,$Zlo,$Zlo vpxor $Ij,$T1,$T1 vpclmulqdq \$0x00,$Hkey,$Ij,$Xi vpshufb $bswap,$Ii,$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xo vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 vpunpckhqdq $Ii,$Ii,$T2 vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Tred vmovdqu 0x20-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vmovdqu 0x60($inp),$Ij # I[6] vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpxor $Zlo,$Xi,$Xi # collect result vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vxorps $Zhi,$Xo,$Xo vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 vpunpckhqdq $Ij,$Ij,$T1 vpclmulqdq \$0x00,$HK, $T2,$Xmi vpxor $Zmi,$Tred,$Tred vxorps $Ij,$T1,$T1 vmovdqu 0x50($inp),$Ii # I[5] vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpxor $Xo,$Tred,$Tred vpslldq \$8,$Tred,$T2 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vpsrldq \$8,$Tred,$Tred vpxor $T2, $Xi, $Xi vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 vpshufb $bswap,$Ii,$Ii vxorps $Tred,$Xo, $Xo vpxor $Xhi,$Zhi,$Zhi vpunpckhqdq $Ii,$Ii,$T2 vpclmulqdq \$0x10,$HK, $T1,$Zmi vmovdqu 0x50-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vpxor $Xmi,$Zmi,$Zmi vmovdqu 0x40($inp),$Ij # I[4] vpalignr \$8,$Xi,$Xi,$Tred # 1st phase vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpshufb $bswap,$Ij,$Ij vpxor $Zlo,$Xlo,$Xlo vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 vpunpckhqdq $Ij,$Ij,$T1 vpxor $Zhi,$Xhi,$Xhi vpclmulqdq \$0x00,$HK, $T2,$Xmi vxorps $Ij,$T1,$T1 vpxor $Zmi,$Xmi,$Xmi vmovdqu 0x30($inp),$Ii # I[3] vpclmulqdq \$0x10,(%r10),$Xi,$Xi vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpshufb $bswap,$Ii,$Ii vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 vpunpckhqdq $Ii,$Ii,$T2 vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x10,$HK, $T1,$Zmi vmovdqu 0x80-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vpxor $Xmi,$Zmi,$Zmi vmovdqu 0x20($inp),$Ij # I[2] vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpshufb $bswap,$Ij,$Ij vpxor $Zlo,$Xlo,$Xlo vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 vpunpckhqdq $Ij,$Ij,$T1 vpxor $Zhi,$Xhi,$Xhi vpclmulqdq \$0x00,$HK, $T2,$Xmi vpxor $Ij,$T1,$T1 vpxor $Zmi,$Xmi,$Xmi vxorps $Tred,$Xi,$Xi vmovdqu 0x10($inp),$Ii # I[1] vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo vpshufb $bswap,$Ii,$Ii vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 vpclmulqdq \$0x10,(%r10),$Xi,$Xi vxorps 
$Xo,$Tred,$Tred vpunpckhqdq $Ii,$Ii,$T2 vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x10,$HK, $T1,$Zmi vmovdqu 0xb0-0x40($Htbl),$HK vpxor $Ii,$T2,$T2 vpxor $Xmi,$Zmi,$Zmi vmovdqu ($inp),$Ij # I[0] vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo vpshufb $bswap,$Ij,$Ij vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8 vpxor $Tred,$Ij,$Ij vpclmulqdq \$0x10,$HK, $T2,$Xmi vpxor $Xi,$Ij,$Ij # accumulate $Xi lea 0x80($inp),$inp sub \$0x80,$len jnc .Loop8x_avx add \$0x80,$len jmp .Ltail_no_xor_avx .align 32 .Lshort_avx: vmovdqu -0x10($inp,$len),$Ii # very last word lea ($inp,$len),$inp vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1 vmovdqu 0x20-0x40($Htbl),$HK vpshufb $bswap,$Ii,$Ij vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo, vmovdqa $Xhi,$Zhi # $Zhi and vmovdqa $Xmi,$Zmi # $Zmi sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x20($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vpsrldq \$8,$HK,$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x30($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vmovdqu 0x50-0x40($Htbl),$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x40($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vpsrldq \$8,$HK,$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x50($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vmovdqu 0x80-0x40($Htbl),$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x60($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vpsrldq \$8,$HK,$HK sub \$0x10,$len jz .Ltail_avx vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vmovdqu -0x70($inp),$Ii vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7 vpshufb $bswap,$Ii,$Ij vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vmovq 0xb8-0x40($Htbl),$HK sub \$0x10,$len jmp .Ltail_avx .align 32 .Ltail_avx: vpxor $Xi,$Ij,$Ij # accumulate $Xi .Ltail_no_xor_avx: vpunpckhqdq $Ij,$Ij,$T1 vpxor $Xlo,$Zlo,$Zlo vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo vpxor $Ij,$T1,$T1 vpxor $Xhi,$Zhi,$Zhi vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi vpxor $Xmi,$Zmi,$Zmi vpclmulqdq \$0x00,$HK,$T1,$Xmi vmovdqu (%r10),$Tred vpxor $Xlo,$Zlo,$Xi vpxor $Xhi,$Zhi,$Xo vpxor $Xmi,$Zmi,$Zmi vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing vpxor $Xo, $Zmi,$Zmi vpslldq \$8, $Zmi,$T2 vpsrldq \$8, $Zmi,$Zmi vpxor $T2, $Xi, $Xi vpxor $Zmi,$Xo, $Xo vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase vpalignr \$8,$Xi,$Xi,$Xi vpxor $T2,$Xi,$Xi vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase vpalignr \$8,$Xi,$Xi,$Xi 
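	# $Tred was loaded with .L0x1c2_polynomial above; the two
	# vpclmulqdq/vpalignr rounds marked "1st phase" and "2nd phase"
	# reduce the 256-bit Karatsuba result modulo the GHASH polynomial,
	# and the xors that follow fold in the deferred high half ($Xo)
	# and the second-phase product ($T2).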
vpxor $Xo,$Xi,$Xi vpxor $T2,$Xi,$Xi cmp \$0,$len jne .Lshort_avx vpshufb $bswap,$Xi,$Xi vmovdqu $Xi,($Xip) vzeroupper ___ $code.=<<___ if ($win64); movaps (%rsp),%xmm6 movaps 0x10(%rsp),%xmm7 movaps 0x20(%rsp),%xmm8 movaps 0x30(%rsp),%xmm9 movaps 0x40(%rsp),%xmm10 movaps 0x50(%rsp),%xmm11 movaps 0x60(%rsp),%xmm12 movaps 0x70(%rsp),%xmm13 movaps 0x80(%rsp),%xmm14 movaps 0x90(%rsp),%xmm15 lea 0xa8(%rsp),%rsp .LSEH_end_gcm_ghash_avx: ___ $code.=<<___; ret .cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx ___ } else { $code.=<<___; jmp .L_ghash_clmul .cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx ___ } $code.=<<___; .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .L7_mask: .long 7,0,7,0 .L7_mask_poly: .long 7,0,`0xE1<<1`,0 .align 64 .type .Lrem_4bit,\@object .Lrem_4bit: .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16` .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16` .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16` .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16` .type .Lrem_8bit,\@object .Lrem_8bit: .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE .asciz "GHASH for x86_64, CRYPTOGAMS by " .align 64 ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push 
%rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue lea 48+280(%rax),%rax # adjust "rsp" mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$`1232/8`,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler .section .pdata .align 4 .rva .LSEH_begin_gcm_gmult_4bit .rva .LSEH_end_gcm_gmult_4bit .rva .LSEH_info_gcm_gmult_4bit .rva .LSEH_begin_gcm_ghash_4bit .rva .LSEH_end_gcm_ghash_4bit .rva .LSEH_info_gcm_ghash_4bit .rva .LSEH_begin_gcm_init_clmul .rva .LSEH_end_gcm_init_clmul .rva .LSEH_info_gcm_init_clmul .rva .LSEH_begin_gcm_ghash_clmul .rva .LSEH_end_gcm_ghash_clmul .rva .LSEH_info_gcm_ghash_clmul ___ $code.=<<___ if ($avx); .rva .LSEH_begin_gcm_init_avx .rva .LSEH_end_gcm_init_avx .rva .LSEH_info_gcm_init_clmul .rva .LSEH_begin_gcm_ghash_avx .rva .LSEH_end_gcm_ghash_avx .rva .LSEH_info_gcm_ghash_clmul ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_gcm_gmult_4bit: .byte 9,0,0,0 .rva se_handler .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData .LSEH_info_gcm_ghash_4bit: .byte 9,0,0,0 .rva se_handler .rva .Lghash_prologue,.Lghash_epilogue # HandlerData .LSEH_info_gcm_init_clmul: .byte 0x01,0x08,0x03,0x00 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18 .LSEH_info_gcm_ghash_clmul: .byte 0x01,0x33,0x16,0x00 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8 ___ } $code =~ s/\`([^\`]*)\`/eval($1)/gem; print $code; close STDOUT or die "error closing STDOUT: $!"; Index: 
stable/12/crypto/openssl/crypto/poly1305/asm/poly1305-x86.pl =================================================================== --- stable/12/crypto/openssl/crypto/poly1305/asm/poly1305-x86.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/poly1305/asm/poly1305-x86.pl (revision 364963) @@ -1,1815 +1,1815 @@ #! /usr/bin/env perl # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # This module implements Poly1305 hash for x86. # # April 2015 # # Numbers are cycles per processed byte with poly1305_blocks alone, # measured with rdtsc at fixed clock frequency. # # IALU/gcc-3.4(*) SSE2(**) AVX2 # Pentium 15.7/+80% - # PIII 6.21/+90% - # P4 19.8/+40% 3.24 # Core 2 4.85/+90% 1.80 # Westmere 4.58/+100% 1.43 # Sandy Bridge 3.90/+100% 1.36 # Haswell 3.88/+70% 1.18 0.72 # Skylake 3.10/+60% 1.14 0.62 # Silvermont 11.0/+40% 4.80 # Goldmont 4.10/+200% 2.10 # VIA Nano 6.71/+90% 2.47 # Sledgehammer 3.51/+180% 4.27 # Bulldozer 4.53/+140% 1.31 # # (*) gcc 4.8 for some reason generated worse code; # (**) besides SSE2 there are floating-point and AVX options; FP # is deemed unnecessary, because pre-SSE2 processor are too # old to care about, while it's not the fastest option on # SSE2-capable ones; AVX is omitted, because it doesn't give # a lot of improvement, 5-10% depending on processor; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; $output=pop; open STDOUT,">$output"; &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $sse2=$avx=0; for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } if ($sse2) { &static_label("const_sse2"); &static_label("enter_blocks"); &static_label("enter_emit"); &external_label("OPENSSL_ia32cap_P"); if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if (!$avx && $ARGV[0] eq "win32n" && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } - if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) { + if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } } ######################################################################## # Layout of opaque area is following. 
# # unsigned __int32 h[5]; # current hash value base 2^32 # unsigned __int32 pad; # is_base2_26 in vector context # unsigned __int32 r[4]; # key value base 2^32 &align(64); &function_begin("poly1305_init"); &mov ("edi",&wparam(0)); # context &mov ("esi",&wparam(1)); # key &mov ("ebp",&wparam(2)); # function table &xor ("eax","eax"); &mov (&DWP(4*0,"edi"),"eax"); # zero hash value &mov (&DWP(4*1,"edi"),"eax"); &mov (&DWP(4*2,"edi"),"eax"); &mov (&DWP(4*3,"edi"),"eax"); &mov (&DWP(4*4,"edi"),"eax"); &mov (&DWP(4*5,"edi"),"eax"); # is_base2_26 &cmp ("esi",0); &je (&label("nokey")); if ($sse2) { &call (&label("pic_point")); &set_label("pic_point"); &blindpop("ebx"); &lea ("eax",&DWP("poly1305_blocks-".&label("pic_point"),"ebx")); &lea ("edx",&DWP("poly1305_emit-".&label("pic_point"),"ebx")); &picmeup("edi","OPENSSL_ia32cap_P","ebx",&label("pic_point")); &mov ("ecx",&DWP(0,"edi")); &and ("ecx",1<<26|1<<24); &cmp ("ecx",1<<26|1<<24); # SSE2 and XMM? &jne (&label("no_sse2")); &lea ("eax",&DWP("_poly1305_blocks_sse2-".&label("pic_point"),"ebx")); &lea ("edx",&DWP("_poly1305_emit_sse2-".&label("pic_point"),"ebx")); if ($avx>1) { &mov ("ecx",&DWP(8,"edi")); &test ("ecx",1<<5); # AVX2? &jz (&label("no_sse2")); &lea ("eax",&DWP("_poly1305_blocks_avx2-".&label("pic_point"),"ebx")); } &set_label("no_sse2"); &mov ("edi",&wparam(0)); # reload context &mov (&DWP(0,"ebp"),"eax"); # fill function table &mov (&DWP(4,"ebp"),"edx"); } &mov ("eax",&DWP(4*0,"esi")); # load input key &mov ("ebx",&DWP(4*1,"esi")); &mov ("ecx",&DWP(4*2,"esi")); &mov ("edx",&DWP(4*3,"esi")); &and ("eax",0x0fffffff); &and ("ebx",0x0ffffffc); &and ("ecx",0x0ffffffc); &and ("edx",0x0ffffffc); &mov (&DWP(4*6,"edi"),"eax"); &mov (&DWP(4*7,"edi"),"ebx"); &mov (&DWP(4*8,"edi"),"ecx"); &mov (&DWP(4*9,"edi"),"edx"); &mov ("eax",$sse2); &set_label("nokey"); &function_end("poly1305_init"); ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $r0,$r1,$r2,$r3, $s1,$s2,$s3)=map(4*$_,(0..15)); &function_begin("poly1305_blocks"); &mov ("edi",&wparam(0)); # ctx &mov ("esi",&wparam(1)); # inp &mov ("ecx",&wparam(2)); # len &set_label("enter_blocks"); &and ("ecx",-15); &jz (&label("nodata")); &stack_push(16); &mov ("eax",&DWP(4*6,"edi")); # r0 &mov ("ebx",&DWP(4*7,"edi")); # r1 &lea ("ebp",&DWP(0,"esi","ecx")); # end of input &mov ("ecx",&DWP(4*8,"edi")); # r2 &mov ("edx",&DWP(4*9,"edi")); # r3 &mov (&wparam(2),"ebp"); &mov ("ebp","esi"); &mov (&DWP($r0,"esp"),"eax"); # r0 &mov ("eax","ebx"); &shr ("eax",2); &mov (&DWP($r1,"esp"),"ebx"); # r1 &add ("eax","ebx"); # s1 &mov ("ebx","ecx"); &shr ("ebx",2); &mov (&DWP($r2,"esp"),"ecx"); # r2 &add ("ebx","ecx"); # s2 &mov ("ecx","edx"); &shr ("ecx",2); &mov (&DWP($r3,"esp"),"edx"); # r3 &add ("ecx","edx"); # s3 &mov (&DWP($s1,"esp"),"eax"); # s1 &mov (&DWP($s2,"esp"),"ebx"); # s2 &mov (&DWP($s3,"esp"),"ecx"); # s3 &mov ("eax",&DWP(4*0,"edi")); # load hash value &mov ("ebx",&DWP(4*1,"edi")); &mov ("ecx",&DWP(4*2,"edi")); &mov ("esi",&DWP(4*3,"edi")); &mov ("edi",&DWP(4*4,"edi")); &jmp (&label("loop")); &set_label("loop",32); &add ("eax",&DWP(4*0,"ebp")); # accumulate input &adc ("ebx",&DWP(4*1,"ebp")); &adc ("ecx",&DWP(4*2,"ebp")); &adc ("esi",&DWP(4*3,"ebp")); &lea ("ebp",&DWP(4*4,"ebp")); &adc ("edi",&wparam(3)); # padbit &mov (&DWP($h0,"esp"),"eax"); # put aside hash[+inp] &mov (&DWP($h3,"esp"),"esi"); &mul (&DWP($r0,"esp")); # h0*r0 &mov (&DWP($h4,"esp"),"edi"); &mov ("edi","eax"); &mov ("eax","ebx"); # h1 &mov ("esi","edx"); &mul (&DWP($s3,"esp")); # h1*s3 &add ("edi","eax"); &mov ("eax","ecx"); # h2 &adc 
("esi","edx"); &mul (&DWP($s2,"esp")); # h2*s2 &add ("edi","eax"); &mov ("eax",&DWP($h3,"esp")); &adc ("esi","edx"); &mul (&DWP($s1,"esp")); # h3*s1 &add ("edi","eax"); &mov ("eax",&DWP($h0,"esp")); &adc ("esi","edx"); &mul (&DWP($r1,"esp")); # h0*r1 &mov (&DWP($d0,"esp"),"edi"); &xor ("edi","edi"); &add ("esi","eax"); &mov ("eax","ebx"); # h1 &adc ("edi","edx"); &mul (&DWP($r0,"esp")); # h1*r0 &add ("esi","eax"); &mov ("eax","ecx"); # h2 &adc ("edi","edx"); &mul (&DWP($s3,"esp")); # h2*s3 &add ("esi","eax"); &mov ("eax",&DWP($h3,"esp")); &adc ("edi","edx"); &mul (&DWP($s2,"esp")); # h3*s2 &add ("esi","eax"); &mov ("eax",&DWP($h4,"esp")); &adc ("edi","edx"); &imul ("eax",&DWP($s1,"esp")); # h4*s1 &add ("esi","eax"); &mov ("eax",&DWP($h0,"esp")); &adc ("edi",0); &mul (&DWP($r2,"esp")); # h0*r2 &mov (&DWP($d1,"esp"),"esi"); &xor ("esi","esi"); &add ("edi","eax"); &mov ("eax","ebx"); # h1 &adc ("esi","edx"); &mul (&DWP($r1,"esp")); # h1*r1 &add ("edi","eax"); &mov ("eax","ecx"); # h2 &adc ("esi","edx"); &mul (&DWP($r0,"esp")); # h2*r0 &add ("edi","eax"); &mov ("eax",&DWP($h3,"esp")); &adc ("esi","edx"); &mul (&DWP($s3,"esp")); # h3*s3 &add ("edi","eax"); &mov ("eax",&DWP($h4,"esp")); &adc ("esi","edx"); &imul ("eax",&DWP($s2,"esp")); # h4*s2 &add ("edi","eax"); &mov ("eax",&DWP($h0,"esp")); &adc ("esi",0); &mul (&DWP($r3,"esp")); # h0*r3 &mov (&DWP($d2,"esp"),"edi"); &xor ("edi","edi"); &add ("esi","eax"); &mov ("eax","ebx"); # h1 &adc ("edi","edx"); &mul (&DWP($r2,"esp")); # h1*r2 &add ("esi","eax"); &mov ("eax","ecx"); # h2 &adc ("edi","edx"); &mul (&DWP($r1,"esp")); # h2*r1 &add ("esi","eax"); &mov ("eax",&DWP($h3,"esp")); &adc ("edi","edx"); &mul (&DWP($r0,"esp")); # h3*r0 &add ("esi","eax"); &mov ("ecx",&DWP($h4,"esp")); &adc ("edi","edx"); &mov ("edx","ecx"); &imul ("ecx",&DWP($s3,"esp")); # h4*s3 &add ("esi","ecx"); &mov ("eax",&DWP($d0,"esp")); &adc ("edi",0); &imul ("edx",&DWP($r0,"esp")); # h4*r0 &add ("edx","edi"); &mov ("ebx",&DWP($d1,"esp")); &mov ("ecx",&DWP($d2,"esp")); &mov ("edi","edx"); # last reduction step &shr ("edx",2); &and ("edi",3); &lea ("edx",&DWP(0,"edx","edx",4)); # *5 &add ("eax","edx"); &adc ("ebx",0); &adc ("ecx",0); &adc ("esi",0); &adc ("edi",0); &cmp ("ebp",&wparam(2)); # done yet? &jne (&label("loop")); &mov ("edx",&wparam(0)); # ctx &stack_pop(16); &mov (&DWP(4*0,"edx"),"eax"); # store hash value &mov (&DWP(4*1,"edx"),"ebx"); &mov (&DWP(4*2,"edx"),"ecx"); &mov (&DWP(4*3,"edx"),"esi"); &mov (&DWP(4*4,"edx"),"edi"); &set_label("nodata"); &function_end("poly1305_blocks"); &function_begin("poly1305_emit"); &mov ("ebp",&wparam(0)); # context &set_label("enter_emit"); &mov ("edi",&wparam(1)); # output &mov ("eax",&DWP(4*0,"ebp")); # load hash value &mov ("ebx",&DWP(4*1,"ebp")); &mov ("ecx",&DWP(4*2,"ebp")); &mov ("edx",&DWP(4*3,"ebp")); &mov ("esi",&DWP(4*4,"ebp")); &add ("eax",5); # compare to modulus &adc ("ebx",0); &adc ("ecx",0); &adc ("edx",0); &adc ("esi",0); &shr ("esi",2); # did it carry/borrow? &neg ("esi"); # do we choose hash-modulus? &and ("eax","esi"); &and ("ebx","esi"); &and ("ecx","esi"); &and ("edx","esi"); &mov (&DWP(4*0,"edi"),"eax"); &mov (&DWP(4*1,"edi"),"ebx"); &mov (&DWP(4*2,"edi"),"ecx"); &mov (&DWP(4*3,"edi"),"edx"); ¬ ("esi"); # or original hash value? 
&mov ("eax",&DWP(4*0,"ebp")); &mov ("ebx",&DWP(4*1,"ebp")); &mov ("ecx",&DWP(4*2,"ebp")); &mov ("edx",&DWP(4*3,"ebp")); &mov ("ebp",&wparam(2)); &and ("eax","esi"); &and ("ebx","esi"); &and ("ecx","esi"); &and ("edx","esi"); &or ("eax",&DWP(4*0,"edi")); &or ("ebx",&DWP(4*1,"edi")); &or ("ecx",&DWP(4*2,"edi")); &or ("edx",&DWP(4*3,"edi")); &add ("eax",&DWP(4*0,"ebp")); # accumulate key &adc ("ebx",&DWP(4*1,"ebp")); &adc ("ecx",&DWP(4*2,"ebp")); &adc ("edx",&DWP(4*3,"ebp")); &mov (&DWP(4*0,"edi"),"eax"); &mov (&DWP(4*1,"edi"),"ebx"); &mov (&DWP(4*2,"edi"),"ecx"); &mov (&DWP(4*3,"edi"),"edx"); &function_end("poly1305_emit"); if ($sse2) { ######################################################################## # Layout of opaque area is following. # # unsigned __int32 h[5]; # current hash value base 2^26 # unsigned __int32 is_base2_26; # unsigned __int32 r[4]; # key value base 2^32 # unsigned __int32 pad[2]; # struct { unsigned __int32 r^4, r^3, r^2, r^1; } r[9]; # # where r^n are base 2^26 digits of degrees of multiplier key. There are # 5 digits, but last four are interleaved with multiples of 5, totalling # in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("xmm$_",(0..7)); my $MASK=$T2; # borrow and keep in mind &align (32); &function_begin_B("_poly1305_init_sse2"); &movdqu ($D4,&QWP(4*6,"edi")); # key base 2^32 &lea ("edi",&DWP(16*3,"edi")); # size optimization &mov ("ebp","esp"); &sub ("esp",16*(9+5)); &and ("esp",-16); #&pand ($D4,&QWP(96,"ebx")); # magic mask &movq ($MASK,&QWP(64,"ebx")); &movdqa ($D0,$D4); &movdqa ($D1,$D4); &movdqa ($D2,$D4); &pand ($D0,$MASK); # -> base 2^26 &psrlq ($D1,26); &psrldq ($D2,6); &pand ($D1,$MASK); &movdqa ($D3,$D2); &psrlq ($D2,4) &psrlq ($D3,30); &pand ($D2,$MASK); &pand ($D3,$MASK); &psrldq ($D4,13); &lea ("edx",&DWP(16*9,"esp")); # size optimization &mov ("ecx",2); &set_label("square"); &movdqa (&QWP(16*0,"esp"),$D0); &movdqa (&QWP(16*1,"esp"),$D1); &movdqa (&QWP(16*2,"esp"),$D2); &movdqa (&QWP(16*3,"esp"),$D3); &movdqa (&QWP(16*4,"esp"),$D4); &movdqa ($T1,$D1); &movdqa ($T0,$D2); &pslld ($T1,2); &pslld ($T0,2); &paddd ($T1,$D1); # *5 &paddd ($T0,$D2); # *5 &movdqa (&QWP(16*5,"esp"),$T1); &movdqa (&QWP(16*6,"esp"),$T0); &movdqa ($T1,$D3); &movdqa ($T0,$D4); &pslld ($T1,2); &pslld ($T0,2); &paddd ($T1,$D3); # *5 &paddd ($T0,$D4); # *5 &movdqa (&QWP(16*7,"esp"),$T1); &movdqa (&QWP(16*8,"esp"),$T0); &pshufd ($T1,$D0,0b01000100); &movdqa ($T0,$D1); &pshufd ($D1,$D1,0b01000100); &pshufd ($D2,$D2,0b01000100); &pshufd ($D3,$D3,0b01000100); &pshufd ($D4,$D4,0b01000100); &movdqa (&QWP(16*0,"edx"),$T1); &movdqa (&QWP(16*1,"edx"),$D1); &movdqa (&QWP(16*2,"edx"),$D2); &movdqa (&QWP(16*3,"edx"),$D3); &movdqa (&QWP(16*4,"edx"),$D4); ################################################################ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 &pmuludq ($D4,$D0); # h4*r0 &pmuludq ($D3,$D0); # h3*r0 &pmuludq ($D2,$D0); # h2*r0 &pmuludq ($D1,$D0); # h1*r0 &pmuludq ($D0,$T1); # h0*r0 sub pmuladd { my $load = shift; my $base = shift; $base = "esp" if (!defined($base)); ################################################################ # As for choice to "rotate" $T0-$T2 in order to move paddq # past next multiplication. 
While it makes code harder to read # and doesn't have significant effect on most processors, it # makes a lot of difference on Atom, up to 30% improvement. &movdqa ($T1,$T0); &pmuludq ($T0,&QWP(16*3,$base)); # r1*h3 &movdqa ($T2,$T1); &pmuludq ($T1,&QWP(16*2,$base)); # r1*h2 &paddq ($D4,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&QWP(16*1,$base)); # r1*h1 &paddq ($D3,$T1); &$load ($T1,5); # s1 &pmuludq ($T0,&QWP(16*0,$base)); # r1*h0 &paddq ($D2,$T2); &pmuludq ($T1,&QWP(16*4,$base)); # s1*h4 &$load ($T2,2); # r2^n &paddq ($D1,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&QWP(16*2,$base)); # r2*h2 &paddq ($D0,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&QWP(16*1,$base)); # r2*h1 &paddq ($D4,$T2); &$load ($T2,6); # s2^n &pmuludq ($T1,&QWP(16*0,$base)); # r2*h0 &paddq ($D3,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&QWP(16*4,$base)); # s2*h4 &paddq ($D2,$T1); &pmuludq ($T0,&QWP(16*3,$base)); # s2*h3 &$load ($T1,3); # r3^n &paddq ($D1,$T2); &movdqa ($T2,$T1); &pmuludq ($T1,&QWP(16*1,$base)); # r3*h1 &paddq ($D0,$T0); &$load ($T0,7); # s3^n &pmuludq ($T2,&QWP(16*0,$base)); # r3*h0 &paddq ($D4,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&QWP(16*4,$base)); # s3*h4 &paddq ($D3,$T2); &movdqa ($T2,$T1); &pmuludq ($T1,&QWP(16*3,$base)); # s3*h3 &paddq ($D2,$T0); &pmuludq ($T2,&QWP(16*2,$base)); # s3*h2 &$load ($T0,4); # r4^n &paddq ($D1,$T1); &$load ($T1,8); # s4^n &pmuludq ($T0,&QWP(16*0,$base)); # r4*h0 &paddq ($D0,$T2); &movdqa ($T2,$T1); &pmuludq ($T1,&QWP(16*4,$base)); # s4*h4 &paddq ($D4,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&QWP(16*1,$base)); # s4*h1 &paddq ($D3,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&QWP(16*2,$base)); # s4*h2 &paddq ($D0,$T2); &pmuludq ($T1,&QWP(16*3,$base)); # s4*h3 &movdqa ($MASK,&QWP(64,"ebx")); &paddq ($D1,$T0); &paddq ($D2,$T1); } &pmuladd (sub { my ($reg,$i)=@_; &movdqa ($reg,&QWP(16*$i,"esp")); },"edx"); sub lazy_reduction { my $extra = shift; ################################################################ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein # and P. 
Schwabe # # [(*) see discussion in poly1305-armv4 module] &movdqa ($T0,$D3); &pand ($D3,$MASK); &psrlq ($T0,26); &$extra () if (defined($extra)); &paddq ($T0,$D4); # h3 -> h4 &movdqa ($T1,$D0); &pand ($D0,$MASK); &psrlq ($T1,26); &movdqa ($D4,$T0); &paddq ($T1,$D1); # h0 -> h1 &psrlq ($T0,26); &pand ($D4,$MASK); &movdqa ($D1,$T1); &psrlq ($T1,26); &paddd ($D0,$T0); # favour paddd when # possible, because # paddq is "broken" # on Atom &psllq ($T0,2); &paddq ($T1,$D2); # h1 -> h2 &paddq ($T0,$D0); # h4 -> h0 (*) &pand ($D1,$MASK); &movdqa ($D2,$T1); &psrlq ($T1,26); &pand ($D2,$MASK); &paddd ($T1,$D3); # h2 -> h3 &movdqa ($D0,$T0); &psrlq ($T0,26); &movdqa ($D3,$T1); &psrlq ($T1,26); &pand ($D0,$MASK); &paddd ($D1,$T0); # h0 -> h1 &pand ($D3,$MASK); &paddd ($D4,$T1); # h3 -> h4 } &lazy_reduction (); &dec ("ecx"); &jz (&label("square_break")); &punpcklqdq ($D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2 &punpcklqdq ($D1,&QWP(16*1,"esp")); &punpcklqdq ($D2,&QWP(16*2,"esp")); &punpcklqdq ($D3,&QWP(16*3,"esp")); &punpcklqdq ($D4,&QWP(16*4,"esp")); &jmp (&label("square")); &set_label("square_break"); &psllq ($D0,32); # -> r^3:0:r^4:0 &psllq ($D1,32); &psllq ($D2,32); &psllq ($D3,32); &psllq ($D4,32); &por ($D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2 &por ($D1,&QWP(16*1,"esp")); &por ($D2,&QWP(16*2,"esp")); &por ($D3,&QWP(16*3,"esp")); &por ($D4,&QWP(16*4,"esp")); &pshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4 &pshufd ($D1,$D1,0b10001101); &pshufd ($D2,$D2,0b10001101); &pshufd ($D3,$D3,0b10001101); &pshufd ($D4,$D4,0b10001101); &movdqu (&QWP(16*0,"edi"),$D0); # save the table &movdqu (&QWP(16*1,"edi"),$D1); &movdqu (&QWP(16*2,"edi"),$D2); &movdqu (&QWP(16*3,"edi"),$D3); &movdqu (&QWP(16*4,"edi"),$D4); &movdqa ($T1,$D1); &movdqa ($T0,$D2); &pslld ($T1,2); &pslld ($T0,2); &paddd ($T1,$D1); # *5 &paddd ($T0,$D2); # *5 &movdqu (&QWP(16*5,"edi"),$T1); &movdqu (&QWP(16*6,"edi"),$T0); &movdqa ($T1,$D3); &movdqa ($T0,$D4); &pslld ($T1,2); &pslld ($T0,2); &paddd ($T1,$D3); # *5 &paddd ($T0,$D4); # *5 &movdqu (&QWP(16*7,"edi"),$T1); &movdqu (&QWP(16*8,"edi"),$T0); &mov ("esp","ebp"); &lea ("edi",&DWP(-16*3,"edi")); # size de-optimization &ret (); &function_end_B("_poly1305_init_sse2"); &align (32); &function_begin("_poly1305_blocks_sse2"); &mov ("edi",&wparam(0)); # ctx &mov ("esi",&wparam(1)); # inp &mov ("ecx",&wparam(2)); # len &mov ("eax",&DWP(4*5,"edi")); # is_base2_26 &and ("ecx",-16); &jz (&label("nodata")); &cmp ("ecx",64); &jae (&label("enter_sse2")); &test ("eax","eax"); # is_base2_26? &jz (&label("enter_blocks")); &set_label("enter_sse2",16); &call (&label("pic_point")); &set_label("pic_point"); &blindpop("ebx"); &lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx")); &test ("eax","eax"); # is_base2_26? 
&jnz (&label("base2_26")); &call ("_poly1305_init_sse2"); ################################################# base 2^32 -> base 2^26 &mov ("eax",&DWP(0,"edi")); &mov ("ecx",&DWP(3,"edi")); &mov ("edx",&DWP(6,"edi")); &mov ("esi",&DWP(9,"edi")); &mov ("ebp",&DWP(13,"edi")); &mov (&DWP(4*5,"edi"),1); # is_base2_26 &shr ("ecx",2); &and ("eax",0x3ffffff); &shr ("edx",4); &and ("ecx",0x3ffffff); &shr ("esi",6); &and ("edx",0x3ffffff); &movd ($D0,"eax"); &movd ($D1,"ecx"); &movd ($D2,"edx"); &movd ($D3,"esi"); &movd ($D4,"ebp"); &mov ("esi",&wparam(1)); # [reload] inp &mov ("ecx",&wparam(2)); # [reload] len &jmp (&label("base2_32")); &set_label("base2_26",16); &movd ($D0,&DWP(4*0,"edi")); # load hash value &movd ($D1,&DWP(4*1,"edi")); &movd ($D2,&DWP(4*2,"edi")); &movd ($D3,&DWP(4*3,"edi")); &movd ($D4,&DWP(4*4,"edi")); &movdqa ($MASK,&QWP(64,"ebx")); &set_label("base2_32"); &mov ("eax",&wparam(3)); # padbit &mov ("ebp","esp"); &sub ("esp",16*(5+5+5+9+9)); &and ("esp",-16); &lea ("edi",&DWP(16*3,"edi")); # size optimization &shl ("eax",24); # padbit &test ("ecx",31); &jz (&label("even")); ################################################################ # process single block, with SSE2, because it's still faster # even though half of result is discarded &movdqu ($T1,&QWP(0,"esi")); # input &lea ("esi",&DWP(16,"esi")); &movdqa ($T0,$T1); # -> base 2^26 ... &pand ($T1,$MASK); &paddd ($D0,$T1); # ... and accumulate &movdqa ($T1,$T0); &psrlq ($T0,26); &psrldq ($T1,6); &pand ($T0,$MASK); &paddd ($D1,$T0); &movdqa ($T0,$T1); &psrlq ($T1,4); &pand ($T1,$MASK); &paddd ($D2,$T1); &movdqa ($T1,$T0); &psrlq ($T0,30); &pand ($T0,$MASK); &psrldq ($T1,7); &paddd ($D3,$T0); &movd ($T0,"eax"); # padbit &paddd ($D4,$T1); &movd ($T1,&DWP(16*0+12,"edi")); # r0 &paddd ($D4,$T0); &movdqa (&QWP(16*0,"esp"),$D0); &movdqa (&QWP(16*1,"esp"),$D1); &movdqa (&QWP(16*2,"esp"),$D2); &movdqa (&QWP(16*3,"esp"),$D3); &movdqa (&QWP(16*4,"esp"),$D4); ################################################################ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 &pmuludq ($D0,$T1); # h4*r0 &pmuludq ($D1,$T1); # h3*r0 &pmuludq ($D2,$T1); # h2*r0 &movd ($T0,&DWP(16*1+12,"edi")); # r1 &pmuludq ($D3,$T1); # h1*r0 &pmuludq ($D4,$T1); # h0*r0 &pmuladd (sub { my ($reg,$i)=@_; &movd ($reg,&DWP(16*$i+12,"edi")); }); &lazy_reduction (); &sub ("ecx",16); &jz (&label("done")); &set_label("even"); &lea ("edx",&DWP(16*(5+5+5+9),"esp"));# size optimization &lea ("eax",&DWP(-16*2,"esi")); &sub ("ecx",64); ################################################################ # expand and copy pre-calculated table to stack &movdqu ($T0,&QWP(16*0,"edi")); # r^1:r^2:r^3:r^4 &pshufd ($T1,$T0,0b01000100); # duplicate r^3:r^4 &cmovb ("esi","eax"); &pshufd ($T0,$T0,0b11101110); # duplicate r^1:r^2 &movdqa (&QWP(16*0,"edx"),$T1); &lea ("eax",&DWP(16*10,"esp")); &movdqu ($T1,&QWP(16*1,"edi")); &movdqa (&QWP(16*(0-9),"edx"),$T0); &pshufd ($T0,$T1,0b01000100); &pshufd ($T1,$T1,0b11101110); &movdqa (&QWP(16*1,"edx"),$T0); &movdqu ($T0,&QWP(16*2,"edi")); &movdqa (&QWP(16*(1-9),"edx"),$T1); &pshufd ($T1,$T0,0b01000100); &pshufd ($T0,$T0,0b11101110); &movdqa (&QWP(16*2,"edx"),$T1); &movdqu ($T1,&QWP(16*3,"edi")); &movdqa (&QWP(16*(2-9),"edx"),$T0); &pshufd ($T0,$T1,0b01000100); &pshufd ($T1,$T1,0b11101110); &movdqa (&QWP(16*3,"edx"),$T0); &movdqu ($T0,&QWP(16*4,"edi")); 
&movdqa (&QWP(16*(3-9),"edx"),$T1); &pshufd ($T1,$T0,0b01000100); &pshufd ($T0,$T0,0b11101110); &movdqa (&QWP(16*4,"edx"),$T1); &movdqu ($T1,&QWP(16*5,"edi")); &movdqa (&QWP(16*(4-9),"edx"),$T0); &pshufd ($T0,$T1,0b01000100); &pshufd ($T1,$T1,0b11101110); &movdqa (&QWP(16*5,"edx"),$T0); &movdqu ($T0,&QWP(16*6,"edi")); &movdqa (&QWP(16*(5-9),"edx"),$T1); &pshufd ($T1,$T0,0b01000100); &pshufd ($T0,$T0,0b11101110); &movdqa (&QWP(16*6,"edx"),$T1); &movdqu ($T1,&QWP(16*7,"edi")); &movdqa (&QWP(16*(6-9),"edx"),$T0); &pshufd ($T0,$T1,0b01000100); &pshufd ($T1,$T1,0b11101110); &movdqa (&QWP(16*7,"edx"),$T0); &movdqu ($T0,&QWP(16*8,"edi")); &movdqa (&QWP(16*(7-9),"edx"),$T1); &pshufd ($T1,$T0,0b01000100); &pshufd ($T0,$T0,0b11101110); &movdqa (&QWP(16*8,"edx"),$T1); &movdqa (&QWP(16*(8-9),"edx"),$T0); sub load_input { my ($inpbase,$offbase)=@_; &movdqu ($T0,&QWP($inpbase+0,"esi")); # load input &movdqu ($T1,&QWP($inpbase+16,"esi")); &lea ("esi",&DWP(16*2,"esi")); &movdqa (&QWP($offbase+16*2,"esp"),$D2); &movdqa (&QWP($offbase+16*3,"esp"),$D3); &movdqa (&QWP($offbase+16*4,"esp"),$D4); &movdqa ($D2,$T0); # splat input &movdqa ($D3,$T1); &psrldq ($D2,6); &psrldq ($D3,6); &movdqa ($D4,$T0); &punpcklqdq ($D2,$D3); # 2:3 &punpckhqdq ($D4,$T1); # 4 &punpcklqdq ($T0,$T1); # 0:1 &movdqa ($D3,$D2); &psrlq ($D2,4); &psrlq ($D3,30); &movdqa ($T1,$T0); &psrlq ($D4,40); # 4 &psrlq ($T1,26); &pand ($T0,$MASK); # 0 &pand ($T1,$MASK); # 1 &pand ($D2,$MASK); # 2 &pand ($D3,$MASK); # 3 &por ($D4,&QWP(0,"ebx")); # padbit, yes, always &movdqa (&QWP($offbase+16*0,"esp"),$D0) if ($offbase); &movdqa (&QWP($offbase+16*1,"esp"),$D1) if ($offbase); } &load_input (16*2,16*5); &jbe (&label("skip_loop")); &jmp (&label("loop")); &set_label("loop",32); ################################################################ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r # \___________________/ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r # \___________________/ \____________________/ ################################################################ &movdqa ($T2,&QWP(16*(0-9),"edx")); # r0^2 &movdqa (&QWP(16*1,"eax"),$T1); &movdqa (&QWP(16*2,"eax"),$D2); &movdqa (&QWP(16*3,"eax"),$D3); &movdqa (&QWP(16*4,"eax"),$D4); ################################################################ # d4 = h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1 # d3 = h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 # d1 = h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2 # d0 = h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 &movdqa ($D1,$T0); &pmuludq ($T0,$T2); # h0*r0 &movdqa ($D0,$T1); &pmuludq ($T1,$T2); # h1*r0 &pmuludq ($D2,$T2); # h2*r0 &pmuludq ($D3,$T2); # h3*r0 &pmuludq ($D4,$T2); # h4*r0 sub pmuladd_alt { my $addr = shift; &pmuludq ($D0,&$addr(8)); # h1*s4 &movdqa ($T2,$D1); &pmuludq ($D1,&$addr(1)); # h0*r1 &paddq ($D0,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&$addr(2)); # h0*r2 &paddq ($D1,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&$addr(3)); # h0*r3 &paddq ($D2,$T2); &movdqa ($T2,&QWP(16*1,"eax")); # pull h1 &pmuludq ($T1,&$addr(4)); # h0*r4 &paddq ($D3,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&$addr(1)); # h1*r1 &paddq ($D4,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&$addr(2)); # h1*r2 &paddq ($D2,$T2); &movdqa ($T2,&QWP(16*2,"eax")); # pull h2 &pmuludq ($T1,&$addr(3)); # h1*r3 &paddq ($D3,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&$addr(7)); # h2*s3 &paddq ($D4,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&$addr(8)); # h2*s4 
&paddq ($D0,$T2); &movdqa ($T2,$T1); &pmuludq ($T1,&$addr(1)); # h2*r1 &paddq ($D1,$T0); &movdqa ($T0,&QWP(16*3,"eax")); # pull h3 &pmuludq ($T2,&$addr(2)); # h2*r2 &paddq ($D3,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&$addr(6)); # h3*s2 &paddq ($D4,$T2); &movdqa ($T2,$T1); &pmuludq ($T1,&$addr(7)); # h3*s3 &paddq ($D0,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&$addr(8)); # h3*s4 &paddq ($D1,$T1); &movdqa ($T1,&QWP(16*4,"eax")); # pull h4 &pmuludq ($T0,&$addr(1)); # h3*r1 &paddq ($D2,$T2); &movdqa ($T2,$T1); &pmuludq ($T1,&$addr(8)); # h4*s4 &paddq ($D4,$T0); &movdqa ($T0,$T2); &pmuludq ($T2,&$addr(5)); # h4*s1 &paddq ($D3,$T1); &movdqa ($T1,$T0); &pmuludq ($T0,&$addr(6)); # h4*s2 &paddq ($D0,$T2); &movdqa ($MASK,&QWP(64,"ebx")); &pmuludq ($T1,&$addr(7)); # h4*s3 &paddq ($D1,$T0); &paddq ($D2,$T1); } &pmuladd_alt (sub { my $i=shift; &QWP(16*($i-9),"edx"); }); &load_input (-16*2,0); &lea ("eax",&DWP(-16*2,"esi")); &sub ("ecx",64); &paddd ($T0,&QWP(16*(5+0),"esp")); # add hash value &paddd ($T1,&QWP(16*(5+1),"esp")); &paddd ($D2,&QWP(16*(5+2),"esp")); &paddd ($D3,&QWP(16*(5+3),"esp")); &paddd ($D4,&QWP(16*(5+4),"esp")); &cmovb ("esi","eax"); &lea ("eax",&DWP(16*10,"esp")); &movdqa ($T2,&QWP(16*0,"edx")); # r0^4 &movdqa (&QWP(16*1,"esp"),$D1); &movdqa (&QWP(16*1,"eax"),$T1); &movdqa (&QWP(16*2,"eax"),$D2); &movdqa (&QWP(16*3,"eax"),$D3); &movdqa (&QWP(16*4,"eax"),$D4); ################################################################ # d4 += h4*r0 + h0*r4 + h1*r3 + h2*r2 + h3*r1 # d3 += h3*r0 + h0*r3 + h1*r2 + h2*r1 + h4*5*r4 # d2 += h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 # d1 += h1*r0 + h0*r1 + h2*5*r4 + h3*5*r3 + h4*5*r2 # d0 += h0*r0 + h1*5*r4 + h2*5*r3 + h3*5*r2 + h4*5*r1 &movdqa ($D1,$T0); &pmuludq ($T0,$T2); # h0*r0 &paddq ($T0,$D0); &movdqa ($D0,$T1); &pmuludq ($T1,$T2); # h1*r0 &pmuludq ($D2,$T2); # h2*r0 &pmuludq ($D3,$T2); # h3*r0 &pmuludq ($D4,$T2); # h4*r0 &paddq ($T1,&QWP(16*1,"esp")); &paddq ($D2,&QWP(16*2,"esp")); &paddq ($D3,&QWP(16*3,"esp")); &paddq ($D4,&QWP(16*4,"esp")); &pmuladd_alt (sub { my $i=shift; &QWP(16*$i,"edx"); }); &lazy_reduction (); &load_input (16*2,16*5); &ja (&label("loop")); &set_label("skip_loop"); ################################################################ # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 &pshufd ($T2,&QWP(16*(0-9),"edx"),0x10);# r0^n &add ("ecx",32); &jnz (&label("long_tail")); &paddd ($T0,$D0); # add hash value &paddd ($T1,$D1); &paddd ($D2,&QWP(16*7,"esp")); &paddd ($D3,&QWP(16*8,"esp")); &paddd ($D4,&QWP(16*9,"esp")); &set_label("long_tail"); &movdqa (&QWP(16*0,"eax"),$T0); &movdqa (&QWP(16*1,"eax"),$T1); &movdqa (&QWP(16*2,"eax"),$D2); &movdqa (&QWP(16*3,"eax"),$D3); &movdqa (&QWP(16*4,"eax"),$D4); ################################################################ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 &pmuludq ($T0,$T2); # h0*r0 &pmuludq ($T1,$T2); # h1*r0 &pmuludq ($D2,$T2); # h2*r0 &movdqa ($D0,$T0); &pshufd ($T0,&QWP(16*(1-9),"edx"),0x10);# r1^n &pmuludq ($D3,$T2); # h3*r0 &movdqa ($D1,$T1); &pmuludq ($D4,$T2); # h4*r0 &pmuladd (sub { my ($reg,$i)=@_; &pshufd ($reg,&QWP(16*($i-9),"edx"),0x10); },"eax"); &jz (&label("short_tail")); &load_input (-16*2,0); &pshufd ($T2,&QWP(16*0,"edx"),0x10); # r0^n &paddd ($T0,&QWP(16*5,"esp")); # add hash value &paddd ($T1,&QWP(16*6,"esp")); &paddd ($D2,&QWP(16*7,"esp")); &paddd 
($D3,&QWP(16*8,"esp")); &paddd ($D4,&QWP(16*9,"esp")); ################################################################ # multiply inp[0:1] by r^4:r^3 and accumulate &movdqa (&QWP(16*0,"esp"),$T0); &pmuludq ($T0,$T2); # h0*r0 &movdqa (&QWP(16*1,"esp"),$T1); &pmuludq ($T1,$T2); # h1*r0 &paddq ($D0,$T0); &movdqa ($T0,$D2); &pmuludq ($D2,$T2); # h2*r0 &paddq ($D1,$T1); &movdqa ($T1,$D3); &pmuludq ($D3,$T2); # h3*r0 &paddq ($D2,&QWP(16*2,"esp")); &movdqa (&QWP(16*2,"esp"),$T0); &pshufd ($T0,&QWP(16*1,"edx"),0x10); # r1^n &paddq ($D3,&QWP(16*3,"esp")); &movdqa (&QWP(16*3,"esp"),$T1); &movdqa ($T1,$D4); &pmuludq ($D4,$T2); # h4*r0 &paddq ($D4,&QWP(16*4,"esp")); &movdqa (&QWP(16*4,"esp"),$T1); &pmuladd (sub { my ($reg,$i)=@_; &pshufd ($reg,&QWP(16*$i,"edx"),0x10); }); &set_label("short_tail"); ################################################################ # horizontal addition &pshufd ($T1,$D4,0b01001110); &pshufd ($T0,$D3,0b01001110); &paddq ($D4,$T1); &paddq ($D3,$T0); &pshufd ($T1,$D0,0b01001110); &pshufd ($T0,$D1,0b01001110); &paddq ($D0,$T1); &paddq ($D1,$T0); &pshufd ($T1,$D2,0b01001110); #&paddq ($D2,$T1); &lazy_reduction (sub { &paddq ($D2,$T1) }); &set_label("done"); &movd (&DWP(-16*3+4*0,"edi"),$D0); # store hash value &movd (&DWP(-16*3+4*1,"edi"),$D1); &movd (&DWP(-16*3+4*2,"edi"),$D2); &movd (&DWP(-16*3+4*3,"edi"),$D3); &movd (&DWP(-16*3+4*4,"edi"),$D4); &mov ("esp","ebp"); &set_label("nodata"); &function_end("_poly1305_blocks_sse2"); &align (32); &function_begin("_poly1305_emit_sse2"); &mov ("ebp",&wparam(0)); # context &cmp (&DWP(4*5,"ebp"),0); # is_base2_26? &je (&label("enter_emit")); &mov ("eax",&DWP(4*0,"ebp")); # load hash value &mov ("edi",&DWP(4*1,"ebp")); &mov ("ecx",&DWP(4*2,"ebp")); &mov ("edx",&DWP(4*3,"ebp")); &mov ("esi",&DWP(4*4,"ebp")); &mov ("ebx","edi"); # base 2^26 -> base 2^32 &shl ("edi",26); &shr ("ebx",6); &add ("eax","edi"); &mov ("edi","ecx"); &adc ("ebx",0); &shl ("edi",20); &shr ("ecx",12); &add ("ebx","edi"); &mov ("edi","edx"); &adc ("ecx",0); &shl ("edi",14); &shr ("edx",18); &add ("ecx","edi"); &mov ("edi","esi"); &adc ("edx",0); &shl ("edi",8); &shr ("esi",24); &add ("edx","edi"); &adc ("esi",0); # can be partially reduced &mov ("edi","esi"); # final reduction &and ("esi",3); &shr ("edi",2); &lea ("ebp",&DWP(0,"edi","edi",4)); # *5 &mov ("edi",&wparam(1)); # output &add ("eax","ebp"); &mov ("ebp",&wparam(2)); # key &adc ("ebx",0); &adc ("ecx",0); &adc ("edx",0); &adc ("esi",0); &movd ($D0,"eax"); # offload original hash value &add ("eax",5); # compare to modulus &movd ($D1,"ebx"); &adc ("ebx",0); &movd ($D2,"ecx"); &adc ("ecx",0); &movd ($D3,"edx"); &adc ("edx",0); &adc ("esi",0); &shr ("esi",2); # did it carry/borrow? &neg ("esi"); # do we choose (hash-modulus) ... &and ("eax","esi"); &and ("ebx","esi"); &and ("ecx","esi"); &and ("edx","esi"); &mov (&DWP(4*0,"edi"),"eax"); &movd ("eax",$D0); &mov (&DWP(4*1,"edi"),"ebx"); &movd ("ebx",$D1); &mov (&DWP(4*2,"edi"),"ecx"); &movd ("ecx",$D2); &mov (&DWP(4*3,"edi"),"edx"); &movd ("edx",$D3); ¬ ("esi"); # ... or original hash value? 
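# Illustration only (added reference helpers, never called): the radix
# conversion performed at the top of this function (_poly1305_emit_sse2),
# folding five base-2^26 digits back into four 32-bit words plus a top
# word holding bits 128 and up.  The real code adds with carry instead of
# OR-ing because the vector path leaves the digits only lazily reduced,
# so each one may slightly exceed 26 bits.  A minimal sketch assuming a
# 64-bit Perl; the sub and helper names below are ours:

sub base2_26_to_2_32 {
	my @h = @_;			# five digits, roughly 26 bits each
	my (@w,$c);
	($c,$w[0]) = _carry32( $h[0]        + ($h[1] << 26));
	($c,$w[1]) = _carry32(($h[1] >>  6) + ($h[2] << 20) + $c);
	($c,$w[2]) = _carry32(($h[2] >> 12) + ($h[3] << 14) + $c);
	($c,$w[3]) = _carry32(($h[3] >> 18) + ($h[4] <<  8) + $c);
	return (@w, ($h[4] >> 24) + $c);# bits >=128, possibly still to be
					# folded back in via the *5 wrap
}

sub _carry32 { my $v = shift; return ($v >> 32, $v & 0xffffffff); }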
&and ("eax","esi"); &and ("ebx","esi"); &or ("eax",&DWP(4*0,"edi")); &and ("ecx","esi"); &or ("ebx",&DWP(4*1,"edi")); &and ("edx","esi"); &or ("ecx",&DWP(4*2,"edi")); &or ("edx",&DWP(4*3,"edi")); &add ("eax",&DWP(4*0,"ebp")); # accumulate key &adc ("ebx",&DWP(4*1,"ebp")); &mov (&DWP(4*0,"edi"),"eax"); &adc ("ecx",&DWP(4*2,"ebp")); &mov (&DWP(4*1,"edi"),"ebx"); &adc ("edx",&DWP(4*3,"ebp")); &mov (&DWP(4*2,"edi"),"ecx"); &mov (&DWP(4*3,"edi"),"edx"); &function_end("_poly1305_emit_sse2"); if ($avx>1) { ######################################################################## # Note that poly1305_init_avx2 operates on %xmm, I could have used # poly1305_init_sse2... &align (32); &function_begin_B("_poly1305_init_avx2"); &vmovdqu ($D4,&QWP(4*6,"edi")); # key base 2^32 &lea ("edi",&DWP(16*3,"edi")); # size optimization &mov ("ebp","esp"); &sub ("esp",16*(9+5)); &and ("esp",-16); #&vpand ($D4,$D4,&QWP(96,"ebx")); # magic mask &vmovdqa ($MASK,&QWP(64,"ebx")); &vpand ($D0,$D4,$MASK); # -> base 2^26 &vpsrlq ($D1,$D4,26); &vpsrldq ($D3,$D4,6); &vpand ($D1,$D1,$MASK); &vpsrlq ($D2,$D3,4) &vpsrlq ($D3,$D3,30); &vpand ($D2,$D2,$MASK); &vpand ($D3,$D3,$MASK); &vpsrldq ($D4,$D4,13); &lea ("edx",&DWP(16*9,"esp")); # size optimization &mov ("ecx",2); &set_label("square"); &vmovdqa (&QWP(16*0,"esp"),$D0); &vmovdqa (&QWP(16*1,"esp"),$D1); &vmovdqa (&QWP(16*2,"esp"),$D2); &vmovdqa (&QWP(16*3,"esp"),$D3); &vmovdqa (&QWP(16*4,"esp"),$D4); &vpslld ($T1,$D1,2); &vpslld ($T0,$D2,2); &vpaddd ($T1,$T1,$D1); # *5 &vpaddd ($T0,$T0,$D2); # *5 &vmovdqa (&QWP(16*5,"esp"),$T1); &vmovdqa (&QWP(16*6,"esp"),$T0); &vpslld ($T1,$D3,2); &vpslld ($T0,$D4,2); &vpaddd ($T1,$T1,$D3); # *5 &vpaddd ($T0,$T0,$D4); # *5 &vmovdqa (&QWP(16*7,"esp"),$T1); &vmovdqa (&QWP(16*8,"esp"),$T0); &vpshufd ($T0,$D0,0b01000100); &vmovdqa ($T1,$D1); &vpshufd ($D1,$D1,0b01000100); &vpshufd ($D2,$D2,0b01000100); &vpshufd ($D3,$D3,0b01000100); &vpshufd ($D4,$D4,0b01000100); &vmovdqa (&QWP(16*0,"edx"),$T0); &vmovdqa (&QWP(16*1,"edx"),$D1); &vmovdqa (&QWP(16*2,"edx"),$D2); &vmovdqa (&QWP(16*3,"edx"),$D3); &vmovdqa (&QWP(16*4,"edx"),$D4); ################################################################ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 &vpmuludq ($D4,$D4,$D0); # h4*r0 &vpmuludq ($D3,$D3,$D0); # h3*r0 &vpmuludq ($D2,$D2,$D0); # h2*r0 &vpmuludq ($D1,$D1,$D0); # h1*r0 &vpmuludq ($D0,$T0,$D0); # h0*r0 &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # r1*h3 &vpaddq ($D4,$D4,$T0); &vpmuludq ($T2,$T1,&QWP(16*2,"edx")); # r1*h2 &vpaddq ($D3,$D3,$T2); &vpmuludq ($T0,$T1,&QWP(16*1,"edx")); # r1*h1 &vpaddq ($D2,$D2,$T0); &vmovdqa ($T2,&QWP(16*5,"esp")); # s1 &vpmuludq ($T1,$T1,&QWP(16*0,"edx")); # r1*h0 &vpaddq ($D1,$D1,$T1); &vmovdqa ($T0,&QWP(16*2,"esp")); # r2 &vpmuludq ($T2,$T2,&QWP(16*4,"edx")); # s1*h4 &vpaddq ($D0,$D0,$T2); &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # r2*h2 &vpaddq ($D4,$D4,$T1); &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # r2*h1 &vpaddq ($D3,$D3,$T2); &vmovdqa ($T1,&QWP(16*6,"esp")); # s2 &vpmuludq ($T0,$T0,&QWP(16*0,"edx")); # r2*h0 &vpaddq ($D2,$D2,$T0); &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s2*h4 &vpaddq ($D1,$D1,$T2); &vmovdqa ($T0,&QWP(16*3,"esp")); # r3 &vpmuludq ($T1,$T1,&QWP(16*3,"edx")); # s2*h3 &vpaddq ($D0,$D0,$T1); &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # r3*h1 &vpaddq ($D4,$D4,$T2); &vmovdqa ($T1,&QWP(16*7,"esp")); # s3 &vpmuludq 
($T0,$T0,&QWP(16*0,"edx")); # r3*h0 &vpaddq ($D3,$D3,$T0); &vpmuludq ($T2,$T1,&QWP(16*4,"edx")); # s3*h4 &vpaddq ($D2,$D2,$T2); &vpmuludq ($T0,$T1,&QWP(16*3,"edx")); # s3*h3 &vpaddq ($D1,$D1,$T0); &vmovdqa ($T2,&QWP(16*4,"esp")); # r4 &vpmuludq ($T1,$T1,&QWP(16*2,"edx")); # s3*h2 &vpaddq ($D0,$D0,$T1); &vmovdqa ($T0,&QWP(16*8,"esp")); # s4 &vpmuludq ($T2,$T2,&QWP(16*0,"edx")); # r4*h0 &vpaddq ($D4,$D4,$T2); &vpmuludq ($T1,$T0,&QWP(16*4,"edx")); # s4*h4 &vpaddq ($D3,$D3,$T1); &vpmuludq ($T2,$T0,&QWP(16*1,"edx")); # s4*h1 &vpaddq ($D0,$D0,$T2); &vpmuludq ($T1,$T0,&QWP(16*2,"edx")); # s4*h2 &vpaddq ($D1,$D1,$T1); &vmovdqa ($MASK,&QWP(64,"ebx")); &vpmuludq ($T0,$T0,&QWP(16*3,"edx")); # s4*h3 &vpaddq ($D2,$D2,$T0); ################################################################ # lazy reduction &vpsrlq ($T0,$D3,26); &vpand ($D3,$D3,$MASK); &vpsrlq ($T1,$D0,26); &vpand ($D0,$D0,$MASK); &vpaddq ($D4,$D4,$T0); # h3 -> h4 &vpaddq ($D1,$D1,$T1); # h0 -> h1 &vpsrlq ($T0,$D4,26); &vpand ($D4,$D4,$MASK); &vpsrlq ($T1,$D1,26); &vpand ($D1,$D1,$MASK); &vpaddq ($D2,$D2,$T1); # h1 -> h2 &vpaddd ($D0,$D0,$T0); &vpsllq ($T0,$T0,2); &vpsrlq ($T1,$D2,26); &vpand ($D2,$D2,$MASK); &vpaddd ($D0,$D0,$T0); # h4 -> h0 &vpaddd ($D3,$D3,$T1); # h2 -> h3 &vpsrlq ($T1,$D3,26); &vpsrlq ($T0,$D0,26); &vpand ($D0,$D0,$MASK); &vpand ($D3,$D3,$MASK); &vpaddd ($D1,$D1,$T0); # h0 -> h1 &vpaddd ($D4,$D4,$T1); # h3 -> h4 &dec ("ecx"); &jz (&label("square_break")); &vpunpcklqdq ($D0,$D0,&QWP(16*0,"esp")); # 0:r^1:0:r^2 &vpunpcklqdq ($D1,$D1,&QWP(16*1,"esp")); &vpunpcklqdq ($D2,$D2,&QWP(16*2,"esp")); &vpunpcklqdq ($D3,$D3,&QWP(16*3,"esp")); &vpunpcklqdq ($D4,$D4,&QWP(16*4,"esp")); &jmp (&label("square")); &set_label("square_break"); &vpsllq ($D0,$D0,32); # -> r^3:0:r^4:0 &vpsllq ($D1,$D1,32); &vpsllq ($D2,$D2,32); &vpsllq ($D3,$D3,32); &vpsllq ($D4,$D4,32); &vpor ($D0,$D0,&QWP(16*0,"esp")); # r^3:r^1:r^4:r^2 &vpor ($D1,$D1,&QWP(16*1,"esp")); &vpor ($D2,$D2,&QWP(16*2,"esp")); &vpor ($D3,$D3,&QWP(16*3,"esp")); &vpor ($D4,$D4,&QWP(16*4,"esp")); &vpshufd ($D0,$D0,0b10001101); # -> r^1:r^2:r^3:r^4 &vpshufd ($D1,$D1,0b10001101); &vpshufd ($D2,$D2,0b10001101); &vpshufd ($D3,$D3,0b10001101); &vpshufd ($D4,$D4,0b10001101); &vmovdqu (&QWP(16*0,"edi"),$D0); # save the table &vmovdqu (&QWP(16*1,"edi"),$D1); &vmovdqu (&QWP(16*2,"edi"),$D2); &vmovdqu (&QWP(16*3,"edi"),$D3); &vmovdqu (&QWP(16*4,"edi"),$D4); &vpslld ($T1,$D1,2); &vpslld ($T0,$D2,2); &vpaddd ($T1,$T1,$D1); # *5 &vpaddd ($T0,$T0,$D2); # *5 &vmovdqu (&QWP(16*5,"edi"),$T1); &vmovdqu (&QWP(16*6,"edi"),$T0); &vpslld ($T1,$D3,2); &vpslld ($T0,$D4,2); &vpaddd ($T1,$T1,$D3); # *5 &vpaddd ($T0,$T0,$D4); # *5 &vmovdqu (&QWP(16*7,"edi"),$T1); &vmovdqu (&QWP(16*8,"edi"),$T0); &mov ("esp","ebp"); &lea ("edi",&DWP(-16*3,"edi")); # size de-optimization &ret (); &function_end_B("_poly1305_init_avx2"); ######################################################################## # now it's time to switch to %ymm my ($D0,$D1,$D2,$D3,$D4,$T0,$T1,$T2)=map("ymm$_",(0..7)); my $MASK=$T2; sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; } &align (32); &function_begin("_poly1305_blocks_avx2"); &mov ("edi",&wparam(0)); # ctx &mov ("esi",&wparam(1)); # inp &mov ("ecx",&wparam(2)); # len &mov ("eax",&DWP(4*5,"edi")); # is_base2_26 &and ("ecx",-16); &jz (&label("nodata")); &cmp ("ecx",64); &jae (&label("enter_avx2")); &test ("eax","eax"); # is_base2_26? 
&jz (&label("enter_blocks")); &set_label("enter_avx2"); &vzeroupper (); &call (&label("pic_point")); &set_label("pic_point"); &blindpop("ebx"); &lea ("ebx",&DWP(&label("const_sse2")."-".&label("pic_point"),"ebx")); &test ("eax","eax"); # is_base2_26? &jnz (&label("base2_26")); &call ("_poly1305_init_avx2"); ################################################# base 2^32 -> base 2^26 &mov ("eax",&DWP(0,"edi")); &mov ("ecx",&DWP(3,"edi")); &mov ("edx",&DWP(6,"edi")); &mov ("esi",&DWP(9,"edi")); &mov ("ebp",&DWP(13,"edi")); &shr ("ecx",2); &and ("eax",0x3ffffff); &shr ("edx",4); &and ("ecx",0x3ffffff); &shr ("esi",6); &and ("edx",0x3ffffff); &mov (&DWP(4*0,"edi"),"eax"); &mov (&DWP(4*1,"edi"),"ecx"); &mov (&DWP(4*2,"edi"),"edx"); &mov (&DWP(4*3,"edi"),"esi"); &mov (&DWP(4*4,"edi"),"ebp"); &mov (&DWP(4*5,"edi"),1); # is_base2_26 &mov ("esi",&wparam(1)); # [reload] inp &mov ("ecx",&wparam(2)); # [reload] len &set_label("base2_26"); &mov ("eax",&wparam(3)); # padbit &mov ("ebp","esp"); &sub ("esp",32*(5+9)); &and ("esp",-512); # ensure that frame # doesn't cross page # boundary, which is # essential for # misaligned 32-byte # loads ################################################################ # expand and copy pre-calculated table to stack &vmovdqu (&X($D0),&QWP(16*(3+0),"edi")); &lea ("edx",&DWP(32*5+128,"esp")); # +128 size optimization &vmovdqu (&X($D1),&QWP(16*(3+1),"edi")); &vmovdqu (&X($D2),&QWP(16*(3+2),"edi")); &vmovdqu (&X($D3),&QWP(16*(3+3),"edi")); &vmovdqu (&X($D4),&QWP(16*(3+4),"edi")); &lea ("edi",&DWP(16*3,"edi")); # size optimization &vpermq ($D0,$D0,0b01000000); # 00001234 -> 12343434 &vpermq ($D1,$D1,0b01000000); &vpermq ($D2,$D2,0b01000000); &vpermq ($D3,$D3,0b01000000); &vpermq ($D4,$D4,0b01000000); &vpshufd ($D0,$D0,0b11001000); # 12343434 -> 14243444 &vpshufd ($D1,$D1,0b11001000); &vpshufd ($D2,$D2,0b11001000); &vpshufd ($D3,$D3,0b11001000); &vpshufd ($D4,$D4,0b11001000); &vmovdqa (&QWP(32*0-128,"edx"),$D0); &vmovdqu (&X($D0),&QWP(16*5,"edi")); &vmovdqa (&QWP(32*1-128,"edx"),$D1); &vmovdqu (&X($D1),&QWP(16*6,"edi")); &vmovdqa (&QWP(32*2-128,"edx"),$D2); &vmovdqu (&X($D2),&QWP(16*7,"edi")); &vmovdqa (&QWP(32*3-128,"edx"),$D3); &vmovdqu (&X($D3),&QWP(16*8,"edi")); &vmovdqa (&QWP(32*4-128,"edx"),$D4); &vpermq ($D0,$D0,0b01000000); &vpermq ($D1,$D1,0b01000000); &vpermq ($D2,$D2,0b01000000); &vpermq ($D3,$D3,0b01000000); &vpshufd ($D0,$D0,0b11001000); &vpshufd ($D1,$D1,0b11001000); &vpshufd ($D2,$D2,0b11001000); &vpshufd ($D3,$D3,0b11001000); &vmovdqa (&QWP(32*5-128,"edx"),$D0); &vmovd (&X($D0),&DWP(-16*3+4*0,"edi"));# load hash value &vmovdqa (&QWP(32*6-128,"edx"),$D1); &vmovd (&X($D1),&DWP(-16*3+4*1,"edi")); &vmovdqa (&QWP(32*7-128,"edx"),$D2); &vmovd (&X($D2),&DWP(-16*3+4*2,"edi")); &vmovdqa (&QWP(32*8-128,"edx"),$D3); &vmovd (&X($D3),&DWP(-16*3+4*3,"edi")); &vmovd (&X($D4),&DWP(-16*3+4*4,"edi")); &vmovdqa ($MASK,&QWP(64,"ebx")); &neg ("eax"); # padbit &test ("ecx",63); &jz (&label("even")); &mov ("edx","ecx"); &and ("ecx",-64); &and ("edx",63); &vmovdqu (&X($T0),&QWP(16*0,"esi")); &cmp ("edx",32); &jb (&label("one")); &vmovdqu (&X($T1),&QWP(16*1,"esi")); &je (&label("two")); &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1); &lea ("esi",&DWP(16*3,"esi")); &lea ("ebx",&DWP(8,"ebx")); # three padbits &lea ("edx",&DWP(32*5+128+8,"esp")); # --:r^1:r^2:r^3 (*) &jmp (&label("tail")); &set_label("two"); &lea ("esi",&DWP(16*2,"esi")); &lea ("ebx",&DWP(16,"ebx")); # two padbits &lea ("edx",&DWP(32*5+128+16,"esp"));# --:--:r^1:r^2 (*) &jmp (&label("tail")); &set_label("one"); &lea 
("esi",&DWP(16*1,"esi")); &vpxor ($T1,$T1,$T1); &lea ("ebx",&DWP(32,"ebx","eax",8)); # one or no padbits &lea ("edx",&DWP(32*5+128+24,"esp"));# --:--:--:r^1 (*) &jmp (&label("tail")); # (*) spots marked with '--' are data from next table entry, but they # are multiplied by 0 and therefore rendered insignificant &set_label("even",32); &vmovdqu (&X($T0),&QWP(16*0,"esi")); # load input &vmovdqu (&X($T1),&QWP(16*1,"esi")); &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1); &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1); &lea ("esi",&DWP(16*4,"esi")); &sub ("ecx",64); &jz (&label("tail")); &set_label("loop"); ################################################################ # ((inp[0]*r^4+r[4])*r^4+r[8])*r^4 # ((inp[1]*r^4+r[5])*r^4+r[9])*r^3 # ((inp[2]*r^4+r[6])*r^4+r[10])*r^2 # ((inp[3]*r^4+r[7])*r^4+r[11])*r^1 # \________/ \_______/ ################################################################ sub vsplat_input { &vmovdqa (&QWP(32*2,"esp"),$D2); &vpsrldq ($D2,$T0,6); # splat input &vmovdqa (&QWP(32*0,"esp"),$D0); &vpsrldq ($D0,$T1,6); &vmovdqa (&QWP(32*1,"esp"),$D1); &vpunpckhqdq ($D1,$T0,$T1); # 4 &vpunpcklqdq ($T0,$T0,$T1); # 0:1 &vpunpcklqdq ($D2,$D2,$D0); # 2:3 &vpsrlq ($D0,$D2,30); &vpsrlq ($D2,$D2,4); &vpsrlq ($T1,$T0,26); &vpsrlq ($D1,$D1,40); # 4 &vpand ($D2,$D2,$MASK); # 2 &vpand ($T0,$T0,$MASK); # 0 &vpand ($T1,$T1,$MASK); # 1 &vpand ($D0,$D0,$MASK); # 3 (*) &vpor ($D1,$D1,&QWP(0,"ebx")); # padbit, yes, always # (*) note that output is counterintuitive, inp[3:4] is # returned in $D1-2, while $D3-4 are preserved; } &vsplat_input (); sub vpmuladd { my $addr = shift; &vpaddq ($D2,$D2,&QWP(32*2,"esp")); # add hash value &vpaddq ($T0,$T0,&QWP(32*0,"esp")); &vpaddq ($T1,$T1,&QWP(32*1,"esp")); &vpaddq ($D0,$D0,$D3); &vpaddq ($D1,$D1,$D4); ################################################################ # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 &vpmuludq ($D3,$D2,&$addr(1)); # d3 = h2*r1 &vmovdqa (QWP(32*1,"esp"),$T1); &vpmuludq ($D4,$D2,&$addr(2)); # d4 = h2*r2 &vmovdqa (QWP(32*3,"esp"),$D0); &vpmuludq ($D0,$D2,&$addr(7)); # d0 = h2*s3 &vmovdqa (QWP(32*4,"esp"),$D1); &vpmuludq ($D1,$D2,&$addr(8)); # d1 = h2*s4 &vpmuludq ($D2,$D2,&$addr(0)); # d2 = h2*r0 &vpmuludq ($T2,$T0,&$addr(3)); # h0*r3 &vpaddq ($D3,$D3,$T2); # d3 += h0*r3 &vpmuludq ($T1,$T0,&$addr(4)); # h0*r4 &vpaddq ($D4,$D4,$T1); # d4 + h0*r4 &vpmuludq ($T2,$T0,&$addr(0)); # h0*r0 &vpaddq ($D0,$D0,$T2); # d0 + h0*r0 &vmovdqa ($T2,&QWP(32*1,"esp")); # h1 &vpmuludq ($T1,$T0,&$addr(1)); # h0*r1 &vpaddq ($D1,$D1,$T1); # d1 += h0*r1 &vpmuludq ($T0,$T0,&$addr(2)); # h0*r2 &vpaddq ($D2,$D2,$T0); # d2 += h0*r2 &vpmuludq ($T1,$T2,&$addr(2)); # h1*r2 &vpaddq ($D3,$D3,$T1); # d3 += h1*r2 &vpmuludq ($T0,$T2,&$addr(3)); # h1*r3 &vpaddq ($D4,$D4,$T0); # d4 += h1*r3 &vpmuludq ($T1,$T2,&$addr(8)); # h1*s4 &vpaddq ($D0,$D0,$T1); # d0 += h1*s4 &vmovdqa ($T1,&QWP(32*3,"esp")); # h3 &vpmuludq ($T0,$T2,&$addr(0)); # h1*r0 &vpaddq ($D1,$D1,$T0); # d1 += h1*r0 &vpmuludq ($T2,$T2,&$addr(1)); # h1*r1 &vpaddq ($D2,$D2,$T2); # d2 += h1*r1 &vpmuludq ($T0,$T1,&$addr(0)); # h3*r0 &vpaddq ($D3,$D3,$T0); # d3 += h3*r0 &vpmuludq ($T2,$T1,&$addr(1)); # h3*r1 &vpaddq ($D4,$D4,$T2); # d4 += h3*r1 &vpmuludq ($T0,$T1,&$addr(6)); # h3*s2 &vpaddq ($D0,$D0,$T0); # d0 += h3*s2 &vmovdqa ($T0,&QWP(32*4,"esp")); # h4 &vpmuludq ($T2,$T1,&$addr(7)); # h3*s3 &vpaddq ($D1,$D1,$T2); # d1+= 
h3*s3 &vpmuludq ($T1,$T1,&$addr(8)); # h3*s4 &vpaddq ($D2,$D2,$T1); # d2 += h3*s4 &vpmuludq ($T2,$T0,&$addr(8)); # h4*s4 &vpaddq ($D3,$D3,$T2); # d3 += h4*s4 &vpmuludq ($T1,$T0,&$addr(5)); # h4*s1 &vpaddq ($D0,$D0,$T1); # d0 += h4*s1 &vpmuludq ($T2,$T0,&$addr(0)); # h4*r0 &vpaddq ($D4,$D4,$T2); # d4 += h4*r0 &vmovdqa ($MASK,&QWP(64,"ebx")); &vpmuludq ($T1,$T0,&$addr(6)); # h4*s2 &vpaddq ($D1,$D1,$T1); # d1 += h4*s2 &vpmuludq ($T0,$T0,&$addr(7)); # h4*s3 &vpaddq ($D2,$D2,$T0); # d2 += h4*s3 } &vpmuladd (sub { my $i=shift; &QWP(32*$i-128,"edx"); }); sub vlazy_reduction { ################################################################ # lazy reduction &vpsrlq ($T0,$D3,26); &vpand ($D3,$D3,$MASK); &vpsrlq ($T1,$D0,26); &vpand ($D0,$D0,$MASK); &vpaddq ($D4,$D4,$T0); # h3 -> h4 &vpaddq ($D1,$D1,$T1); # h0 -> h1 &vpsrlq ($T0,$D4,26); &vpand ($D4,$D4,$MASK); &vpsrlq ($T1,$D1,26); &vpand ($D1,$D1,$MASK); &vpaddq ($D2,$D2,$T1); # h1 -> h2 &vpaddq ($D0,$D0,$T0); &vpsllq ($T0,$T0,2); &vpsrlq ($T1,$D2,26); &vpand ($D2,$D2,$MASK); &vpaddq ($D0,$D0,$T0); # h4 -> h0 &vpaddq ($D3,$D3,$T1); # h2 -> h3 &vpsrlq ($T1,$D3,26); &vpsrlq ($T0,$D0,26); &vpand ($D0,$D0,$MASK); &vpand ($D3,$D3,$MASK); &vpaddq ($D1,$D1,$T0); # h0 -> h1 &vpaddq ($D4,$D4,$T1); # h3 -> h4 } &vlazy_reduction(); &vmovdqu (&X($T0),&QWP(16*0,"esi")); # load input &vmovdqu (&X($T1),&QWP(16*1,"esi")); &vinserti128 ($T0,$T0,&QWP(16*2,"esi"),1); &vinserti128 ($T1,$T1,&QWP(16*3,"esi"),1); &lea ("esi",&DWP(16*4,"esi")); &sub ("ecx",64); &jnz (&label("loop")); &set_label("tail"); &vsplat_input (); &and ("ebx",-64); # restore pointer &vpmuladd (sub { my $i=shift; &QWP(4+32*$i-128,"edx"); }); ################################################################ # horizontal addition &vpsrldq ($T0,$D4,8); &vpsrldq ($T1,$D3,8); &vpaddq ($D4,$D4,$T0); &vpsrldq ($T0,$D0,8); &vpaddq ($D3,$D3,$T1); &vpsrldq ($T1,$D1,8); &vpaddq ($D0,$D0,$T0); &vpsrldq ($T0,$D2,8); &vpaddq ($D1,$D1,$T1); &vpermq ($T1,$D4,2); # keep folding &vpaddq ($D2,$D2,$T0); &vpermq ($T0,$D3,2); &vpaddq ($D4,$D4,$T1); &vpermq ($T1,$D0,2); &vpaddq ($D3,$D3,$T0); &vpermq ($T0,$D1,2); &vpaddq ($D0,$D0,$T1); &vpermq ($T1,$D2,2); &vpaddq ($D1,$D1,$T0); &vpaddq ($D2,$D2,$T1); &vlazy_reduction(); &cmp ("ecx",0); &je (&label("done")); ################################################################ # clear all but single word &vpshufd (&X($D0),&X($D0),0b11111100); &lea ("edx",&DWP(32*5+128,"esp")); # restore pointer &vpshufd (&X($D1),&X($D1),0b11111100); &vpshufd (&X($D2),&X($D2),0b11111100); &vpshufd (&X($D3),&X($D3),0b11111100); &vpshufd (&X($D4),&X($D4),0b11111100); &jmp (&label("even")); &set_label("done",16); &vmovd (&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value &vmovd (&DWP(-16*3+4*1,"edi"),&X($D1)); &vmovd (&DWP(-16*3+4*2,"edi"),&X($D2)); &vmovd (&DWP(-16*3+4*3,"edi"),&X($D3)); &vmovd (&DWP(-16*3+4*4,"edi"),&X($D4)); &vzeroupper (); &mov ("esp","ebp"); &set_label("nodata"); &function_end("_poly1305_blocks_avx2"); } &set_label("const_sse2",64); &data_word(1<<24,0, 1<<24,0, 1<<24,0, 1<<24,0); &data_word(0,0, 0,0, 0,0, 0,0); &data_word(0x03ffffff,0,0x03ffffff,0, 0x03ffffff,0, 0x03ffffff,0); &data_word(0x0fffffff,0x0ffffffc,0x0ffffffc,0x0ffffffc); } &asciz ("Poly1305 for x86, CRYPTOGAMS by "); &align (4); &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/poly1305/asm/poly1305-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/poly1305/asm/poly1305-x86_64.pl (revision 364962) +++ 
stable/12/crypto/openssl/crypto/poly1305/asm/poly1305-x86_64.pl (revision 364963) @@ -1,4183 +1,4183 @@ #! /usr/bin/env perl # Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # This module implements Poly1305 hash for x86_64. # # March 2015 # # Initial release. # # December 2016 # # Add AVX512F+VL+BW code path. # # November 2017 # # Convert AVX512F+VL+BW code path to pure AVX512F, so that it can be # executed even on Knights Landing. Trigger for modification was # observation that AVX512 code paths can negatively affect overall # Skylake-X system performance. Since we are likely to suppress # AVX512F capability flag [at least on Skylake-X], conversion serves # as kind of "investment protection". Note that next *lake processor, # Cannolake, has AVX512IFMA code path to execute... # # Numbers are cycles per processed byte with poly1305_blocks alone, # measured with rdtsc at fixed clock frequency. # # IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 # P4 4.46/+120% - # Core 2 2.41/+90% - # Westmere 1.88/+120% - # Sandy Bridge 1.39/+140% 1.10 # Haswell 1.14/+175% 1.11 0.65 # Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] # Silvermont 2.83/+95% - # Knights L 3.60/? 
1.65 1.10 0.41(***) # Goldmont 1.70/+180% - # VIA Nano 1.82/+150% - # Sledgehammer 1.38/+160% - # Bulldozer 2.30/+130% 0.97 # Ryzen 1.15/+200% 1.08 1.18 # # (*) improvement coefficients relative to clang are more modest and # are ~50% on most processors, in both cases we are comparing to # __int128 code; # (**) SSE2 implementation was attempted, but among non-AVX processors # it was faster than integer-only code only on older Intel P4 and # Core processors, 50-30%, less newer processor is, but slower on # contemporary ones, for example almost 2x slower on Atom, and as # former are naturally disappearing, SSE2 is deemed unnecessary; # (***) strangely enough performance seems to vary from core to core, # listed result is best case; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22) + ($1>=2.25) + ($1>=2.26); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/) { $avx = ($1>=2.09) + ($1>=2.10) + 2 * ($1>=2.12); $avx += 2 if ($1==2.11 && $2>=8); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=12); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx"); my ($mac,$nonce)=($inp,$len); # *_emit arguments my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13)); my ($h0,$h1,$h2)=("%r14","%rbx","%rbp"); sub poly1305_iteration { # input: copy of $r1 in %rax, $h0-$h2, $r0-$r1 # output: $h0-$h2 *= $r0-$r1 $code.=<<___; mulq $h0 # h0*r1 mov %rax,$d2 mov $r0,%rax mov %rdx,$d3 mulq $h0 # h0*r0 mov %rax,$h0 # future $h0 mov $r0,%rax mov %rdx,$d1 mulq $h1 # h1*r0 add %rax,$d2 mov $s1,%rax adc %rdx,$d3 mulq $h1 # h1*s1 mov $h2,$h1 # borrow $h1 add %rax,$h0 adc %rdx,$d1 imulq $s1,$h1 # h2*s1 add $h1,$d2 mov $d1,$h1 adc \$0,$d3 imulq $r0,$h2 # h2*r0 add $d2,$h1 mov \$-4,%rax # mask value adc $h2,$d3 and $d3,%rax # last reduction step mov $d3,$h2 shr \$2,$d3 and \$3,$h2 add $d3,%rax add %rax,$h0 adc \$0,$h1 adc \$0,$h2 ___ } ######################################################################## # Layout of opaque area is following. 
# # unsigned __int64 h[3]; # current hash value base 2^64 # unsigned __int64 r[2]; # key value base 2^64 $code.=<<___; .text .extern OPENSSL_ia32cap_P .globl poly1305_init .hidden poly1305_init .globl poly1305_blocks .hidden poly1305_blocks .globl poly1305_emit .hidden poly1305_emit .type poly1305_init,\@function,3 .align 32 poly1305_init: .cfi_startproc xor %rax,%rax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) mov %rax,16($ctx) cmp \$0,$inp je .Lno_key lea poly1305_blocks(%rip),%r10 lea poly1305_emit(%rip),%r11 ___ $code.=<<___ if ($avx); mov OPENSSL_ia32cap_P+4(%rip),%r9 lea poly1305_blocks_avx(%rip),%rax lea poly1305_emit_avx(%rip),%rcx bt \$`60-32`,%r9 # AVX? cmovc %rax,%r10 cmovc %rcx,%r11 ___ $code.=<<___ if ($avx>1); lea poly1305_blocks_avx2(%rip),%rax bt \$`5+32`,%r9 # AVX2? cmovc %rax,%r10 ___ $code.=<<___ if ($avx>3); mov \$`(1<<31|1<<21|1<<16)`,%rax shr \$32,%r9 and %rax,%r9 cmp %rax,%r9 je .Linit_base2_44 ___ $code.=<<___; mov \$0x0ffffffc0fffffff,%rax mov \$0x0ffffffc0ffffffc,%rcx and 0($inp),%rax and 8($inp),%rcx mov %rax,24($ctx) mov %rcx,32($ctx) ___ $code.=<<___ if ($flavour !~ /elf32/); mov %r10,0(%rdx) mov %r11,8(%rdx) ___ $code.=<<___ if ($flavour =~ /elf32/); mov %r10d,0(%rdx) mov %r11d,4(%rdx) ___ $code.=<<___; mov \$1,%eax .Lno_key: ret .cfi_endproc .size poly1305_init,.-poly1305_init .type poly1305_blocks,\@function,4 .align 32 poly1305_blocks: .cfi_startproc .Lblocks: shr \$4,$len jz .Lno_data # too short push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lblocks_body: mov $len,%r15 # reassign $len mov 24($ctx),$r0 # load r mov 32($ctx),$s1 mov 0($ctx),$h0 # load hash value mov 8($ctx),$h1 mov 16($ctx),$h2 mov $s1,$r1 shr \$2,$s1 mov $r1,%rax add $r1,$s1 # s1 = r1 + (r1 >> 2) jmp .Loop .align 32 .Loop: add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp adc $padbit,$h2 ___ &poly1305_iteration(); $code.=<<___; mov $r1,%rax dec %r15 # len-=16 jnz .Loop mov $h0,0($ctx) # store hash value mov $h1,8($ctx) mov $h2,16($ctx) mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbp .cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lno_data: .Lblocks_epilogue: ret .cfi_endproc .size poly1305_blocks,.-poly1305_blocks .type poly1305_emit,\@function,3 .align 32 poly1305_emit: .cfi_startproc .Lemit: mov 0($ctx),%r8 # load hash value mov 8($ctx),%r9 mov 16($ctx),%r10 mov %r8,%rax add \$5,%r8 # compare to modulus mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 shr \$2,%r10 # did 130-bit value overflow? cmovnz %r8,%rax cmovnz %r9,%rcx add 0($nonce),%rax # accumulate nonce adc 8($nonce),%rcx mov %rax,0($mac) # write result mov %rcx,8($mac) ret .cfi_endproc .size poly1305_emit,.-poly1305_emit ___ if ($avx) { ######################################################################## # Layout of opaque area is following. # # unsigned __int32 h[5]; # current hash value base 2^26 # unsigned __int32 is_base2_26; # unsigned __int64 r[2]; # key value base 2^64 # unsigned __int64 pad; # struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9]; # # where r^n are base 2^26 digits of degrees of multiplier key. There are # 5 digits, but last four are interleaved with multiples of 5, totalling # in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4. 
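# As a cross-check of the layout above, here is a minimal plain-Perl sketch
# of the base 2^26 key expansion. It is only illustrative: the helper name
# is arbitrary, it is never called by the generated code, and it assumes the
# clamped key is supplied as its two little-endian 64-bit halves ($lo,$hi).
sub __poly1305_r_base2_26_sketch {
	my ($lo,$hi) = @_;
	my @r;
	$r[0] =   $lo        & 0x3ffffff;			# key bits   0..25
	$r[1] =  ($lo >> 26) & 0x3ffffff;			# key bits  26..51
	$r[2] = (($lo >> 52) | ($hi << 12)) & 0x3ffffff;	# key bits  52..77
	$r[3] =  ($hi >> 14) & 0x3ffffff;			# key bits  78..103
	$r[4] =   $hi >> 40;					# key bits 104..127
	return ($r[0], map { ($_, 5*$_) } @r[1..4]);		# r0, r1,5*r1, ..., r4,5*r4
}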
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) = map("%xmm$_",(0..15)); $code.=<<___; .type __poly1305_block,\@abi-omnipotent .align 32 __poly1305_block: .cfi_startproc ___ &poly1305_iteration(); $code.=<<___; ret .cfi_endproc .size __poly1305_block,.-__poly1305_block .type __poly1305_init_avx,\@abi-omnipotent .align 32 __poly1305_init_avx: .cfi_startproc mov $r0,$h0 mov $r1,$h1 xor $h2,$h2 lea 48+64($ctx),$ctx # size optimization mov $r1,%rax call __poly1305_block # r^2 mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26 mov \$0x3ffffff,%edx mov $h0,$d1 and $h0#d,%eax mov $r0,$d2 and $r0#d,%edx mov %eax,`16*0+0-64`($ctx) shr \$26,$d1 mov %edx,`16*0+4-64`($ctx) shr \$26,$d2 mov \$0x3ffffff,%eax mov \$0x3ffffff,%edx and $d1#d,%eax and $d2#d,%edx mov %eax,`16*1+0-64`($ctx) lea (%rax,%rax,4),%eax # *5 mov %edx,`16*1+4-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 mov %eax,`16*2+0-64`($ctx) shr \$26,$d1 mov %edx,`16*2+4-64`($ctx) shr \$26,$d2 mov $h1,%rax mov $r1,%rdx shl \$12,%rax shl \$12,%rdx or $d1,%rax or $d2,%rdx and \$0x3ffffff,%eax and \$0x3ffffff,%edx mov %eax,`16*3+0-64`($ctx) lea (%rax,%rax,4),%eax # *5 mov %edx,`16*3+4-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 mov %eax,`16*4+0-64`($ctx) mov $h1,$d1 mov %edx,`16*4+4-64`($ctx) mov $r1,$d2 mov \$0x3ffffff,%eax mov \$0x3ffffff,%edx shr \$14,$d1 shr \$14,$d2 and $d1#d,%eax and $d2#d,%edx mov %eax,`16*5+0-64`($ctx) lea (%rax,%rax,4),%eax # *5 mov %edx,`16*5+4-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 mov %eax,`16*6+0-64`($ctx) shr \$26,$d1 mov %edx,`16*6+4-64`($ctx) shr \$26,$d2 mov $h2,%rax shl \$24,%rax or %rax,$d1 mov $d1#d,`16*7+0-64`($ctx) lea ($d1,$d1,4),$d1 # *5 mov $d2#d,`16*7+4-64`($ctx) lea ($d2,$d2,4),$d2 # *5 mov $d1#d,`16*8+0-64`($ctx) mov $d2#d,`16*8+4-64`($ctx) mov $r1,%rax call __poly1305_block # r^3 mov \$0x3ffffff,%eax # save r^3 base 2^26 mov $h0,$d1 and $h0#d,%eax shr \$26,$d1 mov %eax,`16*0+12-64`($ctx) mov \$0x3ffffff,%edx and $d1#d,%edx mov %edx,`16*1+12-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 shr \$26,$d1 mov %edx,`16*2+12-64`($ctx) mov $h1,%rax shl \$12,%rax or $d1,%rax and \$0x3ffffff,%eax mov %eax,`16*3+12-64`($ctx) lea (%rax,%rax,4),%eax # *5 mov $h1,$d1 mov %eax,`16*4+12-64`($ctx) mov \$0x3ffffff,%edx shr \$14,$d1 and $d1#d,%edx mov %edx,`16*5+12-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 shr \$26,$d1 mov %edx,`16*6+12-64`($ctx) mov $h2,%rax shl \$24,%rax or %rax,$d1 mov $d1#d,`16*7+12-64`($ctx) lea ($d1,$d1,4),$d1 # *5 mov $d1#d,`16*8+12-64`($ctx) mov $r1,%rax call __poly1305_block # r^4 mov \$0x3ffffff,%eax # save r^4 base 2^26 mov $h0,$d1 and $h0#d,%eax shr \$26,$d1 mov %eax,`16*0+8-64`($ctx) mov \$0x3ffffff,%edx and $d1#d,%edx mov %edx,`16*1+8-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 shr \$26,$d1 mov %edx,`16*2+8-64`($ctx) mov $h1,%rax shl \$12,%rax or $d1,%rax and \$0x3ffffff,%eax mov %eax,`16*3+8-64`($ctx) lea (%rax,%rax,4),%eax # *5 mov $h1,$d1 mov %eax,`16*4+8-64`($ctx) mov \$0x3ffffff,%edx shr \$14,$d1 and $d1#d,%edx mov %edx,`16*5+8-64`($ctx) lea (%rdx,%rdx,4),%edx # *5 shr \$26,$d1 mov %edx,`16*6+8-64`($ctx) mov $h2,%rax shl \$24,%rax or %rax,$d1 mov $d1#d,`16*7+8-64`($ctx) lea ($d1,$d1,4),$d1 # *5 mov $d1#d,`16*8+8-64`($ctx) lea -48-64($ctx),$ctx # size [de-]optimization ret .cfi_endproc .size __poly1305_init_avx,.-__poly1305_init_avx .type poly1305_blocks_avx,\@function,4 .align 32 poly1305_blocks_avx: .cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len jae .Lblocks_avx test %r8d,%r8d jz .Lblocks .Lblocks_avx: and \$-16,$len jz .Lno_data_avx vzeroupper test %r8d,%r8d jz 
.Lbase2_64_avx test \$31,$len jz .Leven_avx push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lblocks_avx_body: mov $len,%r15 # reassign $len mov 0($ctx),$d1 # load hash value mov 8($ctx),$d2 mov 16($ctx),$h2#d mov 24($ctx),$r0 # load r mov 32($ctx),$s1 ################################# base 2^26 -> base 2^64 mov $d1#d,$h0#d and \$`-1*(1<<31)`,$d1 mov $d2,$r1 # borrow $r1 mov $d2#d,$h1#d and \$`-1*(1<<31)`,$d2 shr \$6,$d1 shl \$52,$r1 add $d1,$h0 shr \$12,$h1 shr \$18,$d2 add $r1,$h0 adc $d2,$h1 mov $h2,$d1 shl \$40,$d1 shr \$24,$h2 add $d1,$h1 adc \$0,$h2 # can be partially reduced... mov \$-4,$d2 # ... so reduce mov $h2,$d1 and $h2,$d2 shr \$2,$d1 and \$3,$h2 add $d2,$d1 # =*5 add $d1,$h0 adc \$0,$h1 adc \$0,$h2 mov $s1,$r1 mov $s1,%rax shr \$2,$s1 add $r1,$s1 # s1 = r1 + (r1 >> 2) add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp adc $padbit,$h2 call __poly1305_block test $padbit,$padbit # if $padbit is zero, jz .Lstore_base2_64_avx # store hash in base 2^64 format ################################# base 2^64 -> base 2^26 mov $h0,%rax mov $h0,%rdx shr \$52,$h0 mov $h1,$r0 mov $h1,$r1 shr \$26,%rdx and \$0x3ffffff,%rax # h[0] shl \$12,$r0 and \$0x3ffffff,%rdx # h[1] shr \$14,$h1 or $r0,$h0 shl \$24,$h2 and \$0x3ffffff,$h0 # h[2] shr \$40,$r1 and \$0x3ffffff,$h1 # h[3] or $r1,$h2 # h[4] sub \$16,%r15 jz .Lstore_base2_26_avx vmovd %rax#d,$H0 vmovd %rdx#d,$H1 vmovd $h0#d,$H2 vmovd $h1#d,$H3 vmovd $h2#d,$H4 jmp .Lproceed_avx .align 32 .Lstore_base2_64_avx: mov $h0,0($ctx) mov $h1,8($ctx) mov $h2,16($ctx) # note that is_base2_26 is zeroed jmp .Ldone_avx .align 16 .Lstore_base2_26_avx: mov %rax#d,0($ctx) # store hash value base 2^26 mov %rdx#d,4($ctx) mov $h0#d,8($ctx) mov $h1#d,12($ctx) mov $h2#d,16($ctx) .align 16 .Ldone_avx: mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbp .cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lno_data_avx: .Lblocks_avx_epilogue: ret .cfi_endproc .align 32 .Lbase2_64_avx: .cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lbase2_64_avx_body: mov $len,%r15 # reassign $len mov 24($ctx),$r0 # load r mov 32($ctx),$s1 mov 0($ctx),$h0 # load hash value mov 8($ctx),$h1 mov 16($ctx),$h2#d mov $s1,$r1 mov $s1,%rax shr \$2,$s1 add $r1,$s1 # s1 = r1 + (r1 >> 2) test \$31,$len jz .Linit_avx add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp adc $padbit,$h2 sub \$16,%r15 call __poly1305_block .Linit_avx: ################################# base 2^64 -> base 2^26 mov $h0,%rax mov $h0,%rdx shr \$52,$h0 mov $h1,$d1 mov $h1,$d2 shr \$26,%rdx and \$0x3ffffff,%rax # h[0] shl \$12,$d1 and \$0x3ffffff,%rdx # h[1] shr \$14,$h1 or $d1,$h0 shl \$24,$h2 and \$0x3ffffff,$h0 # h[2] shr \$40,$d2 and \$0x3ffffff,$h1 # h[3] or $d2,$h2 # h[4] vmovd %rax#d,$H0 vmovd %rdx#d,$H1 vmovd $h0#d,$H2 vmovd $h1#d,$H3 vmovd $h2#d,$H4 movl \$1,20($ctx) # set is_base2_26 call __poly1305_init_avx .Lproceed_avx: mov %r15,$len mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbp .cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rax lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 
.Lbase2_64_avx_epilogue: jmp .Ldo_avx .cfi_endproc .align 32 .Leven_avx: .cfi_startproc vmovd 4*0($ctx),$H0 # load hash value vmovd 4*1($ctx),$H1 vmovd 4*2($ctx),$H2 vmovd 4*3($ctx),$H3 vmovd 4*4($ctx),$H4 .Ldo_avx: ___ $code.=<<___ if (!$win64); lea -0x58(%rsp),%r11 .cfi_def_cfa %r11,0x60 sub \$0x178,%rsp ___ $code.=<<___ if ($win64); lea -0xf8(%rsp),%r11 sub \$0x218,%rsp vmovdqa %xmm6,0x50(%r11) vmovdqa %xmm7,0x60(%r11) vmovdqa %xmm8,0x70(%r11) vmovdqa %xmm9,0x80(%r11) vmovdqa %xmm10,0x90(%r11) vmovdqa %xmm11,0xa0(%r11) vmovdqa %xmm12,0xb0(%r11) vmovdqa %xmm13,0xc0(%r11) vmovdqa %xmm14,0xd0(%r11) vmovdqa %xmm15,0xe0(%r11) .Ldo_avx_body: ___ $code.=<<___; sub \$64,$len lea -32($inp),%rax cmovc %rax,$inp vmovdqu `16*3`($ctx),$D4 # preload r0^2 lea `16*3+64`($ctx),$ctx # size optimization lea .Lconst(%rip),%rcx ################################################################ # load input vmovdqu 16*2($inp),$T0 vmovdqu 16*3($inp),$T1 vmovdqa 64(%rcx),$MASK # .Lmask26 vpsrldq \$6,$T0,$T2 # splat input vpsrldq \$6,$T1,$T3 vpunpckhqdq $T1,$T0,$T4 # 4 vpunpcklqdq $T1,$T0,$T0 # 0:1 vpunpcklqdq $T3,$T2,$T3 # 2:3 vpsrlq \$40,$T4,$T4 # 4 vpsrlq \$26,$T0,$T1 vpand $MASK,$T0,$T0 # 0 vpsrlq \$4,$T3,$T2 vpand $MASK,$T1,$T1 # 1 vpsrlq \$30,$T3,$T3 vpand $MASK,$T2,$T2 # 2 vpand $MASK,$T3,$T3 # 3 vpor 32(%rcx),$T4,$T4 # padbit, yes, always jbe .Lskip_loop_avx # expand and copy pre-calculated table to stack vmovdqu `16*1-64`($ctx),$D1 vmovdqu `16*2-64`($ctx),$D2 vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434 vpshufd \$0x44,$D4,$D0 # xx12 -> 1212 vmovdqa $D3,-0x90(%r11) vmovdqa $D0,0x00(%rsp) vpshufd \$0xEE,$D1,$D4 vmovdqu `16*3-64`($ctx),$D0 vpshufd \$0x44,$D1,$D1 vmovdqa $D4,-0x80(%r11) vmovdqa $D1,0x10(%rsp) vpshufd \$0xEE,$D2,$D3 vmovdqu `16*4-64`($ctx),$D1 vpshufd \$0x44,$D2,$D2 vmovdqa $D3,-0x70(%r11) vmovdqa $D2,0x20(%rsp) vpshufd \$0xEE,$D0,$D4 vmovdqu `16*5-64`($ctx),$D2 vpshufd \$0x44,$D0,$D0 vmovdqa $D4,-0x60(%r11) vmovdqa $D0,0x30(%rsp) vpshufd \$0xEE,$D1,$D3 vmovdqu `16*6-64`($ctx),$D0 vpshufd \$0x44,$D1,$D1 vmovdqa $D3,-0x50(%r11) vmovdqa $D1,0x40(%rsp) vpshufd \$0xEE,$D2,$D4 vmovdqu `16*7-64`($ctx),$D1 vpshufd \$0x44,$D2,$D2 vmovdqa $D4,-0x40(%r11) vmovdqa $D2,0x50(%rsp) vpshufd \$0xEE,$D0,$D3 vmovdqu `16*8-64`($ctx),$D2 vpshufd \$0x44,$D0,$D0 vmovdqa $D3,-0x30(%r11) vmovdqa $D0,0x60(%rsp) vpshufd \$0xEE,$D1,$D4 vpshufd \$0x44,$D1,$D1 vmovdqa $D4,-0x20(%r11) vmovdqa $D1,0x70(%rsp) vpshufd \$0xEE,$D2,$D3 vmovdqa 0x00(%rsp),$D4 # preload r0^2 vpshufd \$0x44,$D2,$D2 vmovdqa $D3,-0x10(%r11) vmovdqa $D2,0x80(%rsp) jmp .Loop_avx .align 32 .Loop_avx: ################################################################ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r # \___________________/ # ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2 # ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r # \___________________/ \____________________/ # # Note that we start with inp[2:3]*r^2. This is because it # doesn't depend on reduction in previous iteration. ################################################################ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 # # though note that $Tx and $Hx are "reversed" in this section, # and $D4 is preloaded with r0^2... 
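	# Where the 5*rN multiples come from: limb i carries weight 2^(26*i),
	# so a partial product h_i*r_j has weight 2^(26*(i+j)); whenever
	# i+j >= 5 that weight includes 2^130, and since 2^130 == 5 modulo
	# 2^130-5, the term folds back to position i+j-5 scaled by 5 (e.g.
	# h4*r1 contributes 5*h4*r1 to d0). Hence the precomputed s1..s4
	# (= 5*r1..5*r4) in the table.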
vpmuludq $T0,$D4,$D0 # d0 = h0*r0 vpmuludq $T1,$D4,$D1 # d1 = h1*r0 vmovdqa $H2,0x20(%r11) # offload hash vpmuludq $T2,$D4,$D2 # d3 = h2*r0 vmovdqa 0x10(%rsp),$H2 # r1^2 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 vmovdqa $H0,0x00(%r11) # vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1 vmovdqa $H1,0x10(%r11) # vpmuludq $T3,$H2,$H1 # h3*r1 vpaddq $H0,$D0,$D0 # d0 += h4*s1 vpaddq $H1,$D4,$D4 # d4 += h3*r1 vmovdqa $H3,0x30(%r11) # vpmuludq $T2,$H2,$H0 # h2*r1 vpmuludq $T1,$H2,$H1 # h1*r1 vpaddq $H0,$D3,$D3 # d3 += h2*r1 vmovdqa 0x30(%rsp),$H3 # r2^2 vpaddq $H1,$D2,$D2 # d2 += h1*r1 vmovdqa $H4,0x40(%r11) # vpmuludq $T0,$H2,$H2 # h0*r1 vpmuludq $T2,$H3,$H0 # h2*r2 vpaddq $H2,$D1,$D1 # d1 += h0*r1 vmovdqa 0x40(%rsp),$H4 # s2^2 vpaddq $H0,$D4,$D4 # d4 += h2*r2 vpmuludq $T1,$H3,$H1 # h1*r2 vpmuludq $T0,$H3,$H3 # h0*r2 vpaddq $H1,$D3,$D3 # d3 += h1*r2 vmovdqa 0x50(%rsp),$H2 # r3^2 vpaddq $H3,$D2,$D2 # d2 += h0*r2 vpmuludq $T4,$H4,$H0 # h4*s2 vpmuludq $T3,$H4,$H4 # h3*s2 vpaddq $H0,$D1,$D1 # d1 += h4*s2 vmovdqa 0x60(%rsp),$H3 # s3^2 vpaddq $H4,$D0,$D0 # d0 += h3*s2 vmovdqa 0x80(%rsp),$H4 # s4^2 vpmuludq $T1,$H2,$H1 # h1*r3 vpmuludq $T0,$H2,$H2 # h0*r3 vpaddq $H1,$D4,$D4 # d4 += h1*r3 vpaddq $H2,$D3,$D3 # d3 += h0*r3 vpmuludq $T4,$H3,$H0 # h4*s3 vpmuludq $T3,$H3,$H1 # h3*s3 vpaddq $H0,$D2,$D2 # d2 += h4*s3 vmovdqu 16*0($inp),$H0 # load input vpaddq $H1,$D1,$D1 # d1 += h3*s3 vpmuludq $T2,$H3,$H3 # h2*s3 vpmuludq $T2,$H4,$T2 # h2*s4 vpaddq $H3,$D0,$D0 # d0 += h2*s3 vmovdqu 16*1($inp),$H1 # vpaddq $T2,$D1,$D1 # d1 += h2*s4 vpmuludq $T3,$H4,$T3 # h3*s4 vpmuludq $T4,$H4,$T4 # h4*s4 vpsrldq \$6,$H0,$H2 # splat input vpaddq $T3,$D2,$D2 # d2 += h3*s4 vpaddq $T4,$D3,$D3 # d3 += h4*s4 vpsrldq \$6,$H1,$H3 # vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4 vpmuludq $T1,$H4,$T0 # h1*s4 vpunpckhqdq $H1,$H0,$H4 # 4 vpaddq $T4,$D4,$D4 # d4 += h0*r4 vmovdqa -0x90(%r11),$T4 # r0^4 vpaddq $T0,$D0,$D0 # d0 += h1*s4 vpunpcklqdq $H1,$H0,$H0 # 0:1 vpunpcklqdq $H3,$H2,$H3 # 2:3 #vpsrlq \$40,$H4,$H4 # 4 vpsrldq \$`40/8`,$H4,$H4 # 4 vpsrlq \$26,$H0,$H1 vpand $MASK,$H0,$H0 # 0 vpsrlq \$4,$H3,$H2 vpand $MASK,$H1,$H1 # 1 vpand 0(%rcx),$H4,$H4 # .Lmask24 vpsrlq \$30,$H3,$H3 vpand $MASK,$H2,$H2 # 2 vpand $MASK,$H3,$H3 # 3 vpor 32(%rcx),$H4,$H4 # padbit, yes, always vpaddq 0x00(%r11),$H0,$H0 # add hash value vpaddq 0x10(%r11),$H1,$H1 vpaddq 0x20(%r11),$H2,$H2 vpaddq 0x30(%r11),$H3,$H3 vpaddq 0x40(%r11),$H4,$H4 lea 16*2($inp),%rax lea 16*4($inp),$inp sub \$64,$len cmovc %rax,$inp ################################################################ # Now we accumulate (inp[0:1]+hash)*r^4 ################################################################ # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 vpmuludq $H0,$T4,$T0 # h0*r0 vpmuludq $H1,$T4,$T1 # h1*r0 vpaddq $T0,$D0,$D0 vpaddq $T1,$D1,$D1 vmovdqa -0x80(%r11),$T2 # r1^4 vpmuludq $H2,$T4,$T0 # h2*r0 vpmuludq $H3,$T4,$T1 # h3*r0 vpaddq $T0,$D2,$D2 vpaddq $T1,$D3,$D3 vpmuludq $H4,$T4,$T4 # h4*r0 vpmuludq -0x70(%r11),$H4,$T0 # h4*s1 vpaddq $T4,$D4,$D4 vpaddq $T0,$D0,$D0 # d0 += h4*s1 vpmuludq $H2,$T2,$T1 # h2*r1 vpmuludq $H3,$T2,$T0 # h3*r1 vpaddq $T1,$D3,$D3 # d3 += h2*r1 vmovdqa -0x60(%r11),$T3 # r2^4 vpaddq $T0,$D4,$D4 # d4 += h3*r1 vpmuludq $H1,$T2,$T1 # h1*r1 vpmuludq $H0,$T2,$T2 # h0*r1 vpaddq $T1,$D2,$D2 # d2 += h1*r1 vpaddq $T2,$D1,$D1 # d1 += h0*r1 vmovdqa -0x50(%r11),$T4 # s2^4 
vpmuludq $H2,$T3,$T0 # h2*r2 vpmuludq $H1,$T3,$T1 # h1*r2 vpaddq $T0,$D4,$D4 # d4 += h2*r2 vpaddq $T1,$D3,$D3 # d3 += h1*r2 vmovdqa -0x40(%r11),$T2 # r3^4 vpmuludq $H0,$T3,$T3 # h0*r2 vpmuludq $H4,$T4,$T0 # h4*s2 vpaddq $T3,$D2,$D2 # d2 += h0*r2 vpaddq $T0,$D1,$D1 # d1 += h4*s2 vmovdqa -0x30(%r11),$T3 # s3^4 vpmuludq $H3,$T4,$T4 # h3*s2 vpmuludq $H1,$T2,$T1 # h1*r3 vpaddq $T4,$D0,$D0 # d0 += h3*s2 vmovdqa -0x10(%r11),$T4 # s4^4 vpaddq $T1,$D4,$D4 # d4 += h1*r3 vpmuludq $H0,$T2,$T2 # h0*r3 vpmuludq $H4,$T3,$T0 # h4*s3 vpaddq $T2,$D3,$D3 # d3 += h0*r3 vpaddq $T0,$D2,$D2 # d2 += h4*s3 vmovdqu 16*2($inp),$T0 # load input vpmuludq $H3,$T3,$T2 # h3*s3 vpmuludq $H2,$T3,$T3 # h2*s3 vpaddq $T2,$D1,$D1 # d1 += h3*s3 vmovdqu 16*3($inp),$T1 # vpaddq $T3,$D0,$D0 # d0 += h2*s3 vpmuludq $H2,$T4,$H2 # h2*s4 vpmuludq $H3,$T4,$H3 # h3*s4 vpsrldq \$6,$T0,$T2 # splat input vpaddq $H2,$D1,$D1 # d1 += h2*s4 vpmuludq $H4,$T4,$H4 # h4*s4 vpsrldq \$6,$T1,$T3 # vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4 vpmuludq -0x20(%r11),$H0,$H4 # h0*r4 vpmuludq $H1,$T4,$H0 vpunpckhqdq $T1,$T0,$T4 # 4 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 vpunpcklqdq $T1,$T0,$T0 # 0:1 vpunpcklqdq $T3,$T2,$T3 # 2:3 #vpsrlq \$40,$T4,$T4 # 4 vpsrldq \$`40/8`,$T4,$T4 # 4 vpsrlq \$26,$T0,$T1 vmovdqa 0x00(%rsp),$D4 # preload r0^2 vpand $MASK,$T0,$T0 # 0 vpsrlq \$4,$T3,$T2 vpand $MASK,$T1,$T1 # 1 vpand 0(%rcx),$T4,$T4 # .Lmask24 vpsrlq \$30,$T3,$T3 vpand $MASK,$T2,$T2 # 2 vpand $MASK,$T3,$T3 # 3 vpor 32(%rcx),$T4,$T4 # padbit, yes, always ################################################################ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein # and P. Schwabe vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $D0,$D1,$H1 # h0 -> h1 vpsrlq \$26,$H4,$D0 vpand $MASK,$H4,$H4 vpsrlq \$26,$H1,$D1 vpand $MASK,$H1,$H1 vpaddq $D1,$H2,$H2 # h1 -> h2 vpaddq $D0,$H0,$H0 vpsllq \$2,$D0,$D0 vpaddq $D0,$H0,$H0 # h4 -> h0 vpsrlq \$26,$H2,$D2 vpand $MASK,$H2,$H2 vpaddq $D2,$H3,$H3 # h2 -> h3 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 ja .Loop_avx .Lskip_loop_avx: ################################################################ # multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1 vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2 add \$32,$len jnz .Long_tail_avx vpaddq $H2,$T2,$T2 vpaddq $H0,$T0,$T0 vpaddq $H1,$T1,$T1 vpaddq $H3,$T3,$T3 vpaddq $H4,$T4,$T4 .Long_tail_avx: vmovdqa $H2,0x20(%r11) vmovdqa $H0,0x00(%r11) vmovdqa $H1,0x10(%r11) vmovdqa $H3,0x30(%r11) vmovdqa $H4,0x40(%r11) # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 vpmuludq $T2,$D4,$D2 # d2 = h2*r0 vpmuludq $T0,$D4,$D0 # d0 = h0*r0 vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n vpmuludq $T1,$D4,$D1 # d1 = h1*r0 vpmuludq $T3,$D4,$D3 # d3 = h3*r0 vpmuludq $T4,$D4,$D4 # d4 = h4*r0 vpmuludq $T3,$H2,$H0 # h3*r1 vpaddq $H0,$D4,$D4 # d4 += h3*r1 vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n vpmuludq $T2,$H2,$H1 # h2*r1 vpaddq $H1,$D3,$D3 # d3 += h2*r1 vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n vpmuludq $T1,$H2,$H0 # h1*r1 vpaddq $H0,$D2,$D2 # d2 += h1*r1 vpmuludq $T0,$H2,$H2 # h0*r1 vpaddq $H2,$D1,$D1 # d1 += h0*r1 vpmuludq $T4,$H3,$H3 # h4*s1 vpaddq $H3,$D0,$D0 # d0 += h4*s1 vpshufd 
\$0x10,`16*4-64`($ctx),$H2 # s2^n vpmuludq $T2,$H4,$H1 # h2*r2 vpaddq $H1,$D4,$D4 # d4 += h2*r2 vpmuludq $T1,$H4,$H0 # h1*r2 vpaddq $H0,$D3,$D3 # d3 += h1*r2 vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n vpmuludq $T0,$H4,$H4 # h0*r2 vpaddq $H4,$D2,$D2 # d2 += h0*r2 vpmuludq $T4,$H2,$H1 # h4*s2 vpaddq $H1,$D1,$D1 # d1 += h4*s2 vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n vpmuludq $T3,$H2,$H2 # h3*s2 vpaddq $H2,$D0,$D0 # d0 += h3*s2 vpmuludq $T1,$H3,$H0 # h1*r3 vpaddq $H0,$D4,$D4 # d4 += h1*r3 vpmuludq $T0,$H3,$H3 # h0*r3 vpaddq $H3,$D3,$D3 # d3 += h0*r3 vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n vpmuludq $T4,$H4,$H1 # h4*s3 vpaddq $H1,$D2,$D2 # d2 += h4*s3 vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n vpmuludq $T3,$H4,$H0 # h3*s3 vpaddq $H0,$D1,$D1 # d1 += h3*s3 vpmuludq $T2,$H4,$H4 # h2*s3 vpaddq $H4,$D0,$D0 # d0 += h2*s3 vpmuludq $T0,$H2,$H2 # h0*r4 vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4 vpmuludq $T4,$H3,$H1 # h4*s4 vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4 vpmuludq $T3,$H3,$H0 # h3*s4 vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4 vpmuludq $T2,$H3,$H1 # h2*s4 vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4 vpmuludq $T1,$H3,$H3 # h1*s4 vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4 jz .Lshort_tail_avx vmovdqu 16*0($inp),$H0 # load input vmovdqu 16*1($inp),$H1 vpsrldq \$6,$H0,$H2 # splat input vpsrldq \$6,$H1,$H3 vpunpckhqdq $H1,$H0,$H4 # 4 vpunpcklqdq $H1,$H0,$H0 # 0:1 vpunpcklqdq $H3,$H2,$H3 # 2:3 vpsrlq \$40,$H4,$H4 # 4 vpsrlq \$26,$H0,$H1 vpand $MASK,$H0,$H0 # 0 vpsrlq \$4,$H3,$H2 vpand $MASK,$H1,$H1 # 1 vpsrlq \$30,$H3,$H3 vpand $MASK,$H2,$H2 # 2 vpand $MASK,$H3,$H3 # 3 vpor 32(%rcx),$H4,$H4 # padbit, yes, always vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4 vpaddq 0x00(%r11),$H0,$H0 vpaddq 0x10(%r11),$H1,$H1 vpaddq 0x20(%r11),$H2,$H2 vpaddq 0x30(%r11),$H3,$H3 vpaddq 0x40(%r11),$H4,$H4 ################################################################ # multiply (inp[0:1]+hash) by r^4:r^3 and accumulate vpmuludq $H0,$T4,$T0 # h0*r0 vpaddq $T0,$D0,$D0 # d0 += h0*r0 vpmuludq $H1,$T4,$T1 # h1*r0 vpaddq $T1,$D1,$D1 # d1 += h1*r0 vpmuludq $H2,$T4,$T0 # h2*r0 vpaddq $T0,$D2,$D2 # d2 += h2*r0 vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n vpmuludq $H3,$T4,$T1 # h3*r0 vpaddq $T1,$D3,$D3 # d3 += h3*r0 vpmuludq $H4,$T4,$T4 # h4*r0 vpaddq $T4,$D4,$D4 # d4 += h4*r0 vpmuludq $H3,$T2,$T0 # h3*r1 vpaddq $T0,$D4,$D4 # d4 += h3*r1 vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1 vpmuludq $H2,$T2,$T1 # h2*r1 vpaddq $T1,$D3,$D3 # d3 += h2*r1 vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2 vpmuludq $H1,$T2,$T0 # h1*r1 vpaddq $T0,$D2,$D2 # d2 += h1*r1 vpmuludq $H0,$T2,$T2 # h0*r1 vpaddq $T2,$D1,$D1 # d1 += h0*r1 vpmuludq $H4,$T3,$T3 # h4*s1 vpaddq $T3,$D0,$D0 # d0 += h4*s1 vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2 vpmuludq $H2,$T4,$T1 # h2*r2 vpaddq $T1,$D4,$D4 # d4 += h2*r2 vpmuludq $H1,$T4,$T0 # h1*r2 vpaddq $T0,$D3,$D3 # d3 += h1*r2 vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3 vpmuludq $H0,$T4,$T4 # h0*r2 vpaddq $T4,$D2,$D2 # d2 += h0*r2 vpmuludq $H4,$T2,$T1 # h4*s2 vpaddq $T1,$D1,$D1 # d1 += h4*s2 vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3 vpmuludq $H3,$T2,$T2 # h3*s2 vpaddq $T2,$D0,$D0 # d0 += h3*s2 vpmuludq $H1,$T3,$T0 # h1*r3 vpaddq $T0,$D4,$D4 # d4 += h1*r3 vpmuludq $H0,$T3,$T3 # h0*r3 vpaddq $T3,$D3,$D3 # d3 += h0*r3 vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4 vpmuludq $H4,$T4,$T1 # h4*s3 vpaddq $T1,$D2,$D2 # d2 += h4*s3 vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4 vpmuludq $H3,$T4,$T0 # h3*s3 vpaddq $T0,$D1,$D1 # d1 += h3*s3 vpmuludq $H2,$T4,$T4 # h2*s3 vpaddq $T4,$D0,$D0 # d0 += h2*s3 vpmuludq $H0,$T2,$T2 # h0*r4 vpaddq $T2,$D4,$D4 # d4 += h0*r4 vpmuludq $H4,$T3,$T1 # h4*s4 
vpaddq $T1,$D3,$D3 # d3 += h4*s4 vpmuludq $H3,$T3,$T0 # h3*s4 vpaddq $T0,$D2,$D2 # d2 += h3*s4 vpmuludq $H2,$T3,$T1 # h2*s4 vpaddq $T1,$D1,$D1 # d1 += h2*s4 vpmuludq $H1,$T3,$T3 # h1*s4 vpaddq $T3,$D0,$D0 # d0 += h1*s4 .Lshort_tail_avx: ################################################################ # horizontal addition vpsrldq \$8,$D4,$T4 vpsrldq \$8,$D3,$T3 vpsrldq \$8,$D1,$T1 vpsrldq \$8,$D0,$T0 vpsrldq \$8,$D2,$T2 vpaddq $T3,$D3,$D3 vpaddq $T4,$D4,$D4 vpaddq $T0,$D0,$D0 vpaddq $T1,$D1,$D1 vpaddq $T2,$D2,$D2 ################################################################ # lazy reduction vpsrlq \$26,$D3,$H3 vpand $MASK,$D3,$D3 vpaddq $H3,$D4,$D4 # h3 -> h4 vpsrlq \$26,$D0,$H0 vpand $MASK,$D0,$D0 vpaddq $H0,$D1,$D1 # h0 -> h1 vpsrlq \$26,$D4,$H4 vpand $MASK,$D4,$D4 vpsrlq \$26,$D1,$H1 vpand $MASK,$D1,$D1 vpaddq $H1,$D2,$D2 # h1 -> h2 vpaddq $H4,$D0,$D0 vpsllq \$2,$H4,$H4 vpaddq $H4,$D0,$D0 # h4 -> h0 vpsrlq \$26,$D2,$H2 vpand $MASK,$D2,$D2 vpaddq $H2,$D3,$D3 # h2 -> h3 vpsrlq \$26,$D0,$H0 vpand $MASK,$D0,$D0 vpaddq $H0,$D1,$D1 # h0 -> h1 vpsrlq \$26,$D3,$H3 vpand $MASK,$D3,$D3 vpaddq $H3,$D4,$D4 # h3 -> h4 vmovd $D0,`4*0-48-64`($ctx) # save partially reduced vmovd $D1,`4*1-48-64`($ctx) vmovd $D2,`4*2-48-64`($ctx) vmovd $D3,`4*3-48-64`($ctx) vmovd $D4,`4*4-48-64`($ctx) ___ $code.=<<___ if ($win64); vmovdqa 0x50(%r11),%xmm6 vmovdqa 0x60(%r11),%xmm7 vmovdqa 0x70(%r11),%xmm8 vmovdqa 0x80(%r11),%xmm9 vmovdqa 0x90(%r11),%xmm10 vmovdqa 0xa0(%r11),%xmm11 vmovdqa 0xb0(%r11),%xmm12 vmovdqa 0xc0(%r11),%xmm13 vmovdqa 0xd0(%r11),%xmm14 vmovdqa 0xe0(%r11),%xmm15 lea 0xf8(%r11),%rsp .Ldo_avx_epilogue: ___ $code.=<<___ if (!$win64); lea 0x58(%r11),%rsp .cfi_def_cfa %rsp,8 ___ $code.=<<___; vzeroupper ret .cfi_endproc .size poly1305_blocks_avx,.-poly1305_blocks_avx .type poly1305_emit_avx,\@function,3 .align 32 poly1305_emit_avx: .cfi_startproc cmpl \$0,20($ctx) # is_base2_26? je .Lemit mov 0($ctx),%eax # load hash value base 2^26 mov 4($ctx),%ecx mov 8($ctx),%r8d mov 12($ctx),%r11d mov 16($ctx),%r10d shl \$26,%rcx # base 2^26 -> base 2^64 mov %r8,%r9 shl \$52,%r8 add %rcx,%rax shr \$12,%r9 add %rax,%r8 # h0 adc \$0,%r9 shl \$14,%r11 mov %r10,%rax shr \$24,%r10 add %r11,%r9 shl \$40,%rax add %rax,%r9 # h1 adc \$0,%r10 # h2 mov %r10,%rax # could be partially reduced, so reduce mov %r10,%rcx and \$3,%r10 shr \$2,%rax and \$-4,%rcx add %rcx,%rax add %rax,%r8 adc \$0,%r9 adc \$0,%r10 mov %r8,%rax add \$5,%r8 # compare to modulus mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 shr \$2,%r10 # did 130-bit value overflow? 
cmovnz %r8,%rax cmovnz %r9,%rcx add 0($nonce),%rax # accumulate nonce adc 8($nonce),%rcx mov %rax,0($mac) # write result mov %rcx,8($mac) ret .cfi_endproc .size poly1305_emit_avx,.-poly1305_emit_avx ___ if ($avx>1) { my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) = map("%ymm$_",(0..15)); my $S4=$MASK; $code.=<<___; .type poly1305_blocks_avx2,\@function,4 .align 32 poly1305_blocks_avx2: .cfi_startproc mov 20($ctx),%r8d # is_base2_26 cmp \$128,$len jae .Lblocks_avx2 test %r8d,%r8d jz .Lblocks .Lblocks_avx2: and \$-16,$len jz .Lno_data_avx2 vzeroupper test %r8d,%r8d jz .Lbase2_64_avx2 test \$63,$len jz .Leven_avx2 push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lblocks_avx2_body: mov $len,%r15 # reassign $len mov 0($ctx),$d1 # load hash value mov 8($ctx),$d2 mov 16($ctx),$h2#d mov 24($ctx),$r0 # load r mov 32($ctx),$s1 ################################# base 2^26 -> base 2^64 mov $d1#d,$h0#d and \$`-1*(1<<31)`,$d1 mov $d2,$r1 # borrow $r1 mov $d2#d,$h1#d and \$`-1*(1<<31)`,$d2 shr \$6,$d1 shl \$52,$r1 add $d1,$h0 shr \$12,$h1 shr \$18,$d2 add $r1,$h0 adc $d2,$h1 mov $h2,$d1 shl \$40,$d1 shr \$24,$h2 add $d1,$h1 adc \$0,$h2 # can be partially reduced... mov \$-4,$d2 # ... so reduce mov $h2,$d1 and $h2,$d2 shr \$2,$d1 and \$3,$h2 add $d2,$d1 # =*5 add $d1,$h0 adc \$0,$h1 adc \$0,$h2 mov $s1,$r1 mov $s1,%rax shr \$2,$s1 add $r1,$s1 # s1 = r1 + (r1 >> 2) .Lbase2_26_pre_avx2: add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp adc $padbit,$h2 sub \$16,%r15 call __poly1305_block mov $r1,%rax test \$63,%r15 jnz .Lbase2_26_pre_avx2 test $padbit,$padbit # if $padbit is zero, jz .Lstore_base2_64_avx2 # store hash in base 2^64 format ################################# base 2^64 -> base 2^26 mov $h0,%rax mov $h0,%rdx shr \$52,$h0 mov $h1,$r0 mov $h1,$r1 shr \$26,%rdx and \$0x3ffffff,%rax # h[0] shl \$12,$r0 and \$0x3ffffff,%rdx # h[1] shr \$14,$h1 or $r0,$h0 shl \$24,$h2 and \$0x3ffffff,$h0 # h[2] shr \$40,$r1 and \$0x3ffffff,$h1 # h[3] or $r1,$h2 # h[4] test %r15,%r15 jz .Lstore_base2_26_avx2 vmovd %rax#d,%x#$H0 vmovd %rdx#d,%x#$H1 vmovd $h0#d,%x#$H2 vmovd $h1#d,%x#$H3 vmovd $h2#d,%x#$H4 jmp .Lproceed_avx2 .align 32 .Lstore_base2_64_avx2: mov $h0,0($ctx) mov $h1,8($ctx) mov $h2,16($ctx) # note that is_base2_26 is zeroed jmp .Ldone_avx2 .align 16 .Lstore_base2_26_avx2: mov %rax#d,0($ctx) # store hash value base 2^26 mov %rdx#d,4($ctx) mov $h0#d,8($ctx) mov $h1#d,12($ctx) mov $h2#d,16($ctx) .align 16 .Ldone_avx2: mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbp .cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lno_data_avx2: .Lblocks_avx2_epilogue: ret .cfi_endproc .align 32 .Lbase2_64_avx2: .cfi_startproc push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 .Lbase2_64_avx2_body: mov $len,%r15 # reassign $len mov 24($ctx),$r0 # load r mov 32($ctx),$s1 mov 0($ctx),$h0 # load hash value mov 8($ctx),$h1 mov 16($ctx),$h2#d mov $s1,$r1 mov $s1,%rax shr \$2,$s1 add $r1,$s1 # s1 = r1 + (r1 >> 2) test \$63,$len jz .Linit_avx2 .Lbase2_64_pre_avx2: add 0($inp),$h0 # accumulate input adc 8($inp),$h1 lea 16($inp),$inp adc $padbit,$h2 sub \$16,%r15 call __poly1305_block mov $r1,%rax test \$63,%r15 jnz .Lbase2_64_pre_avx2 
.Linit_avx2: ################################# base 2^64 -> base 2^26 mov $h0,%rax mov $h0,%rdx shr \$52,$h0 mov $h1,$d1 mov $h1,$d2 shr \$26,%rdx and \$0x3ffffff,%rax # h[0] shl \$12,$d1 and \$0x3ffffff,%rdx # h[1] shr \$14,$h1 or $d1,$h0 shl \$24,$h2 and \$0x3ffffff,$h0 # h[2] shr \$40,$d2 and \$0x3ffffff,$h1 # h[3] or $d2,$h2 # h[4] vmovd %rax#d,%x#$H0 vmovd %rdx#d,%x#$H1 vmovd $h0#d,%x#$H2 vmovd $h1#d,%x#$H3 vmovd $h2#d,%x#$H4 movl \$1,20($ctx) # set is_base2_26 call __poly1305_init_avx .Lproceed_avx2: mov %r15,$len # restore $len mov OPENSSL_ia32cap_P+8(%rip),%r10d mov \$`(1<<31|1<<30|1<<16)`,%r11d mov 0(%rsp),%r15 .cfi_restore %r15 mov 8(%rsp),%r14 .cfi_restore %r14 mov 16(%rsp),%r13 .cfi_restore %r13 mov 24(%rsp),%r12 .cfi_restore %r12 mov 32(%rsp),%rbp .cfi_restore %rbp mov 40(%rsp),%rbx .cfi_restore %rbx lea 48(%rsp),%rax lea 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lbase2_64_avx2_epilogue: jmp .Ldo_avx2 .cfi_endproc .align 32 .Leven_avx2: .cfi_startproc mov OPENSSL_ia32cap_P+8(%rip),%r10d vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26 vmovd 4*1($ctx),%x#$H1 vmovd 4*2($ctx),%x#$H2 vmovd 4*3($ctx),%x#$H3 vmovd 4*4($ctx),%x#$H4 .Ldo_avx2: ___ $code.=<<___ if ($avx>2); cmp \$512,$len jb .Lskip_avx512 and %r11d,%r10d test \$`1<<16`,%r10d # check for AVX512F jnz .Lblocks_avx512 .Lskip_avx512: ___ $code.=<<___ if (!$win64); lea -8(%rsp),%r11 .cfi_def_cfa %r11,16 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); lea -0xf8(%rsp),%r11 sub \$0x1c8,%rsp vmovdqa %xmm6,0x50(%r11) vmovdqa %xmm7,0x60(%r11) vmovdqa %xmm8,0x70(%r11) vmovdqa %xmm9,0x80(%r11) vmovdqa %xmm10,0x90(%r11) vmovdqa %xmm11,0xa0(%r11) vmovdqa %xmm12,0xb0(%r11) vmovdqa %xmm13,0xc0(%r11) vmovdqa %xmm14,0xd0(%r11) vmovdqa %xmm15,0xe0(%r11) .Ldo_avx2_body: ___ $code.=<<___; lea .Lconst(%rip),%rcx lea 48+64($ctx),$ctx # size optimization vmovdqa 96(%rcx),$T0 # .Lpermd_avx2 # expand and copy pre-calculated table to stack vmovdqu `16*0-64`($ctx),%x#$T2 and \$-512,%rsp vmovdqu `16*1-64`($ctx),%x#$T3 vmovdqu `16*2-64`($ctx),%x#$T4 vmovdqu `16*3-64`($ctx),%x#$D0 vmovdqu `16*4-64`($ctx),%x#$D1 vmovdqu `16*5-64`($ctx),%x#$D2 lea 0x90(%rsp),%rax # size optimization vmovdqu `16*6-64`($ctx),%x#$D3 vpermd $T2,$T0,$T2 # 00003412 -> 14243444 vmovdqu `16*7-64`($ctx),%x#$D4 vpermd $T3,$T0,$T3 vmovdqu `16*8-64`($ctx),%x#$MASK vpermd $T4,$T0,$T4 vmovdqa $T2,0x00(%rsp) vpermd $D0,$T0,$D0 vmovdqa $T3,0x20-0x90(%rax) vpermd $D1,$T0,$D1 vmovdqa $T4,0x40-0x90(%rax) vpermd $D2,$T0,$D2 vmovdqa $D0,0x60-0x90(%rax) vpermd $D3,$T0,$D3 vmovdqa $D1,0x80-0x90(%rax) vpermd $D4,$T0,$D4 vmovdqa $D2,0xa0-0x90(%rax) vpermd $MASK,$T0,$MASK vmovdqa $D3,0xc0-0x90(%rax) vmovdqa $D4,0xe0-0x90(%rax) vmovdqa $MASK,0x100-0x90(%rax) vmovdqa 64(%rcx),$MASK # .Lmask26 ################################################################ # load input vmovdqu 16*0($inp),%x#$T0 vmovdqu 16*1($inp),%x#$T1 vinserti128 \$1,16*2($inp),$T0,$T0 vinserti128 \$1,16*3($inp),$T1,$T1 lea 16*4($inp),$inp vpsrldq \$6,$T0,$T2 # splat input vpsrldq \$6,$T1,$T3 vpunpckhqdq $T1,$T0,$T4 # 4 vpunpcklqdq $T3,$T2,$T2 # 2:3 vpunpcklqdq $T1,$T0,$T0 # 0:1 vpsrlq \$30,$T2,$T3 vpsrlq \$4,$T2,$T2 vpsrlq \$26,$T0,$T1 vpsrlq \$40,$T4,$T4 # 4 vpand $MASK,$T2,$T2 # 2 vpand $MASK,$T0,$T0 # 0 vpand $MASK,$T1,$T1 # 1 vpand $MASK,$T3,$T3 # 3 vpor 32(%rcx),$T4,$T4 # padbit, yes, always vpaddq $H2,$T2,$H2 # accumulate input sub \$64,$len jz .Ltail_avx2 jmp .Loop_avx2 .align 32 .Loop_avx2: ################################################################ # ((inp[0]*r^4+inp[4])*r^4+inp[ 8])*r^4 # 
((inp[1]*r^4+inp[5])*r^4+inp[ 9])*r^3 # ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2 # ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1 # \________/\__________/ ################################################################ #vpaddq $H2,$T2,$H2 # accumulate input vpaddq $H0,$T0,$H0 vmovdqa `32*0`(%rsp),$T0 # r0^4 vpaddq $H1,$T1,$H1 vmovdqa `32*1`(%rsp),$T1 # r1^4 vpaddq $H3,$T3,$H3 vmovdqa `32*3`(%rsp),$T2 # r2^4 vpaddq $H4,$T4,$H4 vmovdqa `32*6-0x90`(%rax),$T3 # s3^4 vmovdqa `32*8-0x90`(%rax),$S4 # s4^4 # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 # # however, as h2 is "chronologically" first one available pull # corresponding operations up, so it's # # d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4 # d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 # d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 vpmuludq $H0,$T1,$T4 # h0*r1 vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp vpaddq $T4,$D1,$D1 # d1 += h0*r1 vpaddq $H2,$D2,$D2 # d2 += h1*r1 vpmuludq $H3,$T1,$T4 # h3*r1 vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1 vpaddq $T4,$D4,$D4 # d4 += h3*r1 vpaddq $H2,$D0,$D0 # d0 += h4*s1 vmovdqa `32*4-0x90`(%rax),$T1 # s2 vpmuludq $H0,$T0,$T4 # h0*r0 vpmuludq $H1,$T0,$H2 # h1*r0 vpaddq $T4,$D0,$D0 # d0 += h0*r0 vpaddq $H2,$D1,$D1 # d1 += h1*r0 vpmuludq $H3,$T0,$T4 # h3*r0 vpmuludq $H4,$T0,$H2 # h4*r0 vmovdqu 16*0($inp),%x#$T0 # load input vpaddq $T4,$D3,$D3 # d3 += h3*r0 vpaddq $H2,$D4,$D4 # d4 += h4*r0 vinserti128 \$1,16*2($inp),$T0,$T0 vpmuludq $H3,$T1,$T4 # h3*s2 vpmuludq $H4,$T1,$H2 # h4*s2 vmovdqu 16*1($inp),%x#$T1 vpaddq $T4,$D0,$D0 # d0 += h3*s2 vpaddq $H2,$D1,$D1 # d1 += h4*s2 vmovdqa `32*5-0x90`(%rax),$H2 # r3 vpmuludq $H1,$T2,$T4 # h1*r2 vpmuludq $H0,$T2,$T2 # h0*r2 vpaddq $T4,$D3,$D3 # d3 += h1*r2 vpaddq $T2,$D2,$D2 # d2 += h0*r2 vinserti128 \$1,16*3($inp),$T1,$T1 lea 16*4($inp),$inp vpmuludq $H1,$H2,$T4 # h1*r3 vpmuludq $H0,$H2,$H2 # h0*r3 vpsrldq \$6,$T0,$T2 # splat input vpaddq $T4,$D4,$D4 # d4 += h1*r3 vpaddq $H2,$D3,$D3 # d3 += h0*r3 vpmuludq $H3,$T3,$T4 # h3*s3 vpmuludq $H4,$T3,$H2 # h4*s3 vpsrldq \$6,$T1,$T3 vpaddq $T4,$D1,$D1 # d1 += h3*s3 vpaddq $H2,$D2,$D2 # d2 += h4*s3 vpunpckhqdq $T1,$T0,$T4 # 4 vpmuludq $H3,$S4,$H3 # h3*s4 vpmuludq $H4,$S4,$H4 # h4*s4 vpunpcklqdq $T1,$T0,$T0 # 0:1 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 vpunpcklqdq $T3,$T2,$T3 # 2:3 vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4 vpmuludq $H1,$S4,$H0 # h1*s4 vmovdqa 64(%rcx),$MASK # .Lmask26 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 ################################################################ # lazy reduction (interleaved with tail of input splat) vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $D0,$D1,$H1 # h0 -> h1 vpsrlq \$26,$H4,$D4 vpand $MASK,$H4,$H4 vpsrlq \$4,$T3,$T2 vpsrlq \$26,$H1,$D1 vpand $MASK,$H1,$H1 vpaddq $D1,$H2,$H2 # h1 -> h2 vpaddq $D4,$H0,$H0 vpsllq \$2,$D4,$D4 vpaddq $D4,$H0,$H0 # h4 -> h0 vpand $MASK,$T2,$T2 # 2 vpsrlq \$26,$T0,$T1 vpsrlq \$26,$H2,$D2 vpand $MASK,$H2,$H2 vpaddq $D2,$H3,$H3 # h2 -> h3 vpaddq $T2,$H2,$H2 # 
modulo-scheduled vpsrlq \$30,$T3,$T3 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$40,$T4,$T4 # 4 vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 vpand $MASK,$T0,$T0 # 0 vpand $MASK,$T1,$T1 # 1 vpand $MASK,$T3,$T3 # 3 vpor 32(%rcx),$T4,$T4 # padbit, yes, always sub \$64,$len jnz .Loop_avx2 .byte 0x66,0x90 .Ltail_avx2: ################################################################ # while above multiplications were by r^4 in all lanes, in last # iteration we multiply least significant lane by r^4 and most # significant one by r, so copy of above except that references # to the precomputed table are displaced by 4... #vpaddq $H2,$T2,$H2 # accumulate input vpaddq $H0,$T0,$H0 vmovdqu `32*0+4`(%rsp),$T0 # r0^4 vpaddq $H1,$T1,$H1 vmovdqu `32*1+4`(%rsp),$T1 # r1^4 vpaddq $H3,$T3,$H3 vmovdqu `32*3+4`(%rsp),$T2 # r2^4 vpaddq $H4,$T4,$H4 vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4 vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4 vpmuludq $H2,$T0,$D2 # d2 = h2*r0 vpmuludq $H2,$T1,$D3 # d3 = h2*r1 vpmuludq $H2,$T2,$D4 # d4 = h2*r2 vpmuludq $H2,$T3,$D0 # d0 = h2*s3 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 vpmuludq $H0,$T1,$T4 # h0*r1 vpmuludq $H1,$T1,$H2 # h1*r1 vpaddq $T4,$D1,$D1 # d1 += h0*r1 vpaddq $H2,$D2,$D2 # d2 += h1*r1 vpmuludq $H3,$T1,$T4 # h3*r1 vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1 vpaddq $T4,$D4,$D4 # d4 += h3*r1 vpaddq $H2,$D0,$D0 # d0 += h4*s1 vpmuludq $H0,$T0,$T4 # h0*r0 vpmuludq $H1,$T0,$H2 # h1*r0 vpaddq $T4,$D0,$D0 # d0 += h0*r0 vmovdqu `32*4+4-0x90`(%rax),$T1 # s2 vpaddq $H2,$D1,$D1 # d1 += h1*r0 vpmuludq $H3,$T0,$T4 # h3*r0 vpmuludq $H4,$T0,$H2 # h4*r0 vpaddq $T4,$D3,$D3 # d3 += h3*r0 vpaddq $H2,$D4,$D4 # d4 += h4*r0 vpmuludq $H3,$T1,$T4 # h3*s2 vpmuludq $H4,$T1,$H2 # h4*s2 vpaddq $T4,$D0,$D0 # d0 += h3*s2 vpaddq $H2,$D1,$D1 # d1 += h4*s2 vmovdqu `32*5+4-0x90`(%rax),$H2 # r3 vpmuludq $H1,$T2,$T4 # h1*r2 vpmuludq $H0,$T2,$T2 # h0*r2 vpaddq $T4,$D3,$D3 # d3 += h1*r2 vpaddq $T2,$D2,$D2 # d2 += h0*r2 vpmuludq $H1,$H2,$T4 # h1*r3 vpmuludq $H0,$H2,$H2 # h0*r3 vpaddq $T4,$D4,$D4 # d4 += h1*r3 vpaddq $H2,$D3,$D3 # d3 += h0*r3 vpmuludq $H3,$T3,$T4 # h3*s3 vpmuludq $H4,$T3,$H2 # h4*s3 vpaddq $T4,$D1,$D1 # d1 += h3*s3 vpaddq $H2,$D2,$D2 # d2 += h4*s3 vpmuludq $H3,$S4,$H3 # h3*s4 vpmuludq $H4,$S4,$H4 # h4*s4 vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4 vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4 vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4 vpmuludq $H1,$S4,$H0 # h1*s4 vmovdqa 64(%rcx),$MASK # .Lmask26 vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4 vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4 ################################################################ # horizontal addition vpsrldq \$8,$D1,$T1 vpsrldq \$8,$H2,$T2 vpsrldq \$8,$H3,$T3 vpsrldq \$8,$H4,$T4 vpsrldq \$8,$H0,$T0 vpaddq $T1,$D1,$D1 vpaddq $T2,$H2,$H2 vpaddq $T3,$H3,$H3 vpaddq $T4,$H4,$H4 vpaddq $T0,$H0,$H0 vpermq \$0x2,$H3,$T3 vpermq \$0x2,$H4,$T4 vpermq \$0x2,$H0,$T0 vpermq \$0x2,$D1,$T1 vpermq \$0x2,$H2,$T2 vpaddq $T3,$H3,$H3 vpaddq $T4,$H4,$H4 vpaddq $T0,$H0,$H0 vpaddq $T1,$D1,$D1 vpaddq $T2,$H2,$H2 ################################################################ # lazy reduction vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $D0,$D1,$H1 # h0 -> h1 vpsrlq \$26,$H4,$D4 vpand $MASK,$H4,$H4 vpsrlq \$26,$H1,$D1 vpand $MASK,$H1,$H1 vpaddq $D1,$H2,$H2 # h1 -> h2 vpaddq $D4,$H0,$H0 vpsllq \$2,$D4,$D4 vpaddq $D4,$H0,$H0 # h4 -> h0 vpsrlq \$26,$H2,$D2 vpand $MASK,$H2,$H2 vpaddq $D2,$H3,$H3 # h2 -> h3 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $D0,$H1,$H1 # h0 -> 
h1 vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced vmovd %x#$H1,`4*1-48-64`($ctx) vmovd %x#$H2,`4*2-48-64`($ctx) vmovd %x#$H3,`4*3-48-64`($ctx) vmovd %x#$H4,`4*4-48-64`($ctx) ___ $code.=<<___ if ($win64); vmovdqa 0x50(%r11),%xmm6 vmovdqa 0x60(%r11),%xmm7 vmovdqa 0x70(%r11),%xmm8 vmovdqa 0x80(%r11),%xmm9 vmovdqa 0x90(%r11),%xmm10 vmovdqa 0xa0(%r11),%xmm11 vmovdqa 0xb0(%r11),%xmm12 vmovdqa 0xc0(%r11),%xmm13 vmovdqa 0xd0(%r11),%xmm14 vmovdqa 0xe0(%r11),%xmm15 lea 0xf8(%r11),%rsp .Ldo_avx2_epilogue: ___ $code.=<<___ if (!$win64); lea 8(%r11),%rsp .cfi_def_cfa %rsp,8 ___ $code.=<<___; vzeroupper ret .cfi_endproc .size poly1305_blocks_avx2,.-poly1305_blocks_avx2 ___ ####################################################################### if ($avx>2) { # On entry we have input length divisible by 64. But since inner loop # processes 128 bytes per iteration, cases when length is not divisible # by 128 are handled by passing tail 64 bytes to .Ltail_avx2. For this # reason stack layout is kept identical to poly1305_blocks_avx2. If not # for this tail, we wouldn't have to even allocate stack frame... my ($R0,$R1,$R2,$R3,$R4, $S1,$S2,$S3,$S4) = map("%zmm$_",(16..24)); my ($M0,$M1,$M2,$M3,$M4) = map("%zmm$_",(25..29)); my $PADBIT="%zmm30"; map(s/%y/%z/,($T4,$T0,$T1,$T2,$T3)); # switch to %zmm domain map(s/%y/%z/,($D0,$D1,$D2,$D3,$D4)); map(s/%y/%z/,($H0,$H1,$H2,$H3,$H4)); map(s/%y/%z/,($MASK)); $code.=<<___; .type poly1305_blocks_avx512,\@function,4 .align 32 poly1305_blocks_avx512: .cfi_startproc .Lblocks_avx512: mov \$15,%eax kmovw %eax,%k2 ___ $code.=<<___ if (!$win64); lea -8(%rsp),%r11 .cfi_def_cfa %r11,16 sub \$0x128,%rsp ___ $code.=<<___ if ($win64); lea -0xf8(%rsp),%r11 sub \$0x1c8,%rsp vmovdqa %xmm6,0x50(%r11) vmovdqa %xmm7,0x60(%r11) vmovdqa %xmm8,0x70(%r11) vmovdqa %xmm9,0x80(%r11) vmovdqa %xmm10,0x90(%r11) vmovdqa %xmm11,0xa0(%r11) vmovdqa %xmm12,0xb0(%r11) vmovdqa %xmm13,0xc0(%r11) vmovdqa %xmm14,0xd0(%r11) vmovdqa %xmm15,0xe0(%r11) .Ldo_avx512_body: ___ $code.=<<___; lea .Lconst(%rip),%rcx lea 48+64($ctx),$ctx # size optimization vmovdqa 96(%rcx),%y#$T2 # .Lpermd_avx2 # expand pre-calculated table vmovdqu `16*0-64`($ctx),%x#$D0 # will become expanded ${R0} and \$-512,%rsp vmovdqu `16*1-64`($ctx),%x#$D1 # will become ... ${R1} mov \$0x20,%rax vmovdqu `16*2-64`($ctx),%x#$T0 # ... ${S1} vmovdqu `16*3-64`($ctx),%x#$D2 # ... ${R2} vmovdqu `16*4-64`($ctx),%x#$T1 # ... ${S2} vmovdqu `16*5-64`($ctx),%x#$D3 # ... ${R3} vmovdqu `16*6-64`($ctx),%x#$T3 # ... ${S3} vmovdqu `16*7-64`($ctx),%x#$D4 # ... ${R4} vmovdqu `16*8-64`($ctx),%x#$T4 # ... 
${S4} vpermd $D0,$T2,$R0 # 00003412 -> 14243444 vpbroadcastq 64(%rcx),$MASK # .Lmask26 vpermd $D1,$T2,$R1 vpermd $T0,$T2,$S1 vpermd $D2,$T2,$R2 vmovdqa64 $R0,0x00(%rsp){%k2} # save in case $len%128 != 0 vpsrlq \$32,$R0,$T0 # 14243444 -> 01020304 vpermd $T1,$T2,$S2 vmovdqu64 $R1,0x00(%rsp,%rax){%k2} vpsrlq \$32,$R1,$T1 vpermd $D3,$T2,$R3 vmovdqa64 $S1,0x40(%rsp){%k2} vpermd $T3,$T2,$S3 vpermd $D4,$T2,$R4 vmovdqu64 $R2,0x40(%rsp,%rax){%k2} vpermd $T4,$T2,$S4 vmovdqa64 $S2,0x80(%rsp){%k2} vmovdqu64 $R3,0x80(%rsp,%rax){%k2} vmovdqa64 $S3,0xc0(%rsp){%k2} vmovdqu64 $R4,0xc0(%rsp,%rax){%k2} vmovdqa64 $S4,0x100(%rsp){%k2} ################################################################ # calculate 5th through 8th powers of the key # # d0 = r0'*r0 + r1'*5*r4 + r2'*5*r3 + r3'*5*r2 + r4'*5*r1 # d1 = r0'*r1 + r1'*r0 + r2'*5*r4 + r3'*5*r3 + r4'*5*r2 # d2 = r0'*r2 + r1'*r1 + r2'*r0 + r3'*5*r4 + r4'*5*r3 # d3 = r0'*r3 + r1'*r2 + r2'*r1 + r3'*r0 + r4'*5*r4 # d4 = r0'*r4 + r1'*r3 + r2'*r2 + r3'*r1 + r4'*r0 vpmuludq $T0,$R0,$D0 # d0 = r0'*r0 vpmuludq $T0,$R1,$D1 # d1 = r0'*r1 vpmuludq $T0,$R2,$D2 # d2 = r0'*r2 vpmuludq $T0,$R3,$D3 # d3 = r0'*r3 vpmuludq $T0,$R4,$D4 # d4 = r0'*r4 vpsrlq \$32,$R2,$T2 vpmuludq $T1,$S4,$M0 vpmuludq $T1,$R0,$M1 vpmuludq $T1,$R1,$M2 vpmuludq $T1,$R2,$M3 vpmuludq $T1,$R3,$M4 vpsrlq \$32,$R3,$T3 vpaddq $M0,$D0,$D0 # d0 += r1'*5*r4 vpaddq $M1,$D1,$D1 # d1 += r1'*r0 vpaddq $M2,$D2,$D2 # d2 += r1'*r1 vpaddq $M3,$D3,$D3 # d3 += r1'*r2 vpaddq $M4,$D4,$D4 # d4 += r1'*r3 vpmuludq $T2,$S3,$M0 vpmuludq $T2,$S4,$M1 vpmuludq $T2,$R1,$M3 vpmuludq $T2,$R2,$M4 vpmuludq $T2,$R0,$M2 vpsrlq \$32,$R4,$T4 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r3 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r4 vpaddq $M3,$D3,$D3 # d3 += r2'*r1 vpaddq $M4,$D4,$D4 # d4 += r2'*r2 vpaddq $M2,$D2,$D2 # d2 += r2'*r0 vpmuludq $T3,$S2,$M0 vpmuludq $T3,$R0,$M3 vpmuludq $T3,$R1,$M4 vpmuludq $T3,$S3,$M1 vpmuludq $T3,$S4,$M2 vpaddq $M0,$D0,$D0 # d0 += r3'*5*r2 vpaddq $M3,$D3,$D3 # d3 += r3'*r0 vpaddq $M4,$D4,$D4 # d4 += r3'*r1 vpaddq $M1,$D1,$D1 # d1 += r3'*5*r3 vpaddq $M2,$D2,$D2 # d2 += r3'*5*r4 vpmuludq $T4,$S4,$M3 vpmuludq $T4,$R0,$M4 vpmuludq $T4,$S1,$M0 vpmuludq $T4,$S2,$M1 vpmuludq $T4,$S3,$M2 vpaddq $M3,$D3,$D3 # d3 += r2'*5*r4 vpaddq $M4,$D4,$D4 # d4 += r2'*r0 vpaddq $M0,$D0,$D0 # d0 += r2'*5*r1 vpaddq $M1,$D1,$D1 # d1 += r2'*5*r2 vpaddq $M2,$D2,$D2 # d2 += r2'*5*r3 ################################################################ # load input vmovdqu64 16*0($inp),%z#$T3 vmovdqu64 16*4($inp),%z#$T4 lea 16*8($inp),$inp ################################################################ # lazy reduction vpsrlq \$26,$D3,$M3 vpandq $MASK,$D3,$D3 vpaddq $M3,$D4,$D4 # d3 -> d4 vpsrlq \$26,$D0,$M0 vpandq $MASK,$D0,$D0 vpaddq $M0,$D1,$D1 # d0 -> d1 vpsrlq \$26,$D4,$M4 vpandq $MASK,$D4,$D4 vpsrlq \$26,$D1,$M1 vpandq $MASK,$D1,$D1 vpaddq $M1,$D2,$D2 # d1 -> d2 vpaddq $M4,$D0,$D0 vpsllq \$2,$M4,$M4 vpaddq $M4,$D0,$D0 # d4 -> d0 vpsrlq \$26,$D2,$M2 vpandq $MASK,$D2,$D2 vpaddq $M2,$D3,$D3 # d2 -> d3 vpsrlq \$26,$D0,$M0 vpandq $MASK,$D0,$D0 vpaddq $M0,$D1,$D1 # d0 -> d1 vpsrlq \$26,$D3,$M3 vpandq $MASK,$D3,$D3 vpaddq $M3,$D4,$D4 # d3 -> d4 ################################################################ # at this point we have 14243444 in $R0-$S4 and 05060708 in # $D0-$D4, ... vpunpcklqdq $T4,$T3,$T0 # transpose input vpunpckhqdq $T4,$T3,$T4 # ... since input 64-bit lanes are ordered as 73625140, we could # "vperm" it to 76543210 (here and in each loop iteration), *or* # we could just flow along, hence the goal for $R0-$S4 is # 1858286838784888 ... 
vmovdqa32 128(%rcx),$M0 # .Lpermd_avx512: mov \$0x7777,%eax kmovw %eax,%k1 vpermd $R0,$M0,$R0 # 14243444 -> 1---2---3---4--- vpermd $R1,$M0,$R1 vpermd $R2,$M0,$R2 vpermd $R3,$M0,$R3 vpermd $R4,$M0,$R4 vpermd $D0,$M0,${R0}{%k1} # 05060708 -> 1858286838784888 vpermd $D1,$M0,${R1}{%k1} vpermd $D2,$M0,${R2}{%k1} vpermd $D3,$M0,${R3}{%k1} vpermd $D4,$M0,${R4}{%k1} vpslld \$2,$R1,$S1 # *5 vpslld \$2,$R2,$S2 vpslld \$2,$R3,$S3 vpslld \$2,$R4,$S4 vpaddd $R1,$S1,$S1 vpaddd $R2,$S2,$S2 vpaddd $R3,$S3,$S3 vpaddd $R4,$S4,$S4 vpbroadcastq 32(%rcx),$PADBIT # .L129 vpsrlq \$52,$T0,$T2 # splat input vpsllq \$12,$T4,$T3 vporq $T3,$T2,$T2 vpsrlq \$26,$T0,$T1 vpsrlq \$14,$T4,$T3 vpsrlq \$40,$T4,$T4 # 4 vpandq $MASK,$T2,$T2 # 2 vpandq $MASK,$T0,$T0 # 0 #vpandq $MASK,$T1,$T1 # 1 #vpandq $MASK,$T3,$T3 # 3 #vporq $PADBIT,$T4,$T4 # padbit, yes, always vpaddq $H2,$T2,$H2 # accumulate input sub \$192,$len jbe .Ltail_avx512 jmp .Loop_avx512 .align 32 .Loop_avx512: ################################################################ # ((inp[0]*r^8+inp[ 8])*r^8+inp[16])*r^8 # ((inp[1]*r^8+inp[ 9])*r^8+inp[17])*r^7 # ((inp[2]*r^8+inp[10])*r^8+inp[18])*r^6 # ((inp[3]*r^8+inp[11])*r^8+inp[19])*r^5 # ((inp[4]*r^8+inp[12])*r^8+inp[20])*r^4 # ((inp[5]*r^8+inp[13])*r^8+inp[21])*r^3 # ((inp[6]*r^8+inp[14])*r^8+inp[22])*r^2 # ((inp[7]*r^8+inp[15])*r^8+inp[23])*r^1 # \________/\___________/ ################################################################ #vpaddq $H2,$T2,$H2 # accumulate input # d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4 # d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4 # d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4 # d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4 # d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4 # # however, as h2 is "chronologically" first one available pull # corresponding operations up, so it's # # d3 = h2*r1 + h0*r3 + h1*r2 + h3*r0 + h4*5*r4 # d4 = h2*r2 + h0*r4 + h1*r3 + h3*r1 + h4*r0 # d0 = h2*5*r3 + h0*r0 + h1*5*r4 + h3*5*r2 + h4*5*r1 # d1 = h2*5*r4 + h0*r1 + h1*r0 + h3*5*r3 + h4*5*r2 # d2 = h2*r0 + h0*r2 + h1*r1 + h3*5*r4 + h4*5*r3 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 vpaddq $H0,$T0,$H0 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 vpandq $MASK,$T1,$T1 # 1 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 vpandq $MASK,$T3,$T3 # 3 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 vporq $PADBIT,$T4,$T4 # padbit, yes, always vpmuludq $H2,$R0,$D2 # d2 = h2*r0 vpaddq $H1,$T1,$H1 # accumulate input vpaddq $H3,$T3,$H3 vpaddq $H4,$T4,$H4 vmovdqu64 16*0($inp),$T3 # load input vmovdqu64 16*4($inp),$T4 lea 16*8($inp),$inp vpmuludq $H0,$R3,$M3 vpmuludq $H0,$R4,$M4 vpmuludq $H0,$R0,$M0 vpmuludq $H0,$R1,$M1 vpaddq $M3,$D3,$D3 # d3 += h0*r3 vpaddq $M4,$D4,$D4 # d4 += h0*r4 vpaddq $M0,$D0,$D0 # d0 += h0*r0 vpaddq $M1,$D1,$D1 # d1 += h0*r1 vpmuludq $H1,$R2,$M3 vpmuludq $H1,$R3,$M4 vpmuludq $H1,$S4,$M0 vpmuludq $H0,$R2,$M2 vpaddq $M3,$D3,$D3 # d3 += h1*r2 vpaddq $M4,$D4,$D4 # d4 += h1*r3 vpaddq $M0,$D0,$D0 # d0 += h1*s4 vpaddq $M2,$D2,$D2 # d2 += h0*r2 vpunpcklqdq $T4,$T3,$T0 # transpose input vpunpckhqdq $T4,$T3,$T4 vpmuludq $H3,$R0,$M3 vpmuludq $H3,$R1,$M4 vpmuludq $H1,$R0,$M1 vpmuludq $H1,$R1,$M2 vpaddq $M3,$D3,$D3 # d3 += h3*r0 vpaddq $M4,$D4,$D4 # d4 += h3*r1 vpaddq $M1,$D1,$D1 # d1 += h1*r0 vpaddq $M2,$D2,$D2 # d2 += h1*r1 vpmuludq $H4,$S4,$M3 vpmuludq $H4,$R0,$M4 vpmuludq $H3,$S2,$M0 vpmuludq $H3,$S3,$M1 vpaddq $M3,$D3,$D3 # d3 += h4*s4 vpmuludq $H3,$S4,$M2 vpaddq $M4,$D4,$D4 # d4 += h4*r0 vpaddq $M0,$D0,$D0 # d0 += h3*s2 vpaddq $M1,$D1,$D1 # d1 += h3*s3 vpaddq $M2,$D2,$D2 # d2 += h3*s4 vpmuludq $H4,$S1,$M0 vpmuludq $H4,$S2,$M1 vpmuludq 
$H4,$S3,$M2 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 ################################################################ # lazy reduction (interleaved with input splat) vpsrlq \$52,$T0,$T2 # splat input vpsllq \$12,$T4,$T3 vpsrlq \$26,$D3,$H3 vpandq $MASK,$D3,$D3 vpaddq $H3,$D4,$H4 # h3 -> h4 vporq $T3,$T2,$T2 vpsrlq \$26,$H0,$D0 vpandq $MASK,$H0,$H0 vpaddq $D0,$H1,$H1 # h0 -> h1 vpandq $MASK,$T2,$T2 # 2 vpsrlq \$26,$H4,$D4 vpandq $MASK,$H4,$H4 vpsrlq \$26,$H1,$D1 vpandq $MASK,$H1,$H1 vpaddq $D1,$H2,$H2 # h1 -> h2 vpaddq $D4,$H0,$H0 vpsllq \$2,$D4,$D4 vpaddq $D4,$H0,$H0 # h4 -> h0 vpaddq $T2,$H2,$H2 # modulo-scheduled vpsrlq \$26,$T0,$T1 vpsrlq \$26,$H2,$D2 vpandq $MASK,$H2,$H2 vpaddq $D2,$D3,$H3 # h2 -> h3 vpsrlq \$14,$T4,$T3 vpsrlq \$26,$H0,$D0 vpandq $MASK,$H0,$H0 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$40,$T4,$T4 # 4 vpsrlq \$26,$H3,$D3 vpandq $MASK,$H3,$H3 vpaddq $D3,$H4,$H4 # h3 -> h4 vpandq $MASK,$T0,$T0 # 0 #vpandq $MASK,$T1,$T1 # 1 #vpandq $MASK,$T3,$T3 # 3 #vporq $PADBIT,$T4,$T4 # padbit, yes, always sub \$128,$len ja .Loop_avx512 .Ltail_avx512: ################################################################ # while above multiplications were by r^8 in all lanes, in last # iteration we multiply least significant lane by r^8 and most # significant one by r, that's why table gets shifted... vpsrlq \$32,$R0,$R0 # 0105020603070408 vpsrlq \$32,$R1,$R1 vpsrlq \$32,$R2,$R2 vpsrlq \$32,$S3,$S3 vpsrlq \$32,$S4,$S4 vpsrlq \$32,$R3,$R3 vpsrlq \$32,$R4,$R4 vpsrlq \$32,$S1,$S1 vpsrlq \$32,$S2,$S2 ################################################################ # load either next or last 64 byte of input lea ($inp,$len),$inp #vpaddq $H2,$T2,$H2 # accumulate input vpaddq $H0,$T0,$H0 vpmuludq $H2,$R1,$D3 # d3 = h2*r1 vpmuludq $H2,$R2,$D4 # d4 = h2*r2 vpmuludq $H2,$S3,$D0 # d0 = h2*s3 vpandq $MASK,$T1,$T1 # 1 vpmuludq $H2,$S4,$D1 # d1 = h2*s4 vpandq $MASK,$T3,$T3 # 3 vpmuludq $H2,$R0,$D2 # d2 = h2*r0 vporq $PADBIT,$T4,$T4 # padbit, yes, always vpaddq $H1,$T1,$H1 # accumulate input vpaddq $H3,$T3,$H3 vpaddq $H4,$T4,$H4 vmovdqu 16*0($inp),%x#$T0 vpmuludq $H0,$R3,$M3 vpmuludq $H0,$R4,$M4 vpmuludq $H0,$R0,$M0 vpmuludq $H0,$R1,$M1 vpaddq $M3,$D3,$D3 # d3 += h0*r3 vpaddq $M4,$D4,$D4 # d4 += h0*r4 vpaddq $M0,$D0,$D0 # d0 += h0*r0 vpaddq $M1,$D1,$D1 # d1 += h0*r1 vmovdqu 16*1($inp),%x#$T1 vpmuludq $H1,$R2,$M3 vpmuludq $H1,$R3,$M4 vpmuludq $H1,$S4,$M0 vpmuludq $H0,$R2,$M2 vpaddq $M3,$D3,$D3 # d3 += h1*r2 vpaddq $M4,$D4,$D4 # d4 += h1*r3 vpaddq $M0,$D0,$D0 # d0 += h1*s4 vpaddq $M2,$D2,$D2 # d2 += h0*r2 vinserti128 \$1,16*2($inp),%y#$T0,%y#$T0 vpmuludq $H3,$R0,$M3 vpmuludq $H3,$R1,$M4 vpmuludq $H1,$R0,$M1 vpmuludq $H1,$R1,$M2 vpaddq $M3,$D3,$D3 # d3 += h3*r0 vpaddq $M4,$D4,$D4 # d4 += h3*r1 vpaddq $M1,$D1,$D1 # d1 += h1*r0 vpaddq $M2,$D2,$D2 # d2 += h1*r1 vinserti128 \$1,16*3($inp),%y#$T1,%y#$T1 vpmuludq $H4,$S4,$M3 vpmuludq $H4,$R0,$M4 vpmuludq $H3,$S2,$M0 vpmuludq $H3,$S3,$M1 vpmuludq $H3,$S4,$M2 vpaddq $M3,$D3,$H3 # h3 = d3 + h4*s4 vpaddq $M4,$D4,$D4 # d4 += h4*r0 vpaddq $M0,$D0,$D0 # d0 += h3*s2 vpaddq $M1,$D1,$D1 # d1 += h3*s3 vpaddq $M2,$D2,$D2 # d2 += h3*s4 vpmuludq $H4,$S1,$M0 vpmuludq $H4,$S2,$M1 vpmuludq $H4,$S3,$M2 vpaddq $M0,$D0,$H0 # h0 = d0 + h4*s1 vpaddq $M1,$D1,$H1 # h1 = d2 + h4*s2 vpaddq $M2,$D2,$H2 # h2 = d3 + h4*s3 ################################################################ # horizontal addition mov \$1,%eax vpermq \$0xb1,$H3,$D3 vpermq \$0xb1,$D4,$H4 vpermq \$0xb1,$H0,$D0 vpermq \$0xb1,$H1,$D1 vpermq \$0xb1,$H2,$D2 
vpaddq $D3,$H3,$H3 vpaddq $D4,$H4,$H4 vpaddq $D0,$H0,$H0 vpaddq $D1,$H1,$H1 vpaddq $D2,$H2,$H2 kmovw %eax,%k3 vpermq \$0x2,$H3,$D3 vpermq \$0x2,$H4,$D4 vpermq \$0x2,$H0,$D0 vpermq \$0x2,$H1,$D1 vpermq \$0x2,$H2,$D2 vpaddq $D3,$H3,$H3 vpaddq $D4,$H4,$H4 vpaddq $D0,$H0,$H0 vpaddq $D1,$H1,$H1 vpaddq $D2,$H2,$H2 vextracti64x4 \$0x1,$H3,%y#$D3 vextracti64x4 \$0x1,$H4,%y#$D4 vextracti64x4 \$0x1,$H0,%y#$D0 vextracti64x4 \$0x1,$H1,%y#$D1 vextracti64x4 \$0x1,$H2,%y#$D2 vpaddq $D3,$H3,${H3}{%k3}{z} # keep single qword in case vpaddq $D4,$H4,${H4}{%k3}{z} # it's passed to .Ltail_avx2 vpaddq $D0,$H0,${H0}{%k3}{z} vpaddq $D1,$H1,${H1}{%k3}{z} vpaddq $D2,$H2,${H2}{%k3}{z} ___ map(s/%z/%y/,($T0,$T1,$T2,$T3,$T4, $PADBIT)); map(s/%z/%y/,($H0,$H1,$H2,$H3,$H4, $D0,$D1,$D2,$D3,$D4, $MASK)); $code.=<<___; ################################################################ # lazy reduction (interleaved with input splat) vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpsrldq \$6,$T0,$T2 # splat input vpsrldq \$6,$T1,$T3 vpunpckhqdq $T1,$T0,$T4 # 4 vpaddq $D3,$H4,$H4 # h3 -> h4 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpunpcklqdq $T3,$T2,$T2 # 2:3 vpunpcklqdq $T1,$T0,$T0 # 0:1 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$26,$H4,$D4 vpand $MASK,$H4,$H4 vpsrlq \$26,$H1,$D1 vpand $MASK,$H1,$H1 vpsrlq \$30,$T2,$T3 vpsrlq \$4,$T2,$T2 vpaddq $D1,$H2,$H2 # h1 -> h2 vpaddq $D4,$H0,$H0 vpsllq \$2,$D4,$D4 vpsrlq \$26,$T0,$T1 vpsrlq \$40,$T4,$T4 # 4 vpaddq $D4,$H0,$H0 # h4 -> h0 vpsrlq \$26,$H2,$D2 vpand $MASK,$H2,$H2 vpand $MASK,$T2,$T2 # 2 vpand $MASK,$T0,$T0 # 0 vpaddq $D2,$H3,$H3 # h2 -> h3 vpsrlq \$26,$H0,$D0 vpand $MASK,$H0,$H0 vpaddq $H2,$T2,$H2 # accumulate input for .Ltail_avx2 vpand $MASK,$T1,$T1 # 1 vpaddq $D0,$H1,$H1 # h0 -> h1 vpsrlq \$26,$H3,$D3 vpand $MASK,$H3,$H3 vpand $MASK,$T3,$T3 # 3 vpor 32(%rcx),$T4,$T4 # padbit, yes, always vpaddq $D3,$H4,$H4 # h3 -> h4 lea 0x90(%rsp),%rax # size optimization for .Ltail_avx2 add \$64,$len jnz .Ltail_avx2 vpsubq $T2,$H2,$H2 # undo input accumulation vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced vmovd %x#$H1,`4*1-48-64`($ctx) vmovd %x#$H2,`4*2-48-64`($ctx) vmovd %x#$H3,`4*3-48-64`($ctx) vmovd %x#$H4,`4*4-48-64`($ctx) vzeroall ___ $code.=<<___ if ($win64); movdqa 0x50(%r11),%xmm6 movdqa 0x60(%r11),%xmm7 movdqa 0x70(%r11),%xmm8 movdqa 0x80(%r11),%xmm9 movdqa 0x90(%r11),%xmm10 movdqa 0xa0(%r11),%xmm11 movdqa 0xb0(%r11),%xmm12 movdqa 0xc0(%r11),%xmm13 movdqa 0xd0(%r11),%xmm14 movdqa 0xe0(%r11),%xmm15 lea 0xf8(%r11),%rsp .Ldo_avx512_epilogue: ___ $code.=<<___ if (!$win64); lea 8(%r11),%rsp .cfi_def_cfa %rsp,8 ___ $code.=<<___; ret .cfi_endproc .size poly1305_blocks_avx512,.-poly1305_blocks_avx512 ___ if ($avx>3) { ######################################################################## # VPMADD52 version using 2^44 radix. # # One can argue that base 2^52 would be more natural. Well, even though # some operations would be more natural, one has to recognize couple of # things. Base 2^52 doesn't provide advantage over base 2^44 if you look # at amount of multiply-n-accumulate operations. Secondly, it makes it # impossible to pre-compute multiples of 5 [referred to as s[]/sN in # reference implementations], which means that more such operations # would have to be performed in inner loop, which in turn makes critical # path longer. In other words, even though base 2^44 reduction might # look less elegant, overall critical path is actually shorter... ######################################################################## # Layout of opaque area is following. 
# # unsigned __int64 h[3]; # current hash value base 2^44 # unsigned __int64 s[2]; # key value*20 base 2^44 # unsigned __int64 r[3]; # key value base 2^44 # struct { unsigned __int64 r^1, r^3, r^2, r^4; } R[4]; # # r^n positions reflect # # placement in register, not # # memory, R[3] is R[1]*20 $code.=<<___; .type poly1305_init_base2_44,\@function,3 .align 32 poly1305_init_base2_44: .cfi_startproc xor %rax,%rax mov %rax,0($ctx) # initialize hash value mov %rax,8($ctx) mov %rax,16($ctx) .Linit_base2_44: lea poly1305_blocks_vpmadd52(%rip),%r10 lea poly1305_emit_base2_44(%rip),%r11 mov \$0x0ffffffc0fffffff,%rax mov \$0x0ffffffc0ffffffc,%rcx and 0($inp),%rax mov \$0x00000fffffffffff,%r8 and 8($inp),%rcx mov \$0x00000fffffffffff,%r9 and %rax,%r8 shrd \$44,%rcx,%rax mov %r8,40($ctx) # r0 and %r9,%rax shr \$24,%rcx mov %rax,48($ctx) # r1 lea (%rax,%rax,4),%rax # *5 mov %rcx,56($ctx) # r2 shl \$2,%rax # magic <<2 lea (%rcx,%rcx,4),%rcx # *5 shl \$2,%rcx # magic <<2 mov %rax,24($ctx) # s1 mov %rcx,32($ctx) # s2 movq \$-1,64($ctx) # write impossible value ___ $code.=<<___ if ($flavour !~ /elf32/); mov %r10,0(%rdx) mov %r11,8(%rdx) ___ $code.=<<___ if ($flavour =~ /elf32/); mov %r10d,0(%rdx) mov %r11d,4(%rdx) ___ $code.=<<___; mov \$1,%eax ret .cfi_endproc .size poly1305_init_base2_44,.-poly1305_init_base2_44 ___ { my ($H0,$H1,$H2,$r2r1r0,$r1r0s2,$r0s2s1,$Dlo,$Dhi) = map("%ymm$_",(0..5,16,17)); my ($T0,$inp_permd,$inp_shift,$PAD) = map("%ymm$_",(18..21)); my ($reduc_mask,$reduc_rght,$reduc_left) = map("%ymm$_",(22..25)); $code.=<<___; .type poly1305_blocks_vpmadd52,\@function,4 .align 32 poly1305_blocks_vpmadd52: .cfi_startproc shr \$4,$len jz .Lno_data_vpmadd52 # too short shl \$40,$padbit mov 64($ctx),%r8 # peek on power of the key # if powers of the key are not calculated yet, process up to 3 # blocks with this single-block subroutine, otherwise ensure that # length is divisible by 2 blocks and pass the rest down to next # subroutine... mov \$3,%rax mov \$1,%r10 cmp \$4,$len # is input long cmovae %r10,%rax test %r8,%r8 # is power value impossible? cmovns %r10,%rax and $len,%rax # is input of favourable length? 
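# (clarifying note) at this point rax holds the number of leading blocks to
# peel off with the single-block loop below: len%2 once the key powers are
# available (or are about to be computed for a long input), len%4 for a short
# input whose powers have not been computed, in which case the whole input is
# consumed here.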
jz .Lblocks_vpmadd52_4x sub %rax,$len mov \$7,%r10d mov \$1,%r11d kmovw %r10d,%k7 lea .L2_44_inp_permd(%rip),%r10 kmovw %r11d,%k1 vmovq $padbit,%x#$PAD vmovdqa64 0(%r10),$inp_permd # .L2_44_inp_permd vmovdqa64 32(%r10),$inp_shift # .L2_44_inp_shift vpermq \$0xcf,$PAD,$PAD vmovdqa64 64(%r10),$reduc_mask # .L2_44_mask vmovdqu64 0($ctx),${Dlo}{%k7}{z} # load hash value vmovdqu64 40($ctx),${r2r1r0}{%k7}{z} # load keys vmovdqu64 32($ctx),${r1r0s2}{%k7}{z} vmovdqu64 24($ctx),${r0s2s1}{%k7}{z} vmovdqa64 96(%r10),$reduc_rght # .L2_44_shift_rgt vmovdqa64 128(%r10),$reduc_left # .L2_44_shift_lft jmp .Loop_vpmadd52 .align 32 .Loop_vpmadd52: vmovdqu32 0($inp),%x#$T0 # load input as ----3210 lea 16($inp),$inp vpermd $T0,$inp_permd,$T0 # ----3210 -> --322110 vpsrlvq $inp_shift,$T0,$T0 vpandq $reduc_mask,$T0,$T0 vporq $PAD,$T0,$T0 vpaddq $T0,$Dlo,$Dlo # accumulate input vpermq \$0,$Dlo,${H0}{%k7}{z} # smash hash value vpermq \$0b01010101,$Dlo,${H1}{%k7}{z} vpermq \$0b10101010,$Dlo,${H2}{%k7}{z} vpxord $Dlo,$Dlo,$Dlo vpxord $Dhi,$Dhi,$Dhi vpmadd52luq $r2r1r0,$H0,$Dlo vpmadd52huq $r2r1r0,$H0,$Dhi vpmadd52luq $r1r0s2,$H1,$Dlo vpmadd52huq $r1r0s2,$H1,$Dhi vpmadd52luq $r0s2s1,$H2,$Dlo vpmadd52huq $r0s2s1,$H2,$Dhi vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost qword vpsllvq $reduc_left,$Dhi,$Dhi # 0 in topmost qword vpandq $reduc_mask,$Dlo,$Dlo vpaddq $T0,$Dhi,$Dhi vpermq \$0b10010011,$Dhi,$Dhi # 0 in lowest qword vpaddq $Dhi,$Dlo,$Dlo # note topmost qword :-) vpsrlvq $reduc_rght,$Dlo,$T0 # 0 in topmost word vpandq $reduc_mask,$Dlo,$Dlo vpermq \$0b10010011,$T0,$T0 vpaddq $T0,$Dlo,$Dlo vpermq \$0b10010011,$Dlo,${T0}{%k1}{z} vpaddq $T0,$Dlo,$Dlo vpsllq \$2,$T0,$T0 vpaddq $T0,$Dlo,$Dlo dec %rax # len-=16 jnz .Loop_vpmadd52 vmovdqu64 $Dlo,0($ctx){%k7} # store hash value test $len,$len jnz .Lblocks_vpmadd52_4x .Lno_data_vpmadd52: ret .cfi_endproc .size poly1305_blocks_vpmadd52,.-poly1305_blocks_vpmadd52 ___ } { ######################################################################## # As implied by its name 4x subroutine processes 4 blocks in parallel # (but handles even 4*n+2 blocks lengths). It takes up to 4th key power # and is handled in 256-bit %ymm registers. my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); $code.=<<___; .type poly1305_blocks_vpmadd52_4x,\@function,4 .align 32 poly1305_blocks_vpmadd52_4x: .cfi_startproc shr \$4,$len jz .Lno_data_vpmadd52_4x # too short shl \$40,$padbit mov 64($ctx),%r8 # peek on power of the key .Lblocks_vpmadd52_4x: vpbroadcastq $padbit,$PAD vmovdqa64 .Lx_mask44(%rip),$mask44 mov \$5,%eax vmovdqa64 .Lx_mask42(%rip),$mask42 kmovw %eax,%k1 # used in 2x path test %r8,%r8 # is power value impossible? js .Linit_vpmadd52 # if it is, then init R[4] vmovq 0($ctx),%x#$H0 # load current hash value vmovq 8($ctx),%x#$H1 vmovq 16($ctx),%x#$H2 test \$3,$len # is length 4*n+2? jnz .Lblocks_vpmadd52_2x_do .Lblocks_vpmadd52_4x_do: vpbroadcastq 64($ctx),$R0 # load 4th power of the key vpbroadcastq 96($ctx),$R1 vpbroadcastq 128($ctx),$R2 vpbroadcastq 160($ctx),$S1 .Lblocks_vpmadd52_4x_key_loaded: vpsllq \$2,$R2,$S2 # S2 = R2*5*4 vpaddq $R2,$S2,$S2 vpsllq \$2,$S2,$S2 test \$7,$len # is len 8*n? 
jz .Lblocks_vpmadd52_8x vmovdqu64 16*0($inp),$T2 # load data vmovdqu64 16*2($inp),$T3 lea 16*4($inp),$inp vpunpcklqdq $T3,$T2,$T1 # transpose data vpunpckhqdq $T3,$T2,$T3 # at this point 64-bit lanes are ordered as 3-1-2-0 vpsrlq \$24,$T3,$T2 # splat the data vporq $PAD,$T2,$T2 vpaddq $T2,$H2,$H2 # accumulate input vpandq $mask44,$T1,$T0 vpsrlq \$44,$T1,$T1 vpsllq \$20,$T3,$T3 vporq $T3,$T1,$T1 vpandq $mask44,$T1,$T1 sub \$4,$len jz .Ltail_vpmadd52_4x jmp .Loop_vpmadd52_4x ud2 .align 32 .Linit_vpmadd52: vmovq 24($ctx),%x#$S1 # load key vmovq 56($ctx),%x#$H2 vmovq 32($ctx),%x#$S2 vmovq 40($ctx),%x#$R0 vmovq 48($ctx),%x#$R1 vmovdqa $R0,$H0 vmovdqa $R1,$H1 vmovdqa $H2,$R2 mov \$2,%eax .Lmul_init_vpmadd52: vpxorq $D0lo,$D0lo,$D0lo vpmadd52luq $H2,$S1,$D0lo vpxorq $D0hi,$D0hi,$D0hi vpmadd52huq $H2,$S1,$D0hi vpxorq $D1lo,$D1lo,$D1lo vpmadd52luq $H2,$S2,$D1lo vpxorq $D1hi,$D1hi,$D1hi vpmadd52huq $H2,$S2,$D1hi vpxorq $D2lo,$D2lo,$D2lo vpmadd52luq $H2,$R0,$D2lo vpxorq $D2hi,$D2hi,$D2hi vpmadd52huq $H2,$R0,$D2hi vpmadd52luq $H0,$R0,$D0lo vpmadd52huq $H0,$R0,$D0hi vpmadd52luq $H0,$R1,$D1lo vpmadd52huq $H0,$R1,$D1hi vpmadd52luq $H0,$R2,$D2lo vpmadd52huq $H0,$R2,$D2hi vpmadd52luq $H1,$S2,$D0lo vpmadd52huq $H1,$S2,$D0hi vpmadd52luq $H1,$R0,$D1lo vpmadd52huq $H1,$R0,$D1hi vpmadd52luq $H1,$R1,$D2lo vpmadd52huq $H1,$R1,$D2hi ################################################################ # partial reduction vpsrlq \$44,$D0lo,$tmp vpsllq \$8,$D0hi,$D0hi vpandq $mask44,$D0lo,$H0 vpaddq $tmp,$D0hi,$D0hi vpaddq $D0hi,$D1lo,$D1lo vpsrlq \$44,$D1lo,$tmp vpsllq \$8,$D1hi,$D1hi vpandq $mask44,$D1lo,$H1 vpaddq $tmp,$D1hi,$D1hi vpaddq $D1hi,$D2lo,$D2lo vpsrlq \$42,$D2lo,$tmp vpsllq \$10,$D2hi,$D2hi vpandq $mask42,$D2lo,$H2 vpaddq $tmp,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vpsllq \$2,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vpsrlq \$44,$H0,$tmp # additional step vpandq $mask44,$H0,$H0 vpaddq $tmp,$H1,$H1 dec %eax jz .Ldone_init_vpmadd52 vpunpcklqdq $R1,$H1,$R1 # 1,2 vpbroadcastq %x#$H1,%x#$H1 # 2,2 vpunpcklqdq $R2,$H2,$R2 vpbroadcastq %x#$H2,%x#$H2 vpunpcklqdq $R0,$H0,$R0 vpbroadcastq %x#$H0,%x#$H0 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 vpaddq $R1,$S1,$S1 vpaddq $R2,$S2,$S2 vpsllq \$2,$S1,$S1 vpsllq \$2,$S2,$S2 jmp .Lmul_init_vpmadd52 ud2 .align 32 .Ldone_init_vpmadd52: vinserti128 \$1,%x#$R1,$H1,$R1 # 1,2,3,4 vinserti128 \$1,%x#$R2,$H2,$R2 vinserti128 \$1,%x#$R0,$H0,$R0 vpermq \$0b11011000,$R1,$R1 # 1,3,2,4 vpermq \$0b11011000,$R2,$R2 vpermq \$0b11011000,$R0,$R0 vpsllq \$2,$R1,$S1 # S1 = R1*5*4 vpaddq $R1,$S1,$S1 vpsllq \$2,$S1,$S1 vmovq 0($ctx),%x#$H0 # load current hash value vmovq 8($ctx),%x#$H1 vmovq 16($ctx),%x#$H2 test \$3,$len # is length 4*n+2? 
jnz .Ldone_init_vpmadd52_2x vmovdqu64 $R0,64($ctx) # save key powers vpbroadcastq %x#$R0,$R0 # broadcast 4th power vmovdqu64 $R1,96($ctx) vpbroadcastq %x#$R1,$R1 vmovdqu64 $R2,128($ctx) vpbroadcastq %x#$R2,$R2 vmovdqu64 $S1,160($ctx) vpbroadcastq %x#$S1,$S1 jmp .Lblocks_vpmadd52_4x_key_loaded ud2 .align 32 .Ldone_init_vpmadd52_2x: vmovdqu64 $R0,64($ctx) # save key powers vpsrldq \$8,$R0,$R0 # 0-1-0-2 vmovdqu64 $R1,96($ctx) vpsrldq \$8,$R1,$R1 vmovdqu64 $R2,128($ctx) vpsrldq \$8,$R2,$R2 vmovdqu64 $S1,160($ctx) vpsrldq \$8,$S1,$S1 jmp .Lblocks_vpmadd52_2x_key_loaded ud2 .align 32 .Lblocks_vpmadd52_2x_do: vmovdqu64 128+8($ctx),${R2}{%k1}{z}# load 2nd and 1st key powers vmovdqu64 160+8($ctx),${S1}{%k1}{z} vmovdqu64 64+8($ctx),${R0}{%k1}{z} vmovdqu64 96+8($ctx),${R1}{%k1}{z} .Lblocks_vpmadd52_2x_key_loaded: vmovdqu64 16*0($inp),$T2 # load data vpxorq $T3,$T3,$T3 lea 16*2($inp),$inp vpunpcklqdq $T3,$T2,$T1 # transpose data vpunpckhqdq $T3,$T2,$T3 # at this point 64-bit lanes are ordered as x-1-x-0 vpsrlq \$24,$T3,$T2 # splat the data vporq $PAD,$T2,$T2 vpaddq $T2,$H2,$H2 # accumulate input vpandq $mask44,$T1,$T0 vpsrlq \$44,$T1,$T1 vpsllq \$20,$T3,$T3 vporq $T3,$T1,$T1 vpandq $mask44,$T1,$T1 jmp .Ltail_vpmadd52_2x ud2 .align 32 .Loop_vpmadd52_4x: #vpaddq $T2,$H2,$H2 # accumulate input vpaddq $T0,$H0,$H0 vpaddq $T1,$H1,$H1 vpxorq $D0lo,$D0lo,$D0lo vpmadd52luq $H2,$S1,$D0lo vpxorq $D0hi,$D0hi,$D0hi vpmadd52huq $H2,$S1,$D0hi vpxorq $D1lo,$D1lo,$D1lo vpmadd52luq $H2,$S2,$D1lo vpxorq $D1hi,$D1hi,$D1hi vpmadd52huq $H2,$S2,$D1hi vpxorq $D2lo,$D2lo,$D2lo vpmadd52luq $H2,$R0,$D2lo vpxorq $D2hi,$D2hi,$D2hi vpmadd52huq $H2,$R0,$D2hi vmovdqu64 16*0($inp),$T2 # load data vmovdqu64 16*2($inp),$T3 lea 16*4($inp),$inp vpmadd52luq $H0,$R0,$D0lo vpmadd52huq $H0,$R0,$D0hi vpmadd52luq $H0,$R1,$D1lo vpmadd52huq $H0,$R1,$D1hi vpmadd52luq $H0,$R2,$D2lo vpmadd52huq $H0,$R2,$D2hi vpunpcklqdq $T3,$T2,$T1 # transpose data vpunpckhqdq $T3,$T2,$T3 vpmadd52luq $H1,$S2,$D0lo vpmadd52huq $H1,$S2,$D0hi vpmadd52luq $H1,$R0,$D1lo vpmadd52huq $H1,$R0,$D1hi vpmadd52luq $H1,$R1,$D2lo vpmadd52huq $H1,$R1,$D2hi ################################################################ # partial reduction (interleaved with data splat) vpsrlq \$44,$D0lo,$tmp vpsllq \$8,$D0hi,$D0hi vpandq $mask44,$D0lo,$H0 vpaddq $tmp,$D0hi,$D0hi vpsrlq \$24,$T3,$T2 vporq $PAD,$T2,$T2 vpaddq $D0hi,$D1lo,$D1lo vpsrlq \$44,$D1lo,$tmp vpsllq \$8,$D1hi,$D1hi vpandq $mask44,$D1lo,$H1 vpaddq $tmp,$D1hi,$D1hi vpandq $mask44,$T1,$T0 vpsrlq \$44,$T1,$T1 vpsllq \$20,$T3,$T3 vpaddq $D1hi,$D2lo,$D2lo vpsrlq \$42,$D2lo,$tmp vpsllq \$10,$D2hi,$D2hi vpandq $mask42,$D2lo,$H2 vpaddq $tmp,$D2hi,$D2hi vpaddq $T2,$H2,$H2 # accumulate input vpaddq $D2hi,$H0,$H0 vpsllq \$2,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vporq $T3,$T1,$T1 vpandq $mask44,$T1,$T1 vpsrlq \$44,$H0,$tmp # additional step vpandq $mask44,$H0,$H0 vpaddq $tmp,$H1,$H1 sub \$4,$len # len-=64 jnz .Loop_vpmadd52_4x .Ltail_vpmadd52_4x: vmovdqu64 128($ctx),$R2 # load all key powers vmovdqu64 160($ctx),$S1 vmovdqu64 64($ctx),$R0 vmovdqu64 96($ctx),$R1 .Ltail_vpmadd52_2x: vpsllq \$2,$R2,$S2 # S2 = R2*5*4 vpaddq $R2,$S2,$S2 vpsllq \$2,$S2,$S2 #vpaddq $T2,$H2,$H2 # accumulate input vpaddq $T0,$H0,$H0 vpaddq $T1,$H1,$H1 vpxorq $D0lo,$D0lo,$D0lo vpmadd52luq $H2,$S1,$D0lo vpxorq $D0hi,$D0hi,$D0hi vpmadd52huq $H2,$S1,$D0hi vpxorq $D1lo,$D1lo,$D1lo vpmadd52luq $H2,$S2,$D1lo vpxorq $D1hi,$D1hi,$D1hi vpmadd52huq $H2,$S2,$D1hi vpxorq $D2lo,$D2lo,$D2lo vpmadd52luq $H2,$R0,$D2lo vpxorq $D2hi,$D2hi,$D2hi vpmadd52huq $H2,$R0,$D2hi vpmadd52luq 
$H0,$R0,$D0lo vpmadd52huq $H0,$R0,$D0hi vpmadd52luq $H0,$R1,$D1lo vpmadd52huq $H0,$R1,$D1hi vpmadd52luq $H0,$R2,$D2lo vpmadd52huq $H0,$R2,$D2hi vpmadd52luq $H1,$S2,$D0lo vpmadd52huq $H1,$S2,$D0hi vpmadd52luq $H1,$R0,$D1lo vpmadd52huq $H1,$R0,$D1hi vpmadd52luq $H1,$R1,$D2lo vpmadd52huq $H1,$R1,$D2hi ################################################################ # horizontal addition mov \$1,%eax kmovw %eax,%k1 vpsrldq \$8,$D0lo,$T0 vpsrldq \$8,$D0hi,$H0 vpsrldq \$8,$D1lo,$T1 vpsrldq \$8,$D1hi,$H1 vpaddq $T0,$D0lo,$D0lo vpaddq $H0,$D0hi,$D0hi vpsrldq \$8,$D2lo,$T2 vpsrldq \$8,$D2hi,$H2 vpaddq $T1,$D1lo,$D1lo vpaddq $H1,$D1hi,$D1hi vpermq \$0x2,$D0lo,$T0 vpermq \$0x2,$D0hi,$H0 vpaddq $T2,$D2lo,$D2lo vpaddq $H2,$D2hi,$D2hi vpermq \$0x2,$D1lo,$T1 vpermq \$0x2,$D1hi,$H1 vpaddq $T0,$D0lo,${D0lo}{%k1}{z} vpaddq $H0,$D0hi,${D0hi}{%k1}{z} vpermq \$0x2,$D2lo,$T2 vpermq \$0x2,$D2hi,$H2 vpaddq $T1,$D1lo,${D1lo}{%k1}{z} vpaddq $H1,$D1hi,${D1hi}{%k1}{z} vpaddq $T2,$D2lo,${D2lo}{%k1}{z} vpaddq $H2,$D2hi,${D2hi}{%k1}{z} ################################################################ # partial reduction vpsrlq \$44,$D0lo,$tmp vpsllq \$8,$D0hi,$D0hi vpandq $mask44,$D0lo,$H0 vpaddq $tmp,$D0hi,$D0hi vpaddq $D0hi,$D1lo,$D1lo vpsrlq \$44,$D1lo,$tmp vpsllq \$8,$D1hi,$D1hi vpandq $mask44,$D1lo,$H1 vpaddq $tmp,$D1hi,$D1hi vpaddq $D1hi,$D2lo,$D2lo vpsrlq \$42,$D2lo,$tmp vpsllq \$10,$D2hi,$D2hi vpandq $mask42,$D2lo,$H2 vpaddq $tmp,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vpsllq \$2,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vpsrlq \$44,$H0,$tmp # additional step vpandq $mask44,$H0,$H0 vpaddq $tmp,$H1,$H1 # at this point $len is # either 4*n+2 or 0... sub \$2,$len # len-=32 ja .Lblocks_vpmadd52_4x_do vmovq %x#$H0,0($ctx) vmovq %x#$H1,8($ctx) vmovq %x#$H2,16($ctx) vzeroall .Lno_data_vpmadd52_4x: ret .cfi_endproc .size poly1305_blocks_vpmadd52_4x,.-poly1305_blocks_vpmadd52_4x ___ } { ######################################################################## # As implied by its name 8x subroutine processes 8 blocks in parallel... # This is intermediate version, as it's used only in cases when input # length is either 8*n, 8*n+1 or 8*n+2... my ($H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2) = map("%ymm$_",(0..5,16,17)); my ($D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi) = map("%ymm$_",(18..23)); my ($T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD) = map("%ymm$_",(24..31)); my ($RR0,$RR1,$RR2,$SS1,$SS2) = map("%ymm$_",(6..10)); $code.=<<___; .type poly1305_blocks_vpmadd52_8x,\@function,4 .align 32 poly1305_blocks_vpmadd52_8x: .cfi_startproc shr \$4,$len jz .Lno_data_vpmadd52_8x # too short shl \$40,$padbit mov 64($ctx),%r8 # peek on power of the key vmovdqa64 .Lx_mask44(%rip),$mask44 vmovdqa64 .Lx_mask42(%rip),$mask42 test %r8,%r8 # is power value impossible? 
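# (clarifying note) poly1305_init_base2_44 seeds this qword with -1; once the
# key powers have been computed it holds the low 44-bit limb of r^4, so a
# negative value simply means R[4] has not been populated yet.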
js .Linit_vpmadd52 # if it is, then init R[4] vmovq 0($ctx),%x#$H0 # load current hash value vmovq 8($ctx),%x#$H1 vmovq 16($ctx),%x#$H2 .Lblocks_vpmadd52_8x: ################################################################ # fist we calculate more key powers vmovdqu64 128($ctx),$R2 # load 1-3-2-4 powers vmovdqu64 160($ctx),$S1 vmovdqu64 64($ctx),$R0 vmovdqu64 96($ctx),$R1 vpsllq \$2,$R2,$S2 # S2 = R2*5*4 vpaddq $R2,$S2,$S2 vpsllq \$2,$S2,$S2 vpbroadcastq %x#$R2,$RR2 # broadcast 4th power vpbroadcastq %x#$R0,$RR0 vpbroadcastq %x#$R1,$RR1 vpxorq $D0lo,$D0lo,$D0lo vpmadd52luq $RR2,$S1,$D0lo vpxorq $D0hi,$D0hi,$D0hi vpmadd52huq $RR2,$S1,$D0hi vpxorq $D1lo,$D1lo,$D1lo vpmadd52luq $RR2,$S2,$D1lo vpxorq $D1hi,$D1hi,$D1hi vpmadd52huq $RR2,$S2,$D1hi vpxorq $D2lo,$D2lo,$D2lo vpmadd52luq $RR2,$R0,$D2lo vpxorq $D2hi,$D2hi,$D2hi vpmadd52huq $RR2,$R0,$D2hi vpmadd52luq $RR0,$R0,$D0lo vpmadd52huq $RR0,$R0,$D0hi vpmadd52luq $RR0,$R1,$D1lo vpmadd52huq $RR0,$R1,$D1hi vpmadd52luq $RR0,$R2,$D2lo vpmadd52huq $RR0,$R2,$D2hi vpmadd52luq $RR1,$S2,$D0lo vpmadd52huq $RR1,$S2,$D0hi vpmadd52luq $RR1,$R0,$D1lo vpmadd52huq $RR1,$R0,$D1hi vpmadd52luq $RR1,$R1,$D2lo vpmadd52huq $RR1,$R1,$D2hi ################################################################ # partial reduction vpsrlq \$44,$D0lo,$tmp vpsllq \$8,$D0hi,$D0hi vpandq $mask44,$D0lo,$RR0 vpaddq $tmp,$D0hi,$D0hi vpaddq $D0hi,$D1lo,$D1lo vpsrlq \$44,$D1lo,$tmp vpsllq \$8,$D1hi,$D1hi vpandq $mask44,$D1lo,$RR1 vpaddq $tmp,$D1hi,$D1hi vpaddq $D1hi,$D2lo,$D2lo vpsrlq \$42,$D2lo,$tmp vpsllq \$10,$D2hi,$D2hi vpandq $mask42,$D2lo,$RR2 vpaddq $tmp,$D2hi,$D2hi vpaddq $D2hi,$RR0,$RR0 vpsllq \$2,$D2hi,$D2hi vpaddq $D2hi,$RR0,$RR0 vpsrlq \$44,$RR0,$tmp # additional step vpandq $mask44,$RR0,$RR0 vpaddq $tmp,$RR1,$RR1 ################################################################ # At this point Rx holds 1324 powers, RRx - 5768, and the goal # is 15263748, which reflects how data is loaded... 
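# (worked example of the shuffle below, lanes written least significant
# first) Rx holds r^4,r^2,r^3,r^1 and RRx holds r^8,r^6,r^7,r^5;
# vpunpcklqdq picks the even qwords giving r^8,r^4,r^7,r^3 ("3748"),
# vpunpckhqdq the odd ones giving r^6,r^2,r^5,r^1 ("1526"), and vshufi64x2
# stacks the two halves so that lane k of the resulting zmm register carries
# the power that multiplies the data block loaded into lane k.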
vpunpcklqdq $R2,$RR2,$T2 # 3748 vpunpckhqdq $R2,$RR2,$R2 # 1526 vpunpcklqdq $R0,$RR0,$T0 vpunpckhqdq $R0,$RR0,$R0 vpunpcklqdq $R1,$RR1,$T1 vpunpckhqdq $R1,$RR1,$R1 ___ ######## switch to %zmm map(s/%y/%z/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); map(s/%y/%z/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); map(s/%y/%z/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); map(s/%y/%z/, $RR0,$RR1,$RR2,$SS1,$SS2); $code.=<<___; vshufi64x2 \$0x44,$R2,$T2,$RR2 # 15263748 vshufi64x2 \$0x44,$R0,$T0,$RR0 vshufi64x2 \$0x44,$R1,$T1,$RR1 vmovdqu64 16*0($inp),$T2 # load data vmovdqu64 16*4($inp),$T3 lea 16*8($inp),$inp vpsllq \$2,$RR2,$SS2 # S2 = R2*5*4 vpsllq \$2,$RR1,$SS1 # S1 = R1*5*4 vpaddq $RR2,$SS2,$SS2 vpaddq $RR1,$SS1,$SS1 vpsllq \$2,$SS2,$SS2 vpsllq \$2,$SS1,$SS1 vpbroadcastq $padbit,$PAD vpbroadcastq %x#$mask44,$mask44 vpbroadcastq %x#$mask42,$mask42 vpbroadcastq %x#$SS1,$S1 # broadcast 8th power vpbroadcastq %x#$SS2,$S2 vpbroadcastq %x#$RR0,$R0 vpbroadcastq %x#$RR1,$R1 vpbroadcastq %x#$RR2,$R2 vpunpcklqdq $T3,$T2,$T1 # transpose data vpunpckhqdq $T3,$T2,$T3 # at this point 64-bit lanes are ordered as 73625140 vpsrlq \$24,$T3,$T2 # splat the data vporq $PAD,$T2,$T2 vpaddq $T2,$H2,$H2 # accumulate input vpandq $mask44,$T1,$T0 vpsrlq \$44,$T1,$T1 vpsllq \$20,$T3,$T3 vporq $T3,$T1,$T1 vpandq $mask44,$T1,$T1 sub \$8,$len jz .Ltail_vpmadd52_8x jmp .Loop_vpmadd52_8x .align 32 .Loop_vpmadd52_8x: #vpaddq $T2,$H2,$H2 # accumulate input vpaddq $T0,$H0,$H0 vpaddq $T1,$H1,$H1 vpxorq $D0lo,$D0lo,$D0lo vpmadd52luq $H2,$S1,$D0lo vpxorq $D0hi,$D0hi,$D0hi vpmadd52huq $H2,$S1,$D0hi vpxorq $D1lo,$D1lo,$D1lo vpmadd52luq $H2,$S2,$D1lo vpxorq $D1hi,$D1hi,$D1hi vpmadd52huq $H2,$S2,$D1hi vpxorq $D2lo,$D2lo,$D2lo vpmadd52luq $H2,$R0,$D2lo vpxorq $D2hi,$D2hi,$D2hi vpmadd52huq $H2,$R0,$D2hi vmovdqu64 16*0($inp),$T2 # load data vmovdqu64 16*4($inp),$T3 lea 16*8($inp),$inp vpmadd52luq $H0,$R0,$D0lo vpmadd52huq $H0,$R0,$D0hi vpmadd52luq $H0,$R1,$D1lo vpmadd52huq $H0,$R1,$D1hi vpmadd52luq $H0,$R2,$D2lo vpmadd52huq $H0,$R2,$D2hi vpunpcklqdq $T3,$T2,$T1 # transpose data vpunpckhqdq $T3,$T2,$T3 vpmadd52luq $H1,$S2,$D0lo vpmadd52huq $H1,$S2,$D0hi vpmadd52luq $H1,$R0,$D1lo vpmadd52huq $H1,$R0,$D1hi vpmadd52luq $H1,$R1,$D2lo vpmadd52huq $H1,$R1,$D2hi ################################################################ # partial reduction (interleaved with data splat) vpsrlq \$44,$D0lo,$tmp vpsllq \$8,$D0hi,$D0hi vpandq $mask44,$D0lo,$H0 vpaddq $tmp,$D0hi,$D0hi vpsrlq \$24,$T3,$T2 vporq $PAD,$T2,$T2 vpaddq $D0hi,$D1lo,$D1lo vpsrlq \$44,$D1lo,$tmp vpsllq \$8,$D1hi,$D1hi vpandq $mask44,$D1lo,$H1 vpaddq $tmp,$D1hi,$D1hi vpandq $mask44,$T1,$T0 vpsrlq \$44,$T1,$T1 vpsllq \$20,$T3,$T3 vpaddq $D1hi,$D2lo,$D2lo vpsrlq \$42,$D2lo,$tmp vpsllq \$10,$D2hi,$D2hi vpandq $mask42,$D2lo,$H2 vpaddq $tmp,$D2hi,$D2hi vpaddq $T2,$H2,$H2 # accumulate input vpaddq $D2hi,$H0,$H0 vpsllq \$2,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vporq $T3,$T1,$T1 vpandq $mask44,$T1,$T1 vpsrlq \$44,$H0,$tmp # additional step vpandq $mask44,$H0,$H0 vpaddq $tmp,$H1,$H1 sub \$8,$len # len-=128 jnz .Loop_vpmadd52_8x .Ltail_vpmadd52_8x: #vpaddq $T2,$H2,$H2 # accumulate input vpaddq $T0,$H0,$H0 vpaddq $T1,$H1,$H1 vpxorq $D0lo,$D0lo,$D0lo vpmadd52luq $H2,$SS1,$D0lo vpxorq $D0hi,$D0hi,$D0hi vpmadd52huq $H2,$SS1,$D0hi vpxorq $D1lo,$D1lo,$D1lo vpmadd52luq $H2,$SS2,$D1lo vpxorq $D1hi,$D1hi,$D1hi vpmadd52huq $H2,$SS2,$D1hi vpxorq $D2lo,$D2lo,$D2lo vpmadd52luq $H2,$RR0,$D2lo vpxorq $D2hi,$D2hi,$D2hi vpmadd52huq $H2,$RR0,$D2hi vpmadd52luq $H0,$RR0,$D0lo vpmadd52huq $H0,$RR0,$D0hi vpmadd52luq $H0,$RR1,$D1lo vpmadd52huq 
$H0,$RR1,$D1hi vpmadd52luq $H0,$RR2,$D2lo vpmadd52huq $H0,$RR2,$D2hi vpmadd52luq $H1,$SS2,$D0lo vpmadd52huq $H1,$SS2,$D0hi vpmadd52luq $H1,$RR0,$D1lo vpmadd52huq $H1,$RR0,$D1hi vpmadd52luq $H1,$RR1,$D2lo vpmadd52huq $H1,$RR1,$D2hi ################################################################ # horizontal addition mov \$1,%eax kmovw %eax,%k1 vpsrldq \$8,$D0lo,$T0 vpsrldq \$8,$D0hi,$H0 vpsrldq \$8,$D1lo,$T1 vpsrldq \$8,$D1hi,$H1 vpaddq $T0,$D0lo,$D0lo vpaddq $H0,$D0hi,$D0hi vpsrldq \$8,$D2lo,$T2 vpsrldq \$8,$D2hi,$H2 vpaddq $T1,$D1lo,$D1lo vpaddq $H1,$D1hi,$D1hi vpermq \$0x2,$D0lo,$T0 vpermq \$0x2,$D0hi,$H0 vpaddq $T2,$D2lo,$D2lo vpaddq $H2,$D2hi,$D2hi vpermq \$0x2,$D1lo,$T1 vpermq \$0x2,$D1hi,$H1 vpaddq $T0,$D0lo,$D0lo vpaddq $H0,$D0hi,$D0hi vpermq \$0x2,$D2lo,$T2 vpermq \$0x2,$D2hi,$H2 vpaddq $T1,$D1lo,$D1lo vpaddq $H1,$D1hi,$D1hi vextracti64x4 \$1,$D0lo,%y#$T0 vextracti64x4 \$1,$D0hi,%y#$H0 vpaddq $T2,$D2lo,$D2lo vpaddq $H2,$D2hi,$D2hi vextracti64x4 \$1,$D1lo,%y#$T1 vextracti64x4 \$1,$D1hi,%y#$H1 vextracti64x4 \$1,$D2lo,%y#$T2 vextracti64x4 \$1,$D2hi,%y#$H2 ___ ######## switch back to %ymm map(s/%z/%y/, $H0,$H1,$H2,$R0,$R1,$R2,$S1,$S2); map(s/%z/%y/, $D0lo,$D0hi,$D1lo,$D1hi,$D2lo,$D2hi); map(s/%z/%y/, $T0,$T1,$T2,$T3,$mask44,$mask42,$tmp,$PAD); $code.=<<___; vpaddq $T0,$D0lo,${D0lo}{%k1}{z} vpaddq $H0,$D0hi,${D0hi}{%k1}{z} vpaddq $T1,$D1lo,${D1lo}{%k1}{z} vpaddq $H1,$D1hi,${D1hi}{%k1}{z} vpaddq $T2,$D2lo,${D2lo}{%k1}{z} vpaddq $H2,$D2hi,${D2hi}{%k1}{z} ################################################################ # partial reduction vpsrlq \$44,$D0lo,$tmp vpsllq \$8,$D0hi,$D0hi vpandq $mask44,$D0lo,$H0 vpaddq $tmp,$D0hi,$D0hi vpaddq $D0hi,$D1lo,$D1lo vpsrlq \$44,$D1lo,$tmp vpsllq \$8,$D1hi,$D1hi vpandq $mask44,$D1lo,$H1 vpaddq $tmp,$D1hi,$D1hi vpaddq $D1hi,$D2lo,$D2lo vpsrlq \$42,$D2lo,$tmp vpsllq \$10,$D2hi,$D2hi vpandq $mask42,$D2lo,$H2 vpaddq $tmp,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vpsllq \$2,$D2hi,$D2hi vpaddq $D2hi,$H0,$H0 vpsrlq \$44,$H0,$tmp # additional step vpandq $mask44,$H0,$H0 vpaddq $tmp,$H1,$H1 ################################################################ vmovq %x#$H0,0($ctx) vmovq %x#$H1,8($ctx) vmovq %x#$H2,16($ctx) vzeroall .Lno_data_vpmadd52_8x: ret .cfi_endproc .size poly1305_blocks_vpmadd52_8x,.-poly1305_blocks_vpmadd52_8x ___ } $code.=<<___; .type poly1305_emit_base2_44,\@function,3 .align 32 poly1305_emit_base2_44: .cfi_startproc mov 0($ctx),%r8 # load hash value mov 8($ctx),%r9 mov 16($ctx),%r10 mov %r9,%rax shr \$20,%r9 shl \$44,%rax mov %r10,%rcx shr \$40,%r10 shl \$24,%rcx add %rax,%r8 adc %rcx,%r9 adc \$0,%r10 mov %r8,%rax add \$5,%r8 # compare to modulus mov %r9,%rcx adc \$0,%r9 adc \$0,%r10 shr \$2,%r10 # did 130-bit value overflow? 
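# (clarifying note) r8:r9 now hold the low 128 bits of h+5 and the shift
# tests bit 130: if h+5 carried into bit 130, then h >= p = 2^130-5 and the
# properly reduced value is h+5 modulo 2^130 truncated to 128 bits, hence the
# conditional moves below; otherwise h itself is already fully reduced.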
cmovnz %r8,%rax cmovnz %r9,%rcx add 0($nonce),%rax # accumulate nonce adc 8($nonce),%rcx mov %rax,0($mac) # write result mov %rcx,8($mac) ret .cfi_endproc .size poly1305_emit_base2_44,.-poly1305_emit_base2_44 ___ } } } $code.=<<___; .align 64 .Lconst: .Lmask24: .long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 .L129: .long `1<<24`,0,`1<<24`,0,`1<<24`,0,`1<<24`,0 .Lmask26: .long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 .Lpermd_avx2: .long 2,2,2,3,2,0,2,1 .Lpermd_avx512: .long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 .L2_44_inp_permd: .long 0,1,1,2,2,3,7,7 .L2_44_inp_shift: .quad 0,12,24,64 .L2_44_mask: .quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff .L2_44_shift_rgt: .quad 44,44,42,64 .L2_44_shift_lft: .quad 8,8,10,64 .align 64 .Lx_mask44: .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff .quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff .Lx_mask42: .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff .quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff ___ } $code.=<<___; .asciz "Poly1305 for x86_64, CRYPTOGAMS by " .align 16 ___ { # chacha20-poly1305 helpers my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") : # Win64 order ("%rdi","%rsi","%rdx","%rcx"); # Unix order $code.=<<___; .globl xor128_encrypt_n_pad .type xor128_encrypt_n_pad,\@abi-omnipotent .align 16 xor128_encrypt_n_pad: .cfi_startproc sub $otp,$inp sub $otp,$out mov $len,%r10 # put len aside shr \$4,$len # len / 16 jz .Ltail_enc nop .Loop_enc_xmm: movdqu ($inp,$otp),%xmm0 pxor ($otp),%xmm0 movdqu %xmm0,($out,$otp) movdqa %xmm0,($otp) lea 16($otp),$otp dec $len jnz .Loop_enc_xmm and \$15,%r10 # len % 16 jz .Ldone_enc .Ltail_enc: mov \$16,$len sub %r10,$len xor %eax,%eax .Loop_enc_byte: mov ($inp,$otp),%al xor ($otp),%al mov %al,($out,$otp) mov %al,($otp) lea 1($otp),$otp dec %r10 jnz .Loop_enc_byte xor %eax,%eax .Loop_enc_pad: mov %al,($otp) lea 1($otp),$otp dec $len jnz .Loop_enc_pad .Ldone_enc: mov $otp,%rax ret .cfi_endproc .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad .globl xor128_decrypt_n_pad .type xor128_decrypt_n_pad,\@abi-omnipotent .align 16 xor128_decrypt_n_pad: .cfi_startproc sub $otp,$inp sub $otp,$out mov $len,%r10 # put len aside shr \$4,$len # len / 16 jz .Ltail_dec nop .Loop_dec_xmm: movdqu ($inp,$otp),%xmm0 movdqa ($otp),%xmm1 pxor %xmm0,%xmm1 movdqu %xmm1,($out,$otp) movdqa %xmm0,($otp) lea 16($otp),$otp dec $len jnz .Loop_dec_xmm pxor %xmm1,%xmm1 and \$15,%r10 # len % 16 jz .Ldone_dec .Ltail_dec: mov \$16,$len sub %r10,$len xor %eax,%eax xor %r11,%r11 .Loop_dec_byte: mov ($inp,$otp),%r11b mov ($otp),%al xor %r11b,%al mov %al,($out,$otp) mov %r11b,($otp) lea 1($otp),$otp dec %r10 jnz .Loop_dec_byte xor %eax,%eax .Loop_dec_pad: mov %al,($otp) lea 1($otp),$otp dec $len jnz .Loop_dec_pad .Ldone_dec: mov $otp,%rax ret .cfi_endproc .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad ___ } # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # 
context->Rip<.Lprologue jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lcommon_seh_tail lea 48(%rax),%rax mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R14 jmp .Lcommon_seh_tail .size se_handler,.-se_handler .type avx_handler,\@abi-omnipotent .align 16 avx_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail mov 208($context),%rax # pull context->R11 lea 0x50(%rax),%rsi lea 0xf8(%rax),%rax lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size avx_handler,.-avx_handler .section .pdata .align 4 .rva .LSEH_begin_poly1305_init .rva .LSEH_end_poly1305_init .rva .LSEH_info_poly1305_init .rva .LSEH_begin_poly1305_blocks .rva .LSEH_end_poly1305_blocks .rva .LSEH_info_poly1305_blocks .rva .LSEH_begin_poly1305_emit .rva .LSEH_end_poly1305_emit .rva .LSEH_info_poly1305_emit ___ $code.=<<___ if ($avx); .rva .LSEH_begin_poly1305_blocks_avx .rva .Lbase2_64_avx .rva .LSEH_info_poly1305_blocks_avx_1 .rva .Lbase2_64_avx .rva .Leven_avx .rva .LSEH_info_poly1305_blocks_avx_2 .rva .Leven_avx .rva .LSEH_end_poly1305_blocks_avx .rva .LSEH_info_poly1305_blocks_avx_3 .rva .LSEH_begin_poly1305_emit_avx .rva .LSEH_end_poly1305_emit_avx .rva .LSEH_info_poly1305_emit_avx ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_poly1305_blocks_avx2 .rva .Lbase2_64_avx2 .rva .LSEH_info_poly1305_blocks_avx2_1 .rva .Lbase2_64_avx2 .rva .Leven_avx2 .rva .LSEH_info_poly1305_blocks_avx2_2 .rva .Leven_avx2 .rva .LSEH_end_poly1305_blocks_avx2 .rva .LSEH_info_poly1305_blocks_avx2_3 ___ $code.=<<___ if ($avx>2); .rva .LSEH_begin_poly1305_blocks_avx512 .rva .LSEH_end_poly1305_blocks_avx512 .rva .LSEH_info_poly1305_blocks_avx512 ___ $code.=<<___; .section .xdata .align 8 
.LSEH_info_poly1305_init: .byte 9,0,0,0 .rva se_handler .rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init .LSEH_info_poly1305_blocks: .byte 9,0,0,0 .rva se_handler .rva .Lblocks_body,.Lblocks_epilogue .LSEH_info_poly1305_emit: .byte 9,0,0,0 .rva se_handler .rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit ___ $code.=<<___ if ($avx); .LSEH_info_poly1305_blocks_avx_1: .byte 9,0,0,0 .rva se_handler .rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[] .LSEH_info_poly1305_blocks_avx_2: .byte 9,0,0,0 .rva se_handler .rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[] .LSEH_info_poly1305_blocks_avx_3: .byte 9,0,0,0 .rva avx_handler .rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[] .LSEH_info_poly1305_emit_avx: .byte 9,0,0,0 .rva se_handler .rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx ___ $code.=<<___ if ($avx>1); .LSEH_info_poly1305_blocks_avx2_1: .byte 9,0,0,0 .rva se_handler .rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[] .LSEH_info_poly1305_blocks_avx2_2: .byte 9,0,0,0 .rva se_handler .rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[] .LSEH_info_poly1305_blocks_avx2_3: .byte 9,0,0,0 .rva avx_handler .rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[] ___ $code.=<<___ if ($avx>2); .LSEH_info_poly1305_blocks_avx512: .byte 9,0,0,0 .rva avx_handler .rva .Ldo_avx512_body,.Ldo_avx512_epilogue # HandlerData[] ___ } foreach (split('\n',$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/%r([a-z]+)#d/%e$1/g; s/%r([0-9]+)#d/%r$1d/g; s/%x#%[yz]/%x/g or s/%y#%z/%y/g or s/%z#%[yz]/%z/g; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/sha/asm/sha1-586.pl =================================================================== --- stable/12/crypto/openssl/crypto/sha/asm/sha1-586.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/sha/asm/sha1-586.pl (revision 364963) @@ -1,1491 +1,1491 @@ #! /usr/bin/env perl # Copyright 1998-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # [Re]written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # "[Re]written" was achieved in two major overhauls. In 2004 BODY_* # functions were re-implemented to address P4 performance issue [see # commentary below], and in 2006 the rest was rewritten in order to # gain freedom to liberate licensing terms. # January, September 2004. # # It was noted that Intel IA-32 C compiler generates code which # performs ~30% *faster* on P4 CPU than original *hand-coded* # SHA1 assembler implementation. To address this problem (and # prove that humans are still better than machines:-), the # original code was overhauled, which resulted in following # performance changes: # # compared with original compared with Intel cc # assembler impl. generated code # Pentium -16% +48% # PIII/AMD +8% +16% # P4 +85%(!) 
+45% # # As you can see Pentium came out as looser:-( Yet I reckoned that # improvement on P4 outweighs the loss and incorporate this # re-tuned code to 0.9.7 and later. # ---------------------------------------------------------------- # August 2009. # # George Spelvin has tipped that F_40_59(b,c,d) can be rewritten as # '(c&d) + (b&(c^d))', which allows to accumulate partial results # and lighten "pressure" on scratch registers. This resulted in # >12% performance improvement on contemporary AMD cores (with no # degradation on other CPUs:-). Also, the code was revised to maximize # "distance" between instructions producing input to 'lea' instruction # and the 'lea' instruction itself, which is essential for Intel Atom # core and resulted in ~15% improvement. # October 2010. # # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it # is to offload message schedule denoted by Wt in NIST specification, # or Xupdate in OpenSSL source, to SIMD unit. The idea is not novel, # and in SSE2 context was first explored by Dean Gaudet in 2004, see # http://arctic.org/~dean/crypto/sha1.html. Since then several things # have changed that made it interesting again: # # a) XMM units became faster and wider; # b) instruction set became more versatile; # c) an important observation was made by Max Locktykhin, which made # it possible to reduce amount of instructions required to perform # the operation in question, for further details see # http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/. # April 2011. # # Add AVX code path, probably most controversial... The thing is that # switch to AVX alone improves performance by as little as 4% in # comparison to SSSE3 code path. But below result doesn't look like # 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as # pair of µ-ops, and it's the additional µ-ops, two per round, that # make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded # as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with # equivalent 'sh[rl]d' that is responsible for the impressive 5.1 # cycles per processed byte. But 'sh[rl]d' is not something that used # to be fast, nor does it appear to be fast in upcoming Bulldozer # [according to its optimization manual]. Which is why AVX code path # is guarded by *both* AVX and synthetic bit denoting Intel CPUs. # One can argue that it's unfair to AMD, but without 'sh[rl]d' it # makes no sense to keep the AVX code path. If somebody feels that # strongly, it's probably more appropriate to discuss possibility of # using vector rotate XOP on AMD... # March 2014. # # Add support for Intel SHA Extensions. ###################################################################### # Current performance is summarized in following table. Numbers are # CPU clock cycles spent to process single byte (less is better). # # x86 SSSE3 AVX # Pentium 15.7 - # PIII 11.5 - # P4 10.6 - # AMD K8 7.1 - # Core2 7.3 6.0/+22% - # Westmere 7.3 5.5/+33% - # Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73% # Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% # Haswell 6.5 4.3/+51% 4.1(**)/+58% # Skylake 6.4 4.1/+55% 4.1(**)/+55% # Bulldozer 11.6 6.0/+92% # VIA Nano 10.6 7.5/+41% # Atom 12.5 9.3(*)/+35% # Silvermont 14.5 9.9(*)/+46% # Goldmont 8.8 6.7/+30% 1.7(***)/+415% # # (*) Loop is 1056 instructions long and expected result is ~8.25. # The discrepancy is because of front-end limitations, so # called MS-ROM penalties, and on Silvermont even rotate's # limited parallelism. 
# # (**) As per above comment, the result is for AVX *plus* sh[rl]d. # # (***) SHAEXT result $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; $output=pop; open STDOUT,">$output"; &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $xmm=$ymm=0; for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } $ymm=1 if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/ && $1>=2.19); # first version supporting AVX $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && $1>=2.03); # first version supporting AVX $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" && `ml 2>&1` =~ /Version ([0-9]+)\./ && $1>=10); # first version supporting AVX -$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ && +$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/ && $2>=3.0); # first version supporting AVX $shaext=$xmm; ### set to zero if compiling for 1.0.1 &external_label("OPENSSL_ia32cap_P") if ($xmm); $A="eax"; $B="ebx"; $C="ecx"; $D="edx"; $E="edi"; $T="esi"; $tmp1="ebp"; @V=($A,$B,$C,$D,$E,$T); $alt=0; # 1 denotes alternative IALU implementation, which performs # 8% *worse* on P4, same on Westmere and Atom, 2% better on # Sandy Bridge... sub BODY_00_15 { local($n,$a,$b,$c,$d,$e,$f)=@_; &comment("00_15 $n"); &mov($f,$c); # f to hold F_00_19(b,c,d) if ($n==0) { &mov($tmp1,$a); } else { &mov($a,$tmp1); } &rotl($tmp1,5); # tmp1=ROTATE(a,5) &xor($f,$d); &add($tmp1,$e); # tmp1+=e; &mov($e,&swtmp($n%16)); # e becomes volatile and is loaded # with xi, also note that e becomes # f in next round... &and($f,$b); &rotr($b,2); # b=ROTATE(b,30) &xor($f,$d); # f holds F_00_19(b,c,d) &lea($tmp1,&DWP(0x5a827999,$tmp1,$e)); # tmp1+=K_00_19+xi if ($n==15) { &mov($e,&swtmp(($n+1)%16));# pre-fetch f for next round &add($f,$tmp1); } # f+=tmp1 else { &add($tmp1,$f); } # f becomes a in next round &mov($tmp1,$a) if ($alt && $n==15); } sub BODY_16_19 { local($n,$a,$b,$c,$d,$e,$f)=@_; &comment("16_19 $n"); if ($alt) { &xor($c,$d); &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &and($tmp1,$c); # tmp1 to hold F_00_19(b,c,d), b&=c^d &xor($f,&swtmp(($n+8)%16)); &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &add($e,$tmp1); # e+=F_00_19(b,c,d) &xor($c,$d); # restore $c &mov($tmp1,$a); # b in next round &rotr($b,$n==16?2:7); # b=ROTATE(b,30) &mov(&swtmp($n%16),$f); # xi=f &rotl($a,5); # ROTATE(a,5) &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round &add($f,$a); # f+=ROTATE(a,5) } else { &mov($tmp1,$c); # tmp1 to hold F_00_19(b,c,d) &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$d); &xor($f,&swtmp(($n+8)%16)); &and($tmp1,$b); &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &xor($tmp1,$d); # tmp1=F_00_19(b,c,d) &add($e,$tmp1); # e+=F_00_19(b,c,d) &mov($tmp1,$a); &rotr($b,2); # b=ROTATE(b,30) &mov(&swtmp($n%16),$f); # xi=f &rotl($tmp1,5); # ROTATE(a,5) &lea($f,&DWP(0x5a827999,$f,$e));# f+=F_00_19(b,c,d)+e &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round &add($f,$tmp1); # f+=ROTATE(a,5) } } sub BODY_20_39 { local($n,$a,$b,$c,$d,$e,$f)=@_; local $K=($n<40)?0x6ed9eba1:0xca62c1d6; &comment("20_39 $n"); if ($alt) { &xor($tmp1,$c); # tmp1 to hold F_20_39(b,c,d), b^=c 
&xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) &xor($f,&swtmp(($n+8)%16)); &add($e,$tmp1); # e+=F_20_39(b,c,d) &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &mov($tmp1,$a); # b in next round &rotr($b,7); # b=ROTATE(b,30) &mov(&swtmp($n%16),$f) if($n<77);# xi=f &rotl($a,5); # ROTATE(a,5) &xor($b,$c) if($n==39);# warm up for BODY_40_59 &and($tmp1,$b) if($n==39); &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round &add($f,$a); # f+=ROTATE(a,5) &rotr($a,5) if ($n==79); } else { &mov($tmp1,$b); # tmp1 to hold F_20_39(b,c,d) &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$c); &xor($f,&swtmp(($n+8)%16)); &xor($tmp1,$d); # tmp1 holds F_20_39(b,c,d) &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &add($e,$tmp1); # e+=F_20_39(b,c,d) &rotr($b,2); # b=ROTATE(b,30) &mov($tmp1,$a); &rotl($tmp1,5); # ROTATE(a,5) &mov(&swtmp($n%16),$f) if($n<77);# xi=f &lea($f,&DWP($K,$f,$e)); # f+=e+K_XX_YY &mov($e,&swtmp(($n+1)%16)) if($n<79);# pre-fetch f for next round &add($f,$tmp1); # f+=ROTATE(a,5) } } sub BODY_40_59 { local($n,$a,$b,$c,$d,$e,$f)=@_; &comment("40_59 $n"); if ($alt) { &add($e,$tmp1); # e+=b&(c^d) &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &mov($tmp1,$d); &xor($f,&swtmp(($n+8)%16)); &xor($c,$d); # restore $c &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &and($tmp1,$c); &rotr($b,7); # b=ROTATE(b,30) &add($e,$tmp1); # e+=c&d &mov($tmp1,$a); # b in next round &mov(&swtmp($n%16),$f); # xi=f &rotl($a,5); # ROTATE(a,5) &xor($b,$c) if ($n<59); &and($tmp1,$b) if ($n<59);# tmp1 to hold F_40_59(b,c,d) &lea($f,&DWP(0x8f1bbcdc,$f,$e));# f+=K_40_59+e+(b&(c^d)) &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round &add($f,$a); # f+=ROTATE(a,5) } else { &mov($tmp1,$c); # tmp1 to hold F_40_59(b,c,d) &xor($f,&swtmp(($n+2)%16)); # f to hold Xupdate(xi,xa,xb,xc,xd) &xor($tmp1,$d); &xor($f,&swtmp(($n+8)%16)); &and($tmp1,$b); &xor($f,&swtmp(($n+13)%16)); # f holds xa^xb^xc^xd &rotl($f,1); # f=ROTATE(f,1) &add($tmp1,$e); # b&(c^d)+=e &rotr($b,2); # b=ROTATE(b,30) &mov($e,$a); # e becomes volatile &rotl($e,5); # ROTATE(a,5) &mov(&swtmp($n%16),$f); # xi=f &lea($f,&DWP(0x8f1bbcdc,$f,$tmp1));# f+=K_40_59+e+(b&(c^d)) &mov($tmp1,$c); &add($f,$e); # f+=ROTATE(a,5) &and($tmp1,$d); &mov($e,&swtmp(($n+1)%16)); # pre-fetch f for next round &add($f,$tmp1); # f+=c&d } } &function_begin("sha1_block_data_order"); if ($xmm) { &static_label("shaext_shortcut") if ($shaext); &static_label("ssse3_shortcut"); &static_label("avx_shortcut") if ($ymm); &static_label("K_XX_XX"); &call (&label("pic_point")); # make it PIC! 
&set_label("pic_point"); &blindpop($tmp1); &picmeup($T,"OPENSSL_ia32cap_P",$tmp1,&label("pic_point")); &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); &mov ($A,&DWP(0,$T)); &mov ($D,&DWP(4,$T)); &test ($D,1<<9); # check SSSE3 bit &jz (&label("x86")); &mov ($C,&DWP(8,$T)); &test ($A,1<<24); # check FXSR bit &jz (&label("x86")); if ($shaext) { &test ($C,1<<29); # check SHA bit &jnz (&label("shaext_shortcut")); } if ($ymm) { &and ($D,1<<28); # mask AVX bit &and ($A,1<<30); # mask "Intel CPU" bit &or ($A,$D); &cmp ($A,1<<28|1<<30); &je (&label("avx_shortcut")); } &jmp (&label("ssse3_shortcut")); &set_label("x86",16); } &mov($tmp1,&wparam(0)); # SHA_CTX *c &mov($T,&wparam(1)); # const void *input &mov($A,&wparam(2)); # size_t num &stack_push(16+3); # allocate X[16] &shl($A,6); &add($A,$T); &mov(&wparam(2),$A); # pointer beyond the end of input &mov($E,&DWP(16,$tmp1));# pre-load E &jmp(&label("loop")); &set_label("loop",16); # copy input chunk to X, but reversing byte order! for ($i=0; $i<16; $i+=4) { &mov($A,&DWP(4*($i+0),$T)); &mov($B,&DWP(4*($i+1),$T)); &mov($C,&DWP(4*($i+2),$T)); &mov($D,&DWP(4*($i+3),$T)); &bswap($A); &bswap($B); &bswap($C); &bswap($D); &mov(&swtmp($i+0),$A); &mov(&swtmp($i+1),$B); &mov(&swtmp($i+2),$C); &mov(&swtmp($i+3),$D); } &mov(&wparam(1),$T); # redundant in 1st spin &mov($A,&DWP(0,$tmp1)); # load SHA_CTX &mov($B,&DWP(4,$tmp1)); &mov($C,&DWP(8,$tmp1)); &mov($D,&DWP(12,$tmp1)); # E is pre-loaded for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); } for(;$i<20;$i++) { &BODY_16_19($i,@V); unshift(@V,pop(@V)); } for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } (($V[5] eq $D) and ($V[0] eq $E)) or die; # double-check &mov($tmp1,&wparam(0)); # re-load SHA_CTX* &mov($D,&wparam(1)); # D is last "T" and is discarded &add($E,&DWP(0,$tmp1)); # E is last "A"... &add($T,&DWP(4,$tmp1)); &add($A,&DWP(8,$tmp1)); &add($B,&DWP(12,$tmp1)); &add($C,&DWP(16,$tmp1)); &mov(&DWP(0,$tmp1),$E); # update SHA_CTX &add($D,64); # advance input pointer &mov(&DWP(4,$tmp1),$T); &cmp($D,&wparam(2)); # have we reached the end yet? &mov(&DWP(8,$tmp1),$A); &mov($E,$C); # C is last "E" which needs to be "pre-loaded" &mov(&DWP(12,$tmp1),$B); &mov($T,$D); # input pointer &mov(&DWP(16,$tmp1),$C); &jb(&label("loop")); &stack_pop(16+3); &function_end("sha1_block_data_order"); if ($xmm) { if ($shaext) { ###################################################################### # Intel SHA Extensions implementation of SHA1 update function. # my ($ctx,$inp,$num)=("edi","esi","ecx"); my ($ABCD,$E,$E_,$BSWAP)=map("xmm$_",(0..3)); my @MSG=map("xmm$_",(4..7)); sub sha1rnds4 { my ($dst,$src,$imm)=@_; if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) { &data_byte(0x0f,0x3a,0xcc,0xc0|($1<<3)|$2,$imm); } } sub sha1op38 { my ($opcodelet,$dst,$src)=@_; if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) { &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); } } sub sha1nexte { sha1op38(0xc8,@_); } sub sha1msg1 { sha1op38(0xc9,@_); } sub sha1msg2 { sha1op38(0xca,@_); } &function_begin("_sha1_block_data_order_shaext"); &call (&label("pic_point")); # make it PIC! 
&set_label("pic_point"); &blindpop($tmp1); &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); &set_label("shaext_shortcut"); &mov ($ctx,&wparam(0)); &mov ("ebx","esp"); &mov ($inp,&wparam(1)); &mov ($num,&wparam(2)); &sub ("esp",32); &movdqu ($ABCD,&QWP(0,$ctx)); &movd ($E,&DWP(16,$ctx)); &and ("esp",-32); &movdqa ($BSWAP,&QWP(0x50,$tmp1)); # byte-n-word swap &movdqu (@MSG[0],&QWP(0,$inp)); &pshufd ($ABCD,$ABCD,0b00011011); # flip word order &movdqu (@MSG[1],&QWP(0x10,$inp)); &pshufd ($E,$E,0b00011011); # flip word order &movdqu (@MSG[2],&QWP(0x20,$inp)); &pshufb (@MSG[0],$BSWAP); &movdqu (@MSG[3],&QWP(0x30,$inp)); &pshufb (@MSG[1],$BSWAP); &pshufb (@MSG[2],$BSWAP); &pshufb (@MSG[3],$BSWAP); &jmp (&label("loop_shaext")); &set_label("loop_shaext",16); &dec ($num); &lea ("eax",&DWP(0x40,$inp)); &movdqa (&QWP(0,"esp"),$E); # offload $E &paddd ($E,@MSG[0]); &cmovne ($inp,"eax"); &movdqa (&QWP(16,"esp"),$ABCD); # offload $ABCD for($i=0;$i<20-4;$i+=2) { &sha1msg1 (@MSG[0],@MSG[1]); &movdqa ($E_,$ABCD); &sha1rnds4 ($ABCD,$E,int($i/5)); # 0-3... &sha1nexte ($E_,@MSG[1]); &pxor (@MSG[0],@MSG[2]); &sha1msg1 (@MSG[1],@MSG[2]); &sha1msg2 (@MSG[0],@MSG[3]); &movdqa ($E,$ABCD); &sha1rnds4 ($ABCD,$E_,int(($i+1)/5)); &sha1nexte ($E,@MSG[2]); &pxor (@MSG[1],@MSG[3]); &sha1msg2 (@MSG[1],@MSG[0]); push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG)); } &movdqu (@MSG[0],&QWP(0,$inp)); &movdqa ($E_,$ABCD); &sha1rnds4 ($ABCD,$E,3); # 64-67 &sha1nexte ($E_,@MSG[1]); &movdqu (@MSG[1],&QWP(0x10,$inp)); &pshufb (@MSG[0],$BSWAP); &movdqa ($E,$ABCD); &sha1rnds4 ($ABCD,$E_,3); # 68-71 &sha1nexte ($E,@MSG[2]); &movdqu (@MSG[2],&QWP(0x20,$inp)); &pshufb (@MSG[1],$BSWAP); &movdqa ($E_,$ABCD); &sha1rnds4 ($ABCD,$E,3); # 72-75 &sha1nexte ($E_,@MSG[3]); &movdqu (@MSG[3],&QWP(0x30,$inp)); &pshufb (@MSG[2],$BSWAP); &movdqa ($E,$ABCD); &sha1rnds4 ($ABCD,$E_,3); # 76-79 &movdqa ($E_,&QWP(0,"esp")); &pshufb (@MSG[3],$BSWAP); &sha1nexte ($E,$E_); &paddd ($ABCD,&QWP(16,"esp")); &jnz (&label("loop_shaext")); &pshufd ($ABCD,$ABCD,0b00011011); &pshufd ($E,$E,0b00011011); &movdqu (&QWP(0,$ctx),$ABCD) &movd (&DWP(16,$ctx),$E); &mov ("esp","ebx"); &function_end("_sha1_block_data_order_shaext"); } ###################################################################### # The SSSE3 implementation. # # %xmm[0-7] are used as ring @X[] buffer containing quadruples of last # 32 elements of the message schedule or Xupdate outputs. First 4 # quadruples are simply byte-swapped input, next 4 are calculated # according to method originally suggested by Dean Gaudet (modulo # being implemented in SSSE3). Once 8 quadruples or 32 elements are # collected, it switches to routine proposed by Max Locktyukhin. # # Calculations inevitably require temporary registers, and there are # no %xmm registers left to spare. For this reason part of the ring # buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring # buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] - # X[-5], and X[4] - X[-4]... # # Another notable optimization is aggressive stack frame compression # aiming to minimize amount of 9-byte instructions... # # Yet another notable optimization is "jumping" $B variable. It means # that there is no register permanently allocated for $B value. This # allowed to eliminate one instruction from body_20_39... 
# my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 my @V=($A,$B,$C,$D,$E); my $j=0; # hash round my $rx=0; my @T=($T,$tmp1); my $inp; my $_rol=sub { &rol(@_) }; my $_ror=sub { &ror(@_) }; &function_begin("_sha1_block_data_order_ssse3"); &call (&label("pic_point")); # make it PIC! &set_label("pic_point"); &blindpop($tmp1); &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); &set_label("ssse3_shortcut"); &movdqa (@X[3],&QWP(0,$tmp1)); # K_00_19 &movdqa (@X[4],&QWP(16,$tmp1)); # K_20_39 &movdqa (@X[5],&QWP(32,$tmp1)); # K_40_59 &movdqa (@X[6],&QWP(48,$tmp1)); # K_60_79 &movdqa (@X[2],&QWP(64,$tmp1)); # pbswap mask &mov ($E,&wparam(0)); # load argument block &mov ($inp=@T[1],&wparam(1)); &mov ($D,&wparam(2)); &mov (@T[0],"esp"); # stack frame layout # # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area # X[4]+K X[5]+K X[6]+K X[7]+K # X[8]+K X[9]+K X[10]+K X[11]+K # X[12]+K X[13]+K X[14]+K X[15]+K # # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area # X[4] X[5] X[6] X[7] # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 # # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants # K_40_59 K_40_59 K_40_59 K_40_59 # K_60_79 K_60_79 K_60_79 K_60_79 # K_00_19 K_00_19 K_00_19 K_00_19 # pbswap mask # # +192 ctx # argument block # +196 inp # +200 end # +204 esp &sub ("esp",208); &and ("esp",-64); &movdqa (&QWP(112+0,"esp"),@X[4]); # copy constants &movdqa (&QWP(112+16,"esp"),@X[5]); &movdqa (&QWP(112+32,"esp"),@X[6]); &shl ($D,6); # len*64 &movdqa (&QWP(112+48,"esp"),@X[3]); &add ($D,$inp); # end of input &movdqa (&QWP(112+64,"esp"),@X[2]); &add ($inp,64); &mov (&DWP(192+0,"esp"),$E); # save argument block &mov (&DWP(192+4,"esp"),$inp); &mov (&DWP(192+8,"esp"),$D); &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp &mov ($A,&DWP(0,$E)); # load context &mov ($B,&DWP(4,$E)); &mov ($C,&DWP(8,$E)); &mov ($D,&DWP(12,$E)); &mov ($E,&DWP(16,$E)); &mov (@T[0],$B); # magic seed &movdqu (@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] &movdqu (@X[-3&7],&QWP(-48,$inp)); &movdqu (@X[-2&7],&QWP(-32,$inp)); &movdqu (@X[-1&7],&QWP(-16,$inp)); &pshufb (@X[-4&7],@X[2]); # byte swap &pshufb (@X[-3&7],@X[2]); &pshufb (@X[-2&7],@X[2]); &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot &pshufb (@X[-1&7],@X[2]); &paddd (@X[-4&7],@X[3]); # add K_00_19 &paddd (@X[-3&7],@X[3]); &paddd (@X[-2&7],@X[3]); &movdqa (&QWP(0,"esp"),@X[-4&7]); # X[]+K xfer to IALU &psubd (@X[-4&7],@X[3]); # restore X[] &movdqa (&QWP(0+16,"esp"),@X[-3&7]); &psubd (@X[-3&7],@X[3]); &movdqa (&QWP(0+32,"esp"),@X[-2&7]); &mov (@T[1],$C); &psubd (@X[-2&7],@X[3]); &xor (@T[1],$D); &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); &and (@T[0],@T[1]); &jmp (&label("loop")); ###################################################################### # SSE instruction sequence is first broken to groups of independent # instructions, independent in respect to their inputs and shifter # (not all architectures have more than one). Then IALU instructions # are "knitted in" between the SSE groups. Distance is maintained for # SSE latency of 2 in hope that it fits better upcoming AMD Bulldozer # [which allegedly also implements SSSE3]... # # Temporary registers usage. X[2] is volatile at the entry and at the # end is restored from backtrace ring buffer. X[3] is expected to # contain current K_XX_XX constant and is used to calculate X[-1]+K # from previous round, it becomes volatile the moment the value is # saved to stack for transfer to IALU. 
X[4] becomes volatile whenever # X[-4] is accumulated and offloaded to backtrace ring buffer, at the # end it is loaded with next K_XX_XX [which becomes X[3] in next # round]... # sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)); &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); &movdqa (@X[2],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[3],@X[-1&7]); &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@X[2],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); # ror &pxor (@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@X[4],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &movdqa (@X[2],@X[0]); eval(shift(@insns)); &pslldq (@X[4],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@X[2],31); eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (@X[3],@X[4]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psrld (@X[4],30); eval(shift(@insns)); eval(shift(@insns)); # ror &por (@X[0],@X[2]); # "X[0]"<<<=1 eval(shift(@insns)); &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer eval(shift(@insns)); eval(shift(@insns)); &pslld (@X[3],2); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[4]); &movdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[3]); # "X[0]"^=("X[0]"<<96)<<<2 &pshufd (@X[1],@X[-3&7],0xee) if ($Xi<7); # was &movdqa (@X[1],@X[-2&7]) &pshufd (@X[3],@X[-1&7],0xee) if ($Xi==7); eval(shift(@insns)); eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xupdate_ssse3_32_79() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); # body_20_39 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" &punpcklqdq(@X[2],@X[-1&7]); # compose "X[-6]", was &palignr(@X[2],@X[-2&7],8) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" &movdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] =~ /_rol/); if ($Xi%5) { &movdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... } else { # ... 
or load next one &movdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); } eval(shift(@insns)); # ror &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@X[2]); # "X[0]"^="X[-6]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (@X[2],@X[0]); &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &pslld (@X[0],2); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); &psrld (@X[2],30); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)) if (@insns[1] =~ /_rol/); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &por (@X[0],@X[2]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 eval(shift(@insns)); &movdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &pshufd (@X[3],@X[-1],0xee) if ($Xi<19); # was &movdqa (@X[3],@X[0]) eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xuplast_ssse3_80() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[3],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &movdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &mov ($inp=@T[1],&DWP(192+4,"esp")); &cmp ($inp,&DWP(192+8,"esp")); &je (&label("done")); &movdqa (@X[3],&QWP(112+48,"esp")); # K_00_19 &movdqa (@X[2],&QWP(112+64,"esp")); # pbswap mask &movdqu (@X[-4&7],&QWP(0,$inp)); # load input &movdqu (@X[-3&7],&QWP(16,$inp)); &movdqu (@X[-2&7],&QWP(32,$inp)); &movdqu (@X[-1&7],&QWP(48,$inp)); &add ($inp,64); &pshufb (@X[-4&7],@X[2]); # byte swap &mov (&DWP(192+4,"esp"),$inp); &movdqa (&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot $Xi=0; } sub Xloop_ssse3() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufb (@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@X[3]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &movdqa (&QWP(0+16*$Xi,"esp"),@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@X[3]); foreach (@insns) { eval; } $Xi++; } sub Xtail_ssse3() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); foreach (@insns) { eval; } } sub body_00_19 () { # ((c^d)&b)^d # on start @T[0]=(c^d)&b return &body_20_39() if ($rx==19); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. 
'&$_ror ($b,$j?7:2);', # $b>>>2 '&xor (@T[0],$d);', '&mov (@T[1],$a);', # $b in next round '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer '&xor ($b,$c);', # $c^$d for next round '&$_rol ($a,5);', '&add ($e,@T[0]);', '&and (@T[1],$b);', # ($b&($c^$d)) for next round '&xor ($b,$c);', # restore $b '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub body_20_39 () { # b^d^c # on entry @T[0]=b^d return &body_40_59() if ($rx==39); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer '&xor (@T[0],$d) if($j==19);'. '&xor (@T[0],$c) if($j> 19);', # ($b^$d^$c) '&mov (@T[1],$a);', # $b in next round '&$_rol ($a,5);', '&add ($e,@T[0]);', '&xor (@T[1],$c) if ($j< 79);', # $b^$d for next round '&$_ror ($b,7);', # $b>>>2 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub body_40_59 () { # ((b^c)&(c^d))^c # on entry @T[0]=(b^c), (c^=d) $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,&DWP(4*($j&15),"esp"));', # X[]+K xfer '&and (@T[0],$c) if ($j>=40);', # (b^c)&(c^d) '&xor ($c,$d) if ($j>=40);', # restore $c '&$_ror ($b,7);', # $b>>>2 '&mov (@T[1],$a);', # $b for next round '&xor (@T[0],$c);', '&$_rol ($a,5);', '&add ($e,@T[0]);', '&xor (@T[1],$c) if ($j==59);'. '&xor (@T[1],$b) if ($j< 59);', # b^c for next round '&xor ($b,$c) if ($j< 59);', # c^d for next round '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } ###### sub bodyx_00_19 () { # ((c^d)&b)^d # on start @T[0]=(b&c)^(~b&d), $e+=X[]+K return &bodyx_20_39() if ($rx==19); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&rorx ($b,$b,2) if ($j==0);'. # $b>>>2 '&rorx ($b,@T[1],7) if ($j!=0);', # $b>>>2 '&lea ($e,&DWP(0,$e,@T[0]));', '&rorx (@T[0],$a,5);', '&andn (@T[1],$a,$c);', '&and ($a,$b)', '&add ($d,&DWP(4*(($j+1)&15),"esp"));', # X[]+K xfer '&xor (@T[1],$a)', '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub bodyx_20_39 () { # b^d^c # on start $b=b^c^d return &bodyx_40_59() if ($rx==39); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,($j==19?@T[0]:$b))', '&rorx ($b,@T[1],7);', # $b>>>2 '&rorx (@T[0],$a,5);', '&xor ($a,$b) if ($j<79);', '&add ($d,&DWP(4*(($j+1)&15),"esp")) if ($j<79);', # X[]+K xfer '&xor ($a,$c) if ($j<79);', '&add ($e,@T[0]);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub bodyx_40_59 () { # ((b^c)&(c^d))^c # on start $b=((b^c)&(c^d))^c return &bodyx_20_39() if ($rx==59); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. 
'&rorx (@T[0],$a,5)', '&lea ($e,&DWP(0,$e,$b))', '&rorx ($b,@T[1],7)', # $b>>>2 '&add ($d,&DWP(4*(($j+1)&15),"esp"))', # X[]+K xfer '&mov (@T[1],$c)', '&xor ($a,$b)', # b^c for next round '&xor (@T[1],$b)', # c^d for next round '&and ($a,@T[1])', '&add ($e,@T[0])', '&xor ($a,$b)' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } &set_label("loop",16); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_32_79(\&body_00_19); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_20_39); &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" $saved_j=$j; @saved_V=@V; &Xloop_ssse3(\&body_20_39); &Xloop_ssse3(\&body_20_39); &Xloop_ssse3(\&body_20_39); &mov (@T[1],&DWP(192,"esp")); # update context &add ($A,&DWP(0,@T[1])); &add (@T[0],&DWP(4,@T[1])); # $b &add ($C,&DWP(8,@T[1])); &mov (&DWP(0,@T[1]),$A); &add ($D,&DWP(12,@T[1])); &mov (&DWP(4,@T[1]),@T[0]); &add ($E,&DWP(16,@T[1])); &mov (&DWP(8,@T[1]),$C); &mov ($B,$C); &mov (&DWP(12,@T[1]),$D); &xor ($B,$D); &mov (&DWP(16,@T[1]),$E); &mov (@T[1],@T[0]); &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); &and (@T[0],$B); &mov ($B,$T[1]); &jmp (&label("loop")); &set_label("done",16); $j=$saved_j; @V=@saved_V; &Xtail_ssse3(\&body_20_39); &Xtail_ssse3(\&body_20_39); &Xtail_ssse3(\&body_20_39); &mov (@T[1],&DWP(192,"esp")); # update context &add ($A,&DWP(0,@T[1])); &mov ("esp",&DWP(192+12,"esp")); # restore %esp &add (@T[0],&DWP(4,@T[1])); # $b &add ($C,&DWP(8,@T[1])); &mov (&DWP(0,@T[1]),$A); &add ($D,&DWP(12,@T[1])); &mov (&DWP(4,@T[1]),@T[0]); &add ($E,&DWP(16,@T[1])); &mov (&DWP(8,@T[1]),$C); &mov (&DWP(12,@T[1]),$D); &mov (&DWP(16,@T[1]),$E); &function_end("_sha1_block_data_order_ssse3"); $rx=0; # reset if ($ymm) { my $Xi=4; # 4xSIMD Xupdate round, start pre-seeded my @X=map("xmm$_",(4..7,0..3)); # pre-seeded for $Xi=4 my @V=($A,$B,$C,$D,$E); my $j=0; # hash round my @T=($T,$tmp1); my $inp; my $_rol=sub { &shld(@_[0],@_) }; my $_ror=sub { &shrd(@_[0],@_) }; &function_begin("_sha1_block_data_order_avx"); &call (&label("pic_point")); # make it PIC! 
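# The body_00_19/body_20_39/body_40_59 generators above (and their bodyx_*
# BMI counterparts) use algebraically reduced forms of the SHA-1 round
# functions: Ch(b,c,d)=((c^d)&b)^d, Parity(b,c,d)=b^c^d and
# Maj(b,c,d)=((b^c)&(c^d))^c, which lets part of each value be carried over
# from the previous round (see the "on entry @T[0]=..." comments).  Below is
# a minimal plain-Perl check of the two non-trivial identities; $ch_ref and
# $maj_ref are illustrative helpers only and have no effect on the generated
# code.
{
	my $ch_ref  = sub { my ($b,$c,$d)=@_; ($b & $c) | ((1 ^ $b) & $d); };
	my $maj_ref = sub { my ($b,$c,$d)=@_; ($b & $c) | ($b & $d) | ($c & $d); };
	for my $b (0,1) {
	    for my $c (0,1) {
		for my $d (0,1) {
		    my $f1 = (($c ^ $d) & $b) ^ $d;		# form used in body_00_19
		    my $f3 = (($b ^ $c) & ($c ^ $d)) ^ $c;	# form used in body_40_59
		    die "Ch identity broken"  unless $f1 == $ch_ref->($b,$c,$d);
		    die "Maj identity broken" unless $f3 == $maj_ref->($b,$c,$d);
		}
	    }
	}
}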
&set_label("pic_point"); &blindpop($tmp1); &lea ($tmp1,&DWP(&label("K_XX_XX")."-".&label("pic_point"),$tmp1)); &set_label("avx_shortcut"); &vzeroall(); &vmovdqa(@X[3],&QWP(0,$tmp1)); # K_00_19 &vmovdqa(@X[4],&QWP(16,$tmp1)); # K_20_39 &vmovdqa(@X[5],&QWP(32,$tmp1)); # K_40_59 &vmovdqa(@X[6],&QWP(48,$tmp1)); # K_60_79 &vmovdqa(@X[2],&QWP(64,$tmp1)); # pbswap mask &mov ($E,&wparam(0)); # load argument block &mov ($inp=@T[1],&wparam(1)); &mov ($D,&wparam(2)); &mov (@T[0],"esp"); # stack frame layout # # +0 X[0]+K X[1]+K X[2]+K X[3]+K # XMM->IALU xfer area # X[4]+K X[5]+K X[6]+K X[7]+K # X[8]+K X[9]+K X[10]+K X[11]+K # X[12]+K X[13]+K X[14]+K X[15]+K # # +64 X[0] X[1] X[2] X[3] # XMM->XMM backtrace area # X[4] X[5] X[6] X[7] # X[8] X[9] X[10] X[11] # even borrowed for K_00_19 # # +112 K_20_39 K_20_39 K_20_39 K_20_39 # constants # K_40_59 K_40_59 K_40_59 K_40_59 # K_60_79 K_60_79 K_60_79 K_60_79 # K_00_19 K_00_19 K_00_19 K_00_19 # pbswap mask # # +192 ctx # argument block # +196 inp # +200 end # +204 esp &sub ("esp",208); &and ("esp",-64); &vmovdqa(&QWP(112+0,"esp"),@X[4]); # copy constants &vmovdqa(&QWP(112+16,"esp"),@X[5]); &vmovdqa(&QWP(112+32,"esp"),@X[6]); &shl ($D,6); # len*64 &vmovdqa(&QWP(112+48,"esp"),@X[3]); &add ($D,$inp); # end of input &vmovdqa(&QWP(112+64,"esp"),@X[2]); &add ($inp,64); &mov (&DWP(192+0,"esp"),$E); # save argument block &mov (&DWP(192+4,"esp"),$inp); &mov (&DWP(192+8,"esp"),$D); &mov (&DWP(192+12,"esp"),@T[0]); # save original %esp &mov ($A,&DWP(0,$E)); # load context &mov ($B,&DWP(4,$E)); &mov ($C,&DWP(8,$E)); &mov ($D,&DWP(12,$E)); &mov ($E,&DWP(16,$E)); &mov (@T[0],$B); # magic seed &vmovdqu(@X[-4&7],&QWP(-64,$inp)); # load input to %xmm[0-3] &vmovdqu(@X[-3&7],&QWP(-48,$inp)); &vmovdqu(@X[-2&7],&QWP(-32,$inp)); &vmovdqu(@X[-1&7],&QWP(-16,$inp)); &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap &vpshufb(@X[-3&7],@X[-3&7],@X[2]); &vpshufb(@X[-2&7],@X[-2&7],@X[2]); &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot &vpshufb(@X[-1&7],@X[-1&7],@X[2]); &vpaddd (@X[0],@X[-4&7],@X[3]); # add K_00_19 &vpaddd (@X[1],@X[-3&7],@X[3]); &vpaddd (@X[2],@X[-2&7],@X[3]); &vmovdqa(&QWP(0,"esp"),@X[0]); # X[]+K xfer to IALU &mov (@T[1],$C); &vmovdqa(&QWP(0+16,"esp"),@X[1]); &xor (@T[1],$D); &vmovdqa(&QWP(0+32,"esp"),@X[2]); &and (@T[0],@T[1]); &jmp (&label("loop")); sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[3],@X[3],@X[-1&7]); &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]);# save X[] to backtrace buffer eval(shift(@insns)); eval(shift(@insns)); &vpsrldq(@X[2],@X[-1&7],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[2],@X[2],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrld (@X[2],@X[0],31); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslldq(@X[4],@X[0],12); # "X[0]"<<96, extract one dword &vpaddd (@X[0],@X[0],@X[0]); eval(shift(@insns)); 
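	# What this and the following Xupdate_avx_* helpers compute, expressed
	# as the scalar recurrence from FIPS 180-4 (a hedged reference sketch;
	# rol32() is an assumed helper, not a function of this module):
	#
	#	for my $i (16..79) {
	#	    $W[$i] = rol32($W[$i-3] ^ $W[$i-8] ^ $W[$i-14] ^ $W[$i-16], 1);
	#	}
	#
	# Four W[] words are produced per call.  In the 16..31 range the
	# W[i-3] term of the top lane is W[i] itself and is not available yet,
	# so it is taken as zero here and patched afterwards by the step
	# annotated '"X[0]"^=("X[0]"<<96)<<<2' a few instructions below.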
eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrld (@X[3],@X[4],30); &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslld (@X[4],@X[4],2); &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if ($Xi>5); # restore X[] from backtrace buffer eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[3]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[4]); # "X[0]"^=("X[0]"<<96)<<<2 eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (@X[4],&QWP(112-16+16*(($Xi)/5),"esp")); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xupdate_avx_32_79() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); &vpalignr(@X[2],@X[-1&7],@X[-2&7],8); # compose "X[-6]" &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" &vmovdqa (&QWP(64+16*(($Xi-4)%3),"esp"),@X[-4&7]); # save X[] to backtrace buffer eval(shift(@insns)); eval(shift(@insns)); if ($Xi%5) { &vmovdqa (@X[4],@X[3]); # "perpetuate" K_XX_XX... } else { # ... or load next one &vmovdqa (@X[4],&QWP(112-16+16*($Xi/5),"esp")); } &vpaddd (@X[3],@X[3],@X[-1&7]); eval(shift(@insns)); # ror eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[2]); # "X[0]"^="X[-6]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &vpsrld (@X[2],@X[0],30); &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); &vpslld (@X[0],@X[0],2); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); &vpor (@X[0],@X[0],@X[2]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 eval(shift(@insns)); &vmovdqa (@X[2],&QWP(64+16*(($Xi-6)%3),"esp")) if($Xi<19); # restore X[] from backtrace buffer eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xuplast_avx_80() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); &vpaddd (@X[3],@X[3],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (&QWP(0+16*(($Xi-1)&3),"esp"),@X[3]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &mov ($inp=@T[1],&DWP(192+4,"esp")); &cmp ($inp,&DWP(192+8,"esp")); &je (&label("done")); &vmovdqa(@X[3],&QWP(112+48,"esp")); # K_00_19 &vmovdqa(@X[2],&QWP(112+64,"esp")); # pbswap mask &vmovdqu(@X[-4&7],&QWP(0,$inp)); # load input &vmovdqu(@X[-3&7],&QWP(16,$inp)); &vmovdqu(@X[-2&7],&QWP(32,$inp)); &vmovdqu(@X[-1&7],&QWP(48,$inp)); &add ($inp,64); &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap &mov (&DWP(192+4,"esp"),$inp); &vmovdqa(&QWP(112-16,"esp"),@X[3]); # borrow last backtrace slot $Xi=0; } sub Xloop_avx() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); 
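	# Note: unlike the Xupdate_avx_* helpers above, this helper only
	# byte-swaps the freshly loaded words of the next block, adds K_00_19
	# and parks the X[]+K values on the stack for the integer rounds; no
	# cross-word mixing is needed before round 16.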
eval(shift(@insns)); eval(shift(@insns)); &vpshufb (@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],@X[3]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (&QWP(0+16*$Xi,"esp"),@X[$Xi&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); foreach (@insns) { eval; } $Xi++; } sub Xtail_avx() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); foreach (@insns) { eval; } } &set_label("loop",16); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_32_79(\&body_00_19); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_20_39); &Xuplast_avx_80(\&body_20_39); # can jump to "done" $saved_j=$j; @saved_V=@V; &Xloop_avx(\&body_20_39); &Xloop_avx(\&body_20_39); &Xloop_avx(\&body_20_39); &mov (@T[1],&DWP(192,"esp")); # update context &add ($A,&DWP(0,@T[1])); &add (@T[0],&DWP(4,@T[1])); # $b &add ($C,&DWP(8,@T[1])); &mov (&DWP(0,@T[1]),$A); &add ($D,&DWP(12,@T[1])); &mov (&DWP(4,@T[1]),@T[0]); &add ($E,&DWP(16,@T[1])); &mov ($B,$C); &mov (&DWP(8,@T[1]),$C); &xor ($B,$D); &mov (&DWP(12,@T[1]),$D); &mov (&DWP(16,@T[1]),$E); &mov (@T[1],@T[0]); &and (@T[0],$B); &mov ($B,@T[1]); &jmp (&label("loop")); &set_label("done",16); $j=$saved_j; @V=@saved_V; &Xtail_avx(\&body_20_39); &Xtail_avx(\&body_20_39); &Xtail_avx(\&body_20_39); &vzeroall(); &mov (@T[1],&DWP(192,"esp")); # update context &add ($A,&DWP(0,@T[1])); &mov ("esp",&DWP(192+12,"esp")); # restore %esp &add (@T[0],&DWP(4,@T[1])); # $b &add ($C,&DWP(8,@T[1])); &mov (&DWP(0,@T[1]),$A); &add ($D,&DWP(12,@T[1])); &mov (&DWP(4,@T[1]),@T[0]); &add ($E,&DWP(16,@T[1])); &mov (&DWP(8,@T[1]),$C); &mov (&DWP(12,@T[1]),$D); &mov (&DWP(16,@T[1]),$E); &function_end("_sha1_block_data_order_avx"); } &set_label("K_XX_XX",64); &data_word(0x5a827999,0x5a827999,0x5a827999,0x5a827999); # K_00_19 &data_word(0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1); # K_20_39 &data_word(0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc); # K_40_59 &data_word(0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6); # K_60_79 &data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # pbswap mask &data_byte(0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0); } &asciz("SHA1 block transform for x86, CRYPTOGAMS by "); &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/sha/asm/sha1-mb-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/sha/asm/sha1-mb-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/sha/asm/sha1-mb-x86_64.pl (revision 364963) @@ -1,1628 +1,1628 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. 
You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # Multi-buffer SHA1 procedure processes n buffers in parallel by # placing buffer data to designated lane of SIMD register. n is # naturally limited to 4 on pre-AVX2 processors and to 8 on # AVX2-capable processors such as Haswell. # # this +aesni(i) sha1 aesni-sha1 gain(iv) # ------------------------------------------------------------------- # Westmere(ii) 10.7/n +1.28=3.96(n=4) 5.30 6.66 +68% # Atom(ii) 18.1/n +3.93=8.46(n=4) 9.37 12.8 +51% # Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80% # Ivy Bridge (8.08 +5.14=13.2)/n 4.60 5.54 +68% # Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160% # Skylake (8.70 +5.00=13.7)/n 3.64 4.20 +145% # Bulldozer (9.76 +5.76=15.5)/n 5.95 6.37 +64% # # (i) multi-block CBC encrypt with 128-bit key; # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, # because of lower AES-NI instruction throughput; # (iii) "this" is for n=8, when we gather twice as much data, result # for n=4 is 8.00+4.44=12.4; # (iv) presented improvement coefficients are asymptotic limits and # in real-life application are somewhat lower, e.g. for 2KB # fragments they range from 30% to 100% (on Haswell); $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; $avx=0; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void sha1_multi_block ( # struct { unsigned int A[8]; # unsigned int B[8]; # unsigned int C[8]; # unsigned int D[8]; # unsigned int E[8]; } *ctx, # struct { void *ptr; int blocks; } inp[8], # int num); /* 1 or 2 */ # $ctx="%rdi"; # 1st arg $inp="%rsi"; # 2nd arg $num="%edx"; @ptr=map("%r$_",(8..11)); $Tbl="%rbp"; @V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4)); ($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9)); @Xi=map("%xmm$_",(10..14)); $K="%xmm15"; if (1) { # Atom-specific optimization aiming to eliminate pshufb with high # registers [and thus get rid of 48 cycles accumulated penalty] @Xi=map("%xmm$_",(0..4)); ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9)); @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14)); } $REG_SZ=16; sub Xi_off { my $off = shift; $off %= 16; $off *= $REG_SZ; $off<256 ? 
"$off-128(%rax)" : "$off-256-128(%rbx)"; } sub BODY_00_19 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $k=$i+2; # Loads are performed 2+3/4 iterations in advance. 3/4 means that out # of 4 words you would expect to be loaded per given iteration one is # spilled to next iteration. In other words indices in four input # streams are distributed as following: # # $i==0: 0,0,0,0,1,1,1,1,2,2,2, # $i==1: 2,3,3,3, # $i==2: 3,4,4,4, # ... # $i==13: 14,15,15,15, # $i==14: 15 # # Then at $i==15 Xupdate is applied one iteration in advance... $code.=<<___ if ($i==0); movd (@ptr[0]),@Xi[0] lea `16*4`(@ptr[0]),@ptr[0] movd (@ptr[1]),@Xi[2] # borrow @Xi[2] lea `16*4`(@ptr[1]),@ptr[1] movd (@ptr[2]),@Xi[3] # borrow @Xi[3] lea `16*4`(@ptr[2]),@ptr[2] movd (@ptr[3]),@Xi[4] # borrow @Xi[4] lea `16*4`(@ptr[3]),@ptr[3] punpckldq @Xi[3],@Xi[0] movd `4*$j-16*4`(@ptr[0]),@Xi[1] punpckldq @Xi[4],@Xi[2] movd `4*$j-16*4`(@ptr[1]),$t3 punpckldq @Xi[2],@Xi[0] movd `4*$j-16*4`(@ptr[2]),$t2 pshufb $tx,@Xi[0] ___ $code.=<<___ if ($i<14); # just load input movd `4*$j-16*4`(@ptr[3]),$t1 punpckldq $t2,@Xi[1] movdqa $a,$t2 paddd $K,$e # e+=K_00_19 punpckldq $t1,$t3 movdqa $b,$t1 movdqa $b,$t0 pslld \$5,$t2 pandn $d,$t1 pand $c,$t0 punpckldq $t3,@Xi[1] movdqa $a,$t3 movdqa @Xi[0],`&Xi_off($i)` paddd @Xi[0],$e # e+=X[i] movd `4*$k-16*4`(@ptr[0]),@Xi[2] psrld \$27,$t3 pxor $t1,$t0 # Ch(b,c,d) movdqa $b,$t1 por $t3,$t2 # rol(a,5) movd `4*$k-16*4`(@ptr[1]),$t3 pslld \$30,$t1 paddd $t0,$e # e+=Ch(b,c,d) psrld \$2,$b paddd $t2,$e # e+=rol(a,5) pshufb $tx,@Xi[1] movd `4*$k-16*4`(@ptr[2]),$t2 por $t1,$b # b=rol(b,30) ___ $code.=<<___ if ($i==14); # just load input movd `4*$j-16*4`(@ptr[3]),$t1 punpckldq $t2,@Xi[1] movdqa $a,$t2 paddd $K,$e # e+=K_00_19 punpckldq $t1,$t3 movdqa $b,$t1 movdqa $b,$t0 pslld \$5,$t2 prefetcht0 63(@ptr[0]) pandn $d,$t1 pand $c,$t0 punpckldq $t3,@Xi[1] movdqa $a,$t3 movdqa @Xi[0],`&Xi_off($i)` paddd @Xi[0],$e # e+=X[i] psrld \$27,$t3 pxor $t1,$t0 # Ch(b,c,d) movdqa $b,$t1 prefetcht0 63(@ptr[1]) por $t3,$t2 # rol(a,5) pslld \$30,$t1 paddd $t0,$e # e+=Ch(b,c,d) prefetcht0 63(@ptr[2]) psrld \$2,$b paddd $t2,$e # e+=rol(a,5) pshufb $tx,@Xi[1] prefetcht0 63(@ptr[3]) por $t1,$b # b=rol(b,30) ___ $code.=<<___ if ($i>=13 && $i<15); movdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" ___ $code.=<<___ if ($i>=15); # apply Xupdate pxor @Xi[-2],@Xi[1] # "X[13]" movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" movdqa $a,$t2 pxor `&Xi_off($j+8)`,@Xi[1] paddd $K,$e # e+=K_00_19 movdqa $b,$t1 pslld \$5,$t2 pxor @Xi[3],@Xi[1] movdqa $b,$t0 pandn $d,$t1 movdqa @Xi[1],$tx pand $c,$t0 movdqa $a,$t3 psrld \$31,$tx paddd @Xi[1],@Xi[1] movdqa @Xi[0],`&Xi_off($i)` paddd @Xi[0],$e # e+=X[i] psrld \$27,$t3 pxor $t1,$t0 # Ch(b,c,d) movdqa $b,$t1 por $t3,$t2 # rol(a,5) pslld \$30,$t1 paddd $t0,$e # e+=Ch(b,c,d) psrld \$2,$b paddd $t2,$e # e+=rol(a,5) por $tx,@Xi[1] # rol \$1,@Xi[1] por $t1,$b # b=rol(b,30) ___ push(@Xi,shift(@Xi)); } sub BODY_20_39 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___ if ($i<79); pxor @Xi[-2],@Xi[1] # "X[13]" movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" movdqa $a,$t2 movdqa $d,$t0 pxor `&Xi_off($j+8)`,@Xi[1] paddd $K,$e # e+=K_20_39 pslld \$5,$t2 pxor $b,$t0 movdqa $a,$t3 ___ $code.=<<___ if ($i<72); movdqa @Xi[0],`&Xi_off($i)` ___ $code.=<<___ if ($i<79); paddd @Xi[0],$e # e+=X[i] pxor @Xi[3],@Xi[1] psrld \$27,$t3 pxor $c,$t0 # Parity(b,c,d) movdqa $b,$t1 pslld \$30,$t1 movdqa @Xi[1],$tx por $t3,$t2 # rol(a,5) psrld \$31,$tx paddd $t0,$e # e+=Parity(b,c,d) paddd @Xi[1],@Xi[1] psrld \$2,$b paddd $t2,$e # e+=rol(a,5) por 
$tx,@Xi[1] # rol(@Xi[1],1) por $t1,$b # b=rol(b,30) ___ $code.=<<___ if ($i==79); movdqa $a,$t2 paddd $K,$e # e+=K_20_39 movdqa $d,$t0 pslld \$5,$t2 pxor $b,$t0 movdqa $a,$t3 paddd @Xi[0],$e # e+=X[i] psrld \$27,$t3 movdqa $b,$t1 pxor $c,$t0 # Parity(b,c,d) pslld \$30,$t1 por $t3,$t2 # rol(a,5) paddd $t0,$e # e+=Parity(b,c,d) psrld \$2,$b paddd $t2,$e # e+=rol(a,5) por $t1,$b # b=rol(b,30) ___ push(@Xi,shift(@Xi)); } sub BODY_40_59 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; pxor @Xi[-2],@Xi[1] # "X[13]" movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" movdqa $a,$t2 movdqa $d,$t1 pxor `&Xi_off($j+8)`,@Xi[1] pxor @Xi[3],@Xi[1] paddd $K,$e # e+=K_40_59 pslld \$5,$t2 movdqa $a,$t3 pand $c,$t1 movdqa $d,$t0 movdqa @Xi[1],$tx psrld \$27,$t3 paddd $t1,$e pxor $c,$t0 movdqa @Xi[0],`&Xi_off($i)` paddd @Xi[0],$e # e+=X[i] por $t3,$t2 # rol(a,5) psrld \$31,$tx pand $b,$t0 movdqa $b,$t1 pslld \$30,$t1 paddd @Xi[1],@Xi[1] paddd $t0,$e # e+=Maj(b,d,c) psrld \$2,$b paddd $t2,$e # e+=rol(a,5) por $tx,@Xi[1] # rol(@X[1],1) por $t1,$b # b=rol(b,30) ___ push(@Xi,shift(@Xi)); } $code.=<<___; .text .extern OPENSSL_ia32cap_P .globl sha1_multi_block .type sha1_multi_block,\@function,3 .align 32 sha1_multi_block: .cfi_startproc mov OPENSSL_ia32cap_P+4(%rip),%rcx bt \$61,%rcx # check SHA bit jc _shaext_shortcut ___ $code.=<<___ if ($avx); test \$`1<<28`,%ecx jnz _avx_shortcut ___ $code.=<<___; mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbx ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,-0x78(%rax) movaps %xmm11,-0x68(%rax) movaps %xmm12,-0x58(%rax) movaps %xmm13,-0x48(%rax) movaps %xmm14,-0x38(%rax) movaps %xmm15,-0x28(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`,%rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody: lea K_XX_XX(%rip),$Tbl lea `$REG_SZ*16`(%rsp),%rbx .Loop_grande: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldone movdqu 0x00($ctx),$A # load context lea 128(%rsp),%rax movdqu 0x20($ctx),$B movdqu 0x40($ctx),$C movdqu 0x60($ctx),$D movdqu 0x80($ctx),$E movdqa 0x60($Tbl),$tx # pbswap_mask movdqa -0x20($Tbl),$K # K_00_19 jmp .Loop .align 32 .Loop: ___ for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } $code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=" movdqa 0x20($Tbl),$K\n"; # K_40_59 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } $code.=" movdqa 0x40($Tbl),$K\n"; # K_60_79 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; movdqa (%rbx),@Xi[0] # pull counters mov \$1,%ecx cmp 4*0(%rbx),%ecx # examine counters pxor $t2,$t2 cmovge $Tbl,@ptr[0] # cancel input cmp 4*1(%rbx),%ecx movdqa @Xi[0],@Xi[1] cmovge $Tbl,@ptr[1] cmp 4*2(%rbx),%ecx pcmpgtd $t2,@Xi[1] # mask value cmovge $Tbl,@ptr[2] cmp 4*3(%rbx),%ecx paddd @Xi[1],@Xi[0] # counters-- cmovge $Tbl,@ptr[3] movdqu 0x00($ctx),$t0 pand @Xi[1],$A movdqu 0x20($ctx),$t1 pand @Xi[1],$B paddd $t0,$A movdqu 0x40($ctx),$t2 pand @Xi[1],$C paddd $t1,$B movdqu 0x60($ctx),$t3 pand @Xi[1],$D paddd $t2,$C movdqu 
0x80($ctx),$tx pand @Xi[1],$E movdqu $A,0x00($ctx) paddd $t3,$D movdqu $B,0x20($ctx) paddd $tx,$E movdqu $C,0x40($ctx) movdqu $D,0x60($ctx) movdqu $E,0x80($ctx) movdqa @Xi[0],(%rbx) # save counters movdqa 0x60($Tbl),$tx # pbswap_mask movdqa -0x20($Tbl),$K # K_00_19 dec $num jnz .Loop mov `$REG_SZ*17+8`(%rsp),$num lea $REG_SZ($ctx),$ctx lea `16*$REG_SZ/4`($inp),$inp dec $num jnz .Loop_grande .Ldone: mov `$REG_SZ*17`(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 movaps -0xa8(%rax),%xmm7 movaps -0x98(%rax),%xmm8 movaps -0x88(%rax),%xmm9 movaps -0x78(%rax),%xmm10 movaps -0x68(%rax),%xmm11 movaps -0x58(%rax),%xmm12 movaps -0x48(%rax),%xmm13 movaps -0x38(%rax),%xmm14 movaps -0x28(%rax),%xmm15 ___ $code.=<<___; mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue: ret .cfi_endproc .size sha1_multi_block,.-sha1_multi_block ___ {{{ my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10)); my @MSG0=map("%xmm$_",(4..7)); my @MSG1=map("%xmm$_",(11..14)); $code.=<<___; .type sha1_multi_block_shaext,\@function,3 .align 32 sha1_multi_block_shaext: .cfi_startproc _shaext_shortcut: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,-0x78(%rax) movaps %xmm11,-0x68(%rax) movaps %xmm12,-0x58(%rax) movaps %xmm13,-0x48(%rax) movaps %xmm14,-0x38(%rax) movaps %xmm15,-0x28(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`,%rsp shl \$1,$num # we process pair at a time and \$-256,%rsp lea 0x40($ctx),$ctx # size optimization mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .Lbody_shaext: lea `$REG_SZ*16`(%rsp),%rbx movdqa K_XX_XX+0x80(%rip),$BSWAP # byte-n-word swap .Loop_grande_shaext: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<2;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle %rsp,@ptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldone_shaext movq 0x00-0x40($ctx),$ABCD0 # a1.a0 movq 0x20-0x40($ctx),@MSG0[0]# b1.b0 movq 0x40-0x40($ctx),@MSG0[1]# c1.c0 movq 0x60-0x40($ctx),@MSG0[2]# d1.d0 movq 0x80-0x40($ctx),@MSG0[3]# e1.e0 punpckldq @MSG0[0],$ABCD0 # b1.a1.b0.a0 punpckldq @MSG0[2],@MSG0[1] # d1.c1.d0.c0 movdqa $ABCD0,$ABCD1 punpcklqdq @MSG0[1],$ABCD0 # d0.c0.b0.a0 punpckhqdq @MSG0[1],$ABCD1 # d1.c1.b1.a1 pshufd \$0b00111111,@MSG0[3],$E0 pshufd \$0b01111111,@MSG0[3],$E1 pshufd \$0b00011011,$ABCD0,$ABCD0 pshufd \$0b00011011,$ABCD1,$ABCD1 jmp .Loop_shaext .align 32 .Loop_shaext: movdqu 0x00(@ptr[0]),@MSG0[0] movdqu 0x00(@ptr[1]),@MSG1[0] movdqu 0x10(@ptr[0]),@MSG0[1] movdqu 0x10(@ptr[1]),@MSG1[1] movdqu 0x20(@ptr[0]),@MSG0[2] pshufb $BSWAP,@MSG0[0] movdqu 0x20(@ptr[1]),@MSG1[2] pshufb $BSWAP,@MSG1[0] movdqu 0x30(@ptr[0]),@MSG0[3] lea 0x40(@ptr[0]),@ptr[0] pshufb $BSWAP,@MSG0[1] movdqu 0x30(@ptr[1]),@MSG1[3] lea 0x40(@ptr[1]),@ptr[1] pshufb $BSWAP,@MSG1[1] movdqa $E0,0x50(%rsp) # offload paddd @MSG0[0],$E0 movdqa $E1,0x70(%rsp) paddd @MSG1[0],$E1 movdqa $ABCD0,0x40(%rsp) # offload movdqa $ABCD0,$E0_ movdqa $ABCD1,0x60(%rsp) movdqa $ABCD1,$E1_ sha1rnds4 \$0,$E0,$ABCD0 # 0-3 sha1nexte @MSG0[1],$E0_ sha1rnds4 \$0,$E1,$ABCD1 # 0-3 sha1nexte @MSG1[1],$E1_ pshufb $BSWAP,@MSG0[2] 
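	# note: this inner loop drives two independent buffers in lock-step;
	# the MSG0/ABCD0/E0 registers carry the first buffer and the
	# MSG1/ABCD1/E1 registers carry the second, so the two sha1rnds4
	# dependency chains overlap in the pipeline instead of serializing
	# (the outer-loop count was adjusted above because, as noted there,
	# buffers are handled as pairs here).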
prefetcht0 127(@ptr[0]) sha1msg1 @MSG0[1],@MSG0[0] pshufb $BSWAP,@MSG1[2] prefetcht0 127(@ptr[1]) sha1msg1 @MSG1[1],@MSG1[0] pshufb $BSWAP,@MSG0[3] movdqa $ABCD0,$E0 pshufb $BSWAP,@MSG1[3] movdqa $ABCD1,$E1 sha1rnds4 \$0,$E0_,$ABCD0 # 4-7 sha1nexte @MSG0[2],$E0 sha1rnds4 \$0,$E1_,$ABCD1 # 4-7 sha1nexte @MSG1[2],$E1 pxor @MSG0[2],@MSG0[0] sha1msg1 @MSG0[2],@MSG0[1] pxor @MSG1[2],@MSG1[0] sha1msg1 @MSG1[2],@MSG1[1] ___ for($i=2;$i<20-4;$i++) { $code.=<<___; movdqa $ABCD0,$E0_ movdqa $ABCD1,$E1_ sha1rnds4 \$`int($i/5)`,$E0,$ABCD0 # 8-11 sha1nexte @MSG0[3],$E0_ sha1rnds4 \$`int($i/5)`,$E1,$ABCD1 # 8-11 sha1nexte @MSG1[3],$E1_ sha1msg2 @MSG0[3],@MSG0[0] sha1msg2 @MSG1[3],@MSG1[0] pxor @MSG0[3],@MSG0[1] sha1msg1 @MSG0[3],@MSG0[2] pxor @MSG1[3],@MSG1[1] sha1msg1 @MSG1[3],@MSG1[2] ___ ($E0,$E0_)=($E0_,$E0); ($E1,$E1_)=($E1_,$E1); push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); } $code.=<<___; movdqa $ABCD0,$E0_ movdqa $ABCD1,$E1_ sha1rnds4 \$3,$E0,$ABCD0 # 64-67 sha1nexte @MSG0[3],$E0_ sha1rnds4 \$3,$E1,$ABCD1 # 64-67 sha1nexte @MSG1[3],$E1_ sha1msg2 @MSG0[3],@MSG0[0] sha1msg2 @MSG1[3],@MSG1[0] pxor @MSG0[3],@MSG0[1] pxor @MSG1[3],@MSG1[1] mov \$1,%ecx pxor @MSG0[2],@MSG0[2] # zero cmp 4*0(%rbx),%ecx # examine counters cmovge %rsp,@ptr[0] # cancel input movdqa $ABCD0,$E0 movdqa $ABCD1,$E1 sha1rnds4 \$3,$E0_,$ABCD0 # 68-71 sha1nexte @MSG0[0],$E0 sha1rnds4 \$3,$E1_,$ABCD1 # 68-71 sha1nexte @MSG1[0],$E1 sha1msg2 @MSG0[0],@MSG0[1] sha1msg2 @MSG1[0],@MSG1[1] cmp 4*1(%rbx),%ecx cmovge %rsp,@ptr[1] movq (%rbx),@MSG0[0] # pull counters movdqa $ABCD0,$E0_ movdqa $ABCD1,$E1_ sha1rnds4 \$3,$E0,$ABCD0 # 72-75 sha1nexte @MSG0[1],$E0_ sha1rnds4 \$3,$E1,$ABCD1 # 72-75 sha1nexte @MSG1[1],$E1_ pshufd \$0x00,@MSG0[0],@MSG1[2] pshufd \$0x55,@MSG0[0],@MSG1[3] movdqa @MSG0[0],@MSG0[1] pcmpgtd @MSG0[2],@MSG1[2] pcmpgtd @MSG0[2],@MSG1[3] movdqa $ABCD0,$E0 movdqa $ABCD1,$E1 sha1rnds4 \$3,$E0_,$ABCD0 # 76-79 sha1nexte $MSG0[2],$E0 sha1rnds4 \$3,$E1_,$ABCD1 # 76-79 sha1nexte $MSG0[2],$E1 pcmpgtd @MSG0[2],@MSG0[1] # counter mask pand @MSG1[2],$ABCD0 pand @MSG1[2],$E0 pand @MSG1[3],$ABCD1 pand @MSG1[3],$E1 paddd @MSG0[1],@MSG0[0] # counters-- paddd 0x40(%rsp),$ABCD0 paddd 0x50(%rsp),$E0 paddd 0x60(%rsp),$ABCD1 paddd 0x70(%rsp),$E1 movq @MSG0[0],(%rbx) # save counters dec $num jnz .Loop_shaext mov `$REG_SZ*17+8`(%rsp),$num pshufd \$0b00011011,$ABCD0,$ABCD0 pshufd \$0b00011011,$ABCD1,$ABCD1 movdqa $ABCD0,@MSG0[0] punpckldq $ABCD1,$ABCD0 # b1.b0.a1.a0 punpckhdq $ABCD1,@MSG0[0] # d1.d0.c1.c0 punpckhdq $E1,$E0 # e1.e0.xx.xx movq $ABCD0,0x00-0x40($ctx) # a1.a0 psrldq \$8,$ABCD0 movq @MSG0[0],0x40-0x40($ctx)# c1.c0 psrldq \$8,@MSG0[0] movq $ABCD0,0x20-0x40($ctx) # b1.b0 psrldq \$8,$E0 movq @MSG0[0],0x60-0x40($ctx)# d1.d0 movq $E0,0x80-0x40($ctx) # e1.e0 lea `$REG_SZ/2`($ctx),$ctx lea `16*2`($inp),$inp dec $num jnz .Loop_grande_shaext .Ldone_shaext: #mov `$REG_SZ*17`(%rsp),%rax # original %rsp ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 movaps -0xa8(%rax),%xmm7 movaps -0x98(%rax),%xmm8 movaps -0x88(%rax),%xmm9 movaps -0x78(%rax),%xmm10 movaps -0x68(%rax),%xmm11 movaps -0x58(%rax),%xmm12 movaps -0x48(%rax),%xmm13 movaps -0x38(%rax),%xmm14 movaps -0x28(%rax),%xmm15 ___ $code.=<<___; mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_shaext: ret .cfi_endproc .size sha1_multi_block_shaext,.-sha1_multi_block_shaext ___ }}} if ($avx) {{{ sub BODY_00_19_avx { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $k=$i+2; my $vpack = $REG_SZ==16 ? 
"vpunpckldq" : "vinserti128"; my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4]; $code.=<<___ if ($i==0 && $REG_SZ==16); vmovd (@ptr[0]),@Xi[0] lea `16*4`(@ptr[0]),@ptr[0] vmovd (@ptr[1]),@Xi[2] # borrow Xi[2] lea `16*4`(@ptr[1]),@ptr[1] vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] lea `16*4`(@ptr[2]),@ptr[2] vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2] lea `16*4`(@ptr[3]),@ptr[3] vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] vpunpckldq @Xi[2],@Xi[0],@Xi[0] vmovd `4*$j-16*4`($ptr_n),$t3 vpshufb $tx,@Xi[0],@Xi[0] ___ $code.=<<___ if ($i<15 && $REG_SZ==16); # just load input vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3 ___ $code.=<<___ if ($i==0 && $REG_SZ==32); vmovd (@ptr[0]),@Xi[0] lea `16*4`(@ptr[0]),@ptr[0] vmovd (@ptr[4]),@Xi[2] # borrow Xi[2] lea `16*4`(@ptr[4]),@ptr[4] vmovd (@ptr[1]),$t2 lea `16*4`(@ptr[1]),@ptr[1] vmovd (@ptr[5]),$t1 lea `16*4`(@ptr[5]),@ptr[5] vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0] lea `16*4`(@ptr[2]),@ptr[2] vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2] lea `16*4`(@ptr[6]),@ptr[6] vpinsrd \$1,(@ptr[3]),$t2,$t2 lea `16*4`(@ptr[3]),@ptr[3] vpunpckldq $t2,@Xi[0],@Xi[0] vpinsrd \$1,(@ptr[7]),$t1,$t1 lea `16*4`(@ptr[7]),@ptr[7] vpunpckldq $t1,@Xi[2],@Xi[2] vmovd `4*$j-16*4`(@ptr[0]),@Xi[1] vinserti128 @Xi[2],@Xi[0],@Xi[0] vmovd `4*$j-16*4`($ptr_n),$t3 vpshufb $tx,@Xi[0],@Xi[0] ___ $code.=<<___ if ($i<15 && $REG_SZ==32); # just load input vmovd `4*$j-16*4`(@ptr[1]),$t2 vmovd `4*$j-16*4`(@ptr[5]),$t1 vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1] vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3 vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2 vpunpckldq $t2,@Xi[1],@Xi[1] vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1 vpunpckldq $t1,$t3,$t3 ___ $code.=<<___ if ($i<14); vpaddd $K,$e,$e # e+=K_00_19 vpslld \$5,$a,$t2 vpandn $d,$b,$t1 vpand $c,$b,$t0 vmovdqa @Xi[0],`&Xi_off($i)` vpaddd @Xi[0],$e,$e # e+=X[i] $vpack $t3,@Xi[1],@Xi[1] vpsrld \$27,$a,$t3 vpxor $t1,$t0,$t0 # Ch(b,c,d) vmovd `4*$k-16*4`(@ptr[0]),@Xi[2] vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) vmovd `4*$k-16*4`($ptr_n),$t3 vpaddd $t0,$e,$e # e+=Ch(b,c,d) vpsrld \$2,$b,$b vpaddd $t2,$e,$e # e+=rol(a,5) vpshufb $tx,@Xi[1],@Xi[1] vpor $t1,$b,$b # b=rol(b,30) ___ $code.=<<___ if ($i==14); vpaddd $K,$e,$e # e+=K_00_19 prefetcht0 63(@ptr[0]) vpslld \$5,$a,$t2 vpandn $d,$b,$t1 vpand $c,$b,$t0 vmovdqa @Xi[0],`&Xi_off($i)` vpaddd @Xi[0],$e,$e # e+=X[i] $vpack $t3,@Xi[1],@Xi[1] vpsrld \$27,$a,$t3 prefetcht0 63(@ptr[1]) vpxor $t1,$t0,$t0 # Ch(b,c,d) vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) prefetcht0 63(@ptr[2]) vpaddd $t0,$e,$e # e+=Ch(b,c,d) vpsrld \$2,$b,$b vpaddd $t2,$e,$e # e+=rol(a,5) prefetcht0 63(@ptr[3]) vpshufb $tx,@Xi[1],@Xi[1] vpor $t1,$b,$b # b=rol(b,30) ___ $code.=<<___ if ($i>=13 && $i<15); vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]" ___ $code.=<<___ if ($i>=15); # apply Xupdate vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" vpaddd $K,$e,$e # e+=K_00_19 vpslld \$5,$a,$t2 vpandn $d,$b,$t1 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` vpand $c,$b,$t0 vmovdqa @Xi[0],`&Xi_off($i)` vpaddd @Xi[0],$e,$e # e+=X[i] vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] vpsrld \$27,$a,$t3 vpxor $t1,$t0,$t0 # Ch(b,c,d) vpxor @Xi[3],@Xi[1],@Xi[1] `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) vpaddd $t0,$e,$e # e+=Ch(b,c,d) `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` vpsrld \$31,@Xi[1],$tx vpaddd @Xi[1],@Xi[1],@Xi[1] vpsrld \$2,$b,$b `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` vpaddd $t2,$e,$e # e+=rol(a,5) vpor $tx,@Xi[1],@Xi[1] # rol 
\$1,@Xi[1] vpor $t1,$b,$b # b=rol(b,30) ___ push(@Xi,shift(@Xi)); } sub BODY_20_39_avx { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___ if ($i<79); vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" vpslld \$5,$a,$t2 vpaddd $K,$e,$e # e+=K_20_39 vpxor $b,$d,$t0 ___ $code.=<<___ if ($i<72); vmovdqa @Xi[0],`&Xi_off($i)` ___ $code.=<<___ if ($i<79); vpaddd @Xi[0],$e,$e # e+=X[i] vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] vpsrld \$27,$a,$t3 vpxor $c,$t0,$t0 # Parity(b,c,d) vpxor @Xi[3],@Xi[1],@Xi[1] vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) vpaddd $t0,$e,$e # e+=Parity(b,c,d) vpsrld \$31,@Xi[1],$tx vpaddd @Xi[1],@Xi[1],@Xi[1] vpsrld \$2,$b,$b vpaddd $t2,$e,$e # e+=rol(a,5) vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1) vpor $t1,$b,$b # b=rol(b,30) ___ $code.=<<___ if ($i==79); vpslld \$5,$a,$t2 vpaddd $K,$e,$e # e+=K_20_39 vpxor $b,$d,$t0 vpsrld \$27,$a,$t3 vpaddd @Xi[0],$e,$e # e+=X[i] vpxor $c,$t0,$t0 # Parity(b,c,d) vpslld \$30,$b,$t1 vpor $t3,$t2,$t2 # rol(a,5) vpaddd $t0,$e,$e # e+=Parity(b,c,d) vpsrld \$2,$b,$b vpaddd $t2,$e,$e # e+=rol(a,5) vpor $t1,$b,$b # b=rol(b,30) ___ push(@Xi,shift(@Xi)); } sub BODY_40_59_avx { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]" vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]" vpaddd $K,$e,$e # e+=K_40_59 vpslld \$5,$a,$t2 vpand $c,$d,$t1 vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1] vpaddd $t1,$e,$e vpsrld \$27,$a,$t3 vpxor $c,$d,$t0 vpxor @Xi[3],@Xi[1],@Xi[1] vmovdqu @Xi[0],`&Xi_off($i)` vpaddd @Xi[0],$e,$e # e+=X[i] vpor $t3,$t2,$t2 # rol(a,5) vpsrld \$31,@Xi[1],$tx vpand $b,$t0,$t0 vpaddd @Xi[1],@Xi[1],@Xi[1] vpslld \$30,$b,$t1 vpaddd $t0,$e,$e # e+=Maj(b,d,c) vpsrld \$2,$b,$b vpaddd $t2,$e,$e # e+=rol(a,5) vpor $tx,@Xi[1],@Xi[1] # rol(@X[1],1) vpor $t1,$b,$b # b=rol(b,30) ___ push(@Xi,shift(@Xi)); } $code.=<<___; .type sha1_multi_block_avx,\@function,3 .align 32 sha1_multi_block_avx: .cfi_startproc _avx_shortcut: ___ $code.=<<___ if ($avx>1); shr \$32,%rcx cmp \$2,$num jb .Lavx test \$`1<<5`,%ecx jnz _avx2_shortcut jmp .Lavx .align 32 .Lavx: ___ $code.=<<___; mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,-0x78(%rax) movaps %xmm11,-0x68(%rax) movaps %xmm12,-0x58(%rax) movaps %xmm13,-0x48(%rax) movaps %xmm14,-0x38(%rax) movaps %xmm15,-0x28(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody_avx: lea K_XX_XX(%rip),$Tbl lea `$REG_SZ*16`(%rsp),%rbx vzeroupper .Loop_grande_avx: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldone_avx vmovdqu 0x00($ctx),$A # load context lea 128(%rsp),%rax vmovdqu 0x20($ctx),$B vmovdqu 0x40($ctx),$C vmovdqu 0x60($ctx),$D vmovdqu 0x80($ctx),$E vmovdqu 0x60($Tbl),$tx # pbswap_mask jmp .Loop_avx .align 32 .Loop_avx: ___ $code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } $code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } $code.=" 
vmovdqa 0x20($Tbl),$K\n"; # K_40_59 for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } $code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; mov \$1,%ecx ___ for($i=0;$i<4;$i++) { $code.=<<___; cmp `4*$i`(%rbx),%ecx # examine counters cmovge $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; vmovdqu (%rbx),$t0 # pull counters vpxor $t2,$t2,$t2 vmovdqa $t0,$t1 vpcmpgtd $t2,$t1,$t1 # mask value vpaddd $t1,$t0,$t0 # counters-- vpand $t1,$A,$A vpand $t1,$B,$B vpaddd 0x00($ctx),$A,$A vpand $t1,$C,$C vpaddd 0x20($ctx),$B,$B vpand $t1,$D,$D vpaddd 0x40($ctx),$C,$C vpand $t1,$E,$E vpaddd 0x60($ctx),$D,$D vpaddd 0x80($ctx),$E,$E vmovdqu $A,0x00($ctx) vmovdqu $B,0x20($ctx) vmovdqu $C,0x40($ctx) vmovdqu $D,0x60($ctx) vmovdqu $E,0x80($ctx) vmovdqu $t0,(%rbx) # save counters vmovdqu 0x60($Tbl),$tx # pbswap_mask dec $num jnz .Loop_avx mov `$REG_SZ*17+8`(%rsp),$num lea $REG_SZ($ctx),$ctx lea `16*$REG_SZ/4`($inp),$inp dec $num jnz .Loop_grande_avx .Ldone_avx: mov `$REG_SZ*17`(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 vzeroupper ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 movaps -0xa8(%rax),%xmm7 movaps -0x98(%rax),%xmm8 movaps -0x88(%rax),%xmm9 movaps -0x78(%rax),%xmm10 movaps -0x68(%rax),%xmm11 movaps -0x58(%rax),%xmm12 movaps -0x48(%rax),%xmm13 movaps -0x38(%rax),%xmm14 movaps -0x28(%rax),%xmm15 ___ $code.=<<___; mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size sha1_multi_block_avx,.-sha1_multi_block_avx ___ if ($avx>1) { $code =~ s/\`([^\`]*)\`/eval $1/gem; $REG_SZ=32; @ptr=map("%r$_",(12..15,8..11)); @V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4)); ($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9)); @Xi=map("%ymm$_",(10..14)); $K="%ymm15"; $code.=<<___; .type sha1_multi_block_avx2,\@function,3 .align 32 sha1_multi_block_avx2: .cfi_startproc _avx2_shortcut: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) movaps %xmm12,-0x78(%rax) movaps %xmm13,-0x68(%rax) movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody_avx2: lea K_XX_XX(%rip),$Tbl shr \$1,$num vzeroupper .Loop_grande_avx2: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num lea `$REG_SZ*16`(%rsp),%rbx ___ for($i=0;$i<8;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; vmovdqu 0x00($ctx),$A # load context lea 128(%rsp),%rax vmovdqu 0x20($ctx),$B lea 256+128(%rsp),%rbx vmovdqu 0x40($ctx),$C vmovdqu 0x60($ctx),$D vmovdqu 0x80($ctx),$E vmovdqu 0x60($Tbl),$tx # pbswap_mask jmp .Loop_avx2 .align 32 .Loop_avx2: ___ $code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19 for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); } $code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39 for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } $code.=" vmovdqa 
0x20($Tbl),$K\n"; # K_40_59 for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); } $code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79 for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; mov \$1,%ecx lea `$REG_SZ*16`(%rsp),%rbx ___ for($i=0;$i<8;$i++) { $code.=<<___; cmp `4*$i`(%rbx),%ecx # examine counters cmovge $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; vmovdqu (%rbx),$t0 # pull counters vpxor $t2,$t2,$t2 vmovdqa $t0,$t1 vpcmpgtd $t2,$t1,$t1 # mask value vpaddd $t1,$t0,$t0 # counters-- vpand $t1,$A,$A vpand $t1,$B,$B vpaddd 0x00($ctx),$A,$A vpand $t1,$C,$C vpaddd 0x20($ctx),$B,$B vpand $t1,$D,$D vpaddd 0x40($ctx),$C,$C vpand $t1,$E,$E vpaddd 0x60($ctx),$D,$D vpaddd 0x80($ctx),$E,$E vmovdqu $A,0x00($ctx) vmovdqu $B,0x20($ctx) vmovdqu $C,0x40($ctx) vmovdqu $D,0x60($ctx) vmovdqu $E,0x80($ctx) vmovdqu $t0,(%rbx) # save counters lea 256+128(%rsp),%rbx vmovdqu 0x60($Tbl),$tx # pbswap_mask dec $num jnz .Loop_avx2 #mov `$REG_SZ*17+8`(%rsp),$num #lea $REG_SZ($ctx),$ctx #lea `16*$REG_SZ/4`($inp),$inp #dec $num #jnz .Loop_grande_avx2 .Ldone_avx2: mov `$REG_SZ*17`(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx2: ret .cfi_endproc .size sha1_multi_block_avx2,.-sha1_multi_block_avx2 ___ } }}} $code.=<<___; .align 256 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 K_XX_XX: .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .asciz "SHA1 multi-block transform for x86_64, CRYPTOGAMS by " ___ if ($win64) { # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->Rip<.Lbody jb .Lin_prologue mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lin_prologue mov `16*17`(%rax),%rax # pull saved stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp mov 
%rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp lea -24-10*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler ___ $code.=<<___ if ($avx>1); .type avx2_handler,\@abi-omnipotent .align 16 avx2_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue mov `32*17`($context),%rax # pull saved stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 lea -56-10*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq jmp .Lin_prologue .size avx2_handler,.-avx2_handler ___ $code.=<<___; .section .pdata .align 4 .rva .LSEH_begin_sha1_multi_block .rva .LSEH_end_sha1_multi_block .rva .LSEH_info_sha1_multi_block .rva .LSEH_begin_sha1_multi_block_shaext .rva .LSEH_end_sha1_multi_block_shaext .rva .LSEH_info_sha1_multi_block_shaext ___ $code.=<<___ if ($avx); .rva .LSEH_begin_sha1_multi_block_avx .rva .LSEH_end_sha1_multi_block_avx .rva .LSEH_info_sha1_multi_block_avx ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_sha1_multi_block_avx2 .rva .LSEH_end_sha1_multi_block_avx2 .rva .LSEH_info_sha1_multi_block_avx2 ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_sha1_multi_block: .byte 9,0,0,0 .rva se_handler .rva .Lbody,.Lepilogue # HandlerData[] .LSEH_info_sha1_multi_block_shaext: .byte 9,0,0,0 .rva se_handler .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[] ___ $code.=<<___ if ($avx); .LSEH_info_sha1_multi_block_avx: .byte 9,0,0,0 .rva se_handler .rva .Lbody_avx,.Lepilogue_avx # HandlerData[] ___ $code.=<<___ if ($avx>1); .LSEH_info_sha1_multi_block_avx2: .byte 9,0,0,0 .rva avx2_handler .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[] ___ } #################################################################### sub rex { local *opcode=shift; my 
($dst,$src)=@_; my $rex=0; $rex|=0x04 if ($dst>=8); $rex|=0x01 if ($src>=8); unshift @opcode,$rex|0x40 if ($rex); } sub sha1rnds4 { if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x0f,0x3a,0xcc); rex(\@opcode,$3,$2); push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M my $c=$1; push @opcode,$c=~/^0/?oct($c):$c; return ".byte\t".join(',',@opcode); } else { return "sha1rnds4\t".@_[0]; } } sub sha1op38 { my $instr = shift; my %opcodelet = ( "sha1nexte" => 0xc8, "sha1msg1" => 0xc9, "sha1msg2" => 0xca ); if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x0f,0x38); rex(\@opcode,$2,$1); push @opcode,$opcodelet{$instr}; push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } else { return $instr."\t".@_[0]; } } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo or s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/sha/asm/sha1-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/sha/asm/sha1-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/sha/asm/sha1-x86_64.pl (revision 364963) @@ -1,2132 +1,2132 @@ #! /usr/bin/env perl # Copyright 2006-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # sha1_block procedure for x86_64. # # It was brought to my attention that on EM64T compiler-generated code # was far behind 32-bit assembler implementation. This is unlike on # Opteron where compiler-generated code was only 15% behind 32-bit # assembler, which originally made it hard to motivate the effort. # There was suggestion to mechanically translate 32-bit code, but I # dismissed it, reasoning that x86_64 offers enough register bank # capacity to fully utilize SHA-1 parallelism. Therefore this fresh # implementation:-) However! While 64-bit code does perform better # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well, # x86_64 does offer larger *addressable* bank, but out-of-order core # reaches for even more registers through dynamic aliasing, and EM64T # core must have managed to run-time optimize even 32-bit code just as # good as 64-bit one. Performance improvement is summarized in the # following table: # # gcc 3.4 32-bit asm cycles/byte # Opteron +45% +20% 6.8 # Xeon P4 +65% +0% 9.9 # Core2 +60% +10% 7.0 # August 2009. 
# # The code was revised to minimize code size and to maximize # "distance" between instructions producing input to 'lea' # instruction and the 'lea' instruction itself, which is essential # for Intel Atom core. # October 2010. # # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it # is to offload message schedule denoted by Wt in NIST specification, # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module # for background and implementation details. The only difference from # 32-bit code is that 64-bit code doesn't have to spill @X[] elements # to free temporary registers. # April 2011. # # Add AVX code path. See sha1-586.pl for further information. # May 2013. # # Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions # and loading pair of consecutive blocks to 256-bit %ymm registers) # did not provide impressive performance improvement till a crucial # hint regarding the number of Xupdate iterations to pre-compute in # advance was provided by Ilya Albrekht of Intel Corp. # March 2014. # # Add support for Intel SHA Extensions. ###################################################################### # Current performance is summarized in following table. Numbers are # CPU clock cycles spent to process single byte (less is better). # # x86_64 SSSE3 AVX[2] # P4 9.05 - # Opteron 6.26 - # Core2 6.55 6.05/+8% - # Westmere 6.73 5.30/+27% - # Sandy Bridge 7.70 6.10/+26% 4.99/+54% # Ivy Bridge 6.06 4.67/+30% 4.60/+32% # Haswell 5.45 4.15/+31% 3.57/+53% # Skylake 5.18 4.06/+28% 3.54/+46% # Bulldozer 9.11 5.95/+53% # Ryzen 4.75 3.80/+24% 1.93/+150%(**) # VIA Nano 9.32 7.15/+30% # Atom 10.3 9.17/+12% # Silvermont 13.1(*) 9.37/+40% # Knights L 13.2(*) 9.68/+36% 8.30/+59% # Goldmont 8.13 6.42/+27% 1.70/+380%(**) # # (*) obviously suboptimal result, nothing was done about it, # because SSSE3 code is compiled unconditionally; # (**) SHAEXT result $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } $shaext=1; ### set to zero if compiling for 1.0.1 $avx=1 if (!$shaext && $avx); open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; $ctx="%rdi"; # 1st arg $inp="%rsi"; # 2nd arg $num="%rdx"; # 3rd arg # reassign arguments in order to produce more compact code $ctx="%r8"; $inp="%r9"; $num="%r10"; $t0="%eax"; $t1="%ebx"; $t2="%ecx"; @xi=("%edx","%ebp","%r14d"); $A="%esi"; $B="%edi"; $C="%r11d"; $D="%r12d"; $E="%r13d"; @V=($A,$B,$C,$D,$E); sub BODY_00_19 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___ if ($i==0); mov `4*$i`($inp),$xi[0] bswap $xi[0] ___ $code.=<<___ if ($i<15); mov 
`4*$j`($inp),$xi[1] mov $d,$t0 mov $xi[0],`4*$i`(%rsp) mov $a,$t2 bswap $xi[1] xor $c,$t0 rol \$5,$t2 and $b,$t0 lea 0x5a827999($xi[0],$e),$e add $t2,$e xor $d,$t0 rol \$30,$b add $t0,$e ___ $code.=<<___ if ($i>=15); xor `4*($j%16)`(%rsp),$xi[1] mov $d,$t0 mov $xi[0],`4*($i%16)`(%rsp) mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $c,$t0 rol \$5,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] and $b,$t0 lea 0x5a827999($xi[0],$e),$e rol \$30,$b xor $d,$t0 add $t2,$e rol \$1,$xi[1] add $t0,$e ___ push(@xi,shift(@xi)); } sub BODY_20_39 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; my $K=($i<40)?0x6ed9eba1:0xca62c1d6; $code.=<<___ if ($i<79); xor `4*($j%16)`(%rsp),$xi[1] mov $b,$t0 `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)` mov $a,$t2 xor `4*(($j+2)%16)`(%rsp),$xi[1] xor $d,$t0 rol \$5,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] lea $K($xi[0],$e),$e xor $c,$t0 add $t2,$e rol \$30,$b add $t0,$e rol \$1,$xi[1] ___ $code.=<<___ if ($i==79); mov $b,$t0 mov $a,$t2 xor $d,$t0 lea $K($xi[0],$e),$e rol \$5,$t2 xor $c,$t0 add $t2,$e rol \$30,$b add $t0,$e ___ push(@xi,shift(@xi)); } sub BODY_40_59 { my ($i,$a,$b,$c,$d,$e)=@_; my $j=$i+1; $code.=<<___; xor `4*($j%16)`(%rsp),$xi[1] mov $d,$t0 mov $xi[0],`4*($i%16)`(%rsp) mov $d,$t1 xor `4*(($j+2)%16)`(%rsp),$xi[1] and $c,$t0 mov $a,$t2 xor `4*(($j+8)%16)`(%rsp),$xi[1] lea 0x8f1bbcdc($xi[0],$e),$e xor $c,$t1 rol \$5,$t2 add $t0,$e rol \$1,$xi[1] and $b,$t1 add $t2,$e rol \$30,$b add $t1,$e ___ push(@xi,shift(@xi)); } $code.=<<___; .text .extern OPENSSL_ia32cap_P .globl sha1_block_data_order .type sha1_block_data_order,\@function,3 .align 16 sha1_block_data_order: .cfi_startproc mov OPENSSL_ia32cap_P+0(%rip),%r9d mov OPENSSL_ia32cap_P+4(%rip),%r8d mov OPENSSL_ia32cap_P+8(%rip),%r10d test \$`1<<9`,%r8d # check SSSE3 bit jz .Lialu ___ $code.=<<___ if ($shaext); test \$`1<<29`,%r10d # check SHA bit jnz _shaext_shortcut ___ $code.=<<___ if ($avx>1); and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2 cmp \$`1<<3|1<<5|1<<8`,%r10d je _avx2_shortcut ___ $code.=<<___ if ($avx); and \$`1<<28`,%r8d # mask AVX bit and \$`1<<30`,%r9d # mask "Intel CPU" bit or %r9d,%r8d cmp \$`1<<28|1<<30`,%r8d je _avx_shortcut ___ $code.=<<___; jmp _ssse3_shortcut .align 16 .Lialu: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 mov %rdi,$ctx # reassigned argument sub \$`8+16*4`,%rsp mov %rsi,$inp # reassigned argument and \$-64,%rsp mov %rdx,$num # reassigned argument mov %rax,`16*4`(%rsp) .cfi_cfa_expression %rsp+64,deref,+8 .Lprologue: mov 0($ctx),$A mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov 16($ctx),$E jmp .Lloop .align 16 .Lloop: ___ for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); } for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); } for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); } $code.=<<___; add 0($ctx),$A add 4($ctx),$B add 8($ctx),$C add 12($ctx),$D add 16($ctx),$E mov $A,0($ctx) mov $B,4($ctx) mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) sub \$1,$num lea `16*4`($inp),$inp jnz .Lloop mov `16*4`(%rsp),%rsi .cfi_def_cfa %rsi,8 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue: ret .cfi_endproc .size sha1_block_data_order,.-sha1_block_data_order ___ if ($shaext) {{{ 
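# For reference, the scalar BODY_00_19/BODY_20_39/BODY_40_59 rounds above (and
# the specialized paths that follow) all evaluate the three standard SHA-1
# boolean functions via the xor/and identities noted in their comments.  A
# minimal plain-Perl sketch of those identities (illustrative only -- the name
# sha1_F_reference is ours and is never called by this generator):
sub sha1_F_reference {
	my ($t,$b,$c,$d)=@_;			# round index and the b,c,d words
	return $d^(($c^$d)&$b)      if ($t<20);	# Ch:  ((c^d)&b)^d,     K=0x5a827999
	return $b^$c^$d             if ($t<40);	# Parity: b^c^d,        K=0x6ed9eba1
	return $c^(($b^$c)&($c^$d)) if ($t<60);	# Maj: ((b^c)&(c^d))^c, K=0x8f1bbcdc
	return $b^$c^$d;			# Parity: b^c^d,        K=0xca62c1d6
}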
###################################################################### # Intel SHA Extensions implementation of SHA1 update function. # my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx"); my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9)); my @MSG=map("%xmm$_",(4..7)); $code.=<<___; .type sha1_block_data_order_shaext,\@function,3 .align 32 sha1_block_data_order_shaext: _shaext_shortcut: .cfi_startproc ___ $code.=<<___ if ($win64); lea `-8-4*16`(%rsp),%rsp movaps %xmm6,-8-4*16(%rax) movaps %xmm7,-8-3*16(%rax) movaps %xmm8,-8-2*16(%rax) movaps %xmm9,-8-1*16(%rax) .Lprologue_shaext: ___ $code.=<<___; movdqu ($ctx),$ABCD movd 16($ctx),$E movdqa K_XX_XX+0xa0(%rip),$BSWAP # byte-n-word swap movdqu ($inp),@MSG[0] pshufd \$0b00011011,$ABCD,$ABCD # flip word order movdqu 0x10($inp),@MSG[1] pshufd \$0b00011011,$E,$E # flip word order movdqu 0x20($inp),@MSG[2] pshufb $BSWAP,@MSG[0] movdqu 0x30($inp),@MSG[3] pshufb $BSWAP,@MSG[1] pshufb $BSWAP,@MSG[2] movdqa $E,$E_SAVE # offload $E pshufb $BSWAP,@MSG[3] jmp .Loop_shaext .align 16 .Loop_shaext: dec $num lea 0x40($inp),%r8 # next input block paddd @MSG[0],$E cmovne %r8,$inp movdqa $ABCD,$ABCD_SAVE # offload $ABCD ___ for($i=0;$i<20-4;$i+=2) { $code.=<<___; sha1msg1 @MSG[1],@MSG[0] movdqa $ABCD,$E_ sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3... sha1nexte @MSG[1],$E_ pxor @MSG[2],@MSG[0] sha1msg1 @MSG[2],@MSG[1] sha1msg2 @MSG[3],@MSG[0] movdqa $ABCD,$E sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD sha1nexte @MSG[2],$E pxor @MSG[3],@MSG[1] sha1msg2 @MSG[0],@MSG[1] ___ push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG)); } $code.=<<___; movdqu ($inp),@MSG[0] movdqa $ABCD,$E_ sha1rnds4 \$3,$E,$ABCD # 64-67 sha1nexte @MSG[1],$E_ movdqu 0x10($inp),@MSG[1] pshufb $BSWAP,@MSG[0] movdqa $ABCD,$E sha1rnds4 \$3,$E_,$ABCD # 68-71 sha1nexte @MSG[2],$E movdqu 0x20($inp),@MSG[2] pshufb $BSWAP,@MSG[1] movdqa $ABCD,$E_ sha1rnds4 \$3,$E,$ABCD # 72-75 sha1nexte @MSG[3],$E_ movdqu 0x30($inp),@MSG[3] pshufb $BSWAP,@MSG[2] movdqa $ABCD,$E sha1rnds4 \$3,$E_,$ABCD # 76-79 sha1nexte $E_SAVE,$E pshufb $BSWAP,@MSG[3] paddd $ABCD_SAVE,$ABCD movdqa $E,$E_SAVE # offload $E jnz .Loop_shaext pshufd \$0b00011011,$ABCD,$ABCD pshufd \$0b00011011,$E,$E movdqu $ABCD,($ctx) movd $E,16($ctx) ___ $code.=<<___ if ($win64); movaps -8-4*16(%rax),%xmm6 movaps -8-3*16(%rax),%xmm7 movaps -8-2*16(%rax),%xmm8 movaps -8-1*16(%rax),%xmm9 mov %rax,%rsp .Lepilogue_shaext: ___ $code.=<<___; ret .cfi_endproc .size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext ___ }}} {{{ my $Xi=4; my @X=map("%xmm$_",(4..7,0..3)); my @Tx=map("%xmm$_",(8..10)); my $Kx="%xmm11"; my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization my @T=("%esi","%edi"); my $j=0; my $rx=0; my $K_XX_XX="%r14"; my $fp="%r11"; my $_rol=sub { &rol(@_) }; my $_ror=sub { &ror(@_) }; { my $sn; sub align32() { ++$sn; $code.=<<___; jmp .Lalign32_$sn # see "Decoded ICache" in manual .align 32 .Lalign32_$sn: ___ } } $code.=<<___; .type sha1_block_data_order_ssse3,\@function,3 .align 16 sha1_block_data_order_ssse3: _ssse3_shortcut: .cfi_startproc mov %rsp,$fp # frame pointer .cfi_def_cfa_register $fp push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 # redundant, done to share Win64 SE handler .cfi_push %r13 push %r14 .cfi_push %r14 lea `-64-($win64?6*16:0)`(%rsp),%rsp ___ $code.=<<___ if ($win64); movaps %xmm6,-40-6*16($fp) movaps %xmm7,-40-5*16($fp) movaps %xmm8,-40-4*16($fp) movaps %xmm9,-40-3*16($fp) movaps %xmm10,-40-2*16($fp) movaps %xmm11,-40-1*16($fp) .Lprologue_ssse3: ___ 
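# The Xupdate_ssse3_* and Xupdate_avx_* helpers defined below produce four
# schedule words per call.  The scalar recurrence they vectorize is the
# standard SHA-1 message expansion; a minimal plain-Perl sketch (illustrative
# only -- sha1_Xupdate_reference is our name and is never called here):
sub sha1_Xupdate_reference {
	my @W=@_;				# W[0..15], 32-bit words of one block
	for (my $t=16;$t<80;$t++) {
		my $x=$W[$t-3]^$W[$t-8]^$W[$t-14]^$W[$t-16];
		$W[$t]=(($x<<1)|($x>>31))&0xffffffff;	# rol(x,1)
	}
	return @W;				# W[0..79]
}
# e.g. my @W80 = sha1_Xupdate_reference(unpack("N16",$block));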
$code.=<<___; and \$-64,%rsp mov %rdi,$ctx # reassigned argument mov %rsi,$inp # reassigned argument mov %rdx,$num # reassigned argument shl \$6,$num add $inp,$num lea K_XX_XX+64(%rip),$K_XX_XX mov 0($ctx),$A # load context mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E mov $C,@T[1] xor $D,@T[1] and @T[1],@T[0] movdqa 64($K_XX_XX),@X[2] # pbswap mask movdqa -64($K_XX_XX),@Tx[1] # K_00_19 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] movdqu 16($inp),@X[-3&7] movdqu 32($inp),@X[-2&7] movdqu 48($inp),@X[-1&7] pshufb @X[2],@X[-4&7] # byte swap pshufb @X[2],@X[-3&7] pshufb @X[2],@X[-2&7] add \$64,$inp paddd @Tx[1],@X[-4&7] # add K_00_19 pshufb @X[2],@X[-1&7] paddd @Tx[1],@X[-3&7] paddd @Tx[1],@X[-2&7] movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU psubd @Tx[1],@X[-4&7] # restore X[] movdqa @X[-3&7],16(%rsp) psubd @Tx[1],@X[-3&7] movdqa @X[-2&7],32(%rsp) psubd @Tx[1],@X[-2&7] jmp .Loop_ssse3 ___ sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop; $arg = "\$$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); # ror &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]); eval(shift(@insns)); &movdqa (@Tx[0],@X[-1&7]); &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); &psrldq (@Tx[0],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); # ror &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); # rol &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &movdqa (@Tx[2],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword &paddd (@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[0],31); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); &movdqa (@Tx[1],@Tx[2]); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[2],30); eval(shift(@insns)); eval(shift(@insns)); # ror &por (@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pslld (@Tx[1],2); &pxor (@X[0],@Tx[2]); eval(shift(@insns)); &movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2 &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79 foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] push(@Tx,shift(@Tx)); } sub Xupdate_ssse3_32_79() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)) if ($Xi==8); &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" eval(shift(@insns)) if ($Xi==8); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); 
eval(shift(@insns)) if (@insns[1] =~ /_ror/); eval(shift(@insns)) if (@insns[0] =~ /_ror/); &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8); eval(shift(@insns)); eval(shift(@insns)); # rol &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); eval(shift(@insns)); if ($Xi%5) { &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX... } else { # ... or load next one &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)"); } eval(shift(@insns)); # ror &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)) if (@insns[0] =~ /_ror/); &movdqa (@Tx[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); # ror eval(shift(@insns)); eval(shift(@insns)); # body_20_39 &pslld (@X[0],2); eval(shift(@insns)); eval(shift(@insns)); &psrld (@Tx[0],30); eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror &por (@X[0],@Tx[0]); # "X[0]"<<<=2 eval(shift(@insns)); eval(shift(@insns)); # body_20_39 eval(shift(@insns)) if (@insns[1] =~ /_rol/); eval(shift(@insns)) if (@insns[0] =~ /_rol/); &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0]) eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] push(@Tx,shift(@Tx)); } sub Xuplast_ssse3_80() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@Tx[1],@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &cmp ($inp,$num); &je (".Ldone_ssse3"); unshift(@Tx,pop(@Tx)); &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask &movdqa (@Tx[1],"-64($K_XX_XX)"); # K_00_19 &movdqu (@X[-4&7],"0($inp)"); # load input &movdqu (@X[-3&7],"16($inp)"); &movdqu (@X[-2&7],"32($inp)"); &movdqu (@X[-1&7],"48($inp)"); &pshufb (@X[-4&7],@X[2]); # byte swap &add ($inp,64); $Xi=0; } sub Xloop_ssse3() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufb (@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[($Xi-4)&7],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psubd (@X[($Xi-4)&7],@Tx[1]); foreach (@insns) { eval; } $Xi++; } sub Xtail_ssse3() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); foreach (@insns) { eval; } } sub body_00_19 () { # ((c^d)&b)^d # on start @T[0]=(c^d)&b return &body_20_39() if ($rx==19); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. 
'&$_ror ($b,$j?7:2)', # $b>>>2 '&xor (@T[0],$d)', '&mov (@T[1],$a)', # $b for next round '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer '&xor ($b,$c)', # $c^$d for next round '&$_rol ($a,5)', '&add ($e,@T[0])', '&and (@T[1],$b)', # ($b&($c^$d)) for next round '&xor ($b,$c)', # restore $b '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub body_20_39 () { # b^d^c # on entry @T[0]=b^d return &body_40_59() if ($rx==39); $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer '&xor (@T[0],$d) if($j==19);'. '&xor (@T[0],$c) if($j> 19)', # ($b^$d^$c) '&mov (@T[1],$a)', # $b for next round '&$_rol ($a,5)', '&add ($e,@T[0])', '&xor (@T[1],$c) if ($j< 79)', # $b^$d for next round '&$_ror ($b,7)', # $b>>>2 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } sub body_40_59 () { # ((b^c)&(c^d))^c # on entry @T[0]=(b^c), (c^=d) $rx++; ( '($a,$b,$c,$d,$e)=@V;'. '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer '&and (@T[0],$c) if ($j>=40)', # (b^c)&(c^d) '&xor ($c,$d) if ($j>=40)', # restore $c '&$_ror ($b,7)', # $b>>>2 '&mov (@T[1],$a)', # $b for next round '&xor (@T[0],$c)', '&$_rol ($a,5)', '&add ($e,@T[0])', '&xor (@T[1],$c) if ($j==59);'. '&xor (@T[1],$b) if ($j< 59)', # b^c for next round '&xor ($b,$c) if ($j< 59)', # c^d for next round '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));' ); } $code.=<<___; .align 16 .Loop_ssse3: ___ &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_16_31(\&body_00_19); &Xupdate_ssse3_32_79(\&body_00_19); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_20_39); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_40_59); &Xupdate_ssse3_32_79(\&body_20_39); &Xuplast_ssse3_80(\&body_20_39); # can jump to "done" $saved_j=$j; @saved_V=@V; &Xloop_ssse3(\&body_20_39); &Xloop_ssse3(\&body_20_39); &Xloop_ssse3(\&body_20_39); $code.=<<___; add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C add 12($ctx),$D mov $A,0($ctx) add 16($ctx),$E mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) mov $C,@T[1] mov $D,12($ctx) xor $D,@T[1] mov $E,16($ctx) and @T[1],@T[0] jmp .Loop_ssse3 .align 16 .Ldone_ssse3: ___ $j=$saved_j; @V=@saved_V; &Xtail_ssse3(\&body_20_39); &Xtail_ssse3(\&body_20_39); &Xtail_ssse3(\&body_20_39); $code.=<<___; add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C mov $A,0($ctx) add 12($ctx),$D mov @T[0],4($ctx) add 16($ctx),$E mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) ___ $code.=<<___ if ($win64); movaps -40-6*16($fp),%xmm6 movaps -40-5*16($fp),%xmm7 movaps -40-4*16($fp),%xmm8 movaps -40-3*16($fp),%xmm9 movaps -40-2*16($fp),%xmm10 movaps -40-1*16($fp),%xmm11 ___ $code.=<<___; mov -40($fp),%r14 .cfi_restore %r14 mov -32($fp),%r13 .cfi_restore %r13 mov -24($fp),%r12 .cfi_restore %r12 mov -16($fp),%rbp .cfi_restore %rbp mov -8($fp),%rbx .cfi_restore %rbx lea ($fp),%rsp .cfi_def_cfa_register %rsp .Lepilogue_ssse3: ret .cfi_endproc .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 ___ if ($avx) { $Xi=4; # reset variables @X=map("%xmm$_",(4..7,0..3)); @Tx=map("%xmm$_",(8..10)); $j=0; $rx=0; my $done_avx_label=".Ldone_avx"; my $_rol=sub { &shld(@_[0],@_) }; my $_ror=sub { &shrd(@_[0],@_) }; $code.=<<___; .type 
sha1_block_data_order_avx,\@function,3 .align 16 sha1_block_data_order_avx: _avx_shortcut: .cfi_startproc mov %rsp,$fp .cfi_def_cfa_register $fp push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 # redundant, done to share Win64 SE handler .cfi_push %r13 push %r14 .cfi_push %r14 lea `-64-($win64?6*16:0)`(%rsp),%rsp vzeroupper ___ $code.=<<___ if ($win64); vmovaps %xmm6,-40-6*16($fp) vmovaps %xmm7,-40-5*16($fp) vmovaps %xmm8,-40-4*16($fp) vmovaps %xmm9,-40-3*16($fp) vmovaps %xmm10,-40-2*16($fp) vmovaps %xmm11,-40-1*16($fp) .Lprologue_avx: ___ $code.=<<___; and \$-64,%rsp mov %rdi,$ctx # reassigned argument mov %rsi,$inp # reassigned argument mov %rdx,$num # reassigned argument shl \$6,$num add $inp,$num lea K_XX_XX+64(%rip),$K_XX_XX mov 0($ctx),$A # load context mov 4($ctx),$B mov 8($ctx),$C mov 12($ctx),$D mov $B,@T[0] # magic seed mov 16($ctx),$E mov $C,@T[1] xor $D,@T[1] and @T[1],@T[0] vmovdqa 64($K_XX_XX),@X[2] # pbswap mask vmovdqa -64($K_XX_XX),$Kx # K_00_19 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3] vmovdqu 16($inp),@X[-3&7] vmovdqu 32($inp),@X[-2&7] vmovdqu 48($inp),@X[-1&7] vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap add \$64,$inp vpshufb @X[2],@X[-3&7],@X[-3&7] vpshufb @X[2],@X[-2&7],@X[-2&7] vpshufb @X[2],@X[-1&7],@X[-1&7] vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 vpaddd $Kx,@X[-3&7],@X[1] vpaddd $Kx,@X[-2&7],@X[2] vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU vmovdqa @X[1],16(%rsp) vmovdqa @X[2],32(%rsp) jmp .Loop_avx ___ sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@Tx[1],$Kx,@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" eval(shift(@insns)); eval(shift(@insns)); &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); &vpsrld (@Tx[0],@X[0],31); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword &vpaddd (@X[0],@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrld (@Tx[1],@Tx[2],30); &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslld (@Tx[2],@Tx[2],2); &vpxor (@X[0],@X[0],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 eval(shift(@insns)); eval(shift(@insns)); &vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xupdate_avx_32_79() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions my ($a,$b,$c,$d,$e); &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" &vpxor (@X[0],@X[0],@X[-4&7]); # 
"X[0]"="X[-32]"^"X[-16]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" eval(shift(@insns)); eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/); &vpaddd (@Tx[1],$Kx,@X[-1&7]); &vmovdqa ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0); eval(shift(@insns)); # ror eval(shift(@insns)); &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol &vpsrld (@Tx[0],@X[0],30); &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); &vpslld (@X[0],@X[0],2); eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # ror eval(shift(@insns)); &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 eval(shift(@insns)); # body_20_39 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # rol eval(shift(@insns)); foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xuplast_avx_80() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); &vpaddd (@Tx[1],$Kx,@X[-1&7]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU foreach (@insns) { eval; } # remaining instructions &cmp ($inp,$num); &je ($done_avx_label); &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask &vmovdqa($Kx,"-64($K_XX_XX)"); # K_00_19 &vmovdqu(@X[-4&7],"0($inp)"); # load input &vmovdqu(@X[-3&7],"16($inp)"); &vmovdqu(@X[-2&7],"32($inp)"); &vmovdqu(@X[-1&7],"48($inp)"); &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap &add ($inp,64); $Xi=0; } sub Xloop_avx() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); eval(shift(@insns)); eval(shift(@insns)); &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU eval(shift(@insns)); eval(shift(@insns)); foreach (@insns) { eval; } $Xi++; } sub Xtail_avx() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); foreach (@insns) { eval; } } $code.=<<___; .align 16 .Loop_avx: ___ &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_16_31(\&body_00_19); &Xupdate_avx_32_79(\&body_00_19); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_20_39); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_40_59); &Xupdate_avx_32_79(\&body_20_39); &Xuplast_avx_80(\&body_20_39); # can jump to "done" $saved_j=$j; @saved_V=@V; &Xloop_avx(\&body_20_39); &Xloop_avx(\&body_20_39); &Xloop_avx(\&body_20_39); $code.=<<___; add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C add 12($ctx),$D mov $A,0($ctx) add 16($ctx),$E 
mov @T[0],4($ctx) mov @T[0],$B # magic seed mov $C,8($ctx) mov $C,@T[1] mov $D,12($ctx) xor $D,@T[1] mov $E,16($ctx) and @T[1],@T[0] jmp .Loop_avx .align 16 $done_avx_label: ___ $j=$saved_j; @V=@saved_V; &Xtail_avx(\&body_20_39); &Xtail_avx(\&body_20_39); &Xtail_avx(\&body_20_39); $code.=<<___; vzeroupper add 0($ctx),$A # update context add 4($ctx),@T[0] add 8($ctx),$C mov $A,0($ctx) add 12($ctx),$D mov @T[0],4($ctx) add 16($ctx),$E mov $C,8($ctx) mov $D,12($ctx) mov $E,16($ctx) ___ $code.=<<___ if ($win64); movaps -40-6*16($fp),%xmm6 movaps -40-5*16($fp),%xmm7 movaps -40-4*16($fp),%xmm8 movaps -40-3*16($fp),%xmm9 movaps -40-2*16($fp),%xmm10 movaps -40-1*16($fp),%xmm11 ___ $code.=<<___; mov -40($fp),%r14 .cfi_restore %r14 mov -32($fp),%r13 .cfi_restore %r13 mov -24($fp),%r12 .cfi_restore %r12 mov -16($fp),%rbp .cfi_restore %rbp mov -8($fp),%rbx .cfi_restore %rbx lea ($fp),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size sha1_block_data_order_avx,.-sha1_block_data_order_avx ___ if ($avx>1) { use integer; $Xi=4; # reset variables @X=map("%ymm$_",(4..7,0..3)); @Tx=map("%ymm$_",(8..10)); $Kx="%ymm11"; $j=0; my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi"); my ($a5,$t0)=("%r12d","%edi"); my ($A,$F,$B,$C,$D,$E)=@ROTX; my $rx=0; my $frame="%r13"; $code.=<<___; .type sha1_block_data_order_avx2,\@function,3 .align 16 sha1_block_data_order_avx2: _avx2_shortcut: .cfi_startproc mov %rsp,$fp .cfi_def_cfa_register $fp push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 vzeroupper ___ $code.=<<___ if ($win64); lea -6*16(%rsp),%rsp vmovaps %xmm6,-40-6*16($fp) vmovaps %xmm7,-40-5*16($fp) vmovaps %xmm8,-40-4*16($fp) vmovaps %xmm9,-40-3*16($fp) vmovaps %xmm10,-40-2*16($fp) vmovaps %xmm11,-40-1*16($fp) .Lprologue_avx2: ___ $code.=<<___; mov %rdi,$ctx # reassigned argument mov %rsi,$inp # reassigned argument mov %rdx,$num # reassigned argument lea -640(%rsp),%rsp shl \$6,$num lea 64($inp),$frame and \$-128,%rsp add $inp,$num lea K_XX_XX+64(%rip),$K_XX_XX mov 0($ctx),$A # load context cmp $num,$frame cmovae $inp,$frame # next or same block mov 4($ctx),$F mov 8($ctx),$C mov 12($ctx),$D mov 16($ctx),$E vmovdqu 64($K_XX_XX),@X[2] # pbswap mask vmovdqu ($inp),%xmm0 vmovdqu 16($inp),%xmm1 vmovdqu 32($inp),%xmm2 vmovdqu 48($inp),%xmm3 lea 64($inp),$inp vinserti128 \$1,($frame),@X[-4&7],@X[-4&7] vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7] vpshufb @X[2],@X[-4&7],@X[-4&7] vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7] vpshufb @X[2],@X[-3&7],@X[-3&7] vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7] vpshufb @X[2],@X[-2&7],@X[-2&7] vmovdqu -64($K_XX_XX),$Kx # K_00_19 vpshufb @X[2],@X[-1&7],@X[-1&7] vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19 vpaddd $Kx,@X[-3&7],@X[1] vmovdqu @X[0],0(%rsp) # X[]+K xfer to IALU vpaddd $Kx,@X[-2&7],@X[2] vmovdqu @X[1],32(%rsp) vpaddd $Kx,@X[-1&7],@X[3] vmovdqu @X[2],64(%rsp) vmovdqu @X[3],96(%rsp) ___ for (;$Xi<8;$Xi++) { # Xupdate_avx2_16_31 use integer; &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" &vpsrld (@Tx[0],@X[0],31); &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword &vpaddd (@X[0],@X[0],@X[0]); &vpsrld (@Tx[1],@Tx[2],30); &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 &vpslld (@Tx[2],@Tx[2],2); 
&vpxor (@X[0],@X[0],@Tx[1]); &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 &vpaddd (@Tx[1],@X[0],$Kx); &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU push(@X,shift(@X)); # "rotate" X[] } $code.=<<___; lea 128(%rsp),$frame jmp .Loop_avx2 .align 32 .Loop_avx2: rorx \$2,$F,$B andn $D,$F,$t0 and $C,$F xor $t0,$F ___ sub bodyx_00_19 () { # 8 instructions, 3 cycles critical path # at start $f=(b&c)^(~b&d), $b>>>=2 return &bodyx_20_39() if ($rx==19); $rx++; ( '($a,$f,$b,$c,$d,$e)=@ROTX;'. '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K '&lea ($frame,"256($frame)") if ($j%32==31);', '&andn ($t0,$a,$c)', # ~b&d for next round '&add ($e,$f)', # e+=(b&c)^(~b&d) '&rorx ($a5,$a,27)', # a<<<5 '&rorx ($f,$a,2)', # b>>>2 for next round '&and ($a,$b)', # b&c for next round '&add ($e,$a5)', # e+=a<<<5 '&xor ($a,$t0);'. # f=(b&c)^(~b&d) for next round 'unshift(@ROTX,pop(@ROTX)); $j++;' ) } sub bodyx_20_39 () { # 7 instructions, 2 cycles critical path # on entry $f=b^c^d, $b>>>=2 return &bodyx_40_59() if ($rx==39); $rx++; ( '($a,$f,$b,$c,$d,$e)=@ROTX;'. '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K '&lea ($frame,"256($frame)") if ($j%32==31);', '&lea ($e,"($e,$f)")', # e+=b^c^d '&rorx ($a5,$a,27)', # a<<<5 '&rorx ($f,$a,2) if ($j<79)', # b>>>2 in next round '&xor ($a,$b) if ($j<79)', # b^c for next round '&add ($e,$a5)', # e+=a<<<5 '&xor ($a,$c) if ($j<79);'. # f=b^c^d for next round 'unshift(@ROTX,pop(@ROTX)); $j++;' ) } sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path # on entry $f=((b^c)&(c^d)), $b>>>=2 $rx++; ( '($a,$f,$b,$c,$d,$e)=@ROTX;'. '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K '&lea ($frame,"256($frame)") if ($j%32==31);', '&xor ($f,$c) if ($j>39)', # (b^c)&(c^d)^c '&mov ($t0,$b) if ($j<59)', # count on zero latency '&xor ($t0,$c) if ($j<59)', # c^d for next round '&lea ($e,"($e,$f)")', # e+=(b^c)&(c^d)^c '&rorx ($a5,$a,27)', # a<<<5 '&rorx ($f,$a,2)', # b>>>2 in next round '&xor ($a,$b)', # b^c for next round '&add ($e,$a5)', # e+=a<<<5 '&and ($a,$t0) if ($j< 59);'. # f=(b^c)&(c^d) for next round '&xor ($a,$c) if ($j==59);'. 
# f=b^c^d for next round 'unshift(@ROTX,pop(@ROTX)); $j++;' ) } sub Xupdate_avx2_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions my ($a,$b,$c,$d,$e); &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]" &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrld (@Tx[0],@X[0],31); &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword &vpaddd (@X[0],@X[0],@X[0]); eval(shift(@insns)); eval(shift(@insns)); &vpsrld (@Tx[1],@Tx[2],30); &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1 eval(shift(@insns)); eval(shift(@insns)); &vpslld (@Tx[2],@Tx[2],2); &vpxor (@X[0],@X[0],@Tx[1]); eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@Tx[1],@X[0],$Kx); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU foreach (@insns) { eval; } # remaining instructions [if any] $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xupdate_avx2_32_79() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 to 50 instructions my ($a,$b,$c,$d,$e); &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]" &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]" eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]" &vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]" eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrld (@Tx[0],@X[0],30); &vpslld (@X[0],@X[0],2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #&vpslld (@X[0],@X[0],2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2 eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@Tx[1],@X[0],$Kx); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU foreach (@insns) { eval; } # remaining instructions $Xi++; push(@X,shift(@X)); # "rotate" X[] } sub Xloop_avx2() { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body,&$body); # 32 instructions my ($a,$b,$c,$d,$e); foreach (@insns) { eval; } } &align32(); &Xupdate_avx2_32_79(\&bodyx_00_19); &Xupdate_avx2_32_79(\&bodyx_00_19); &Xupdate_avx2_32_79(\&bodyx_00_19); &Xupdate_avx2_32_79(\&bodyx_00_19); &Xupdate_avx2_32_79(\&bodyx_20_39); &Xupdate_avx2_32_79(\&bodyx_20_39); &Xupdate_avx2_32_79(\&bodyx_20_39); &Xupdate_avx2_32_79(\&bodyx_20_39); &align32(); &Xupdate_avx2_32_79(\&bodyx_40_59); &Xupdate_avx2_32_79(\&bodyx_40_59); &Xupdate_avx2_32_79(\&bodyx_40_59); &Xupdate_avx2_32_79(\&bodyx_40_59); &Xloop_avx2(\&bodyx_20_39); &Xloop_avx2(\&bodyx_20_39); 
&Xloop_avx2(\&bodyx_20_39); &Xloop_avx2(\&bodyx_20_39); $code.=<<___; lea 128($inp),$frame lea 128($inp),%rdi # borrow $t0 cmp $num,$frame cmovae $inp,$frame # next or previous block # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c add 0($ctx),@ROTX[0] # update context add 4($ctx),@ROTX[1] add 8($ctx),@ROTX[3] mov @ROTX[0],0($ctx) add 12($ctx),@ROTX[4] mov @ROTX[1],4($ctx) mov @ROTX[0],$A # A=d add 16($ctx),@ROTX[5] mov @ROTX[3],$a5 mov @ROTX[3],8($ctx) mov @ROTX[4],$D # D=b #xchg @ROTX[5],$F # F=c, C=f mov @ROTX[4],12($ctx) mov @ROTX[1],$F # F=e mov @ROTX[5],16($ctx) #mov $F,16($ctx) mov @ROTX[5],$E # E=c mov $a5,$C # C=f #xchg $F,$E # E=c, F=e cmp $num,$inp je .Ldone_avx2 ___ $Xi=4; # reset variables @X=map("%ymm$_",(4..7,0..3)); $code.=<<___; vmovdqu 64($K_XX_XX),@X[2] # pbswap mask cmp $num,%rdi # borrowed $t0 ja .Last_avx2 vmovdqu -64(%rdi),%xmm0 # low part of @X[-4&7] vmovdqu -48(%rdi),%xmm1 vmovdqu -32(%rdi),%xmm2 vmovdqu -16(%rdi),%xmm3 vinserti128 \$1,0($frame),@X[-4&7],@X[-4&7] vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7] vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7] vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7] jmp .Last_avx2 .align 32 .Last_avx2: lea 128+16(%rsp),$frame rorx \$2,$F,$B andn $D,$F,$t0 and $C,$F xor $t0,$F sub \$-128,$inp ___ $rx=$j=0; @ROTX=($A,$F,$B,$C,$D,$E); &Xloop_avx2 (\&bodyx_00_19); &Xloop_avx2 (\&bodyx_00_19); &Xloop_avx2 (\&bodyx_00_19); &Xloop_avx2 (\&bodyx_00_19); &Xloop_avx2 (\&bodyx_20_39); &vmovdqu ($Kx,"-64($K_XX_XX)"); # K_00_19 &vpshufb (@X[-4&7],@X[-4&7],@X[2]); # byte swap &Xloop_avx2 (\&bodyx_20_39); &vpshufb (@X[-3&7],@X[-3&7],@X[2]); &vpaddd (@Tx[0],@X[-4&7],$Kx); # add K_00_19 &Xloop_avx2 (\&bodyx_20_39); &vmovdqu ("0(%rsp)",@Tx[0]); &vpshufb (@X[-2&7],@X[-2&7],@X[2]); &vpaddd (@Tx[1],@X[-3&7],$Kx); &Xloop_avx2 (\&bodyx_20_39); &vmovdqu ("32(%rsp)",@Tx[1]); &vpshufb (@X[-1&7],@X[-1&7],@X[2]); &vpaddd (@X[2],@X[-2&7],$Kx); &Xloop_avx2 (\&bodyx_40_59); &align32 (); &vmovdqu ("64(%rsp)",@X[2]); &vpaddd (@X[3],@X[-1&7],$Kx); &Xloop_avx2 (\&bodyx_40_59); &vmovdqu ("96(%rsp)",@X[3]); &Xloop_avx2 (\&bodyx_40_59); &Xupdate_avx2_16_31(\&bodyx_40_59); &Xupdate_avx2_16_31(\&bodyx_20_39); &Xupdate_avx2_16_31(\&bodyx_20_39); &Xupdate_avx2_16_31(\&bodyx_20_39); &Xloop_avx2 (\&bodyx_20_39); $code.=<<___; lea 128(%rsp),$frame # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c add 0($ctx),@ROTX[0] # update context add 4($ctx),@ROTX[1] add 8($ctx),@ROTX[3] mov @ROTX[0],0($ctx) add 12($ctx),@ROTX[4] mov @ROTX[1],4($ctx) mov @ROTX[0],$A # A=d add 16($ctx),@ROTX[5] mov @ROTX[3],$a5 mov @ROTX[3],8($ctx) mov @ROTX[4],$D # D=b #xchg @ROTX[5],$F # F=c, C=f mov @ROTX[4],12($ctx) mov @ROTX[1],$F # F=e mov @ROTX[5],16($ctx) #mov $F,16($ctx) mov @ROTX[5],$E # E=c mov $a5,$C # C=f #xchg $F,$E # E=c, F=e cmp $num,$inp jbe .Loop_avx2 .Ldone_avx2: vzeroupper ___ $code.=<<___ if ($win64); movaps -40-6*16($fp),%xmm6 movaps -40-5*16($fp),%xmm7 movaps -40-4*16($fp),%xmm8 movaps -40-3*16($fp),%xmm9 movaps -40-2*16($fp),%xmm10 movaps -40-1*16($fp),%xmm11 ___ $code.=<<___; mov -40($fp),%r14 .cfi_restore %r14 mov -32($fp),%r13 .cfi_restore %r13 mov -24($fp),%r12 .cfi_restore %r12 mov -16($fp),%rbp .cfi_restore %rbp mov -8($fp),%rbx .cfi_restore %rbx lea ($fp),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx2: ret .cfi_endproc .size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 ___ } } $code.=<<___; .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19 .long 
0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 ___ }}} $code.=<<___; .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>" .align 64 ___ # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip lea .Lprologue(%rip),%r10 cmp %r10,%rbx # context->Rip<.Lprologue jb .Lcommon_seh_tail mov 152($context),%rax # pull context->Rsp lea .Lepilogue(%rip),%r10 cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lcommon_seh_tail mov `16*4`(%rax),%rax # pull saved stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 jmp .Lcommon_seh_tail .size se_handler,.-se_handler ___ $code.=<<___ if ($shaext); .type shaext_handler,\@abi-omnipotent .align 16 shaext_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip lea .Lprologue_shaext(%rip),%r10 cmp %r10,%rbx # context->Rip<.Lprologue jb .Lcommon_seh_tail lea .Lepilogue_shaext(%rip),%r10 cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lcommon_seh_tail lea -8-4*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$8,%ecx .long 0xa548f3fc # cld; rep movsq jmp .Lcommon_seh_tail .size shaext_handler,.-shaext_handler ___ $code.=<<___; .type ssse3_handler,\@abi-omnipotent .align 16 ssse3_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->Rip<prologue label jb .Lcommon_seh_tail mov 208($context),%rax # pull context->R11 mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lcommon_seh_tail lea -40-6*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$12,%ecx .long 0xa548f3fc # cld; rep movsq mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 .Lcommon_seh_tail: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) #
restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size ssse3_handler,.-ssse3_handler .section .pdata .align 4 .rva .LSEH_begin_sha1_block_data_order .rva .LSEH_end_sha1_block_data_order .rva .LSEH_info_sha1_block_data_order ___ $code.=<<___ if ($shaext); .rva .LSEH_begin_sha1_block_data_order_shaext .rva .LSEH_end_sha1_block_data_order_shaext .rva .LSEH_info_sha1_block_data_order_shaext ___ $code.=<<___; .rva .LSEH_begin_sha1_block_data_order_ssse3 .rva .LSEH_end_sha1_block_data_order_ssse3 .rva .LSEH_info_sha1_block_data_order_ssse3 ___ $code.=<<___ if ($avx); .rva .LSEH_begin_sha1_block_data_order_avx .rva .LSEH_end_sha1_block_data_order_avx .rva .LSEH_info_sha1_block_data_order_avx ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_sha1_block_data_order_avx2 .rva .LSEH_end_sha1_block_data_order_avx2 .rva .LSEH_info_sha1_block_data_order_avx2 ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_sha1_block_data_order: .byte 9,0,0,0 .rva se_handler ___ $code.=<<___ if ($shaext); .LSEH_info_sha1_block_data_order_shaext: .byte 9,0,0,0 .rva shaext_handler ___ $code.=<<___; .LSEH_info_sha1_block_data_order_ssse3: .byte 9,0,0,0 .rva ssse3_handler .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] ___ $code.=<<___ if ($avx); .LSEH_info_sha1_block_data_order_avx: .byte 9,0,0,0 .rva ssse3_handler .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ $code.=<<___ if ($avx>1); .LSEH_info_sha1_block_data_order_avx2: .byte 9,0,0,0 .rva ssse3_handler .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] ___ } #################################################################### sub sha1rnds4 { if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) { my @opcode=(0x0f,0x3a,0xcc); push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M my $c=$1; push @opcode,$c=~/^0/?oct($c):$c; return ".byte\t".join(',',@opcode); } else { return "sha1rnds4\t".@_[0]; } } sub sha1op38 { my $instr = shift; my %opcodelet = ( "sha1nexte" => 0xc8, "sha1msg1" => 0xc9, "sha1msg2" => 0xca ); if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x0f,0x38); my $rex=0; $rex|=0x04 if ($2>=8); $rex|=0x01 if ($1>=8); unshift @opcode,0x40|$rex if ($rex); push @opcode,$opcodelet{$instr}; push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } else { return $instr."\t".@_[0]; } } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/sha/asm/sha256-586.pl =================================================================== --- stable/12/crypto/openssl/crypto/sha/asm/sha256-586.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/sha/asm/sha256-586.pl (revision 
364963) @@ -1,1296 +1,1296 @@ #! /usr/bin/env perl # Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # # SHA256 block transform for x86. September 2007. # # Performance improvement over compiler generated code varies from # 10% to 40% [see below]. Not very impressive on some µ-archs, but # it's 5 times smaller and optimizes amount of writes. # # May 2012. # # Optimization including two of Pavel Semjanov's ideas, alternative # Maj and full unroll, resulted in ~20-25% improvement on most CPUs, # ~7% on Pentium, ~40% on Atom. As fully unrolled loop body is almost # 15x larger, 8KB vs. 560B, it's fired only for longer inputs. But not # on P4, where it kills performance, nor Sandy Bridge, where folded # loop is approximately as fast... # # June 2012. # # Add AMD XOP-specific code path, >30% improvement on Bulldozer over # May version, >60% over original. Add AVX+shrd code path, >25% # improvement on Sandy Bridge over May version, 60% over original. # # May 2013. # # Replace AMD XOP code path with SSSE3 to cover more processors. # (Biggest improvement coefficient is on upcoming Atom Silvermont, # not shown.) Add AVX+BMI code path. # # March 2014. # # Add support for Intel SHA Extensions. 
# # Performance in clock cycles per processed byte (less is better): # # gcc icc x86 asm(*) SIMD x86_64 asm(**) # Pentium 46 57 40/38 - - # PIII 36 33 27/24 - - # P4 41 38 28 - 17.3 # AMD K8 27 25 19/15.5 - 14.9 # Core2 26 23 18/15.6 14.3 13.8 # Westmere 27 - 19/15.7 13.4 12.3 # Sandy Bridge 25 - 15.9 12.4 11.6 # Ivy Bridge 24 - 15.0 11.4 10.3 # Haswell 22 - 13.9 9.46 7.80 # Skylake 20 - 14.9 9.50 7.70 # Bulldozer 36 - 27/22 17.0 13.6 # VIA Nano 36 - 25/22 16.8 16.5 # Atom 50 - 30/25 21.9 18.9 # Silvermont 40 - 34/31 22.9 20.6 # Goldmont 29 - 20 16.3(***) # # (*) numbers after slash are for unrolled loop, where applicable; # (**) x86_64 assembly performance is presented for reference # purposes, results are best-available; # (***) SHAEXT result is 4.1, strangely enough better than 64-bit one; $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; $output=pop; open STDOUT,">$output"; &asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $xmm=$avx=0; for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } if ($xmm && `$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if ($xmm && !$avx && $ARGV[0] eq "win32n" && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.03) + ($1>=2.10); } if ($xmm && !$avx && $ARGV[0] eq "win32" && `ml 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) { +if ($xmm && !$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } $shaext=$xmm; ### set to zero if compiling for 1.0.1 $unroll_after = 64*4; # If pre-evicted from L1P cache first spin of # fully unrolled loop was measured to run about # 3-4x slower. If slowdown coefficient is N and # unrolled loop is m times faster, then you break # even at (N-1)/(m-1) blocks. Then it needs to be # adjusted for probability of code being evicted, # code size/cache size=1/4. Typical m is 1.15... 
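# The regex change above (dropping the '^' anchor in front of "clang") lets the
# AVX probe accept compilers whose "-v" banner carries a vendor prefix ahead of
# "clang version"; with the anchor, only banners literally starting with
# "clang" matched, so $avx stayed 0 for such compilers.  A minimal sketch of
# the probe as a standalone helper (illustrative only -- the sub name and the
# sample banner below are ours, not part of the build):
sub probe_clang_avx_reference {
	my ($banner)=@_;		# e.g. the output of `$ENV{CC} -v 2>&1`
	return 0 unless ($banner =~ /((?:clang|LLVM) version|based on LLVM) ([0-9]+\.[0-9]+)/);
	my $level = ($2>=3.0) + ($2>3.0);	# same 0/1/2 AVX level as the code above
	return $level;
}
# e.g. probe_clang_avx_reference("FreeBSD clang version 10.0.1") returns 2,
# while the previously anchored pattern would not match such a banner at all.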
$A="eax"; $E="edx"; $T="ebx"; $Aoff=&DWP(4,"esp"); $Boff=&DWP(8,"esp"); $Coff=&DWP(12,"esp"); $Doff=&DWP(16,"esp"); $Eoff=&DWP(20,"esp"); $Foff=&DWP(24,"esp"); $Goff=&DWP(28,"esp"); $Hoff=&DWP(32,"esp"); $Xoff=&DWP(36,"esp"); $K256="ebp"; sub BODY_16_63() { &mov ($T,"ecx"); # "ecx" is preloaded &mov ("esi",&DWP(4*(9+15+16-14),"esp")); &ror ("ecx",18-7); &mov ("edi","esi"); &ror ("esi",19-17); &xor ("ecx",$T); &shr ($T,3); &ror ("ecx",7); &xor ("esi","edi"); &xor ($T,"ecx"); # T = sigma0(X[-15]) &ror ("esi",17); &add ($T,&DWP(4*(9+15+16),"esp")); # T += X[-16] &shr ("edi",10); &add ($T,&DWP(4*(9+15+16-9),"esp")); # T += X[-7] #&xor ("edi","esi") # sigma1(X[-2]) # &add ($T,"edi"); # T += sigma1(X[-2]) # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] &BODY_00_15(1); } sub BODY_00_15() { my $in_16_63=shift; &mov ("ecx",$E); &xor ("edi","esi") if ($in_16_63); # sigma1(X[-2]) &mov ("esi",$Foff); &ror ("ecx",25-11); &add ($T,"edi") if ($in_16_63); # T += sigma1(X[-2]) &mov ("edi",$Goff); &xor ("ecx",$E); &xor ("esi","edi"); &mov ($T,&DWP(4*(9+15),"esp")) if (!$in_16_63); &mov (&DWP(4*(9+15),"esp"),$T) if ($in_16_63); # save X[0] &ror ("ecx",11-6); &and ("esi",$E); &mov ($Eoff,$E); # modulo-scheduled &xor ($E,"ecx"); &add ($T,$Hoff); # T += h &xor ("esi","edi"); # Ch(e,f,g) &ror ($E,6); # Sigma1(e) &mov ("ecx",$A); &add ($T,"esi"); # T += Ch(e,f,g) &ror ("ecx",22-13); &add ($T,$E); # T += Sigma1(e) &mov ("edi",$Boff); &xor ("ecx",$A); &mov ($Aoff,$A); # modulo-scheduled &lea ("esp",&DWP(-4,"esp")); &ror ("ecx",13-2); &mov ("esi",&DWP(0,$K256)); &xor ("ecx",$A); &mov ($E,$Eoff); # e in next iteration, d in this one &xor ($A,"edi"); # a ^= b &ror ("ecx",2); # Sigma0(a) &add ($T,"esi"); # T+= K[i] &mov (&DWP(0,"esp"),$A); # (b^c) in next round &add ($E,$T); # d += T &and ($A,&DWP(4,"esp")); # a &= (b^c) &add ($T,"ecx"); # T += Sigma0(a) &xor ($A,"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) &mov ("ecx",&DWP(4*(9+15+16-1),"esp")) if ($in_16_63); # preload T &add ($K256,4); &add ($A,$T); # h += T } &external_label("OPENSSL_ia32cap_P") if (!$i386); &function_begin("sha256_block_data_order"); &mov ("esi",wparam(0)); # ctx &mov ("edi",wparam(1)); # inp &mov ("eax",wparam(2)); # num &mov ("ebx","esp"); # saved sp &call (&label("pic_point")); # make it PIC! 
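# The &call just above, together with the &blindpop and &lea that follow,
# implements the usual ia32 position-independent-code idiom: the call pushes
# the address of the "pic_point" label, blindpop() pops that address into
# $K256 (%ebp), and the lea then adds the assemble-time distance
# K256-pic_point, leaving %ebp pointing at the K256 table with no absolute
# relocation needed.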
&set_label("pic_point"); &blindpop($K256); &lea ($K256,&DWP(&label("K256")."-".&label("pic_point"),$K256)); &sub ("esp",16); &and ("esp",-64); &shl ("eax",6); &add ("eax","edi"); &mov (&DWP(0,"esp"),"esi"); # ctx &mov (&DWP(4,"esp"),"edi"); # inp &mov (&DWP(8,"esp"),"eax"); # inp+num*128 &mov (&DWP(12,"esp"),"ebx"); # saved sp if (!$i386 && $xmm) { &picmeup("edx","OPENSSL_ia32cap_P",$K256,&label("K256")); &mov ("ecx",&DWP(0,"edx")); &mov ("ebx",&DWP(4,"edx")); &test ("ecx",1<<20); # check for P4 &jnz (&label("loop")); &mov ("edx",&DWP(8,"edx")) if ($xmm); &test ("ecx",1<<24); # check for FXSR &jz ($unroll_after?&label("no_xmm"):&label("loop")); &and ("ecx",1<<30); # mask "Intel CPU" bit &and ("ebx",1<<28|1<<9); # mask AVX and SSSE3 bits &test ("edx",1<<29) if ($shaext); # check for SHA &jnz (&label("shaext")) if ($shaext); &or ("ecx","ebx"); &and ("ecx",1<<28|1<<30); &cmp ("ecx",1<<28|1<<30); if ($xmm) { &je (&label("AVX")) if ($avx); &test ("ebx",1<<9); # check for SSSE3 &jnz (&label("SSSE3")); } else { &je (&label("loop_shrd")); } if ($unroll_after) { &set_label("no_xmm"); &sub ("eax","edi"); &cmp ("eax",$unroll_after); &jae (&label("unrolled")); } } &jmp (&label("loop")); sub COMPACT_LOOP() { my $suffix=shift; &set_label("loop$suffix",$suffix?32:16); # copy input block to stack reversing byte and dword order for($i=0;$i<4;$i++) { &mov ("eax",&DWP($i*16+0,"edi")); &mov ("ebx",&DWP($i*16+4,"edi")); &mov ("ecx",&DWP($i*16+8,"edi")); &bswap ("eax"); &mov ("edx",&DWP($i*16+12,"edi")); &bswap ("ebx"); &push ("eax"); &bswap ("ecx"); &push ("ebx"); &bswap ("edx"); &push ("ecx"); &push ("edx"); } &add ("edi",64); &lea ("esp",&DWP(-4*9,"esp"));# place for A,B,C,D,E,F,G,H &mov (&DWP(4*(9+16)+4,"esp"),"edi"); # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack &mov ($A,&DWP(0,"esi")); &mov ("ebx",&DWP(4,"esi")); &mov ("ecx",&DWP(8,"esi")); &mov ("edi",&DWP(12,"esi")); # &mov ($Aoff,$A); &mov ($Boff,"ebx"); &xor ("ebx","ecx"); &mov ($Coff,"ecx"); &mov ($Doff,"edi"); &mov (&DWP(0,"esp"),"ebx"); # magic &mov ($E,&DWP(16,"esi")); &mov ("ebx",&DWP(20,"esi")); &mov ("ecx",&DWP(24,"esi")); &mov ("edi",&DWP(28,"esi")); # &mov ($Eoff,$E); &mov ($Foff,"ebx"); &mov ($Goff,"ecx"); &mov ($Hoff,"edi"); &set_label("00_15$suffix",16); &BODY_00_15(); &cmp ("esi",0xc19bf174); &jne (&label("00_15$suffix")); &mov ("ecx",&DWP(4*(9+15+16-1),"esp")); # preloaded in BODY_00_15(1) &jmp (&label("16_63$suffix")); &set_label("16_63$suffix",16); &BODY_16_63(); &cmp ("esi",0xc67178f2); &jne (&label("16_63$suffix")); &mov ("esi",&DWP(4*(9+16+64)+0,"esp"));#ctx # &mov ($A,$Aoff); &mov ("ebx",$Boff); # &mov ("edi",$Coff); &mov ("ecx",$Doff); &add ($A,&DWP(0,"esi")); &add ("ebx",&DWP(4,"esi")); &add ("edi",&DWP(8,"esi")); &add ("ecx",&DWP(12,"esi")); &mov (&DWP(0,"esi"),$A); &mov (&DWP(4,"esi"),"ebx"); &mov (&DWP(8,"esi"),"edi"); &mov (&DWP(12,"esi"),"ecx"); # &mov ($E,$Eoff); &mov ("eax",$Foff); &mov ("ebx",$Goff); &mov ("ecx",$Hoff); &mov ("edi",&DWP(4*(9+16+64)+4,"esp"));#inp &add ($E,&DWP(16,"esi")); &add ("eax",&DWP(20,"esi")); &add ("ebx",&DWP(24,"esi")); &add ("ecx",&DWP(28,"esi")); &mov (&DWP(16,"esi"),$E); &mov (&DWP(20,"esi"),"eax"); &mov (&DWP(24,"esi"),"ebx"); &mov (&DWP(28,"esi"),"ecx"); &lea ("esp",&DWP(4*(9+16+64),"esp"));# destroy frame &sub ($K256,4*64); # rewind K &cmp ("edi",&DWP(8,"esp")); # are we done yet? 
&jb (&label("loop$suffix")); } &COMPACT_LOOP(); &mov ("esp",&DWP(12,"esp")); # restore sp &function_end_A(); if (!$i386 && !$xmm) { # ~20% improvement on Sandy Bridge local *ror = sub { &shrd(@_[0],@_) }; &COMPACT_LOOP("_shrd"); &mov ("esp",&DWP(12,"esp")); # restore sp &function_end_A(); } &set_label("K256",64); # Yes! I keep it in the code segment! @K256=( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); &data_word(@K256); &data_word(0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f); # byte swap mask &asciz("SHA256 block transform for x86, CRYPTOGAMS by "); ($a,$b,$c,$d,$e,$f,$g,$h)=(0..7); # offsets sub off { &DWP(4*(((shift)-$i)&7),"esp"); } if (!$i386 && $unroll_after) { my @AH=($A,$K256); &set_label("unrolled",16); &lea ("esp",&DWP(-96,"esp")); # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack &mov ($AH[0],&DWP(0,"esi")); &mov ($AH[1],&DWP(4,"esi")); &mov ("ecx",&DWP(8,"esi")); &mov ("ebx",&DWP(12,"esi")); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"ecx"); # magic &mov (&DWP(8,"esp"),"ecx"); &mov (&DWP(12,"esp"),"ebx"); &mov ($E,&DWP(16,"esi")); &mov ("ebx",&DWP(20,"esi")); &mov ("ecx",&DWP(24,"esi")); &mov ("esi",&DWP(28,"esi")); #&mov (&DWP(16,"esp"),$E); &mov (&DWP(20,"esp"),"ebx"); &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esp"),"esi"); &jmp (&label("grand_loop")); &set_label("grand_loop",16); # copy input block to stack reversing byte order for($i=0;$i<5;$i++) { &mov ("ebx",&DWP(12*$i+0,"edi")); &mov ("ecx",&DWP(12*$i+4,"edi")); &bswap ("ebx"); &mov ("esi",&DWP(12*$i+8,"edi")); &bswap ("ecx"); &mov (&DWP(32+12*$i+0,"esp"),"ebx"); &bswap ("esi"); &mov (&DWP(32+12*$i+4,"esp"),"ecx"); &mov (&DWP(32+12*$i+8,"esp"),"esi"); } &mov ("ebx",&DWP($i*12,"edi")); &add ("edi",64); &bswap ("ebx"); &mov (&DWP(96+4,"esp"),"edi"); &mov (&DWP(32+12*$i,"esp"),"ebx"); my ($t1,$t2) = ("ecx","esi"); for ($i=0;$i<64;$i++) { if ($i>=16) { &mov ($T,$t1); # $t1 is preloaded # &mov ($t2,&DWP(32+4*(($i+14)&15),"esp")); &ror ($t1,18-7); &mov ("edi",$t2); &ror ($t2,19-17); &xor ($t1,$T); &shr ($T,3); &ror ($t1,7); &xor ($t2,"edi"); &xor ($T,$t1); # T = sigma0(X[-15]) &ror ($t2,17); &add ($T,&DWP(32+4*($i&15),"esp")); # T += X[-16] &shr ("edi",10); &add ($T,&DWP(32+4*(($i+9)&15),"esp")); # T += X[-7] #&xor ("edi",$t2) # sigma1(X[-2]) # &add ($T,"edi"); # T += sigma1(X[-2]) # &mov (&DWP(4*(9+15),"esp"),$T); # save X[0] } &mov ($t1,$E); &xor ("edi",$t2) if ($i>=16); # sigma1(X[-2]) &mov ($t2,&off($f)); &ror ($E,25-11); &add ($T,"edi") if ($i>=16); # T += sigma1(X[-2]) &mov ("edi",&off($g)); &xor ($E,$t1); &mov ($T,&DWP(32+4*($i&15),"esp")) if ($i<16); # X[i] &mov (&DWP(32+4*($i&15),"esp"),$T) if ($i>=16 && $i<62); # save X[0] &xor ($t2,"edi"); &ror ($E,11-6); &and ($t2,$t1); &mov (&off($e),$t1); # save $E, modulo-scheduled &xor ($E,$t1); &add ($T,&off($h)); # T += h &xor ("edi",$t2); # Ch(e,f,g) &ror ($E,6); # Sigma1(e) &mov 
($t1,$AH[0]); &add ($T,"edi"); # T += Ch(e,f,g) &ror ($t1,22-13); &mov ($t2,$AH[0]); &mov ("edi",&off($b)); &xor ($t1,$AH[0]); &mov (&off($a),$AH[0]); # save $A, modulo-scheduled &xor ($AH[0],"edi"); # a ^= b, (b^c) in next round &ror ($t1,13-2); &and ($AH[1],$AH[0]); # (b^c) &= (a^b) &lea ($E,&DWP(@K256[$i],$T,$E)); # T += Sigma1(1)+K[i] &xor ($t1,$t2); &xor ($AH[1],"edi"); # h = Maj(a,b,c) = Ch(a^b,c,b) &mov ($t2,&DWP(32+4*(($i+2)&15),"esp")) if ($i>=15 && $i<63); &ror ($t1,2); # Sigma0(a) &add ($AH[1],$E); # h += T &add ($E,&off($d)); # d += T &add ($AH[1],$t1); # h += Sigma0(a) &mov ($t1,&DWP(32+4*(($i+15)&15),"esp")) if ($i>=15 && $i<63); @AH = reverse(@AH); # rotate(a,h) ($t1,$t2) = ($t2,$t1); # rotate(t1,t2) } &mov ("esi",&DWP(96,"esp")); #ctx #&mov ($AH[0],&DWP(0,"esp")); &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); #&mov ("edi", &DWP(8,"esp")); &mov ("ecx",&DWP(12,"esp")); &add ($AH[0],&DWP(0,"esi")); &add ($AH[1],&DWP(4,"esi")); &add ("edi",&DWP(8,"esi")); &add ("ecx",&DWP(12,"esi")); &mov (&DWP(0,"esi"),$AH[0]); &mov (&DWP(4,"esi"),$AH[1]); &mov (&DWP(8,"esi"),"edi"); &mov (&DWP(12,"esi"),"ecx"); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"edi"); # magic &mov (&DWP(8,"esp"),"edi"); &mov (&DWP(12,"esp"),"ecx"); #&mov ($E,&DWP(16,"esp")); &mov ("edi",&DWP(20,"esp")); &mov ("ebx",&DWP(24,"esp")); &mov ("ecx",&DWP(28,"esp")); &add ($E,&DWP(16,"esi")); &add ("edi",&DWP(20,"esi")); &add ("ebx",&DWP(24,"esi")); &add ("ecx",&DWP(28,"esi")); &mov (&DWP(16,"esi"),$E); &mov (&DWP(20,"esi"),"edi"); &mov (&DWP(24,"esi"),"ebx"); &mov (&DWP(28,"esi"),"ecx"); #&mov (&DWP(16,"esp"),$E); &mov (&DWP(20,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &mov (&DWP(24,"esp"),"ebx"); &mov (&DWP(28,"esp"),"ecx"); &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? &jb (&label("grand_loop")); &mov ("esp",&DWP(96+12,"esp")); # restore sp &function_end_A(); } if (!$i386 && $xmm) {{{ if ($shaext) { ###################################################################### # Intel SHA Extensions implementation of SHA256 update function. 
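#
# [Editorial reference - not part of the generated module.]  The
# interleaved rounds implemented throughout this file are easier to
# audit against a plain scalar model of one SHA-256 round, so a
# standalone sketch is given below.  The helper names (ref_rotr32,
# ref_sha256_round) and the argument order are editorial choices; the
# arithmetic follows FIPS 180-4, including the "alternative Maj"
# identity Maj(a,b,c)=Ch(a^b,c,b) that the comments above call "magic".
sub ref_rotr32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff; }
sub ref_sha256_round {
	my ($Kt,$Wt,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
	my $Sigma1 = ref_rotr32($e,6)^ref_rotr32($e,11)^ref_rotr32($e,25);
	my $Ch     = ($e&$f)^((~$e)&$g);
	my $Sigma0 = ref_rotr32($a,2)^ref_rotr32($a,13)^ref_rotr32($a,22);
	my $Maj    = (($a^$b)&$c)^((~($a^$b))&$b);	# == (a&b)^(a&c)^(b&c)
	my $T1 = ($h+$Sigma1+$Ch+$Kt+$Wt) & 0xffffffff;
	my $T2 = ($Sigma0+$Maj)           & 0xffffffff;
	# return the rotated state (a,b,c,d,e,f,g,h) after one round
	return (($T1+$T2)&0xffffffff,$a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);
}
#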
# my ($ctx,$inp,$end)=("esi","edi","eax"); my ($Wi,$ABEF,$CDGH,$TMP)=map("xmm$_",(0..2,7)); my @MSG=map("xmm$_",(3..6)); sub sha256op38 { my ($opcodelet,$dst,$src)=@_; if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/) { &data_byte(0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2); } } sub sha256rnds2 { sha256op38(0xcb,@_); } sub sha256msg1 { sha256op38(0xcc,@_); } sub sha256msg2 { sha256op38(0xcd,@_); } &set_label("shaext",32); &sub ("esp",32); &movdqu ($ABEF,&QWP(0,$ctx)); # DCBA &lea ($K256,&DWP(0x80,$K256)); &movdqu ($CDGH,&QWP(16,$ctx)); # HGFE &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask &pshufd ($Wi,$ABEF,0x1b); # ABCD &pshufd ($ABEF,$ABEF,0xb1); # CDAB &pshufd ($CDGH,$CDGH,0x1b); # EFGH &palignr ($ABEF,$CDGH,8); # ABEF &punpcklqdq ($CDGH,$Wi); # CDGH &jmp (&label("loop_shaext")); &set_label("loop_shaext",16); &movdqu (@MSG[0],&QWP(0,$inp)); &movdqu (@MSG[1],&QWP(0x10,$inp)); &movdqu (@MSG[2],&QWP(0x20,$inp)); &pshufb (@MSG[0],$TMP); &movdqu (@MSG[3],&QWP(0x30,$inp)); &movdqa (&QWP(16,"esp"),$CDGH); # offload &movdqa ($Wi,&QWP(0*16-0x80,$K256)); &paddd ($Wi,@MSG[0]); &pshufb (@MSG[1],$TMP); &sha256rnds2 ($CDGH,$ABEF); # 0-3 &pshufd ($Wi,$Wi,0x0e); &nop (); &movdqa (&QWP(0,"esp"),$ABEF); # offload &sha256rnds2 ($ABEF,$CDGH); &movdqa ($Wi,&QWP(1*16-0x80,$K256)); &paddd ($Wi,@MSG[1]); &pshufb (@MSG[2],$TMP); &sha256rnds2 ($CDGH,$ABEF); # 4-7 &pshufd ($Wi,$Wi,0x0e); &lea ($inp,&DWP(0x40,$inp)); &sha256msg1 (@MSG[0],@MSG[1]); &sha256rnds2 ($ABEF,$CDGH); &movdqa ($Wi,&QWP(2*16-0x80,$K256)); &paddd ($Wi,@MSG[2]); &pshufb (@MSG[3],$TMP); &sha256rnds2 ($CDGH,$ABEF); # 8-11 &pshufd ($Wi,$Wi,0x0e); &movdqa ($TMP,@MSG[3]); &palignr ($TMP,@MSG[2],4); &nop (); &paddd (@MSG[0],$TMP); &sha256msg1 (@MSG[1],@MSG[2]); &sha256rnds2 ($ABEF,$CDGH); &movdqa ($Wi,&QWP(3*16-0x80,$K256)); &paddd ($Wi,@MSG[3]); &sha256msg2 (@MSG[0],@MSG[3]); &sha256rnds2 ($CDGH,$ABEF); # 12-15 &pshufd ($Wi,$Wi,0x0e); &movdqa ($TMP,@MSG[0]); &palignr ($TMP,@MSG[3],4); &nop (); &paddd (@MSG[1],$TMP); &sha256msg1 (@MSG[2],@MSG[3]); &sha256rnds2 ($ABEF,$CDGH); for($i=4;$i<16-3;$i++) { &movdqa ($Wi,&QWP($i*16-0x80,$K256)); &paddd ($Wi,@MSG[0]); &sha256msg2 (@MSG[1],@MSG[0]); &sha256rnds2 ($CDGH,$ABEF); # 16-19... 
&pshufd ($Wi,$Wi,0x0e); &movdqa ($TMP,@MSG[1]); &palignr ($TMP,@MSG[0],4); &nop (); &paddd (@MSG[2],$TMP); &sha256msg1 (@MSG[3],@MSG[0]); &sha256rnds2 ($ABEF,$CDGH); push(@MSG,shift(@MSG)); } &movdqa ($Wi,&QWP(13*16-0x80,$K256)); &paddd ($Wi,@MSG[0]); &sha256msg2 (@MSG[1],@MSG[0]); &sha256rnds2 ($CDGH,$ABEF); # 52-55 &pshufd ($Wi,$Wi,0x0e); &movdqa ($TMP,@MSG[1]) &palignr ($TMP,@MSG[0],4); &sha256rnds2 ($ABEF,$CDGH); &paddd (@MSG[2],$TMP); &movdqa ($Wi,&QWP(14*16-0x80,$K256)); &paddd ($Wi,@MSG[1]); &sha256rnds2 ($CDGH,$ABEF); # 56-59 &pshufd ($Wi,$Wi,0x0e); &sha256msg2 (@MSG[2],@MSG[1]); &movdqa ($TMP,&QWP(0x100-0x80,$K256)); # byte swap mask &sha256rnds2 ($ABEF,$CDGH); &movdqa ($Wi,&QWP(15*16-0x80,$K256)); &paddd ($Wi,@MSG[2]); &nop (); &sha256rnds2 ($CDGH,$ABEF); # 60-63 &pshufd ($Wi,$Wi,0x0e); &cmp ($end,$inp); &nop (); &sha256rnds2 ($ABEF,$CDGH); &paddd ($CDGH,&QWP(16,"esp")); &paddd ($ABEF,&QWP(0,"esp")); &jnz (&label("loop_shaext")); &pshufd ($CDGH,$CDGH,0xb1); # DCHG &pshufd ($TMP,$ABEF,0x1b); # FEBA &pshufd ($ABEF,$ABEF,0xb1); # BAFE &punpckhqdq ($ABEF,$CDGH); # DCBA &palignr ($CDGH,$TMP,8); # HGFE &mov ("esp",&DWP(32+12,"esp")); &movdqu (&QWP(0,$ctx),$ABEF); &movdqu (&QWP(16,$ctx),$CDGH); &function_end_A(); } my @X = map("xmm$_",(0..3)); my ($t0,$t1,$t2,$t3) = map("xmm$_",(4..7)); my @AH = ($A,$T); &set_label("SSSE3",32); &lea ("esp",&DWP(-96,"esp")); # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack &mov ($AH[0],&DWP(0,"esi")); &mov ($AH[1],&DWP(4,"esi")); &mov ("ecx",&DWP(8,"esi")); &mov ("edi",&DWP(12,"esi")); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"ecx"); # magic &mov (&DWP(8,"esp"),"ecx"); &mov (&DWP(12,"esp"),"edi"); &mov ($E,&DWP(16,"esi")); &mov ("edi",&DWP(20,"esi")); &mov ("ecx",&DWP(24,"esi")); &mov ("esi",&DWP(28,"esi")); #&mov (&DWP(16,"esp"),$E); &mov (&DWP(20,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esp"),"esi"); &movdqa ($t3,&QWP(256,$K256)); &jmp (&label("grand_ssse3")); &set_label("grand_ssse3",16); # load input, reverse byte order, add K256[0..15], save to stack &movdqu (@X[0],&QWP(0,"edi")); &movdqu (@X[1],&QWP(16,"edi")); &movdqu (@X[2],&QWP(32,"edi")); &movdqu (@X[3],&QWP(48,"edi")); &add ("edi",64); &pshufb (@X[0],$t3); &mov (&DWP(96+4,"esp"),"edi"); &pshufb (@X[1],$t3); &movdqa ($t0,&QWP(0,$K256)); &pshufb (@X[2],$t3); &movdqa ($t1,&QWP(16,$K256)); &paddd ($t0,@X[0]); &pshufb (@X[3],$t3); &movdqa ($t2,&QWP(32,$K256)); &paddd ($t1,@X[1]); &movdqa ($t3,&QWP(48,$K256)); &movdqa (&QWP(32+0,"esp"),$t0); &paddd ($t2,@X[2]); &movdqa (&QWP(32+16,"esp"),$t1); &paddd ($t3,@X[3]); &movdqa (&QWP(32+32,"esp"),$t2); &movdqa (&QWP(32+48,"esp"),$t3); &jmp (&label("ssse3_00_47")); &set_label("ssse3_00_47",16); &add ($K256,64); sub SSSE3_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 120 instructions eval(shift(@insns)); &movdqa ($t0,@X[1]); eval(shift(@insns)); # @ eval(shift(@insns)); &movdqa ($t3,@X[3]); eval(shift(@insns)); eval(shift(@insns)); &palignr ($t0,@X[0],4); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); # @ eval(shift(@insns)); &palignr ($t3,@X[2],4); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t1,$t0); eval(shift(@insns)); # @ eval(shift(@insns)); &movdqa ($t2,$t0); eval(shift(@insns)); eval(shift(@insns)); &psrld ($t0,3); eval(shift(@insns)); eval(shift(@insns)); # @ &paddd (@X[0],$t3); # X[0..3] += X[9..12] eval(shift(@insns)); eval(shift(@insns)); &psrld ($t2,7); 
eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ eval(shift(@insns)); &pshufd ($t3,@X[3],0b11111010); # X[14..15] eval(shift(@insns)); eval(shift(@insns)); &pslld ($t1,32-18); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t0,$t2); eval(shift(@insns)); eval(shift(@insns)); &psrld ($t2,18-7); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t0,$t1); eval(shift(@insns)); eval(shift(@insns)); &pslld ($t1,18-7); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t0,$t2); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t2,$t3); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t0,$t1); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &psrld ($t3,10); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &psrlq ($t2,17); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); &psrlq ($t2,19-17); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); &pshufd ($t3,$t3,0b10000000); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ eval(shift(@insns)); &psrldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); # @ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ eval(shift(@insns)); &pshufd ($t3,@X[0],0b01010000); # X[16..17] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t2,$t3); eval(shift(@insns)); # @ &psrld ($t3,10); eval(shift(@insns)); &psrlq ($t2,17); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); &psrlq ($t2,19-17); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufd ($t3,$t3,0b00001000); eval(shift(@insns)); eval(shift(@insns)); # @ &movdqa ($t2,&QWP(16*$j,$K256)); eval(shift(@insns)); eval(shift(@insns)); &pslldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); # @ &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd ($t2,@X[0]); eval(shift(@insns)); # @ foreach (@insns) { eval; } # remaining instructions &movdqa (&QWP(32+16*$j,"esp"),$t2); } sub body_00_15 () { ( '&mov ("ecx",$E);', '&ror ($E,25-11);', '&mov ("esi",&off($f));', '&xor ($E,"ecx");', '&mov ("edi",&off($g));', '&xor ("esi","edi");', '&ror ($E,11-6);', '&and ("esi","ecx");', '&mov (&off($e),"ecx");', # save $E, modulo-scheduled '&xor ($E,"ecx");', '&xor ("edi","esi");', # Ch(e,f,g) '&ror ($E,6);', # T = Sigma1(e) '&mov ("ecx",$AH[0]);', '&add ($E,"edi");', # T += Ch(e,f,g) '&mov ("edi",&off($b));', '&mov ("esi",$AH[0]);', '&ror ("ecx",22-13);', '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled '&xor ("ecx",$AH[0]);', '&xor ($AH[0],"edi");', # a ^= b, (b^c) in next round '&add ($E,&off($h));', # T += h '&ror ("ecx",13-2);', '&and ($AH[1],$AH[0]);', # 
(b^c) &= (a^b) '&xor ("ecx","esi");', '&add ($E,&DWP(32+4*($i&15),"esp"));', # T += K[i]+X[i] '&xor ($AH[1],"edi");', # h = Maj(a,b,c) = Ch(a^b,c,b) '&ror ("ecx",2);', # Sigma0(a) '&add ($AH[1],$E);', # h += T '&add ($E,&off($d));', # d += T '&add ($AH[1],"ecx");'. # h += Sigma0(a) '@AH = reverse(@AH); $i++;' # rotate(a,h) ); } for ($i=0,$j=0; $j<4; $j++) { &SSSE3_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmp (&DWP(16*$j,$K256),0x00010203); &jne (&label("ssse3_00_47")); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } &mov ("esi",&DWP(96,"esp")); #ctx #&mov ($AH[0],&DWP(0,"esp")); &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); #&mov ("edi", &DWP(8,"esp")); &mov ("ecx",&DWP(12,"esp")); &add ($AH[0],&DWP(0,"esi")); &add ($AH[1],&DWP(4,"esi")); &add ("edi",&DWP(8,"esi")); &add ("ecx",&DWP(12,"esi")); &mov (&DWP(0,"esi"),$AH[0]); &mov (&DWP(4,"esi"),$AH[1]); &mov (&DWP(8,"esi"),"edi"); &mov (&DWP(12,"esi"),"ecx"); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"edi"); # magic &mov (&DWP(8,"esp"),"edi"); &mov (&DWP(12,"esp"),"ecx"); #&mov ($E,&DWP(16,"esp")); &mov ("edi",&DWP(20,"esp")); &mov ("ecx",&DWP(24,"esp")); &add ($E,&DWP(16,"esi")); &add ("edi",&DWP(20,"esi")); &add ("ecx",&DWP(24,"esi")); &mov (&DWP(16,"esi"),$E); &mov (&DWP(20,"esi"),"edi"); &mov (&DWP(20,"esp"),"edi"); &mov ("edi",&DWP(28,"esp")); &mov (&DWP(24,"esi"),"ecx"); #&mov (&DWP(16,"esp"),$E); &add ("edi",&DWP(28,"esi")); &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esi"),"edi"); &mov (&DWP(28,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &movdqa ($t3,&QWP(64,$K256)); &sub ($K256,3*64); # rewind K &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? &jb (&label("grand_ssse3")); &mov ("esp",&DWP(96+12,"esp")); # restore sp &function_end_A(); if ($avx) { &set_label("AVX",32); if ($avx>1) { &and ("edx",1<<8|1<<3); # check for BMI2+BMI1 &cmp ("edx",1<<8|1<<3); &je (&label("AVX_BMI")); } &lea ("esp",&DWP(-96,"esp")); &vzeroall (); # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack &mov ($AH[0],&DWP(0,"esi")); &mov ($AH[1],&DWP(4,"esi")); &mov ("ecx",&DWP(8,"esi")); &mov ("edi",&DWP(12,"esi")); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"ecx"); # magic &mov (&DWP(8,"esp"),"ecx"); &mov (&DWP(12,"esp"),"edi"); &mov ($E,&DWP(16,"esi")); &mov ("edi",&DWP(20,"esi")); &mov ("ecx",&DWP(24,"esi")); &mov ("esi",&DWP(28,"esi")); #&mov (&DWP(16,"esp"),$E); &mov (&DWP(20,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esp"),"esi"); &vmovdqa ($t3,&QWP(256,$K256)); &jmp (&label("grand_avx")); &set_label("grand_avx",32); # load input, reverse byte order, add K256[0..15], save to stack &vmovdqu (@X[0],&QWP(0,"edi")); &vmovdqu (@X[1],&QWP(16,"edi")); &vmovdqu (@X[2],&QWP(32,"edi")); &vmovdqu (@X[3],&QWP(48,"edi")); &add ("edi",64); &vpshufb (@X[0],@X[0],$t3); &mov (&DWP(96+4,"esp"),"edi"); &vpshufb (@X[1],@X[1],$t3); &vpshufb (@X[2],@X[2],$t3); &vpaddd ($t0,@X[0],&QWP(0,$K256)); &vpshufb (@X[3],@X[3],$t3); &vpaddd ($t1,@X[1],&QWP(16,$K256)); &vpaddd ($t2,@X[2],&QWP(32,$K256)); &vpaddd ($t3,@X[3],&QWP(48,$K256)); &vmovdqa (&QWP(32+0,"esp"),$t0); &vmovdqa (&QWP(32+16,"esp"),$t1); &vmovdqa (&QWP(32+32,"esp"),$t2); &vmovdqa (&QWP(32+48,"esp"),$t3); &jmp (&label("avx_00_47")); &set_label("avx_00_47",16); &add ($K256,64); sub Xupdate_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],4);', # X[1..4] '&vpalignr ($t3,@X[3],@X[2],4);', # X[9..12] '&vpsrld ($t2,$t0,7);', '&vpaddd (@X[0],@X[0],$t3);', # X[0..3] += X[9..16] '&vpsrld 
($t3,$t0,3);', '&vpslld ($t1,$t0,14);', '&vpxor ($t0,$t3,$t2);', '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] '&vpsrld ($t2,$t2,18-7);', '&vpxor ($t0,$t0,$t1);', '&vpslld ($t1,$t1,25-14);', '&vpxor ($t0,$t0,$t2);', '&vpsrld ($t2,$t3,10);', '&vpxor ($t0,$t0,$t1);', # sigma0(X[1..4]) '&vpsrlq ($t1,$t3,17);', '&vpaddd (@X[0],@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) '&vpxor ($t2,$t2,$t1);', '&vpsrlq ($t3,$t3,19);', '&vpxor ($t2,$t2,$t3);', # sigma1(X[14..15] '&vpshufd ($t3,$t2,0b10000100);', '&vpsrldq ($t3,$t3,8);', '&vpaddd (@X[0],@X[0],$t3);', # X[0..1] += sigma1(X[14..15]) '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] '&vpsrld ($t2,$t3,10);', '&vpsrlq ($t1,$t3,17);', '&vpxor ($t2,$t2,$t1);', '&vpsrlq ($t3,$t3,19);', '&vpxor ($t2,$t2,$t3);', # sigma1(X[16..17] '&vpshufd ($t3,$t2,0b11101000);', '&vpslldq ($t3,$t3,8);', '&vpaddd (@X[0],@X[0],$t3);' # X[2..3] += sigma1(X[16..17]) ); } local *ror = sub { &shrd(@_[0],@_) }; sub AVX_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 120 instructions my $insn; foreach (Xupdate_AVX()) { # 31 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval($insn = shift(@insns)); eval(shift(@insns)) if ($insn =~ /rorx/ && @insns[0] =~ /rorx/); } &vpaddd ($t2,@X[0],&QWP(16*$j,$K256)); foreach (@insns) { eval; } # remaining instructions &vmovdqa (&QWP(32+16*$j,"esp"),$t2); } for ($i=0,$j=0; $j<4; $j++) { &AVX_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmp (&DWP(16*$j,$K256),0x00010203); &jne (&label("avx_00_47")); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } &mov ("esi",&DWP(96,"esp")); #ctx #&mov ($AH[0],&DWP(0,"esp")); &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); #&mov ("edi", &DWP(8,"esp")); &mov ("ecx",&DWP(12,"esp")); &add ($AH[0],&DWP(0,"esi")); &add ($AH[1],&DWP(4,"esi")); &add ("edi",&DWP(8,"esi")); &add ("ecx",&DWP(12,"esi")); &mov (&DWP(0,"esi"),$AH[0]); &mov (&DWP(4,"esi"),$AH[1]); &mov (&DWP(8,"esi"),"edi"); &mov (&DWP(12,"esi"),"ecx"); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"edi"); # magic &mov (&DWP(8,"esp"),"edi"); &mov (&DWP(12,"esp"),"ecx"); #&mov ($E,&DWP(16,"esp")); &mov ("edi",&DWP(20,"esp")); &mov ("ecx",&DWP(24,"esp")); &add ($E,&DWP(16,"esi")); &add ("edi",&DWP(20,"esi")); &add ("ecx",&DWP(24,"esi")); &mov (&DWP(16,"esi"),$E); &mov (&DWP(20,"esi"),"edi"); &mov (&DWP(20,"esp"),"edi"); &mov ("edi",&DWP(28,"esp")); &mov (&DWP(24,"esi"),"ecx"); #&mov (&DWP(16,"esp"),$E); &add ("edi",&DWP(28,"esi")); &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esi"),"edi"); &mov (&DWP(28,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &vmovdqa ($t3,&QWP(64,$K256)); &sub ($K256,3*64); # rewind K &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? 
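#
# [Editorial reference - not part of the generated module.]  The
# Xupdate_AVX comments above track the SHA-256 message schedule; for
# comparison, one scalar schedule step is sketched here as standalone
# Perl, reusing ref_rotr32 from the earlier editorial note (the name
# ref_sched_step is likewise an editorial choice):
sub ref_sched_step {
	my @W = @_;			# the 16 previous words W[t-16]..W[t-1]
	my $s0 = ref_rotr32($W[1],7)  ^ ref_rotr32($W[1],18) ^ ($W[1]>>3);
	my $s1 = ref_rotr32($W[14],17)^ ref_rotr32($W[14],19)^ ($W[14]>>10);
	return ($W[0] + $s0 + $W[9] + $s1) & 0xffffffff;	# W[t]
}
#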
&jb (&label("grand_avx")); &mov ("esp",&DWP(96+12,"esp")); # restore sp &vzeroall (); &function_end_A(); if ($avx>1) { sub bodyx_00_15 () { # +10% ( '&rorx ("ecx",$E,6)', '&rorx ("esi",$E,11)', '&mov (&off($e),$E)', # save $E, modulo-scheduled '&rorx ("edi",$E,25)', '&xor ("ecx","esi")', '&andn ("esi",$E,&off($g))', '&xor ("ecx","edi")', # Sigma1(e) '&and ($E,&off($f))', '&mov (&off($a),$AH[0]);', # save $A, modulo-scheduled '&or ($E,"esi")', # T = Ch(e,f,g) '&rorx ("edi",$AH[0],2)', '&rorx ("esi",$AH[0],13)', '&lea ($E,&DWP(0,$E,"ecx"))', # T += Sigma1(e) '&rorx ("ecx",$AH[0],22)', '&xor ("esi","edi")', '&mov ("edi",&off($b))', '&xor ("ecx","esi")', # Sigma0(a) '&xor ($AH[0],"edi")', # a ^= b, (b^c) in next round '&add ($E,&off($h))', # T += h '&and ($AH[1],$AH[0])', # (b^c) &= (a^b) '&add ($E,&DWP(32+4*($i&15),"esp"))', # T += K[i]+X[i] '&xor ($AH[1],"edi")', # h = Maj(a,b,c) = Ch(a^b,c,b) '&add ("ecx",$E)', # h += T '&add ($E,&off($d))', # d += T '&lea ($AH[1],&DWP(0,$AH[1],"ecx"));'. # h += Sigma0(a) '@AH = reverse(@AH); $i++;' # rotate(a,h) ); } &set_label("AVX_BMI",32); &lea ("esp",&DWP(-96,"esp")); &vzeroall (); # copy ctx->h[0-7] to A,B,C,D,E,F,G,H on stack &mov ($AH[0],&DWP(0,"esi")); &mov ($AH[1],&DWP(4,"esi")); &mov ("ecx",&DWP(8,"esi")); &mov ("edi",&DWP(12,"esi")); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"ecx"); # magic &mov (&DWP(8,"esp"),"ecx"); &mov (&DWP(12,"esp"),"edi"); &mov ($E,&DWP(16,"esi")); &mov ("edi",&DWP(20,"esi")); &mov ("ecx",&DWP(24,"esi")); &mov ("esi",&DWP(28,"esi")); #&mov (&DWP(16,"esp"),$E); &mov (&DWP(20,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esp"),"esi"); &vmovdqa ($t3,&QWP(256,$K256)); &jmp (&label("grand_avx_bmi")); &set_label("grand_avx_bmi",32); # load input, reverse byte order, add K256[0..15], save to stack &vmovdqu (@X[0],&QWP(0,"edi")); &vmovdqu (@X[1],&QWP(16,"edi")); &vmovdqu (@X[2],&QWP(32,"edi")); &vmovdqu (@X[3],&QWP(48,"edi")); &add ("edi",64); &vpshufb (@X[0],@X[0],$t3); &mov (&DWP(96+4,"esp"),"edi"); &vpshufb (@X[1],@X[1],$t3); &vpshufb (@X[2],@X[2],$t3); &vpaddd ($t0,@X[0],&QWP(0,$K256)); &vpshufb (@X[3],@X[3],$t3); &vpaddd ($t1,@X[1],&QWP(16,$K256)); &vpaddd ($t2,@X[2],&QWP(32,$K256)); &vpaddd ($t3,@X[3],&QWP(48,$K256)); &vmovdqa (&QWP(32+0,"esp"),$t0); &vmovdqa (&QWP(32+16,"esp"),$t1); &vmovdqa (&QWP(32+32,"esp"),$t2); &vmovdqa (&QWP(32+48,"esp"),$t3); &jmp (&label("avx_bmi_00_47")); &set_label("avx_bmi_00_47",16); &add ($K256,64); for ($i=0,$j=0; $j<4; $j++) { &AVX_00_47($j,\&bodyx_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmp (&DWP(16*$j,$K256),0x00010203); &jne (&label("avx_bmi_00_47")); for ($i=0; $i<16; ) { foreach(bodyx_00_15()) { eval; } } &mov ("esi",&DWP(96,"esp")); #ctx #&mov ($AH[0],&DWP(0,"esp")); &xor ($AH[1],"edi"); #&mov ($AH[1],&DWP(4,"esp")); #&mov ("edi", &DWP(8,"esp")); &mov ("ecx",&DWP(12,"esp")); &add ($AH[0],&DWP(0,"esi")); &add ($AH[1],&DWP(4,"esi")); &add ("edi",&DWP(8,"esi")); &add ("ecx",&DWP(12,"esi")); &mov (&DWP(0,"esi"),$AH[0]); &mov (&DWP(4,"esi"),$AH[1]); &mov (&DWP(8,"esi"),"edi"); &mov (&DWP(12,"esi"),"ecx"); #&mov (&DWP(0,"esp"),$AH[0]); &mov (&DWP(4,"esp"),$AH[1]); &xor ($AH[1],"edi"); # magic &mov (&DWP(8,"esp"),"edi"); &mov (&DWP(12,"esp"),"ecx"); #&mov ($E,&DWP(16,"esp")); &mov ("edi",&DWP(20,"esp")); &mov ("ecx",&DWP(24,"esp")); &add ($E,&DWP(16,"esi")); &add ("edi",&DWP(20,"esi")); &add ("ecx",&DWP(24,"esi")); &mov (&DWP(16,"esi"),$E); &mov (&DWP(20,"esi"),"edi"); &mov (&DWP(20,"esp"),"edi"); &mov 
("edi",&DWP(28,"esp")); &mov (&DWP(24,"esi"),"ecx"); #&mov (&DWP(16,"esp"),$E); &add ("edi",&DWP(28,"esi")); &mov (&DWP(24,"esp"),"ecx"); &mov (&DWP(28,"esi"),"edi"); &mov (&DWP(28,"esp"),"edi"); &mov ("edi",&DWP(96+4,"esp")); # inp &vmovdqa ($t3,&QWP(64,$K256)); &sub ($K256,3*64); # rewind K &cmp ("edi",&DWP(96+8,"esp")); # are we done yet? &jb (&label("grand_avx_bmi")); &mov ("esp",&DWP(96+12,"esp")); # restore sp &vzeroall (); &function_end_A(); } } }}} &function_end_B("sha256_block_data_order"); &asm_finish(); close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/sha/asm/sha256-mb-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/sha/asm/sha256-mb-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/sha/asm/sha256-mb-x86_64.pl (revision 364963) @@ -1,1614 +1,1614 @@ #! /usr/bin/env perl # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # Multi-buffer SHA256 procedure processes n buffers in parallel by # placing buffer data to designated lane of SIMD register. n is # naturally limited to 4 on pre-AVX2 processors and to 8 on # AVX2-capable processors such as Haswell. # # this +aesni(i) sha256 aesni-sha256 gain(iv) # ------------------------------------------------------------------- # Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126% # Atom(ii) 38.7/n +3.93=13.6(n=4) 20.8 +5.69=26.5 +95% # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103% # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82% # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170% # Skylake (18.9 +5.00=23.9)/n 7.70 8.17 +170% # Bulldozer (21.6 +5.76=27.4)/n 13.6 13.7 +100% # # (i) multi-block CBC encrypt with 128-bit key; # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom, # because of lower AES-NI instruction throughput, nor is there # AES-NI-SHA256 stitch for these processors; # (iii) "this" is for n=8, when we gather twice as much data, result # for n=4 is 20.3+4.44=24.7; # (iv) presented improvement coefficients are asymptotic limits and # in real-life application are somewhat lower, e.g. 
for 2KB # fragments they range from 75% to 130% (on Haswell); $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; $avx=0; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; # void sha256_multi_block ( # struct { unsigned int A[8]; # unsigned int B[8]; # unsigned int C[8]; # unsigned int D[8]; # unsigned int E[8]; # unsigned int F[8]; # unsigned int G[8]; # unsigned int H[8]; } *ctx, # struct { void *ptr; int blocks; } inp[8], # int num); /* 1 or 2 */ # $ctx="%rdi"; # 1st arg $inp="%rsi"; # 2nd arg $num="%edx"; # 3rd arg @ptr=map("%r$_",(8..11)); $Tbl="%rbp"; @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15)); ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7)); $REG_SZ=16; sub Xi_off { my $off = shift; $off %= 16; $off *= $REG_SZ; $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)"; } sub ROUND_00_15 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; $code.=<<___ if ($i<15); movd `4*$i`(@ptr[0]),$Xi movd `4*$i`(@ptr[1]),$t1 movd `4*$i`(@ptr[2]),$t2 movd `4*$i`(@ptr[3]),$t3 punpckldq $t2,$Xi punpckldq $t3,$t1 punpckldq $t1,$Xi ___ $code.=<<___ if ($i==15); movd `4*$i`(@ptr[0]),$Xi lea `16*4`(@ptr[0]),@ptr[0] movd `4*$i`(@ptr[1]),$t1 lea `16*4`(@ptr[1]),@ptr[1] movd `4*$i`(@ptr[2]),$t2 lea `16*4`(@ptr[2]),@ptr[2] movd `4*$i`(@ptr[3]),$t3 lea `16*4`(@ptr[3]),@ptr[3] punpckldq $t2,$Xi punpckldq $t3,$t1 punpckldq $t1,$Xi ___ $code.=<<___; movdqa $e,$sigma `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==0)` movdqa $e,$t3 `"pshufb $Xn,$Xi" if ($i<=15 && ($i&1)==1)` psrld \$6,$sigma movdqa $e,$t2 pslld \$7,$t3 movdqa $Xi,`&Xi_off($i)` paddd $h,$Xi # Xi+=h psrld \$11,$t2 pxor $t3,$sigma pslld \$21-7,$t3 paddd `32*($i%8)-128`($Tbl),$Xi # Xi+=K[round] pxor $t2,$sigma psrld \$25-11,$t2 movdqa $e,$t1 `"prefetcht0 63(@ptr[0])" if ($i==15)` pxor $t3,$sigma movdqa $e,$axb # borrow $axb pslld \$26-21,$t3 pandn $g,$t1 pand $f,$axb pxor $t2,$sigma `"prefetcht0 63(@ptr[1])" if ($i==15)` movdqa $a,$t2 pxor $t3,$sigma # Sigma1(e) movdqa $a,$t3 psrld \$2,$t2 paddd $sigma,$Xi # Xi+=Sigma1(e) pxor $axb,$t1 # Ch(e,f,g) movdqa $b,$axb movdqa $a,$sigma pslld \$10,$t3 pxor $a,$axb # a^b, b^c in next round `"prefetcht0 63(@ptr[2])" if ($i==15)` psrld \$13,$sigma pxor $t3,$t2 paddd $t1,$Xi # Xi+=Ch(e,f,g) pslld \$19-10,$t3 pand $axb,$bxc pxor $sigma,$t2 `"prefetcht0 63(@ptr[3])" if ($i==15)` psrld \$22-13,$sigma pxor $t3,$t2 movdqa $b,$h pslld \$30-19,$t3 pxor $t2,$sigma pxor $bxc,$h # h=Maj(a,b,c)=Ch(a^b,c,b) paddd $Xi,$d # d+=Xi pxor $t3,$sigma # Sigma0(a) paddd $Xi,$h # h+=Xi paddd $sigma,$h # h+=Sigma0(a) ___ $code.=<<___ if (($i%8)==7); lea 
`32*8`($Tbl),$Tbl ___ ($axb,$bxc)=($bxc,$axb); } sub ROUND_16_XX { my $i=shift; $code.=<<___; movdqa `&Xi_off($i+1)`,$Xn paddd `&Xi_off($i+9)`,$Xi # Xi+=X[i+9] movdqa $Xn,$sigma movdqa $Xn,$t2 psrld \$3,$sigma movdqa $Xn,$t3 psrld \$7,$t2 movdqa `&Xi_off($i+14)`,$t1 pslld \$14,$t3 pxor $t2,$sigma psrld \$18-7,$t2 movdqa $t1,$axb # borrow $axb pxor $t3,$sigma pslld \$25-14,$t3 pxor $t2,$sigma psrld \$10,$t1 movdqa $axb,$t2 psrld \$17,$axb pxor $t3,$sigma # sigma0(X[i+1]) pslld \$13,$t2 paddd $sigma,$Xi # Xi+=sigma0(e) pxor $axb,$t1 psrld \$19-17,$axb pxor $t2,$t1 pslld \$15-13,$t2 pxor $axb,$t1 pxor $t2,$t1 # sigma0(X[i+14]) paddd $t1,$Xi # Xi+=sigma1(X[i+14]) ___ &ROUND_00_15($i,@_); ($Xi,$Xn)=($Xn,$Xi); } $code.=<<___; .text .extern OPENSSL_ia32cap_P .globl sha256_multi_block .type sha256_multi_block,\@function,3 .align 32 sha256_multi_block: .cfi_startproc mov OPENSSL_ia32cap_P+4(%rip),%rcx bt \$61,%rcx # check SHA bit jc _shaext_shortcut ___ $code.=<<___ if ($avx); test \$`1<<28`,%ecx jnz _avx_shortcut ___ $code.=<<___; mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,-0x78(%rax) movaps %xmm11,-0x68(%rax) movaps %xmm12,-0x58(%rax) movaps %xmm13,-0x48(%rax) movaps %xmm14,-0x38(%rax) movaps %xmm15,-0x28(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody: lea K256+128(%rip),$Tbl lea `$REG_SZ*16`(%rsp),%rbx lea 0x80($ctx),$ctx # size optimization .Loop_grande: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldone movdqu 0x00-0x80($ctx),$A # load context lea 128(%rsp),%rax movdqu 0x20-0x80($ctx),$B movdqu 0x40-0x80($ctx),$C movdqu 0x60-0x80($ctx),$D movdqu 0x80-0x80($ctx),$E movdqu 0xa0-0x80($ctx),$F movdqu 0xc0-0x80($ctx),$G movdqu 0xe0-0x80($ctx),$H movdqu .Lpbswap(%rip),$Xn jmp .Loop .align 32 .Loop: movdqa $C,$bxc pxor $B,$bxc # magic seed ___ for($i=0;$i<16;$i++) { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); } $code.=<<___; movdqu `&Xi_off($i)`,$Xi mov \$3,%ecx jmp .Loop_16_xx .align 32 .Loop_16_xx: ___ for(;$i<32;$i++) { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); } $code.=<<___; dec %ecx jnz .Loop_16_xx mov \$1,%ecx lea K256+128(%rip),$Tbl movdqa (%rbx),$sigma # pull counters cmp 4*0(%rbx),%ecx # examine counters pxor $t1,$t1 cmovge $Tbl,@ptr[0] # cancel input cmp 4*1(%rbx),%ecx movdqa $sigma,$Xn cmovge $Tbl,@ptr[1] cmp 4*2(%rbx),%ecx pcmpgtd $t1,$Xn # mask value cmovge $Tbl,@ptr[2] cmp 4*3(%rbx),%ecx paddd $Xn,$sigma # counters-- cmovge $Tbl,@ptr[3] movdqu 0x00-0x80($ctx),$t1 pand $Xn,$A movdqu 0x20-0x80($ctx),$t2 pand $Xn,$B movdqu 0x40-0x80($ctx),$t3 pand $Xn,$C movdqu 0x60-0x80($ctx),$Xi pand $Xn,$D paddd $t1,$A movdqu 0x80-0x80($ctx),$t1 pand $Xn,$E paddd $t2,$B movdqu 0xa0-0x80($ctx),$t2 pand $Xn,$F paddd $t3,$C movdqu 0xc0-0x80($ctx),$t3 pand $Xn,$G paddd $Xi,$D movdqu 0xe0-0x80($ctx),$Xi pand $Xn,$H paddd $t1,$E paddd $t2,$F movdqu $A,0x00-0x80($ctx) paddd $t3,$G movdqu $B,0x20-0x80($ctx) paddd $Xi,$H movdqu $C,0x40-0x80($ctx) movdqu $D,0x60-0x80($ctx) movdqu 
$E,0x80-0x80($ctx) movdqu $F,0xa0-0x80($ctx) movdqu $G,0xc0-0x80($ctx) movdqu $H,0xe0-0x80($ctx) movdqa $sigma,(%rbx) # save counters movdqa .Lpbswap(%rip),$Xn dec $num jnz .Loop mov `$REG_SZ*17+8`(%rsp),$num lea $REG_SZ($ctx),$ctx lea `16*$REG_SZ/4`($inp),$inp dec $num jnz .Loop_grande .Ldone: mov `$REG_SZ*17`(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 movaps -0xa8(%rax),%xmm7 movaps -0x98(%rax),%xmm8 movaps -0x88(%rax),%xmm9 movaps -0x78(%rax),%xmm10 movaps -0x68(%rax),%xmm11 movaps -0x58(%rax),%xmm12 movaps -0x48(%rax),%xmm13 movaps -0x38(%rax),%xmm14 movaps -0x28(%rax),%xmm15 ___ $code.=<<___; mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue: ret .cfi_endproc .size sha256_multi_block,.-sha256_multi_block ___ {{{ my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15)); my @MSG0=map("%xmm$_",(4..7)); my @MSG1=map("%xmm$_",(8..11)); $code.=<<___; .type sha256_multi_block_shaext,\@function,3 .align 32 sha256_multi_block_shaext: .cfi_startproc _shaext_shortcut: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,-0x78(%rax) movaps %xmm11,-0x68(%rax) movaps %xmm12,-0x58(%rax) movaps %xmm13,-0x48(%rax) movaps %xmm14,-0x38(%rax) movaps %xmm15,-0x28(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`,%rsp shl \$1,$num # we process pair at a time and \$-256,%rsp lea 0x80($ctx),$ctx # size optimization mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .Lbody_shaext: lea `$REG_SZ*16`(%rsp),%rbx lea K256_shaext+0x80(%rip),$Tbl .Loop_grande_shaext: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<2;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle %rsp,@ptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldone_shaext movq 0x00-0x80($ctx),$ABEF0 # A1.A0 movq 0x20-0x80($ctx),@MSG0[0] # B1.B0 movq 0x40-0x80($ctx),$CDGH0 # C1.C0 movq 0x60-0x80($ctx),@MSG0[1] # D1.D0 movq 0x80-0x80($ctx),@MSG1[0] # E1.E0 movq 0xa0-0x80($ctx),@MSG1[1] # F1.F0 movq 0xc0-0x80($ctx),@MSG1[2] # G1.G0 movq 0xe0-0x80($ctx),@MSG1[3] # H1.H0 punpckldq @MSG0[0],$ABEF0 # B1.A1.B0.A0 punpckldq @MSG0[1],$CDGH0 # D1.C1.D0.C0 punpckldq @MSG1[1],@MSG1[0] # F1.E1.F0.E0 punpckldq @MSG1[3],@MSG1[2] # H1.G1.H0.G0 movdqa K256_shaext-0x10(%rip),$TMPx # byte swap movdqa $ABEF0,$ABEF1 movdqa $CDGH0,$CDGH1 punpcklqdq @MSG1[0],$ABEF0 # F0.E0.B0.A0 punpcklqdq @MSG1[2],$CDGH0 # H0.G0.D0.C0 punpckhqdq @MSG1[0],$ABEF1 # F1.E1.B1.A1 punpckhqdq @MSG1[2],$CDGH1 # H1.G1.D1.C1 pshufd \$0b00011011,$ABEF0,$ABEF0 pshufd \$0b00011011,$CDGH0,$CDGH0 pshufd \$0b00011011,$ABEF1,$ABEF1 pshufd \$0b00011011,$CDGH1,$CDGH1 jmp .Loop_shaext .align 32 .Loop_shaext: movdqu 0x00(@ptr[0]),@MSG0[0] movdqu 0x00(@ptr[1]),@MSG1[0] movdqu 0x10(@ptr[0]),@MSG0[1] movdqu 0x10(@ptr[1]),@MSG1[1] movdqu 0x20(@ptr[0]),@MSG0[2] pshufb $TMPx,@MSG0[0] movdqu 0x20(@ptr[1]),@MSG1[2] pshufb $TMPx,@MSG1[0] movdqu 0x30(@ptr[0]),@MSG0[3] lea 0x40(@ptr[0]),@ptr[0] movdqu 0x30(@ptr[1]),@MSG1[3] lea 0x40(@ptr[1]),@ptr[1] movdqa 0*16-0x80($Tbl),$Wi pshufb $TMPx,@MSG0[1] paddd @MSG0[0],$Wi pxor $ABEF0,@MSG0[0] # black magic movdqa $Wi,$TMP0 movdqa 
0*16-0x80($Tbl),$TMP1 pshufb $TMPx,@MSG1[1] paddd @MSG1[0],$TMP1 movdqa $CDGH0,0x50(%rsp) # offload sha256rnds2 $ABEF0,$CDGH0 # 0-3 pxor $ABEF1,@MSG1[0] # black magic movdqa $TMP1,$Wi movdqa $CDGH1,0x70(%rsp) sha256rnds2 $ABEF1,$CDGH1 # 0-3 pshufd \$0x0e,$TMP0,$Wi pxor $ABEF0,@MSG0[0] # black magic movdqa $ABEF0,0x40(%rsp) # offload sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi pxor $ABEF1,@MSG1[0] # black magic movdqa $ABEF1,0x60(%rsp) movdqa 1*16-0x80($Tbl),$TMP0 paddd @MSG0[1],$TMP0 pshufb $TMPx,@MSG0[2] sha256rnds2 $CDGH1,$ABEF1 movdqa $TMP0,$Wi movdqa 1*16-0x80($Tbl),$TMP1 paddd @MSG1[1],$TMP1 sha256rnds2 $ABEF0,$CDGH0 # 4-7 movdqa $TMP1,$Wi prefetcht0 127(@ptr[0]) pshufb $TMPx,@MSG0[3] pshufb $TMPx,@MSG1[2] prefetcht0 127(@ptr[1]) sha256rnds2 $ABEF1,$CDGH1 # 4-7 pshufd \$0x0e,$TMP0,$Wi pshufb $TMPx,@MSG1[3] sha256msg1 @MSG0[1],@MSG0[0] sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi movdqa 2*16-0x80($Tbl),$TMP0 paddd @MSG0[2],$TMP0 sha256rnds2 $CDGH1,$ABEF1 movdqa $TMP0,$Wi movdqa 2*16-0x80($Tbl),$TMP1 paddd @MSG1[2],$TMP1 sha256rnds2 $ABEF0,$CDGH0 # 8-11 sha256msg1 @MSG1[1],@MSG1[0] movdqa $TMP1,$Wi movdqa @MSG0[3],$TMPx sha256rnds2 $ABEF1,$CDGH1 # 8-11 pshufd \$0x0e,$TMP0,$Wi palignr \$4,@MSG0[2],$TMPx paddd $TMPx,@MSG0[0] movdqa @MSG1[3],$TMPx palignr \$4,@MSG1[2],$TMPx sha256msg1 @MSG0[2],@MSG0[1] sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi movdqa 3*16-0x80($Tbl),$TMP0 paddd @MSG0[3],$TMP0 sha256rnds2 $CDGH1,$ABEF1 sha256msg1 @MSG1[2],@MSG1[1] movdqa $TMP0,$Wi movdqa 3*16-0x80($Tbl),$TMP1 paddd $TMPx,@MSG1[0] paddd @MSG1[3],$TMP1 sha256msg2 @MSG0[3],@MSG0[0] sha256rnds2 $ABEF0,$CDGH0 # 12-15 movdqa $TMP1,$Wi movdqa @MSG0[0],$TMPx palignr \$4,@MSG0[3],$TMPx sha256rnds2 $ABEF1,$CDGH1 # 12-15 sha256msg2 @MSG1[3],@MSG1[0] pshufd \$0x0e,$TMP0,$Wi paddd $TMPx,@MSG0[1] movdqa @MSG1[0],$TMPx palignr \$4,@MSG1[3],$TMPx sha256msg1 @MSG0[3],@MSG0[2] sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi movdqa 4*16-0x80($Tbl),$TMP0 paddd @MSG0[0],$TMP0 sha256rnds2 $CDGH1,$ABEF1 sha256msg1 @MSG1[3],@MSG1[2] ___ for($i=4;$i<16-3;$i++) { $code.=<<___; movdqa $TMP0,$Wi movdqa $i*16-0x80($Tbl),$TMP1 paddd $TMPx,@MSG1[1] paddd @MSG1[0],$TMP1 sha256msg2 @MSG0[0],@MSG0[1] sha256rnds2 $ABEF0,$CDGH0 # 16-19... movdqa $TMP1,$Wi movdqa @MSG0[1],$TMPx palignr \$4,@MSG0[0],$TMPx sha256rnds2 $ABEF1,$CDGH1 # 16-19... 
sha256msg2 @MSG1[0],@MSG1[1] pshufd \$0x0e,$TMP0,$Wi paddd $TMPx,@MSG0[2] movdqa @MSG1[1],$TMPx palignr \$4,@MSG1[0],$TMPx sha256msg1 @MSG0[0],@MSG0[3] sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi movdqa `($i+1)*16`-0x80($Tbl),$TMP0 paddd @MSG0[1],$TMP0 sha256rnds2 $CDGH1,$ABEF1 sha256msg1 @MSG1[0],@MSG1[3] ___ push(@MSG0,shift(@MSG0)); push(@MSG1,shift(@MSG1)); } $code.=<<___; movdqa $TMP0,$Wi movdqa 13*16-0x80($Tbl),$TMP1 paddd $TMPx,@MSG1[1] paddd @MSG1[0],$TMP1 sha256msg2 @MSG0[0],@MSG0[1] sha256rnds2 $ABEF0,$CDGH0 # 52-55 movdqa $TMP1,$Wi movdqa @MSG0[1],$TMPx palignr \$4,@MSG0[0],$TMPx sha256rnds2 $ABEF1,$CDGH1 # 52-55 sha256msg2 @MSG1[0],@MSG1[1] pshufd \$0x0e,$TMP0,$Wi paddd $TMPx,@MSG0[2] movdqa @MSG1[1],$TMPx palignr \$4,@MSG1[0],$TMPx nop sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi movdqa 14*16-0x80($Tbl),$TMP0 paddd @MSG0[1],$TMP0 sha256rnds2 $CDGH1,$ABEF1 movdqa $TMP0,$Wi movdqa 14*16-0x80($Tbl),$TMP1 paddd $TMPx,@MSG1[2] paddd @MSG1[1],$TMP1 sha256msg2 @MSG0[1],@MSG0[2] nop sha256rnds2 $ABEF0,$CDGH0 # 56-59 movdqa $TMP1,$Wi mov \$1,%ecx pxor @MSG0[1],@MSG0[1] # zero sha256rnds2 $ABEF1,$CDGH1 # 56-59 sha256msg2 @MSG1[1],@MSG1[2] pshufd \$0x0e,$TMP0,$Wi movdqa 15*16-0x80($Tbl),$TMP0 paddd @MSG0[2],$TMP0 movq (%rbx),@MSG0[2] # pull counters nop sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi movdqa 15*16-0x80($Tbl),$TMP1 paddd @MSG1[2],$TMP1 sha256rnds2 $CDGH1,$ABEF1 movdqa $TMP0,$Wi cmp 4*0(%rbx),%ecx # examine counters cmovge %rsp,@ptr[0] # cancel input cmp 4*1(%rbx),%ecx cmovge %rsp,@ptr[1] pshufd \$0x00,@MSG0[2],@MSG1[0] sha256rnds2 $ABEF0,$CDGH0 # 60-63 movdqa $TMP1,$Wi pshufd \$0x55,@MSG0[2],@MSG1[1] movdqa @MSG0[2],@MSG1[2] sha256rnds2 $ABEF1,$CDGH1 # 60-63 pshufd \$0x0e,$TMP0,$Wi pcmpgtd @MSG0[1],@MSG1[0] pcmpgtd @MSG0[1],@MSG1[1] sha256rnds2 $CDGH0,$ABEF0 pshufd \$0x0e,$TMP1,$Wi pcmpgtd @MSG0[1],@MSG1[2] # counter mask movdqa K256_shaext-0x10(%rip),$TMPx sha256rnds2 $CDGH1,$ABEF1 pand @MSG1[0],$CDGH0 pand @MSG1[1],$CDGH1 pand @MSG1[0],$ABEF0 pand @MSG1[1],$ABEF1 paddd @MSG0[2],@MSG1[2] # counters-- paddd 0x50(%rsp),$CDGH0 paddd 0x70(%rsp),$CDGH1 paddd 0x40(%rsp),$ABEF0 paddd 0x60(%rsp),$ABEF1 movq @MSG1[2],(%rbx) # save counters dec $num jnz .Loop_shaext mov `$REG_SZ*17+8`(%rsp),$num pshufd \$0b00011011,$ABEF0,$ABEF0 pshufd \$0b00011011,$CDGH0,$CDGH0 pshufd \$0b00011011,$ABEF1,$ABEF1 pshufd \$0b00011011,$CDGH1,$CDGH1 movdqa $ABEF0,@MSG0[0] movdqa $CDGH0,@MSG0[1] punpckldq $ABEF1,$ABEF0 # B1.B0.A1.A0 punpckhdq $ABEF1,@MSG0[0] # F1.F0.E1.E0 punpckldq $CDGH1,$CDGH0 # D1.D0.C1.C0 punpckhdq $CDGH1,@MSG0[1] # H1.H0.G1.G0 movq $ABEF0,0x00-0x80($ctx) # A1.A0 psrldq \$8,$ABEF0 movq @MSG0[0],0x80-0x80($ctx) # E1.E0 psrldq \$8,@MSG0[0] movq $ABEF0,0x20-0x80($ctx) # B1.B0 movq @MSG0[0],0xa0-0x80($ctx) # F1.F0 movq $CDGH0,0x40-0x80($ctx) # C1.C0 psrldq \$8,$CDGH0 movq @MSG0[1],0xc0-0x80($ctx) # G1.G0 psrldq \$8,@MSG0[1] movq $CDGH0,0x60-0x80($ctx) # D1.D0 movq @MSG0[1],0xe0-0x80($ctx) # H1.H0 lea `$REG_SZ/2`($ctx),$ctx lea `16*2`($inp),$inp dec $num jnz .Loop_grande_shaext .Ldone_shaext: #mov `$REG_SZ*17`(%rsp),%rax # original %rsp ___ $code.=<<___ if ($win64); movaps -0xb8(%rax),%xmm6 movaps -0xa8(%rax),%xmm7 movaps -0x98(%rax),%xmm8 movaps -0x88(%rax),%xmm9 movaps -0x78(%rax),%xmm10 movaps -0x68(%rax),%xmm11 movaps -0x58(%rax),%xmm12 movaps -0x48(%rax),%xmm13 movaps -0x38(%rax),%xmm14 movaps -0x28(%rax),%xmm15 ___ $code.=<<___; mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_shaext: ret 
.cfi_endproc .size sha256_multi_block_shaext,.-sha256_multi_block_shaext ___ }}} if ($avx) {{{ sub ROUND_00_15_avx { my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_; $code.=<<___ if ($i<15 && $REG_SZ==16); vmovd `4*$i`(@ptr[0]),$Xi vmovd `4*$i`(@ptr[1]),$t1 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1 vpunpckldq $t1,$Xi,$Xi vpshufb $Xn,$Xi,$Xi ___ $code.=<<___ if ($i==15 && $REG_SZ==16); vmovd `4*$i`(@ptr[0]),$Xi lea `16*4`(@ptr[0]),@ptr[0] vmovd `4*$i`(@ptr[1]),$t1 lea `16*4`(@ptr[1]),@ptr[1] vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi lea `16*4`(@ptr[2]),@ptr[2] vpinsrd \$1,`4*$i`(@ptr[3]),$t1,$t1 lea `16*4`(@ptr[3]),@ptr[3] vpunpckldq $t1,$Xi,$Xi vpshufb $Xn,$Xi,$Xi ___ $code.=<<___ if ($i<15 && $REG_SZ==32); vmovd `4*$i`(@ptr[0]),$Xi vmovd `4*$i`(@ptr[4]),$t1 vmovd `4*$i`(@ptr[1]),$t2 vmovd `4*$i`(@ptr[5]),$t3 vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1 vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2 vpunpckldq $t2,$Xi,$Xi vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3 vpunpckldq $t3,$t1,$t1 vinserti128 $t1,$Xi,$Xi vpshufb $Xn,$Xi,$Xi ___ $code.=<<___ if ($i==15 && $REG_SZ==32); vmovd `4*$i`(@ptr[0]),$Xi lea `16*4`(@ptr[0]),@ptr[0] vmovd `4*$i`(@ptr[4]),$t1 lea `16*4`(@ptr[4]),@ptr[4] vmovd `4*$i`(@ptr[1]),$t2 lea `16*4`(@ptr[1]),@ptr[1] vmovd `4*$i`(@ptr[5]),$t3 lea `16*4`(@ptr[5]),@ptr[5] vpinsrd \$1,`4*$i`(@ptr[2]),$Xi,$Xi lea `16*4`(@ptr[2]),@ptr[2] vpinsrd \$1,`4*$i`(@ptr[6]),$t1,$t1 lea `16*4`(@ptr[6]),@ptr[6] vpinsrd \$1,`4*$i`(@ptr[3]),$t2,$t2 lea `16*4`(@ptr[3]),@ptr[3] vpunpckldq $t2,$Xi,$Xi vpinsrd \$1,`4*$i`(@ptr[7]),$t3,$t3 lea `16*4`(@ptr[7]),@ptr[7] vpunpckldq $t3,$t1,$t1 vinserti128 $t1,$Xi,$Xi vpshufb $Xn,$Xi,$Xi ___ $code.=<<___; vpsrld \$6,$e,$sigma vpslld \$26,$e,$t3 vmovdqu $Xi,`&Xi_off($i)` vpaddd $h,$Xi,$Xi # Xi+=h vpsrld \$11,$e,$t2 vpxor $t3,$sigma,$sigma vpslld \$21,$e,$t3 vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi # Xi+=K[round] vpxor $t2,$sigma,$sigma vpsrld \$25,$e,$t2 vpxor $t3,$sigma,$sigma `"prefetcht0 63(@ptr[0])" if ($i==15)` vpslld \$7,$e,$t3 vpandn $g,$e,$t1 vpand $f,$e,$axb # borrow $axb `"prefetcht0 63(@ptr[1])" if ($i==15)` vpxor $t2,$sigma,$sigma vpsrld \$2,$a,$h # borrow $h vpxor $t3,$sigma,$sigma # Sigma1(e) `"prefetcht0 63(@ptr[2])" if ($i==15)` vpslld \$30,$a,$t2 vpxor $axb,$t1,$t1 # Ch(e,f,g) vpxor $a,$b,$axb # a^b, b^c in next round `"prefetcht0 63(@ptr[3])" if ($i==15)` vpxor $t2,$h,$h vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e) vpsrld \$13,$a,$t2 `"prefetcht0 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` vpslld \$19,$a,$t3 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g) vpand $axb,$bxc,$bxc `"prefetcht0 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` vpxor $t2,$h,$sigma vpsrld \$22,$a,$t2 vpxor $t3,$sigma,$sigma `"prefetcht0 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` vpslld \$10,$a,$t3 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b) vpaddd $Xi,$d,$d # d+=Xi `"prefetcht0 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` vpxor $t2,$sigma,$sigma vpxor $t3,$sigma,$sigma # Sigma0(a) vpaddd $Xi,$h,$h # h+=Xi vpaddd $sigma,$h,$h # h+=Sigma0(a) ___ $code.=<<___ if (($i%8)==7); add \$`32*8`,$Tbl ___ ($axb,$bxc)=($bxc,$axb); } sub ROUND_16_XX_avx { my $i=shift; $code.=<<___; vmovdqu `&Xi_off($i+1)`,$Xn vpaddd `&Xi_off($i+9)`,$Xi,$Xi # Xi+=X[i+9] vpsrld \$3,$Xn,$sigma vpsrld \$7,$Xn,$t2 vpslld \$25,$Xn,$t3 vpxor $t2,$sigma,$sigma vpsrld \$18,$Xn,$t2 vpxor $t3,$sigma,$sigma vpslld \$14,$Xn,$t3 vmovdqu `&Xi_off($i+14)`,$t1 vpsrld \$10,$t1,$axb # borrow $axb vpxor $t2,$sigma,$sigma vpsrld \$17,$t1,$t2 vpxor $t3,$sigma,$sigma # sigma0(X[i+1]) vpslld \$15,$t1,$t3 vpaddd $sigma,$Xi,$Xi # 
Xi+=sigma0(e) vpxor $t2,$axb,$sigma vpsrld \$19,$t1,$t2 vpxor $t3,$sigma,$sigma vpslld \$13,$t1,$t3 vpxor $t2,$sigma,$sigma vpxor $t3,$sigma,$sigma # sigma0(X[i+14]) vpaddd $sigma,$Xi,$Xi # Xi+=sigma1(X[i+14]) ___ &ROUND_00_15_avx($i,@_); ($Xi,$Xn)=($Xn,$Xi); } $code.=<<___; .type sha256_multi_block_avx,\@function,3 .align 32 sha256_multi_block_avx: .cfi_startproc _avx_shortcut: ___ $code.=<<___ if ($avx>1); shr \$32,%rcx cmp \$2,$num jb .Lavx test \$`1<<5`,%ecx jnz _avx2_shortcut jmp .Lavx .align 32 .Lavx: ___ $code.=<<___; mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,-0x78(%rax) movaps %xmm11,-0x68(%rax) movaps %xmm12,-0x58(%rax) movaps %xmm13,-0x48(%rax) movaps %xmm14,-0x38(%rax) movaps %xmm15,-0x28(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody_avx: lea K256+128(%rip),$Tbl lea `$REG_SZ*16`(%rsp),%rbx lea 0x80($ctx),$ctx # size optimization .Loop_grande_avx: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num ___ for($i=0;$i<4;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; test $num,$num jz .Ldone_avx vmovdqu 0x00-0x80($ctx),$A # load context lea 128(%rsp),%rax vmovdqu 0x20-0x80($ctx),$B vmovdqu 0x40-0x80($ctx),$C vmovdqu 0x60-0x80($ctx),$D vmovdqu 0x80-0x80($ctx),$E vmovdqu 0xa0-0x80($ctx),$F vmovdqu 0xc0-0x80($ctx),$G vmovdqu 0xe0-0x80($ctx),$H vmovdqu .Lpbswap(%rip),$Xn jmp .Loop_avx .align 32 .Loop_avx: vpxor $B,$C,$bxc # magic seed ___ for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; vmovdqu `&Xi_off($i)`,$Xi mov \$3,%ecx jmp .Loop_16_xx_avx .align 32 .Loop_16_xx_avx: ___ for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; dec %ecx jnz .Loop_16_xx_avx mov \$1,%ecx lea K256+128(%rip),$Tbl ___ for($i=0;$i<4;$i++) { $code.=<<___; cmp `4*$i`(%rbx),%ecx # examine counters cmovge $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; vmovdqa (%rbx),$sigma # pull counters vpxor $t1,$t1,$t1 vmovdqa $sigma,$Xn vpcmpgtd $t1,$Xn,$Xn # mask value vpaddd $Xn,$sigma,$sigma # counters-- vmovdqu 0x00-0x80($ctx),$t1 vpand $Xn,$A,$A vmovdqu 0x20-0x80($ctx),$t2 vpand $Xn,$B,$B vmovdqu 0x40-0x80($ctx),$t3 vpand $Xn,$C,$C vmovdqu 0x60-0x80($ctx),$Xi vpand $Xn,$D,$D vpaddd $t1,$A,$A vmovdqu 0x80-0x80($ctx),$t1 vpand $Xn,$E,$E vpaddd $t2,$B,$B vmovdqu 0xa0-0x80($ctx),$t2 vpand $Xn,$F,$F vpaddd $t3,$C,$C vmovdqu 0xc0-0x80($ctx),$t3 vpand $Xn,$G,$G vpaddd $Xi,$D,$D vmovdqu 0xe0-0x80($ctx),$Xi vpand $Xn,$H,$H vpaddd $t1,$E,$E vpaddd $t2,$F,$F vmovdqu $A,0x00-0x80($ctx) vpaddd $t3,$G,$G vmovdqu $B,0x20-0x80($ctx) vpaddd $Xi,$H,$H vmovdqu $C,0x40-0x80($ctx) vmovdqu $D,0x60-0x80($ctx) vmovdqu $E,0x80-0x80($ctx) vmovdqu $F,0xa0-0x80($ctx) vmovdqu $G,0xc0-0x80($ctx) vmovdqu $H,0xe0-0x80($ctx) vmovdqu $sigma,(%rbx) # save counters vmovdqu .Lpbswap(%rip),$Xn dec $num jnz .Loop_avx mov `$REG_SZ*17+8`(%rsp),$num lea $REG_SZ($ctx),$ctx lea `16*$REG_SZ/4`($inp),$inp dec $num jnz .Loop_grande_avx .Ldone_avx: mov `$REG_SZ*17`(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 vzeroupper ___ $code.=<<___ if ($win64); movaps 
-0xb8(%rax),%xmm6 movaps -0xa8(%rax),%xmm7 movaps -0x98(%rax),%xmm8 movaps -0x88(%rax),%xmm9 movaps -0x78(%rax),%xmm10 movaps -0x68(%rax),%xmm11 movaps -0x58(%rax),%xmm12 movaps -0x48(%rax),%xmm13 movaps -0x38(%rax),%xmm14 movaps -0x28(%rax),%xmm15 ___ $code.=<<___; mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size sha256_multi_block_avx,.-sha256_multi_block_avx ___ if ($avx>1) { $code =~ s/\`([^\`]*)\`/eval $1/gem; $REG_SZ=32; @ptr=map("%r$_",(12..15,8..11)); @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15)); ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7)); $code.=<<___; .type sha256_multi_block_avx2,\@function,3 .align 32 sha256_multi_block_avx2: .cfi_startproc _avx2_shortcut: mov %rsp,%rax .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 ___ $code.=<<___ if ($win64); lea -0xa8(%rsp),%rsp movaps %xmm6,(%rsp) movaps %xmm7,0x10(%rsp) movaps %xmm8,0x20(%rsp) movaps %xmm9,0x30(%rsp) movaps %xmm10,0x40(%rsp) movaps %xmm11,0x50(%rsp) movaps %xmm12,-0x78(%rax) movaps %xmm13,-0x68(%rax) movaps %xmm14,-0x58(%rax) movaps %xmm15,-0x48(%rax) ___ $code.=<<___; sub \$`$REG_SZ*18`, %rsp and \$-256,%rsp mov %rax,`$REG_SZ*17`(%rsp) # original %rsp .cfi_cfa_expression %rsp+`$REG_SZ*17`,deref,+8 .Lbody_avx2: lea K256+128(%rip),$Tbl lea 0x80($ctx),$ctx # size optimization .Loop_grande_avx2: mov $num,`$REG_SZ*17+8`(%rsp) # original $num xor $num,$num lea `$REG_SZ*16`(%rsp),%rbx ___ for($i=0;$i<8;$i++) { $code.=<<___; mov `16*$i+0`($inp),@ptr[$i] # input pointer mov `16*$i+8`($inp),%ecx # number of blocks cmp $num,%ecx cmovg %ecx,$num # find maximum test %ecx,%ecx mov %ecx,`4*$i`(%rbx) # initialize counters cmovle $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; vmovdqu 0x00-0x80($ctx),$A # load context lea 128(%rsp),%rax vmovdqu 0x20-0x80($ctx),$B lea 256+128(%rsp),%rbx vmovdqu 0x40-0x80($ctx),$C vmovdqu 0x60-0x80($ctx),$D vmovdqu 0x80-0x80($ctx),$E vmovdqu 0xa0-0x80($ctx),$F vmovdqu 0xc0-0x80($ctx),$G vmovdqu 0xe0-0x80($ctx),$H vmovdqu .Lpbswap(%rip),$Xn jmp .Loop_avx2 .align 32 .Loop_avx2: vpxor $B,$C,$bxc # magic seed ___ for($i=0;$i<16;$i++) { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; vmovdqu `&Xi_off($i)`,$Xi mov \$3,%ecx jmp .Loop_16_xx_avx2 .align 32 .Loop_16_xx_avx2: ___ for(;$i<32;$i++) { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); } $code.=<<___; dec %ecx jnz .Loop_16_xx_avx2 mov \$1,%ecx lea `$REG_SZ*16`(%rsp),%rbx lea K256+128(%rip),$Tbl ___ for($i=0;$i<8;$i++) { $code.=<<___; cmp `4*$i`(%rbx),%ecx # examine counters cmovge $Tbl,@ptr[$i] # cancel input ___ } $code.=<<___; vmovdqa (%rbx),$sigma # pull counters vpxor $t1,$t1,$t1 vmovdqa $sigma,$Xn vpcmpgtd $t1,$Xn,$Xn # mask value vpaddd $Xn,$sigma,$sigma # counters-- vmovdqu 0x00-0x80($ctx),$t1 vpand $Xn,$A,$A vmovdqu 0x20-0x80($ctx),$t2 vpand $Xn,$B,$B vmovdqu 0x40-0x80($ctx),$t3 vpand $Xn,$C,$C vmovdqu 0x60-0x80($ctx),$Xi vpand $Xn,$D,$D vpaddd $t1,$A,$A vmovdqu 0x80-0x80($ctx),$t1 vpand $Xn,$E,$E vpaddd $t2,$B,$B vmovdqu 0xa0-0x80($ctx),$t2 vpand $Xn,$F,$F vpaddd $t3,$C,$C vmovdqu 0xc0-0x80($ctx),$t3 vpand $Xn,$G,$G vpaddd $Xi,$D,$D vmovdqu 0xe0-0x80($ctx),$Xi vpand $Xn,$H,$H vpaddd $t1,$E,$E vpaddd $t2,$F,$F vmovdqu $A,0x00-0x80($ctx) vpaddd $t3,$G,$G vmovdqu $B,0x20-0x80($ctx) vpaddd $Xi,$H,$H vmovdqu $C,0x40-0x80($ctx) vmovdqu $D,0x60-0x80($ctx) vmovdqu $E,0x80-0x80($ctx) 
vmovdqu $F,0xa0-0x80($ctx) vmovdqu $G,0xc0-0x80($ctx) vmovdqu $H,0xe0-0x80($ctx) vmovdqu $sigma,(%rbx) # save counters lea 256+128(%rsp),%rbx vmovdqu .Lpbswap(%rip),$Xn dec $num jnz .Loop_avx2 #mov `$REG_SZ*17+8`(%rsp),$num #lea $REG_SZ($ctx),$ctx #lea `16*$REG_SZ/4`($inp),$inp #dec $num #jnz .Loop_grande_avx2 .Ldone_avx2: mov `$REG_SZ*17`(%rsp),%rax # original %rsp .cfi_def_cfa %rax,8 vzeroupper ___ $code.=<<___ if ($win64); movaps -0xd8(%rax),%xmm6 movaps -0xc8(%rax),%xmm7 movaps -0xb8(%rax),%xmm8 movaps -0xa8(%rax),%xmm9 movaps -0x98(%rax),%xmm10 movaps -0x88(%rax),%xmm11 movaps -0x78(%rax),%xmm12 movaps -0x68(%rax),%xmm13 movaps -0x58(%rax),%xmm14 movaps -0x48(%rax),%xmm15 ___ $code.=<<___; mov -48(%rax),%r15 .cfi_restore %r15 mov -40(%rax),%r14 .cfi_restore %r14 mov -32(%rax),%r13 .cfi_restore %r13 mov -24(%rax),%r12 .cfi_restore %r12 mov -16(%rax),%rbp .cfi_restore %rbp mov -8(%rax),%rbx .cfi_restore %rbx lea (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx2: ret .cfi_endproc .size sha256_multi_block_avx2,.-sha256_multi_block_avx2 ___ } }}} $code.=<<___; .align 256 K256: ___ sub TABLE { foreach (@_) { $code.=<<___; .long $_,$_,$_,$_ .long $_,$_,$_,$_ ___ } } &TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5, 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5, 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3, 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174, 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc, 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da, 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7, 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967, 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13, 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85, 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3, 0xd192e819,0xd6990624,0xf40e3585,0x106aa070, 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5, 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3, 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208, 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 ); $code.=<<___; .Lpbswap: .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap K256_shaext: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .asciz "SHA256 multi-block transform for x86_64, CRYPTOGAMS by " ___ if ($win64) { # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end 
of prologue label cmp %r10,%rbx # context->Rip<.Lbody jb .Lin_prologue mov 152($context),%rax # pull context->Rsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lin_prologue mov `16*17`(%rax),%rax # pull saved stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp lea -24-10*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler ___ $code.=<<___ if ($avx>1); .type avx2_handler,\@abi-omnipotent .align 16 avx2_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HandlerData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # end of prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue mov `32*17`($context),%rax # pull saved stack pointer mov -8(%rax),%rbx mov -16(%rax),%rbp mov -24(%rax),%r12 mov -32(%rax),%r13 mov -40(%rax),%r14 mov -48(%rax),%r15 mov %rbx,144($context) # restore context->Rbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 lea -56-10*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$20,%ecx .long 0xa548f3fc # cld; rep movsq jmp .Lin_prologue .size avx2_handler,.-avx2_handler ___ $code.=<<___; .section .pdata .align 4 .rva .LSEH_begin_sha256_multi_block .rva .LSEH_end_sha256_multi_block .rva .LSEH_info_sha256_multi_block .rva .LSEH_begin_sha256_multi_block_shaext .rva .LSEH_end_sha256_multi_block_shaext .rva .LSEH_info_sha256_multi_block_shaext ___ $code.=<<___ if ($avx); .rva .LSEH_begin_sha256_multi_block_avx .rva .LSEH_end_sha256_multi_block_avx .rva .LSEH_info_sha256_multi_block_avx ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_sha256_multi_block_avx2 .rva .LSEH_end_sha256_multi_block_avx2 .rva .LSEH_info_sha256_multi_block_avx2 ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_sha256_multi_block: .byte 9,0,0,0 .rva se_handler .rva .Lbody,.Lepilogue # HandlerData[] .LSEH_info_sha256_multi_block_shaext: .byte 9,0,0,0 .rva se_handler .rva .Lbody_shaext,.Lepilogue_shaext # HandlerData[] ___ $code.=<<___ if ($avx); 
.LSEH_info_sha256_multi_block_avx: .byte 9,0,0,0 .rva se_handler .rva .Lbody_avx,.Lepilogue_avx # HandlerData[] ___ $code.=<<___ if ($avx>1); .LSEH_info_sha256_multi_block_avx2: .byte 9,0,0,0 .rva avx2_handler .rva .Lbody_avx2,.Lepilogue_avx2 # HandlerData[] ___ } #################################################################### sub rex { local *opcode=shift; my ($dst,$src)=@_; my $rex=0; $rex|=0x04 if ($dst>=8); $rex|=0x01 if ($src>=8); unshift @opcode,$rex|0x40 if ($rex); } sub sha256op38 { my $instr = shift; my %opcodelet = ( "sha256rnds2" => 0xcb, "sha256msg1" => 0xcc, "sha256msg2" => 0xcd ); if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) { my @opcode=(0x0f,0x38); rex(\@opcode,$2,$1); push @opcode,$opcodelet{$instr}; push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } else { return $instr."\t".@_[0]; } } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval($1)/ge; s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo or s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/crypto/openssl/crypto/sha/asm/sha512-x86_64.pl =================================================================== --- stable/12/crypto/openssl/crypto/sha/asm/sha512-x86_64.pl (revision 364962) +++ stable/12/crypto/openssl/crypto/sha/asm/sha512-x86_64.pl (revision 364963) @@ -1,2558 +1,2558 @@ #! /usr/bin/env perl # Copyright 2005-2020 The OpenSSL Project Authors. All Rights Reserved. # # Licensed under the OpenSSL license (the "License"). You may not use # this file except in compliance with the License. You can obtain a copy # in the file LICENSE in the source distribution or at # https://www.openssl.org/source/license.html # # ==================================================================== # Written by Andy Polyakov for the OpenSSL # project. Rights for redistribution and usage in source and binary # forms are granted according to the OpenSSL license. # ==================================================================== # # sha256/512_block procedure for x86_64. # # 40% improvement over compiler-generated code on Opteron. On EM64T # sha256 was observed to run >80% faster and sha512 - >40%. No magical # tricks, just straight implementation... I really wonder why gcc # [being armed with inline assembler] fails to generate as fast code. # The only thing which is cool about this module is that it's very # same instruction sequence used for both SHA-256 and SHA-512. In # former case the instructions operate on 32-bit operands, while in # latter - on 64-bit ones. All I had to do is to get one flavor right, # the other one passed the test right away:-) # # sha256_block runs in ~1005 cycles on Opteron, which gives you # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock # frequency in GHz. sha512_block runs in ~1275 cycles, which results # in 128*1000/1275=100MBps per GHz. Is there room for improvement? # Well, if you compare it to IA-64 implementation, which maintains # X[16] in register bank[!], tends to 4 instructions per CPU clock # cycle and runs in 1003 cycles, 1275 is very good result for 3-way # issue Opteron pipeline and X[16] maintained in memory. 
So that *if* # there is a way to improve it, *then* the only way would be to try to # offload X[16] updates to SSE unit, but that would require "deeper" # loop unroll, which in turn would naturally cause size blow-up, not # to mention increased complexity! And once again, only *if* it's # actually possible to noticeably improve overall ILP, instruction # level parallelism, on a given CPU implementation in this case. # # Special note on Intel EM64T. While Opteron CPU exhibits perfect # performance ratio of 1.5 between 64- and 32-bit flavors [see above], # [currently available] EM64T CPUs apparently are far from it. On the # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit # sha256_block:-( This is presumably because 64-bit shifts/rotates # apparently are not atomic instructions, but implemented in microcode. # # May 2012. # # Optimization including one of Pavel Semjanov's ideas, alternative # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and # unfortunately -2% SHA512 on P4 [which nobody should care about # that much]. # # June 2012. # # Add SIMD code paths, see below for improvement coefficients. SSSE3 # code path was not attempted for SHA512, because improvement is not # estimated to be high enough, noticeably less than 9%, to justify # the effort, not on pre-AVX processors. [Obviously with exclusion # for VIA Nano, but it has SHA512 instruction that is faster and # should be used instead.] For reference, corresponding estimated # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that # higher coefficients are observed on VIA Nano and Bulldozer has more # to do with specifics of their architecture [which is topic for # separate discussion]. # # November 2012. # # Add AVX2 code path. Two consecutive input blocks are loaded to # 256-bit %ymm registers, with data from first block to least # significant 128-bit halves and data from second to most significant. # The data is then processed with same SIMD instruction sequence as # for AVX, but with %ymm as operands. Side effect is increased stack # frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB # code size increase. # # March 2014. # # Add support for Intel SHA Extensions. 
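######################################################################
# Illustrative sketch, not used by the generator: a plain-Perl model of
# one SHA-256 round as the scalar ROUND_00_15/body_00_15 bodies below
# implement it, including the "alternative Maj" identity
# Maj(a,b,c) == ((a^b)&(b^c))^b that the round comments rely on.  The
# helper names and the pre-summed $wk argument (X[i]+K[i] folded into a
# single input for brevity; the scalar path actually adds K[round] from
# the table separately) are hypothetical and exist only for this sketch.
sub rotr32_model { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }

sub sha256_round_model {
    my ($st,$wk) = @_;			# $st = [a..h], $wk = X[i]+K[i]
    my ($a,$b,$c,$d,$e,$f,$g,$h) = @$st;
    my $S1  = rotr32_model($e,6) ^ rotr32_model($e,11) ^ rotr32_model($e,25);
    my $S0  = rotr32_model($a,2) ^ rotr32_model($a,13) ^ rotr32_model($a,22);
    my $ch  = ($e & $f) ^ (~$e & $g);			# Ch(e,f,g)
    my $maj = ((($a^$b) & ($b^$c)) ^ $b) & 0xffffffff;	# == (a&b)^(a&c)^(b&c)
    my $T1  = ($h + $S1 + $ch + $wk) & 0xffffffff;
    return [ ($T1+$S0+$maj) & 0xffffffff, $a, $b, $c,
	     ($d+$T1) & 0xffffffff, $e, $f, $g ];
}
# Since the a^b computed in one round becomes b^c of the next, the
# assembly carries that value across rounds (the "magic" xor at loop
# entry) and evaluates Maj with a single and+xor pair per round.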
###################################################################### # Current performance in cycles per processed byte (less is better): # # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) # # AMD K8 14.9 - - 9.57 - # P4 17.3 - - 30.8 - # Core 2 15.6 13.8(+13%) - 9.97 - # Westmere 14.8 12.3(+19%) - 9.58 - # Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) # Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) # Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) # Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) # Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) # Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%) # VIA Nano 23.0 16.5(+39%) - 14.7 - # Atom 23.0 18.9(+22%) - 14.7 - # Silvermont 27.4 20.6(+33%) - 17.5 - # Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%) # Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - # # (*) whichever best applicable, including SHAEXT; # (**) switch from ror to shrd stands for fair share of improvement; # (***) execution time is fully determined by remaining integer-only # part, body_00_15; reducing the amount of SIMD instructions # below certain limit makes no difference/sense; to conserve # space SHA256 XOP code path is therefore omitted; $flavour = shift; $output = shift; if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.19) + ($1>=2.22); } if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { $avx = ($1>=2.09) + ($1>=2.10); } if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && `ml64 2>&1` =~ /Version ([0-9]+)\./) { $avx = ($1>=10) + ($1>=11); } -if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) { $avx = ($2>=3.0) + ($2>3.0); } $shaext=1; ### set to zero if compiling for 1.0.1 $avx=1 if (!$shaext && $avx); open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; *STDOUT=*OUT; if ($output =~ /512/) { $func="sha512_block_data_order"; $TABLE="K512"; $SZ=8; @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx", "%r8", "%r9", "%r10","%r11"); ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi"); @Sigma0=(28,34,39); @Sigma1=(14,18,41); @sigma0=(1, 8, 7); @sigma1=(19,61, 6); $rounds=80; } else { $func="sha256_block_data_order"; $TABLE="K256"; $SZ=4; @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx", "%r8d","%r9d","%r10d","%r11d"); ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi"); @Sigma0=( 2,13,22); @Sigma1=( 6,11,25); @sigma0=( 7,18, 3); @sigma1=(17,19,10); $rounds=64; } $ctx="%rdi"; # 1st arg, zapped by $a3 $inp="%rsi"; # 2nd arg $Tbl="%rbp"; $_ctx="16*$SZ+0*8(%rsp)"; $_inp="16*$SZ+1*8(%rsp)"; $_end="16*$SZ+2*8(%rsp)"; $_rsp="`16*$SZ+3*8`(%rsp)"; $framesz="16*$SZ+4*8"; sub ROUND_00_15() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; my $STRIDE=$SZ; $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); $code.=<<___; ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 mov $f,$a2 xor $e,$a0 ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 xor $g,$a2 # f^g mov $T1,`$SZ*($i&0xf)`(%rsp) xor $a,$a1 and $e,$a2 # (f^g)&e ror 
\$`$Sigma1[1]-$Sigma1[0]`,$a0 add $h,$T1 # T1+=h xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g ror \$`$Sigma0[1]-$Sigma0[0]`,$a1 xor $e,$a0 add $a2,$T1 # T1+=Ch(e,f,g) mov $a,$a2 add ($Tbl),$T1 # T1+=K[round] xor $a,$a1 xor $b,$a2 # a^b, b^c in next round ror \$$Sigma1[0],$a0 # Sigma1(e) mov $b,$h and $a2,$a3 ror \$$Sigma0[0],$a1 # Sigma0(a) add $a0,$T1 # T1+=Sigma1(e) xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) add $T1,$d # d+=T1 add $T1,$h # h+=T1 lea $STRIDE($Tbl),$Tbl # round++ ___ $code.=<<___ if ($i<15); add $a1,$h # h+=Sigma0(a) ___ ($a2,$a3) = ($a3,$a2); } sub ROUND_16_XX() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 mov $a0,$T1 ror \$`$sigma0[1]-$sigma0[0]`,$a0 add $a1,$a # modulo-scheduled h+=Sigma0(a) mov $a2,$a1 ror \$`$sigma1[1]-$sigma1[0]`,$a2 xor $T1,$a0 shr \$$sigma0[2],$T1 ror \$$sigma0[0],$a0 xor $a1,$a2 shr \$$sigma1[2],$a1 ror \$$sigma1[0],$a2 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) xor $a1,$a2 # sigma1(X[(i+14)&0xf]) add `$SZ*(($i+9)&0xf)`(%rsp),$T1 add `$SZ*($i&0xf)`(%rsp),$T1 mov $e,$a0 add $a2,$T1 mov $a,$a1 ___ &ROUND_00_15(@_); } $code=<<___; .text .extern OPENSSL_ia32cap_P .globl $func .type $func,\@function,3 .align 16 $func: .cfi_startproc ___ $code.=<<___ if ($SZ==4 || $avx); lea OPENSSL_ia32cap_P(%rip),%r11 mov 0(%r11),%r9d mov 4(%r11),%r10d mov 8(%r11),%r11d ___ $code.=<<___ if ($SZ==4 && $shaext); test \$`1<<29`,%r11d # check for SHA jnz _shaext_shortcut ___ $code.=<<___ if ($avx && $SZ==8); test \$`1<<11`,%r10d # check for XOP jnz .Lxop_shortcut ___ $code.=<<___ if ($avx>1); and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 cmp \$`1<<8|1<<5|1<<3`,%r11d je .Lavx2_shortcut ___ $code.=<<___ if ($avx); and \$`1<<30`,%r9d # mask "Intel CPU" bit and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits or %r9d,%r10d cmp \$`1<<28|1<<9|1<<30`,%r10d je .Lavx_shortcut ___ $code.=<<___ if ($SZ==4); test \$`1<<9`,%r10d jnz .Lssse3_shortcut ___ $code.=<<___; mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 shl \$4,%rdx # num*16 sub \$$framesz,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ and \$-64,%rsp # align stack frame mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg mov %rax,$_rsp # save copy of %rsp .cfi_cfa_expression $_rsp,deref,+8 .Lprologue: mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H jmp .Lloop .align 16 .Lloop: mov $B,$a3 lea $TABLE(%rip),$Tbl xor $C,$a3 # magic ___ for($i=0;$i<16;$i++) { $code.=" mov $SZ*$i($inp),$T1\n"; $code.=" mov @ROT[4],$a0\n"; $code.=" mov @ROT[0],$a1\n"; $code.=" bswap $T1\n"; &ROUND_00_15($i,@ROT); unshift(@ROT,pop(@ROT)); } $code.=<<___; jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: ___ for(;$i<32;$i++) { &ROUND_16_XX($i,@ROT); unshift(@ROT,pop(@ROT)); } $code.=<<___; cmpb \$0,`$SZ-1`($Tbl) jnz .Lrounds_16_xx mov $_ctx,$ctx add $a1,$A # modulo-scheduled h+=Sigma0(a) lea 16*$SZ($inp),$inp add $SZ*0($ctx),$A add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop mov $_rsp,%rsi .cfi_def_cfa %rsi,8 mov 
-48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue: ret .cfi_endproc .size $func,.-$func ___ if ($SZ==4) { $code.=<<___; .align 64 .type $TABLE,\@object $TABLE: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by " ___ } else { $code.=<<___; .align 64 .type $TABLE,\@object $TABLE: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 
.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by " ___ } ###################################################################### # SIMD code paths # if ($SZ==4 && $shaext) {{{ ###################################################################### # Intel SHA Extensions implementation of SHA256 update function. 
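######################################################################
# Illustrative sketch, not used by the generator: the SHA-256 message
# schedule that the shaext sha256msg1/sha256msg2 pairing below and the
# later SSSE3/AVX/AVX2 Xupdate subs all vectorize, restated in plain
# Perl with the @sigma0=(7,18,3) and @sigma1=(17,19,10) rotation/shift
# constants selected above.  The helper names are hypothetical and only
# serve this sketch.
sub rotr32_sched { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff }
sub sigma0_model { my $x=shift; rotr32_sched($x,7)  ^ rotr32_sched($x,18) ^ ($x>>3)  }
sub sigma1_model { my $x=shift; rotr32_sched($x,17) ^ rotr32_sched($x,19) ^ ($x>>10) }

sub sha256_schedule_model {		# expand W[0..15] to W[0..63]
    my @W = @_;
    for my $i (16..63) {
	$W[$i] = (sigma1_model($W[$i-2]) + $W[$i-7] +
		  sigma0_model($W[$i-15]) + $W[$i-16]) & 0xffffffff;
    }
    return @W;
}
# In the shaext loop below, four W[i]+K[i] words are staged in $Wi at a
# time: the low pair feeds one sha256rnds2 and "pshufd 0x0e" moves the
# high pair down for the second, so each such pairing advances four
# rounds of the compression.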
# my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); my @MSG=map("%xmm$_",(3..6)); $code.=<<___; .type sha256_block_data_order_shaext,\@function,3 .align 64 sha256_block_data_order_shaext: _shaext_shortcut: .cfi_startproc ___ $code.=<<___ if ($win64); lea `-8-5*16`(%rsp),%rsp movaps %xmm6,-8-5*16(%rax) movaps %xmm7,-8-4*16(%rax) movaps %xmm8,-8-3*16(%rax) movaps %xmm9,-8-2*16(%rax) movaps %xmm10,-8-1*16(%rax) .Lprologue_shaext: ___ $code.=<<___; lea K256+0x80(%rip),$Tbl movdqu ($ctx),$ABEF # DCBA movdqu 16($ctx),$CDGH # HGFE movdqa 0x200-0x80($Tbl),$TMP # byte swap mask pshufd \$0x1b,$ABEF,$Wi # ABCD pshufd \$0xb1,$ABEF,$ABEF # CDAB pshufd \$0x1b,$CDGH,$CDGH # EFGH movdqa $TMP,$BSWAP # offload palignr \$8,$CDGH,$ABEF # ABEF punpcklqdq $Wi,$CDGH # CDGH jmp .Loop_shaext .align 16 .Loop_shaext: movdqu ($inp),@MSG[0] movdqu 0x10($inp),@MSG[1] movdqu 0x20($inp),@MSG[2] pshufb $TMP,@MSG[0] movdqu 0x30($inp),@MSG[3] movdqa 0*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi pshufb $TMP,@MSG[1] movdqa $CDGH,$CDGH_SAVE # offload sha256rnds2 $ABEF,$CDGH # 0-3 pshufd \$0x0e,$Wi,$Wi nop movdqa $ABEF,$ABEF_SAVE # offload sha256rnds2 $CDGH,$ABEF movdqa 1*32-0x80($Tbl),$Wi paddd @MSG[1],$Wi pshufb $TMP,@MSG[2] sha256rnds2 $ABEF,$CDGH # 4-7 pshufd \$0x0e,$Wi,$Wi lea 0x40($inp),$inp sha256msg1 @MSG[1],@MSG[0] sha256rnds2 $CDGH,$ABEF movdqa 2*32-0x80($Tbl),$Wi paddd @MSG[2],$Wi pshufb $TMP,@MSG[3] sha256rnds2 $ABEF,$CDGH # 8-11 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[3],$TMP palignr \$4,@MSG[2],$TMP nop paddd $TMP,@MSG[0] sha256msg1 @MSG[2],@MSG[1] sha256rnds2 $CDGH,$ABEF movdqa 3*32-0x80($Tbl),$Wi paddd @MSG[3],$Wi sha256msg2 @MSG[3],@MSG[0] sha256rnds2 $ABEF,$CDGH # 12-15 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[0],$TMP palignr \$4,@MSG[3],$TMP nop paddd $TMP,@MSG[1] sha256msg1 @MSG[3],@MSG[2] sha256rnds2 $CDGH,$ABEF ___ for($i=4;$i<16-3;$i++) { $code.=<<___; movdqa $i*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi sha256msg2 @MSG[0],@MSG[1] sha256rnds2 $ABEF,$CDGH # 16-19... 
pshufd \$0x0e,$Wi,$Wi movdqa @MSG[1],$TMP palignr \$4,@MSG[0],$TMP nop paddd $TMP,@MSG[2] sha256msg1 @MSG[0],@MSG[3] sha256rnds2 $CDGH,$ABEF ___ push(@MSG,shift(@MSG)); } $code.=<<___; movdqa 13*32-0x80($Tbl),$Wi paddd @MSG[0],$Wi sha256msg2 @MSG[0],@MSG[1] sha256rnds2 $ABEF,$CDGH # 52-55 pshufd \$0x0e,$Wi,$Wi movdqa @MSG[1],$TMP palignr \$4,@MSG[0],$TMP sha256rnds2 $CDGH,$ABEF paddd $TMP,@MSG[2] movdqa 14*32-0x80($Tbl),$Wi paddd @MSG[1],$Wi sha256rnds2 $ABEF,$CDGH # 56-59 pshufd \$0x0e,$Wi,$Wi sha256msg2 @MSG[1],@MSG[2] movdqa $BSWAP,$TMP sha256rnds2 $CDGH,$ABEF movdqa 15*32-0x80($Tbl),$Wi paddd @MSG[2],$Wi nop sha256rnds2 $ABEF,$CDGH # 60-63 pshufd \$0x0e,$Wi,$Wi dec $num nop sha256rnds2 $CDGH,$ABEF paddd $CDGH_SAVE,$CDGH paddd $ABEF_SAVE,$ABEF jnz .Loop_shaext pshufd \$0xb1,$CDGH,$CDGH # DCHG pshufd \$0x1b,$ABEF,$TMP # FEBA pshufd \$0xb1,$ABEF,$ABEF # BAFE punpckhqdq $CDGH,$ABEF # DCBA palignr \$8,$TMP,$CDGH # HGFE movdqu $ABEF,($ctx) movdqu $CDGH,16($ctx) ___ $code.=<<___ if ($win64); movaps -8-5*16(%rax),%xmm6 movaps -8-4*16(%rax),%xmm7 movaps -8-3*16(%rax),%xmm8 movaps -8-2*16(%rax),%xmm9 movaps -8-1*16(%rax),%xmm10 mov %rax,%rsp .Lepilogue_shaext: ___ $code.=<<___; ret .cfi_endproc .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext ___ }}} {{{ my $a4=$T1; my ($a,$b,$c,$d,$e,$f,$g,$h); sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; my $arg = pop; $arg = "\$$arg" if ($arg*1 eq $arg); $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n"; } sub body_00_15 () { ( '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. '&ror ($a0,$Sigma1[2]-$Sigma1[1])', '&mov ($a,$a1)', '&mov ($a4,$f)', '&ror ($a1,$Sigma0[2]-$Sigma0[1])', '&xor ($a0,$e)', '&xor ($a4,$g)', # f^g '&ror ($a0,$Sigma1[1]-$Sigma1[0])', '&xor ($a1,$a)', '&and ($a4,$e)', # (f^g)&e '&xor ($a0,$e)', '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] '&mov ($a2,$a)', '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g '&ror ($a1,$Sigma0[1]-$Sigma0[0])', '&xor ($a2,$b)', # a^b, b^c in next round '&add ($h,$a4)', # h+=Ch(e,f,g) '&ror ($a0,$Sigma1[0])', # Sigma1(e) '&and ($a3,$a2)', # (b^c)&(a^b) '&xor ($a1,$a)', '&add ($h,$a0)', # h+=Sigma1(e) '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) '&ror ($a1,$Sigma0[0])', # Sigma0(a) '&add ($d,$h)', # d+=h '&add ($h,$a3)', # h+=Maj(a,b,c) '&mov ($a0,$d)', '&add ($a1,$h);'. 
# h+=Sigma0(a) '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' ); } ###################################################################### # SSSE3 code path # if ($SZ==4) { # SHA256 only my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; .type ${func}_ssse3,\@function,3 .align 64 ${func}_ssse3: .cfi_startproc .Lssse3_shortcut: mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*4`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ and \$-64,%rsp # align stack frame mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg mov %rax,$_rsp # save copy of %rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) movaps %xmm7,16*$SZ+48(%rsp) movaps %xmm8,16*$SZ+64(%rsp) movaps %xmm9,16*$SZ+80(%rsp) ___ $code.=<<___; .Lprologue_ssse3: mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H ___ $code.=<<___; #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Lloop_ssse3 .align 16 .Lloop_ssse3: movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 movdqu 0x00($inp),@X[0] movdqu 0x10($inp),@X[1] movdqu 0x20($inp),@X[2] pshufb $t3,@X[0] movdqu 0x30($inp),@X[3] lea $TABLE(%rip),$Tbl pshufb $t3,@X[1] movdqa 0x00($Tbl),$t0 movdqa 0x20($Tbl),$t1 pshufb $t3,@X[2] paddd @X[0],$t0 movdqa 0x40($Tbl),$t2 pshufb $t3,@X[3] movdqa 0x60($Tbl),$t3 paddd @X[1],$t1 paddd @X[2],$t2 paddd @X[3],$t3 movdqa $t0,0x00(%rsp) mov $A,$a1 movdqa $t1,0x10(%rsp) mov $B,$a3 movdqa $t2,0x20(%rsp) xor $C,$a3 # magic movdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lssse3_00_47 .align 16 .Lssse3_00_47: sub \$`-16*2*$SZ`,$Tbl # size optimization ___ sub Xupdate_256_SSSE3 () { ( '&movdqa ($t0,@X[1]);', '&movdqa ($t3,@X[3])', '&palignr ($t0,@X[0],$SZ)', # X[1..4] '&palignr ($t3,@X[2],$SZ);', # X[9..12] '&movdqa ($t1,$t0)', '&movdqa ($t2,$t0);', '&psrld ($t0,$sigma0[2])', '&paddd (@X[0],$t3);', # X[0..3] += X[9..12] '&psrld ($t2,$sigma0[0])', '&pshufd ($t3,@X[3],0b11111010)',# X[14..15] '&pslld ($t1,8*$SZ-$sigma0[1]);'. '&pxor ($t0,$t2)', '&psrld ($t2,$sigma0[1]-$sigma0[0]);'. '&pxor ($t0,$t1)', '&pslld ($t1,$sigma0[1]-$sigma0[0]);'. 
'&pxor ($t0,$t2);', '&movdqa ($t2,$t3)', '&pxor ($t0,$t1);', # sigma0(X[1..4]) '&psrld ($t3,$sigma1[2])', '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4]) '&psrlq ($t2,$sigma1[0])', '&pxor ($t3,$t2);', '&psrlq ($t2,$sigma1[1]-$sigma1[0])', '&pxor ($t3,$t2)', '&pshufb ($t3,$t4)', # sigma1(X[14..15]) '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) '&pshufd ($t3,@X[0],0b01010000)',# X[16..17] '&movdqa ($t2,$t3);', '&psrld ($t3,$sigma1[2])', '&psrlq ($t2,$sigma1[0])', '&pxor ($t3,$t2);', '&psrlq ($t2,$sigma1[1]-$sigma1[0])', '&pxor ($t3,$t2);', '&movdqa ($t2,16*2*$j."($Tbl)")', '&pshufb ($t3,$t5)', '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) ); } sub SSSE3_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions if (0) { foreach (Xupdate_256_SSSE3()) { # 36 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } } else { # squeeze extra 4% on Westmere and 19% on Atom eval(shift(@insns)); #@ &movdqa ($t0,@X[1]); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t3,@X[3]); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &palignr ($t0,@X[0],$SZ); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); &palignr ($t3,@X[2],$SZ); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &movdqa ($t1,$t0); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t2,$t0); eval(shift(@insns)); #@ eval(shift(@insns)); &psrld ($t0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t3); # X[0..3] += X[9..12] eval(shift(@insns)); #@ eval(shift(@insns)); &psrld ($t2,$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &pshufd ($t3,@X[3],0b11111010); # X[4..15] eval(shift(@insns)); eval(shift(@insns)); #@ &pslld ($t1,8*$SZ-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t0,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &psrld ($t2,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); &pxor ($t0,$t1); eval(shift(@insns)); eval(shift(@insns)); &pslld ($t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t0,$t2); eval(shift(@insns)); eval(shift(@insns)); #@ &movdqa ($t2,$t3); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t0,$t1); # sigma0(X[1..4]) eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); &psrld ($t3,$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); #@ eval(shift(@insns)); &psrlq ($t2,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &psrlq ($t2,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); #&pshufb ($t3,$t4); # sigma1(X[14..15]) &pshufd ($t3,$t3,0b10000000); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &psrldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pshufd ($t3,@X[0],0b01010000); # X[16..17] eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &movdqa ($t2,$t3); eval(shift(@insns)); eval(shift(@insns)); &psrld ($t3,$sigma1[2]); 
eval(shift(@insns)); eval(shift(@insns)); #@ &psrlq ($t2,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &psrlq ($t2,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ #&pshufb ($t3,$t5); &pshufd ($t3,$t3,0b00001000); eval(shift(@insns)); eval(shift(@insns)); &movdqa ($t2,16*2*$j."($Tbl)"); eval(shift(@insns)); #@ eval(shift(@insns)); &pslldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); } &paddd ($t2,@X[0]); foreach (@insns) { eval; } # remaining instructions &movdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<4; $j++) { &SSSE3_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lssse3_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } $code.=<<___; mov $_ctx,$ctx mov $a1,$A add $SZ*0($ctx),$A lea 16*$SZ($inp),$inp add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop_ssse3 mov $_rsp,%rsi .cfi_def_cfa %rsi,8 ___ $code.=<<___ if ($win64); movaps 16*$SZ+32(%rsp),%xmm6 movaps 16*$SZ+48(%rsp),%xmm7 movaps 16*$SZ+64(%rsp),%xmm8 movaps 16*$SZ+80(%rsp),%xmm9 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_ssse3: ret .cfi_endproc .size ${func}_ssse3,.-${func}_ssse3 ___ } if ($avx) {{ ###################################################################### # XOP code path # if ($SZ==8) { # SHA512 only $code.=<<___; .type ${func}_xop,\@function,3 .align 64 ${func}_xop: .cfi_startproc .Lxop_shortcut: mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ and \$-64,%rsp # align stack frame mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg mov %rax,$_rsp # save copy of %rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) movaps %xmm7,16*$SZ+48(%rsp) movaps %xmm8,16*$SZ+64(%rsp) movaps %xmm9,16*$SZ+80(%rsp) ___ $code.=<<___ if ($win64 && $SZ>4); movaps %xmm10,16*$SZ+96(%rsp) movaps %xmm11,16*$SZ+112(%rsp) ___ $code.=<<___; .Lprologue_xop: vzeroupper mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H jmp .Lloop_xop ___ if ($SZ==4) { # SHA256 my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); $code.=<<___; .align 16 .Lloop_xop: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vmovdqu 0x30($inp),@X[3] vpshufb 
$t3,@X[0],@X[0] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[1],@X[1] vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) mov $B,$a3 vmovdqa $t2,0x20(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lxop_00_47 .align 16 .Lxop_00_47: sub \$`-16*2*$SZ`,$Tbl # size optimization ___ sub XOP_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] eval(shift(@insns)); eval(shift(@insns)); &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t0,$t0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t1); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t2,@X[3],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpsrldq ($t3,$t3,8); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vpsrld ($t2,@X[0],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpslldq ($t3,$t3,8); # 22 instructions eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<4; $j++) { &XOP_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lxop_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } else { # SHA512 my @X = map("%xmm$_",(0..7)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); $code.=<<___; .align 16 .Lloop_xop: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] lea $TABLE+0x80(%rip),$Tbl # size optimization vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vpshufb $t3,@X[0],@X[0] vmovdqu 0x30($inp),@X[3] vpshufb 
$t3,@X[1],@X[1] vmovdqu 0x40($inp),@X[4] vpshufb $t3,@X[2],@X[2] vmovdqu 0x50($inp),@X[5] vpshufb $t3,@X[3],@X[3] vmovdqu 0x60($inp),@X[6] vpshufb $t3,@X[4],@X[4] vmovdqu 0x70($inp),@X[7] vpshufb $t3,@X[5],@X[5] vpaddq -0x80($Tbl),@X[0],$t0 vpshufb $t3,@X[6],@X[6] vpaddq -0x60($Tbl),@X[1],$t1 vpshufb $t3,@X[7],@X[7] vpaddq -0x40($Tbl),@X[2],$t2 vpaddq -0x20($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) vpaddq 0x00($Tbl),@X[4],$t0 vmovdqa $t1,0x10(%rsp) vpaddq 0x20($Tbl),@X[5],$t1 vmovdqa $t2,0x20(%rsp) vpaddq 0x40($Tbl),@X[6],$t2 vmovdqa $t3,0x30(%rsp) vpaddq 0x60($Tbl),@X[7],$t3 vmovdqa $t0,0x40(%rsp) mov $A,$a1 vmovdqa $t1,0x50(%rsp) mov $B,$a3 vmovdqa $t2,0x60(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x70(%rsp) mov $E,$a0 jmp .Lxop_00_47 .align 16 .Lxop_00_47: add \$`16*2*$SZ`,$Tbl ___ sub XOP_512_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body); # 52 instructions &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] eval(shift(@insns)); eval(shift(@insns)); &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] eval(shift(@insns)); eval(shift(@insns)); &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); eval(shift(@insns)); eval(shift(@insns)); &vpsrlq ($t0,$t0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t1); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) eval(shift(@insns)); eval(shift(@insns)); &vpsrlq ($t2,@X[7],$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) eval(shift(@insns)); eval(shift(@insns)); &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t2); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<8; $j++) { &XOP_512_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); &jne (".Lxop_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } $code.=<<___; mov $_ctx,$ctx mov $a1,$A add $SZ*0($ctx),$A lea 16*$SZ($inp),$inp add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop_xop mov $_rsp,%rsi .cfi_def_cfa %rsi,8 vzeroupper ___ $code.=<<___ if ($win64); movaps 16*$SZ+32(%rsp),%xmm6 movaps 16*$SZ+48(%rsp),%xmm7 movaps 16*$SZ+64(%rsp),%xmm8 movaps 16*$SZ+80(%rsp),%xmm9 ___ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+96(%rsp),%xmm10 movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov 
-16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_xop: ret .cfi_endproc .size ${func}_xop,.-${func}_xop ___ } ###################################################################### # AVX+shrd code path # local *ror = sub { &shrd(@_[0],@_) }; $code.=<<___; .type ${func}_avx,\@function,3 .align 64 ${func}_avx: .cfi_startproc .Lavx_shortcut: mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ and \$-64,%rsp # align stack frame mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg mov %rax,$_rsp # save copy of %rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) movaps %xmm7,16*$SZ+48(%rsp) movaps %xmm8,16*$SZ+64(%rsp) movaps %xmm9,16*$SZ+80(%rsp) ___ $code.=<<___ if ($win64 && $SZ>4); movaps %xmm10,16*$SZ+96(%rsp) movaps %xmm11,16*$SZ+112(%rsp) ___ $code.=<<___; .Lprologue_avx: vzeroupper mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H ___ if ($SZ==4) { # SHA256 my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Lloop_avx .align 16 .Lloop_avx: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vmovdqu 0x30($inp),@X[3] vpshufb $t3,@X[0],@X[0] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[1],@X[1] vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) mov $B,$a3 vmovdqa $t2,0x20(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x30(%rsp) mov $E,$a0 jmp .Lavx_00_47 .align 16 .Lavx_00_47: sub \$`-16*2*$SZ`,$Tbl # size optimization ___ sub Xupdate_256_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4] '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12] '&vpsrld ($t2,$t0,$sigma0[0]);', '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12] '&vpsrld ($t3,$t0,$sigma0[2])', '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpxor ($t0,$t3,$t2)', '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15] '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t1)', '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t2)', '&vpsrld ($t2,$t3,$sigma1[2]);', '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4]) '&vpsrlq ($t3,$t3,$sigma1[0]);', '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4]) '&vpxor ($t2,$t2,$t3);', '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', '&vpxor ($t2,$t2,$t3)', '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15]) '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15]) '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17] '&vpsrld ($t2,$t3,$sigma1[2])', '&vpsrlq ($t3,$t3,$sigma1[0])', '&vpxor ($t2,$t2,$t3);', '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])', '&vpxor ($t2,$t2,$t3)', '&vpshufb ($t2,$t2,$t5)', '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17]) ); } sub AVX_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 104 instructions foreach (Xupdate_256_AVX()) { # 29 instructions 
eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<4; $j++) { &AVX_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lavx_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } else { # SHA512 my @X = map("%xmm$_",(0..7)); my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); $code.=<<___; jmp .Lloop_avx .align 16 .Lloop_avx: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] lea $TABLE+0x80(%rip),$Tbl # size optimization vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vpshufb $t3,@X[0],@X[0] vmovdqu 0x30($inp),@X[3] vpshufb $t3,@X[1],@X[1] vmovdqu 0x40($inp),@X[4] vpshufb $t3,@X[2],@X[2] vmovdqu 0x50($inp),@X[5] vpshufb $t3,@X[3],@X[3] vmovdqu 0x60($inp),@X[6] vpshufb $t3,@X[4],@X[4] vmovdqu 0x70($inp),@X[7] vpshufb $t3,@X[5],@X[5] vpaddq -0x80($Tbl),@X[0],$t0 vpshufb $t3,@X[6],@X[6] vpaddq -0x60($Tbl),@X[1],$t1 vpshufb $t3,@X[7],@X[7] vpaddq -0x40($Tbl),@X[2],$t2 vpaddq -0x20($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) vpaddq 0x00($Tbl),@X[4],$t0 vmovdqa $t1,0x10(%rsp) vpaddq 0x20($Tbl),@X[5],$t1 vmovdqa $t2,0x20(%rsp) vpaddq 0x40($Tbl),@X[6],$t2 vmovdqa $t3,0x30(%rsp) vpaddq 0x60($Tbl),@X[7],$t3 vmovdqa $t0,0x40(%rsp) mov $A,$a1 vmovdqa $t1,0x50(%rsp) mov $B,$a3 vmovdqa $t2,0x60(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x70(%rsp) mov $E,$a0 jmp .Lavx_00_47 .align 16 .Lavx_00_47: add \$`16*2*$SZ`,$Tbl ___ sub Xupdate_512_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] '&vpsrlq ($t2,$t0,$sigma0[0])', '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] '&vpsrlq ($t3,$t0,$sigma0[2])', '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpxor ($t0,$t3,$t2)', '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t1)', '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);', '&vpxor ($t0,$t0,$t2)', '&vpsrlq ($t3,@X[7],$sigma1[2]);', '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) '&vpsrlq ($t1,@X[7],$sigma1[0]);', '&vpxor ($t3,$t3,$t2)', '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);', '&vpxor ($t3,$t3,$t1)', '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);', '&vpxor ($t3,$t3,$t2)', '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15]) '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15]) ); } sub AVX_512_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body); # 52 instructions foreach (Xupdate_512_AVX()) { # 23 instructions eval; eval(shift(@insns)); eval(shift(@insns)); } &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } for ($i=0,$j=0; $j<8; $j++) { &AVX_512_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); &jne (".Lavx_00_47"); for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } } } $code.=<<___; mov $_ctx,$ctx mov $a1,$A add $SZ*0($ctx),$A lea 16*$SZ($inp),$inp add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jb .Lloop_avx mov $_rsp,%rsi .cfi_def_cfa %rsi,8 vzeroupper ___ $code.=<<___ if ($win64); movaps 16*$SZ+32(%rsp),%xmm6 movaps 
16*$SZ+48(%rsp),%xmm7 movaps 16*$SZ+64(%rsp),%xmm8 movaps 16*$SZ+80(%rsp),%xmm9 ___ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+96(%rsp),%xmm10 movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_avx: ret .cfi_endproc .size ${func}_avx,.-${func}_avx ___ if ($avx>1) {{ ###################################################################### # AVX2+BMI code path # my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp my $PUSH8=8*2*$SZ; use integer; sub bodyx_00_15 () { # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f ( '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] '&and ($a4,$e)', # f&e '&rorx ($a0,$e,$Sigma1[2])', '&rorx ($a2,$e,$Sigma1[1])', '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past '&lea ($h,"($h,$a4)")', '&andn ($a4,$e,$g)', # ~e&g '&xor ($a0,$a2)', '&rorx ($a1,$e,$Sigma1[0])', '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) '&xor ($a0,$a1)', # Sigma1(e) '&mov ($a2,$a)', '&rorx ($a4,$a,$Sigma0[2])', '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) '&xor ($a2,$b)', # a^b, b^c in next round '&rorx ($a1,$a,$Sigma0[1])', '&rorx ($a0,$a,$Sigma0[0])', '&lea ($d,"($d,$h)")', # d+=h '&and ($a3,$a2)', # (b^c)&(a^b) '&xor ($a1,$a4)', '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) '&xor ($a1,$a0)', # Sigma0(a) '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) '&mov ($a4,$e)', # copy of f in future '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' ); # and at the finish one has to $a+=$a1 } $code.=<<___; .type ${func}_avx2,\@function,3 .align 64 ${func}_avx2: .cfi_startproc .Lavx2_shortcut: mov %rsp,%rax # copy %rsp .cfi_def_cfa_register %rax push %rbx .cfi_push %rbx push %rbp .cfi_push %rbp push %r12 .cfi_push %r12 push %r13 .cfi_push %r13 push %r14 .cfi_push %r14 push %r15 .cfi_push %r15 sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp shl \$4,%rdx # num*16 and \$-256*$SZ,%rsp # align stack frame lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ add \$`2*$SZ*($rounds-8)`,%rsp mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg mov %rax,$_rsp # save copy of %rsp .cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) movaps %xmm7,16*$SZ+48(%rsp) movaps %xmm8,16*$SZ+64(%rsp) movaps %xmm9,16*$SZ+80(%rsp) ___ $code.=<<___ if ($win64 && $SZ>4); movaps %xmm10,16*$SZ+96(%rsp) movaps %xmm11,16*$SZ+112(%rsp) ___ $code.=<<___; .Lprologue_avx2: vzeroupper sub \$-16*$SZ,$inp # inp++, size optimization mov $SZ*0($ctx),$A mov $inp,%r12 # borrow $T1 mov $SZ*1($ctx),$B cmp %rdx,$inp # $_end mov $SZ*2($ctx),$C cmove %rsp,%r12 # next block or random data mov $SZ*3($ctx),$D mov $SZ*4($ctx),$E mov $SZ*5($ctx),$F mov $SZ*6($ctx),$G mov $SZ*7($ctx),$H ___ if ($SZ==4) { # SHA256 my @X = map("%ymm$_",(0..3)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); $code.=<<___; vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Loop_avx2 .align 16 .Loop_avx2: vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu -16*$SZ+0($inp),%xmm0 vmovdqu -16*$SZ+16($inp),%xmm1 vmovdqu -16*$SZ+32($inp),%xmm2 vmovdqu -16*$SZ+48($inp),%xmm3 #mov $inp,$_inp # offload $inp vinserti128 \$1,(%r12),@X[0],@X[0] vinserti128 \$1,16(%r12),@X[1],@X[1] vpshufb $t3,@X[0],@X[0] 
vinserti128 \$1,32(%r12),@X[2],@X[2] vpshufb $t3,@X[1],@X[1] vinserti128 \$1,48(%r12),@X[3],@X[3] lea $TABLE(%rip),$Tbl vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] vpaddd 0x20($Tbl),@X[1],$t1 vpaddd 0x40($Tbl),@X[2],$t2 vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) xor $a1,$a1 vmovdqa $t1,0x20(%rsp) ___ $code.=<<___ if (!$win64); # temporarily use %rdi as frame pointer mov $_rsp,%rdi .cfi_def_cfa %rdi,8 ___ $code.=<<___; lea -$PUSH8(%rsp),%rsp ___ $code.=<<___ if (!$win64); # the frame info is at $_rsp, but the stack is moving... # so a second frame pointer is saved at -8(%rsp) # that is in the red zone mov %rdi,-8(%rsp) .cfi_cfa_expression %rsp-8,deref,+8 ___ $code.=<<___; mov $B,$a3 vmovdqa $t2,0x00(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x20(%rsp) mov $F,$a4 sub \$-16*2*$SZ,$Tbl # size optimization jmp .Lavx2_00_47 .align 16 .Lavx2_00_47: ___ sub AVX2_256_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body,&$body,&$body); # 96 instructions my $base = "+2*$PUSH8(%rsp)"; if (($j%2)==0) { &lea ("%rsp","-$PUSH8(%rsp)"); $code.=<<___ if (!$win64); .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 # copy secondary frame pointer to new location again at -8(%rsp) pushq $PUSH8-8(%rsp) .cfi_cfa_expression %rsp,deref,+8 lea 8(%rsp),%rsp .cfi_cfa_expression %rsp-8,deref,+8 ___ } foreach (Xupdate_256_AVX()) { # 29 instructions eval; eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); } for ($i=0,$j=0; $j<4; $j++) { &AVX2_256_00_47($j,\&bodyx_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &lea ($Tbl,16*2*$SZ."($Tbl)"); &cmpb (($SZ-1)."($Tbl)",0); &jne (".Lavx2_00_47"); for ($i=0; $i<16; ) { my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; foreach(bodyx_00_15()) { eval; } } } else { # SHA512 my @X = map("%ymm$_",(0..7)); my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); $code.=<<___; jmp .Loop_avx2 .align 16 .Loop_avx2: vmovdqu -16*$SZ($inp),%xmm0 vmovdqu -16*$SZ+16($inp),%xmm1 vmovdqu -16*$SZ+32($inp),%xmm2 lea $TABLE+0x80(%rip),$Tbl # size optimization vmovdqu -16*$SZ+48($inp),%xmm3 vmovdqu -16*$SZ+64($inp),%xmm4 vmovdqu -16*$SZ+80($inp),%xmm5 vmovdqu -16*$SZ+96($inp),%xmm6 vmovdqu -16*$SZ+112($inp),%xmm7 #mov $inp,$_inp # offload $inp vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 vinserti128 \$1,(%r12),@X[0],@X[0] vinserti128 \$1,16(%r12),@X[1],@X[1] vpshufb $t2,@X[0],@X[0] vinserti128 \$1,32(%r12),@X[2],@X[2] vpshufb $t2,@X[1],@X[1] vinserti128 \$1,48(%r12),@X[3],@X[3] vpshufb $t2,@X[2],@X[2] vinserti128 \$1,64(%r12),@X[4],@X[4] vpshufb $t2,@X[3],@X[3] vinserti128 \$1,80(%r12),@X[5],@X[5] vpshufb $t2,@X[4],@X[4] vinserti128 \$1,96(%r12),@X[6],@X[6] vpshufb $t2,@X[5],@X[5] vinserti128 \$1,112(%r12),@X[7],@X[7] vpaddq -0x80($Tbl),@X[0],$t0 vpshufb $t2,@X[6],@X[6] vpaddq -0x60($Tbl),@X[1],$t1 vpshufb $t2,@X[7],@X[7] vpaddq -0x40($Tbl),@X[2],$t2 vpaddq -0x20($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) vpaddq 0x00($Tbl),@X[4],$t0 vmovdqa $t1,0x20(%rsp) vpaddq 0x20($Tbl),@X[5],$t1 vmovdqa $t2,0x40(%rsp) vpaddq 0x40($Tbl),@X[6],$t2 vmovdqa $t3,0x60(%rsp) ___ $code.=<<___ if (!$win64); # temporarily use %rdi as frame pointer mov $_rsp,%rdi .cfi_def_cfa %rdi,8 ___ $code.=<<___; lea -$PUSH8(%rsp),%rsp ___ $code.=<<___ if (!$win64); # the frame info is at $_rsp, but the stack is moving... 
# so a second frame pointer is saved at -8(%rsp) # that is in the red zone mov %rdi,-8(%rsp) .cfi_cfa_expression %rsp-8,deref,+8 ___ $code.=<<___; vpaddq 0x60($Tbl),@X[7],$t3 vmovdqa $t0,0x00(%rsp) xor $a1,$a1 vmovdqa $t1,0x20(%rsp) mov $B,$a3 vmovdqa $t2,0x40(%rsp) xor $C,$a3 # magic vmovdqa $t3,0x60(%rsp) mov $F,$a4 add \$16*2*$SZ,$Tbl jmp .Lavx2_00_47 .align 16 .Lavx2_00_47: ___ sub AVX2_512_00_47 () { my $j = shift; my $body = shift; my @X = @_; my @insns = (&$body,&$body); # 48 instructions my $base = "+2*$PUSH8(%rsp)"; if (($j%4)==0) { &lea ("%rsp","-$PUSH8(%rsp)"); $code.=<<___ if (!$win64); .cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 # copy secondary frame pointer to new location again at -8(%rsp) pushq $PUSH8-8(%rsp) .cfi_cfa_expression %rsp,deref,+8 lea 8(%rsp),%rsp .cfi_cfa_expression %rsp-8,deref,+8 ___ } foreach (Xupdate_512_AVX()) { # 23 instructions eval; if ($_ !~ /\;$/) { eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); } } &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); } for ($i=0,$j=0; $j<8; $j++) { &AVX2_512_00_47($j,\&bodyx_00_15,@X); push(@X,shift(@X)); # rotate(@X) } &lea ($Tbl,16*2*$SZ."($Tbl)"); &cmpb (($SZ-1-0x80)."($Tbl)",0); &jne (".Lavx2_00_47"); for ($i=0; $i<16; ) { my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; foreach(bodyx_00_15()) { eval; } } } $code.=<<___; mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx add $a1,$A #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp lea `2*$SZ*($rounds-8)`(%rsp),$Tbl add $SZ*0($ctx),$A add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F add $SZ*6($ctx),$G add $SZ*7($ctx),$H mov $A,$SZ*0($ctx) mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) cmp `$PUSH8+2*8`($Tbl),$inp # $_end je .Ldone_avx2 xor $a1,$a1 mov $B,$a3 xor $C,$a3 # magic mov $F,$a4 jmp .Lower_avx2 .align 16 .Lower_avx2: ___ for ($i=0; $i<8; ) { my $base="+16($Tbl)"; foreach(bodyx_00_15()) { eval; } } $code.=<<___; lea -$PUSH8($Tbl),$Tbl cmp %rsp,$Tbl jae .Lower_avx2 mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx add $a1,$A #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp lea `2*$SZ*($rounds-8)`(%rsp),%rsp # restore frame pointer to original location at $_rsp .cfi_cfa_expression $_rsp,deref,+8 add $SZ*0($ctx),$A add $SZ*1($ctx),$B add $SZ*2($ctx),$C add $SZ*3($ctx),$D add $SZ*4($ctx),$E add $SZ*5($ctx),$F lea `2*16*$SZ`($inp),$inp # inp+=2 add $SZ*6($ctx),$G mov $inp,%r12 add $SZ*7($ctx),$H cmp $_end,$inp mov $A,$SZ*0($ctx) cmove %rsp,%r12 # next block or stale data mov $B,$SZ*1($ctx) mov $C,$SZ*2($ctx) mov $D,$SZ*3($ctx) mov $E,$SZ*4($ctx) mov $F,$SZ*5($ctx) mov $G,$SZ*6($ctx) mov $H,$SZ*7($ctx) jbe .Loop_avx2 lea (%rsp),$Tbl # temporarily use $Tbl as index to $_rsp # this avoids the need to save a secondary frame pointer at -8(%rsp) .cfi_cfa_expression $Tbl+`16*$SZ+3*8`,deref,+8 .Ldone_avx2: mov `16*$SZ+3*8`($Tbl),%rsi .cfi_def_cfa %rsi,8 vzeroupper ___ $code.=<<___ if ($win64); movaps 16*$SZ+32($Tbl),%xmm6 movaps 16*$SZ+48($Tbl),%xmm7 movaps 16*$SZ+64($Tbl),%xmm8 movaps 16*$SZ+80($Tbl),%xmm9 ___ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+96($Tbl),%xmm10 movaps 16*$SZ+112($Tbl),%xmm11 ___ $code.=<<___; mov -48(%rsi),%r15 .cfi_restore %r15 mov -40(%rsi),%r14 .cfi_restore %r14 mov -32(%rsi),%r13 .cfi_restore %r13 mov -24(%rsi),%r12 .cfi_restore %r12 mov -16(%rsi),%rbp .cfi_restore %rbp mov -8(%rsi),%rbx .cfi_restore %rbx lea (%rsi),%rsp .cfi_def_cfa_register %rsp 
.Lepilogue_avx2: ret .cfi_endproc .size ${func}_avx2,.-${func}_avx2 ___ }} }}}}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, # CONTEXT *context,DISPATCHER_CONTEXT *disp) if ($win64) { $rec="%rcx"; $frame="%rdx"; $context="%r8"; $disp="%r9"; $code.=<<___; .extern __imp_RtlVirtualUnwind .type se_handler,\@abi-omnipotent .align 16 se_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip mov 8($disp),%rsi # disp->ImageBase mov 56($disp),%r11 # disp->HanderlData mov 0(%r11),%r10d # HandlerData[0] lea (%rsi,%r10),%r10 # prologue label cmp %r10,%rbx # context->RipRsp mov 4(%r11),%r10d # HandlerData[1] lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue ___ $code.=<<___ if ($avx>1); lea .Lavx2_shortcut(%rip),%r10 cmp %r10,%rbx # context->RipRbx mov %rbp,160($context) # restore context->Rbp mov %r12,216($context) # restore context->R12 mov %r13,224($context) # restore context->R13 mov %r14,232($context) # restore context->R14 mov %r15,240($context) # restore context->R15 lea .Lepilogue(%rip),%r10 cmp %r10,%rbx jb .Lin_prologue # non-AVX code lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area lea 512($context),%rdi # &context.Xmm6 mov \$`$SZ==4?8:12`,%ecx .long 0xa548f3fc # cld; rep movsq .Lin_prologue: mov 8(%rax),%rdi mov 16(%rax),%rsi mov %rax,152($context) # restore context->Rsp mov %rsi,168($context) # restore context->Rsi mov %rdi,176($context) # restore context->Rdi mov 40($disp),%rdi # disp->ContextRecord mov $context,%rsi # context mov \$154,%ecx # sizeof(CONTEXT) .long 0xa548f3fc # cld; rep movsq mov $disp,%rsi xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER mov 8(%rsi),%rdx # arg2, disp->ImageBase mov 0(%rsi),%r8 # arg3, disp->ControlPc mov 16(%rsi),%r9 # arg4, disp->FunctionEntry mov 40(%rsi),%r10 # disp->ContextRecord lea 56(%rsi),%r11 # &disp->HandlerData lea 24(%rsi),%r12 # &disp->EstablisherFrame mov %r10,32(%rsp) # arg5 mov %r11,40(%rsp) # arg6 mov %r12,48(%rsp) # arg7 mov %rcx,56(%rsp) # arg8, (NULL) call *__imp_RtlVirtualUnwind(%rip) mov \$1,%eax # ExceptionContinueSearch add \$64,%rsp popfq pop %r15 pop %r14 pop %r13 pop %r12 pop %rbp pop %rbx pop %rdi pop %rsi ret .size se_handler,.-se_handler ___ $code.=<<___ if ($SZ==4 && $shaext); .type shaext_handler,\@abi-omnipotent .align 16 shaext_handler: push %rsi push %rdi push %rbx push %rbp push %r12 push %r13 push %r14 push %r15 pushfq sub \$64,%rsp mov 120($context),%rax # pull context->Rax mov 248($context),%rbx # pull context->Rip lea .Lprologue_shaext(%rip),%r10 cmp %r10,%rbx # context->Rip<.Lprologue jb .Lin_prologue lea .Lepilogue_shaext(%rip),%r10 cmp %r10,%rbx # context->Rip>=.Lepilogue jae .Lin_prologue lea -8-5*16(%rax),%rsi lea 512($context),%rdi # &context.Xmm6 mov \$10,%ecx .long 0xa548f3fc # cld; rep movsq jmp .Lin_prologue .size shaext_handler,.-shaext_handler ___ $code.=<<___; .section .pdata .align 4 .rva .LSEH_begin_$func .rva .LSEH_end_$func .rva .LSEH_info_$func ___ $code.=<<___ if ($SZ==4 && $shaext); .rva .LSEH_begin_${func}_shaext .rva .LSEH_end_${func}_shaext .rva .LSEH_info_${func}_shaext ___ $code.=<<___ if ($SZ==4); .rva .LSEH_begin_${func}_ssse3 .rva .LSEH_end_${func}_ssse3 .rva .LSEH_info_${func}_ssse3 ___ $code.=<<___ if ($avx && $SZ==8); .rva .LSEH_begin_${func}_xop .rva .LSEH_end_${func}_xop .rva .LSEH_info_${func}_xop ___ $code.=<<___ if ($avx); .rva .LSEH_begin_${func}_avx .rva .LSEH_end_${func}_avx 
.rva .LSEH_info_${func}_avx ___ $code.=<<___ if ($avx>1); .rva .LSEH_begin_${func}_avx2 .rva .LSEH_end_${func}_avx2 .rva .LSEH_info_${func}_avx2 ___ $code.=<<___; .section .xdata .align 8 .LSEH_info_$func: .byte 9,0,0,0 .rva se_handler .rva .Lprologue,.Lepilogue # HandlerData[] ___ $code.=<<___ if ($SZ==4 && $shaext); .LSEH_info_${func}_shaext: .byte 9,0,0,0 .rva shaext_handler ___ $code.=<<___ if ($SZ==4); .LSEH_info_${func}_ssse3: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] ___ $code.=<<___ if ($avx && $SZ==8); .LSEH_info_${func}_xop: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] ___ $code.=<<___ if ($avx); .LSEH_info_${func}_avx: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ $code.=<<___ if ($avx>1); .LSEH_info_${func}_avx2: .byte 9,0,0,0 .rva se_handler .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] ___ } sub sha256op38 { my $instr = shift; my %opcodelet = ( "sha256rnds2" => 0xcb, "sha256msg1" => 0xcc, "sha256msg2" => 0xcd ); if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) { my @opcode=(0x0f,0x38); push @opcode,$opcodelet{$instr}; push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M return ".byte\t".join(',',@opcode); } else { return $instr."\t".@_[0]; } } foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo; print $_,"\n"; } close STDOUT or die "error closing STDOUT: $!"; Index: stable/12/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/aesni-gcm-x86_64.S (revision 364963) @@ -1,21 +1,791 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from aesni-gcm-x86_64.pl. 
*/ .text -.globl aesni_gcm_encrypt -.type aesni_gcm_encrypt,@function -aesni_gcm_encrypt: +.type _aesni_ctr32_ghash_6x,@function +.align 32 +_aesni_ctr32_ghash_6x: .cfi_startproc - xorl %eax,%eax + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp .Loop6x + +.align 32 +.Loop6x: + addl $100663296,%ebx + jc .Lhandle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +.Lresume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq 
%r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc %xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $11,%ebp + jb .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + je .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp .Lenc_tail + +.align 32 +.Lhandle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 
64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%r10 + subq $0x6,%rdx + jc .L6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + .byte 0xf3,0xc3 .cfi_endproc -.size aesni_gcm_encrypt,.-aesni_gcm_encrypt - +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x .globl aesni_gcm_decrypt .type aesni_gcm_decrypt,@function +.align 32 aesni_gcm_decrypt: .cfi_startproc - xorl %eax,%eax + xorq %r10,%r10 + cmpq $0x60,%rdx + jb .Lgcm_dec_abort + + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r9),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + leaq 32+32(%r9),%r9 + movl 240-128(%rcx),%ebp + vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Ldec_no_key_aliasing + cmpq $768,%r15 + jnc .Ldec_no_key_aliasing + subq %r15,%rsp +.Ldec_no_key_aliasing: + + vmovdqu 80(%rdi),%xmm7 + leaq (%rdi),%r14 + vmovdqu 64(%rdi),%xmm4 + leaq -192(%rdi,%rdx,1),%r15 + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %r10,%r10 + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + + call _aesni_ctr32_ghash_6x + + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups %xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,-64(%r9) + + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lgcm_dec_abort: + movq %r10,%rax .byte 0xf3,0xc3 .cfi_endproc .size aesni_gcm_decrypt,.-aesni_gcm_decrypt +.type _aesni_ctr32_6x,@function +.align 32 +_aesni_ctr32_6x: +.cfi_startproc + vmovdqu 0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -1(%rbp),%r13 + vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc .Lhandle_ctr32_2 + vpaddb 
%xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast %xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + .byte 0xf3,0xc3 +.align 32 +.Lhandle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 +.cfi_endproc +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +.align 32 +aesni_gcm_encrypt: +.cfi_startproc + xorq %r10,%r10 + cmpq $288,%rdx + jb .Lgcm_enc_abort + + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 240-128(%rcx),%ebp + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Lenc_no_key_aliasing + cmpq $768,%r15 + jnc .Lenc_no_key_aliasing + subq %r15,%rsp +.Lenc_no_key_aliasing: + + leaq (%rsi),%r14 + leaq -192(%rsi,%rdx,1),%r15 + shrq $4,%rdx + + call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + vmovdqu (%r9),%xmm8 + leaq 32+32(%r9),%r9 + subq $12,%rdx + movq $192,%r10 + vpshufb %xmm0,%xmm8,%xmm8 + + call _aesni_ctr32_ghash_6x + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 
32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups %xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vmovups %xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor %xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 
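The run of vpclmulqdq/vpxor/vpalignr instructions around this point folds the remaining ciphertext blocks into the GHASH accumulator and then reduces the doubled-width product back to 128 bits, using what appears to be the .Lpoly constant defined at the end of this file (referenced as 16(%r11)). As a point of reference only, the scalar routine below computes the same GF(2^128) product bit by bit, following the GCM specification; gf128_mul is an illustrative name of mine and is not defined anywhere in this patch.

/*
 * Bit-serial GHASH multiplication in GF(2^128) per the GCM spec
 * (reduction polynomial x^128 + x^7 + x^2 + x + 1, represented by
 * the byte 0xE1 in the leftmost position).  Reference sketch only;
 * the assembly above computes the same product with carry-less
 * multiplies and a folded reduction.
 */
#include <stdint.h>
#include <string.h>

static void gf128_mul(uint8_t z[16], const uint8_t x[16], const uint8_t y[16])
{
    uint8_t v[16], acc[16] = {0};
    memcpy(v, y, 16);

    for (int i = 0; i < 128; i++) {
        /* bit i of x, most significant bit of byte 0 first */
        if (x[i >> 3] & (0x80 >> (i & 7))) {
            for (int j = 0; j < 16; j++)
                acc[j] ^= v[j];
        }
        /* v = v >> 1, reducing by 0xE1||0^120 when a bit falls off */
        int lsb = v[15] & 1;
        for (int j = 15; j > 0; j--)
            v[j] = (uint8_t)((v[j] >> 1) | (v[j - 1] << 7));
        v[0] >>= 1;
        if (lsb)
            v[0] ^= 0xE1;
    }
    memcpy(z, acc, 16);
}

The vpshufb shuffles through the .Lbswap_mask table in the surrounding hunks appear to exist precisely to bridge between this byte ordering, which GCM specifies, and the ordering the carry-less multiply instructions operate on.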
+ vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor %xmm4,%xmm6,%xmm6 + vpslldq $8,%xmm6,%xmm1 + vmovdqu 16(%r11),%xmm3 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm1,%xmm5,%xmm8 + vpxor %xmm6,%xmm7,%xmm7 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm2,%xmm8,%xmm8 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm7,%xmm2,%xmm2 + vpxor %xmm2,%xmm8,%xmm8 + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,-64(%r9) + + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lgcm_enc_abort: + movq %r10,%rax + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: +.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 Index: stable/12/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/aesni-mb-x86_64.S (revision 364963) @@ -1,544 +1,1509 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from aesni-mb-x86_64.pl. 
*/ .text .globl aesni_multi_cbc_encrypt .type aesni_multi_cbc_encrypt,@function .align 32 aesni_multi_cbc_encrypt: .cfi_startproc + cmpl $2,%edx + jb .Lenc_non_avx + movl OPENSSL_ia32cap_P+4(%rip),%ecx + testl $268435456,%ecx + jnz _avx_cbc_enc_shortcut + jmp .Lenc_non_avx +.align 16 +.Lenc_non_avx: movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 subq $48,%rsp andq $-64,%rsp movq %rax,16(%rsp) .cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 .Lenc4x_body: movdqu (%rsi),%xmm12 leaq 120(%rsi),%rsi leaq 80(%rdi),%rdi .Lenc4x_loop_grande: movl %edx,24(%rsp) xorl %edx,%edx movl -64(%rdi),%ecx movq -80(%rdi),%r8 cmpl %edx,%ecx movq -72(%rdi),%r12 cmovgl %ecx,%edx testl %ecx,%ecx movdqu -56(%rdi),%xmm2 movl %ecx,32(%rsp) cmovleq %rsp,%r8 movl -24(%rdi),%ecx movq -40(%rdi),%r9 cmpl %edx,%ecx movq -32(%rdi),%r13 cmovgl %ecx,%edx testl %ecx,%ecx movdqu -16(%rdi),%xmm3 movl %ecx,36(%rsp) cmovleq %rsp,%r9 movl 16(%rdi),%ecx movq 0(%rdi),%r10 cmpl %edx,%ecx movq 8(%rdi),%r14 cmovgl %ecx,%edx testl %ecx,%ecx movdqu 24(%rdi),%xmm4 movl %ecx,40(%rsp) cmovleq %rsp,%r10 movl 56(%rdi),%ecx movq 40(%rdi),%r11 cmpl %edx,%ecx movq 48(%rdi),%r15 cmovgl %ecx,%edx testl %ecx,%ecx movdqu 64(%rdi),%xmm5 movl %ecx,44(%rsp) cmovleq %rsp,%r11 testl %edx,%edx jz .Lenc4x_done movups 16-120(%rsi),%xmm1 pxor %xmm12,%xmm2 movups 32-120(%rsi),%xmm0 pxor %xmm12,%xmm3 movl 240-120(%rsi),%eax pxor %xmm12,%xmm4 movdqu (%r8),%xmm6 pxor %xmm12,%xmm5 movdqu (%r9),%xmm7 pxor %xmm6,%xmm2 movdqu (%r10),%xmm8 pxor %xmm7,%xmm3 movdqu (%r11),%xmm9 pxor %xmm8,%xmm4 pxor %xmm9,%xmm5 movdqa 32(%rsp),%xmm10 xorq %rbx,%rbx jmp .Loop_enc4x .align 32 .Loop_enc4x: addq $16,%rbx leaq 16(%rsp),%rbp movl $1,%ecx subq %rbx,%rbp .byte 102,15,56,220,209 prefetcht0 31(%r8,%rbx,1) prefetcht0 31(%r9,%rbx,1) .byte 102,15,56,220,217 prefetcht0 31(%r10,%rbx,1) prefetcht0 31(%r10,%rbx,1) .byte 102,15,56,220,225 .byte 102,15,56,220,233 movups 48-120(%rsi),%xmm1 cmpl 32(%rsp),%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 cmovgeq %rbp,%r8 cmovgq %rbp,%r12 .byte 102,15,56,220,232 movups -56(%rsi),%xmm0 cmpl 36(%rsp),%ecx .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 cmovgeq %rbp,%r9 cmovgq %rbp,%r13 .byte 102,15,56,220,233 movups -40(%rsi),%xmm1 cmpl 40(%rsp),%ecx .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 cmovgeq %rbp,%r10 cmovgq %rbp,%r14 .byte 102,15,56,220,232 movups -24(%rsi),%xmm0 cmpl 44(%rsp),%ecx .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 cmovgeq %rbp,%r11 cmovgq %rbp,%r15 .byte 102,15,56,220,233 movups -8(%rsi),%xmm1 movdqa %xmm10,%xmm11 .byte 102,15,56,220,208 prefetcht0 15(%r12,%rbx,1) prefetcht0 15(%r13,%rbx,1) .byte 102,15,56,220,216 prefetcht0 15(%r14,%rbx,1) prefetcht0 15(%r15,%rbx,1) .byte 102,15,56,220,224 .byte 102,15,56,220,232 movups 128-120(%rsi),%xmm0 pxor %xmm12,%xmm12 .byte 102,15,56,220,209 pcmpgtd %xmm12,%xmm11 movdqu -120(%rsi),%xmm12 .byte 102,15,56,220,217 paddd %xmm11,%xmm10 movdqa %xmm10,32(%rsp) .byte 102,15,56,220,225 .byte 102,15,56,220,233 movups 144-120(%rsi),%xmm1 cmpl $11,%eax .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 movups 160-120(%rsi),%xmm0 jb .Lenc4x_tail .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movups 
176-120(%rsi),%xmm1 .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 movups 192-120(%rsi),%xmm0 je .Lenc4x_tail .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movups 208-120(%rsi),%xmm1 .byte 102,15,56,220,208 .byte 102,15,56,220,216 .byte 102,15,56,220,224 .byte 102,15,56,220,232 movups 224-120(%rsi),%xmm0 jmp .Lenc4x_tail .align 32 .Lenc4x_tail: .byte 102,15,56,220,209 .byte 102,15,56,220,217 .byte 102,15,56,220,225 .byte 102,15,56,220,233 movdqu (%r8,%rbx,1),%xmm6 movdqu 16-120(%rsi),%xmm1 .byte 102,15,56,221,208 movdqu (%r9,%rbx,1),%xmm7 pxor %xmm12,%xmm6 .byte 102,15,56,221,216 movdqu (%r10,%rbx,1),%xmm8 pxor %xmm12,%xmm7 .byte 102,15,56,221,224 movdqu (%r11,%rbx,1),%xmm9 pxor %xmm12,%xmm8 .byte 102,15,56,221,232 movdqu 32-120(%rsi),%xmm0 pxor %xmm12,%xmm9 movups %xmm2,-16(%r12,%rbx,1) pxor %xmm6,%xmm2 movups %xmm3,-16(%r13,%rbx,1) pxor %xmm7,%xmm3 movups %xmm4,-16(%r14,%rbx,1) pxor %xmm8,%xmm4 movups %xmm5,-16(%r15,%rbx,1) pxor %xmm9,%xmm5 decl %edx jnz .Loop_enc4x movq 16(%rsp),%rax .cfi_def_cfa %rax,8 movl 24(%rsp),%edx leaq 160(%rdi),%rdi decl %edx jnz .Lenc4x_loop_grande .Lenc4x_done: movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lenc4x_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt .globl aesni_multi_cbc_decrypt .type aesni_multi_cbc_decrypt,@function .align 32 aesni_multi_cbc_decrypt: .cfi_startproc + cmpl $2,%edx + jb .Ldec_non_avx + movl OPENSSL_ia32cap_P+4(%rip),%ecx + testl $268435456,%ecx + jnz _avx_cbc_dec_shortcut + jmp .Ldec_non_avx +.align 16 +.Ldec_non_avx: movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 subq $48,%rsp andq $-64,%rsp movq %rax,16(%rsp) .cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 .Ldec4x_body: movdqu (%rsi),%xmm12 leaq 120(%rsi),%rsi leaq 80(%rdi),%rdi .Ldec4x_loop_grande: movl %edx,24(%rsp) xorl %edx,%edx movl -64(%rdi),%ecx movq -80(%rdi),%r8 cmpl %edx,%ecx movq -72(%rdi),%r12 cmovgl %ecx,%edx testl %ecx,%ecx movdqu -56(%rdi),%xmm6 movl %ecx,32(%rsp) cmovleq %rsp,%r8 movl -24(%rdi),%ecx movq -40(%rdi),%r9 cmpl %edx,%ecx movq -32(%rdi),%r13 cmovgl %ecx,%edx testl %ecx,%ecx movdqu -16(%rdi),%xmm7 movl %ecx,36(%rsp) cmovleq %rsp,%r9 movl 16(%rdi),%ecx movq 0(%rdi),%r10 cmpl %edx,%ecx movq 8(%rdi),%r14 cmovgl %ecx,%edx testl %ecx,%ecx movdqu 24(%rdi),%xmm8 movl %ecx,40(%rsp) cmovleq %rsp,%r10 movl 56(%rdi),%ecx movq 40(%rdi),%r11 cmpl %edx,%ecx movq 48(%rdi),%r15 cmovgl %ecx,%edx testl %ecx,%ecx movdqu 64(%rdi),%xmm9 movl %ecx,44(%rsp) cmovleq %rsp,%r11 testl %edx,%edx jz .Ldec4x_done movups 16-120(%rsi),%xmm1 movups 32-120(%rsi),%xmm0 movl 240-120(%rsi),%eax movdqu (%r8),%xmm2 movdqu (%r9),%xmm3 pxor %xmm12,%xmm2 movdqu (%r10),%xmm4 pxor %xmm12,%xmm3 movdqu (%r11),%xmm5 pxor %xmm12,%xmm4 pxor %xmm12,%xmm5 movdqa 32(%rsp),%xmm10 xorq %rbx,%rbx jmp .Loop_dec4x .align 32 .Loop_dec4x: addq $16,%rbx leaq 16(%rsp),%rbp movl $1,%ecx subq %rbx,%rbp .byte 102,15,56,222,209 prefetcht0 31(%r8,%rbx,1) prefetcht0 31(%r9,%rbx,1) .byte 102,15,56,222,217 prefetcht0 31(%r10,%rbx,1) prefetcht0 31(%r11,%rbx,1) .byte 
102,15,56,222,225 .byte 102,15,56,222,233 movups 48-120(%rsi),%xmm1 cmpl 32(%rsp),%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 .byte 102,15,56,222,224 cmovgeq %rbp,%r8 cmovgq %rbp,%r12 .byte 102,15,56,222,232 movups -56(%rsi),%xmm0 cmpl 36(%rsp),%ecx .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 cmovgeq %rbp,%r9 cmovgq %rbp,%r13 .byte 102,15,56,222,233 movups -40(%rsi),%xmm1 cmpl 40(%rsp),%ecx .byte 102,15,56,222,208 .byte 102,15,56,222,216 .byte 102,15,56,222,224 cmovgeq %rbp,%r10 cmovgq %rbp,%r14 .byte 102,15,56,222,232 movups -24(%rsi),%xmm0 cmpl 44(%rsp),%ecx .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 cmovgeq %rbp,%r11 cmovgq %rbp,%r15 .byte 102,15,56,222,233 movups -8(%rsi),%xmm1 movdqa %xmm10,%xmm11 .byte 102,15,56,222,208 prefetcht0 15(%r12,%rbx,1) prefetcht0 15(%r13,%rbx,1) .byte 102,15,56,222,216 prefetcht0 15(%r14,%rbx,1) prefetcht0 15(%r15,%rbx,1) .byte 102,15,56,222,224 .byte 102,15,56,222,232 movups 128-120(%rsi),%xmm0 pxor %xmm12,%xmm12 .byte 102,15,56,222,209 pcmpgtd %xmm12,%xmm11 movdqu -120(%rsi),%xmm12 .byte 102,15,56,222,217 paddd %xmm11,%xmm10 movdqa %xmm10,32(%rsp) .byte 102,15,56,222,225 .byte 102,15,56,222,233 movups 144-120(%rsi),%xmm1 cmpl $11,%eax .byte 102,15,56,222,208 .byte 102,15,56,222,216 .byte 102,15,56,222,224 .byte 102,15,56,222,232 movups 160-120(%rsi),%xmm0 jb .Ldec4x_tail .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 .byte 102,15,56,222,233 movups 176-120(%rsi),%xmm1 .byte 102,15,56,222,208 .byte 102,15,56,222,216 .byte 102,15,56,222,224 .byte 102,15,56,222,232 movups 192-120(%rsi),%xmm0 je .Ldec4x_tail .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 .byte 102,15,56,222,233 movups 208-120(%rsi),%xmm1 .byte 102,15,56,222,208 .byte 102,15,56,222,216 .byte 102,15,56,222,224 .byte 102,15,56,222,232 movups 224-120(%rsi),%xmm0 jmp .Ldec4x_tail .align 32 .Ldec4x_tail: .byte 102,15,56,222,209 .byte 102,15,56,222,217 .byte 102,15,56,222,225 pxor %xmm0,%xmm6 pxor %xmm0,%xmm7 .byte 102,15,56,222,233 movdqu 16-120(%rsi),%xmm1 pxor %xmm0,%xmm8 pxor %xmm0,%xmm9 movdqu 32-120(%rsi),%xmm0 .byte 102,15,56,223,214 .byte 102,15,56,223,223 movdqu -16(%r8,%rbx,1),%xmm6 movdqu -16(%r9,%rbx,1),%xmm7 .byte 102,65,15,56,223,224 .byte 102,65,15,56,223,233 movdqu -16(%r10,%rbx,1),%xmm8 movdqu -16(%r11,%rbx,1),%xmm9 movups %xmm2,-16(%r12,%rbx,1) movdqu (%r8,%rbx,1),%xmm2 movups %xmm3,-16(%r13,%rbx,1) movdqu (%r9,%rbx,1),%xmm3 pxor %xmm12,%xmm2 movups %xmm4,-16(%r14,%rbx,1) movdqu (%r10,%rbx,1),%xmm4 pxor %xmm12,%xmm3 movups %xmm5,-16(%r15,%rbx,1) movdqu (%r11,%rbx,1),%xmm5 pxor %xmm12,%xmm4 pxor %xmm12,%xmm5 decl %edx jnz .Loop_dec4x movq 16(%rsp),%rax .cfi_def_cfa %rax,8 movl 24(%rsp),%edx leaq 160(%rdi),%rdi decl %edx jnz .Ldec4x_loop_grande .Ldec4x_done: movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Ldec4x_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt +.type aesni_multi_cbc_encrypt_avx,@function +.align 32 +aesni_multi_cbc_encrypt_avx: +.cfi_startproc +_avx_cbc_enc_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + 
pushq %r15 +.cfi_offset %r15,-56 + + + + + + + + + subq $192,%rsp + andq $-128,%rsp + movq %rax,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 + +.Lenc8x_body: + vzeroupper + vmovdqu (%rsi),%xmm15 + leaq 120(%rsi),%rsi + leaq 160(%rdi),%rdi + shrl $1,%edx + +.Lenc8x_loop_grande: + + xorl %edx,%edx + movl -144(%rdi),%ecx + movq -160(%rdi),%r8 + cmpl %edx,%ecx + movq -152(%rdi),%rbx + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -136(%rdi),%xmm2 + movl %ecx,32(%rsp) + cmovleq %rsp,%r8 + subq %r8,%rbx + movq %rbx,64(%rsp) + movl -104(%rdi),%ecx + movq -120(%rdi),%r9 + cmpl %edx,%ecx + movq -112(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -96(%rdi),%xmm3 + movl %ecx,36(%rsp) + cmovleq %rsp,%r9 + subq %r9,%rbp + movq %rbp,72(%rsp) + movl -64(%rdi),%ecx + movq -80(%rdi),%r10 + cmpl %edx,%ecx + movq -72(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -56(%rdi),%xmm4 + movl %ecx,40(%rsp) + cmovleq %rsp,%r10 + subq %r10,%rbp + movq %rbp,80(%rsp) + movl -24(%rdi),%ecx + movq -40(%rdi),%r11 + cmpl %edx,%ecx + movq -32(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -16(%rdi),%xmm5 + movl %ecx,44(%rsp) + cmovleq %rsp,%r11 + subq %r11,%rbp + movq %rbp,88(%rsp) + movl 16(%rdi),%ecx + movq 0(%rdi),%r12 + cmpl %edx,%ecx + movq 8(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 24(%rdi),%xmm6 + movl %ecx,48(%rsp) + cmovleq %rsp,%r12 + subq %r12,%rbp + movq %rbp,96(%rsp) + movl 56(%rdi),%ecx + movq 40(%rdi),%r13 + cmpl %edx,%ecx + movq 48(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 64(%rdi),%xmm7 + movl %ecx,52(%rsp) + cmovleq %rsp,%r13 + subq %r13,%rbp + movq %rbp,104(%rsp) + movl 96(%rdi),%ecx + movq 80(%rdi),%r14 + cmpl %edx,%ecx + movq 88(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 104(%rdi),%xmm8 + movl %ecx,56(%rsp) + cmovleq %rsp,%r14 + subq %r14,%rbp + movq %rbp,112(%rsp) + movl 136(%rdi),%ecx + movq 120(%rdi),%r15 + cmpl %edx,%ecx + movq 128(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 144(%rdi),%xmm9 + movl %ecx,60(%rsp) + cmovleq %rsp,%r15 + subq %r15,%rbp + movq %rbp,120(%rsp) + testl %edx,%edx + jz .Lenc8x_done + + vmovups 16-120(%rsi),%xmm1 + vmovups 32-120(%rsi),%xmm0 + movl 240-120(%rsi),%eax + + vpxor (%r8),%xmm15,%xmm10 + leaq 128(%rsp),%rbp + vpxor (%r9),%xmm15,%xmm11 + vpxor (%r10),%xmm15,%xmm12 + vpxor (%r11),%xmm15,%xmm13 + vpxor %xmm10,%xmm2,%xmm2 + vpxor (%r12),%xmm15,%xmm10 + vpxor %xmm11,%xmm3,%xmm3 + vpxor (%r13),%xmm15,%xmm11 + vpxor %xmm12,%xmm4,%xmm4 + vpxor (%r14),%xmm15,%xmm12 + vpxor %xmm13,%xmm5,%xmm5 + vpxor (%r15),%xmm15,%xmm13 + vpxor %xmm10,%xmm6,%xmm6 + movl $1,%ecx + vpxor %xmm11,%xmm7,%xmm7 + vpxor %xmm12,%xmm8,%xmm8 + vpxor %xmm13,%xmm9,%xmm9 + jmp .Loop_enc8x + +.align 32 +.Loop_enc8x: + vaesenc %xmm1,%xmm2,%xmm2 + cmpl 32+0(%rsp),%ecx + vaesenc %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r8) + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm5,%xmm5 + leaq (%r8,%rbx,1),%rbx + cmovgeq %rsp,%r8 + vaesenc %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm1,%xmm7,%xmm7 + subq %r8,%rbx + vaesenc %xmm1,%xmm8,%xmm8 + vpxor 16(%r8),%xmm15,%xmm10 + movq %rbx,64+0(%rsp) + vaesenc %xmm1,%xmm9,%xmm9 + vmovups -72(%rsi),%xmm1 + leaq 16(%r8,%rbx,1),%r8 + vmovdqu %xmm10,0(%rbp) + vaesenc %xmm0,%xmm2,%xmm2 + cmpl 32+4(%rsp),%ecx + movq 64+8(%rsp),%rbx + vaesenc %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r9) + vaesenc %xmm0,%xmm4,%xmm4 + vaesenc %xmm0,%xmm5,%xmm5 + leaq (%r9,%rbx,1),%rbx + cmovgeq %rsp,%r9 + vaesenc %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm0,%xmm7,%xmm7 + subq %r9,%rbx + vaesenc 
%xmm0,%xmm8,%xmm8 + vpxor 16(%r9),%xmm15,%xmm11 + movq %rbx,64+8(%rsp) + vaesenc %xmm0,%xmm9,%xmm9 + vmovups -56(%rsi),%xmm0 + leaq 16(%r9,%rbx,1),%r9 + vmovdqu %xmm11,16(%rbp) + vaesenc %xmm1,%xmm2,%xmm2 + cmpl 32+8(%rsp),%ecx + movq 64+16(%rsp),%rbx + vaesenc %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r10) + vaesenc %xmm1,%xmm4,%xmm4 + prefetcht0 15(%r8) + vaesenc %xmm1,%xmm5,%xmm5 + leaq (%r10,%rbx,1),%rbx + cmovgeq %rsp,%r10 + vaesenc %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm1,%xmm7,%xmm7 + subq %r10,%rbx + vaesenc %xmm1,%xmm8,%xmm8 + vpxor 16(%r10),%xmm15,%xmm12 + movq %rbx,64+16(%rsp) + vaesenc %xmm1,%xmm9,%xmm9 + vmovups -40(%rsi),%xmm1 + leaq 16(%r10,%rbx,1),%r10 + vmovdqu %xmm12,32(%rbp) + vaesenc %xmm0,%xmm2,%xmm2 + cmpl 32+12(%rsp),%ecx + movq 64+24(%rsp),%rbx + vaesenc %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r11) + vaesenc %xmm0,%xmm4,%xmm4 + prefetcht0 15(%r9) + vaesenc %xmm0,%xmm5,%xmm5 + leaq (%r11,%rbx,1),%rbx + cmovgeq %rsp,%r11 + vaesenc %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm0,%xmm7,%xmm7 + subq %r11,%rbx + vaesenc %xmm0,%xmm8,%xmm8 + vpxor 16(%r11),%xmm15,%xmm13 + movq %rbx,64+24(%rsp) + vaesenc %xmm0,%xmm9,%xmm9 + vmovups -24(%rsi),%xmm0 + leaq 16(%r11,%rbx,1),%r11 + vmovdqu %xmm13,48(%rbp) + vaesenc %xmm1,%xmm2,%xmm2 + cmpl 32+16(%rsp),%ecx + movq 64+32(%rsp),%rbx + vaesenc %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r12) + vaesenc %xmm1,%xmm4,%xmm4 + prefetcht0 15(%r10) + vaesenc %xmm1,%xmm5,%xmm5 + leaq (%r12,%rbx,1),%rbx + cmovgeq %rsp,%r12 + vaesenc %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm1,%xmm7,%xmm7 + subq %r12,%rbx + vaesenc %xmm1,%xmm8,%xmm8 + vpxor 16(%r12),%xmm15,%xmm10 + movq %rbx,64+32(%rsp) + vaesenc %xmm1,%xmm9,%xmm9 + vmovups -8(%rsi),%xmm1 + leaq 16(%r12,%rbx,1),%r12 + vaesenc %xmm0,%xmm2,%xmm2 + cmpl 32+20(%rsp),%ecx + movq 64+40(%rsp),%rbx + vaesenc %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r13) + vaesenc %xmm0,%xmm4,%xmm4 + prefetcht0 15(%r11) + vaesenc %xmm0,%xmm5,%xmm5 + leaq (%rbx,%r13,1),%rbx + cmovgeq %rsp,%r13 + vaesenc %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm0,%xmm7,%xmm7 + subq %r13,%rbx + vaesenc %xmm0,%xmm8,%xmm8 + vpxor 16(%r13),%xmm15,%xmm11 + movq %rbx,64+40(%rsp) + vaesenc %xmm0,%xmm9,%xmm9 + vmovups 8(%rsi),%xmm0 + leaq 16(%r13,%rbx,1),%r13 + vaesenc %xmm1,%xmm2,%xmm2 + cmpl 32+24(%rsp),%ecx + movq 64+48(%rsp),%rbx + vaesenc %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r14) + vaesenc %xmm1,%xmm4,%xmm4 + prefetcht0 15(%r12) + vaesenc %xmm1,%xmm5,%xmm5 + leaq (%r14,%rbx,1),%rbx + cmovgeq %rsp,%r14 + vaesenc %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm1,%xmm7,%xmm7 + subq %r14,%rbx + vaesenc %xmm1,%xmm8,%xmm8 + vpxor 16(%r14),%xmm15,%xmm12 + movq %rbx,64+48(%rsp) + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 24(%rsi),%xmm1 + leaq 16(%r14,%rbx,1),%r14 + vaesenc %xmm0,%xmm2,%xmm2 + cmpl 32+28(%rsp),%ecx + movq 64+56(%rsp),%rbx + vaesenc %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r15) + vaesenc %xmm0,%xmm4,%xmm4 + prefetcht0 15(%r13) + vaesenc %xmm0,%xmm5,%xmm5 + leaq (%r15,%rbx,1),%rbx + cmovgeq %rsp,%r15 + vaesenc %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesenc %xmm0,%xmm7,%xmm7 + subq %r15,%rbx + vaesenc %xmm0,%xmm8,%xmm8 + vpxor 16(%r15),%xmm15,%xmm13 + movq %rbx,64+56(%rsp) + vaesenc %xmm0,%xmm9,%xmm9 + vmovups 40(%rsi),%xmm0 + leaq 16(%r15,%rbx,1),%r15 + vmovdqu 32(%rsp),%xmm14 + prefetcht0 15(%r14) + prefetcht0 15(%r15) + cmpl $11,%eax + jb .Lenc8x_tail + + vaesenc %xmm1,%xmm2,%xmm2 + vaesenc %xmm1,%xmm3,%xmm3 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm5,%xmm5 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc 
%xmm1,%xmm8,%xmm8 + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 176-120(%rsi),%xmm1 + + vaesenc %xmm0,%xmm2,%xmm2 + vaesenc %xmm0,%xmm3,%xmm3 + vaesenc %xmm0,%xmm4,%xmm4 + vaesenc %xmm0,%xmm5,%xmm5 + vaesenc %xmm0,%xmm6,%xmm6 + vaesenc %xmm0,%xmm7,%xmm7 + vaesenc %xmm0,%xmm8,%xmm8 + vaesenc %xmm0,%xmm9,%xmm9 + vmovups 192-120(%rsi),%xmm0 + je .Lenc8x_tail + + vaesenc %xmm1,%xmm2,%xmm2 + vaesenc %xmm1,%xmm3,%xmm3 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm5,%xmm5 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm8,%xmm8 + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 208-120(%rsi),%xmm1 + + vaesenc %xmm0,%xmm2,%xmm2 + vaesenc %xmm0,%xmm3,%xmm3 + vaesenc %xmm0,%xmm4,%xmm4 + vaesenc %xmm0,%xmm5,%xmm5 + vaesenc %xmm0,%xmm6,%xmm6 + vaesenc %xmm0,%xmm7,%xmm7 + vaesenc %xmm0,%xmm8,%xmm8 + vaesenc %xmm0,%xmm9,%xmm9 + vmovups 224-120(%rsi),%xmm0 + +.Lenc8x_tail: + vaesenc %xmm1,%xmm2,%xmm2 + vpxor %xmm15,%xmm15,%xmm15 + vaesenc %xmm1,%xmm3,%xmm3 + vaesenc %xmm1,%xmm4,%xmm4 + vpcmpgtd %xmm15,%xmm14,%xmm15 + vaesenc %xmm1,%xmm5,%xmm5 + vaesenc %xmm1,%xmm6,%xmm6 + vpaddd %xmm14,%xmm15,%xmm15 + vmovdqu 48(%rsp),%xmm14 + vaesenc %xmm1,%xmm7,%xmm7 + movq 64(%rsp),%rbx + vaesenc %xmm1,%xmm8,%xmm8 + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 16-120(%rsi),%xmm1 + + vaesenclast %xmm0,%xmm2,%xmm2 + vmovdqa %xmm15,32(%rsp) + vpxor %xmm15,%xmm15,%xmm15 + vaesenclast %xmm0,%xmm3,%xmm3 + vaesenclast %xmm0,%xmm4,%xmm4 + vpcmpgtd %xmm15,%xmm14,%xmm15 + vaesenclast %xmm0,%xmm5,%xmm5 + vaesenclast %xmm0,%xmm6,%xmm6 + vpaddd %xmm15,%xmm14,%xmm14 + vmovdqu -120(%rsi),%xmm15 + vaesenclast %xmm0,%xmm7,%xmm7 + vaesenclast %xmm0,%xmm8,%xmm8 + vmovdqa %xmm14,48(%rsp) + vaesenclast %xmm0,%xmm9,%xmm9 + vmovups 32-120(%rsi),%xmm0 + + vmovups %xmm2,-16(%r8) + subq %rbx,%r8 + vpxor 0(%rbp),%xmm2,%xmm2 + vmovups %xmm3,-16(%r9) + subq 72(%rsp),%r9 + vpxor 16(%rbp),%xmm3,%xmm3 + vmovups %xmm4,-16(%r10) + subq 80(%rsp),%r10 + vpxor 32(%rbp),%xmm4,%xmm4 + vmovups %xmm5,-16(%r11) + subq 88(%rsp),%r11 + vpxor 48(%rbp),%xmm5,%xmm5 + vmovups %xmm6,-16(%r12) + subq 96(%rsp),%r12 + vpxor %xmm10,%xmm6,%xmm6 + vmovups %xmm7,-16(%r13) + subq 104(%rsp),%r13 + vpxor %xmm11,%xmm7,%xmm7 + vmovups %xmm8,-16(%r14) + subq 112(%rsp),%r14 + vpxor %xmm12,%xmm8,%xmm8 + vmovups %xmm9,-16(%r15) + subq 120(%rsp),%r15 + vpxor %xmm13,%xmm9,%xmm9 + + decl %edx + jnz .Loop_enc8x + + movq 16(%rsp),%rax +.cfi_def_cfa %rax,8 + + + + + +.Lenc8x_done: + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lenc8x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx + +.type aesni_multi_cbc_decrypt_avx,@function +.align 32 +aesni_multi_cbc_decrypt_avx: +.cfi_startproc +_avx_cbc_dec_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + + + + + + + + + + subq $256,%rsp + andq $-256,%rsp + subq $192,%rsp + movq %rax,16(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x10,0x06,0x23,0x08 + +.Ldec8x_body: + vzeroupper + vmovdqu (%rsi),%xmm15 + leaq 120(%rsi),%rsi + leaq 160(%rdi),%rdi + shrl $1,%edx + +.Ldec8x_loop_grande: + + xorl %edx,%edx + movl -144(%rdi),%ecx + movq 
-160(%rdi),%r8 + cmpl %edx,%ecx + movq -152(%rdi),%rbx + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -136(%rdi),%xmm2 + movl %ecx,32(%rsp) + cmovleq %rsp,%r8 + subq %r8,%rbx + movq %rbx,64(%rsp) + vmovdqu %xmm2,192(%rsp) + movl -104(%rdi),%ecx + movq -120(%rdi),%r9 + cmpl %edx,%ecx + movq -112(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -96(%rdi),%xmm3 + movl %ecx,36(%rsp) + cmovleq %rsp,%r9 + subq %r9,%rbp + movq %rbp,72(%rsp) + vmovdqu %xmm3,208(%rsp) + movl -64(%rdi),%ecx + movq -80(%rdi),%r10 + cmpl %edx,%ecx + movq -72(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -56(%rdi),%xmm4 + movl %ecx,40(%rsp) + cmovleq %rsp,%r10 + subq %r10,%rbp + movq %rbp,80(%rsp) + vmovdqu %xmm4,224(%rsp) + movl -24(%rdi),%ecx + movq -40(%rdi),%r11 + cmpl %edx,%ecx + movq -32(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu -16(%rdi),%xmm5 + movl %ecx,44(%rsp) + cmovleq %rsp,%r11 + subq %r11,%rbp + movq %rbp,88(%rsp) + vmovdqu %xmm5,240(%rsp) + movl 16(%rdi),%ecx + movq 0(%rdi),%r12 + cmpl %edx,%ecx + movq 8(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 24(%rdi),%xmm6 + movl %ecx,48(%rsp) + cmovleq %rsp,%r12 + subq %r12,%rbp + movq %rbp,96(%rsp) + vmovdqu %xmm6,256(%rsp) + movl 56(%rdi),%ecx + movq 40(%rdi),%r13 + cmpl %edx,%ecx + movq 48(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 64(%rdi),%xmm7 + movl %ecx,52(%rsp) + cmovleq %rsp,%r13 + subq %r13,%rbp + movq %rbp,104(%rsp) + vmovdqu %xmm7,272(%rsp) + movl 96(%rdi),%ecx + movq 80(%rdi),%r14 + cmpl %edx,%ecx + movq 88(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 104(%rdi),%xmm8 + movl %ecx,56(%rsp) + cmovleq %rsp,%r14 + subq %r14,%rbp + movq %rbp,112(%rsp) + vmovdqu %xmm8,288(%rsp) + movl 136(%rdi),%ecx + movq 120(%rdi),%r15 + cmpl %edx,%ecx + movq 128(%rdi),%rbp + cmovgl %ecx,%edx + testl %ecx,%ecx + vmovdqu 144(%rdi),%xmm9 + movl %ecx,60(%rsp) + cmovleq %rsp,%r15 + subq %r15,%rbp + movq %rbp,120(%rsp) + vmovdqu %xmm9,304(%rsp) + testl %edx,%edx + jz .Ldec8x_done + + vmovups 16-120(%rsi),%xmm1 + vmovups 32-120(%rsi),%xmm0 + movl 240-120(%rsi),%eax + leaq 192+128(%rsp),%rbp + + vmovdqu (%r8),%xmm2 + vmovdqu (%r9),%xmm3 + vmovdqu (%r10),%xmm4 + vmovdqu (%r11),%xmm5 + vmovdqu (%r12),%xmm6 + vmovdqu (%r13),%xmm7 + vmovdqu (%r14),%xmm8 + vmovdqu (%r15),%xmm9 + vmovdqu %xmm2,0(%rbp) + vpxor %xmm15,%xmm2,%xmm2 + vmovdqu %xmm3,16(%rbp) + vpxor %xmm15,%xmm3,%xmm3 + vmovdqu %xmm4,32(%rbp) + vpxor %xmm15,%xmm4,%xmm4 + vmovdqu %xmm5,48(%rbp) + vpxor %xmm15,%xmm5,%xmm5 + vmovdqu %xmm6,64(%rbp) + vpxor %xmm15,%xmm6,%xmm6 + vmovdqu %xmm7,80(%rbp) + vpxor %xmm15,%xmm7,%xmm7 + vmovdqu %xmm8,96(%rbp) + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu %xmm9,112(%rbp) + vpxor %xmm15,%xmm9,%xmm9 + xorq $0x80,%rbp + movl $1,%ecx + jmp .Loop_dec8x + +.align 32 +.Loop_dec8x: + vaesdec %xmm1,%xmm2,%xmm2 + cmpl 32+0(%rsp),%ecx + vaesdec %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r8) + vaesdec %xmm1,%xmm4,%xmm4 + vaesdec %xmm1,%xmm5,%xmm5 + leaq (%r8,%rbx,1),%rbx + cmovgeq %rsp,%r8 + vaesdec %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm1,%xmm7,%xmm7 + subq %r8,%rbx + vaesdec %xmm1,%xmm8,%xmm8 + vmovdqu 16(%r8),%xmm10 + movq %rbx,64+0(%rsp) + vaesdec %xmm1,%xmm9,%xmm9 + vmovups -72(%rsi),%xmm1 + leaq 16(%r8,%rbx,1),%r8 + vmovdqu %xmm10,128(%rsp) + vaesdec %xmm0,%xmm2,%xmm2 + cmpl 32+4(%rsp),%ecx + movq 64+8(%rsp),%rbx + vaesdec %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r9) + vaesdec %xmm0,%xmm4,%xmm4 + vaesdec %xmm0,%xmm5,%xmm5 + leaq (%r9,%rbx,1),%rbx + cmovgeq %rsp,%r9 + vaesdec %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec 
%xmm0,%xmm7,%xmm7 + subq %r9,%rbx + vaesdec %xmm0,%xmm8,%xmm8 + vmovdqu 16(%r9),%xmm11 + movq %rbx,64+8(%rsp) + vaesdec %xmm0,%xmm9,%xmm9 + vmovups -56(%rsi),%xmm0 + leaq 16(%r9,%rbx,1),%r9 + vmovdqu %xmm11,144(%rsp) + vaesdec %xmm1,%xmm2,%xmm2 + cmpl 32+8(%rsp),%ecx + movq 64+16(%rsp),%rbx + vaesdec %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r10) + vaesdec %xmm1,%xmm4,%xmm4 + prefetcht0 15(%r8) + vaesdec %xmm1,%xmm5,%xmm5 + leaq (%r10,%rbx,1),%rbx + cmovgeq %rsp,%r10 + vaesdec %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm1,%xmm7,%xmm7 + subq %r10,%rbx + vaesdec %xmm1,%xmm8,%xmm8 + vmovdqu 16(%r10),%xmm12 + movq %rbx,64+16(%rsp) + vaesdec %xmm1,%xmm9,%xmm9 + vmovups -40(%rsi),%xmm1 + leaq 16(%r10,%rbx,1),%r10 + vmovdqu %xmm12,160(%rsp) + vaesdec %xmm0,%xmm2,%xmm2 + cmpl 32+12(%rsp),%ecx + movq 64+24(%rsp),%rbx + vaesdec %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r11) + vaesdec %xmm0,%xmm4,%xmm4 + prefetcht0 15(%r9) + vaesdec %xmm0,%xmm5,%xmm5 + leaq (%r11,%rbx,1),%rbx + cmovgeq %rsp,%r11 + vaesdec %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm0,%xmm7,%xmm7 + subq %r11,%rbx + vaesdec %xmm0,%xmm8,%xmm8 + vmovdqu 16(%r11),%xmm13 + movq %rbx,64+24(%rsp) + vaesdec %xmm0,%xmm9,%xmm9 + vmovups -24(%rsi),%xmm0 + leaq 16(%r11,%rbx,1),%r11 + vmovdqu %xmm13,176(%rsp) + vaesdec %xmm1,%xmm2,%xmm2 + cmpl 32+16(%rsp),%ecx + movq 64+32(%rsp),%rbx + vaesdec %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r12) + vaesdec %xmm1,%xmm4,%xmm4 + prefetcht0 15(%r10) + vaesdec %xmm1,%xmm5,%xmm5 + leaq (%r12,%rbx,1),%rbx + cmovgeq %rsp,%r12 + vaesdec %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm1,%xmm7,%xmm7 + subq %r12,%rbx + vaesdec %xmm1,%xmm8,%xmm8 + vmovdqu 16(%r12),%xmm10 + movq %rbx,64+32(%rsp) + vaesdec %xmm1,%xmm9,%xmm9 + vmovups -8(%rsi),%xmm1 + leaq 16(%r12,%rbx,1),%r12 + vaesdec %xmm0,%xmm2,%xmm2 + cmpl 32+20(%rsp),%ecx + movq 64+40(%rsp),%rbx + vaesdec %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r13) + vaesdec %xmm0,%xmm4,%xmm4 + prefetcht0 15(%r11) + vaesdec %xmm0,%xmm5,%xmm5 + leaq (%rbx,%r13,1),%rbx + cmovgeq %rsp,%r13 + vaesdec %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm0,%xmm7,%xmm7 + subq %r13,%rbx + vaesdec %xmm0,%xmm8,%xmm8 + vmovdqu 16(%r13),%xmm11 + movq %rbx,64+40(%rsp) + vaesdec %xmm0,%xmm9,%xmm9 + vmovups 8(%rsi),%xmm0 + leaq 16(%r13,%rbx,1),%r13 + vaesdec %xmm1,%xmm2,%xmm2 + cmpl 32+24(%rsp),%ecx + movq 64+48(%rsp),%rbx + vaesdec %xmm1,%xmm3,%xmm3 + prefetcht0 31(%r14) + vaesdec %xmm1,%xmm4,%xmm4 + prefetcht0 15(%r12) + vaesdec %xmm1,%xmm5,%xmm5 + leaq (%r14,%rbx,1),%rbx + cmovgeq %rsp,%r14 + vaesdec %xmm1,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm1,%xmm7,%xmm7 + subq %r14,%rbx + vaesdec %xmm1,%xmm8,%xmm8 + vmovdqu 16(%r14),%xmm12 + movq %rbx,64+48(%rsp) + vaesdec %xmm1,%xmm9,%xmm9 + vmovups 24(%rsi),%xmm1 + leaq 16(%r14,%rbx,1),%r14 + vaesdec %xmm0,%xmm2,%xmm2 + cmpl 32+28(%rsp),%ecx + movq 64+56(%rsp),%rbx + vaesdec %xmm0,%xmm3,%xmm3 + prefetcht0 31(%r15) + vaesdec %xmm0,%xmm4,%xmm4 + prefetcht0 15(%r13) + vaesdec %xmm0,%xmm5,%xmm5 + leaq (%r15,%rbx,1),%rbx + cmovgeq %rsp,%r15 + vaesdec %xmm0,%xmm6,%xmm6 + cmovgq %rsp,%rbx + vaesdec %xmm0,%xmm7,%xmm7 + subq %r15,%rbx + vaesdec %xmm0,%xmm8,%xmm8 + vmovdqu 16(%r15),%xmm13 + movq %rbx,64+56(%rsp) + vaesdec %xmm0,%xmm9,%xmm9 + vmovups 40(%rsi),%xmm0 + leaq 16(%r15,%rbx,1),%r15 + vmovdqu 32(%rsp),%xmm14 + prefetcht0 15(%r14) + prefetcht0 15(%r15) + cmpl $11,%eax + jb .Ldec8x_tail + + vaesdec %xmm1,%xmm2,%xmm2 + vaesdec %xmm1,%xmm3,%xmm3 + vaesdec %xmm1,%xmm4,%xmm4 + vaesdec %xmm1,%xmm5,%xmm5 + vaesdec %xmm1,%xmm6,%xmm6 + vaesdec %xmm1,%xmm7,%xmm7 + 
vaesdec %xmm1,%xmm8,%xmm8 + vaesdec %xmm1,%xmm9,%xmm9 + vmovups 176-120(%rsi),%xmm1 + + vaesdec %xmm0,%xmm2,%xmm2 + vaesdec %xmm0,%xmm3,%xmm3 + vaesdec %xmm0,%xmm4,%xmm4 + vaesdec %xmm0,%xmm5,%xmm5 + vaesdec %xmm0,%xmm6,%xmm6 + vaesdec %xmm0,%xmm7,%xmm7 + vaesdec %xmm0,%xmm8,%xmm8 + vaesdec %xmm0,%xmm9,%xmm9 + vmovups 192-120(%rsi),%xmm0 + je .Ldec8x_tail + + vaesdec %xmm1,%xmm2,%xmm2 + vaesdec %xmm1,%xmm3,%xmm3 + vaesdec %xmm1,%xmm4,%xmm4 + vaesdec %xmm1,%xmm5,%xmm5 + vaesdec %xmm1,%xmm6,%xmm6 + vaesdec %xmm1,%xmm7,%xmm7 + vaesdec %xmm1,%xmm8,%xmm8 + vaesdec %xmm1,%xmm9,%xmm9 + vmovups 208-120(%rsi),%xmm1 + + vaesdec %xmm0,%xmm2,%xmm2 + vaesdec %xmm0,%xmm3,%xmm3 + vaesdec %xmm0,%xmm4,%xmm4 + vaesdec %xmm0,%xmm5,%xmm5 + vaesdec %xmm0,%xmm6,%xmm6 + vaesdec %xmm0,%xmm7,%xmm7 + vaesdec %xmm0,%xmm8,%xmm8 + vaesdec %xmm0,%xmm9,%xmm9 + vmovups 224-120(%rsi),%xmm0 + +.Ldec8x_tail: + vaesdec %xmm1,%xmm2,%xmm2 + vpxor %xmm15,%xmm15,%xmm15 + vaesdec %xmm1,%xmm3,%xmm3 + vaesdec %xmm1,%xmm4,%xmm4 + vpcmpgtd %xmm15,%xmm14,%xmm15 + vaesdec %xmm1,%xmm5,%xmm5 + vaesdec %xmm1,%xmm6,%xmm6 + vpaddd %xmm14,%xmm15,%xmm15 + vmovdqu 48(%rsp),%xmm14 + vaesdec %xmm1,%xmm7,%xmm7 + movq 64(%rsp),%rbx + vaesdec %xmm1,%xmm8,%xmm8 + vaesdec %xmm1,%xmm9,%xmm9 + vmovups 16-120(%rsi),%xmm1 + + vaesdeclast %xmm0,%xmm2,%xmm2 + vmovdqa %xmm15,32(%rsp) + vpxor %xmm15,%xmm15,%xmm15 + vaesdeclast %xmm0,%xmm3,%xmm3 + vpxor 0(%rbp),%xmm2,%xmm2 + vaesdeclast %xmm0,%xmm4,%xmm4 + vpxor 16(%rbp),%xmm3,%xmm3 + vpcmpgtd %xmm15,%xmm14,%xmm15 + vaesdeclast %xmm0,%xmm5,%xmm5 + vpxor 32(%rbp),%xmm4,%xmm4 + vaesdeclast %xmm0,%xmm6,%xmm6 + vpxor 48(%rbp),%xmm5,%xmm5 + vpaddd %xmm15,%xmm14,%xmm14 + vmovdqu -120(%rsi),%xmm15 + vaesdeclast %xmm0,%xmm7,%xmm7 + vpxor 64(%rbp),%xmm6,%xmm6 + vaesdeclast %xmm0,%xmm8,%xmm8 + vpxor 80(%rbp),%xmm7,%xmm7 + vmovdqa %xmm14,48(%rsp) + vaesdeclast %xmm0,%xmm9,%xmm9 + vpxor 96(%rbp),%xmm8,%xmm8 + vmovups 32-120(%rsi),%xmm0 + + vmovups %xmm2,-16(%r8) + subq %rbx,%r8 + vmovdqu 128+0(%rsp),%xmm2 + vpxor 112(%rbp),%xmm9,%xmm9 + vmovups %xmm3,-16(%r9) + subq 72(%rsp),%r9 + vmovdqu %xmm2,0(%rbp) + vpxor %xmm15,%xmm2,%xmm2 + vmovdqu 128+16(%rsp),%xmm3 + vmovups %xmm4,-16(%r10) + subq 80(%rsp),%r10 + vmovdqu %xmm3,16(%rbp) + vpxor %xmm15,%xmm3,%xmm3 + vmovdqu 128+32(%rsp),%xmm4 + vmovups %xmm5,-16(%r11) + subq 88(%rsp),%r11 + vmovdqu %xmm4,32(%rbp) + vpxor %xmm15,%xmm4,%xmm4 + vmovdqu 128+48(%rsp),%xmm5 + vmovups %xmm6,-16(%r12) + subq 96(%rsp),%r12 + vmovdqu %xmm5,48(%rbp) + vpxor %xmm15,%xmm5,%xmm5 + vmovdqu %xmm10,64(%rbp) + vpxor %xmm10,%xmm15,%xmm6 + vmovups %xmm7,-16(%r13) + subq 104(%rsp),%r13 + vmovdqu %xmm11,80(%rbp) + vpxor %xmm11,%xmm15,%xmm7 + vmovups %xmm8,-16(%r14) + subq 112(%rsp),%r14 + vmovdqu %xmm12,96(%rbp) + vpxor %xmm12,%xmm15,%xmm8 + vmovups %xmm9,-16(%r15) + subq 120(%rsp),%r15 + vmovdqu %xmm13,112(%rbp) + vpxor %xmm13,%xmm15,%xmm9 + + xorq $128,%rbp + decl %edx + jnz .Loop_dec8x + + movq 16(%rsp),%rax +.cfi_def_cfa %rax,8 + + + + + +.Ldec8x_done: + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Ldec8x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx Index: stable/12/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S =================================================================== --- 
stable/12/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/aesni-sha1-x86_64.S (revision 364963) @@ -1,1711 +1,3037 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from aesni-sha1-x86_64.pl. */ .text .globl aesni_cbc_sha1_enc .type aesni_cbc_sha1_enc,@function .align 32 aesni_cbc_sha1_enc: .cfi_startproc movl OPENSSL_ia32cap_P+0(%rip),%r10d movq OPENSSL_ia32cap_P+4(%rip),%r11 btq $61,%r11 jc aesni_cbc_sha1_enc_shaext + andl $268435456,%r11d + andl $1073741824,%r10d + orl %r11d,%r10d + cmpl $1342177280,%r10d + je aesni_cbc_sha1_enc_avx jmp aesni_cbc_sha1_enc_ssse3 .byte 0xf3,0xc3 .cfi_endproc .size aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc .type aesni_cbc_sha1_enc_ssse3,@function .align 32 aesni_cbc_sha1_enc_ssse3: .cfi_startproc movq 8(%rsp),%r10 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 leaq -104(%rsp),%rsp .cfi_adjust_cfa_offset 104 movq %rdi,%r12 movq %rsi,%r13 movq %rdx,%r14 leaq 112(%rcx),%r15 movdqu (%r8),%xmm2 movq %r8,88(%rsp) shlq $6,%r14 subq %r12,%r13 movl 240-112(%r15),%r8d addq %r10,%r14 leaq K_XX_XX(%rip),%r11 movl 0(%r9),%eax movl 4(%r9),%ebx movl 8(%r9),%ecx movl 12(%r9),%edx movl %ebx,%esi movl 16(%r9),%ebp movl %ecx,%edi xorl %edx,%edi andl %edi,%esi movdqa 64(%r11),%xmm3 movdqa 0(%r11),%xmm13 movdqu 0(%r10),%xmm4 movdqu 16(%r10),%xmm5 movdqu 32(%r10),%xmm6 movdqu 48(%r10),%xmm7 .byte 102,15,56,0,227 .byte 102,15,56,0,235 .byte 102,15,56,0,243 addq $64,%r10 paddd %xmm13,%xmm4 .byte 102,15,56,0,251 paddd %xmm13,%xmm5 paddd %xmm13,%xmm6 movdqa %xmm4,0(%rsp) psubd %xmm13,%xmm4 movdqa %xmm5,16(%rsp) psubd %xmm13,%xmm5 movdqa %xmm6,32(%rsp) psubd %xmm13,%xmm6 movups -112(%r15),%xmm15 movups 16-112(%r15),%xmm0 jmp .Loop_ssse3 .align 32 .Loop_ssse3: rorl $2,%ebx movups 0(%r12),%xmm14 xorps %xmm15,%xmm14 xorps %xmm14,%xmm2 movups -80(%r15),%xmm1 .byte 102,15,56,220,208 pshufd $238,%xmm4,%xmm8 xorl %edx,%esi movdqa %xmm7,%xmm12 paddd %xmm7,%xmm13 movl %eax,%edi addl 0(%rsp),%ebp punpcklqdq %xmm5,%xmm8 xorl %ecx,%ebx roll $5,%eax addl %esi,%ebp psrldq $4,%xmm12 andl %ebx,%edi xorl %ecx,%ebx pxor %xmm4,%xmm8 addl %eax,%ebp rorl $7,%eax pxor %xmm6,%xmm12 xorl %ecx,%edi movl %ebp,%esi addl 4(%rsp),%edx pxor %xmm12,%xmm8 xorl %ebx,%eax roll $5,%ebp movdqa %xmm13,48(%rsp) addl %edi,%edx movups -64(%r15),%xmm0 .byte 102,15,56,220,209 andl %eax,%esi movdqa %xmm8,%xmm3 xorl %ebx,%eax addl %ebp,%edx rorl $7,%ebp movdqa %xmm8,%xmm12 xorl %ebx,%esi pslldq $12,%xmm3 paddd %xmm8,%xmm8 movl %edx,%edi addl 8(%rsp),%ecx psrld $31,%xmm12 xorl %eax,%ebp roll $5,%edx addl %esi,%ecx movdqa %xmm3,%xmm13 andl %ebp,%edi xorl %eax,%ebp psrld $30,%xmm3 addl %edx,%ecx rorl $7,%edx por %xmm12,%xmm8 xorl %eax,%edi movl %ecx,%esi addl 12(%rsp),%ebx movups -48(%r15),%xmm1 .byte 102,15,56,220,208 pslld $2,%xmm13 pxor %xmm3,%xmm8 xorl %ebp,%edx movdqa 0(%r11),%xmm3 roll $5,%ecx addl %edi,%ebx andl %edx,%esi pxor %xmm13,%xmm8 xorl %ebp,%edx addl %ecx,%ebx rorl $7,%ecx pshufd $238,%xmm5,%xmm9 xorl %ebp,%esi movdqa %xmm8,%xmm13 paddd %xmm8,%xmm3 movl %ebx,%edi addl 16(%rsp),%eax punpcklqdq %xmm6,%xmm9 xorl %edx,%ecx roll $5,%ebx addl %esi,%eax psrldq $4,%xmm13 andl %ecx,%edi xorl %edx,%ecx pxor %xmm5,%xmm9 addl %ebx,%eax rorl $7,%ebx movups 
-32(%r15),%xmm0 .byte 102,15,56,220,209 pxor %xmm7,%xmm13 xorl %edx,%edi movl %eax,%esi addl 20(%rsp),%ebp pxor %xmm13,%xmm9 xorl %ecx,%ebx roll $5,%eax movdqa %xmm3,0(%rsp) addl %edi,%ebp andl %ebx,%esi movdqa %xmm9,%xmm12 xorl %ecx,%ebx addl %eax,%ebp rorl $7,%eax movdqa %xmm9,%xmm13 xorl %ecx,%esi pslldq $12,%xmm12 paddd %xmm9,%xmm9 movl %ebp,%edi addl 24(%rsp),%edx psrld $31,%xmm13 xorl %ebx,%eax roll $5,%ebp addl %esi,%edx movups -16(%r15),%xmm1 .byte 102,15,56,220,208 movdqa %xmm12,%xmm3 andl %eax,%edi xorl %ebx,%eax psrld $30,%xmm12 addl %ebp,%edx rorl $7,%ebp por %xmm13,%xmm9 xorl %ebx,%edi movl %edx,%esi addl 28(%rsp),%ecx pslld $2,%xmm3 pxor %xmm12,%xmm9 xorl %eax,%ebp movdqa 16(%r11),%xmm12 roll $5,%edx addl %edi,%ecx andl %ebp,%esi pxor %xmm3,%xmm9 xorl %eax,%ebp addl %edx,%ecx rorl $7,%edx pshufd $238,%xmm6,%xmm10 xorl %eax,%esi movdqa %xmm9,%xmm3 paddd %xmm9,%xmm12 movl %ecx,%edi addl 32(%rsp),%ebx movups 0(%r15),%xmm0 .byte 102,15,56,220,209 punpcklqdq %xmm7,%xmm10 xorl %ebp,%edx roll $5,%ecx addl %esi,%ebx psrldq $4,%xmm3 andl %edx,%edi xorl %ebp,%edx pxor %xmm6,%xmm10 addl %ecx,%ebx rorl $7,%ecx pxor %xmm8,%xmm3 xorl %ebp,%edi movl %ebx,%esi addl 36(%rsp),%eax pxor %xmm3,%xmm10 xorl %edx,%ecx roll $5,%ebx movdqa %xmm12,16(%rsp) addl %edi,%eax andl %ecx,%esi movdqa %xmm10,%xmm13 xorl %edx,%ecx addl %ebx,%eax rorl $7,%ebx movups 16(%r15),%xmm1 .byte 102,15,56,220,208 movdqa %xmm10,%xmm3 xorl %edx,%esi pslldq $12,%xmm13 paddd %xmm10,%xmm10 movl %eax,%edi addl 40(%rsp),%ebp psrld $31,%xmm3 xorl %ecx,%ebx roll $5,%eax addl %esi,%ebp movdqa %xmm13,%xmm12 andl %ebx,%edi xorl %ecx,%ebx psrld $30,%xmm13 addl %eax,%ebp rorl $7,%eax por %xmm3,%xmm10 xorl %ecx,%edi movl %ebp,%esi addl 44(%rsp),%edx pslld $2,%xmm12 pxor %xmm13,%xmm10 xorl %ebx,%eax movdqa 16(%r11),%xmm13 roll $5,%ebp addl %edi,%edx movups 32(%r15),%xmm0 .byte 102,15,56,220,209 andl %eax,%esi pxor %xmm12,%xmm10 xorl %ebx,%eax addl %ebp,%edx rorl $7,%ebp pshufd $238,%xmm7,%xmm11 xorl %ebx,%esi movdqa %xmm10,%xmm12 paddd %xmm10,%xmm13 movl %edx,%edi addl 48(%rsp),%ecx punpcklqdq %xmm8,%xmm11 xorl %eax,%ebp roll $5,%edx addl %esi,%ecx psrldq $4,%xmm12 andl %ebp,%edi xorl %eax,%ebp pxor %xmm7,%xmm11 addl %edx,%ecx rorl $7,%edx pxor %xmm9,%xmm12 xorl %eax,%edi movl %ecx,%esi addl 52(%rsp),%ebx movups 48(%r15),%xmm1 .byte 102,15,56,220,208 pxor %xmm12,%xmm11 xorl %ebp,%edx roll $5,%ecx movdqa %xmm13,32(%rsp) addl %edi,%ebx andl %edx,%esi movdqa %xmm11,%xmm3 xorl %ebp,%edx addl %ecx,%ebx rorl $7,%ecx movdqa %xmm11,%xmm12 xorl %ebp,%esi pslldq $12,%xmm3 paddd %xmm11,%xmm11 movl %ebx,%edi addl 56(%rsp),%eax psrld $31,%xmm12 xorl %edx,%ecx roll $5,%ebx addl %esi,%eax movdqa %xmm3,%xmm13 andl %ecx,%edi xorl %edx,%ecx psrld $30,%xmm3 addl %ebx,%eax rorl $7,%ebx cmpl $11,%r8d jb .Laesenclast1 movups 64(%r15),%xmm0 .byte 102,15,56,220,209 movups 80(%r15),%xmm1 .byte 102,15,56,220,208 je .Laesenclast1 movups 96(%r15),%xmm0 .byte 102,15,56,220,209 movups 112(%r15),%xmm1 .byte 102,15,56,220,208 .Laesenclast1: .byte 102,15,56,221,209 movups 16-112(%r15),%xmm0 por %xmm12,%xmm11 xorl %edx,%edi movl %eax,%esi addl 60(%rsp),%ebp pslld $2,%xmm13 pxor %xmm3,%xmm11 xorl %ecx,%ebx movdqa 16(%r11),%xmm3 roll $5,%eax addl %edi,%ebp andl %ebx,%esi pxor %xmm13,%xmm11 pshufd $238,%xmm10,%xmm13 xorl %ecx,%ebx addl %eax,%ebp rorl $7,%eax pxor %xmm8,%xmm4 xorl %ecx,%esi movl %ebp,%edi addl 0(%rsp),%edx punpcklqdq %xmm11,%xmm13 xorl %ebx,%eax roll $5,%ebp pxor %xmm5,%xmm4 addl %esi,%edx movups 16(%r12),%xmm14 xorps %xmm15,%xmm14 movups 
%xmm2,0(%r12,%r13,1) xorps %xmm14,%xmm2 movups -80(%r15),%xmm1 .byte 102,15,56,220,208 andl %eax,%edi movdqa %xmm3,%xmm12 xorl %ebx,%eax paddd %xmm11,%xmm3 addl %ebp,%edx pxor %xmm13,%xmm4 rorl $7,%ebp xorl %ebx,%edi movl %edx,%esi addl 4(%rsp),%ecx movdqa %xmm4,%xmm13 xorl %eax,%ebp roll $5,%edx movdqa %xmm3,48(%rsp) addl %edi,%ecx andl %ebp,%esi xorl %eax,%ebp pslld $2,%xmm4 addl %edx,%ecx rorl $7,%edx psrld $30,%xmm13 xorl %eax,%esi movl %ecx,%edi addl 8(%rsp),%ebx movups -64(%r15),%xmm0 .byte 102,15,56,220,209 por %xmm13,%xmm4 xorl %ebp,%edx roll $5,%ecx pshufd $238,%xmm11,%xmm3 addl %esi,%ebx andl %edx,%edi xorl %ebp,%edx addl %ecx,%ebx addl 12(%rsp),%eax xorl %ebp,%edi movl %ebx,%esi roll $5,%ebx addl %edi,%eax xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax pxor %xmm9,%xmm5 addl 16(%rsp),%ebp movups -48(%r15),%xmm1 .byte 102,15,56,220,208 xorl %ecx,%esi punpcklqdq %xmm4,%xmm3 movl %eax,%edi roll $5,%eax pxor %xmm6,%xmm5 addl %esi,%ebp xorl %ecx,%edi movdqa %xmm12,%xmm13 rorl $7,%ebx paddd %xmm4,%xmm12 addl %eax,%ebp pxor %xmm3,%xmm5 addl 20(%rsp),%edx xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp movdqa %xmm5,%xmm3 addl %edi,%edx xorl %ebx,%esi movdqa %xmm12,0(%rsp) rorl $7,%eax addl %ebp,%edx addl 24(%rsp),%ecx pslld $2,%xmm5 xorl %eax,%esi movl %edx,%edi psrld $30,%xmm3 roll $5,%edx addl %esi,%ecx movups -32(%r15),%xmm0 .byte 102,15,56,220,209 xorl %eax,%edi rorl $7,%ebp por %xmm3,%xmm5 addl %edx,%ecx addl 28(%rsp),%ebx pshufd $238,%xmm4,%xmm12 xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx addl %ecx,%ebx pxor %xmm10,%xmm6 addl 32(%rsp),%eax xorl %edx,%esi punpcklqdq %xmm5,%xmm12 movl %ebx,%edi roll $5,%ebx pxor %xmm7,%xmm6 addl %esi,%eax xorl %edx,%edi movdqa 32(%r11),%xmm3 rorl $7,%ecx paddd %xmm5,%xmm13 addl %ebx,%eax pxor %xmm12,%xmm6 addl 36(%rsp),%ebp movups -16(%r15),%xmm1 .byte 102,15,56,220,208 xorl %ecx,%edi movl %eax,%esi roll $5,%eax movdqa %xmm6,%xmm12 addl %edi,%ebp xorl %ecx,%esi movdqa %xmm13,16(%rsp) rorl $7,%ebx addl %eax,%ebp addl 40(%rsp),%edx pslld $2,%xmm6 xorl %ebx,%esi movl %ebp,%edi psrld $30,%xmm12 roll $5,%ebp addl %esi,%edx xorl %ebx,%edi rorl $7,%eax por %xmm12,%xmm6 addl %ebp,%edx addl 44(%rsp),%ecx pshufd $238,%xmm5,%xmm13 xorl %eax,%edi movl %edx,%esi roll $5,%edx addl %edi,%ecx movups 0(%r15),%xmm0 .byte 102,15,56,220,209 xorl %eax,%esi rorl $7,%ebp addl %edx,%ecx pxor %xmm11,%xmm7 addl 48(%rsp),%ebx xorl %ebp,%esi punpcklqdq %xmm6,%xmm13 movl %ecx,%edi roll $5,%ecx pxor %xmm8,%xmm7 addl %esi,%ebx xorl %ebp,%edi movdqa %xmm3,%xmm12 rorl $7,%edx paddd %xmm6,%xmm3 addl %ecx,%ebx pxor %xmm13,%xmm7 addl 52(%rsp),%eax xorl %edx,%edi movl %ebx,%esi roll $5,%ebx movdqa %xmm7,%xmm13 addl %edi,%eax xorl %edx,%esi movdqa %xmm3,32(%rsp) rorl $7,%ecx addl %ebx,%eax addl 56(%rsp),%ebp movups 16(%r15),%xmm1 .byte 102,15,56,220,208 pslld $2,%xmm7 xorl %ecx,%esi movl %eax,%edi psrld $30,%xmm13 roll $5,%eax addl %esi,%ebp xorl %ecx,%edi rorl $7,%ebx por %xmm13,%xmm7 addl %eax,%ebp addl 60(%rsp),%edx pshufd $238,%xmm6,%xmm3 xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp addl %edi,%edx xorl %ebx,%esi rorl $7,%eax addl %ebp,%edx pxor %xmm4,%xmm8 addl 0(%rsp),%ecx xorl %eax,%esi punpcklqdq %xmm7,%xmm3 movl %edx,%edi roll $5,%edx pxor %xmm9,%xmm8 addl %esi,%ecx movups 32(%r15),%xmm0 .byte 102,15,56,220,209 xorl %eax,%edi movdqa %xmm12,%xmm13 rorl $7,%ebp paddd %xmm7,%xmm12 addl %edx,%ecx pxor %xmm3,%xmm8 addl 4(%rsp),%ebx xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx movdqa %xmm8,%xmm3 addl %edi,%ebx xorl %ebp,%esi movdqa %xmm12,48(%rsp) rorl 
$7,%edx addl %ecx,%ebx addl 8(%rsp),%eax pslld $2,%xmm8 xorl %edx,%esi movl %ebx,%edi psrld $30,%xmm3 roll $5,%ebx addl %esi,%eax xorl %edx,%edi rorl $7,%ecx por %xmm3,%xmm8 addl %ebx,%eax addl 12(%rsp),%ebp movups 48(%r15),%xmm1 .byte 102,15,56,220,208 pshufd $238,%xmm7,%xmm12 xorl %ecx,%edi movl %eax,%esi roll $5,%eax addl %edi,%ebp xorl %ecx,%esi rorl $7,%ebx addl %eax,%ebp pxor %xmm5,%xmm9 addl 16(%rsp),%edx xorl %ebx,%esi punpcklqdq %xmm8,%xmm12 movl %ebp,%edi roll $5,%ebp pxor %xmm10,%xmm9 addl %esi,%edx xorl %ebx,%edi movdqa %xmm13,%xmm3 rorl $7,%eax paddd %xmm8,%xmm13 addl %ebp,%edx pxor %xmm12,%xmm9 addl 20(%rsp),%ecx xorl %eax,%edi movl %edx,%esi roll $5,%edx movdqa %xmm9,%xmm12 addl %edi,%ecx cmpl $11,%r8d jb .Laesenclast2 movups 64(%r15),%xmm0 .byte 102,15,56,220,209 movups 80(%r15),%xmm1 .byte 102,15,56,220,208 je .Laesenclast2 movups 96(%r15),%xmm0 .byte 102,15,56,220,209 movups 112(%r15),%xmm1 .byte 102,15,56,220,208 .Laesenclast2: .byte 102,15,56,221,209 movups 16-112(%r15),%xmm0 xorl %eax,%esi movdqa %xmm13,0(%rsp) rorl $7,%ebp addl %edx,%ecx addl 24(%rsp),%ebx pslld $2,%xmm9 xorl %ebp,%esi movl %ecx,%edi psrld $30,%xmm12 roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx por %xmm12,%xmm9 addl %ecx,%ebx addl 28(%rsp),%eax pshufd $238,%xmm8,%xmm13 rorl $7,%ecx movl %ebx,%esi xorl %edx,%edi roll $5,%ebx addl %edi,%eax xorl %ecx,%esi xorl %edx,%ecx addl %ebx,%eax pxor %xmm6,%xmm10 addl 32(%rsp),%ebp movups 32(%r12),%xmm14 xorps %xmm15,%xmm14 movups %xmm2,16(%r13,%r12,1) xorps %xmm14,%xmm2 movups -80(%r15),%xmm1 .byte 102,15,56,220,208 andl %ecx,%esi xorl %edx,%ecx rorl $7,%ebx punpcklqdq %xmm9,%xmm13 movl %eax,%edi xorl %ecx,%esi pxor %xmm11,%xmm10 roll $5,%eax addl %esi,%ebp movdqa %xmm3,%xmm12 xorl %ebx,%edi paddd %xmm9,%xmm3 xorl %ecx,%ebx pxor %xmm13,%xmm10 addl %eax,%ebp addl 36(%rsp),%edx andl %ebx,%edi xorl %ecx,%ebx rorl $7,%eax movdqa %xmm10,%xmm13 movl %ebp,%esi xorl %ebx,%edi movdqa %xmm3,16(%rsp) roll $5,%ebp addl %edi,%edx movups -64(%r15),%xmm0 .byte 102,15,56,220,209 xorl %eax,%esi pslld $2,%xmm10 xorl %ebx,%eax addl %ebp,%edx psrld $30,%xmm13 addl 40(%rsp),%ecx andl %eax,%esi xorl %ebx,%eax por %xmm13,%xmm10 rorl $7,%ebp movl %edx,%edi xorl %eax,%esi roll $5,%edx pshufd $238,%xmm9,%xmm3 addl %esi,%ecx xorl %ebp,%edi xorl %eax,%ebp addl %edx,%ecx addl 44(%rsp),%ebx andl %ebp,%edi xorl %eax,%ebp rorl $7,%edx movups -48(%r15),%xmm1 .byte 102,15,56,220,208 movl %ecx,%esi xorl %ebp,%edi roll $5,%ecx addl %edi,%ebx xorl %edx,%esi xorl %ebp,%edx addl %ecx,%ebx pxor %xmm7,%xmm11 addl 48(%rsp),%eax andl %edx,%esi xorl %ebp,%edx rorl $7,%ecx punpcklqdq %xmm10,%xmm3 movl %ebx,%edi xorl %edx,%esi pxor %xmm4,%xmm11 roll $5,%ebx addl %esi,%eax movdqa 48(%r11),%xmm13 xorl %ecx,%edi paddd %xmm10,%xmm12 xorl %edx,%ecx pxor %xmm3,%xmm11 addl %ebx,%eax addl 52(%rsp),%ebp movups -32(%r15),%xmm0 .byte 102,15,56,220,209 andl %ecx,%edi xorl %edx,%ecx rorl $7,%ebx movdqa %xmm11,%xmm3 movl %eax,%esi xorl %ecx,%edi movdqa %xmm12,32(%rsp) roll $5,%eax addl %edi,%ebp xorl %ebx,%esi pslld $2,%xmm11 xorl %ecx,%ebx addl %eax,%ebp psrld $30,%xmm3 addl 56(%rsp),%edx andl %ebx,%esi xorl %ecx,%ebx por %xmm3,%xmm11 rorl $7,%eax movl %ebp,%edi xorl %ebx,%esi roll $5,%ebp pshufd $238,%xmm10,%xmm12 addl %esi,%edx movups -16(%r15),%xmm1 .byte 102,15,56,220,208 xorl %eax,%edi xorl %ebx,%eax addl %ebp,%edx addl 60(%rsp),%ecx andl %eax,%edi xorl %ebx,%eax rorl $7,%ebp movl %edx,%esi xorl %eax,%edi roll $5,%edx addl %edi,%ecx xorl %ebp,%esi xorl %eax,%ebp addl %edx,%ecx pxor %xmm8,%xmm4 addl 
0(%rsp),%ebx andl %ebp,%esi xorl %eax,%ebp rorl $7,%edx movups 0(%r15),%xmm0 .byte 102,15,56,220,209 punpcklqdq %xmm11,%xmm12 movl %ecx,%edi xorl %ebp,%esi pxor %xmm5,%xmm4 roll $5,%ecx addl %esi,%ebx movdqa %xmm13,%xmm3 xorl %edx,%edi paddd %xmm11,%xmm13 xorl %ebp,%edx pxor %xmm12,%xmm4 addl %ecx,%ebx addl 4(%rsp),%eax andl %edx,%edi xorl %ebp,%edx rorl $7,%ecx movdqa %xmm4,%xmm12 movl %ebx,%esi xorl %edx,%edi movdqa %xmm13,48(%rsp) roll $5,%ebx addl %edi,%eax xorl %ecx,%esi pslld $2,%xmm4 xorl %edx,%ecx addl %ebx,%eax psrld $30,%xmm12 addl 8(%rsp),%ebp movups 16(%r15),%xmm1 .byte 102,15,56,220,208 andl %ecx,%esi xorl %edx,%ecx por %xmm12,%xmm4 rorl $7,%ebx movl %eax,%edi xorl %ecx,%esi roll $5,%eax pshufd $238,%xmm11,%xmm13 addl %esi,%ebp xorl %ebx,%edi xorl %ecx,%ebx addl %eax,%ebp addl 12(%rsp),%edx andl %ebx,%edi xorl %ecx,%ebx rorl $7,%eax movl %ebp,%esi xorl %ebx,%edi roll $5,%ebp addl %edi,%edx movups 32(%r15),%xmm0 .byte 102,15,56,220,209 xorl %eax,%esi xorl %ebx,%eax addl %ebp,%edx pxor %xmm9,%xmm5 addl 16(%rsp),%ecx andl %eax,%esi xorl %ebx,%eax rorl $7,%ebp punpcklqdq %xmm4,%xmm13 movl %edx,%edi xorl %eax,%esi pxor %xmm6,%xmm5 roll $5,%edx addl %esi,%ecx movdqa %xmm3,%xmm12 xorl %ebp,%edi paddd %xmm4,%xmm3 xorl %eax,%ebp pxor %xmm13,%xmm5 addl %edx,%ecx addl 20(%rsp),%ebx andl %ebp,%edi xorl %eax,%ebp rorl $7,%edx movups 48(%r15),%xmm1 .byte 102,15,56,220,208 movdqa %xmm5,%xmm13 movl %ecx,%esi xorl %ebp,%edi movdqa %xmm3,0(%rsp) roll $5,%ecx addl %edi,%ebx xorl %edx,%esi pslld $2,%xmm5 xorl %ebp,%edx addl %ecx,%ebx psrld $30,%xmm13 addl 24(%rsp),%eax andl %edx,%esi xorl %ebp,%edx por %xmm13,%xmm5 rorl $7,%ecx movl %ebx,%edi xorl %edx,%esi roll $5,%ebx pshufd $238,%xmm4,%xmm3 addl %esi,%eax xorl %ecx,%edi xorl %edx,%ecx addl %ebx,%eax addl 28(%rsp),%ebp cmpl $11,%r8d jb .Laesenclast3 movups 64(%r15),%xmm0 .byte 102,15,56,220,209 movups 80(%r15),%xmm1 .byte 102,15,56,220,208 je .Laesenclast3 movups 96(%r15),%xmm0 .byte 102,15,56,220,209 movups 112(%r15),%xmm1 .byte 102,15,56,220,208 .Laesenclast3: .byte 102,15,56,221,209 movups 16-112(%r15),%xmm0 andl %ecx,%edi xorl %edx,%ecx rorl $7,%ebx movl %eax,%esi xorl %ecx,%edi roll $5,%eax addl %edi,%ebp xorl %ebx,%esi xorl %ecx,%ebx addl %eax,%ebp pxor %xmm10,%xmm6 addl 32(%rsp),%edx andl %ebx,%esi xorl %ecx,%ebx rorl $7,%eax punpcklqdq %xmm5,%xmm3 movl %ebp,%edi xorl %ebx,%esi pxor %xmm7,%xmm6 roll $5,%ebp addl %esi,%edx movups 48(%r12),%xmm14 xorps %xmm15,%xmm14 movups %xmm2,32(%r13,%r12,1) xorps %xmm14,%xmm2 movups -80(%r15),%xmm1 .byte 102,15,56,220,208 movdqa %xmm12,%xmm13 xorl %eax,%edi paddd %xmm5,%xmm12 xorl %ebx,%eax pxor %xmm3,%xmm6 addl %ebp,%edx addl 36(%rsp),%ecx andl %eax,%edi xorl %ebx,%eax rorl $7,%ebp movdqa %xmm6,%xmm3 movl %edx,%esi xorl %eax,%edi movdqa %xmm12,16(%rsp) roll $5,%edx addl %edi,%ecx xorl %ebp,%esi pslld $2,%xmm6 xorl %eax,%ebp addl %edx,%ecx psrld $30,%xmm3 addl 40(%rsp),%ebx andl %ebp,%esi xorl %eax,%ebp por %xmm3,%xmm6 rorl $7,%edx movups -64(%r15),%xmm0 .byte 102,15,56,220,209 movl %ecx,%edi xorl %ebp,%esi roll $5,%ecx pshufd $238,%xmm5,%xmm12 addl %esi,%ebx xorl %edx,%edi xorl %ebp,%edx addl %ecx,%ebx addl 44(%rsp),%eax andl %edx,%edi xorl %ebp,%edx rorl $7,%ecx movl %ebx,%esi xorl %edx,%edi roll $5,%ebx addl %edi,%eax xorl %edx,%esi addl %ebx,%eax pxor %xmm11,%xmm7 addl 48(%rsp),%ebp movups -48(%r15),%xmm1 .byte 102,15,56,220,208 xorl %ecx,%esi punpcklqdq %xmm6,%xmm12 movl %eax,%edi roll $5,%eax pxor %xmm8,%xmm7 addl %esi,%ebp xorl %ecx,%edi movdqa %xmm13,%xmm3 rorl $7,%ebx paddd %xmm6,%xmm13 addl 
%eax,%ebp pxor %xmm12,%xmm7 addl 52(%rsp),%edx xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp movdqa %xmm7,%xmm12 addl %edi,%edx xorl %ebx,%esi movdqa %xmm13,32(%rsp) rorl $7,%eax addl %ebp,%edx addl 56(%rsp),%ecx pslld $2,%xmm7 xorl %eax,%esi movl %edx,%edi psrld $30,%xmm12 roll $5,%edx addl %esi,%ecx movups -32(%r15),%xmm0 .byte 102,15,56,220,209 xorl %eax,%edi rorl $7,%ebp por %xmm12,%xmm7 addl %edx,%ecx addl 60(%rsp),%ebx xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx addl %ecx,%ebx addl 0(%rsp),%eax xorl %edx,%esi movl %ebx,%edi roll $5,%ebx paddd %xmm7,%xmm3 addl %esi,%eax xorl %edx,%edi movdqa %xmm3,48(%rsp) rorl $7,%ecx addl %ebx,%eax addl 4(%rsp),%ebp movups -16(%r15),%xmm1 .byte 102,15,56,220,208 xorl %ecx,%edi movl %eax,%esi roll $5,%eax addl %edi,%ebp xorl %ecx,%esi rorl $7,%ebx addl %eax,%ebp addl 8(%rsp),%edx xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp addl %esi,%edx xorl %ebx,%edi rorl $7,%eax addl %ebp,%edx addl 12(%rsp),%ecx xorl %eax,%edi movl %edx,%esi roll $5,%edx addl %edi,%ecx movups 0(%r15),%xmm0 .byte 102,15,56,220,209 xorl %eax,%esi rorl $7,%ebp addl %edx,%ecx cmpq %r14,%r10 je .Ldone_ssse3 movdqa 64(%r11),%xmm3 movdqa 0(%r11),%xmm13 movdqu 0(%r10),%xmm4 movdqu 16(%r10),%xmm5 movdqu 32(%r10),%xmm6 movdqu 48(%r10),%xmm7 .byte 102,15,56,0,227 addq $64,%r10 addl 16(%rsp),%ebx xorl %ebp,%esi movl %ecx,%edi .byte 102,15,56,0,235 roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx paddd %xmm13,%xmm4 addl %ecx,%ebx addl 20(%rsp),%eax xorl %edx,%edi movl %ebx,%esi movdqa %xmm4,0(%rsp) roll $5,%ebx addl %edi,%eax xorl %edx,%esi rorl $7,%ecx psubd %xmm13,%xmm4 addl %ebx,%eax addl 24(%rsp),%ebp movups 16(%r15),%xmm1 .byte 102,15,56,220,208 xorl %ecx,%esi movl %eax,%edi roll $5,%eax addl %esi,%ebp xorl %ecx,%edi rorl $7,%ebx addl %eax,%ebp addl 28(%rsp),%edx xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp addl %edi,%edx xorl %ebx,%esi rorl $7,%eax addl %ebp,%edx addl 32(%rsp),%ecx xorl %eax,%esi movl %edx,%edi .byte 102,15,56,0,243 roll $5,%edx addl %esi,%ecx movups 32(%r15),%xmm0 .byte 102,15,56,220,209 xorl %eax,%edi rorl $7,%ebp paddd %xmm13,%xmm5 addl %edx,%ecx addl 36(%rsp),%ebx xorl %ebp,%edi movl %ecx,%esi movdqa %xmm5,16(%rsp) roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx psubd %xmm13,%xmm5 addl %ecx,%ebx addl 40(%rsp),%eax xorl %edx,%esi movl %ebx,%edi roll $5,%ebx addl %esi,%eax xorl %edx,%edi rorl $7,%ecx addl %ebx,%eax addl 44(%rsp),%ebp movups 48(%r15),%xmm1 .byte 102,15,56,220,208 xorl %ecx,%edi movl %eax,%esi roll $5,%eax addl %edi,%ebp xorl %ecx,%esi rorl $7,%ebx addl %eax,%ebp addl 48(%rsp),%edx xorl %ebx,%esi movl %ebp,%edi .byte 102,15,56,0,251 roll $5,%ebp addl %esi,%edx xorl %ebx,%edi rorl $7,%eax paddd %xmm13,%xmm6 addl %ebp,%edx addl 52(%rsp),%ecx xorl %eax,%edi movl %edx,%esi movdqa %xmm6,32(%rsp) roll $5,%edx addl %edi,%ecx cmpl $11,%r8d jb .Laesenclast4 movups 64(%r15),%xmm0 .byte 102,15,56,220,209 movups 80(%r15),%xmm1 .byte 102,15,56,220,208 je .Laesenclast4 movups 96(%r15),%xmm0 .byte 102,15,56,220,209 movups 112(%r15),%xmm1 .byte 102,15,56,220,208 .Laesenclast4: .byte 102,15,56,221,209 movups 16-112(%r15),%xmm0 xorl %eax,%esi rorl $7,%ebp psubd %xmm13,%xmm6 addl %edx,%ecx addl 56(%rsp),%ebx xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx addl %ecx,%ebx addl 60(%rsp),%eax xorl %edx,%edi movl %ebx,%esi roll $5,%ebx addl %edi,%eax rorl $7,%ecx addl %ebx,%eax movups %xmm2,48(%r13,%r12,1) leaq 64(%r12),%r12 addl 0(%r9),%eax addl 4(%r9),%esi addl 8(%r9),%ecx addl 
12(%r9),%edx movl %eax,0(%r9) addl 16(%r9),%ebp movl %esi,4(%r9) movl %esi,%ebx movl %ecx,8(%r9) movl %ecx,%edi movl %edx,12(%r9) xorl %edx,%edi movl %ebp,16(%r9) andl %edi,%esi jmp .Loop_ssse3 .Ldone_ssse3: addl 16(%rsp),%ebx xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx addl %ecx,%ebx addl 20(%rsp),%eax xorl %edx,%edi movl %ebx,%esi roll $5,%ebx addl %edi,%eax xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax addl 24(%rsp),%ebp movups 16(%r15),%xmm1 .byte 102,15,56,220,208 xorl %ecx,%esi movl %eax,%edi roll $5,%eax addl %esi,%ebp xorl %ecx,%edi rorl $7,%ebx addl %eax,%ebp addl 28(%rsp),%edx xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp addl %edi,%edx xorl %ebx,%esi rorl $7,%eax addl %ebp,%edx addl 32(%rsp),%ecx xorl %eax,%esi movl %edx,%edi roll $5,%edx addl %esi,%ecx movups 32(%r15),%xmm0 .byte 102,15,56,220,209 xorl %eax,%edi rorl $7,%ebp addl %edx,%ecx addl 36(%rsp),%ebx xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx addl %ecx,%ebx addl 40(%rsp),%eax xorl %edx,%esi movl %ebx,%edi roll $5,%ebx addl %esi,%eax xorl %edx,%edi rorl $7,%ecx addl %ebx,%eax addl 44(%rsp),%ebp movups 48(%r15),%xmm1 .byte 102,15,56,220,208 xorl %ecx,%edi movl %eax,%esi roll $5,%eax addl %edi,%ebp xorl %ecx,%esi rorl $7,%ebx addl %eax,%ebp addl 48(%rsp),%edx xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp addl %esi,%edx xorl %ebx,%edi rorl $7,%eax addl %ebp,%edx addl 52(%rsp),%ecx xorl %eax,%edi movl %edx,%esi roll $5,%edx addl %edi,%ecx cmpl $11,%r8d jb .Laesenclast5 movups 64(%r15),%xmm0 .byte 102,15,56,220,209 movups 80(%r15),%xmm1 .byte 102,15,56,220,208 je .Laesenclast5 movups 96(%r15),%xmm0 .byte 102,15,56,220,209 movups 112(%r15),%xmm1 .byte 102,15,56,220,208 .Laesenclast5: .byte 102,15,56,221,209 movups 16-112(%r15),%xmm0 xorl %eax,%esi rorl $7,%ebp addl %edx,%ecx addl 56(%rsp),%ebx xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx addl %ecx,%ebx addl 60(%rsp),%eax xorl %edx,%edi movl %ebx,%esi roll $5,%ebx addl %edi,%eax rorl $7,%ecx addl %ebx,%eax movups %xmm2,48(%r13,%r12,1) movq 88(%rsp),%r8 addl 0(%r9),%eax addl 4(%r9),%esi addl 8(%r9),%ecx movl %eax,0(%r9) addl 12(%r9),%edx movl %esi,4(%r9) addl 16(%r9),%ebp movl %ecx,8(%r9) movl %edx,12(%r9) movl %ebp,16(%r9) movups %xmm2,(%r8) leaq 104(%rsp),%rsi .cfi_def_cfa %rsi,56 movq 0(%rsi),%r15 .cfi_restore %r15 movq 8(%rsi),%r14 .cfi_restore %r14 movq 16(%rsi),%r13 .cfi_restore %r13 movq 24(%rsi),%r12 .cfi_restore %r12 movq 32(%rsi),%rbp .cfi_restore %rbp movq 40(%rsi),%rbx .cfi_restore %rbx leaq 48(%rsi),%rsp .cfi_def_cfa %rsp,8 .Lepilogue_ssse3: .byte 0xf3,0xc3 .cfi_endproc .size aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3 +.type aesni_cbc_sha1_enc_avx,@function +.align 32 +aesni_cbc_sha1_enc_avx: +.cfi_startproc + movq 8(%rsp),%r10 + + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + leaq -104(%rsp),%rsp +.cfi_adjust_cfa_offset 104 + + + vzeroall + movq %rdi,%r12 + movq %rsi,%r13 + movq %rdx,%r14 + leaq 112(%rcx),%r15 + vmovdqu (%r8),%xmm12 + movq %r8,88(%rsp) + shlq $6,%r14 + subq %r12,%r13 + movl 240-112(%r15),%r8d + addq %r10,%r14 + + leaq K_XX_XX(%rip),%r11 + movl 0(%r9),%eax + movl 4(%r9),%ebx + movl 8(%r9),%ecx + movl 
12(%r9),%edx + movl %ebx,%esi + movl 16(%r9),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + vmovdqa 64(%r11),%xmm6 + vmovdqa 0(%r11),%xmm10 + vmovdqu 0(%r10),%xmm0 + vmovdqu 16(%r10),%xmm1 + vmovdqu 32(%r10),%xmm2 + vmovdqu 48(%r10),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r10 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm10,%xmm0,%xmm4 + vpaddd %xmm10,%xmm1,%xmm5 + vpaddd %xmm10,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + vmovups -112(%r15),%xmm15 + vmovups 16-112(%r15),%xmm14 + jmp .Loop_avx +.align 32 +.Loop_avx: + shrdl $2,%ebx,%ebx + vmovdqu 0(%r12),%xmm13 + vpxor %xmm15,%xmm13,%xmm13 + vpxor %xmm13,%xmm12,%xmm12 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -80(%r15),%xmm15 + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm10,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -64(%r15),%xmm14 + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm9 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpor %xmm8,%xmm4,%xmm4 + vpsrld $30,%xmm9,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm9,%xmm9 + vpxor %xmm8,%xmm4,%xmm4 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -48(%r15),%xmm15 + vpxor %xmm9,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + addl 16(%rsp),%eax + vpaddd %xmm4,%xmm10,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -32(%r15),%xmm14 + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm9 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpor %xmm8,%xmm5,%xmm5 + vpsrld $30,%xmm9,%xmm8 + addl %esi,%edx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -16(%r15),%xmm15 + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm9,%xmm9 + vpxor %xmm8,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm9,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa 16(%r11),%xmm10 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%edi + addl 32(%rsp),%ebx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 0(%r15),%xmm14 + vpaddd %xmm5,%xmm10,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl 
%edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 16(%r15),%xmm15 + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm9 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpor %xmm8,%xmm6,%xmm6 + vpsrld $30,%xmm9,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm9,%xmm9 + vpxor %xmm8,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm9,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 32(%r15),%xmm14 + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm10,%xmm9 + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 48(%r15),%xmm15 + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm9 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpor %xmm8,%xmm7,%xmm7 + vpsrld $30,%xmm9,%xmm8 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + cmpl $11,%r8d + jb .Lvaesenclast6 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 64(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 80(%r15),%xmm15 + je .Lvaesenclast6 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 96(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 112(%r15),%xmm15 +.Lvaesenclast6: + vaesenclast %xmm15,%xmm12,%xmm12 + vmovups -112(%r15),%xmm15 + vmovups 16-112(%r15),%xmm14 + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm9,%xmm7,%xmm7 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm10,%xmm9 + addl %esi,%edx + vmovdqu 16(%r12),%xmm13 + vpxor %xmm15,%xmm13,%xmm13 + vmovups %xmm12,0(%r12,%r13,1) + vpxor %xmm13,%xmm12,%xmm12 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -80(%r15),%xmm15 + andl %eax,%edi + vpxor %xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -64(%r15),%xmm14 + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl 
%edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -48(%r15),%xmm15 + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm10,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -32(%r15),%xmm14 + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm10,%xmm9 + vmovdqa 32(%r11),%xmm10 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -16(%r15),%xmm15 + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 0(%r15),%xmm14 + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm10,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 16(%r15),%xmm15 + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 32(%r15),%xmm14 + xorl %eax,%edi + vpaddd %xmm3,%xmm10,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 
+ addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 48(%r15),%xmm15 + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm10,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + cmpl $11,%r8d + jb .Lvaesenclast7 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 64(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 80(%r15),%xmm15 + je .Lvaesenclast7 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 96(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 112(%r15),%xmm15 +.Lvaesenclast7: + vaesenclast %xmm15,%xmm12,%xmm12 + vmovups -112(%r15),%xmm15 + vmovups 16-112(%r15),%xmm14 + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + vmovdqu 32(%r12),%xmm13 + vpxor %xmm15,%xmm13,%xmm13 + vmovups %xmm12,16(%r13,%r12,1) + vpxor %xmm13,%xmm12,%xmm12 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -80(%r15),%xmm15 + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm10,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -64(%r15),%xmm14 + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -48(%r15),%xmm15 + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm10,%xmm9 + vmovdqa 48(%r11),%xmm10 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -32(%r15),%xmm14 + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl 
$5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -16(%r15),%xmm15 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 0(%r15),%xmm14 + vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm10,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 16(%r15),%xmm15 + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 32(%r15),%xmm14 + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor %xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm10,%xmm9 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 48(%r15),%xmm15 + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + cmpl $11,%r8d + jb .Lvaesenclast8 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 64(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 80(%r15),%xmm15 + je .Lvaesenclast8 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 96(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 112(%r15),%xmm15 +.Lvaesenclast8: + vaesenclast %xmm15,%xmm12,%xmm12 + vmovups -112(%r15),%xmm15 + vmovups 16-112(%r15),%xmm14 + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm10,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vmovdqu 48(%r12),%xmm13 + vpxor 
%xmm15,%xmm13,%xmm13 + vmovups %xmm12,32(%r13,%r12,1) + vpxor %xmm13,%xmm12,%xmm12 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -80(%r15),%xmm15 + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -64(%r15),%xmm14 + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -48(%r15),%xmm15 + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm10,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups -32(%r15),%xmm14 + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm10,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups -16(%r15),%xmm15 + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 0(%r15),%xmm14 + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r14,%r10 + je .Ldone_avx + vmovdqa 64(%r11),%xmm9 + vmovdqa 0(%r11),%xmm10 + vmovdqu 0(%r10),%xmm0 + vmovdqu 16(%r10),%xmm1 + vmovdqu 32(%r10),%xmm2 + vmovdqu 48(%r10),%xmm3 + vpshufb %xmm9,%xmm0,%xmm0 + addq $64,%r10 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb %xmm9,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm10,%xmm0,%xmm8 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm8,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 16(%r15),%xmm15 + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl 
%edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm9,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm10,%xmm1,%xmm8 + addl %esi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 32(%r15),%xmm14 + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm8,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 48(%r15),%xmm15 + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm9,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm10,%xmm2,%xmm8 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm8,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + cmpl $11,%r8d + jb .Lvaesenclast9 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 64(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 80(%r15),%xmm15 + je .Lvaesenclast9 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 96(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 112(%r15),%xmm15 +.Lvaesenclast9: + vaesenclast %xmm15,%xmm12,%xmm12 + vmovups -112(%r15),%xmm15 + vmovups 16-112(%r15),%xmm14 + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vmovups %xmm12,48(%r13,%r12,1) + leaq 64(%r12),%r12 + + addl 0(%r9),%eax + addl 4(%r9),%esi + addl 8(%r9),%ecx + addl 12(%r9),%edx + movl %eax,0(%r9) + addl 16(%r9),%ebp + movl %esi,4(%r9) + movl %esi,%ebx + movl %ecx,8(%r9) + movl %ecx,%edi + movl %edx,12(%r9) + xorl %edx,%edi + movl %ebp,16(%r9) + andl %edi,%esi + jmp .Loop_avx + +.Ldone_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 16(%r15),%xmm15 + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 32(%r15),%xmm14 + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 48(%r15),%xmm15 + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + 
xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + cmpl $11,%r8d + jb .Lvaesenclast10 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 64(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 80(%r15),%xmm15 + je .Lvaesenclast10 + vaesenc %xmm15,%xmm12,%xmm12 + vmovups 96(%r15),%xmm14 + vaesenc %xmm14,%xmm12,%xmm12 + vmovups 112(%r15),%xmm15 +.Lvaesenclast10: + vaesenclast %xmm15,%xmm12,%xmm12 + vmovups -112(%r15),%xmm15 + vmovups 16-112(%r15),%xmm14 + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vmovups %xmm12,48(%r13,%r12,1) + movq 88(%rsp),%r8 + + addl 0(%r9),%eax + addl 4(%r9),%esi + addl 8(%r9),%ecx + movl %eax,0(%r9) + addl 12(%r9),%edx + movl %esi,4(%r9) + addl 16(%r9),%ebp + movl %ecx,8(%r9) + movl %edx,12(%r9) + movl %ebp,16(%r9) + vmovups %xmm12,(%r8) + vzeroall + leaq 104(%rsp),%rsi +.cfi_def_cfa %rsi,56 + movq 0(%rsi),%r15 +.cfi_restore %r15 + movq 8(%rsi),%r14 +.cfi_restore %r14 + movq 16(%rsi),%r13 +.cfi_restore %r13 + movq 24(%rsi),%r12 +.cfi_restore %r12 + movq 32(%rsi),%rbp +.cfi_restore %rbp + movq 40(%rsi),%rbx +.cfi_restore %rbx + leaq 48(%rsi),%rsp +.cfi_def_cfa %rsp,8 +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 65,69,83,78,73,45,67,66,67,43,83,72,65,49,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 .type aesni_cbc_sha1_enc_shaext,@function .align 32 aesni_cbc_sha1_enc_shaext: .cfi_startproc movq 8(%rsp),%r10 movdqu (%r9),%xmm8 movd 16(%r9),%xmm9 movdqa K_XX_XX+80(%rip),%xmm7 movl 240(%rcx),%r11d subq %rdi,%rsi movups (%rcx),%xmm15 movups (%r8),%xmm2 movups 16(%rcx),%xmm0 leaq 112(%rcx),%rcx pshufd $27,%xmm8,%xmm8 pshufd $27,%xmm9,%xmm9 jmp .Loop_shaext .align 16 .Loop_shaext: movups 0(%rdi),%xmm14 xorps %xmm15,%xmm14 xorps %xmm14,%xmm2 movups -80(%rcx),%xmm1 .byte 102,15,56,220,208 movdqu (%r10),%xmm3 movdqa %xmm9,%xmm12 .byte 102,15,56,0,223 movdqu 16(%r10),%xmm4 movdqa %xmm8,%xmm11 movups -64(%rcx),%xmm0 .byte 102,15,56,220,209 .byte 102,15,56,0,231 paddd %xmm3,%xmm9 movdqu 32(%r10),%xmm5 leaq 64(%r10),%r10 pxor %xmm12,%xmm3 movups -48(%rcx),%xmm1 .byte 102,15,56,220,208 pxor %xmm12,%xmm3 movdqa %xmm8,%xmm10 .byte 102,15,56,0,239 .byte 69,15,58,204,193,0 .byte 68,15,56,200,212 movups -32(%rcx),%xmm0 .byte 102,15,56,220,209 .byte 15,56,201,220 movdqu -16(%r10),%xmm6 movdqa %xmm8,%xmm9 .byte 102,15,56,0,247 movups -16(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 69,15,58,204,194,0 .byte 68,15,56,200,205 pxor %xmm5,%xmm3 .byte 15,56,201,229 movups 0(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm8,%xmm10 .byte 
69,15,58,204,193,0 .byte 68,15,56,200,214 movups 16(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,222 pxor %xmm6,%xmm4 .byte 15,56,201,238 movups 32(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm8,%xmm9 .byte 69,15,58,204,194,0 .byte 68,15,56,200,203 movups 48(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,227 pxor %xmm3,%xmm5 .byte 15,56,201,243 cmpl $11,%r11d - jb .Laesenclast6 + jb .Laesenclast11 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast6 + je .Laesenclast11 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast6: +.Laesenclast11: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm10 .byte 69,15,58,204,193,0 .byte 68,15,56,200,212 movups 16(%rdi),%xmm14 xorps %xmm15,%xmm14 movups %xmm2,0(%rsi,%rdi,1) xorps %xmm14,%xmm2 movups -80(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,236 pxor %xmm4,%xmm6 .byte 15,56,201,220 movups -64(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm8,%xmm9 .byte 69,15,58,204,194,1 .byte 68,15,56,200,205 movups -48(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,245 pxor %xmm5,%xmm3 .byte 15,56,201,229 movups -32(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm8,%xmm10 .byte 69,15,58,204,193,1 .byte 68,15,56,200,214 movups -16(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,222 pxor %xmm6,%xmm4 .byte 15,56,201,238 movups 0(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm8,%xmm9 .byte 69,15,58,204,194,1 .byte 68,15,56,200,203 movups 16(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,227 pxor %xmm3,%xmm5 .byte 15,56,201,243 movups 32(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm8,%xmm10 .byte 69,15,58,204,193,1 .byte 68,15,56,200,212 movups 48(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,236 pxor %xmm4,%xmm6 .byte 15,56,201,220 cmpl $11,%r11d - jb .Laesenclast7 + jb .Laesenclast12 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast7 + je .Laesenclast12 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast7: +.Laesenclast12: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm9 .byte 69,15,58,204,194,1 .byte 68,15,56,200,205 movups 32(%rdi),%xmm14 xorps %xmm15,%xmm14 movups %xmm2,16(%rsi,%rdi,1) xorps %xmm14,%xmm2 movups -80(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,245 pxor %xmm5,%xmm3 .byte 15,56,201,229 movups -64(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm8,%xmm10 .byte 69,15,58,204,193,2 .byte 68,15,56,200,214 movups -48(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,222 pxor %xmm6,%xmm4 .byte 15,56,201,238 movups -32(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm8,%xmm9 .byte 69,15,58,204,194,2 .byte 68,15,56,200,203 movups -16(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,227 pxor %xmm3,%xmm5 .byte 15,56,201,243 movups 0(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm8,%xmm10 .byte 69,15,58,204,193,2 .byte 68,15,56,200,212 movups 16(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,236 pxor %xmm4,%xmm6 .byte 15,56,201,220 movups 32(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm8,%xmm9 .byte 69,15,58,204,194,2 .byte 68,15,56,200,205 movups 48(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,245 pxor %xmm5,%xmm3 .byte 15,56,201,229 cmpl $11,%r11d - jb .Laesenclast8 + jb .Laesenclast13 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast8 + je .Laesenclast13 movups 96(%rcx),%xmm0 
.byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast8: +.Laesenclast13: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 movdqa %xmm8,%xmm10 .byte 69,15,58,204,193,2 .byte 68,15,56,200,214 movups 48(%rdi),%xmm14 xorps %xmm15,%xmm14 movups %xmm2,32(%rsi,%rdi,1) xorps %xmm14,%xmm2 movups -80(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,222 pxor %xmm6,%xmm4 .byte 15,56,201,238 movups -64(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm8,%xmm9 .byte 69,15,58,204,194,3 .byte 68,15,56,200,203 movups -48(%rcx),%xmm1 .byte 102,15,56,220,208 .byte 15,56,202,227 pxor %xmm3,%xmm5 .byte 15,56,201,243 movups -32(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm8,%xmm10 .byte 69,15,58,204,193,3 .byte 68,15,56,200,212 .byte 15,56,202,236 pxor %xmm4,%xmm6 movups -16(%rcx),%xmm1 .byte 102,15,56,220,208 movdqa %xmm8,%xmm9 .byte 69,15,58,204,194,3 .byte 68,15,56,200,205 .byte 15,56,202,245 movups 0(%rcx),%xmm0 .byte 102,15,56,220,209 movdqa %xmm12,%xmm5 movdqa %xmm8,%xmm10 .byte 69,15,58,204,193,3 .byte 68,15,56,200,214 movups 16(%rcx),%xmm1 .byte 102,15,56,220,208 movdqa %xmm8,%xmm9 .byte 69,15,58,204,194,3 .byte 68,15,56,200,205 movups 32(%rcx),%xmm0 .byte 102,15,56,220,209 movups 48(%rcx),%xmm1 .byte 102,15,56,220,208 cmpl $11,%r11d - jb .Laesenclast9 + jb .Laesenclast14 movups 64(%rcx),%xmm0 .byte 102,15,56,220,209 movups 80(%rcx),%xmm1 .byte 102,15,56,220,208 - je .Laesenclast9 + je .Laesenclast14 movups 96(%rcx),%xmm0 .byte 102,15,56,220,209 movups 112(%rcx),%xmm1 .byte 102,15,56,220,208 -.Laesenclast9: +.Laesenclast14: .byte 102,15,56,221,209 movups 16-112(%rcx),%xmm0 decq %rdx paddd %xmm11,%xmm8 movups %xmm2,48(%rsi,%rdi,1) leaq 64(%rdi),%rdi jnz .Loop_shaext pshufd $27,%xmm8,%xmm8 pshufd $27,%xmm9,%xmm9 movups %xmm2,(%r8) movdqu %xmm8,(%r9) movd %xmm9,16(%r9) .byte 0xf3,0xc3 .cfi_endproc .size aesni_cbc_sha1_enc_shaext,.-aesni_cbc_sha1_enc_shaext Index: stable/12/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/aesni-sha256-x86_64.S (revision 364963) @@ -1,61 +1,4437 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from aesni-sha256-x86_64.pl. 
*/ .text .globl aesni_cbc_sha256_enc .type aesni_cbc_sha256_enc,@function .align 16 aesni_cbc_sha256_enc: .cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl $1,%eax + cmpq $0,%rdi + je .Lprobe + movl 0(%r11),%eax + movq 4(%r11),%r10 + btq $61,%r10 + jc aesni_cbc_sha256_enc_shaext + movq %r10,%r11 + shrq $32,%r11 + + testl $2048,%r10d + jnz aesni_cbc_sha256_enc_xop + andl $296,%r11d + cmpl $296,%r11d + je aesni_cbc_sha256_enc_avx2 + andl $268435456,%r10d + jnz aesni_cbc_sha256_enc_avx + ud2 xorl %eax,%eax cmpq $0,%rdi je .Lprobe ud2 .Lprobe: .byte 0xf3,0xc3 .cfi_endproc .size aesni_cbc_sha256_enc,.-aesni_cbc_sha256_enc .align 64 .type K256,@object K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0,0,0,0, 0,0,0,0, -1,-1,-1,-1 .long 0,0,0,0, 0,0,0,0 .byte 65,69,83,78,73,45,67,66,67,43,83,72,65,50,53,54,32,115,116,105,116,99,104,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 +.type aesni_cbc_sha256_enc_xop,@function +.align 64 +aesni_cbc_sha256_enc_xop: +.cfi_startproc +.Lxop_shortcut: + movq 8(%rsp),%r10 + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $128,%rsp + andq $-64,%rsp + + shlq $6,%rdx + subq %rdi,%rsi + subq %rdi,%r10 + addq %rdi,%rdx + + + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + movq %r8,64+32(%rsp) + movq %r9,64+40(%rsp) + movq %r10,64+48(%rsp) + movq %rax,120(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 +.Lprologue_xop: + vzeroall + + movq %rdi,%r12 + leaq 128(%rcx),%rdi + leaq K256+544(%rip),%r13 + movl 240-128(%rdi),%r14d + movq %r9,%r15 + movq %r10,%rsi + vmovdqu (%r8),%xmm8 + subq 
$9,%r14 + + movl 0(%r15),%eax + movl 4(%r15),%ebx + movl 8(%r15),%ecx + movl 12(%r15),%edx + movl 16(%r15),%r8d + movl 20(%r15),%r9d + movl 24(%r15),%r10d + movl 28(%r15),%r11d + + vmovdqa 0(%r13,%r14,8),%xmm14 + vmovdqa 16(%r13,%r14,8),%xmm13 + vmovdqa 32(%r13,%r14,8),%xmm12 + vmovdqu 0-128(%rdi),%xmm10 + jmp .Lloop_xop +.align 16 +.Lloop_xop: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi,%r12,1),%xmm0 + vmovdqu 16(%rsi,%r12,1),%xmm1 + vmovdqu 32(%rsi,%r12,1),%xmm2 + vmovdqu 48(%rsi,%r12,1),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%esi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%esi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lxop_00_47 + +.align 16 +.Lxop_00_47: + subq $-32*4,%rbp + vmovdqu (%r12),%xmm9 + movq %r12,64+0(%rsp) + vpalignr $4,%xmm0,%xmm1,%xmm4 + rorl $14,%r13d + movl %r14d,%eax + vpalignr $4,%xmm2,%xmm3,%xmm7 + movl %r9d,%r12d + xorl %r8d,%r13d +.byte 143,232,120,194,236,14 + rorl $9,%r14d + xorl %r10d,%r12d + vpsrld $3,%xmm4,%xmm4 + rorl $5,%r13d + xorl %eax,%r14d + vpaddd %xmm7,%xmm0,%xmm0 + andl %r8d,%r12d + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d +.byte 143,232,120,194,245,11 + rorl $11,%r14d + xorl %r10d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + xorl %ebx,%r15d + rorl $6,%r13d + addl %r12d,%r11d + andl %r15d,%esi +.byte 143,232,120,194,251,13 + xorl %eax,%r14d + addl %r13d,%r11d + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%esi + addl %r11d,%edx + vpsrld $10,%xmm3,%xmm6 + rorl $2,%r14d + addl %esi,%r11d + vpaddd %xmm4,%xmm0,%xmm0 + movl %edx,%r13d + addl %r11d,%r14d +.byte 143,232,120,194,239,2 + rorl $14,%r13d + movl %r14d,%r11d + vpxor %xmm6,%xmm7,%xmm7 + movl %r8d,%r12d + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%r12d + vpxor %xmm5,%xmm7,%xmm7 + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + vpxor %xmm8,%xmm9,%xmm9 + xorl %edx,%r13d + vpsrldq $8,%xmm7,%xmm7 + addl 4(%rsp),%r10d + movl %r11d,%esi + rorl $11,%r14d + xorl %r9d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %eax,%esi + rorl $6,%r13d + addl %r12d,%r10d + andl %esi,%r15d +.byte 143,232,120,194,248,13 + xorl %r11d,%r14d + addl %r13d,%r10d + vpsrld $10,%xmm0,%xmm6 + xorl %eax,%r15d + addl %r10d,%ecx +.byte 143,232,120,194,239,2 + rorl $2,%r14d + addl %r15d,%r10d + vpxor %xmm6,%xmm7,%xmm7 + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + vpxor %xmm5,%xmm7,%xmm7 + movl %edx,%r12d + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r12d + vpslldq $8,%xmm7,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %ecx,%r13d + vpaddd %xmm7,%xmm0,%xmm0 + addl 8(%rsp),%r9d + movl %r10d,%r15d + rorl $11,%r14d + xorl %r8d,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %r11d,%r15d + rorl $6,%r13d + addl %r12d,%r9d + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%esi + addl %r9d,%ebx + rorl $2,%r14d + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%esi + rorl $11,%r14d + xorl %edx,%r12d + xorl 
%r10d,%esi + rorl $6,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + rorl $2,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + rorl $14,%r13d + movl %r14d,%r8d + vpalignr $4,%xmm3,%xmm0,%xmm7 + movl %ebx,%r12d + xorl %eax,%r13d +.byte 143,232,120,194,236,14 + rorl $9,%r14d + xorl %ecx,%r12d + vpsrld $3,%xmm4,%xmm4 + rorl $5,%r13d + xorl %r8d,%r14d + vpaddd %xmm7,%xmm1,%xmm1 + andl %eax,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d +.byte 143,232,120,194,245,11 + rorl $11,%r14d + xorl %ecx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + xorl %r9d,%r15d + rorl $6,%r13d + addl %r12d,%edx + andl %r15d,%esi +.byte 143,232,120,194,248,13 + xorl %r8d,%r14d + addl %r13d,%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %r9d,%esi + addl %edx,%r11d + vpsrld $10,%xmm0,%xmm6 + rorl $2,%r14d + addl %esi,%edx + vpaddd %xmm4,%xmm1,%xmm1 + movl %r11d,%r13d + addl %edx,%r14d +.byte 143,232,120,194,239,2 + rorl $14,%r13d + movl %r14d,%edx + vpxor %xmm6,%xmm7,%xmm7 + movl %eax,%r12d + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%r12d + vpxor %xmm5,%xmm7,%xmm7 + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r11d,%r13d + vpsrldq $8,%xmm7,%xmm7 + addl 20(%rsp),%ecx + movl %edx,%esi + rorl $11,%r14d + xorl %ebx,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %r8d,%esi + rorl $6,%r13d + addl %r12d,%ecx + andl %esi,%r15d +.byte 143,232,120,194,249,13 + xorl %edx,%r14d + addl %r13d,%ecx + vpsrld $10,%xmm1,%xmm6 + xorl %r8d,%r15d + addl %ecx,%r10d +.byte 143,232,120,194,239,2 + rorl $2,%r14d + addl %r15d,%ecx + vpxor %xmm6,%xmm7,%xmm7 + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + vpxor %xmm5,%xmm7,%xmm7 + movl %r11d,%r12d + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r12d + vpslldq $8,%xmm7,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r10d,%r13d + vpaddd %xmm7,%xmm1,%xmm1 + addl 24(%rsp),%ebx + movl %ecx,%r15d + rorl $11,%r14d + xorl %eax,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %edx,%r15d + rorl $6,%r13d + addl %r12d,%ebx + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%esi + addl %ebx,%r9d + rorl $2,%r14d + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%esi + rorl $11,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + rorl $6,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + rorl $2,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + rorl $14,%r13d + movl %r14d,%eax + vpalignr $4,%xmm0,%xmm1,%xmm7 + movl %r9d,%r12d + xorl %r8d,%r13d +.byte 143,232,120,194,236,14 + rorl $9,%r14d + xorl %r10d,%r12d + vpsrld $3,%xmm4,%xmm4 + rorl $5,%r13d + xorl %eax,%r14d + vpaddd %xmm7,%xmm2,%xmm2 + andl %r8d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d +.byte 143,232,120,194,245,11 + rorl $11,%r14d + xorl %r10d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + xorl %ebx,%r15d + rorl $6,%r13d + addl 
%r12d,%r11d + andl %r15d,%esi +.byte 143,232,120,194,249,13 + xorl %eax,%r14d + addl %r13d,%r11d + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%esi + addl %r11d,%edx + vpsrld $10,%xmm1,%xmm6 + rorl $2,%r14d + addl %esi,%r11d + vpaddd %xmm4,%xmm2,%xmm2 + movl %edx,%r13d + addl %r11d,%r14d +.byte 143,232,120,194,239,2 + rorl $14,%r13d + movl %r14d,%r11d + vpxor %xmm6,%xmm7,%xmm7 + movl %r8d,%r12d + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%r12d + vpxor %xmm5,%xmm7,%xmm7 + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %edx,%r13d + vpsrldq $8,%xmm7,%xmm7 + addl 36(%rsp),%r10d + movl %r11d,%esi + rorl $11,%r14d + xorl %r9d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %eax,%esi + rorl $6,%r13d + addl %r12d,%r10d + andl %esi,%r15d +.byte 143,232,120,194,250,13 + xorl %r11d,%r14d + addl %r13d,%r10d + vpsrld $10,%xmm2,%xmm6 + xorl %eax,%r15d + addl %r10d,%ecx +.byte 143,232,120,194,239,2 + rorl $2,%r14d + addl %r15d,%r10d + vpxor %xmm6,%xmm7,%xmm7 + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + vpxor %xmm5,%xmm7,%xmm7 + movl %edx,%r12d + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r12d + vpslldq $8,%xmm7,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %ecx,%r13d + vpaddd %xmm7,%xmm2,%xmm2 + addl 40(%rsp),%r9d + movl %r10d,%r15d + rorl $11,%r14d + xorl %r8d,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %r11d,%r15d + rorl $6,%r13d + addl %r12d,%r9d + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%esi + addl %r9d,%ebx + rorl $2,%r14d + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%esi + rorl $11,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + rorl $6,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + rorl $2,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + rorl $14,%r13d + movl %r14d,%r8d + vpalignr $4,%xmm1,%xmm2,%xmm7 + movl %ebx,%r12d + xorl %eax,%r13d +.byte 143,232,120,194,236,14 + rorl $9,%r14d + xorl %ecx,%r12d + vpsrld $3,%xmm4,%xmm4 + rorl $5,%r13d + xorl %r8d,%r14d + vpaddd %xmm7,%xmm3,%xmm3 + andl %eax,%r12d + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d +.byte 143,232,120,194,245,11 + rorl $11,%r14d + xorl %ecx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + xorl %r9d,%r15d + rorl $6,%r13d + addl %r12d,%edx + andl %r15d,%esi +.byte 143,232,120,194,250,13 + xorl %r8d,%r14d + addl %r13d,%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %r9d,%esi + addl %edx,%r11d + vpsrld $10,%xmm2,%xmm6 + rorl $2,%r14d + addl %esi,%edx + vpaddd %xmm4,%xmm3,%xmm3 + movl %r11d,%r13d + addl %edx,%r14d +.byte 143,232,120,194,239,2 + rorl $14,%r13d + movl %r14d,%edx + vpxor %xmm6,%xmm7,%xmm7 + movl %eax,%r12d + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%r12d + vpxor %xmm5,%xmm7,%xmm7 + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r11d,%r13d + vpsrldq $8,%xmm7,%xmm7 + addl 52(%rsp),%ecx + movl %edx,%esi + rorl $11,%r14d + 
xorl %ebx,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %r8d,%esi + rorl $6,%r13d + addl %r12d,%ecx + andl %esi,%r15d +.byte 143,232,120,194,251,13 + xorl %edx,%r14d + addl %r13d,%ecx + vpsrld $10,%xmm3,%xmm6 + xorl %r8d,%r15d + addl %ecx,%r10d +.byte 143,232,120,194,239,2 + rorl $2,%r14d + addl %r15d,%ecx + vpxor %xmm6,%xmm7,%xmm7 + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + vpxor %xmm5,%xmm7,%xmm7 + movl %r11d,%r12d + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r12d + vpslldq $8,%xmm7,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r10d,%r13d + vpaddd %xmm7,%xmm3,%xmm3 + addl 56(%rsp),%ebx + movl %ecx,%r15d + rorl $11,%r14d + xorl %eax,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %edx,%r15d + rorl $6,%r13d + addl %r12d,%ebx + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%esi + addl %ebx,%r9d + rorl $2,%r14d + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%esi + rorl $11,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + rorl $6,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + rorl $2,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + movq 64+0(%rsp),%r12 + vpand %xmm14,%xmm11,%xmm11 + movq 64+8(%rsp),%r15 + vpor %xmm11,%xmm8,%xmm8 + vmovdqu %xmm8,(%r15,%r12,1) + leaq 16(%r12),%r12 + cmpb $0,131(%rbp) + jne .Lxop_00_47 + vmovdqu (%r12),%xmm9 + movq %r12,64+0(%rsp) + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + rorl $11,%r14d + xorl %r10d,%r12d + xorl %ebx,%r15d + rorl $6,%r13d + addl %r12d,%r11d + andl %r15d,%esi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%esi + addl %r11d,%edx + rorl $2,%r14d + addl %esi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + vpxor %xmm8,%xmm9,%xmm9 + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%esi + rorl $11,%r14d + xorl %r9d,%r12d + xorl %eax,%esi + rorl $6,%r13d + addl %r12d,%r10d + andl %esi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + addl %r10d,%ecx + rorl $2,%r14d + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + rorl $11,%r14d + xorl %r8d,%r12d + xorl %r11d,%r15d + rorl $6,%r13d + addl %r12d,%r9d + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%esi + addl %r9d,%ebx + rorl $2,%r14d + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vaesenc 
%xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%esi + rorl $11,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + rorl $6,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + rorl $2,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + rorl $11,%r14d + xorl %ecx,%r12d + xorl %r9d,%r15d + rorl $6,%r13d + addl %r12d,%edx + andl %r15d,%esi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%esi + addl %edx,%r11d + rorl $2,%r14d + addl %esi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%esi + rorl $11,%r14d + xorl %ebx,%r12d + xorl %r8d,%esi + rorl $6,%r13d + addl %r12d,%ecx + andl %esi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + addl %ecx,%r10d + rorl $2,%r14d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + rorl $11,%r14d + xorl %eax,%r12d + xorl %edx,%r15d + rorl $6,%r13d + addl %r12d,%ebx + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%esi + addl %ebx,%r9d + rorl $2,%r14d + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%esi + rorl $11,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + rorl $6,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + rorl $2,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + rorl $11,%r14d + xorl %r10d,%r12d + xorl %ebx,%r15d + rorl $6,%r13d + addl %r12d,%r11d + andl %r15d,%esi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%esi + addl %r11d,%edx + rorl $2,%r14d + addl %esi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%esi + rorl $11,%r14d + xorl %r9d,%r12d + xorl %eax,%esi + rorl $6,%r13d + addl %r12d,%r10d + andl %esi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + addl %r10d,%ecx + rorl $2,%r14d + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + xorl 
%ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + rorl $11,%r14d + xorl %r8d,%r12d + xorl %r11d,%r15d + rorl $6,%r13d + addl %r12d,%r9d + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%esi + addl %r9d,%ebx + rorl $2,%r14d + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%esi + rorl $11,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + rorl $6,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + rorl $2,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + rorl $11,%r14d + xorl %ecx,%r12d + xorl %r9d,%r15d + rorl $6,%r13d + addl %r12d,%edx + andl %r15d,%esi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%esi + addl %edx,%r11d + rorl $2,%r14d + addl %esi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%esi + rorl $11,%r14d + xorl %ebx,%r12d + xorl %r8d,%esi + rorl $6,%r13d + addl %r12d,%ecx + andl %esi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + addl %ecx,%r10d + rorl $2,%r14d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + rorl $11,%r14d + xorl %eax,%r12d + xorl %edx,%r15d + rorl $6,%r13d + addl %r12d,%ebx + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%esi + addl %ebx,%r9d + rorl $2,%r14d + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%esi + rorl $11,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + rorl $6,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + rorl $2,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%r12 + movq 64+8(%rsp),%r13 + movq 64+40(%rsp),%r15 + movq 64+48(%rsp),%rsi + + vpand %xmm14,%xmm11,%xmm11 + movl %r14d,%eax + vpor %xmm11,%xmm8,%xmm8 + vmovdqu %xmm8,(%r12,%r13,1) + leaq 16(%r12),%r12 + + addl 0(%r15),%eax + addl 4(%r15),%ebx + addl 8(%r15),%ecx + addl 12(%r15),%edx + addl 16(%r15),%r8d + addl 
20(%r15),%r9d + addl 24(%r15),%r10d + addl 28(%r15),%r11d + + cmpq 64+16(%rsp),%r12 + + movl %eax,0(%r15) + movl %ebx,4(%r15) + movl %ecx,8(%r15) + movl %edx,12(%r15) + movl %r8d,16(%r15) + movl %r9d,20(%r15) + movl %r10d,24(%r15) + movl %r11d,28(%r15) + + jb .Lloop_xop + + movq 64+32(%rsp),%r8 + movq 120(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vmovdqu %xmm8,(%r8) + vzeroall + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_xop: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_cbc_sha256_enc_xop,.-aesni_cbc_sha256_enc_xop +.type aesni_cbc_sha256_enc_avx,@function +.align 64 +aesni_cbc_sha256_enc_avx: +.cfi_startproc +.Lavx_shortcut: + movq 8(%rsp),%r10 + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $128,%rsp + andq $-64,%rsp + + shlq $6,%rdx + subq %rdi,%rsi + subq %rdi,%r10 + addq %rdi,%rdx + + + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + + movq %r8,64+32(%rsp) + movq %r9,64+40(%rsp) + movq %r10,64+48(%rsp) + movq %rax,120(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 +.Lprologue_avx: + vzeroall + + movq %rdi,%r12 + leaq 128(%rcx),%rdi + leaq K256+544(%rip),%r13 + movl 240-128(%rdi),%r14d + movq %r9,%r15 + movq %r10,%rsi + vmovdqu (%r8),%xmm8 + subq $9,%r14 + + movl 0(%r15),%eax + movl 4(%r15),%ebx + movl 8(%r15),%ecx + movl 12(%r15),%edx + movl 16(%r15),%r8d + movl 20(%r15),%r9d + movl 24(%r15),%r10d + movl 28(%r15),%r11d + + vmovdqa 0(%r13,%r14,8),%xmm14 + vmovdqa 16(%r13,%r14,8),%xmm13 + vmovdqa 32(%r13,%r14,8),%xmm12 + vmovdqu 0-128(%rdi),%xmm10 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi,%r12,1),%xmm0 + vmovdqu 16(%rsi,%r12,1),%xmm1 + vmovdqu 32(%rsi,%r12,1),%xmm2 + vmovdqu 48(%rsi,%r12,1),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%esi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%esi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + subq $-32*4,%rbp + vmovdqu (%r12),%xmm9 + movq %r12,64+0(%rsp) + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %r8d,%r13d + shrdl $9,%r14d,%r14d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + shrdl $11,%r14d,%r14d + xorl %r10d,%r12d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + shrdl $6,%r13d,%r13d + addl %r12d,%r11d + andl %r15d,%esi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%esi + vpshufd $250,%xmm3,%xmm7 + addl %r11d,%edx + shrdl $2,%r14d,%r14d + addl %esi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl 
%r14d,%r11d + movl %r8d,%r12d + xorl %edx,%r13d + vpslld $11,%xmm5,%xmm5 + shrdl $9,%r14d,%r14d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + vpxor %xmm8,%xmm9,%xmm9 + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%esi + shrdl $11,%r14d,%r14d + vpxor %xmm5,%xmm4,%xmm4 + xorl %r9d,%r12d + xorl %eax,%esi + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + addl %r12d,%r10d + andl %esi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + addl %r10d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $2,%r14d,%r14d + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + xorl %ecx,%r13d + shrdl $9,%r14d,%r14d + vpshufd $132,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpsrldq $8,%xmm6,%xmm6 + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpaddd %xmm6,%xmm0,%xmm0 + movl %r10d,%r15d + shrdl $11,%r14d,%r14d + xorl %r8d,%r12d + vpshufd $80,%xmm0,%xmm7 + xorl %r11d,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%r9d + vpsrld $10,%xmm7,%xmm6 + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + vpsrlq $17,%xmm7,%xmm7 + xorl %r11d,%esi + addl %r9d,%ebx + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpsrlq $2,%xmm7,%xmm7 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpxor %xmm7,%xmm6,%xmm6 + xorl %ebx,%r13d + shrdl $9,%r14d,%r14d + xorl %edx,%r12d + vpshufd $232,%xmm6,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpslldq $8,%xmm6,%xmm6 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%esi + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $11,%r14d,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + vpaddd 0(%rbp),%xmm0,%xmm6 + shrdl $6,%r13d,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + shrdl $2,%r14d,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %eax,%r13d + shrdl $9,%r14d,%r14d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + shrdl $11,%r14d,%r14d + xorl %ecx,%r12d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + shrdl $6,%r13d,%r13d + addl %r12d,%edx + andl %r15d,%esi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%esi + vpshufd $250,%xmm0,%xmm7 + addl %edx,%r11d + shrdl $2,%r14d,%r14d + addl %esi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + xorl %r11d,%r13d + vpslld $11,%xmm5,%xmm5 + shrdl $9,%r14d,%r14d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%esi + shrdl $11,%r14d,%r14d + vpxor %xmm5,%xmm4,%xmm4 + xorl %ebx,%r12d + xorl %r8d,%esi + shrdl $6,%r13d,%r13d + vpsrlq 
$17,%xmm7,%xmm7 + addl %r12d,%ecx + andl %esi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + addl %ecx,%r10d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $2,%r14d,%r14d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + xorl %r10d,%r13d + shrdl $9,%r14d,%r14d + vpshufd $132,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpsrldq $8,%xmm6,%xmm6 + andl %r10d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpaddd %xmm6,%xmm1,%xmm1 + movl %ecx,%r15d + shrdl $11,%r14d,%r14d + xorl %eax,%r12d + vpshufd $80,%xmm1,%xmm7 + xorl %edx,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%ebx + vpsrld $10,%xmm7,%xmm6 + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + vpsrlq $17,%xmm7,%xmm7 + xorl %edx,%esi + addl %ebx,%r9d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpsrlq $2,%xmm7,%xmm7 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r9d,%r13d + shrdl $9,%r14d,%r14d + xorl %r11d,%r12d + vpshufd $232,%xmm6,%xmm6 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpslldq $8,%xmm6,%xmm6 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%esi + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $11,%r14d,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + vpaddd 32(%rbp),%xmm1,%xmm6 + shrdl $6,%r13d,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + shrdl $2,%r14d,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %r8d,%r13d + shrdl $9,%r14d,%r14d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + shrdl $11,%r14d,%r14d + xorl %r10d,%r12d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + shrdl $6,%r13d,%r13d + addl %r12d,%r11d + andl %r15d,%esi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%esi + vpshufd $250,%xmm1,%xmm7 + addl %r11d,%edx + shrdl $2,%r14d,%r14d + addl %esi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + xorl %edx,%r13d + vpslld $11,%xmm5,%xmm5 + shrdl $9,%r14d,%r14d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%esi + shrdl $11,%r14d,%r14d + vpxor %xmm5,%xmm4,%xmm4 + xorl %r9d,%r12d + xorl %eax,%esi + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + addl %r12d,%r10d + andl %esi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + addl %r10d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $2,%r14d,%r14d + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + xorl %ecx,%r13d + shrdl $9,%r14d,%r14d + vpshufd 
$132,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpsrldq $8,%xmm6,%xmm6 + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpaddd %xmm6,%xmm2,%xmm2 + movl %r10d,%r15d + shrdl $11,%r14d,%r14d + xorl %r8d,%r12d + vpshufd $80,%xmm2,%xmm7 + xorl %r11d,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%r9d + vpsrld $10,%xmm7,%xmm6 + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + vpsrlq $17,%xmm7,%xmm7 + xorl %r11d,%esi + addl %r9d,%ebx + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpsrlq $2,%xmm7,%xmm7 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpxor %xmm7,%xmm6,%xmm6 + xorl %ebx,%r13d + shrdl $9,%r14d,%r14d + xorl %edx,%r12d + vpshufd $232,%xmm6,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpslldq $8,%xmm6,%xmm6 + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%esi + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $11,%r14d,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + vpaddd 64(%rbp),%xmm2,%xmm6 + shrdl $6,%r13d,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + shrdl $2,%r14d,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %eax,%r13d + shrdl $9,%r14d,%r14d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + shrdl $11,%r14d,%r14d + xorl %ecx,%r12d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + shrdl $6,%r13d,%r13d + addl %r12d,%edx + andl %r15d,%esi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%esi + vpshufd $250,%xmm2,%xmm7 + addl %edx,%r11d + shrdl $2,%r14d,%r14d + addl %esi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + xorl %r11d,%r13d + vpslld $11,%xmm5,%xmm5 + shrdl $9,%r14d,%r14d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%esi + shrdl $11,%r14d,%r14d + vpxor %xmm5,%xmm4,%xmm4 + xorl %ebx,%r12d + xorl %r8d,%esi + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + addl %r12d,%ecx + andl %esi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + addl %ecx,%r10d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $2,%r14d,%r14d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + xorl %r10d,%r13d + shrdl $9,%r14d,%r14d + vpshufd $132,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpsrldq $8,%xmm6,%xmm6 + andl %r10d,%r12d + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpaddd %xmm6,%xmm3,%xmm3 + movl %ecx,%r15d + shrdl $11,%r14d,%r14d + xorl %eax,%r12d + 
vpshufd $80,%xmm3,%xmm7 + xorl %edx,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%ebx + vpsrld $10,%xmm7,%xmm6 + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + vpsrlq $17,%xmm7,%xmm7 + xorl %edx,%esi + addl %ebx,%r9d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpsrlq $2,%xmm7,%xmm7 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r9d,%r13d + shrdl $9,%r14d,%r14d + xorl %r11d,%r12d + vpshufd $232,%xmm6,%xmm6 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpslldq $8,%xmm6,%xmm6 + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%esi + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $11,%r14d,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + vpaddd 96(%rbp),%xmm3,%xmm6 + shrdl $6,%r13d,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + shrdl $2,%r14d,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + movq 64+0(%rsp),%r12 + vpand %xmm14,%xmm11,%xmm11 + movq 64+8(%rsp),%r15 + vpor %xmm11,%xmm8,%xmm8 + vmovdqu %xmm8,(%r15,%r12,1) + leaq 16(%r12),%r12 + cmpb $0,131(%rbp) + jne .Lavx_00_47 + vmovdqu (%r12),%xmm9 + movq %r12,64+0(%rsp) + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + xorl %r8d,%r13d + shrdl $9,%r14d,%r14d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + shrdl $11,%r14d,%r14d + xorl %r10d,%r12d + xorl %ebx,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%r11d + andl %r15d,%esi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%esi + addl %r11d,%edx + shrdl $2,%r14d,%r14d + addl %esi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + xorl %edx,%r13d + shrdl $9,%r14d,%r14d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + vpxor %xmm8,%xmm9,%xmm9 + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%esi + shrdl $11,%r14d,%r14d + xorl %r9d,%r12d + xorl %eax,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%r10d + andl %esi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + addl %r10d,%ecx + shrdl $2,%r14d,%r14d + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + xorl %ecx,%r13d + shrdl $9,%r14d,%r14d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + shrdl $11,%r14d,%r14d + xorl %r8d,%r12d + xorl %r11d,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%r9d + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%esi + addl %r9d,%ebx + shrdl $2,%r14d,%r14d + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + xorl %ebx,%r13d + shrdl $9,%r14d,%r14d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%esi + shrdl $11,%r14d,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + shrdl $2,%r14d,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl 
%r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + xorl %eax,%r13d + shrdl $9,%r14d,%r14d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + shrdl $11,%r14d,%r14d + xorl %ecx,%r12d + xorl %r9d,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%edx + andl %r15d,%esi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%esi + addl %edx,%r11d + shrdl $2,%r14d,%r14d + addl %esi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + xorl %r11d,%r13d + shrdl $9,%r14d,%r14d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%esi + shrdl $11,%r14d,%r14d + xorl %ebx,%r12d + xorl %r8d,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%ecx + andl %esi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + addl %ecx,%r10d + shrdl $2,%r14d,%r14d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + xorl %r10d,%r13d + shrdl $9,%r14d,%r14d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + shrdl $11,%r14d,%r14d + xorl %eax,%r12d + xorl %edx,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%ebx + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%esi + addl %ebx,%r9d + shrdl $2,%r14d,%r14d + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + xorl %r9d,%r13d + shrdl $9,%r14d,%r14d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%esi + shrdl $11,%r14d,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + shrdl $2,%r14d,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + xorl %r8d,%r13d + shrdl $9,%r14d,%r14d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + shrdl $11,%r14d,%r14d + xorl %r10d,%r12d + xorl %ebx,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%r11d + andl %r15d,%esi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%esi + addl %r11d,%edx + shrdl $2,%r14d,%r14d + addl %esi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + xorl %edx,%r13d + shrdl $9,%r14d,%r14d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%esi + shrdl $11,%r14d,%r14d + xorl %r9d,%r12d + xorl %eax,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%r10d + andl %esi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + addl %r10d,%ecx + shrdl $2,%r14d,%r14d + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + xorl %ecx,%r13d + shrdl $9,%r14d,%r14d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl 
%r10d,%r14d + andl %ecx,%r12d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + shrdl $11,%r14d,%r14d + xorl %r8d,%r12d + xorl %r11d,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%r9d + andl %r15d,%esi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%esi + addl %r9d,%ebx + shrdl $2,%r14d,%r14d + addl %esi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + xorl %ebx,%r13d + shrdl $9,%r14d,%r14d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%esi + shrdl $11,%r14d,%r14d + xorl %edx,%r12d + xorl %r10d,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%r8d + andl %esi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + addl %r8d,%eax + shrdl $2,%r14d,%r14d + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + xorl %eax,%r13d + shrdl $9,%r14d,%r14d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + shrdl $11,%r14d,%r14d + xorl %ecx,%r12d + xorl %r9d,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%edx + andl %r15d,%esi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%esi + addl %edx,%r11d + shrdl $2,%r14d,%r14d + addl %esi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + xorl %r11d,%r13d + shrdl $9,%r14d,%r14d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%esi + shrdl $11,%r14d,%r14d + xorl %ebx,%r12d + xorl %r8d,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%ecx + andl %esi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + addl %ecx,%r10d + shrdl $2,%r14d,%r14d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + xorl %r10d,%r13d + shrdl $9,%r14d,%r14d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + shrdl $11,%r14d,%r14d + xorl %eax,%r12d + xorl %edx,%r15d + shrdl $6,%r13d,%r13d + addl %r12d,%ebx + andl %r15d,%esi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%esi + addl %ebx,%r9d + shrdl $2,%r14d,%r14d + addl %esi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + xorl %r9d,%r13d + shrdl $9,%r14d,%r14d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%esi + shrdl $11,%r14d,%r14d + xorl %r11d,%r12d + xorl %ecx,%esi + shrdl $6,%r13d,%r13d + addl %r12d,%eax + andl %esi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + addl %eax,%r8d + shrdl $2,%r14d,%r14d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%r12 + movq 64+8(%rsp),%r13 + movq 64+40(%rsp),%r15 + movq 64+48(%rsp),%rsi + + vpand %xmm14,%xmm11,%xmm11 + movl %r14d,%eax + vpor %xmm11,%xmm8,%xmm8 
+ vmovdqu %xmm8,(%r12,%r13,1) + leaq 16(%r12),%r12 + + addl 0(%r15),%eax + addl 4(%r15),%ebx + addl 8(%r15),%ecx + addl 12(%r15),%edx + addl 16(%r15),%r8d + addl 20(%r15),%r9d + addl 24(%r15),%r10d + addl 28(%r15),%r11d + + cmpq 64+16(%rsp),%r12 + + movl %eax,0(%r15) + movl %ebx,4(%r15) + movl %ecx,8(%r15) + movl %edx,12(%r15) + movl %r8d,16(%r15) + movl %r9d,20(%r15) + movl %r10d,24(%r15) + movl %r11d,28(%r15) + jb .Lloop_avx + + movq 64+32(%rsp),%r8 + movq 120(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vmovdqu %xmm8,(%r8) + vzeroall + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_cbc_sha256_enc_avx,.-aesni_cbc_sha256_enc_avx +.type aesni_cbc_sha256_enc_avx2,@function +.align 64 +aesni_cbc_sha256_enc_avx2: +.cfi_startproc +.Lavx2_shortcut: + movq 8(%rsp),%r10 + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $576,%rsp + andq $-1024,%rsp + addq $448,%rsp + + shlq $6,%rdx + subq %rdi,%rsi + subq %rdi,%r10 + addq %rdi,%rdx + + + + movq %rdx,64+16(%rsp) + + movq %r8,64+32(%rsp) + movq %r9,64+40(%rsp) + movq %r10,64+48(%rsp) + movq %rax,120(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 +.Lprologue_avx2: + vzeroall + + movq %rdi,%r13 + vpinsrq $1,%rsi,%xmm15,%xmm15 + leaq 128(%rcx),%rdi + leaq K256+544(%rip),%r12 + movl 240-128(%rdi),%r14d + movq %r9,%r15 + movq %r10,%rsi + vmovdqu (%r8),%xmm8 + leaq -9(%r14),%r14 + + vmovdqa 0(%r12,%r14,8),%xmm14 + vmovdqa 16(%r12,%r14,8),%xmm13 + vmovdqa 32(%r12,%r14,8),%xmm12 + + subq $-64,%r13 + movl 0(%r15),%eax + leaq (%rsi,%r13,1),%r12 + movl 4(%r15),%ebx + cmpq %rdx,%r13 + movl 8(%r15),%ecx + cmoveq %rsp,%r12 + movl 12(%r15),%edx + movl 16(%r15),%r8d + movl 20(%r15),%r9d + movl 24(%r15),%r10d + movl 28(%r15),%r11d + vmovdqu 0-128(%rdi),%xmm10 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqa K256+512(%rip),%ymm7 + vmovdqu -64+0(%rsi,%r13,1),%xmm0 + vmovdqu -64+16(%rsi,%r13,1),%xmm1 + vmovdqu -64+32(%rsi,%r13,1),%xmm2 + vmovdqu -64+48(%rsi,%r13,1),%xmm3 + + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm7,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm7,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + + leaq K256(%rip),%rbp + vpshufb %ymm7,%ymm2,%ymm2 + leaq -64(%r13),%r13 + vpaddd 0(%rbp),%ymm0,%ymm4 + vpshufb %ymm7,%ymm3,%ymm3 + vpaddd 32(%rbp),%ymm1,%ymm5 + vpaddd 64(%rbp),%ymm2,%ymm6 + vpaddd 96(%rbp),%ymm3,%ymm7 + vmovdqa %ymm4,0(%rsp) + xorl %r14d,%r14d + vmovdqa %ymm5,32(%rsp) + + movq 120(%rsp),%rsi +.cfi_def_cfa %rsi,8 + leaq -64(%rsp),%rsp + + + + movq %rsi,-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + movl %ebx,%esi + vmovdqa %ymm6,0(%rsp) + xorl %ecx,%esi + vmovdqa %ymm7,32(%rsp) + movl %r9d,%r12d + subq $-32*4,%rbp + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: + vmovdqu (%r13),%xmm9 + vpinsrq $0,%r13,%xmm15,%xmm15 + leaq -64(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 + + pushq 64-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $4,%ymm0,%ymm1,%ymm4 + addl 
0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm2,%ymm3,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm0,%ymm0 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%esi + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ebx,%esi + vpshufd $250,%ymm3,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rsi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%esi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %esi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%esi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%esi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %esi,%r15d + vpxor %xmm8,%xmm9,%xmm9 + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm0,%ymm0 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufd $132,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpsrldq $8,%ymm6,%ymm6 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpshufd $80,%ymm0,%ymm7 + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r11d,%esi + vpsrld $10,%ymm7,%ymm6 + xorl %r13d,%r14d + leal (%r9,%rsi,1),%r9d + movl %ecx,%r12d + vpsrlq $17,%ymm7,%ymm7 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpxor %ymm7,%ymm6,%ymm6 + rorxl $11,%ebx,%esi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpsrlq $2,%ymm7,%ymm7 + andnl %edx,%ebx,%r12d + xorl %esi,%r13d + rorxl $6,%ebx,%r14d + vpxor %ymm7,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%esi + vpshufd $232,%ymm6,%ymm6 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%esi + vpslldq $8,%ymm6,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + vpaddd %ymm6,%ymm0,%ymm0 + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r10d,%r15d + vpaddd 0(%rbp),%ymm0,%ymm6 + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm1,%ymm2,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm3,%ymm0,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm1,%ymm1 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d 
+ leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r9d,%esi + vpshufd $250,%ymm0,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rsi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%esi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %esi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%esi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%esi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm1,%ymm1 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufd $132,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpsrldq $8,%ymm6,%ymm6 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpshufd $80,%ymm1,%ymm7 + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %edx,%esi + vpsrld $10,%ymm7,%ymm6 + xorl %r13d,%r14d + leal (%rbx,%rsi,1),%ebx + movl %r10d,%r12d + vpsrlq $17,%ymm7,%ymm7 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpxor %ymm7,%ymm6,%ymm6 + rorxl $11,%r9d,%esi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpsrlq $2,%ymm7,%ymm7 + andnl %r11d,%r9d,%r12d + xorl %esi,%r13d + rorxl $6,%r9d,%r14d + vpxor %ymm7,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%esi + vpshufd $232,%ymm6,%ymm6 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%esi + vpslldq $8,%ymm6,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + vpaddd %ymm6,%ymm1,%ymm1 + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ecx,%r15d + vpaddd 32(%rbp),%ymm1,%ymm6 + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq -64(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 + + pushq 64-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $4,%ymm2,%ymm3,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm0,%ymm1,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm2,%ymm2 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx 
+ vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ebx,%esi + vpshufd $250,%ymm1,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rsi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%esi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %esi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%esi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%esi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm2,%ymm2 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufd $132,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpsrldq $8,%ymm6,%ymm6 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpshufd $80,%ymm2,%ymm7 + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r11d,%esi + vpsrld $10,%ymm7,%ymm6 + xorl %r13d,%r14d + leal (%r9,%rsi,1),%r9d + movl %ecx,%r12d + vpsrlq $17,%ymm7,%ymm7 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpxor %ymm7,%ymm6,%ymm6 + rorxl $11,%ebx,%esi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpsrlq $2,%ymm7,%ymm7 + andnl %edx,%ebx,%r12d + xorl %esi,%r13d + rorxl $6,%ebx,%r14d + vpxor %ymm7,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%esi + vpshufd $232,%ymm6,%ymm6 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%esi + vpslldq $8,%ymm6,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + vpaddd %ymm6,%ymm2,%ymm2 + andl %esi,%r15d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r10d,%r15d + vpaddd 64(%rbp),%ymm2,%ymm6 + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm3,%ymm0,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm1,%ymm2,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm3,%ymm3 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%esi + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r9d,%esi + vpshufd $250,%ymm2,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rsi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl 
%r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%esi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %esi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%esi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%esi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %esi,%r15d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm3,%ymm3 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufd $132,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpsrldq $8,%ymm6,%ymm6 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpshufd $80,%ymm3,%ymm7 + andl %r15d,%esi + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %edx,%esi + vpsrld $10,%ymm7,%ymm6 + xorl %r13d,%r14d + leal (%rbx,%rsi,1),%ebx + movl %r10d,%r12d + vpsrlq $17,%ymm7,%ymm7 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpxor %ymm7,%ymm6,%ymm6 + rorxl $11,%r9d,%esi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpsrlq $2,%ymm7,%ymm7 + andnl %r11d,%r9d,%r12d + xorl %esi,%r13d + rorxl $6,%r9d,%r14d + vpxor %ymm7,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%esi + vpshufd $232,%ymm6,%ymm6 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%esi + vpslldq $8,%ymm6,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + vpaddd %ymm6,%ymm3,%ymm3 + andl %esi,%r15d + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ecx,%r15d + vpaddd 96(%rbp),%ymm3,%ymm6 + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + vmovq %xmm15,%r13 + vpextrq $1,%xmm15,%r15 + vpand %xmm14,%xmm11,%xmm11 + vpor %xmm11,%xmm8,%xmm8 + vmovdqu %xmm8,(%r15,%r13,1) + leaq 16(%r13),%r13 + leaq 128(%rbp),%rbp + cmpb $0,3(%rbp) + jne .Lavx2_00_47 + vmovdqu (%r13),%xmm9 + vpinsrq $0,%r13,%xmm15,%xmm15 + addl 0+64(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%esi + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ebx,%esi + xorl %r13d,%r14d + leal (%r11,%rsi,1),%r11d + movl %r8d,%r12d + addl 4+64(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%esi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %esi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl 
%r11d,%esi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%esi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %esi,%r15d + vpxor %xmm8,%xmm9,%xmm9 + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+64(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r11d,%esi + xorl %r13d,%r14d + leal (%r9,%rsi,1),%r9d + movl %ecx,%r12d + addl 12+64(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%esi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %esi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%esi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%esi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+64(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r9d,%esi + xorl %r13d,%r14d + leal (%rdx,%rsi,1),%edx + movl %eax,%r12d + addl 36+64(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%esi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %esi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%esi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%esi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+64(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %edx,%esi + xorl %r13d,%r14d + leal (%rbx,%rsi,1),%ebx + movl %r10d,%r12d + addl 44+64(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%esi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %esi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%esi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%esi + rorxl 
$13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + addl 0(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ebx,%esi + xorl %r13d,%r14d + leal (%r11,%rsi,1),%r11d + movl %r8d,%r12d + addl 4(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%esi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %esi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%esi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%esi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r11d,%esi + xorl %r13d,%r14d + leal (%r9,%rsi,1),%r9d + movl %ecx,%r12d + addl 12(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%esi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %esi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%esi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%esi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %esi,%r15d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%esi + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r9d,%esi + xorl %r13d,%r14d + leal (%rdx,%rsi,1),%edx + movl %eax,%r12d + addl 36(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%esi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %esi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%esi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%esi + rorxl 
$13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %esi,%r15d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%esi + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %edx,%esi + xorl %r13d,%r14d + leal (%rbx,%rsi,1),%ebx + movl %r10d,%r12d + addl 44(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%esi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %esi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%esi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%esi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %esi,%r15d + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vpextrq $1,%xmm15,%r12 + vmovq %xmm15,%r13 + movq 552(%rsp),%r15 + addl %r14d,%eax + leaq 448(%rsp),%rbp + + vpand %xmm14,%xmm11,%xmm11 + vpor %xmm11,%xmm8,%xmm8 + vmovdqu %xmm8,(%r12,%r13,1) + leaq 16(%r13),%r13 + + addl 0(%r15),%eax + addl 4(%r15),%ebx + addl 8(%r15),%ecx + addl 12(%r15),%edx + addl 16(%r15),%r8d + addl 20(%r15),%r9d + addl 24(%r15),%r10d + addl 28(%r15),%r11d + + movl %eax,0(%r15) + movl %ebx,4(%r15) + movl %ecx,8(%r15) + movl %edx,12(%r15) + movl %r8d,16(%r15) + movl %r9d,20(%r15) + movl %r10d,24(%r15) + movl %r11d,28(%r15) + + cmpq 80(%rbp),%r13 + je .Ldone_avx2 + + xorl %r14d,%r14d + movl %ebx,%esi + movl %r9d,%r12d + xorl %ecx,%esi + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + vmovdqu (%r13),%xmm9 + vpinsrq $0,%r13,%xmm15,%xmm15 + addl 0+16(%rbp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%esi + vpxor %xmm10,%xmm9,%xmm9 + vmovdqu 16-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ebx,%esi + xorl %r13d,%r14d + leal (%r11,%rsi,1),%r11d + movl %r8d,%r12d + addl 4+16(%rbp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%esi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %esi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%esi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%esi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %esi,%r15d + vpxor %xmm8,%xmm9,%xmm9 + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+16(%rbp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl 
%r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 32-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r11d,%esi + xorl %r13d,%r14d + leal (%r9,%rsi,1),%r9d + movl %ecx,%r12d + addl 12+16(%rbp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%esi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %esi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%esi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%esi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 48-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+16(%rbp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r9d,%esi + xorl %r13d,%r14d + leal (%rdx,%rsi,1),%edx + movl %eax,%r12d + addl 36+16(%rbp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%esi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %esi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%esi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%esi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 80-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+16(%rbp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 96-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %edx,%esi + xorl %r13d,%r14d + leal (%rbx,%rsi,1),%ebx + movl %r10d,%r12d + addl 44+16(%rbp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%esi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %esi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%esi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%esi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 112-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + leaq -64(%rbp),%rbp + addl 0+16(%rbp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + 
rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 128-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ebx,%esi + xorl %r13d,%r14d + leal (%r11,%rsi,1),%r11d + movl %r8d,%r12d + addl 4+16(%rbp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%esi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %esi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%esi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%esi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %esi,%r15d + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 144-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+16(%rbp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%esi + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r11d,%esi + xorl %r13d,%r14d + leal (%r9,%rsi,1),%r9d + movl %ecx,%r12d + addl 12+16(%rbp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%esi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %esi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%esi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%esi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %esi,%r15d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 176-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+16(%rbp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%esi + vpand %xmm12,%xmm11,%xmm8 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 192-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r9d,%esi + xorl %r13d,%r14d + leal (%rdx,%rsi,1),%edx + movl %eax,%r12d + addl 36+16(%rbp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%esi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %esi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%esi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%esi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %esi,%r15d + vaesenclast %xmm10,%xmm9,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 208-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+16(%rbp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal 
(%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%esi + vpand %xmm13,%xmm11,%xmm11 + vaesenc %xmm10,%xmm9,%xmm9 + vmovdqu 224-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %edx,%esi + xorl %r13d,%r14d + leal (%rbx,%rsi,1),%ebx + movl %r10d,%r12d + addl 44+16(%rbp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%esi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %esi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%esi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%esi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %esi,%r15d + vpor %xmm11,%xmm8,%xmm8 + vaesenclast %xmm10,%xmm9,%xmm11 + vmovdqu 0-128(%rdi),%xmm10 + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovq %xmm15,%r13 + vpextrq $1,%xmm15,%r15 + vpand %xmm14,%xmm11,%xmm11 + vpor %xmm11,%xmm8,%xmm8 + leaq -64(%rbp),%rbp + vmovdqu %xmm8,(%r15,%r13,1) + leaq 16(%r13),%r13 + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 552(%rsp),%r15 + leaq 64(%r13),%r13 + movq 560(%rsp),%rsi + addl %r14d,%eax + leaq 448(%rsp),%rsp + + addl 0(%r15),%eax + addl 4(%r15),%ebx + addl 8(%r15),%ecx + addl 12(%r15),%edx + addl 16(%r15),%r8d + addl 20(%r15),%r9d + addl 24(%r15),%r10d + leaq (%rsi,%r13,1),%r12 + addl 28(%r15),%r11d + + cmpq 64+16(%rsp),%r13 + + movl %eax,0(%r15) + cmoveq %rsp,%r12 + movl %ebx,4(%r15) + movl %ecx,8(%r15) + movl %edx,12(%r15) + movl %r8d,16(%r15) + movl %r9d,20(%r15) + movl %r10d,24(%r15) + movl %r11d,28(%r15) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + + +.cfi_escape 0x0f,0x06,0x76,0xf8,0x00,0x06,0x23,0x08 + +.Ldone_avx2: + movq 64+32(%rbp),%r8 + movq 64+56(%rbp),%rsi +.cfi_def_cfa %rsi,8 + vmovdqu %xmm8,(%r8) + vzeroall + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_cbc_sha256_enc_avx2,.-aesni_cbc_sha256_enc_avx2 +.type aesni_cbc_sha256_enc_shaext,@function +.align 32 +aesni_cbc_sha256_enc_shaext: +.cfi_startproc + movq 8(%rsp),%r10 + leaq K256+128(%rip),%rax + movdqu (%r9),%xmm1 + movdqu 16(%r9),%xmm2 + movdqa 512-128(%rax),%xmm3 + + movl 240(%rcx),%r11d + subq %rdi,%rsi + movups (%rcx),%xmm15 + movups (%r8),%xmm6 + movups 16(%rcx),%xmm4 + leaq 112(%rcx),%rcx + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm3,%xmm7 +.byte 102,15,58,15,202,8 + punpcklqdq %xmm0,%xmm2 + + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%r10),%xmm10 + movdqu 16(%r10),%xmm11 + movdqu 32(%r10),%xmm12 +.byte 102,68,15,56,0,211 + movdqu 48(%r10),%xmm13 + + movdqa 0-128(%rax),%xmm0 + paddd %xmm10,%xmm0 +.byte 102,68,15,56,0,219 + movdqa %xmm2,%xmm9 + movdqa %xmm1,%xmm8 + movups 0(%rdi),%xmm14 + xorps %xmm15,%xmm14 + xorps %xmm14,%xmm6 + movups -80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movups -64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,202 + + movdqa 32-128(%rax),%xmm0 + paddd %xmm11,%xmm0 +.byte 102,68,15,56,0,227 + leaq 
64(%r10),%r10 + movups -48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movups -32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,202 + + movdqa 64-128(%rax),%xmm0 + paddd %xmm12,%xmm0 +.byte 102,68,15,56,0,235 +.byte 69,15,56,204,211 + movups -16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm13,%xmm3 +.byte 102,65,15,58,15,220,4 + paddd %xmm3,%xmm10 + movups 0(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,202 + + movdqa 96-128(%rax),%xmm0 + paddd %xmm13,%xmm0 +.byte 69,15,56,205,213 +.byte 69,15,56,204,220 + movups 16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movups 32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movdqa %xmm10,%xmm3 +.byte 102,65,15,58,15,221,4 + paddd %xmm3,%xmm11 +.byte 15,56,203,202 + movdqa 128-128(%rax),%xmm0 + paddd %xmm10,%xmm0 +.byte 69,15,56,205,218 +.byte 69,15,56,204,229 + movups 48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm11,%xmm3 +.byte 102,65,15,58,15,218,4 + paddd %xmm3,%xmm12 + cmpl $11,%r11d + jb .Laesenclast1 + movups 64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + je .Laesenclast1 + movups 96(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 112(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.Laesenclast1: + aesenclast %xmm5,%xmm6 + movups 16-112(%rcx),%xmm4 + nop +.byte 15,56,203,202 + movups 16(%rdi),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm6,0(%rsi,%rdi,1) + xorps %xmm14,%xmm6 + movups -80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + movdqa 160-128(%rax),%xmm0 + paddd %xmm11,%xmm0 +.byte 69,15,56,205,227 +.byte 69,15,56,204,234 + movups -64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm12,%xmm3 +.byte 102,65,15,58,15,219,4 + paddd %xmm3,%xmm13 + movups -48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 192-128(%rax),%xmm0 + paddd %xmm12,%xmm0 +.byte 69,15,56,205,236 +.byte 69,15,56,204,211 + movups -32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm13,%xmm3 +.byte 102,65,15,58,15,220,4 + paddd %xmm3,%xmm10 + movups -16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 224-128(%rax),%xmm0 + paddd %xmm13,%xmm0 +.byte 69,15,56,205,213 +.byte 69,15,56,204,220 + movups 0(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,65,15,58,15,221,4 + paddd %xmm3,%xmm11 + movups 16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 256-128(%rax),%xmm0 + paddd %xmm10,%xmm0 +.byte 69,15,56,205,218 +.byte 69,15,56,204,229 + movups 32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm11,%xmm3 +.byte 102,65,15,58,15,218,4 + paddd %xmm3,%xmm12 + movups 48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + cmpl $11,%r11d + jb .Laesenclast2 + movups 64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + je .Laesenclast2 + movups 96(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 112(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.Laesenclast2: + aesenclast %xmm5,%xmm6 + movups 16-112(%rcx),%xmm4 + nop +.byte 15,56,203,202 + movups 32(%rdi),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm6,16(%rsi,%rdi,1) + xorps %xmm14,%xmm6 + movups -80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + movdqa 288-128(%rax),%xmm0 + paddd %xmm11,%xmm0 +.byte 69,15,56,205,227 +.byte 69,15,56,204,234 + movups -64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa 
%xmm12,%xmm3 +.byte 102,65,15,58,15,219,4 + paddd %xmm3,%xmm13 + movups -48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 320-128(%rax),%xmm0 + paddd %xmm12,%xmm0 +.byte 69,15,56,205,236 +.byte 69,15,56,204,211 + movups -32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm13,%xmm3 +.byte 102,65,15,58,15,220,4 + paddd %xmm3,%xmm10 + movups -16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 352-128(%rax),%xmm0 + paddd %xmm13,%xmm0 +.byte 69,15,56,205,213 +.byte 69,15,56,204,220 + movups 0(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm10,%xmm3 +.byte 102,65,15,58,15,221,4 + paddd %xmm3,%xmm11 + movups 16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 384-128(%rax),%xmm0 + paddd %xmm10,%xmm0 +.byte 69,15,56,205,218 +.byte 69,15,56,204,229 + movups 32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm11,%xmm3 +.byte 102,65,15,58,15,218,4 + paddd %xmm3,%xmm12 + movups 48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + movdqa 416-128(%rax),%xmm0 + paddd %xmm11,%xmm0 +.byte 69,15,56,205,227 +.byte 69,15,56,204,234 + cmpl $11,%r11d + jb .Laesenclast3 + movups 64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + je .Laesenclast3 + movups 96(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 112(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.Laesenclast3: + aesenclast %xmm5,%xmm6 + movups 16-112(%rcx),%xmm4 + nop +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm12,%xmm3 +.byte 102,65,15,58,15,219,4 + paddd %xmm3,%xmm13 + movups 48(%rdi),%xmm14 + xorps %xmm15,%xmm14 + movups %xmm6,32(%rsi,%rdi,1) + xorps %xmm14,%xmm6 + movups -80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + movups -64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,202 + + movdqa 448-128(%rax),%xmm0 + paddd %xmm12,%xmm0 +.byte 69,15,56,205,236 + movdqa %xmm7,%xmm3 + movups -48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movups -32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,202 + + movdqa 480-128(%rax),%xmm0 + paddd %xmm13,%xmm0 + movups -16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + movups 0(%rcx),%xmm4 + aesenc %xmm5,%xmm6 +.byte 15,56,203,209 + pshufd $0x0e,%xmm0,%xmm0 + movups 16(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.byte 15,56,203,202 + + movups 32(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 48(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + cmpl $11,%r11d + jb .Laesenclast4 + movups 64(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 80(%rcx),%xmm5 + aesenc %xmm4,%xmm6 + je .Laesenclast4 + movups 96(%rcx),%xmm4 + aesenc %xmm5,%xmm6 + movups 112(%rcx),%xmm5 + aesenc %xmm4,%xmm6 +.Laesenclast4: + aesenclast %xmm5,%xmm6 + movups 16-112(%rcx),%xmm4 + nop + + paddd %xmm9,%xmm2 + paddd %xmm8,%xmm1 + + decq %rdx + movups %xmm6,48(%rsi,%rdi,1) + leaq 64(%rdi),%rdi + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm3 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 +.byte 102,15,58,15,211,8 + + movups %xmm6,(%r8) + movdqu %xmm1,(%r9) + movdqu %xmm2,16(%r9) + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_cbc_sha256_enc_shaext,.-aesni_cbc_sha256_enc_shaext Index: stable/12/secure/lib/libcrypto/amd64/chacha-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/chacha-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/chacha-x86_64.S (revision 364963) @@ -1,1169 +1,2195 @@ /* $FreeBSD$ */ /* Do not modify. 
This file is auto-generated from chacha-x86_64.pl. */ .text .align 64 .Lzero: .long 0,0,0,0 .Lone: .long 1,0,0,0 .Linc: .long 0,1,2,3 .Lfour: .long 4,4,4,4 .Lincy: .long 0,2,4,6,1,3,5,7 .Leight: .long 8,8,8,8,8,8,8,8 .Lrot16: .byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd .Lrot24: .byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe .Ltwoy: .long 2,0,0,0, 2,0,0,0 .align 64 .Lzeroz: .long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 .Lfourz: .long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 .Lincz: .long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 .Lsixteen: .long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 .Lsigma: .byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .globl ChaCha20_ctr32 .type ChaCha20_ctr32,@function .align 64 ChaCha20_ctr32: .cfi_startproc cmpq $0,%rdx je .Lno_data movq OPENSSL_ia32cap_P+4(%rip),%r10 testl $512,%r10d jnz .LChaCha20_ssse3 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $64+24,%rsp .cfi_adjust_cfa_offset 64+24 .Lctr32_body: movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 movdqu (%r8),%xmm3 movdqa .Lone(%rip),%xmm4 movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) movq %rdx,%rbp jmp .Loop_outer .align 32 .Loop_outer: movl $0x61707865,%eax movl $0x3320646e,%ebx movl $0x79622d32,%ecx movl $0x6b206574,%edx movl 16(%rsp),%r8d movl 20(%rsp),%r9d movl 24(%rsp),%r10d movl 28(%rsp),%r11d movd %xmm3,%r12d movl 52(%rsp),%r13d movl 56(%rsp),%r14d movl 60(%rsp),%r15d movq %rbp,64+0(%rsp) movl $10,%ebp movq %rsi,64+8(%rsp) .byte 102,72,15,126,214 movq %rdi,64+16(%rsp) movq %rsi,%rdi shrq $32,%rdi jmp .Loop .align 32 .Loop: addl %r8d,%eax xorl %eax,%r12d roll $16,%r12d addl %r9d,%ebx xorl %ebx,%r13d roll $16,%r13d addl %r12d,%esi xorl %esi,%r8d roll $12,%r8d addl %r13d,%edi xorl %edi,%r9d roll $12,%r9d addl %r8d,%eax xorl %eax,%r12d roll $8,%r12d addl %r9d,%ebx xorl %ebx,%r13d roll $8,%r13d addl %r12d,%esi xorl %esi,%r8d roll $7,%r8d addl %r13d,%edi xorl %edi,%r9d roll $7,%r9d movl %esi,32(%rsp) movl %edi,36(%rsp) movl 40(%rsp),%esi movl 44(%rsp),%edi addl %r10d,%ecx xorl %ecx,%r14d roll $16,%r14d addl %r11d,%edx xorl %edx,%r15d roll $16,%r15d addl %r14d,%esi xorl %esi,%r10d roll $12,%r10d addl %r15d,%edi xorl %edi,%r11d roll $12,%r11d addl %r10d,%ecx xorl %ecx,%r14d roll $8,%r14d addl %r11d,%edx xorl %edx,%r15d roll $8,%r15d addl %r14d,%esi xorl %esi,%r10d roll $7,%r10d addl %r15d,%edi xorl %edi,%r11d roll $7,%r11d addl %r9d,%eax xorl %eax,%r15d roll $16,%r15d addl %r10d,%ebx xorl %ebx,%r12d roll $16,%r12d addl %r15d,%esi xorl %esi,%r9d roll $12,%r9d addl %r12d,%edi xorl %edi,%r10d roll $12,%r10d addl %r9d,%eax xorl %eax,%r15d roll $8,%r15d addl %r10d,%ebx xorl %ebx,%r12d roll $8,%r12d addl %r15d,%esi xorl %esi,%r9d roll $7,%r9d addl %r12d,%edi xorl %edi,%r10d roll $7,%r10d movl %esi,40(%rsp) movl %edi,44(%rsp) movl 32(%rsp),%esi movl 36(%rsp),%edi addl %r11d,%ecx xorl %ecx,%r13d roll $16,%r13d addl %r8d,%edx xorl %edx,%r14d roll $16,%r14d addl %r13d,%esi xorl %esi,%r11d roll $12,%r11d addl %r14d,%edi xorl %edi,%r8d roll $12,%r8d addl 
%r11d,%ecx xorl %ecx,%r13d roll $8,%r13d addl %r8d,%edx xorl %edx,%r14d roll $8,%r14d addl %r13d,%esi xorl %esi,%r11d roll $7,%r11d addl %r14d,%edi xorl %edi,%r8d roll $7,%r8d decl %ebp jnz .Loop movl %edi,36(%rsp) movl %esi,32(%rsp) movq 64(%rsp),%rbp movdqa %xmm2,%xmm1 movq 64+8(%rsp),%rsi paddd %xmm4,%xmm3 movq 64+16(%rsp),%rdi addl $0x61707865,%eax addl $0x3320646e,%ebx addl $0x79622d32,%ecx addl $0x6b206574,%edx addl 16(%rsp),%r8d addl 20(%rsp),%r9d addl 24(%rsp),%r10d addl 28(%rsp),%r11d addl 48(%rsp),%r12d addl 52(%rsp),%r13d addl 56(%rsp),%r14d addl 60(%rsp),%r15d paddd 32(%rsp),%xmm1 cmpq $64,%rbp jb .Ltail xorl 0(%rsi),%eax xorl 4(%rsi),%ebx xorl 8(%rsi),%ecx xorl 12(%rsi),%edx xorl 16(%rsi),%r8d xorl 20(%rsi),%r9d xorl 24(%rsi),%r10d xorl 28(%rsi),%r11d movdqu 32(%rsi),%xmm0 xorl 48(%rsi),%r12d xorl 52(%rsi),%r13d xorl 56(%rsi),%r14d xorl 60(%rsi),%r15d leaq 64(%rsi),%rsi pxor %xmm1,%xmm0 movdqa %xmm2,32(%rsp) movd %xmm3,48(%rsp) movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) movdqu %xmm0,32(%rdi) movl %r12d,48(%rdi) movl %r13d,52(%rdi) movl %r14d,56(%rdi) movl %r15d,60(%rdi) leaq 64(%rdi),%rdi subq $64,%rbp jnz .Loop_outer jmp .Ldone .align 16 .Ltail: movl %eax,0(%rsp) movl %ebx,4(%rsp) xorq %rbx,%rbx movl %ecx,8(%rsp) movl %edx,12(%rsp) movl %r8d,16(%rsp) movl %r9d,20(%rsp) movl %r10d,24(%rsp) movl %r11d,28(%rsp) movdqa %xmm1,32(%rsp) movl %r12d,48(%rsp) movl %r13d,52(%rsp) movl %r14d,56(%rsp) movl %r15d,60(%rsp) .Loop_tail: movzbl (%rsi,%rbx,1),%eax movzbl (%rsp,%rbx,1),%edx leaq 1(%rbx),%rbx xorl %edx,%eax movb %al,-1(%rdi,%rbx,1) decq %rbp jnz .Loop_tail .Ldone: leaq 64+24+48(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lno_data: .byte 0xf3,0xc3 .cfi_endproc .size ChaCha20_ctr32,.-ChaCha20_ctr32 .type ChaCha20_ssse3,@function .align 32 ChaCha20_ssse3: .cfi_startproc .LChaCha20_ssse3: movq %rsp,%r9 .cfi_def_cfa_register %r9 + testl $2048,%r10d + jnz .LChaCha20_4xop cmpq $128,%rdx je .LChaCha20_128 ja .LChaCha20_4x .Ldo_sse3_after_all: subq $64+8,%rsp movdqa .Lsigma(%rip),%xmm0 movdqu (%rcx),%xmm1 movdqu 16(%rcx),%xmm2 movdqu (%r8),%xmm3 movdqa .Lrot16(%rip),%xmm6 movdqa .Lrot24(%rip),%xmm7 movdqa %xmm0,0(%rsp) movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) movq $10,%r8 jmp .Loop_ssse3 .align 32 .Loop_outer_ssse3: movdqa .Lone(%rip),%xmm3 movdqa 0(%rsp),%xmm0 movdqa 16(%rsp),%xmm1 movdqa 32(%rsp),%xmm2 paddd 48(%rsp),%xmm3 movq $10,%r8 movdqa %xmm3,48(%rsp) jmp .Loop_ssse3 .align 32 .Loop_ssse3: paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,222 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $20,%xmm1 pslld $12,%xmm4 por %xmm4,%xmm1 paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,223 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $25,%xmm1 pslld $7,%xmm4 por %xmm4,%xmm1 pshufd $78,%xmm2,%xmm2 pshufd $57,%xmm1,%xmm1 pshufd $147,%xmm3,%xmm3 nop paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,222 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $20,%xmm1 pslld $12,%xmm4 por %xmm4,%xmm1 paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,223 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $25,%xmm1 pslld $7,%xmm4 por %xmm4,%xmm1 pshufd 
$78,%xmm2,%xmm2 pshufd $147,%xmm1,%xmm1 pshufd $57,%xmm3,%xmm3 decq %r8 jnz .Loop_ssse3 paddd 0(%rsp),%xmm0 paddd 16(%rsp),%xmm1 paddd 32(%rsp),%xmm2 paddd 48(%rsp),%xmm3 cmpq $64,%rdx jb .Ltail_ssse3 movdqu 0(%rsi),%xmm4 movdqu 16(%rsi),%xmm5 pxor %xmm4,%xmm0 movdqu 32(%rsi),%xmm4 pxor %xmm5,%xmm1 movdqu 48(%rsi),%xmm5 leaq 64(%rsi),%rsi pxor %xmm4,%xmm2 pxor %xmm5,%xmm3 movdqu %xmm0,0(%rdi) movdqu %xmm1,16(%rdi) movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) leaq 64(%rdi),%rdi subq $64,%rdx jnz .Loop_outer_ssse3 jmp .Ldone_ssse3 .align 16 .Ltail_ssse3: movdqa %xmm0,0(%rsp) movdqa %xmm1,16(%rsp) movdqa %xmm2,32(%rsp) movdqa %xmm3,48(%rsp) xorq %r8,%r8 .Loop_tail_ssse3: movzbl (%rsi,%r8,1),%eax movzbl (%rsp,%r8,1),%ecx leaq 1(%r8),%r8 xorl %ecx,%eax movb %al,-1(%rdi,%r8,1) decq %rdx jnz .Loop_tail_ssse3 .Ldone_ssse3: leaq (%r9),%rsp .cfi_def_cfa_register %rsp .Lssse3_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ChaCha20_ssse3,.-ChaCha20_ssse3 .type ChaCha20_128,@function .align 32 ChaCha20_128: .cfi_startproc .LChaCha20_128: movq %rsp,%r9 .cfi_def_cfa_register %r9 subq $64+8,%rsp movdqa .Lsigma(%rip),%xmm8 movdqu (%rcx),%xmm9 movdqu 16(%rcx),%xmm2 movdqu (%r8),%xmm3 movdqa .Lone(%rip),%xmm1 movdqa .Lrot16(%rip),%xmm6 movdqa .Lrot24(%rip),%xmm7 movdqa %xmm8,%xmm10 movdqa %xmm8,0(%rsp) movdqa %xmm9,%xmm11 movdqa %xmm9,16(%rsp) movdqa %xmm2,%xmm0 movdqa %xmm2,32(%rsp) paddd %xmm3,%xmm1 movdqa %xmm3,48(%rsp) movq $10,%r8 jmp .Loop_128 .align 32 .Loop_128: paddd %xmm9,%xmm8 pxor %xmm8,%xmm3 paddd %xmm11,%xmm10 pxor %xmm10,%xmm1 .byte 102,15,56,0,222 .byte 102,15,56,0,206 paddd %xmm3,%xmm2 paddd %xmm1,%xmm0 pxor %xmm2,%xmm9 pxor %xmm0,%xmm11 movdqa %xmm9,%xmm4 psrld $20,%xmm9 movdqa %xmm11,%xmm5 pslld $12,%xmm4 psrld $20,%xmm11 por %xmm4,%xmm9 pslld $12,%xmm5 por %xmm5,%xmm11 paddd %xmm9,%xmm8 pxor %xmm8,%xmm3 paddd %xmm11,%xmm10 pxor %xmm10,%xmm1 .byte 102,15,56,0,223 .byte 102,15,56,0,207 paddd %xmm3,%xmm2 paddd %xmm1,%xmm0 pxor %xmm2,%xmm9 pxor %xmm0,%xmm11 movdqa %xmm9,%xmm4 psrld $25,%xmm9 movdqa %xmm11,%xmm5 pslld $7,%xmm4 psrld $25,%xmm11 por %xmm4,%xmm9 pslld $7,%xmm5 por %xmm5,%xmm11 pshufd $78,%xmm2,%xmm2 pshufd $57,%xmm9,%xmm9 pshufd $147,%xmm3,%xmm3 pshufd $78,%xmm0,%xmm0 pshufd $57,%xmm11,%xmm11 pshufd $147,%xmm1,%xmm1 paddd %xmm9,%xmm8 pxor %xmm8,%xmm3 paddd %xmm11,%xmm10 pxor %xmm10,%xmm1 .byte 102,15,56,0,222 .byte 102,15,56,0,206 paddd %xmm3,%xmm2 paddd %xmm1,%xmm0 pxor %xmm2,%xmm9 pxor %xmm0,%xmm11 movdqa %xmm9,%xmm4 psrld $20,%xmm9 movdqa %xmm11,%xmm5 pslld $12,%xmm4 psrld $20,%xmm11 por %xmm4,%xmm9 pslld $12,%xmm5 por %xmm5,%xmm11 paddd %xmm9,%xmm8 pxor %xmm8,%xmm3 paddd %xmm11,%xmm10 pxor %xmm10,%xmm1 .byte 102,15,56,0,223 .byte 102,15,56,0,207 paddd %xmm3,%xmm2 paddd %xmm1,%xmm0 pxor %xmm2,%xmm9 pxor %xmm0,%xmm11 movdqa %xmm9,%xmm4 psrld $25,%xmm9 movdqa %xmm11,%xmm5 pslld $7,%xmm4 psrld $25,%xmm11 por %xmm4,%xmm9 pslld $7,%xmm5 por %xmm5,%xmm11 pshufd $78,%xmm2,%xmm2 pshufd $147,%xmm9,%xmm9 pshufd $57,%xmm3,%xmm3 pshufd $78,%xmm0,%xmm0 pshufd $147,%xmm11,%xmm11 pshufd $57,%xmm1,%xmm1 decq %r8 jnz .Loop_128 paddd 0(%rsp),%xmm8 paddd 16(%rsp),%xmm9 paddd 32(%rsp),%xmm2 paddd 48(%rsp),%xmm3 paddd .Lone(%rip),%xmm1 paddd 0(%rsp),%xmm10 paddd 16(%rsp),%xmm11 paddd 32(%rsp),%xmm0 paddd 48(%rsp),%xmm1 movdqu 0(%rsi),%xmm4 movdqu 16(%rsi),%xmm5 pxor %xmm4,%xmm8 movdqu 32(%rsi),%xmm4 pxor %xmm5,%xmm9 movdqu 48(%rsi),%xmm5 pxor %xmm4,%xmm2 movdqu 64(%rsi),%xmm4 pxor %xmm5,%xmm3 movdqu 80(%rsi),%xmm5 pxor %xmm4,%xmm10 movdqu 96(%rsi),%xmm4 pxor %xmm5,%xmm11 movdqu 112(%rsi),%xmm5 pxor 
%xmm4,%xmm0 pxor %xmm5,%xmm1 movdqu %xmm8,0(%rdi) movdqu %xmm9,16(%rdi) movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) movdqu %xmm10,64(%rdi) movdqu %xmm11,80(%rdi) movdqu %xmm0,96(%rdi) movdqu %xmm1,112(%rdi) leaq (%r9),%rsp .cfi_def_cfa_register %rsp .L128_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ChaCha20_128,.-ChaCha20_128 .type ChaCha20_4x,@function .align 32 ChaCha20_4x: .cfi_startproc .LChaCha20_4x: movq %rsp,%r9 .cfi_def_cfa_register %r9 movq %r10,%r11 + shrq $32,%r10 + testq $32,%r10 + jnz .LChaCha20_8x cmpq $192,%rdx ja .Lproceed4x andq $71303168,%r11 cmpq $4194304,%r11 je .Ldo_sse3_after_all .Lproceed4x: subq $0x140+8,%rsp movdqa .Lsigma(%rip),%xmm11 movdqu (%rcx),%xmm15 movdqu 16(%rcx),%xmm7 movdqu (%r8),%xmm3 leaq 256(%rsp),%rcx leaq .Lrot16(%rip),%r10 leaq .Lrot24(%rip),%r11 pshufd $0x00,%xmm11,%xmm8 pshufd $0x55,%xmm11,%xmm9 movdqa %xmm8,64(%rsp) pshufd $0xaa,%xmm11,%xmm10 movdqa %xmm9,80(%rsp) pshufd $0xff,%xmm11,%xmm11 movdqa %xmm10,96(%rsp) movdqa %xmm11,112(%rsp) pshufd $0x00,%xmm15,%xmm12 pshufd $0x55,%xmm15,%xmm13 movdqa %xmm12,128-256(%rcx) pshufd $0xaa,%xmm15,%xmm14 movdqa %xmm13,144-256(%rcx) pshufd $0xff,%xmm15,%xmm15 movdqa %xmm14,160-256(%rcx) movdqa %xmm15,176-256(%rcx) pshufd $0x00,%xmm7,%xmm4 pshufd $0x55,%xmm7,%xmm5 movdqa %xmm4,192-256(%rcx) pshufd $0xaa,%xmm7,%xmm6 movdqa %xmm5,208-256(%rcx) pshufd $0xff,%xmm7,%xmm7 movdqa %xmm6,224-256(%rcx) movdqa %xmm7,240-256(%rcx) pshufd $0x00,%xmm3,%xmm0 pshufd $0x55,%xmm3,%xmm1 paddd .Linc(%rip),%xmm0 pshufd $0xaa,%xmm3,%xmm2 movdqa %xmm1,272-256(%rcx) pshufd $0xff,%xmm3,%xmm3 movdqa %xmm2,288-256(%rcx) movdqa %xmm3,304-256(%rcx) jmp .Loop_enter4x .align 32 .Loop_outer4x: movdqa 64(%rsp),%xmm8 movdqa 80(%rsp),%xmm9 movdqa 96(%rsp),%xmm10 movdqa 112(%rsp),%xmm11 movdqa 128-256(%rcx),%xmm12 movdqa 144-256(%rcx),%xmm13 movdqa 160-256(%rcx),%xmm14 movdqa 176-256(%rcx),%xmm15 movdqa 192-256(%rcx),%xmm4 movdqa 208-256(%rcx),%xmm5 movdqa 224-256(%rcx),%xmm6 movdqa 240-256(%rcx),%xmm7 movdqa 256-256(%rcx),%xmm0 movdqa 272-256(%rcx),%xmm1 movdqa 288-256(%rcx),%xmm2 movdqa 304-256(%rcx),%xmm3 paddd .Lfour(%rip),%xmm0 .Loop_enter4x: movdqa %xmm6,32(%rsp) movdqa %xmm7,48(%rsp) movdqa (%r10),%xmm7 movl $10,%eax movdqa %xmm0,256-256(%rcx) jmp .Loop4x .align 32 .Loop4x: paddd %xmm12,%xmm8 paddd %xmm13,%xmm9 pxor %xmm8,%xmm0 pxor %xmm9,%xmm1 .byte 102,15,56,0,199 .byte 102,15,56,0,207 paddd %xmm0,%xmm4 paddd %xmm1,%xmm5 pxor %xmm4,%xmm12 pxor %xmm5,%xmm13 movdqa %xmm12,%xmm6 pslld $12,%xmm12 psrld $20,%xmm6 movdqa %xmm13,%xmm7 pslld $12,%xmm13 por %xmm6,%xmm12 psrld $20,%xmm7 movdqa (%r11),%xmm6 por %xmm7,%xmm13 paddd %xmm12,%xmm8 paddd %xmm13,%xmm9 pxor %xmm8,%xmm0 pxor %xmm9,%xmm1 .byte 102,15,56,0,198 .byte 102,15,56,0,206 paddd %xmm0,%xmm4 paddd %xmm1,%xmm5 pxor %xmm4,%xmm12 pxor %xmm5,%xmm13 movdqa %xmm12,%xmm7 pslld $7,%xmm12 psrld $25,%xmm7 movdqa %xmm13,%xmm6 pslld $7,%xmm13 por %xmm7,%xmm12 psrld $25,%xmm6 movdqa (%r10),%xmm7 por %xmm6,%xmm13 movdqa %xmm4,0(%rsp) movdqa %xmm5,16(%rsp) movdqa 32(%rsp),%xmm4 movdqa 48(%rsp),%xmm5 paddd %xmm14,%xmm10 paddd %xmm15,%xmm11 pxor %xmm10,%xmm2 pxor %xmm11,%xmm3 .byte 102,15,56,0,215 .byte 102,15,56,0,223 paddd %xmm2,%xmm4 paddd %xmm3,%xmm5 pxor %xmm4,%xmm14 pxor %xmm5,%xmm15 movdqa %xmm14,%xmm6 pslld $12,%xmm14 psrld $20,%xmm6 movdqa %xmm15,%xmm7 pslld $12,%xmm15 por %xmm6,%xmm14 psrld $20,%xmm7 movdqa (%r11),%xmm6 por %xmm7,%xmm15 paddd %xmm14,%xmm10 paddd %xmm15,%xmm11 pxor %xmm10,%xmm2 pxor %xmm11,%xmm3 .byte 102,15,56,0,214 .byte 102,15,56,0,222 paddd %xmm2,%xmm4 paddd 
%xmm3,%xmm5 pxor %xmm4,%xmm14 pxor %xmm5,%xmm15 movdqa %xmm14,%xmm7 pslld $7,%xmm14 psrld $25,%xmm7 movdqa %xmm15,%xmm6 pslld $7,%xmm15 por %xmm7,%xmm14 psrld $25,%xmm6 movdqa (%r10),%xmm7 por %xmm6,%xmm15 paddd %xmm13,%xmm8 paddd %xmm14,%xmm9 pxor %xmm8,%xmm3 pxor %xmm9,%xmm0 .byte 102,15,56,0,223 .byte 102,15,56,0,199 paddd %xmm3,%xmm4 paddd %xmm0,%xmm5 pxor %xmm4,%xmm13 pxor %xmm5,%xmm14 movdqa %xmm13,%xmm6 pslld $12,%xmm13 psrld $20,%xmm6 movdqa %xmm14,%xmm7 pslld $12,%xmm14 por %xmm6,%xmm13 psrld $20,%xmm7 movdqa (%r11),%xmm6 por %xmm7,%xmm14 paddd %xmm13,%xmm8 paddd %xmm14,%xmm9 pxor %xmm8,%xmm3 pxor %xmm9,%xmm0 .byte 102,15,56,0,222 .byte 102,15,56,0,198 paddd %xmm3,%xmm4 paddd %xmm0,%xmm5 pxor %xmm4,%xmm13 pxor %xmm5,%xmm14 movdqa %xmm13,%xmm7 pslld $7,%xmm13 psrld $25,%xmm7 movdqa %xmm14,%xmm6 pslld $7,%xmm14 por %xmm7,%xmm13 psrld $25,%xmm6 movdqa (%r10),%xmm7 por %xmm6,%xmm14 movdqa %xmm4,32(%rsp) movdqa %xmm5,48(%rsp) movdqa 0(%rsp),%xmm4 movdqa 16(%rsp),%xmm5 paddd %xmm15,%xmm10 paddd %xmm12,%xmm11 pxor %xmm10,%xmm1 pxor %xmm11,%xmm2 .byte 102,15,56,0,207 .byte 102,15,56,0,215 paddd %xmm1,%xmm4 paddd %xmm2,%xmm5 pxor %xmm4,%xmm15 pxor %xmm5,%xmm12 movdqa %xmm15,%xmm6 pslld $12,%xmm15 psrld $20,%xmm6 movdqa %xmm12,%xmm7 pslld $12,%xmm12 por %xmm6,%xmm15 psrld $20,%xmm7 movdqa (%r11),%xmm6 por %xmm7,%xmm12 paddd %xmm15,%xmm10 paddd %xmm12,%xmm11 pxor %xmm10,%xmm1 pxor %xmm11,%xmm2 .byte 102,15,56,0,206 .byte 102,15,56,0,214 paddd %xmm1,%xmm4 paddd %xmm2,%xmm5 pxor %xmm4,%xmm15 pxor %xmm5,%xmm12 movdqa %xmm15,%xmm7 pslld $7,%xmm15 psrld $25,%xmm7 movdqa %xmm12,%xmm6 pslld $7,%xmm12 por %xmm7,%xmm15 psrld $25,%xmm6 movdqa (%r10),%xmm7 por %xmm6,%xmm12 decl %eax jnz .Loop4x paddd 64(%rsp),%xmm8 paddd 80(%rsp),%xmm9 paddd 96(%rsp),%xmm10 paddd 112(%rsp),%xmm11 movdqa %xmm8,%xmm6 punpckldq %xmm9,%xmm8 movdqa %xmm10,%xmm7 punpckldq %xmm11,%xmm10 punpckhdq %xmm9,%xmm6 punpckhdq %xmm11,%xmm7 movdqa %xmm8,%xmm9 punpcklqdq %xmm10,%xmm8 movdqa %xmm6,%xmm11 punpcklqdq %xmm7,%xmm6 punpckhqdq %xmm10,%xmm9 punpckhqdq %xmm7,%xmm11 paddd 128-256(%rcx),%xmm12 paddd 144-256(%rcx),%xmm13 paddd 160-256(%rcx),%xmm14 paddd 176-256(%rcx),%xmm15 movdqa %xmm8,0(%rsp) movdqa %xmm9,16(%rsp) movdqa 32(%rsp),%xmm8 movdqa 48(%rsp),%xmm9 movdqa %xmm12,%xmm10 punpckldq %xmm13,%xmm12 movdqa %xmm14,%xmm7 punpckldq %xmm15,%xmm14 punpckhdq %xmm13,%xmm10 punpckhdq %xmm15,%xmm7 movdqa %xmm12,%xmm13 punpcklqdq %xmm14,%xmm12 movdqa %xmm10,%xmm15 punpcklqdq %xmm7,%xmm10 punpckhqdq %xmm14,%xmm13 punpckhqdq %xmm7,%xmm15 paddd 192-256(%rcx),%xmm4 paddd 208-256(%rcx),%xmm5 paddd 224-256(%rcx),%xmm8 paddd 240-256(%rcx),%xmm9 movdqa %xmm6,32(%rsp) movdqa %xmm11,48(%rsp) movdqa %xmm4,%xmm14 punpckldq %xmm5,%xmm4 movdqa %xmm8,%xmm7 punpckldq %xmm9,%xmm8 punpckhdq %xmm5,%xmm14 punpckhdq %xmm9,%xmm7 movdqa %xmm4,%xmm5 punpcklqdq %xmm8,%xmm4 movdqa %xmm14,%xmm9 punpcklqdq %xmm7,%xmm14 punpckhqdq %xmm8,%xmm5 punpckhqdq %xmm7,%xmm9 paddd 256-256(%rcx),%xmm0 paddd 272-256(%rcx),%xmm1 paddd 288-256(%rcx),%xmm2 paddd 304-256(%rcx),%xmm3 movdqa %xmm0,%xmm8 punpckldq %xmm1,%xmm0 movdqa %xmm2,%xmm7 punpckldq %xmm3,%xmm2 punpckhdq %xmm1,%xmm8 punpckhdq %xmm3,%xmm7 movdqa %xmm0,%xmm1 punpcklqdq %xmm2,%xmm0 movdqa %xmm8,%xmm3 punpcklqdq %xmm7,%xmm8 punpckhqdq %xmm2,%xmm1 punpckhqdq %xmm7,%xmm3 cmpq $256,%rdx jb .Ltail4x movdqu 0(%rsi),%xmm6 movdqu 16(%rsi),%xmm11 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm7 pxor 0(%rsp),%xmm6 pxor %xmm12,%xmm11 pxor %xmm4,%xmm2 pxor %xmm0,%xmm7 movdqu %xmm6,0(%rdi) movdqu 64(%rsi),%xmm6 movdqu %xmm11,16(%rdi) 
movdqu 80(%rsi),%xmm11 movdqu %xmm2,32(%rdi) movdqu 96(%rsi),%xmm2 movdqu %xmm7,48(%rdi) movdqu 112(%rsi),%xmm7 leaq 128(%rsi),%rsi pxor 16(%rsp),%xmm6 pxor %xmm13,%xmm11 pxor %xmm5,%xmm2 pxor %xmm1,%xmm7 movdqu %xmm6,64(%rdi) movdqu 0(%rsi),%xmm6 movdqu %xmm11,80(%rdi) movdqu 16(%rsi),%xmm11 movdqu %xmm2,96(%rdi) movdqu 32(%rsi),%xmm2 movdqu %xmm7,112(%rdi) leaq 128(%rdi),%rdi movdqu 48(%rsi),%xmm7 pxor 32(%rsp),%xmm6 pxor %xmm10,%xmm11 pxor %xmm14,%xmm2 pxor %xmm8,%xmm7 movdqu %xmm6,0(%rdi) movdqu 64(%rsi),%xmm6 movdqu %xmm11,16(%rdi) movdqu 80(%rsi),%xmm11 movdqu %xmm2,32(%rdi) movdqu 96(%rsi),%xmm2 movdqu %xmm7,48(%rdi) movdqu 112(%rsi),%xmm7 leaq 128(%rsi),%rsi pxor 48(%rsp),%xmm6 pxor %xmm15,%xmm11 pxor %xmm9,%xmm2 pxor %xmm3,%xmm7 movdqu %xmm6,64(%rdi) movdqu %xmm11,80(%rdi) movdqu %xmm2,96(%rdi) movdqu %xmm7,112(%rdi) leaq 128(%rdi),%rdi subq $256,%rdx jnz .Loop_outer4x jmp .Ldone4x .Ltail4x: cmpq $192,%rdx jae .L192_or_more4x cmpq $128,%rdx jae .L128_or_more4x cmpq $64,%rdx jae .L64_or_more4x xorq %r10,%r10 movdqa %xmm12,16(%rsp) movdqa %xmm4,32(%rsp) movdqa %xmm0,48(%rsp) jmp .Loop_tail4x .align 32 .L64_or_more4x: movdqu 0(%rsi),%xmm6 movdqu 16(%rsi),%xmm11 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm7 pxor 0(%rsp),%xmm6 pxor %xmm12,%xmm11 pxor %xmm4,%xmm2 pxor %xmm0,%xmm7 movdqu %xmm6,0(%rdi) movdqu %xmm11,16(%rdi) movdqu %xmm2,32(%rdi) movdqu %xmm7,48(%rdi) je .Ldone4x movdqa 16(%rsp),%xmm6 leaq 64(%rsi),%rsi xorq %r10,%r10 movdqa %xmm6,0(%rsp) movdqa %xmm13,16(%rsp) leaq 64(%rdi),%rdi movdqa %xmm5,32(%rsp) subq $64,%rdx movdqa %xmm1,48(%rsp) jmp .Loop_tail4x .align 32 .L128_or_more4x: movdqu 0(%rsi),%xmm6 movdqu 16(%rsi),%xmm11 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm7 pxor 0(%rsp),%xmm6 pxor %xmm12,%xmm11 pxor %xmm4,%xmm2 pxor %xmm0,%xmm7 movdqu %xmm6,0(%rdi) movdqu 64(%rsi),%xmm6 movdqu %xmm11,16(%rdi) movdqu 80(%rsi),%xmm11 movdqu %xmm2,32(%rdi) movdqu 96(%rsi),%xmm2 movdqu %xmm7,48(%rdi) movdqu 112(%rsi),%xmm7 pxor 16(%rsp),%xmm6 pxor %xmm13,%xmm11 pxor %xmm5,%xmm2 pxor %xmm1,%xmm7 movdqu %xmm6,64(%rdi) movdqu %xmm11,80(%rdi) movdqu %xmm2,96(%rdi) movdqu %xmm7,112(%rdi) je .Ldone4x movdqa 32(%rsp),%xmm6 leaq 128(%rsi),%rsi xorq %r10,%r10 movdqa %xmm6,0(%rsp) movdqa %xmm10,16(%rsp) leaq 128(%rdi),%rdi movdqa %xmm14,32(%rsp) subq $128,%rdx movdqa %xmm8,48(%rsp) jmp .Loop_tail4x .align 32 .L192_or_more4x: movdqu 0(%rsi),%xmm6 movdqu 16(%rsi),%xmm11 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm7 pxor 0(%rsp),%xmm6 pxor %xmm12,%xmm11 pxor %xmm4,%xmm2 pxor %xmm0,%xmm7 movdqu %xmm6,0(%rdi) movdqu 64(%rsi),%xmm6 movdqu %xmm11,16(%rdi) movdqu 80(%rsi),%xmm11 movdqu %xmm2,32(%rdi) movdqu 96(%rsi),%xmm2 movdqu %xmm7,48(%rdi) movdqu 112(%rsi),%xmm7 leaq 128(%rsi),%rsi pxor 16(%rsp),%xmm6 pxor %xmm13,%xmm11 pxor %xmm5,%xmm2 pxor %xmm1,%xmm7 movdqu %xmm6,64(%rdi) movdqu 0(%rsi),%xmm6 movdqu %xmm11,80(%rdi) movdqu 16(%rsi),%xmm11 movdqu %xmm2,96(%rdi) movdqu 32(%rsi),%xmm2 movdqu %xmm7,112(%rdi) leaq 128(%rdi),%rdi movdqu 48(%rsi),%xmm7 pxor 32(%rsp),%xmm6 pxor %xmm10,%xmm11 pxor %xmm14,%xmm2 pxor %xmm8,%xmm7 movdqu %xmm6,0(%rdi) movdqu %xmm11,16(%rdi) movdqu %xmm2,32(%rdi) movdqu %xmm7,48(%rdi) je .Ldone4x movdqa 48(%rsp),%xmm6 leaq 64(%rsi),%rsi xorq %r10,%r10 movdqa %xmm6,0(%rsp) movdqa %xmm15,16(%rsp) leaq 64(%rdi),%rdi movdqa %xmm9,32(%rsp) subq $192,%rdx movdqa %xmm3,48(%rsp) .Loop_tail4x: movzbl (%rsi,%r10,1),%eax movzbl (%rsp,%r10,1),%ecx leaq 1(%r10),%r10 xorl %ecx,%eax movb %al,-1(%rdi,%r10,1) decq %rdx jnz .Loop_tail4x .Ldone4x: leaq (%r9),%rsp .cfi_def_cfa_register %rsp 
.L4x_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ChaCha20_4x,.-ChaCha20_4x +.type ChaCha20_4xop,@function +.align 32 +ChaCha20_4xop: +.cfi_startproc +.LChaCha20_4xop: + movq %rsp,%r9 +.cfi_def_cfa_register %r9 + subq $0x140+8,%rsp + vzeroupper + + vmovdqa .Lsigma(%rip),%xmm11 + vmovdqu (%rcx),%xmm3 + vmovdqu 16(%rcx),%xmm15 + vmovdqu (%r8),%xmm7 + leaq 256(%rsp),%rcx + + vpshufd $0x00,%xmm11,%xmm8 + vpshufd $0x55,%xmm11,%xmm9 + vmovdqa %xmm8,64(%rsp) + vpshufd $0xaa,%xmm11,%xmm10 + vmovdqa %xmm9,80(%rsp) + vpshufd $0xff,%xmm11,%xmm11 + vmovdqa %xmm10,96(%rsp) + vmovdqa %xmm11,112(%rsp) + + vpshufd $0x00,%xmm3,%xmm0 + vpshufd $0x55,%xmm3,%xmm1 + vmovdqa %xmm0,128-256(%rcx) + vpshufd $0xaa,%xmm3,%xmm2 + vmovdqa %xmm1,144-256(%rcx) + vpshufd $0xff,%xmm3,%xmm3 + vmovdqa %xmm2,160-256(%rcx) + vmovdqa %xmm3,176-256(%rcx) + + vpshufd $0x00,%xmm15,%xmm12 + vpshufd $0x55,%xmm15,%xmm13 + vmovdqa %xmm12,192-256(%rcx) + vpshufd $0xaa,%xmm15,%xmm14 + vmovdqa %xmm13,208-256(%rcx) + vpshufd $0xff,%xmm15,%xmm15 + vmovdqa %xmm14,224-256(%rcx) + vmovdqa %xmm15,240-256(%rcx) + + vpshufd $0x00,%xmm7,%xmm4 + vpshufd $0x55,%xmm7,%xmm5 + vpaddd .Linc(%rip),%xmm4,%xmm4 + vpshufd $0xaa,%xmm7,%xmm6 + vmovdqa %xmm5,272-256(%rcx) + vpshufd $0xff,%xmm7,%xmm7 + vmovdqa %xmm6,288-256(%rcx) + vmovdqa %xmm7,304-256(%rcx) + + jmp .Loop_enter4xop + +.align 32 +.Loop_outer4xop: + vmovdqa 64(%rsp),%xmm8 + vmovdqa 80(%rsp),%xmm9 + vmovdqa 96(%rsp),%xmm10 + vmovdqa 112(%rsp),%xmm11 + vmovdqa 128-256(%rcx),%xmm0 + vmovdqa 144-256(%rcx),%xmm1 + vmovdqa 160-256(%rcx),%xmm2 + vmovdqa 176-256(%rcx),%xmm3 + vmovdqa 192-256(%rcx),%xmm12 + vmovdqa 208-256(%rcx),%xmm13 + vmovdqa 224-256(%rcx),%xmm14 + vmovdqa 240-256(%rcx),%xmm15 + vmovdqa 256-256(%rcx),%xmm4 + vmovdqa 272-256(%rcx),%xmm5 + vmovdqa 288-256(%rcx),%xmm6 + vmovdqa 304-256(%rcx),%xmm7 + vpaddd .Lfour(%rip),%xmm4,%xmm4 + +.Loop_enter4xop: + movl $10,%eax + vmovdqa %xmm4,256-256(%rcx) + jmp .Loop4xop + +.align 32 +.Loop4xop: + vpaddd %xmm0,%xmm8,%xmm8 + vpaddd %xmm1,%xmm9,%xmm9 + vpaddd %xmm2,%xmm10,%xmm10 + vpaddd %xmm3,%xmm11,%xmm11 + vpxor %xmm4,%xmm8,%xmm4 + vpxor %xmm5,%xmm9,%xmm5 + vpxor %xmm6,%xmm10,%xmm6 + vpxor %xmm7,%xmm11,%xmm7 +.byte 143,232,120,194,228,16 +.byte 143,232,120,194,237,16 +.byte 143,232,120,194,246,16 +.byte 143,232,120,194,255,16 + vpaddd %xmm4,%xmm12,%xmm12 + vpaddd %xmm5,%xmm13,%xmm13 + vpaddd %xmm6,%xmm14,%xmm14 + vpaddd %xmm7,%xmm15,%xmm15 + vpxor %xmm0,%xmm12,%xmm0 + vpxor %xmm1,%xmm13,%xmm1 + vpxor %xmm14,%xmm2,%xmm2 + vpxor %xmm15,%xmm3,%xmm3 +.byte 143,232,120,194,192,12 +.byte 143,232,120,194,201,12 +.byte 143,232,120,194,210,12 +.byte 143,232,120,194,219,12 + vpaddd %xmm8,%xmm0,%xmm8 + vpaddd %xmm9,%xmm1,%xmm9 + vpaddd %xmm2,%xmm10,%xmm10 + vpaddd %xmm3,%xmm11,%xmm11 + vpxor %xmm4,%xmm8,%xmm4 + vpxor %xmm5,%xmm9,%xmm5 + vpxor %xmm6,%xmm10,%xmm6 + vpxor %xmm7,%xmm11,%xmm7 +.byte 143,232,120,194,228,8 +.byte 143,232,120,194,237,8 +.byte 143,232,120,194,246,8 +.byte 143,232,120,194,255,8 + vpaddd %xmm4,%xmm12,%xmm12 + vpaddd %xmm5,%xmm13,%xmm13 + vpaddd %xmm6,%xmm14,%xmm14 + vpaddd %xmm7,%xmm15,%xmm15 + vpxor %xmm0,%xmm12,%xmm0 + vpxor %xmm1,%xmm13,%xmm1 + vpxor %xmm14,%xmm2,%xmm2 + vpxor %xmm15,%xmm3,%xmm3 +.byte 143,232,120,194,192,7 +.byte 143,232,120,194,201,7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,219,7 + vpaddd %xmm1,%xmm8,%xmm8 + vpaddd %xmm2,%xmm9,%xmm9 + vpaddd %xmm3,%xmm10,%xmm10 + vpaddd %xmm0,%xmm11,%xmm11 + vpxor %xmm7,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm4 + vpxor %xmm5,%xmm10,%xmm5 + vpxor %xmm6,%xmm11,%xmm6 +.byte 
143,232,120,194,255,16 +.byte 143,232,120,194,228,16 +.byte 143,232,120,194,237,16 +.byte 143,232,120,194,246,16 + vpaddd %xmm7,%xmm14,%xmm14 + vpaddd %xmm4,%xmm15,%xmm15 + vpaddd %xmm5,%xmm12,%xmm12 + vpaddd %xmm6,%xmm13,%xmm13 + vpxor %xmm1,%xmm14,%xmm1 + vpxor %xmm2,%xmm15,%xmm2 + vpxor %xmm12,%xmm3,%xmm3 + vpxor %xmm13,%xmm0,%xmm0 +.byte 143,232,120,194,201,12 +.byte 143,232,120,194,210,12 +.byte 143,232,120,194,219,12 +.byte 143,232,120,194,192,12 + vpaddd %xmm8,%xmm1,%xmm8 + vpaddd %xmm9,%xmm2,%xmm9 + vpaddd %xmm3,%xmm10,%xmm10 + vpaddd %xmm0,%xmm11,%xmm11 + vpxor %xmm7,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm4 + vpxor %xmm5,%xmm10,%xmm5 + vpxor %xmm6,%xmm11,%xmm6 +.byte 143,232,120,194,255,8 +.byte 143,232,120,194,228,8 +.byte 143,232,120,194,237,8 +.byte 143,232,120,194,246,8 + vpaddd %xmm7,%xmm14,%xmm14 + vpaddd %xmm4,%xmm15,%xmm15 + vpaddd %xmm5,%xmm12,%xmm12 + vpaddd %xmm6,%xmm13,%xmm13 + vpxor %xmm1,%xmm14,%xmm1 + vpxor %xmm2,%xmm15,%xmm2 + vpxor %xmm12,%xmm3,%xmm3 + vpxor %xmm13,%xmm0,%xmm0 +.byte 143,232,120,194,201,7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,219,7 +.byte 143,232,120,194,192,7 + decl %eax + jnz .Loop4xop + + vpaddd 64(%rsp),%xmm8,%xmm8 + vpaddd 80(%rsp),%xmm9,%xmm9 + vpaddd 96(%rsp),%xmm10,%xmm10 + vpaddd 112(%rsp),%xmm11,%xmm11 + + vmovdqa %xmm14,32(%rsp) + vmovdqa %xmm15,48(%rsp) + + vpunpckldq %xmm9,%xmm8,%xmm14 + vpunpckldq %xmm11,%xmm10,%xmm15 + vpunpckhdq %xmm9,%xmm8,%xmm8 + vpunpckhdq %xmm11,%xmm10,%xmm10 + vpunpcklqdq %xmm15,%xmm14,%xmm9 + vpunpckhqdq %xmm15,%xmm14,%xmm14 + vpunpcklqdq %xmm10,%xmm8,%xmm11 + vpunpckhqdq %xmm10,%xmm8,%xmm8 + vpaddd 128-256(%rcx),%xmm0,%xmm0 + vpaddd 144-256(%rcx),%xmm1,%xmm1 + vpaddd 160-256(%rcx),%xmm2,%xmm2 + vpaddd 176-256(%rcx),%xmm3,%xmm3 + + vmovdqa %xmm9,0(%rsp) + vmovdqa %xmm14,16(%rsp) + vmovdqa 32(%rsp),%xmm9 + vmovdqa 48(%rsp),%xmm14 + + vpunpckldq %xmm1,%xmm0,%xmm10 + vpunpckldq %xmm3,%xmm2,%xmm15 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm15,%xmm10,%xmm1 + vpunpckhqdq %xmm15,%xmm10,%xmm10 + vpunpcklqdq %xmm2,%xmm0,%xmm3 + vpunpckhqdq %xmm2,%xmm0,%xmm0 + vpaddd 192-256(%rcx),%xmm12,%xmm12 + vpaddd 208-256(%rcx),%xmm13,%xmm13 + vpaddd 224-256(%rcx),%xmm9,%xmm9 + vpaddd 240-256(%rcx),%xmm14,%xmm14 + + vpunpckldq %xmm13,%xmm12,%xmm2 + vpunpckldq %xmm14,%xmm9,%xmm15 + vpunpckhdq %xmm13,%xmm12,%xmm12 + vpunpckhdq %xmm14,%xmm9,%xmm9 + vpunpcklqdq %xmm15,%xmm2,%xmm13 + vpunpckhqdq %xmm15,%xmm2,%xmm2 + vpunpcklqdq %xmm9,%xmm12,%xmm14 + vpunpckhqdq %xmm9,%xmm12,%xmm12 + vpaddd 256-256(%rcx),%xmm4,%xmm4 + vpaddd 272-256(%rcx),%xmm5,%xmm5 + vpaddd 288-256(%rcx),%xmm6,%xmm6 + vpaddd 304-256(%rcx),%xmm7,%xmm7 + + vpunpckldq %xmm5,%xmm4,%xmm9 + vpunpckldq %xmm7,%xmm6,%xmm15 + vpunpckhdq %xmm5,%xmm4,%xmm4 + vpunpckhdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm15,%xmm9,%xmm5 + vpunpckhqdq %xmm15,%xmm9,%xmm9 + vpunpcklqdq %xmm6,%xmm4,%xmm7 + vpunpckhqdq %xmm6,%xmm4,%xmm4 + vmovdqa 0(%rsp),%xmm6 + vmovdqa 16(%rsp),%xmm15 + + cmpq $256,%rdx + jb .Ltail4xop + + vpxor 0(%rsi),%xmm6,%xmm6 + vpxor 16(%rsi),%xmm1,%xmm1 + vpxor 32(%rsi),%xmm13,%xmm13 + vpxor 48(%rsi),%xmm5,%xmm5 + vpxor 64(%rsi),%xmm15,%xmm15 + vpxor 80(%rsi),%xmm10,%xmm10 + vpxor 96(%rsi),%xmm2,%xmm2 + vpxor 112(%rsi),%xmm9,%xmm9 + leaq 128(%rsi),%rsi + vpxor 0(%rsi),%xmm11,%xmm11 + vpxor 16(%rsi),%xmm3,%xmm3 + vpxor 32(%rsi),%xmm14,%xmm14 + vpxor 48(%rsi),%xmm7,%xmm7 + vpxor 64(%rsi),%xmm8,%xmm8 + vpxor 80(%rsi),%xmm0,%xmm0 + vpxor 96(%rsi),%xmm12,%xmm12 + vpxor 112(%rsi),%xmm4,%xmm4 + leaq 128(%rsi),%rsi + + vmovdqu 
%xmm6,0(%rdi) + vmovdqu %xmm1,16(%rdi) + vmovdqu %xmm13,32(%rdi) + vmovdqu %xmm5,48(%rdi) + vmovdqu %xmm15,64(%rdi) + vmovdqu %xmm10,80(%rdi) + vmovdqu %xmm2,96(%rdi) + vmovdqu %xmm9,112(%rdi) + leaq 128(%rdi),%rdi + vmovdqu %xmm11,0(%rdi) + vmovdqu %xmm3,16(%rdi) + vmovdqu %xmm14,32(%rdi) + vmovdqu %xmm7,48(%rdi) + vmovdqu %xmm8,64(%rdi) + vmovdqu %xmm0,80(%rdi) + vmovdqu %xmm12,96(%rdi) + vmovdqu %xmm4,112(%rdi) + leaq 128(%rdi),%rdi + + subq $256,%rdx + jnz .Loop_outer4xop + + jmp .Ldone4xop + +.align 32 +.Ltail4xop: + cmpq $192,%rdx + jae .L192_or_more4xop + cmpq $128,%rdx + jae .L128_or_more4xop + cmpq $64,%rdx + jae .L64_or_more4xop + + xorq %r10,%r10 + vmovdqa %xmm6,0(%rsp) + vmovdqa %xmm1,16(%rsp) + vmovdqa %xmm13,32(%rsp) + vmovdqa %xmm5,48(%rsp) + jmp .Loop_tail4xop + +.align 32 +.L64_or_more4xop: + vpxor 0(%rsi),%xmm6,%xmm6 + vpxor 16(%rsi),%xmm1,%xmm1 + vpxor 32(%rsi),%xmm13,%xmm13 + vpxor 48(%rsi),%xmm5,%xmm5 + vmovdqu %xmm6,0(%rdi) + vmovdqu %xmm1,16(%rdi) + vmovdqu %xmm13,32(%rdi) + vmovdqu %xmm5,48(%rdi) + je .Ldone4xop + + leaq 64(%rsi),%rsi + vmovdqa %xmm15,0(%rsp) + xorq %r10,%r10 + vmovdqa %xmm10,16(%rsp) + leaq 64(%rdi),%rdi + vmovdqa %xmm2,32(%rsp) + subq $64,%rdx + vmovdqa %xmm9,48(%rsp) + jmp .Loop_tail4xop + +.align 32 +.L128_or_more4xop: + vpxor 0(%rsi),%xmm6,%xmm6 + vpxor 16(%rsi),%xmm1,%xmm1 + vpxor 32(%rsi),%xmm13,%xmm13 + vpxor 48(%rsi),%xmm5,%xmm5 + vpxor 64(%rsi),%xmm15,%xmm15 + vpxor 80(%rsi),%xmm10,%xmm10 + vpxor 96(%rsi),%xmm2,%xmm2 + vpxor 112(%rsi),%xmm9,%xmm9 + + vmovdqu %xmm6,0(%rdi) + vmovdqu %xmm1,16(%rdi) + vmovdqu %xmm13,32(%rdi) + vmovdqu %xmm5,48(%rdi) + vmovdqu %xmm15,64(%rdi) + vmovdqu %xmm10,80(%rdi) + vmovdqu %xmm2,96(%rdi) + vmovdqu %xmm9,112(%rdi) + je .Ldone4xop + + leaq 128(%rsi),%rsi + vmovdqa %xmm11,0(%rsp) + xorq %r10,%r10 + vmovdqa %xmm3,16(%rsp) + leaq 128(%rdi),%rdi + vmovdqa %xmm14,32(%rsp) + subq $128,%rdx + vmovdqa %xmm7,48(%rsp) + jmp .Loop_tail4xop + +.align 32 +.L192_or_more4xop: + vpxor 0(%rsi),%xmm6,%xmm6 + vpxor 16(%rsi),%xmm1,%xmm1 + vpxor 32(%rsi),%xmm13,%xmm13 + vpxor 48(%rsi),%xmm5,%xmm5 + vpxor 64(%rsi),%xmm15,%xmm15 + vpxor 80(%rsi),%xmm10,%xmm10 + vpxor 96(%rsi),%xmm2,%xmm2 + vpxor 112(%rsi),%xmm9,%xmm9 + leaq 128(%rsi),%rsi + vpxor 0(%rsi),%xmm11,%xmm11 + vpxor 16(%rsi),%xmm3,%xmm3 + vpxor 32(%rsi),%xmm14,%xmm14 + vpxor 48(%rsi),%xmm7,%xmm7 + + vmovdqu %xmm6,0(%rdi) + vmovdqu %xmm1,16(%rdi) + vmovdqu %xmm13,32(%rdi) + vmovdqu %xmm5,48(%rdi) + vmovdqu %xmm15,64(%rdi) + vmovdqu %xmm10,80(%rdi) + vmovdqu %xmm2,96(%rdi) + vmovdqu %xmm9,112(%rdi) + leaq 128(%rdi),%rdi + vmovdqu %xmm11,0(%rdi) + vmovdqu %xmm3,16(%rdi) + vmovdqu %xmm14,32(%rdi) + vmovdqu %xmm7,48(%rdi) + je .Ldone4xop + + leaq 64(%rsi),%rsi + vmovdqa %xmm8,0(%rsp) + xorq %r10,%r10 + vmovdqa %xmm0,16(%rsp) + leaq 64(%rdi),%rdi + vmovdqa %xmm12,32(%rsp) + subq $192,%rdx + vmovdqa %xmm4,48(%rsp) + +.Loop_tail4xop: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail4xop + +.Ldone4xop: + vzeroupper + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.L4xop_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ChaCha20_4xop,.-ChaCha20_4xop +.type ChaCha20_8x,@function +.align 32 +ChaCha20_8x: +.cfi_startproc +.LChaCha20_8x: + movq %rsp,%r9 +.cfi_def_cfa_register %r9 + subq $0x280+8,%rsp + andq $-32,%rsp + vzeroupper + + + + + + + + + + + vbroadcasti128 .Lsigma(%rip),%ymm11 + vbroadcasti128 (%rcx),%ymm3 + vbroadcasti128 16(%rcx),%ymm15 + vbroadcasti128 (%r8),%ymm7 + leaq 
256(%rsp),%rcx + leaq 512(%rsp),%rax + leaq .Lrot16(%rip),%r10 + leaq .Lrot24(%rip),%r11 + + vpshufd $0x00,%ymm11,%ymm8 + vpshufd $0x55,%ymm11,%ymm9 + vmovdqa %ymm8,128-256(%rcx) + vpshufd $0xaa,%ymm11,%ymm10 + vmovdqa %ymm9,160-256(%rcx) + vpshufd $0xff,%ymm11,%ymm11 + vmovdqa %ymm10,192-256(%rcx) + vmovdqa %ymm11,224-256(%rcx) + + vpshufd $0x00,%ymm3,%ymm0 + vpshufd $0x55,%ymm3,%ymm1 + vmovdqa %ymm0,256-256(%rcx) + vpshufd $0xaa,%ymm3,%ymm2 + vmovdqa %ymm1,288-256(%rcx) + vpshufd $0xff,%ymm3,%ymm3 + vmovdqa %ymm2,320-256(%rcx) + vmovdqa %ymm3,352-256(%rcx) + + vpshufd $0x00,%ymm15,%ymm12 + vpshufd $0x55,%ymm15,%ymm13 + vmovdqa %ymm12,384-512(%rax) + vpshufd $0xaa,%ymm15,%ymm14 + vmovdqa %ymm13,416-512(%rax) + vpshufd $0xff,%ymm15,%ymm15 + vmovdqa %ymm14,448-512(%rax) + vmovdqa %ymm15,480-512(%rax) + + vpshufd $0x00,%ymm7,%ymm4 + vpshufd $0x55,%ymm7,%ymm5 + vpaddd .Lincy(%rip),%ymm4,%ymm4 + vpshufd $0xaa,%ymm7,%ymm6 + vmovdqa %ymm5,544-512(%rax) + vpshufd $0xff,%ymm7,%ymm7 + vmovdqa %ymm6,576-512(%rax) + vmovdqa %ymm7,608-512(%rax) + + jmp .Loop_enter8x + +.align 32 +.Loop_outer8x: + vmovdqa 128-256(%rcx),%ymm8 + vmovdqa 160-256(%rcx),%ymm9 + vmovdqa 192-256(%rcx),%ymm10 + vmovdqa 224-256(%rcx),%ymm11 + vmovdqa 256-256(%rcx),%ymm0 + vmovdqa 288-256(%rcx),%ymm1 + vmovdqa 320-256(%rcx),%ymm2 + vmovdqa 352-256(%rcx),%ymm3 + vmovdqa 384-512(%rax),%ymm12 + vmovdqa 416-512(%rax),%ymm13 + vmovdqa 448-512(%rax),%ymm14 + vmovdqa 480-512(%rax),%ymm15 + vmovdqa 512-512(%rax),%ymm4 + vmovdqa 544-512(%rax),%ymm5 + vmovdqa 576-512(%rax),%ymm6 + vmovdqa 608-512(%rax),%ymm7 + vpaddd .Leight(%rip),%ymm4,%ymm4 + +.Loop_enter8x: + vmovdqa %ymm14,64(%rsp) + vmovdqa %ymm15,96(%rsp) + vbroadcasti128 (%r10),%ymm15 + vmovdqa %ymm4,512-512(%rax) + movl $10,%eax + jmp .Loop8x + +.align 32 +.Loop8x: + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $12,%ymm0,%ymm14 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $12,%ymm1,%ymm15 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $7,%ymm0,%ymm15 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $7,%ymm1,%ymm14 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vmovdqa %ymm12,0(%rsp) + vmovdqa %ymm13,32(%rsp) + vmovdqa 64(%rsp),%ymm12 + vmovdqa 96(%rsp),%ymm13 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $12,%ymm2,%ymm14 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $12,%ymm3,%ymm15 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $7,%ymm2,%ymm15 + vpsrld 
$25,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $7,%ymm3,%ymm14 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $12,%ymm1,%ymm14 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $12,%ymm2,%ymm15 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $7,%ymm1,%ymm15 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $7,%ymm2,%ymm14 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vmovdqa %ymm12,64(%rsp) + vmovdqa %ymm13,96(%rsp) + vmovdqa 0(%rsp),%ymm12 + vmovdqa 32(%rsp),%ymm13 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $12,%ymm3,%ymm14 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $12,%ymm0,%ymm15 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $7,%ymm3,%ymm15 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $7,%ymm0,%ymm14 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + decl %eax + jnz .Loop8x + + leaq 512(%rsp),%rax + vpaddd 128-256(%rcx),%ymm8,%ymm8 + vpaddd 160-256(%rcx),%ymm9,%ymm9 + vpaddd 192-256(%rcx),%ymm10,%ymm10 + vpaddd 224-256(%rcx),%ymm11,%ymm11 + + vpunpckldq %ymm9,%ymm8,%ymm14 + vpunpckldq %ymm11,%ymm10,%ymm15 + vpunpckhdq %ymm9,%ymm8,%ymm8 + vpunpckhdq %ymm11,%ymm10,%ymm10 + vpunpcklqdq %ymm15,%ymm14,%ymm9 + vpunpckhqdq %ymm15,%ymm14,%ymm14 + vpunpcklqdq %ymm10,%ymm8,%ymm11 + vpunpckhqdq %ymm10,%ymm8,%ymm8 + vpaddd 256-256(%rcx),%ymm0,%ymm0 + vpaddd 288-256(%rcx),%ymm1,%ymm1 + vpaddd 320-256(%rcx),%ymm2,%ymm2 + vpaddd 352-256(%rcx),%ymm3,%ymm3 + + vpunpckldq %ymm1,%ymm0,%ymm10 + vpunpckldq %ymm3,%ymm2,%ymm15 + vpunpckhdq %ymm1,%ymm0,%ymm0 + vpunpckhdq %ymm3,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm10,%ymm1 + vpunpckhqdq %ymm15,%ymm10,%ymm10 + vpunpcklqdq %ymm2,%ymm0,%ymm3 + vpunpckhqdq %ymm2,%ymm0,%ymm0 + vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 + vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 + vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 + vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 + vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 + vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 + vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 + vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 + vmovdqa %ymm15,0(%rsp) + vmovdqa %ymm9,32(%rsp) + vmovdqa 64(%rsp),%ymm15 + vmovdqa 96(%rsp),%ymm9 + + vpaddd 384-512(%rax),%ymm12,%ymm12 + vpaddd 416-512(%rax),%ymm13,%ymm13 + vpaddd 448-512(%rax),%ymm15,%ymm15 + vpaddd 
480-512(%rax),%ymm9,%ymm9 + + vpunpckldq %ymm13,%ymm12,%ymm2 + vpunpckldq %ymm9,%ymm15,%ymm8 + vpunpckhdq %ymm13,%ymm12,%ymm12 + vpunpckhdq %ymm9,%ymm15,%ymm15 + vpunpcklqdq %ymm8,%ymm2,%ymm13 + vpunpckhqdq %ymm8,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm12,%ymm9 + vpunpckhqdq %ymm15,%ymm12,%ymm12 + vpaddd 512-512(%rax),%ymm4,%ymm4 + vpaddd 544-512(%rax),%ymm5,%ymm5 + vpaddd 576-512(%rax),%ymm6,%ymm6 + vpaddd 608-512(%rax),%ymm7,%ymm7 + + vpunpckldq %ymm5,%ymm4,%ymm15 + vpunpckldq %ymm7,%ymm6,%ymm8 + vpunpckhdq %ymm5,%ymm4,%ymm4 + vpunpckhdq %ymm7,%ymm6,%ymm6 + vpunpcklqdq %ymm8,%ymm15,%ymm5 + vpunpckhqdq %ymm8,%ymm15,%ymm15 + vpunpcklqdq %ymm6,%ymm4,%ymm7 + vpunpckhqdq %ymm6,%ymm4,%ymm4 + vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 + vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 + vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 + vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 + vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 + vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 + vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 + vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 + vmovdqa 0(%rsp),%ymm6 + vmovdqa 32(%rsp),%ymm12 + + cmpq $512,%rdx + jb .Ltail8x + + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + leaq 128(%rsi),%rsi + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm12,%ymm12 + vpxor 32(%rsi),%ymm13,%ymm13 + vpxor 64(%rsi),%ymm10,%ymm10 + vpxor 96(%rsi),%ymm15,%ymm15 + leaq 128(%rsi),%rsi + vmovdqu %ymm12,0(%rdi) + vmovdqu %ymm13,32(%rdi) + vmovdqu %ymm10,64(%rdi) + vmovdqu %ymm15,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm14,%ymm14 + vpxor 32(%rsi),%ymm2,%ymm2 + vpxor 64(%rsi),%ymm3,%ymm3 + vpxor 96(%rsi),%ymm7,%ymm7 + leaq 128(%rsi),%rsi + vmovdqu %ymm14,0(%rdi) + vmovdqu %ymm2,32(%rdi) + vmovdqu %ymm3,64(%rdi) + vmovdqu %ymm7,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm11,%ymm11 + vpxor 32(%rsi),%ymm9,%ymm9 + vpxor 64(%rsi),%ymm0,%ymm0 + vpxor 96(%rsi),%ymm4,%ymm4 + leaq 128(%rsi),%rsi + vmovdqu %ymm11,0(%rdi) + vmovdqu %ymm9,32(%rdi) + vmovdqu %ymm0,64(%rdi) + vmovdqu %ymm4,96(%rdi) + leaq 128(%rdi),%rdi + + subq $512,%rdx + jnz .Loop_outer8x + + jmp .Ldone8x + +.Ltail8x: + cmpq $448,%rdx + jae .L448_or_more8x + cmpq $384,%rdx + jae .L384_or_more8x + cmpq $320,%rdx + jae .L320_or_more8x + cmpq $256,%rdx + jae .L256_or_more8x + cmpq $192,%rdx + jae .L192_or_more8x + cmpq $128,%rdx + jae .L128_or_more8x + cmpq $64,%rdx + jae .L64_or_more8x + + xorq %r10,%r10 + vmovdqa %ymm6,0(%rsp) + vmovdqa %ymm8,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L64_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + je .Ldone8x + + leaq 64(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm1,0(%rsp) + leaq 64(%rdi),%rdi + subq $64,%rdx + vmovdqa %ymm5,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L128_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + je .Ldone8x + + leaq 128(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm12,0(%rsp) + leaq 128(%rdi),%rdi + subq $128,%rdx + vmovdqa %ymm13,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L192_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + 
vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + je .Ldone8x + + leaq 192(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm10,0(%rsp) + leaq 192(%rdi),%rdi + subq $192,%rdx + vmovdqa %ymm15,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L256_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + je .Ldone8x + + leaq 256(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm14,0(%rsp) + leaq 256(%rdi),%rdi + subq $256,%rdx + vmovdqa %ymm2,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L320_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + je .Ldone8x + + leaq 320(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm3,0(%rsp) + leaq 320(%rdi),%rdi + subq $320,%rdx + vmovdqa %ymm7,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L384_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + je .Ldone8x + + leaq 384(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm11,0(%rsp) + leaq 384(%rdi),%rdi + subq $384,%rdx + vmovdqa %ymm9,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L448_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vpxor 384(%rsi),%ymm11,%ymm11 + vpxor 416(%rsi),%ymm9,%ymm9 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + vmovdqu %ymm11,384(%rdi) + vmovdqu %ymm9,416(%rdi) + je .Ldone8x + + leaq 448(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm0,0(%rsp) + leaq 448(%rdi),%rdi + subq $448,%rdx + vmovdqa %ymm4,32(%rsp) + +.Loop_tail8x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + 
leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail8x + +.Ldone8x: + vzeroall + leaq (%r9),%rsp +.cfi_def_cfa_register %rsp +.L8x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ChaCha20_8x,.-ChaCha20_8x Index: stable/12/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/ecp_nistz256-x86_64.S (revision 364963) @@ -1,5292 +1,7345 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from ecp_nistz256-x86_64.pl. */ .text .globl ecp_nistz256_precomputed .type ecp_nistz256_precomputed,@object .align 4096 ecp_nistz256_precomputed: .long 0x18a9143c,0x79e730d4,0x5fedb601,0x75ba95fc,0x77622510,0x79fb732b,0xa53755c6,0x18905f76,0xce95560a,0xddf25357,0xba19e45c,0x8b4ab8e4,0xdd21f325,0xd2e88688,0x25885d85,0x8571ff18 .long 0x10ddd64d,0x850046d4,0xa433827d,0xaa6ae3c1,0x8d1490d9,0x73220503,0x3dcf3a3b,0xf6bb32e4,0x61bee1a5,0x2f3648d3,0xeb236ff8,0x152cd7cb,0x92042dbe,0x19a8fb0e,0x0a5b8a3b,0x78c57751 .long 0x4eebc127,0xffac3f90,0x087d81fb,0xb027f84a,0x87cbbc98,0x66ad77dd,0xb6ff747e,0x26936a3f,0xc983a7eb,0xb04c5c1f,0x0861fe1a,0x583e47ad,0x1a2ee98e,0x78820831,0xe587cc07,0xd5f06a29 .long 0x46918dcc,0x74b0b50d,0xc623c173,0x4650a6ed,0xe8100af2,0x0cdaacac,0x41b0176b,0x577362f5,0xe4cbaba6,0x2d96f24c,0xfad6f447,0x17628471,0xe5ddd22e,0x6b6c36de,0x4c5ab863,0x84b14c39 .long 0xc45c61f5,0xbe1b8aae,0x94b9537d,0x90ec649a,0xd076c20c,0x941cb5aa,0x890523c8,0xc9079605,0xe7ba4f10,0xeb309b4a,0xe5eb882b,0x73c568ef,0x7e7a1f68,0x3540a987,0x2dd1e916,0x73a076bb .long 0x3e77664a,0x40394737,0x346cee3e,0x55ae744f,0x5b17a3ad,0xd50a961a,0x54213673,0x13074b59,0xd377e44b,0x93d36220,0xadff14b5,0x299c2b53,0xef639f11,0xf424d44c,0x4a07f75f,0xa4c9916d .long 0xa0173b4f,0x0746354e,0xd23c00f7,0x2bd20213,0x0c23bb08,0xf43eaab5,0xc3123e03,0x13ba5119,0x3f5b9d4d,0x2847d030,0x5da67bdd,0x6742f2f2,0x77c94195,0xef933bdc,0x6e240867,0xeaedd915 .long 0x9499a78f,0x27f14cd1,0x6f9b3455,0x462ab5c5,0xf02cfc6b,0x8f90f02a,0xb265230d,0xb763891e,0x532d4977,0xf59da3a9,0xcf9eba15,0x21e3327d,0xbe60bbf0,0x123c7b84,0x7706df76,0x56ec12f2 .long 0x264e20e8,0x75c96e8f,0x59a7a841,0xabe6bfed,0x44c8eb00,0x2cc09c04,0xf0c4e16b,0xe05b3080,0xa45f3314,0x1eb7777a,0xce5d45e3,0x56af7bed,0x88b12f1a,0x2b6e019a,0xfd835f9b,0x086659cd .long 0x9dc21ec8,0x2c18dbd1,0x0fcf8139,0x98f9868a,0x48250b49,0x737d2cd6,0x24b3428f,0xcc61c947,0x80dd9e76,0x0c2b4078,0x383fbe08,0xc43a8991,0x779be5d2,0x5f7d2d65,0xeb3b4ab5,0x78719a54 .long 0x6245e404,0xea7d260a,0x6e7fdfe0,0x9de40795,0x8dac1ab5,0x1ff3a415,0x649c9073,0x3e7090f1,0x2b944e88,0x1a768561,0xe57f61c8,0x250f939e,0x1ead643d,0x0c0daa89,0xe125b88e,0x68930023 .long 0xd2697768,0x04b71aa7,0xca345a33,0xabdedef5,0xee37385e,0x2409d29d,0xcb83e156,0x4ee1df77,0x1cbb5b43,0x0cac12d9,0xca895637,0x170ed2f6,0x8ade6d66,0x28228cfa,0x53238aca,0x7ff57c95 .long 0x4b2ed709,0xccc42563,0x856fd30d,0x0e356769,0x559e9811,0xbcbcd43f,0x5395b759,0x738477ac,0xc00ee17f,0x35752b90,0x742ed2e3,0x68748390,0xbd1f5bc1,0x7cd06422,0xc9e7b797,0xfbc08769 .long 0xb0cf664a,0xa242a35b,0x7f9707e3,0x126e48f7,0xc6832660,0x1717bf54,0xfd12c72e,0xfaae7332,0x995d586b,0x27b52db7,0x832237c2,0xbe29569e,0x2a65e7db,0xe8e4193e,0x2eaa1bbb,0x152706dc .long 0xbc60055b,0x72bcd8b7,0x56e27e4b,0x03cc23ee,0xe4819370,0xee337424,0x0ad3da09,0xe2aa0e43,0x6383c45d,0x40b8524f,0x42a41b25,0xd7663554,0x778a4797,0x64efa6de,0x7079adf4,0x2042170a .long 
0x0bc6fb80,0x808b0b65,0x3ffe2e6b,0x5882e075,0x2c83f549,0xd5ef2f7c,0x9103b723,0x54d63c80,0x52a23f9b,0xf2f11bd6,0x4b0b6587,0x3670c319,0xb1580e9e,0x55c4623b,0x01efe220,0x64edf7b2 .long 0xd53c5c9d,0x97091dcb,0xac0a177b,0xf17624b6,0x2cfe2dff,0xb0f13975,0x6c7a574e,0xc1a35c0a,0x93e79987,0x227d3146,0xe89cb80e,0x0575bf30,0x0d1883bb,0x2f4e247f,0x3274c3d0,0xebd51226 .long 0x56ada97a,0x5f3e51c8,0x8f8b403e,0x4afc964d,0x412e2979,0xa6f247ab,0x6f80ebda,0x675abd1b,0x5e485a1d,0x66a2bd72,0x8f4f0b3c,0x4b2a5caf,0x1b847bba,0x2626927f,0x0502394d,0x6c6fc7d9 .long 0xa5659ae8,0xfea912ba,0x25e1a16e,0x68363aba,0x752c41ac,0xb8842277,0x2897c3fc,0xfe545c28,0xdc4c696b,0x2d36e9e7,0xfba977c5,0x5806244a,0xe39508c1,0x85665e9b,0x6d12597b,0xf720ee25 .long 0xd2337a31,0x8a979129,0x0f862bdc,0x5916868f,0x5dd283ba,0x048099d9,0xfe5bfb4e,0xe2d1eeb6,0x7884005d,0x82ef1c41,0xffffcbae,0xa2d4ec17,0x8aa95e66,0x9161c53f,0xc5fee0d0,0x5ee104e1 .long 0xc135b208,0x562e4cec,0x4783f47d,0x74e1b265,0x5a3f3b30,0x6d2a506c,0xc16762fc,0xecead9f4,0xe286e5b9,0xf29dd4b2,0x83bb3c61,0x1b0fadc0,0x7fac29a4,0x7a75023e,0xc9477fa3,0xc086d5f1 .long 0x2f6f3076,0x0fc61135,0xe3912a9a,0xc99ffa23,0xd2f8ba3d,0x6a0b0685,0xe93358a4,0xfdc777e8,0x35415f04,0x94a787bb,0x4d23fea4,0x640c2d6a,0x153a35b5,0x9de917da,0x5d5cd074,0x793e8d07 .long 0x2de45068,0xf4f87653,0x9e2e1f6e,0x37c7a7e8,0xa3584069,0xd0825fa2,0x1727bf42,0xaf2cea7c,0x9e4785a9,0x0360a4fb,0x27299f4a,0xe5fda49c,0x71ac2f71,0x48068e13,0x9077666f,0x83d0687b .long 0x15d02819,0x6d3883b2,0x40dd9a35,0x6d0d7550,0x1d2b469f,0x61d7cbf9,0x2efc3115,0xf97b232f,0xb24bcbc7,0xa551d750,0x88a1e356,0x11ea4949,0x93cb7501,0x7669f031,0xca737b8a,0x595dc55e .long 0xd837879f,0xa4a319ac,0xed6b67b0,0x6fc1b49e,0x32f1f3af,0xe3959933,0x65432a2e,0x966742eb,0xb4966228,0x4b8dc9fe,0x43f43950,0x96cc6312,0xc9b731ee,0x12068859,0x56f79968,0x7b948dc3 .long 0xed1f8008,0x61e4ad32,0xd8b17538,0xe6c9267a,0x857ff6fb,0x1ac7c5eb,0x55f2fb10,0x994baaa8,0x1d248018,0x84cf14e1,0x628ac508,0x5a39898b,0x5fa944f5,0x14fde97b,0xd12e5ac7,0xed178030 .long 0x97e2feb4,0x042c2af4,0xaebf7313,0xd36a42d7,0x084ffdd7,0x49d2c9eb,0x2ef7c76a,0x9f8aa54b,0x09895e70,0x9200b7ba,0xddb7fb58,0x3bd0c66f,0x78eb4cbb,0x2d97d108,0xd84bde31,0x2d431068 .long 0x172ccd1f,0x4b523eb7,0x30a6a892,0x7323cb28,0xcfe153eb,0x97082ec0,0xf2aadb97,0xe97f6b6a,0xd1a83da1,0x1d3d393e,0x804b2a68,0xa6a7f9c7,0x2d0cb71e,0x4a688b48,0x40585278,0xa9b4cc5f .long 0xcb66e132,0x5e5db46a,0x0d925880,0xf1be963a,0x0317b9e2,0x944a7027,0x48603d48,0xe266f959,0x5c208899,0x98db6673,0xa2fb18a3,0x90472447,0x777c619f,0x8a966939,0x2a3be21b,0x3798142a .long 0x3298b343,0xb4241cb1,0xb44f65a1,0xa3a14e49,0x3ac77acd,0xc5f4d6cd,0x52b6fc3c,0xd0288cb5,0x1c040abc,0xd5cc8c2f,0x06bf9b4a,0xb675511e,0x9b3aa441,0xd667da37,0x51601f72,0x460d45ce .long 0x6755ff89,0xe2f73c69,0x473017e6,0xdd3cf7e7,0x3cf7600d,0x8ef5689d,0xb1fc87b4,0x948dc4f8,0x4ea53299,0xd9e9fe81,0x98eb6028,0x2d921ca2,0x0c9803fc,0xfaecedfd,0x4d7b4745,0xf38ae891 .long 0xc5e3a3d8,0xd8c5fccf,0x4079dfbf,0xbefd904c,0xfead0197,0xbc6d6a58,0x695532a4,0x39227077,0xdbef42f5,0x09e23e6d,0x480a9908,0x7e449b64,0xad9a2e40,0x7b969c1a,0x9591c2a4,0x6231d792 .long 0x0f664534,0x87151456,0x4b68f103,0x85ceae7c,0x65578ab9,0xac09c4ae,0xf044b10c,0x33ec6868,0x3a8ec1f1,0x6ac4832b,0x5847d5ef,0x5509d128,0x763f1574,0xf909604f,0xc32f63c4,0xb16c4303 .long 0x7ca23cd3,0xb6ab2014,0xa391849d,0xcaa7a5c6,0x75678d94,0x5b0673a3,0xdd303e64,0xc982ddd4,0x5db6f971,0xfd7b000b,0x6f876f92,0xbba2cb1f,0x3c569426,0xc77332a3,0x570d74f8,0xa159100c .long 
0xdec67ef5,0xfd16847f,0x233e76b7,0x742ee464,0xefc2b4c8,0x0b8e4134,0x42a3e521,0xca640b86,0x8ceb6aa9,0x653a0190,0x547852d5,0x313c300c,0x6b237af7,0x24e4ab12,0x8bb47af8,0x2ba90162 .long 0xa8219bb7,0x3d5e58d6,0x1b06c57f,0xc691d0bd,0xd257576e,0x0ae4cb10,0xd54a3dc3,0x3569656c,0x94cda03a,0xe5ebaebd,0x162bfe13,0x934e82d3,0xe251a0c6,0x450ac0ba,0xdd6da526,0x480b9e11 .long 0x8cce08b5,0x00467bc5,0x7f178d55,0xb636458c,0xa677d806,0xc5748bae,0xdfa394eb,0x2763a387,0x7d3cebb6,0xa12b448a,0x6f20d850,0xe7adda3e,0x1558462c,0xf63ebce5,0x620088a8,0x58b36143 .long 0x4d63c0ee,0x8a2cc3ca,0x0fe948ce,0x51233117,0x222ef33b,0x7463fd85,0x7c603d6c,0xadf0c7dc,0xfe7765e5,0x0ec32d3b,0xbf380409,0xccaab359,0x8e59319c,0xbdaa84d6,0x9c80c34d,0xd9a4c280 .long 0xa059c142,0xa9d89488,0xff0b9346,0x6f5ae714,0x16fb3664,0x068f237d,0x363186ac,0x5853e4c4,0x63c52f98,0xe2d87d23,0x81828876,0x2ec4a766,0xe14e7b1c,0x47b864fa,0x69192408,0x0c0bc0e5 .long 0xb82e9f3e,0xe4d7681d,0xdf25e13c,0x83200f0b,0x66f27280,0x8909984c,0x75f73227,0x462d7b00,0xf2651798,0xd90ba188,0x36ab1c34,0x74c6e18c,0x5ef54359,0xab256ea3,0xd1aa702f,0x03466612 .long 0x2ed22e91,0x624d6049,0x6f072822,0x6fdfe0b5,0x39ce2271,0xeeca1115,0xdb01614f,0x98100a4f,0xa35c628f,0xb6b0daa2,0xc87e9a47,0xb6f94d2e,0x1d57d9ce,0xc6773259,0x03884a7b,0xf70bfeec .long 0xed2bad01,0x5fb35ccf,0x1da6a5c7,0xa155cbe3,0x30a92f8f,0xc2e2594c,0x5bfafe43,0x649c89ce,0xe9ff257a,0xd158667d,0xf32c50ae,0x9b359611,0x906014cf,0x4b00b20b,0x89bc7d3d,0xf3a8cfe3 .long 0x248a7d06,0x4ff23ffd,0x878873fa,0x80c5bfb4,0x05745981,0xb7d9ad90,0x3db01994,0x179c85db,0x61a6966c,0xba41b062,0xeadce5a8,0x4d82d052,0xa5e6a318,0x9e91cd3b,0x95b2dda0,0x47795f4f .long 0xd55a897c,0xecfd7c1f,0xb29110fb,0x009194ab,0xe381d3b0,0x5f0e2046,0xa98dd291,0x5f3425f6,0x730d50da,0xbfa06687,0x4b083b7f,0x0423446c,0xd69d3417,0x397a247d,0x387ba42a,0xeb629f90 .long 0xd5cd79bf,0x1ee426cc,0x946c6e18,0x0032940b,0x57477f58,0x1b1e8ae0,0x6d823278,0xe94f7d34,0x782ba21a,0xc747cb96,0xf72b33a5,0xc5254469,0xc7f80c81,0x772ef6de,0x2cd9e6b5,0xd73acbfe .long 0x49ee90d9,0x4075b5b1,0xa06e9eba,0x785c339a,0xabf825e0,0xa1030d5b,0xa42931dc,0xcec684c3,0xc1586e63,0x42ab62c9,0x5ab43f2b,0x45431d66,0x55f7835d,0x57c8b2c0,0xc1b7f865,0x033da338 .long 0xcaa76097,0x283c7513,0x36c83906,0x0a624fa9,0x715af2c7,0x6b20afec,0xeba78bfd,0x4b969974,0xd921d60e,0x220755cc,0x7baeca13,0x9b944e10,0x5ded93d4,0x04819d51,0x6dddfd27,0x9bbff86e .long 0x77adc612,0x6b344130,0xbbd803a0,0xa7496529,0x6d8805bd,0x1a1baaa7,0x470343ad,0xc8403902,0x175adff1,0x39f59f66,0xb7d8c5b7,0x0b26d7fb,0x529d75e3,0xa875f5ce,0x41325cc2,0x85efc7e9 .long 0x1ff6acd3,0x21950b42,0x53dc6909,0xffe70484,0x28766127,0xff4cd0b2,0x4fb7db2b,0xabdbe608,0x5e1109e8,0x837c9228,0xf4645b5a,0x26147d27,0xf7818ed8,0x4d78f592,0xf247fa36,0xd394077e .long 0x488c171a,0x0fb9c2d0,0x13685278,0xa78bfbaa,0xd5b1fa6a,0xedfbe268,0x2b7eaba7,0x0dceb8db,0x9ae2b710,0xbf9e8089,0xa4449c96,0xefde7ae6,0xcc143a46,0x43b7716b,0xc3628c13,0xd7d34194 .long 0x3b3f64c9,0x508cec1c,0x1e5edf3f,0xe20bc0ba,0x2f4318d4,0xda1deb85,0x5c3fa443,0xd20ebe0d,0x73241ea3,0x370b4ea7,0x5e1a5f65,0x61f1511c,0x82681c62,0x99a5e23d,0xa2f54c2d,0xd731e383 .long 0x83445904,0x2692f36e,0xaf45f9c0,0x2e0ec469,0xc67528b7,0x905a3201,0xd0e5e542,0x88f77f34,0x5864687c,0xf67a8d29,0x22df3562,0x23b92eae,0x9bbec39e,0x5c27014b,0x9c0f0f8d,0x7ef2f226 .long 0x546c4d8d,0x97359638,0x92f24679,0x5f9c3fc4,0xa8c8acd9,0x912e8bed,0x306634b0,0xec3a318d,0xc31cb264,0x80167f41,0x522113f2,0x3db82f6f,0xdcafe197,0xb155bcd2,0x43465283,0xfba1da59 .long 
0xb212cf53,0xa0425b8e,0xf8557c5f,0x4f2e512e,0x25c4d56c,0xc1286ff9,0xee26c851,0xbb8a0fea,0xe7d6107e,0xc28f70d2,0xe76265aa,0x7ee0c444,0x1d1936b1,0x3df277a4,0xea9595eb,0x1a556e3f .long 0xe7305683,0x258bbbf9,0x07ef5be6,0x31eea5bf,0x46c814c1,0x0deb0e4a,0xa7b730dd,0x5cee8449,0xa0182bde,0xeab495c5,0x9e27a6b4,0xee759f87,0x80e518ca,0xc2cf6a68,0xf14cf3f4,0x25e8013f .long 0x7e8d7a14,0x8fc44140,0x9556f36a,0xbb1ff3ca,0x14600044,0x6a844385,0x7451ae63,0xba3f0c4a,0x1f9af32a,0xdfcac25b,0xb1f2214b,0x01e0db86,0xa4b596ac,0x4e9a5bc2,0x026c2c08,0x83927681 .long 0x7acaca28,0x3ec832e7,0xc7385b29,0x1bfeea57,0xfd1eaf38,0x068212e3,0x6acf8ccc,0xc1329830,0x2aac9e59,0xb909f2db,0xb661782a,0x5748060d,0xc79b7a01,0xc5ab2632,0x00017626,0xda44c6c6 .long 0xa7ea82f0,0xf26c00e8,0xe4299aaf,0x99cac80d,0x7ed78be1,0xd66fe3b6,0x648d02cd,0x305f725f,0x623fb21b,0x33ed1bc4,0x7a6319ad,0xfa70533e,0xbe5ffb3e,0x17ab562d,0x56674741,0x06374994 .long 0x5c46aa8e,0x69d44ed6,0xa8d063d1,0x2100d5d3,0xa2d17c36,0xcb9727ea,0x8add53b7,0x4c2bab1b,0x15426704,0xa084e90c,0xa837ebea,0x778afcd3,0x7ce477f8,0x6651f701,0x46fb7a8b,0xa0624998 .long 0xed8a6e19,0xdc1e6828,0x4189d9c7,0x33fc2336,0x671c39bc,0x026f8fe2,0xbc6f9915,0xd40c4ccd,0xf80e75ca,0xafa135bb,0x22adff2c,0x12c651a0,0x4f51ad96,0xc40a04bd,0xbbe4e832,0x04820109 .long 0x7f4c04cc,0x3667eb1a,0xa9404f84,0x59556621,0x7eceb50a,0x71cdf653,0x9b8335fa,0x994a44a6,0xdbeb9b69,0xd7faf819,0xeed4350d,0x473c5680,0xda44bba2,0xb6658466,0x872bdbf3,0x0d1bc780 .long 0xa1962f91,0xe535f175,0xed58f5a7,0x6ed7e061,0x2089a233,0x177aa4c0,0xe539b413,0x0dbcb03a,0xbb32e38e,0xe3dc424e,0x6806701e,0x6472e5ef,0x814be9ee,0xdd47ff98,0x35ace009,0x6b60cfff .long 0x9ff91fe5,0xb8d3d931,0xf0518eed,0x039c4800,0x9182cb26,0x95c37632,0x82fc568d,0x0763a434,0x383e76ba,0x707c04d5,0x824e8197,0xac98b930,0x91230de0,0x92bf7c8f,0x40959b70,0x90876a01 .long 0x05968b80,0xdb6d96f3,0x089f73b9,0x380a0913,0xc2c61e01,0x7da70b83,0x569b38c7,0x95fb8394,0x80edfe2f,0x9a3c6512,0x8faeaf82,0x8f726bb9,0x78424bf8,0x8010a4a0,0x0e844970,0x29672044 .long 0x7a2ad62a,0x63c5cb81,0xac62ff54,0x7ef2b6b9,0xb3ad9db5,0x3749bba4,0x46d5a617,0xad311f2c,0xc2ff3b6d,0xb77a8087,0x367834ff,0xb46feaf3,0x75d6b138,0xf8aa266d,0xec008188,0xfa38d320 .long 0x696946fc,0x486d8ffa,0xb9cba56d,0x50fbc6d8,0x90f35a15,0x7e3d423e,0xc0dd962c,0x7c3da195,0x3cfd5d8b,0xe673fdb0,0x889dfca5,0x0704b7c2,0xf52305aa,0xf6ce581f,0x914d5e53,0x399d49eb .long 0x6ec293cd,0x380a496d,0x8e7051f5,0x733dbda7,0xb849140a,0x037e388d,0x5946dbf6,0xee4b32b0,0xcae368d1,0xb1c4fda9,0xfdb0b2f3,0x5001a7b0,0x2e3ac46e,0x6df59374,0x39b3e656,0x4af675f2 .long 0x39949296,0x44e38110,0x361db1b5,0x5b63827b,0x206eaff5,0x3e5323ed,0xc21f4290,0x942370d2,0xe0d985a1,0xf2caaf2e,0x7239846d,0x192cc64b,0xae6312f8,0x7c0b8f47,0x96620108,0x7dc61f91 .long 0xc2da7de9,0xb830fb5b,0x0ff8d3be,0xd0e643df,0x188a9641,0x31ee77ba,0xbcf6d502,0x4e8aa3aa,0x9a49110f,0xf9fb6532,0x2dd6b220,0xd18317f6,0x52c3ea5a,0x7e3ced41,0x7d579c4a,0x0d296a14 .long 0xed4c3717,0x35d6a53e,0x3d0ed2a3,0x9f8240cf,0xe5543aa5,0x8c0d4d05,0xdd33b4b4,0x45d5bbfb,0x137fd28e,0xfa04cc73,0xc73b3ffd,0x862ac6ef,0x31f51ef2,0x403ff9f5,0xbc73f5a2,0x34d5e0fc .long 0x08913f4f,0xf2526820,0xeac93d95,0xea20ed61,0x6ca6b26c,0x51ed38b4,0xea4327b0,0x8662dcbc,0x725d2aaa,0x6daf295c,0x8e52dcda,0xbad2752f,0x0b17dacc,0x2210e721,0xd51e8232,0xa37f7912 .long 0x44cc3add,0x4f7081e1,0x87be82cf,0xd5ffa1d6,0x0edd6472,0x89890b6c,0x3ed17863,0xada26e1a,0x63483caa,0x276f2715,0x2f6077fd,0xe6924cd9,0x0a466e3c,0x05a7fe98,0xb1902d1f,0xf1c794b0 .long 
0x82a8042c,0xe5213688,0xcd278298,0xd931cfaf,0xf597a740,0x069a0ae0,0xeb59107c,0x0adbb3f3,0x5eaa8eb8,0x983e951e,0x11b48e78,0xe663a8b5,0x8a03f2c5,0x1631cc0d,0x11e271e2,0x7577c11e .long 0x08369a90,0x33b2385c,0x190eb4f8,0x2990c59b,0xc68eac80,0x819a6145,0x2ec4a014,0x7a786d62,0x20ac3a8d,0x33faadbe,0x5aba2d30,0x31a21781,0xdba4f565,0x209d2742,0x55aa0fbb,0xdb2ce9e3 .long 0x168984df,0x8cef334b,0x33879638,0xe81dce17,0x263720f0,0xf6e6949c,0xf593cbec,0x5c56feaf,0xfde58c84,0x8bff5601,0x2eccb314,0x74e24117,0x4c9a8a78,0xbcf01b61,0x544c9868,0xa233e35e .long 0x8bd7aff1,0xb3156bf3,0x1d81b146,0x1b5ee4cb,0xd628a915,0x7ba1ac41,0xfd89699e,0x8f3a8f9c,0xa0748be7,0x7329b9c9,0xa92e621f,0x1d391c95,0x4d10a837,0xe51e6b21,0x4947b435,0xd255f53a .long 0xf1788ee3,0x07669e04,0xa86938a2,0xc14f27af,0xe93a01c0,0x8b47a334,0xd9366808,0xff627438,0xca2a5965,0x7a0985d8,0xd6e9b9b3,0x3d9a5542,0x4cf972e8,0xc23eb80b,0x4fdf72fd,0x5c1c33bb .long 0x74a86108,0x0c4a58d4,0xee4c5d90,0xf8048a8f,0xe86d4c80,0xe3c7c924,0x056a1e60,0x28c889de,0xb214a040,0x57e2662e,0x37e10347,0xe8c48e98,0x80ac748a,0x87742862,0x186b06f2,0xf1c24022 .long 0x5f74040a,0xac2dd4c3,0xfceac957,0x409aeb71,0x55c4ec23,0x4fbad782,0x8a7b76ec,0xb359ed61,0xed6f4a60,0x12744926,0x4b912de3,0xe21e8d7f,0xfc705a59,0xe2575a59,0xed2dbc0e,0x72f1d4de .long 0xeb7926b8,0x3d2b24b9,0xcdbe5509,0xbff88cb3,0xe4dd640b,0xd0f399af,0x2f76ed45,0x3c5fe130,0x3764fb3d,0x6f3562f4,0x3151b62d,0x7b5af318,0xd79ce5f3,0xd5bd0bc7,0xec66890f,0xfdaf6b20 .long 0x6063540c,0x735c67ec,0xe5f9cb8f,0x50b259c2,0x3f99c6ab,0xb8734f9a,0xa3a7bc85,0xf8cc13d5,0xc5217659,0x80c1b305,0x4ec12a54,0xfe5364d4,0x681345fe,0xbd87045e,0x582f897f,0x7f8efeb1 .long 0xd5923359,0xe8cbf1e5,0x539b9fb0,0xdb0cea9d,0x49859b98,0x0c5b34cf,0xa4403cc6,0x5e583c56,0xd48185b7,0x11fc1a2d,0x6e521787,0xc93fbc7e,0x05105b8b,0x47e7a058,0xdb8260c8,0x7b4d4d58 .long 0x46eb842a,0xe33930b0,0x7bdae56d,0x8e844a9a,0x13f7fdfc,0x34ef3a9e,0x636ca176,0xb3768f82,0x4e09e61c,0x2821f4e0,0xa0c7cddc,0x414dc3a1,0x54945fcd,0xd5379437,0xb3555ff1,0x151b6eef .long 0x6339c083,0xb31bd613,0xdfb64701,0x39ff8155,0xe29604ab,0x7c3388d2,0xa6b10442,0x1e19084b,0xeccd47ef,0x17cf54c0,0x4a5dfb30,0x89693385,0x47daf9f6,0x69d023fb,0x7d91d959,0x9222840b .long 0x803bac62,0x439108f5,0x379bd45f,0x0b7dd91d,0xca63c581,0xd651e827,0x509c104f,0x5c5d75f6,0x1f2dc308,0x7d5fc738,0xd98454be,0x20faa7bf,0xa517b031,0x95374bee,0x642692ac,0xf036b9b1 .long 0x39842194,0xc5106109,0x49d05295,0xb7e2353e,0xefb42ee0,0xfc8c1d5c,0x08ce811c,0xe04884eb,0x7419f40e,0xf1f75d81,0xa995c241,0x5b0ac162,0xc4c55646,0x120921bb,0x8d33cf97,0x713520c2 .long 0xe98c5100,0xb4a65a5c,0x2ddd0f5a,0x6cec871d,0x9ba2e78b,0x251f0b7f,0xce3a2a5f,0x224a8434,0x25f5c46f,0x26827f61,0x48545ec0,0x6a22bedc,0xb1bb5cdc,0x25ae5fa0,0xfcb9b98f,0xd693682f .long 0x91e5d7d3,0x32027fe8,0x73a07678,0xf14b7d17,0xc0dfdd61,0xf88497b3,0x2a8c4f48,0xf7c2eec0,0x3756e621,0xaa5573f4,0x1825b948,0xc013a240,0x63878572,0x1c03b345,0x653a4184,0xa0472bea .long 0x0ac69a80,0xf4222e27,0xf51e54f6,0x34096d25,0x8fffa591,0x00a648cb,0x69b6527f,0x4e87acdc,0xe285ccb4,0x0575e037,0x50ddcf52,0x188089e4,0x870ff719,0xaa96c9a8,0x1fc7e369,0x74a56cd8 .long 0x1726931a,0x41d04ee2,0x3660ecfd,0x0bbbb2c8,0x24818e18,0xa6ef6de5,0xe7d57887,0xe421cc51,0xbea87be6,0xf127d208,0xb1cdd682,0x16a475d3,0x439b63f7,0x9db1b684,0xf0f113b6,0x5359b3db .long 0x8bf06e31,0xdfccf1de,0xdd383901,0x1fdf8f44,0x5017e7d2,0x10775cad,0x58d11eef,0xdfc3a597,0xb1ecff10,0x6ec9c8a0,0x28400549,0xee6ed6cc,0x1b4f8d73,0xb5ad7bae,0xe00aaab9,0x61b4f11d .long 
0xd4eff2d7,0x7b32d69b,0x4288b60f,0x88ae6771,0x37a1e723,0x159461b4,0x570aae8c,0x1f3d4789,0x7f9871da,0x869118c0,0xf635e278,0x35fbda78,0xe1541dac,0x738f3641,0xc0dae45f,0x6794b13a .long 0x09cc0917,0x065064ac,0xc68540fd,0x27c53729,0xef227671,0x0d2d4c8e,0xa1785a04,0xd23a9f80,0x52650359,0x98c59528,0x74a1acad,0xfa09ad01,0x0b55bf5c,0x082d5a29,0x419b8084,0xa40f1c67 .long 0xdcc18770,0x3a5c752e,0x8825c3a5,0x4baf1f2f,0x21b153ed,0xebd63f74,0xb2f64723,0xa2383e47,0x2646d19a,0xe7bf620a,0x03c83ffd,0x56cb44ec,0x4f6be9f1,0xaf7267c9,0xc06bb5e9,0x8b2dfd7b .long 0xa672c5c7,0xb87072f2,0x0d53c5e2,0xeacb11c8,0xff435932,0x22dac29d,0x4408693c,0x37bdb99d,0x2899c20f,0xf6e62fb6,0x447ece24,0x3535d512,0xff577ce3,0xfbdc6b88,0x190575f2,0x726693bd .long 0xab4b35a2,0x6772b0e5,0xf5eeaacf,0x1d8b6001,0x795b9580,0x728f7ce4,0x41fb81da,0x4a20ed2a,0x4fec01e6,0x9f685cd4,0xa7ff50ad,0x3ed7ddcc,0x0c2d97fd,0x460fd264,0xeb82f4f9,0x3a241426 .long 0x6a8ea820,0x17d1df2c,0xf22cc254,0xb2b50d3b,0xb7291426,0x03856cba,0x04f5ee39,0x87fd26ae,0x02bee4ba,0x9cb696cc,0x06820fd6,0x53121804,0x0212e985,0xa5dfc269,0x160f9a09,0x666f7ffa .long 0xbccd9617,0xc503cd33,0xba7730a3,0x365dede4,0x5ddb0786,0x798c6355,0xfc9cd3bc,0xa6c3200e,0xe5e35efd,0x060ffb2c,0x5555a1c1,0x99a4e25b,0xf70b3751,0x11d95375,0x160e1bf6,0x0a57354a .long 0xf8e4b065,0xecb3ae4b,0x2e53022b,0x07a834c4,0x8692ed96,0x1cd300b3,0x61ee14ec,0x16a6f792,0x6a8649ed,0x8f1063c6,0x869f3e14,0xfbcdfcfe,0x00a7b3ec,0x2cfb97c1,0x7130c2f1,0xcea49b3c .long 0xe9d96488,0x462d044f,0x8182a0c1,0x4b53d52e,0x0391e9e9,0x84b6ddd3,0xb1741a09,0x80ab7b48,0x27d3317f,0xec0e15d4,0x1a64671e,0x8dfc1ddb,0xd49c5b92,0x93cc5d5f,0x3674a331,0xc995d53d .long 0x090090ae,0x302e41ec,0xedb06830,0x2278a0cc,0xfbc99690,0x1d025932,0xb80d68da,0x0c32fbd2,0xf341a6c1,0xd79146da,0x1bef68a0,0xae0ba139,0x8d774b3a,0xc6b8a563,0x880ba4d7,0x1cf307bd .long 0x19803511,0xc033bdc7,0x8888c3be,0xa9f97b3b,0x85c6d05e,0x3d68aebc,0x193919eb,0xc3b88a9d,0xc48b0ee3,0x2d300748,0x07a746c1,0x7506bc7c,0x6e6d57f3,0xfc48437c,0xcfeaa91a,0x5bd71587 .long 0xc1bc5225,0xa4ed0408,0x2719226d,0xd0b946db,0x758d2d43,0x109ecd62,0x2751759b,0x75c8485a,0x9ce4177a,0xb0b75f49,0x79c10c3d,0x4fa61a1e,0xa167fcd7,0xc062d300,0x750f0fa8,0x4df3874c .long 0x83dfedc9,0x29ae2cf9,0x8d87631a,0xf8437134,0x7429c8d2,0xaf571711,0x146d9272,0x18d15867,0x69769bb7,0x83053ecf,0xc479ab82,0xc55eb856,0x21b0f4b2,0x5ef7791c,0x3d491525,0xaa5956ba .long 0x9fe20eba,0x407a96c2,0xe52a5ad3,0xf27168bb,0xbf1d9d89,0x43b60ab3,0x710e727a,0xe45c51ef,0x099b4221,0xdfca5276,0x2557a159,0x8dc6407c,0x91035895,0x0ead8335,0x9c55dc32,0x0a9db957 .long 0xdf61bc76,0xe40736d3,0x3f778cdb,0x13a619c0,0xc56ea28f,0x6dd921a4,0x2fa647b4,0x76a52433,0xac5bdc5d,0x23591891,0xbac7dc01,0xff4a1a72,0x62df8453,0x9905e261,0xe63b265f,0x3ac045df .long 0xad53dba7,0x8a3f341b,0x837b625a,0x8ec269cc,0x3ae31189,0xd71a2782,0x55e96120,0x8fb4f9a3,0xff9875cf,0x804af823,0x5d442a9b,0x23224f57,0xecc62679,0x1c4d3b9e,0xa0e7ddb1,0x91da22fb .long 0x6c04a661,0xa370324d,0x5e376d17,0x9710d3b6,0x3044e357,0xed8c98f0,0x6422701c,0xc364ebbe,0x7733d61c,0x347f5d51,0xcea826c3,0xd55644b9,0x55a25548,0x80c6e0ad,0x844220a7,0x0aa7641d .long 0x31810660,0x1438ec81,0xde4b4043,0x9dfa6507,0xcc3e0273,0x10b515d8,0x28d8cfb2,0x1b6066dd,0x9c9efebd,0xd3b04591,0xa21c1ff4,0x425d4bdf,0xd57607d3,0x5fe5af19,0x54481084,0xbbf773f7 .long 0x94b03ed1,0x8435bd69,0x634cc546,0xd9ad1de3,0x00e420ca,0x2cf423fc,0xa03096dd,0xeed26d80,0xa4db09d2,0xd7f60be7,0x960622f7,0xf47f569d,0x7296c729,0xe5925fd7,0x26ca2715,0xeff2db26 .long 
0xb913e759,0xa6fcd014,0x8ff4de93,0x53da4786,0xc32068e1,0x14616d79,0xccdf352e,0xb187d664,0x1dc90b59,0xf7afb650,0x7daa1b26,0x8170e943,0x700c0a84,0xc8e3bdd8,0x6482bdfa,0x6e8d345f .long 0xc5c5ea50,0x84cfbfa1,0x67960681,0xd3baf14c,0x0dd50942,0x26398403,0x4716a663,0xe4b7839c,0xe7de6dc0,0xd5f1f794,0x622aa7ce,0x5cd0f4d4,0x59acfeec,0x5295f3f1,0x953e0607,0x8d933552 .long 0x776c5722,0xc7db8ec5,0x2b5f290c,0xdc467e62,0x4ff425a9,0xd4297e70,0x0cf7bb72,0x4be924c1,0xa1892131,0x0d5dc5ae,0xa705c992,0x8bf8a8e3,0x7a305ac5,0x73a0b064,0x9a8c77a8,0x00c9ca4e .long 0x83774bdd,0x5dfee80f,0x85734485,0x63131602,0x914a69a9,0xa1b524ae,0xd4e300d7,0xebc2ffaf,0x7cfa46a5,0x52c93db7,0x21653b50,0x71e6161f,0xa4bc580a,0x3574fc57,0xe1bc1253,0xc09015dd .long 0xd174d7aa,0x4b7b47b2,0xf3a15d04,0x4072d8e8,0xd6fa07ed,0xeeb7d47f,0xedbdafb1,0x6f2b9ff9,0x3760fe8a,0x18c51615,0xf06c6c13,0x7a96e6bf,0x0ea2d071,0x4d7a0410,0x0be2a5ce,0xa1914e9b .long 0xd8a3c5cf,0x5726e357,0x2abb2b13,0x1197ecc3,0x31ae88dd,0x6c0d7f7f,0xfdbb3efe,0x15b20d1a,0x70584039,0xcd06aa26,0xa7dc9747,0x2277c969,0x7855d815,0xbca69587,0x5188b32a,0x899ea238 .long 0x760c1c9d,0x37d9228b,0x9b5c18da,0xc7efbb11,0x19f6dbc5,0x7f0d1bc8,0x07e6905b,0x4875384b,0x3ba8cd86,0xc7c50baa,0xc2905de0,0xb0ce40fb,0x7a231952,0x70840673,0xcf43de26,0xa912a262 .long 0xeb5b76c1,0x9c38ddcc,0x26fc0ab4,0x746f5285,0xd62c269f,0x52a63a50,0x99458621,0x60049c55,0x3c2f7c9e,0xe7f48f82,0x917d5cf3,0x6bd99043,0x8701f469,0xeb1317a8,0x9a449fe0,0xbd3fe2ed .long 0x12ef3d36,0x421e79ca,0x3e7ea5de,0x9ee3c36c,0xcdff36f7,0xe48198b5,0xc6b82228,0xaff4f967,0xc47adb7e,0x15e19dd0,0x032e7dfa,0x45699b23,0x1fae026a,0x40680c8b,0x550dbf4d,0x5a347a48 .long 0x3cef0d7d,0xe652533b,0x2bbb4381,0xd94f7b18,0x0e80f500,0x838752be,0x9e9c9bfb,0x8e6e2488,0x16caca6a,0xc9751697,0x38531ad9,0x866c49d8,0x7151ade1,0xc917e239,0x6037c407,0x2d016ec1 .long 0x00eac3f9,0xa407ccc9,0xe2ed4748,0x835f6280,0x1cc98e0d,0xcc54c347,0xdcb572eb,0x0e969937,0x8f30c9cb,0x1b16c8e8,0x373c4661,0xa606ae75,0x35502cab,0x47aa689b,0x4d9bb64f,0xf89014ae .long 0x31c71f7b,0x202f6a9c,0x296ffe5c,0x01f95aa3,0x53cec3a3,0x5fc06014,0x5f498a45,0xeb991237,0x5d91ba87,0xae9a935e,0x0b564a19,0xc6ac6281,0x3bd44e69,0x8a8fe81c,0x9dd11d45,0x7c8b467f .long 0xea5b8e69,0xf772251f,0xc5b75fbc,0xaeecb3bd,0x887ff0e5,0x1aca3331,0x19f0a131,0xbe5d49ff,0xe5c8646f,0x582c13aa,0x20e19980,0xdbaa12e8,0xf7abbd94,0x8f40f31a,0x1dfc7663,0x1f13f5a8 .long 0xaceb4fc0,0x5d81f1ee,0x5e6f0f42,0x36256002,0x751370c8,0x4b67d6d7,0x03e80589,0x2608b698,0x05268301,0xcfc0d2fc,0x40309212,0xa6943d39,0x1fd0e1c2,0x192a90c2,0x37f1dc76,0xb209f113 .long 0x97bf1298,0xefcc5e06,0x219d639e,0xcbdb6730,0xb81e8c6f,0xd009c116,0x1a7ce2e5,0xa3ffdde3,0xa914d3ba,0xc53fbaaa,0x88df85ee,0x836d500f,0x66ee0751,0xd98dc71b,0x714516fd,0x5a3d7005 .long 0x39eedbba,0x21d3634d,0x0455a46d,0x35cd2e68,0xf9d7eb0c,0xc8cafe65,0x00cefb3e,0xbda3ce9e,0x2c9cf7a4,0xddc17a60,0x7bcb8773,0x01572ee4,0x8c7548df,0xa92b2b01,0xa84600e3,0x732fd309 .long 0x16543a40,0xe22109c7,0xfede3c6c,0x9acafd36,0x6824e614,0xfb206852,0xda25dca0,0x2a4544a9,0x91d60b06,0x25985262,0x28753545,0x281b7be9,0x90f13b27,0xec667b1a,0x940e2eb4,0x33a83aff .long 0xd5d721d5,0x80009862,0x5bd3a182,0x0c3357a3,0x7aa2cda4,0x27f3a83b,0xf6f83085,0xb58ae74e,0x2e6dad6b,0x2a911a81,0xf43d6c5b,0xde286051,0xf996c4d8,0x4bdccc41,0x0ae1e24e,0xe7312ec0 .long 0x6e6485b3,0xf8d112e7,0x771c52f8,0x4d3e24db,0x684a2f6d,0x48e3ee41,0x21d95551,0x7161957d,0xcdb12a6c,0x19631283,0x2e50e164,0xbf3fa882,0x3166cc73,0xf6254b63,0xaee8cc38,0x3aefa7ae .long 
0x3b36f9fd,0x79b0fe62,0xfde19fc0,0x26543b23,0x958482ef,0x136e64a0,0x9b095825,0x23f63771,0xb6a1142e,0x14cfd596,0x335aac0b,0x5ea6aac6,0xf3081dd5,0x86a0e8bd,0x003dc12a,0x5fb89d79 .long 0xf72e34d4,0xf615c33a,0x110eec35,0x0bd9ea40,0xc1dea34e,0x1c12bc5b,0x49ae4699,0x686584c9,0x8c97b942,0x13ad95d3,0x4e5c7562,0x4609561a,0xf2737f89,0x9e94a4ae,0x371c78b6,0xf57594c6 .long 0xe3779ee3,0x0f0165fc,0xbd495d9e,0xe00e7f9d,0x20284e7a,0x1fa4efa2,0x47ac6219,0x4564bade,0xc4708e8e,0x90e6312a,0xa71e9adf,0x4f5725fb,0x3d684b9f,0xe95f55ae,0x1e94b415,0x47f7ccb1 .long 0x8d946581,0x7322851b,0xbdf4a012,0xf0d13133,0x6584dae0,0xa3510f69,0x3c9f6c6d,0x03a7c171,0xe475381a,0x5be97f38,0x85823334,0xca1ba422,0x0be17dda,0xf83cc5c7,0x0b918c0f,0x158b1494 .long 0x522e6b69,0xda3a77e5,0xbbcd6c18,0x69c908c3,0xd924fd56,0x1f1b9e48,0xaa4bb3f7,0x37c64e36,0xee478d7d,0x5a4fdbdf,0x0193f7a0,0xba75c8bc,0x56cd16df,0x84bc1e84,0x46fad151,0x1fb08f08 .long 0x842e9f30,0x8a7cabf9,0x5eab83af,0xa331d4bf,0x017f2a6a,0xd272cfba,0x83aba0e3,0x27560abc,0x0e3a6b75,0x94b83387,0x6b9f50f5,0x25c6aea2,0xb5fdf6d0,0x803d691d,0xe6333514,0x03b77509 .long 0x61a341c1,0x36178903,0x0cfd6142,0x3604dc60,0x8533316c,0x022295eb,0x44af2922,0x3dbde4ac,0x1c7eef69,0x898afc5d,0xd14f4fa1,0x58896805,0x203c21ca,0x05002160,0x40ef730b,0x6f0d1f30 .long 0x196224f8,0x8e8c44d4,0x374d079d,0x75a4ab95,0x7d48f123,0x79085ecc,0x1bf65ad8,0x56f04d31,0xbda602b2,0xe220bf1c,0xf9612c69,0x73ee1742,0x084fd06b,0x76008fc8,0xf11380d1,0x4000ef9f .long 0x12cfe297,0x48201b4b,0x292f74e5,0x3eee129c,0xc9e874e8,0xe1fe114e,0x92c5fc41,0x899b055c,0x3a39c8cf,0x4e477a64,0x78963cc9,0x82f09efe,0xd333f863,0x6fd3fd8f,0xdc949c63,0x85132b2a .long 0x516eb17b,0x7e06a3ab,0xd2c7372b,0x73bec06f,0xba896da6,0xe4f74f55,0x8e9eb40f,0xbb4afef8,0xe61d66b0,0x2d75bec8,0xef29300b,0x02bda4b4,0x026baa5a,0x8bbaa8de,0xa07f4440,0xff54befd .long 0xbe7a2af3,0xbd9b8b1d,0x4fb74a72,0xec51caa9,0x63879697,0xb9937a4b,0xec2687d5,0x7c9a9d20,0x6ef5f014,0x1773e44f,0xe90c6900,0x8abcf412,0x8142161e,0x387bd022,0xfcb6ff2a,0x50393755 .long 0xed6def63,0x9813fd56,0x7d53106c,0x53cf6482,0x431f7ac1,0x991a35bd,0x63e65faf,0xf1e274dd,0x44cc7880,0xf63ffa3c,0x7c256981,0x411a426b,0x93a420e0,0xb698b9fd,0xae53f8fe,0x89fdddc0 .long 0x32398baa,0x766e0722,0x5cfca031,0x205fee42,0x7a029cf2,0xa49f5341,0x4023890d,0xa88c68b8,0x7337aaa8,0xbc275041,0x0eb384f4,0x9ed364ad,0x29aba92f,0xe0816f85,0x04e38a88,0x2e9e1941 .long 0x3dafd2d5,0x57eef44a,0x97ed98d8,0x35d1fae5,0x2307f9b1,0x50628c09,0xd6cba5c6,0x09d84aae,0x88aaa691,0x67071bc7,0xafe6cb03,0x2dea57a9,0x3d78ac01,0xdfe11bb4,0x7fd7aa51,0x7286418c .long 0x77f7195a,0xfabf7709,0xadeb838f,0x8ec86167,0xbb4f012d,0xea1285a8,0x9a3eab3f,0xd6883503,0x309004c2,0xee5d24f8,0x13ffe95e,0xa96e4b76,0xbd223ea4,0x0cdffe12,0xb6739a53,0x8f5c2ee5 .long 0xdd968198,0x5cb4aaa5,0x72413a6c,0xfa131c52,0x9536d903,0x53d46a90,0x48606d8e,0xb270f0d3,0xa053a3bc,0x518c7564,0x1a86caef,0x088254b7,0x0ab5efd0,0xb3ba8cb4,0x4605945d,0x5c59900e .long 0xa1887395,0xecace1dd,0x932a65de,0x40960f36,0x3aa95529,0x9611ff5c,0x7c1e5a36,0xc58215b0,0xf0e1a524,0xd48c9b58,0xf590dfb8,0xb406856b,0x9cd95662,0xc7605e04,0xa33ecf82,0x0dd036ee .long 0xc33156b3,0xa50171ac,0x4a80172e,0xf09d24ea,0x76dc8eef,0x4e1f72c6,0x5e3d44ee,0xe60caadc,0x979b1d8f,0x006ef8a6,0x97788d26,0x60908a1c,0x266feec0,0x6e08f95b,0x22e8c94e,0x618427c2 .long 0x59145a65,0x3d613339,0xfa406337,0xcd9bc368,0x2d8a52a0,0x82d11be3,0x97a1c590,0xf6877b27,0xf5cbdb25,0x837a819b,0xde090249,0x2a4fd1d8,0x74990e5f,0x622a7de7,0x7945511b,0x840fa5a0 .long 
0x6558842d,0x30b974be,0x17f3d0a6,0x70df8c64,0x7542e46d,0x7c803520,0xe4ecc823,0x7251fe7f,0x5e9aac9a,0xe59134cb,0xf0045d71,0x11bb0934,0xdbcb1d4e,0x53e5d9b5,0x92defc91,0x8d97a905 .long 0x7946d3f9,0xfe289327,0x07472273,0xe132bd24,0x1eb6ae86,0xeeeb510c,0xf0595067,0x777708c5,0x1297029e,0x18e2c8cd,0xbbf9305e,0x2c61095c,0x6b85d6d9,0xe466c258,0xda1ea530,0x8ac06c36 .long 0xa1304668,0xa365dc39,0x07f89606,0xe4a9c885,0xacc7228d,0x65a4898f,0x84ca8303,0x3e2347ff,0xea7d23a3,0xa5f6fb77,0x672a71cd,0x2fac257d,0x7e6a44d3,0x6908bef8,0x891d3d7a,0x8ff87566 .long 0x6b0cf82e,0xe58e90b3,0x2615b5e7,0x6438d246,0x669c145a,0x07b1f8fc,0x36f1e1cb,0xb0d8b2da,0xd9184c4d,0x54d5dadb,0xf93d9976,0x3dbb18d5,0xd1147d47,0x0a3e0f56,0xa0a48609,0x2afa8c8d .long 0xbc36742c,0x275353e8,0xeea0ed90,0x898f427e,0x3e477b00,0x26f4947e,0x308741e3,0x8ad8848a,0xd74a2a46,0x6c703c38,0x9ba17ba2,0x5e3e05a9,0x4ab9a9e4,0xc1fa6f66,0x3841d6ec,0x474a2d9a .long 0x653ae326,0x871239ad,0xa74cbb43,0x14bcf72a,0x20d4c083,0x8737650e,0x110ed4af,0x3df86536,0xb53ca555,0xd2d86fe7,0xabd5d538,0x688cb00d,0x1ad38468,0xcf81bda3,0xf01167b6,0x7ccfe3cc .long 0x6c4c1fe6,0xcf4f47e0,0x298bbb79,0x557e1f1a,0x30d45a14,0xf93b974f,0x0baf97c4,0x174a1d2d,0xc51fbf53,0x7a003b30,0xee68b225,0xd8940991,0x1c0f4173,0x5b0aa7b7,0xa20a7153,0x975797c9 .long 0xe3533d77,0x26e08c07,0x2e341c99,0xd7222e6a,0x8d2dc4ed,0x9d60ec3d,0x7c476cf8,0xbdfe0d8f,0x1d056605,0x1fe59ab6,0x86a8551f,0xa9ea9df6,0x47fb8d8c,0x8489941e,0x4a7f1b10,0xfeb874eb .long 0x7ee0d98f,0xfe5fea86,0xdbf61864,0x201ad34b,0x37c031d4,0x45d8fe47,0x795f0822,0xd5f49fae,0xc7f4a40c,0xdb0fb291,0x730ddd92,0x2e69d9c1,0x49d76987,0x754e1054,0x7662db87,0x8a24911d .long 0x60a71676,0x61fc1810,0xf66a8ad1,0xe852d1a8,0x6417231e,0x172bbd65,0x3babb11f,0x0d6de7bd,0xc8e347f8,0x6fde6f88,0x9bd99cc3,0x1c587547,0x34076950,0x78e54ed0,0x796e83ba,0x97f0f334 .long 0x4924867a,0xe4dbe1ce,0x60b84917,0xbd5f51b0,0x3cb09a79,0x37530040,0xff1743d8,0xdb3fe0f8,0x556fa9db,0xed7894d8,0x23412fbf,0xfa262169,0xba7b9291,0x563be0db,0x0c9fb234,0x6ca8b8c0 .long 0xbd763802,0xed406aa9,0x65303da1,0xc21486a0,0xc7e62ec4,0x61ae291e,0xdf99333e,0x622a0492,0xbb7a8ee0,0x7fd80c9d,0x6c01aedb,0xdc2ed3bc,0x08be74ec,0x35c35a12,0x469f671f,0xd540cb1a .long 0xcf84f6c7,0xd16ced4e,0x2d090f43,0x8561fb9c,0x6f239db4,0x7e693d79,0x77bd0d94,0xa736f928,0x2c1950ee,0x07b4d929,0x56dc11b3,0xda177543,0x7a6a878e,0xa5dfbbaa,0x4decb08a,0x1c70cb29 .long 0x6f0f7c50,0xfba28c8b,0x854dcc6d,0xa8eba2b8,0x36b78642,0x5ff8e89a,0xf6873adf,0x070c1c8e,0x6484d2e4,0xbbd3c371,0x0d414129,0xfb78318f,0x6ad93b0b,0x2621a39c,0xa9e917f7,0x979d74c2 .long 0x61fb0428,0xfc195647,0xbee624d4,0x4d78954a,0xb8ae86fd,0xb94896e0,0xc91c8b13,0x6667ac0c,0x43bcf832,0x9f180512,0xa0010137,0xfbadf8b7,0xb3ba8aa7,0xc69b4089,0xe687ce85,0xfac4bacd .long 0x977eab40,0x9164088d,0x2760b390,0x51f4c5b6,0x340dd553,0xd238238f,0xdb1d31c9,0x358566c3,0x5068f5ff,0x3a5ad69e,0xdaff6b06,0xf31435fc,0xd6debff0,0xae549a5b,0x75e01331,0x59e5f0b7 .long 0x98559acf,0x5d492fb8,0x4db79b50,0x96018c2e,0x609f66aa,0x55f4a48f,0x4900a14f,0x1943b3af,0x15a40d39,0xc22496df,0x4c20f7c5,0xb2a44684,0x3b98404c,0x76a35afa,0xff5d1b77,0xbec75725 .long 0xbea06444,0xb67aa163,0xf724b6f2,0x27e95bb2,0xd238c8ab,0x3c20e3e9,0xddd6ae17,0x1213754e,0x716e0f74,0x8c431020,0xffc095c2,0x6679c82e,0xd0ac2932,0x2eb3adf4,0x01bb7a76,0x2cc970d3 .long 0x740f0e66,0x70c71f2f,0x2b6b23cc,0x545c616b,0xb40a8bd7,0x4528cfcb,0x2ab27722,0xff839633,0x025ac99a,0x049127d9,0x2b63e33b,0xd314d4a0,0x28d84519,0xc8c310e7,0xb3bc84ba,0x0fcb8983 .long 
0x38634818,0x2cc52261,0xb44c2e0b,0x501814f4,0x54dfdba3,0xf7e181aa,0xe759718c,0xcfd58ff0,0xd3b507a8,0xf90cdb14,0xc50bdad8,0x57bd478e,0x50e5f9aa,0x29c197e2,0xe40bc855,0x4db6eef8 .long 0xd1fc0654,0x2cc8f21a,0x81269d73,0xc71cc963,0x077f49f9,0xecfbb204,0xca56b793,0xdde92571,0xf97ad8f7,0x9abed6a3,0x924de3bd,0xe6c19d3f,0xa140a800,0x8dce92f4,0x1337af07,0x85f44d1e .long 0x09d64c52,0x5953c08b,0xf5df9749,0xa1b5e49f,0x52735f7d,0x336a8fb8,0x9add676b,0xb332b6db,0xb4511aa4,0x558b88a0,0xdbd5cc55,0x09788752,0xd8cd52bd,0x16b43b9c,0xc2a2696b,0x7f0bc5a0 .long 0xc11f61ef,0x146e12d4,0x3a83e79e,0x9ce10754,0x6cbfca15,0x08ec73d9,0x5b49653f,0x09ff29ad,0xe7da946e,0xe31b72bd,0xee80a4f2,0xebf9eb3b,0x17598ce4,0xd1aabd08,0x53f37e80,0x18b5fef4 .long 0x5958cd79,0xd5d5cdd3,0x1d373114,0x3580a1b5,0xfa935726,0xa36e4c91,0xef20d760,0xa38c534d,0x2ff5845b,0x7088e40a,0xbd78177f,0xe5bb40bd,0x857f9920,0x4f06a7a8,0xe968f05d,0xe3cc3e50 .long 0xe5682d26,0x1d68b7fe,0xaec7f87c,0x5206f76f,0x041951ab,0x41110530,0xd4b5a71a,0x58ec52c1,0x0f75cf9a,0xf3488f99,0xba82d0d5,0xf411951f,0x618895ab,0x27ee75be,0x6d8aab14,0xeae060d4 .long 0x7fb54dc2,0x9ae1df73,0x25963649,0x1f3e391b,0xfe055081,0x242ec32a,0x8491c9bd,0x5bd450ef,0x981eb389,0x367efc67,0x3a0550d5,0xed7e1928,0xab3ce75c,0x362e776b,0x1f24c523,0xe890e308 .long 0xfeccef76,0xb961b682,0x8bba6d92,0x8b8e11f5,0x2b2375c4,0x8f2ccc4c,0xe2f86cfa,0x0d7f7a52,0x9efe5633,0xfd94d30a,0x5451f934,0x2d8d246b,0x244e6a00,0x2234c6e3,0xddec8c50,0xde2b5b0d .long 0xbf776f5b,0x2ce53c5a,0x60357b05,0x6f724071,0x71bf3f7a,0xb2593717,0x440c4a9f,0x87d2501c,0x87b05340,0x440552e1,0x21624c32,0xb7bf7cc8,0x22facddb,0x4155a6ce,0x889837ef,0x5a4228cb .long 0xfd4fd671,0xef87d6d6,0xc2daa10e,0xa233687e,0x03c0eb96,0x75622244,0x8bf19be6,0x7632d184,0x40735ff4,0x05d0f8e9,0xc00931f1,0x3a3e6e13,0xdafe3f18,0x31ccde6a,0xcfe51207,0xf381366a .long 0x60167d92,0x24c222a9,0x7529f18c,0x62f9d6f8,0x0353b114,0x412397c0,0xef808043,0x334d89dc,0x2a4383ce,0xd9ec63ba,0x5cf92ba0,0xcec8e937,0xc8be74c0,0xfb8b4288,0x105d4391,0x67d6912f .long 0x1b913149,0x7b996c46,0x3a4e02da,0x36aae2ef,0x972de594,0xb68aa003,0x4ec6d545,0x284ec70d,0x61391d54,0xf3d2b2d0,0xfe114e92,0x69c5d5d6,0xb4482dff,0xbe0f00b5,0xf5bf33c5,0xe1596fa5 .long 0x96a71cba,0x10595b56,0xfdcadeb7,0x944938b2,0xfccd8471,0xa282da4c,0x0d37bfe1,0x98ec05f3,0x0698304a,0xe171ce1b,0x21bdf79b,0x2d691444,0x1b21dec1,0xd0cd3b74,0x16a15f71,0x712ecd8b .long 0x00fd56e1,0x8d4c00a7,0xf9527c18,0x02ec9692,0x4a3e42e1,0x21c44937,0x1392ae0a,0x9176fbab,0x44b7b618,0x8726f1ba,0xf1de491c,0xb4d7aae9,0x07b582c0,0xf91df7b9,0xef60aa3a,0x7e116c30 .long 0x466265d7,0x99270f81,0x4df7adf0,0xb15b6fe2,0xf9738f7f,0xfe33b2d3,0xd6d70f95,0x48553ab9,0xc21e94db,0x2cc72ac8,0xbdc0bbee,0x795ac38d,0x2e40478f,0x0a1be449,0x052bde55,0x81bd3394 .long 0x56b3c4f2,0x63c8dbe9,0x904177cc,0x017a99cf,0x4d010fc1,0x947bbddb,0xbb2c9b21,0xacf9b00b,0x47173611,0x2970bc8d,0xac7d756f,0x1a4cbe08,0x67d541a2,0x06d9f4aa,0x59c2cf44,0xa3e8b689 .long 0x4d88f1dd,0xaad066da,0x7ad35dea,0xc604f165,0x4478ca67,0x7edc0720,0xba02ce06,0xa10dfae0,0xaf36f4e4,0xeceb1c76,0xaf3f8f48,0x994b2292,0x77c8a68c,0xbf9ed77b,0x51744c9d,0x74f544ea .long 0x8113a757,0x82d05bb9,0x8a9885e4,0x4ef2d2b4,0x1aa7865f,0x1e332be5,0x290d1a52,0x22b76b18,0x44351683,0x308a2310,0xa3f22840,0x9d861896,0x841ed947,0x5959ddcd,0x154b73bf,0x0def0c94 .long 0x4c7c15e0,0xf0105417,0x3a277c32,0x539bfb02,0xf9dccf5f,0xe699268e,0x0247a3bd,0x9f5796a5,0x4f157269,0x8b839de8,0x7a30196b,0xc825c1e5,0xdc8a5a91,0x6ef0aabc,0x498b7fe6,0xf4a8ce6c .long 
0x70cbac78,0x1cce35a7,0xf6b23958,0x83488e9b,0xd76cb011,0x0341a070,0xae1b2658,0xda6c9d06,0xdd648c52,0xb701fb30,0x52fb9fd1,0x994ca02c,0x6f563086,0x06933117,0x17856bab,0x3d2b8100 .long 0x5963a46e,0xe89f48c8,0xa99e61c7,0x658ab875,0x4b8517b4,0x6e296f87,0xfc1bc656,0x36c4fcdc,0xa3906def,0xde5227a1,0x62418945,0x9fe95f57,0xfdd96cde,0x20c91e81,0xda4480de,0x5adbe47e .long 0x396de2b6,0xa009370f,0xf0ecc7bd,0x98583d4b,0xe51d0672,0xf44f6b57,0x556b1984,0x03d6b078,0xb0b64912,0x27dbdd93,0x15687b09,0x9b3a3434,0x51ec20a9,0x0dba6461,0xff28187c,0xec93db7f .long 0x66e48bdd,0x00ff8c24,0x11ccd78e,0x2514f2f9,0xe1250603,0xeba11f4f,0x243fa156,0x8a22cd41,0xb283e4c6,0xa4e58df4,0x8b39783f,0x78c29859,0xa5259809,0x5235aee2,0x0e0227dd,0xc16284b5 .long 0x1338830d,0xa5f57916,0xd2123fca,0x6d4b8a6b,0xf9c546f8,0x236ea68a,0xfa608d36,0xc1d36873,0x8d436d13,0xcd76e495,0x8fb080af,0xd4d9c221,0xe8ad3fb5,0x665c1728,0xb3d572e0,0xcf1ebe4d .long 0x584c5e20,0xa7a8746a,0xb9dc7035,0x267e4ea1,0xb9548c9b,0x593a15cf,0x4bd012f3,0x5e6e2135,0x8c8f936e,0xdf31cc6a,0xb5c241dc,0x8af84d04,0x345efb86,0x63990a6f,0xb9b962cb,0x6fef4e61 .long 0x25722608,0xf6368f09,0x131cf5c6,0x131260db,0xfab4f7ac,0x40eb353b,0x37eee829,0x85c78880,0xc3bdf24e,0x4c1581ff,0xf5c3c5a8,0x5bff75cb,0xa14e6f40,0x35e8c83f,0x0295e0ca,0xb81d1c0f .long 0xf43a730f,0xfcde7cc8,0x33ab590e,0xe89b6f3c,0xad03240b,0xc823f529,0x98bea5db,0x82b79afe,0x962fe5de,0x568f2856,0x60c591f3,0x0c590adb,0x4a28a858,0x1fc74a14,0xb3203f4c,0x3b662498 .long 0x6c39765a,0x91e3cf0d,0xac3cca0b,0xa2db3acd,0xcb953b50,0x288f2f08,0xcf43cf1a,0x2414582c,0x60eee9a8,0x8dec8bbc,0x729aa042,0x54c79f02,0x6532f5d5,0xd81cd5ec,0xcf82e15f,0xa672303a .long 0x719c0563,0x376aafa8,0xbc5fc79f,0xcd8ad2dc,0xcb750cd3,0x303fdb9f,0x4418b08e,0x14ff052f,0x3e2d6520,0xf75084cf,0x144ed509,0x7ebdf0f8,0xd3f25b98,0xf43bf0f2,0xa354d837,0x86ad71cf .long 0x26f43572,0xb827fe92,0x5d824758,0xdfd3ab5b,0x539094c1,0x315dd23a,0x66623d68,0x85c0e37a,0x7be19ae0,0x575c7972,0xdf0d36b5,0x616a3396,0x26b1ff7e,0xa1ebb3c8,0x140ad453,0x635b9485 .long 0xda430c0b,0x92bf3cda,0x3a96dac6,0x4702850e,0x15ac326a,0xc91cf0a5,0xab8c25e4,0x95de4f49,0xe265c17c,0xb01bad09,0x087b3881,0x24e45464,0xe1fac5ca,0xd43e583c,0x6ead97a6,0xe17cb318 .long 0x74dcec46,0x6cc39243,0x54c2b73f,0x33cfc02d,0xf26cd99c,0x82917844,0xd1773f89,0x8819dd95,0x0871f427,0x09572aa6,0xf6f01c34,0x8e0cf365,0xbff1f5af,0x7fa52988,0xe75e8e50,0x4eb357ea .long 0x868af75d,0xd9d0c8c4,0x45c8c7ea,0xd7325cff,0xcc81ecb0,0xab471996,0x611824ed,0xff5d55f3,0x1977a0ee,0xbe314541,0x722038c6,0x5085c4c5,0xf94bb495,0x2d5335bf,0xc8e2a082,0x894ad8a6 .long 0xada35438,0x5c3e2341,0x049b8c4e,0xf4a9fc89,0x9f17cf34,0xbeeb355a,0x6c91fe10,0x3f311e0e,0x92ab9891,0xc2d20038,0x3e8ce9a9,0x257bdcc1,0x88c53bee,0x1b2d9789,0xcdba143a,0x927ce89a .long 0x523db280,0xb0a32cca,0x50d43783,0x5c889f8a,0x4897d16f,0x503e04b3,0x08f5f2e8,0x8cdb6e78,0x179c8e74,0x6ab91cf0,0x48211d60,0xd8874e52,0xea851200,0xf948d4d5,0xe6f9840a,0x4076d41e .long 0x47b517ea,0xc20e263c,0x30685e5e,0x79a448fd,0xf90631a0,0xe55f6f78,0xa79e6346,0x88a790b1,0x80969fe8,0x62160c7d,0x41491bb9,0x54f92fd4,0x5c957526,0xa6645c23,0xbea3ce7b,0xf44cc5ae .long 0x8b1e68b7,0xf7628327,0x303f29d3,0xc731ad7a,0x57d03ecb,0xfe5a9ca9,0x41bc97a7,0x96c0d50c,0x9b4f7f24,0xc4669fe7,0x3d9967ef,0xfdd781d8,0x5d2c208d,0x7892c7c3,0xae545cb3,0x8bf64f7c .long 0x467be912,0xc01f862c,0xc73d30cc,0xf4c85ee9,0x6ab83ec7,0x1fa6f4be,0x4e3e3cf9,0xa07a3c1c,0x0c00beb3,0x87f8ef45,0x000d4c3e,0x30e2c2b3,0xfe08bf5b,0x1aa00b94,0x9224ef52,0x32c133aa .long 
0x32e5685d,0x38df16bb,0x58e6f544,0x68a9e069,0xcdc5ebc6,0x495aaff7,0x378b135f,0xf894a645,0x09e27ecf,0xf316350a,0x58f7179d,0xeced201e,0xe97861ba,0x2eec273c,0xd693be2e,0x47ec2cae .long 0xf68367ce,0xfa4c97c4,0xbe5a5755,0xe4f47d0b,0xb298a979,0x17de815d,0xc177dc7d,0xd7eca659,0x49ded0a3,0x20fdbb71,0xfb34d3c5,0x4cb2aad4,0x60858a33,0x2cf31d28,0xa24aa40f,0x3b6873ef .long 0x2c11bb37,0x540234b2,0xed4c74a3,0x2d0366dd,0xeec5f25d,0xf9a968da,0x67b63142,0x36601068,0x68d7b6d4,0x07cd6d2c,0x0c842942,0xa8f74f09,0x7768b1ee,0xe2751404,0xfe62aee4,0x4b5f7e89 .long 0x89070d26,0xc6a77177,0xdd1c8bc7,0xa1f28e4e,0x469e1f17,0xea5f4f06,0xfbdb78e0,0x78fc242a,0x8b0588f1,0xc9c7c592,0x1535921e,0xb6b7a0fd,0xbde5ae35,0xcc5bdb91,0x12ff1864,0xb42c485e .long 0xdbab98aa,0xa1113e13,0xa17b1024,0xde9d469b,0xc0462d3a,0x23f48b37,0x7c5c078d,0x3752e537,0x15544eb9,0xe3a86add,0x80fba279,0xf013aea7,0xf22001b5,0x8b5bb76c,0xf02891ab,0xe617ba14 .long 0x936219d3,0xd39182a6,0xae51cb19,0x5ce1f194,0xbf07a74c,0xc78f8598,0x22cbf1bc,0x6d7158f2,0xe300ce18,0x3b846b21,0x2d11275d,0x35fba630,0xa0239b9b,0x5fe25c36,0xdf05d940,0xd8beb35d .long 0x1f7e320d,0x4db02bb0,0x6da320ea,0x0641c364,0x821389a3,0x6d95fa5d,0x8fcd8e3d,0x92699748,0xceb6c143,0x316fef17,0xd933762b,0x67fcb841,0x118b17f8,0xbb837e35,0x9fd24821,0x4b92552f .long 0x46aca793,0xae6bc70e,0xe579311b,0x1cf0b0e4,0x5802f716,0x8dc631be,0xbddbee4d,0x099bdc6f,0x0caf8b05,0xcc352bb2,0x72d63df2,0xf74d505a,0x91c4f408,0xb9876d4b,0x9e229b2d,0x1ce18473 .long 0x83abdb4a,0x49507597,0xdee84b18,0x850fbcb6,0x609e67dc,0x6325236e,0x9336c6d8,0x04d831d9,0xfa12d45d,0x8deaae3b,0x4746e246,0xe425f8ce,0x24f5f31e,0x8004c175,0xad62c3b7,0xaca16d8f .long 0x9152f934,0x0dc15a6a,0xed0e12c1,0xf1235e5d,0xda477dac,0xc33c06ec,0xb2ea0006,0x76be8732,0x0c0cd313,0xcf3f7831,0xa614260d,0x3c524553,0xcab22d15,0x31a756f8,0x77827a20,0x03ee10d1 .long 0x1994ef20,0xd1e059b2,0x638ae318,0x2a653b69,0x2f699010,0x70d5eb58,0x09f5f84a,0x279739f7,0x8b799336,0x5da4663c,0x203c37eb,0xfdfdf14d,0xa1dbfb2d,0x32d8a9dc,0x77d48f9b,0xab40cff0 .long 0xd20b42d5,0xc018b383,0x9f78845f,0xf9a810ef,0xbdba9df0,0x40af3753,0x131dfdf9,0xb90bdcfc,0xf01ab782,0x18720591,0x6af12a88,0xc823f211,0x0dc14401,0xa51b80f3,0xfb2dfbe3,0xde248f77 .long 0x0cafe751,0xef5a44e5,0xd4dcd221,0x73997c9c,0xde854024,0x32fd86d1,0xa09b84bb,0xd5b53adc,0xdcedd8d1,0x008d7a11,0x74b32c84,0x406bd1c8,0x05dde8b1,0x5d4472ff,0xfce2b32f,0x2e25f2cd .long 0x29dfc254,0xbec0dd5e,0x2b98b267,0x4455fcf6,0xc72df2ad,0x0b4d43a5,0x48a75397,0xea70e6be,0x5820f3bf,0x2aad6169,0x9e37f68f,0xf410d2dd,0x7be5ac83,0x70fb7dba,0x36ec3eec,0x636bb645 .long 0x9754e21c,0x27104ea3,0x8d63c373,0xbc87a3e6,0x4109db9a,0x483351d7,0x60134da7,0x0fa724e3,0xb0720b16,0x9ff44c29,0x06aceead,0x2dd0cf13,0xe26929a6,0x5942758c,0xb766a92b,0x96c5db92 .long 0x5f18395e,0xcec7d4c0,0x1f80d032,0xd3f22744,0xcb86075b,0x7a68b37a,0xafef92db,0x074764dd,0x7bc7f389,0xded1e950,0xb9756460,0xc580c850,0x7da48157,0xaeeec2a4,0x82c587b3,0x3f0b4e7f .long 0xa9f19c53,0x231c6de8,0x6974e34e,0x5717bd73,0xf1508fa9,0xd9e1d216,0xdadaa124,0x9f112361,0x823b7348,0x80145e31,0xac634069,0x4dd8f0d5,0x2297c258,0xe3d82fc7,0x9cee7431,0x276fcfee .long 0x2bc0aea9,0x8eb61b5e,0xde329431,0x4f668fd5,0x38e4b87e,0x03a32ab1,0x73d0ef0b,0xe1374517,0x853ac983,0x1a46f7e6,0x68e78a57,0xc3bdf42e,0x2ea96dd1,0xacf20785,0xf1638460,0xa10649b9 .long 0x879fbbed,0xf2369f0b,0xda9d1869,0x0ff0ae86,0x56766f45,0x5251d759,0x2be8d0fc,0x4984d8c0,0xd21008f0,0x7ecc95a6,0x3a1a1c49,0x29bd54a0,0xd26c50f3,0xab9828c5,0x51d0d251,0x32c0087c .long 
0x0c1cdb26,0x9bac3ce6,0x557ca205,0xcd94d947,0x9db1fdcd,0x1b1bd598,0xa3d8b149,0x0eda0108,0x56152fcc,0x95066610,0xe7192b33,0xc2f037e6,0xc92e05a4,0xdeffb41a,0xc2f6c62e,0x1105f6c2 .long 0x8733913c,0x68e73500,0x3f3adc40,0xcce86163,0x38a278e9,0xf407a942,0x2ab21292,0xd13c1b9d,0x1c74cf5c,0x93ed7ec7,0xf1a4c1b4,0x8887dc48,0x4b3a11f1,0x3830ff30,0x58937cb6,0x358c5a3c .long 0x89022829,0x027dc404,0x3b798f79,0x40e93977,0x38be6ead,0x90ad3337,0xf34c0a5d,0x9c23f6bc,0xfbffd8bb,0xd1711a35,0x1949d3dd,0x60fcfb49,0x7825d93a,0x09c8ef4b,0xa0a8c968,0x24233cff .long 0xe6d982af,0x67ade46c,0xe7544d7c,0xebb6bf3e,0x3d8bd087,0xd6b9ba76,0x4dc61280,0x46fe382d,0xb5bdbd75,0xbd39a7e8,0xb8f228fe,0xab381331,0xce1c4300,0x0709a77c,0xf337ceac,0x6a247e56 .long 0x636288be,0x8f34f21b,0xc8a7c305,0x9dfdca74,0xea919e04,0x6decfd1b,0x8e1991f8,0xcdf2688d,0xd0f8a67e,0xe607df44,0x0b58d010,0xd985df4b,0x0c24f8f4,0x57f834c5,0xa0bf01ae,0xe976ef56 .long 0xa1c32373,0x536395ac,0x734c0a13,0x351027aa,0x5e6bd5bc,0xd2f1b5d6,0x223debed,0x2b539e24,0x0eaa1d71,0xd4994cec,0x661dcf65,0x2a83381d,0x7b54c740,0x5f1aed2f,0xd6dda5ee,0x0bea3fa5 .long 0x36cc6134,0x9d4fb684,0xc0a443dd,0x8eb9bbf3,0x383b7d2a,0xfc500e2e,0x5b775257,0x7aad621c,0x0a8f7cc0,0x69284d74,0x07562d65,0xe820c2ce,0x499758ee,0xbf9531b9,0x6ee0cc2d,0x73e95ca5 .long 0xfbaf50a5,0xf61790ab,0x684e0750,0xdf55e76b,0xf176b005,0xec516da7,0x7a2dddc7,0x575553bb,0x553afa73,0x37c87ca3,0x4d55c251,0x315f3ffc,0xaf3e5d35,0xe846442a,0x6495ff28,0x61b91149 .long 0xfa326dc3,0x23cc95d3,0x18fc2cea,0x1df4da1f,0xd0a37d59,0x24bf9adc,0x320d6e1e,0xb6710053,0x618344d1,0x96f9667e,0xa06445af,0xcc7ce042,0xd68dbc3a,0xa02d8514,0x280b5a5b,0x4ea109e4 .long 0xb40961bf,0x5741a7ac,0x6aa56bfa,0x4ada5937,0x02b765d1,0x7feb9145,0xe6ad1582,0x561e97be,0xda3982f5,0xbbc4a5b6,0xb546f468,0x0c2659ed,0x59612d20,0xb8e7e6aa,0xac19e8e0,0xd83dfe20 .long 0xb835398c,0x8530c45f,0xb38a41c2,0x6106a8bf,0x35f5dcdb,0x21e8f9a6,0xcae498ed,0x39707137,0xd8249f00,0x70c23834,0xab2537a0,0x9f14b58f,0x5f61c0c2,0xd043c365,0x09a194a7,0xdc5926d6 .long 0x8e77738a,0xddec0339,0xfba46426,0xd07a63ef,0xee7f6e86,0x2e58e79c,0xff32d241,0xe59b0459,0x20fa0338,0xc5ec84e5,0xeaff5ace,0x97939ac8,0xb4a38313,0x0310a4e3,0x8f9d9885,0x9115fba2 .long 0x5fadf8c3,0x8dd710c2,0xce19c0e2,0x66be38a2,0x4cfe5022,0xd42a279c,0x0e24e1b8,0x597bb530,0xc153ca7f,0x3cde86b7,0x707d63bd,0xa8d30fb3,0xbd60d21e,0xac905f92,0x7b9a54ab,0x98e7ffb6 .long 0xe9726a30,0xd7147df8,0xafce3533,0xb5e216ff,0x2ff1ec40,0xb550b799,0xa1e953fd,0x6b613b87,0x792d5610,0x87b88dba,0xa190fbe1,0x2ee1270a,0x2ef581da,0x02f4e2dc,0xeff82a95,0x016530e4 .long 0x8fd6ee89,0xcbb93dfd,0x46848fff,0x16d3d986,0x1da47adf,0x600eff24,0x0ad47a71,0x1b9754a0,0x70c33b98,0x8f9266df,0xdf34186e,0xaadc87ae,0x4ad24132,0x0d2ce8e1,0x19946eba,0x8a47cbfc .long 0x62b5f3af,0x47feeb66,0x0abb3734,0xcefab561,0x19f35cb1,0x449de60e,0x157f0eb9,0x39f8db14,0x3c61bfd6,0xffaecc5b,0x41216703,0xa5a4d41d,0x224e1cc2,0x7f8fabed,0x871ad953,0x0d5a8186 .long 0xd22da9a9,0xf10774f7,0xcc8a9b0d,0x45b8a678,0xbdc32cff,0xd9c2e722,0x337202a5,0xbf71b5f5,0x69fc4db9,0x95c57f2f,0x765d01e1,0xb6dad34c,0xcb904635,0x7e0bd13f,0x763a588c,0x61751253 .long 0x81af2c2d,0xd85c2997,0x81b9d7da,0xc0f7d9c4,0x08533e8d,0x838a34ae,0x311d8311,0x15c4cb08,0x8e121e14,0x97f83285,0x85000a5f,0xeea7dc1e,0x5d256274,0x0c6059b6,0xb95075c0,0xec9beace .long 0x1df97828,0x173daad7,0xa8937877,0xbf851cb5,0x01646f3c,0xb083c594,0x50c6d352,0x3bad30cf,0x496bbcea,0xfeb2b202,0x18a1e8ba,0x3cf9fd4f,0x1c066029,0xd26de7ff,0x4e9ed4f8,0x39c81e9e .long 
0x7b390d35,0xd8be0cb9,0x964aab27,0x01df2bbd,0xc3ef64f8,0x3e8c1a65,0x716ed1dd,0x567291d1,0x5f5406d3,0x95499c6c,0x5ba8e23f,0x71fdda39,0xd5096ece,0xcfeb320e,0xca66dd16,0xbe7ba92b .long 0xc6fb5a7d,0x4608d36b,0x6d2dd0e0,0xe3eea15a,0x8f97a36a,0x75b0a3eb,0x1c83de1e,0xf59814cc,0x1c33c23f,0x56c9c5b0,0x6faa4136,0xa96c1da4,0xde316551,0x46bf2074,0x1f756c8f,0x3b866e7b .long 0x1495ed6b,0x727727d8,0xb682dce7,0xb2394243,0x758610f3,0x8ab8454e,0x857d72a4,0xc243ce84,0xdbbf370f,0x7b320d71,0x78e0f7ca,0xff9afa37,0xea7b523f,0x0119d1e0,0x058c7d42,0xb997f8cb .long 0x37bbb184,0x285bcd2a,0xa45d1fa6,0x51dcec49,0xe29634cb,0x6ade3b64,0x26b86ef1,0x080c94a7,0x2283fbe3,0xba583db1,0x5a9315ed,0x902bddc8,0x86964bec,0x07c1ccb3,0xb6258301,0x78f4eacf .long 0x56f90823,0x4bdf3a49,0x741d777b,0xba0f5080,0xf38bf760,0x091d71c3,0x9b625b02,0x9633d50f,0xb8c9de61,0x03ecb743,0x5de74720,0xb4751254,0x74ce1cb2,0x9f9defc9,0x00bd32ef,0x774a4f6a .long 0x73848f22,0xaca385f7,0xf3f8558e,0x53dad716,0x93c471f9,0xab7b34b0,0x19644bc7,0xf530e069,0xdd59d31a,0x3d9fb1ff,0x08daa795,0x4382e0df,0xd5cc88d7,0x165c6f4b,0x4a18c900,0xeaa392d5 .long 0x648024ee,0x94203c67,0x8c2fabcd,0x188763f2,0xbbaec835,0xa80f87ac,0xf29d8d54,0x632c96e0,0x4c00a95e,0x29b0a60e,0xe011e9fa,0x2ef17f40,0x15b77223,0xf6c0e1d1,0x14b04e32,0xaaec2c62 .long 0x3d84e58c,0xd35688d8,0x958571db,0x2af5094c,0x760682a6,0x4fff7e19,0xe39a407c,0x4cb27077,0x4ff0e321,0x0f59c547,0x1b34c8ff,0x169f34a6,0x52bc1ba7,0x2bff1096,0x83583544,0xa25423b7 .long 0x0ac8b782,0x5d55d5d5,0x2db3c892,0xff6622ec,0x6b8bb642,0x48fce741,0x69d7e3dc,0x31d6998c,0xcadcaed0,0xdbaf8004,0xd81d053c,0x801b0142,0x59630ec6,0x94b189fc,0xaf762c8e,0x120e9934 .long 0xfdc6a404,0x53a29aa4,0xa1909948,0x19d8e01e,0xd7e89681,0x3cfcabf1,0x4e132d37,0x3321a50d,0xe9a86111,0xd0496863,0x06a3bc65,0x8c0cde61,0xfc9f8eef,0xaf866c49,0xff7f5141,0x2066350e .long 0xe56ddfbd,0x4f8a4689,0xfe32983a,0xea1b0c07,0x873cb8cb,0x2b317462,0x2d93229f,0x658deddc,0x0f64ef58,0x65efaf4d,0x730cc7a8,0xfe43287d,0x3d047d70,0xaebc0c72,0xd92d26c9,0x92efa539 .long 0x94b56526,0x06e78457,0x0961002d,0x415cb80f,0x76dcb10f,0x89e5c565,0xff9259fe,0x8bbb6982,0x9abc2668,0x4fe8795b,0x1e678fb1,0xb5d4f534,0x7b7da2b9,0x6601f3be,0xa13d6805,0x98da59e2 .long 0x01799a52,0x190d8ea6,0xb86d2952,0xa20cec41,0x7fff2a7c,0x3062ffb2,0x79f19d37,0x741b32e5,0x4eb57d47,0xf80d8181,0x16aef06b,0x7a2d0ed4,0x1cecb588,0x09735fb0,0xc6061f5b,0x1641caaa .long 0x20151427,0x7f99824f,0x92430206,0x206828b6,0xe1112357,0xaa9097d7,0x09e414ec,0xacf9a2f2,0x27915356,0xdbdac9da,0x001efee3,0x7e0734b7,0xd2b288e2,0x54fab5bb,0xf62dd09c,0x4c630fc4 .long 0x1ac2703b,0x8537107a,0x6bc857b5,0xb49258d8,0xbcdaccd1,0x57df14de,0xc4ae8529,0x24ab68d7,0x734e59d0,0x7ed8b5d4,0xc495cc80,0x5f8740c8,0x291db9b3,0x84aedd5a,0x4fb995be,0x80b360f8 .long 0x5fa067d1,0xae915f5d,0x9668960c,0x4134b57f,0xa48edaac,0xbd3656d6,0xfc1d7436,0xdac1e3e4,0xd81fbb26,0x674ff869,0xb26c33d4,0x449ed3ec,0xd94203e8,0x85138705,0xbeeb6f4a,0xccde538b .long 0xa61a76fa,0x55d5c68d,0xca1554dc,0x598b441d,0x773b279c,0xd39923b9,0x36bf9efc,0x33331d3c,0x298de399,0x2d4c848e,0xa1a27f56,0xcfdb8e77,0x57b8ab70,0x94c855ea,0x6f7879ba,0xdcdb9dae .long 0x019f2a59,0x7bdff8c2,0xcb4fbc74,0xb3ce5bb3,0x8a9173dd,0xea907f68,0x95a75439,0x6cd3d0d3,0xefed021c,0x92ecc4d6,0x6a77339a,0x09a9f9b0,0x7188c64a,0x87ca6b15,0x44899158,0x10c29968 .long 0xed6e82ef,0x5859a229,0x65ebaf4e,0x16f338e3,0x5ead67ae,0x0cd31387,0x54ef0bb4,0x1c73d228,0x74a5c8c7,0x4cb55131,0x7f69ad6a,0x01cd2970,0xe966f87e,0xa04d00dd,0x0b7b0321,0xd96fe447 .long 
0x88fbd381,0x342ac06e,0x5c35a493,0x02cd4a84,0x54f1bbcd,0xe8fa89de,0x2575ed4c,0x341d6367,0xd238202b,0xebe357fb,0xa984ead9,0x600b4d1a,0x52436ea0,0xc35c9f44,0xa370751b,0x96fe0a39 .long 0x7f636a38,0x4c4f0736,0x0e76d5cb,0x9f943fb7,0xa8b68b8b,0xb03510ba,0x9ed07a1f,0xc246780a,0x6d549fc2,0x3c051415,0x607781ca,0xc2953f31,0xd8d95413,0x955e2c69,0x7bd282e3,0xb300fadc .long 0x87e9189f,0x81fe7b50,0xf42dda27,0xdb17375c,0xcf0a5904,0x22f7d896,0xebe348e6,0xa0e57c5a,0xf40e3c80,0xa61011d3,0x8db705c5,0xb1189321,0x50fedec3,0x4ed9309e,0x4d6d5c1d,0xdcf14a10 .long 0x55691342,0x056c265b,0x91049dc7,0xe8e08504,0xc9bae20a,0x131329f5,0xd9dccdb4,0x96c8b3e8,0xfb4ee6b4,0x8c5ff838,0x41e8ccf0,0xfc5a9aeb,0xfae050c6,0x7417b764,0x00452080,0x0953c3d7 .long 0x38dfe7e8,0x21372682,0x2bb79d4b,0xea417e15,0x76e7cf2d,0x59641f1c,0xea0bcfcc,0x271e3059,0x7253ecbd,0x624c7dfd,0x4fca6186,0x2f552e25,0x4d866e9c,0xcbf84ecd,0xf68d4610,0x73967709 .long 0xc27901b4,0xa14b1163,0x899b8bf3,0xfd9236e0,0xcbc6da0a,0x42b091ec,0x5ad1d297,0xbb1dac6f,0xa91cf76e,0x80e61d53,0xd31f1ee7,0x4110a412,0x13efcf77,0x2d87c3ba,0xdf450d76,0x1f374bb4 .long 0x0d188dab,0x5e78e2f2,0xf4b885ef,0xe3968ed0,0x7314570f,0x46c0568e,0x01170521,0x31616338,0x4f0c8afe,0x18e1e7e2,0xdeea78da,0x4caa75ff,0x7c5d8a51,0x82db67f2,0x6f505370,0x36a44d86 .long 0x0333974f,0xd72c5bda,0x27a70146,0x5db516ae,0x210ef921,0x34705281,0x0c9c38e5,0xbff17a8f,0x12476da1,0x78f4814e,0x33c16980,0xc1e16613,0x424d4bca,0x9e5b386f,0xc85740de,0x4c274e87 .long 0x6c2f5226,0xb6a9b88d,0x550d7ca8,0x14d1b944,0x1fc41709,0x580c85fc,0x54c6d519,0xc1da368b,0xd5113cf7,0x2b0785ce,0x5a34708f,0x0670f633,0x15cc3f88,0x46e23767,0x50c72c8f,0x1b480cfa .long 0x4147519a,0x20288602,0x26b372f0,0xd0981eac,0xa785ebc8,0xa9d4a7ca,0xdbdf58e9,0xd953c50d,0xfd590f8f,0x9d6361cc,0x44e6c917,0x72e9626b,0x22eb64cf,0x7fd96110,0x9eb288f3,0x863ebb7e .long 0x6aca8ee7,0x6e6ab761,0xd7b40358,0x97d10b39,0x1e5feb0d,0x1687d377,0x8265a27a,0xc83e50e4,0xc954b313,0x8f75a9fe,0x310d1f61,0xcc2e8f47,0x6557d0e0,0xf5ba81c5,0x3eaf6207,0x25f9680c .long 0x4354080b,0xf95c6609,0x7bf2fe1c,0x5225bfa5,0x5c7d98fa,0xc5c004e2,0x019aaf60,0x3561bf1c,0xba151474,0x5e6f9f17,0xb04f6eca,0xdec2f934,0x269acb1e,0x64e368a1,0x0cdda493,0x1332d9e4 .long 0xdf23de05,0x60d6cf69,0x009339a0,0x66d17da2,0x0a693923,0x9fcac985,0xed7c6a6d,0xbcf057fc,0xf0b5662c,0xc3c5c8c5,0xdcba4f24,0x25318dd8,0x082b69ff,0x60e8cb75,0x1e728c01,0x7c23b3ee .long 0x097e4403,0x15e10a0a,0x19854665,0xcb3d0a86,0xd67d4826,0x88d8e211,0x0b9d2839,0xb39af66e,0xbd475ca8,0xa5f94588,0xc077b80b,0xe06b7966,0xda27c26c,0xfedb1485,0xfe0fd5e0,0xd290d33a .long 0xf34fb0fa,0xa40bcc47,0x1fb1ab09,0xb4760cc8,0xa273bfe3,0x8fca0993,0xf70b213c,0x13e4fe07,0xfdb05163,0x3bcdb992,0x0c2b19b6,0x8c484b11,0xaaf2e3e2,0x1acb815f,0xb89ff1b4,0xc6905935 .long 0x586e74e1,0xb2ad6f9d,0x67b80484,0x488883ad,0x369c3ddb,0x758aa2c7,0x9f9afd31,0x8ab74e69,0x5e21beb1,0x10fc2d28,0x318c42f9,0x3484518a,0x53cf40c3,0x377427dc,0x391bc1d9,0x9de0781a .long 0x693807e1,0x8faee858,0x4e81ccc7,0xa3865327,0x6f835b84,0x02c30ff2,0x0d3d38d4,0xb604437b,0x5ca1823d,0xb3fc8a98,0x03be0324,0xb82f7ec9,0xcf684a33,0xee36d761,0x9f29bf7d,0x5a01df0e .long 0x1306583d,0x686202f3,0x437c622e,0x05b10da0,0x076a7bc8,0xbf9aaa0f,0x8f8f4e43,0x25e94efb,0xfa3dc26d,0x8a35c9b7,0x96ff03c5,0xe0e5fb93,0xebc394ce,0xa77e3843,0x8361de60,0xcede6595 .long 0xa1993545,0xd27c22f6,0x24d671ba,0xab01cc36,0xa169c28e,0x63fa2877,0x2eb08376,0x925ef904,0x53aa0b32,0x3b2fa3cf,0x71c49d7a,0xb27beb5b,0xd105e27f,0xb60e1834,0x4f68570d,0xd6089788 .long 
0xd6fbc2ac,0x23094ce0,0x815ff551,0x738037a1,0x6bef119c,0xda73b1bb,0xeef506ba,0xdcf6c430,0xe3ef104a,0x00e4fe7b,0x0a065628,0xebdd9a2c,0x8792043e,0x853a81c3,0xb3b59108,0x22ad6ece .long 0x39cd297d,0x9fb813c0,0x05bda5d9,0x8ec7e16e,0x0d104b96,0x2834797c,0x7c511510,0xcc11a2e7,0x96ee6380,0x96ca5a53,0xcea38742,0x054c8655,0xd54dfa7d,0xb5946852,0x1f4ab207,0x97c422e7 .long 0x0c22b540,0xbf907509,0xb7c267d4,0x2cde42aa,0x5ab0d693,0xba18f9ed,0x6e4660d9,0x3ba62aa6,0xab9ea96a,0xb24bf97b,0xe3b60e32,0x5d039642,0x7c4d9bd5,0x4e6a4506,0x7ed4a6a4,0x666c5b9e .long 0x8edbd7cc,0xfa3fdcd9,0xc6ccd753,0x4660bb87,0x21e6b64f,0x9ae90820,0xb36bfb3f,0x8a56a713,0x5726d47f,0xabfce096,0x0b1a9a7f,0x9eed01b2,0x4eb74a37,0x30e9cad4,0x53e9666d,0x7b2524cc .long 0x8f4b002f,0x6a29683b,0x41f4fc20,0xc2200d7a,0x3a338acc,0xcf3af47a,0xe7128975,0x6539a4fb,0xc33c7fcf,0xcec31c14,0xc7be322b,0x7eb6799b,0x6646f623,0x119ef4e9,0x54d7299b,0x7b7a26a5 .long 0x403f46f2,0xcb37f08d,0x1a0ec0c7,0x94b8fc43,0xc332142f,0xbb8514e3,0xe80d2a7a,0xf3ed2c33,0xb639126c,0x8d2080af,0xe3553ade,0xf7b6be60,0x1c7e2b09,0x3950aa9f,0x6410f02b,0x847ff958 .long 0x678a31b0,0x877b7cf5,0x3998b620,0xd50301ae,0xc00fb396,0x734257c5,0x04e672a6,0xf9fb18a0,0xe8758851,0xff8bd8eb,0x5d99ba44,0x1e64e4c6,0x7dfd93b7,0x4b8eaedf,0x04e76b8c,0xba2f2a98 .long 0xe8053433,0x7d790cba,0x3d2c9585,0xc8e725a0,0xcdd8f5ed,0x58c5c476,0xefa9fe1d,0xd106b952,0x0eff13a9,0x3c5c775b,0xe057b930,0x242442ba,0xc9b70cbd,0xe9f458d4,0xa3cdb89a,0x69b71448 .long 0x0e2ed742,0x41ee46f6,0x40067493,0x573f1045,0x9d54c304,0xb1e154ff,0x8d3a7502,0x2ad0436a,0x431a8121,0xee4aaa2d,0x886f11ed,0xcd38b3ab,0x034a0eb7,0x57d49ea6,0xf7e85e58,0xd2b773bd .long 0x9b5c1f14,0x4a559ac4,0x3e54df2b,0xc444be1a,0xeda41891,0x13aad704,0x5eb5c788,0xcd927bec,0xe48c8a34,0xeb3c8516,0x4b546669,0x1b7ac812,0x594df8ec,0x1815f896,0x79227865,0x87c6a79c .long 0x9b56ddbd,0xae02a2f0,0x8a2f1cf3,0x1339b5ac,0x839dff0d,0xf2b569c7,0xfee9a43d,0xb0b9e864,0x77bb064e,0x4ff8ca41,0xfd249f63,0x145a2812,0xf86f689a,0x3ab7beac,0x01d35f5e,0x9bafec27 .long 0x4265aa91,0x28054c65,0x035efe42,0xa4b18304,0x9639dec7,0x6887b0e6,0x3d52aea5,0xf4b8f6ad,0x971a8a13,0xfb9293cc,0x4c934d07,0x3f159e5d,0x09acbc29,0x2c50e9b1,0x7154d129,0x08eb65e6 .long 0x30b75c3e,0x4feff589,0x94491c93,0x0bb82fe2,0x89af62bb,0xd8ac377a,0x9685e49f,0xd7b51490,0x04497f19,0xabca9a7b,0x1a7ad13f,0x1b35ed0a,0x3ec86ed6,0x6b601e21,0xce0c76f1,0xda91fcb9 .long 0xd7ab27e1,0x9e28507b,0x63945b7b,0x7c19a555,0xaafc9827,0x6b43f0a1,0x3aa55b91,0x443b4fbd,0x6962c88f,0x962b2e65,0xce0db0ca,0x139da8d4,0x1b8d6c4f,0xb93f05dd,0x180b9824,0x779cdff7 .long 0xae57c7b7,0xbba23fdd,0x1b932522,0x345342f2,0x556d4aa3,0xfd9c80fe,0x6525bb61,0xa03907ba,0xff218933,0x38b010e1,0xaa52117b,0xc066b654,0x94f2e6ea,0x8e141920,0x0d32f2b2,0x66a27dca .long 0x048b3717,0x69c7f993,0xb178ae1c,0xbf5a989a,0x564f1d6b,0x49fa9058,0xd31fde4e,0x27ec6e15,0x7276e7fc,0x4cce0373,0x89d6bf02,0x64086d79,0x4ccdd979,0x5a72f046,0x47775631,0x909c3566 .long 0x75dd7125,0x1c07bc6b,0x87a0428d,0xb4c6bc97,0xfdeb6b9d,0x507ece52,0xb2c95432,0xfca56512,0xd0e8bd06,0x15d97181,0xc6bb46ea,0x384dd317,0x3952b624,0x5441ea20,0x4e7dc2fb,0xbcf70dee .long 0x6628e8c3,0x372b016e,0xb60a7522,0x07a0d667,0x0a344ee2,0xcf05751b,0x118bdeec,0x0ec09a48,0xd83dce46,0x6e4b3d4e,0x99d2fc6e,0x43a6316d,0x56cf044c,0xa99d8989,0xae3e5fb7,0x7c7f4454 .long 0xfbabbe92,0xb2e6b121,0xe1330076,0x281850fb,0x97890015,0x093581ec,0x75ff77f5,0x69b1dded,0xab105105,0x7cf0b18f,0xa89ccfef,0x953ced31,0xeb914009,0x3151f85f,0x88ed48ad,0x3c9f1b87 .long 
0x4a7eadcb,0xc9aba1a1,0x522e71cf,0x928e7501,0x3a2e4f83,0xeaede727,0x1ce3bbd3,0x467e10d1,0xb955dcf0,0xf3442ac3,0xd3d5e527,0xba96307d,0xfd77f474,0xf763a10e,0x6a6e1ff0,0x5d744bd0 .long 0xa777899e,0xd287282a,0xd03f3cde,0xe20eda8f,0x50b07d31,0x6a7e75bb,0x6f379de4,0x0b7e2a94,0x19f593cf,0x31cb64ad,0x1e76ef1d,0x7b1a9e4f,0xb62d609c,0xe18c9c9d,0xe779a650,0x439bad6d .long 0xe032f144,0x219d9066,0xe8b2ec6a,0x1db632b8,0xfda12f78,0xff0d0fd4,0x2a25d265,0x56fb4c2d,0x255a03f1,0x5f4e2ee1,0xe96af176,0x61cd6af2,0xd068bc97,0xe0317ba8,0x264b988e,0x927d6bab .long 0xe90fb21e,0xa18f07e0,0xbba7fca1,0x00fd2b80,0x95cd67b5,0x20387f27,0xd39707f7,0x5b89a4e7,0x894407ce,0x8f83ad3f,0x6c226132,0xa0025b94,0xf906c13b,0xc79563c7,0x4e7bb025,0x5f548f31 .long 0xeac6d113,0x2b4c6b8f,0x0e813c76,0xa67e3f9c,0x3fe1f4b9,0x3982717c,0x26d8050e,0x58865819,0xf7f06f20,0x99f3640c,0x2a66ebc2,0xdc610216,0x767a1e08,0x52f2c175,0x5999871b,0x05660e1a .long 0x6d3c4693,0x6b0f1762,0x37ed7bea,0xf0e7d627,0xb75b226d,0xc51758c7,0x1f91613b,0x40a88628,0xbbb38ce0,0x889dbaa7,0xbddcad81,0xe0404b65,0x8bc9671f,0xfebccd3a,0xee1f5375,0xfbf9a357 .long 0x28f33398,0x5dc169b0,0x72e90f65,0xb07ec11d,0xfaab1eb1,0xae7f3b4a,0x5f17538a,0xd970195e,0x0181e640,0x52b05cbe,0x2643313d,0xf5debd62,0x5df31f82,0x76148154,0x3a9e13c5,0x23e03b33 .long 0x4fde0c1f,0xff758949,0xe5b6ec20,0xbf8a1abe,0x87e1db6c,0x702278fb,0x35ed658f,0xc447ad7a,0x03d0ccf2,0x48d4aa38,0x819a7c03,0x80acb338,0x6e17cecc,0x9bc7c89e,0x03be1d82,0x46736b8b .long 0xc0432f96,0xd65d7b60,0xdeb5442f,0xddebe7a3,0x7dff69a2,0x79a25307,0x02cf3122,0x37a56d94,0xf2350d0a,0x8bab8aed,0x037b0d9a,0x13c3f276,0x44c65cae,0xc664957c,0xc2e71a88,0x88b44089 .long 0x5cb02664,0xdb88e5a3,0x8686c72e,0x5d4c0bf1,0xa682d53e,0xea3d9b62,0x0b2ad431,0x9b605ef4,0xc69645d0,0x71bac202,0x6a1b66e7,0xa115f03a,0x158f4dc4,0xfe2c563a,0x4d12a78c,0xf715b3a0 .long 0xd413213a,0x8f7f0a48,0xc04becdb,0x2035806d,0x5d8587f5,0xecd34a99,0x9f6d3a71,0x4d8c3079,0x8d95a8f6,0x1b2a2a67,0xf2110d0d,0xc58c9d7d,0xcf8fba3f,0xdeee81d5,0x0c7cdf68,0xa42be3c0 .long 0xd43b5eaa,0x2126f742,0xdfa59b85,0x054a0766,0x126bfd45,0x9d0d5e36,0x384f8a8f,0xa1f8fbd7,0xd563fccc,0x317680f5,0xf280a928,0x48ca5055,0x27b578cf,0xe00b81b2,0x2994a514,0x10aad918 .long 0xb7bdc953,0xd9e07b62,0x5bc086dd,0x9f0f6ff2,0x655eee77,0x09d1ccff,0x5bef7df1,0x45475f79,0x86f702cc,0x3faa28fa,0x0f021f07,0x92e60905,0x7f8fa8c6,0xe9e62968,0xf036ea2c,0xbd71419a .long 0x6028da9a,0x171ee1cc,0xc251f573,0x5352fe1a,0x3fa997f4,0xf8ff236e,0xa5749d5f,0xd831b6c9,0xe350e2c2,0x7c872e1d,0x1e0ce403,0xc56240d9,0x6974f5cb,0xf9deb077,0x961c3728,0x7d50ba87 .long 0x5a3a2518,0xd6f89426,0xc6303d43,0xcf817799,0x619e5696,0x510a0471,0x3a5e307b,0xab049ff6,0xfeb13ec7,0xe4cdf9b0,0x9d8ff90c,0xd5e97117,0x9afa96af,0xf6f64d06,0x9d2012a2,0x00d0bf5e .long 0x358bcdc0,0xe63f301f,0x0a9d47f8,0x07689e99,0x4f43d43a,0x1f689e2f,0x90920904,0x4d542a16,0x9ca0a707,0xaea293d5,0x8ac68065,0xd061fe45,0x0090008c,0x1033bf1b,0xc08a6db6,0x29749558 .long 0xc1d5d034,0x74b5fc59,0x67e215e0,0xf712e9f6,0x860200e6,0xfd520cbd,0x3ea22588,0x0229acb4,0xfff0c82e,0x9cd1e14c,0x59c69e73,0x87684b62,0x96ccb989,0xda85e61c,0xa3d06493,0x2d5dbb02 .long 0xe86b173c,0xf22ad33a,0xa79ff0e3,0xe8e41ea5,0xdd0d0c10,0x01d2d725,0x032d28f9,0x31f39088,0x7829839e,0x7b3f71e1,0x4502ae58,0x0cf691b4,0xbefc6115,0xef658dbd,0xb3ab5314,0xa5cd6ee5 .long 0x5f1d2347,0x206c8d7b,0x4cc2253a,0x794645ba,0x58389e08,0xd517d8ff,0x9f847288,0x4fa20dee,0xd797770a,0xeba072d8,0xbf429e26,0x7360c91d,0x80af8279,0x7200a3b3,0x82dadce3,0x6a1c9150 .long 
0xc35d8794,0x0ee6d3a7,0x0356bae5,0x042e6558,0x643322fd,0x9f59698d,0x50a61967,0x9379ae15,0xfcc9981e,0x64b9ae62,0x6d2934c6,0xaed3d631,0x5e4e65eb,0x2454b302,0xf9950428,0xab09f647 .long 0x22248acc,0xb2083a12,0x3264e366,0x1f6ec0ef,0x5afdee28,0x5659b704,0xe6430bb5,0x7a823a40,0xe1900a79,0x24592a04,0xc9ee6576,0xcde09d4a,0x4b5ea54a,0x52b6463f,0xd3ca65a7,0x1efe9ed3 .long 0x305406dd,0xe27a6dbe,0xdd5d1957,0x8eb7dc7f,0x387d4d8f,0xf54a6876,0xc7762de4,0x9c479409,0x99b30778,0xbe4d5b5d,0x6e793682,0x25380c56,0xdac740e3,0x602d37f3,0x1566e4ae,0x140deabe .long 0xafd32acf,0x4481d067,0xe1f71ccf,0xd8f0fcca,0xb596f2da,0xd208dd0c,0x9aad93f9,0xd049d730,0x42ab580e,0xc79f263d,0x23f707b4,0x09411bb1,0x835e0eda,0x8cfde1ff,0x90f03402,0x72707490 .long 0xc49a861e,0xeaee6126,0xe14f0d06,0x024f3b65,0xc69bfc17,0x51a3f1e8,0xa7686381,0xc3c3a8e9,0xb103d4c8,0x3400752c,0x9218b36b,0x02bc4613,0x7651504a,0xc67f75eb,0xd02aebfa,0xd6848b56 .long 0xc30fa92b,0xbd9802e6,0x9a552784,0x5a70d96d,0x3f83169b,0x9085c4ea,0x06908228,0xfa9423bb,0xfe97a5b9,0x2ffebe12,0x71b99118,0x85da6049,0x63178846,0x9cbc2f7f,0x9153218e,0xfd96bc70 .long 0x1782269b,0x958381db,0x2597e550,0xae34bf79,0x5f385153,0xbb5c6064,0xe3088048,0x6f0e96af,0x77884456,0xbf6a0215,0x69310ea7,0xb3b5688c,0x04fad2de,0x17c94295,0x17896d4d,0xe020f0e5 .long 0x0976505f,0x730ba0ab,0x095e2ec5,0x567f6813,0x6331ab71,0x47062010,0x41d22b9f,0x72cfa977,0x8a2373da,0x33e55ead,0x7ba45a68,0xa8d0d5f4,0x03029d15,0xba1d8f9c,0xfc55b9f3,0x8f34f1cc .long 0xbbe5a1a9,0xcca4428d,0x3126bd67,0x8187fd5f,0x48105826,0x0036973a,0xb8bd61a0,0xa39b6663,0x2d65a808,0x6d42deef,0x94636b19,0x4969044f,0xdd5d564c,0xf611ee47,0xd2873077,0x7b2f3a49 .long 0x300eb294,0x94157d45,0x169c1494,0x2b2a656e,0xd3a47aa9,0xc000dd76,0xa6243ea4,0xa2864e4f,0xdb89842e,0x82716c47,0x61479fb7,0x12dfd7d7,0xe0b2f6dc,0x3b9a2c56,0xd7f85d67,0x46be862a .long 0x0f82b214,0x03b0d8dd,0xf103cbc6,0x460c34f9,0x18d79e19,0xf32e5c03,0xa84117f8,0x8b8888ba,0xc0722677,0x8f3c37dc,0x1c1c0f27,0x10d21be9,0xe0f7a0c6,0xd47c8468,0xadecc0e0,0x9bf02213 .long 0x42b48b99,0x0baa7d12,0x48424096,0x1bcb665d,0xebfb5cfb,0x8b847cd6,0x9ad4d10d,0x87c2ae56,0x0de36726,0xf1cbb122,0x3fdfbd21,0xe7043c68,0x4e79d460,0x4bd0826a,0x4bd1a2cb,0x11f5e598 .long 0xb7fe7b6e,0x97554160,0x400a3fb2,0x7d16189a,0xe328ca1e,0xd73e9bea,0xe793d8cc,0x0dd04b97,0x506db8cc,0xa9c83c9b,0xcf38814c,0x5cd47aae,0xb64b45e6,0x26fc430d,0xd818ea84,0x079b5499 .long 0xc1c24a3b,0xebb01102,0x1c161c1a,0xca24e568,0x36f00a4a,0x103eea69,0x76176c7b,0x9ad76ee8,0x538e0ff7,0x97451fc2,0x6604b3b0,0x94f89809,0x3249cfd7,0x6311436e,0x41224f69,0x27b4a7bd .long 0xe0ac2941,0x03b5d21a,0xc2d31937,0x279b0254,0xcac992d0,0x3307c052,0xefa8b1f3,0x6aa7cb92,0x0d37c7a5,0x5a182580,0x342d5422,0x13380c37,0xd5d2ef92,0x92ac2d66,0x030c63c6,0x035a70c9 .long 0x4ce4f152,0xc16025dd,0xf9df7c06,0x1f419a71,0x91e4bb14,0x6d5b2214,0x839fb4ce,0xfc43c6cc,0x925d6b2d,0x49f06591,0x62186598,0x4b37d9d3,0xd01b1629,0x8c54a971,0x51d50e05,0xe1a9c29f .long 0x71ba1861,0x5109b785,0xd0c8f93d,0x48b22d5c,0x8633bb93,0xe8fa84a7,0x5aebbd08,0x53fba6ba,0xe5eea7d8,0x7ff27df3,0x68ca7158,0x521c8796,0xce6f1a05,0xb9d5133b,0xfd0ebee4,0x2d50cd53 .long 0xc5a3ef16,0xc82115d6,0xba079221,0x993eff9d,0x4b5da81c,0xe4da2c5e,0x8033fd85,0x9a89dbdb,0x2b892891,0x60819ebf,0x5d14a4d5,0x53902b21,0xd7fda421,0x6ac35051,0x61c83284,0xcc6ab885 .long 0xf74cff17,0x14eba133,0xecb813f2,0x240aaa03,0x6f665bee,0xcfbb6540,0xa425ad73,0x084b1fe4,0xd081f6a6,0x009d5d16,0xeef82c90,0x35304fe8,0xaa9eaa22,0xf20346d5,0xac1c91e3,0x0ada9f07 .long 
0x968a6144,0xa6e21678,0x07b31a1e,0x54c1f77c,0x5781fbe1,0xd6bb787e,0xe31f1c4a,0x61bd2ee0,0x781105fc,0xf25aa1e9,0x7b2f8e80,0x9cf2971f,0xcdff919b,0x26d15412,0x34bc896e,0x01db4ebe .long 0xb40df1cf,0x7d9b3e23,0x94e971b4,0x59337373,0x669cf921,0xbf57bd14,0x0c1a1064,0x865daedf,0x83279125,0x3eb70bd3,0x34ecdaab,0xbc3d5b9f,0x5f755caf,0x91e3ed7e,0xd41e6f02,0x49699f54 .long 0xd4a7a15b,0x185770e1,0xeaac87e7,0x08f3587a,0x473133ea,0x352018db,0x04fd30fc,0x674ce719,0x088b3e0e,0x7b8d9835,0x5d0d47a1,0x7a0356a9,0x6474a3c4,0x9d9e7659,0xff66966c,0x61ea48a7 .long 0x0f3e4834,0x30417758,0x17a9afcb,0xfdbb21c2,0x2f9a67b3,0x756fa17f,0xa245c1a8,0x2a6b2421,0x4af02291,0x64be2794,0x2a5804fe,0xade465c6,0xa6f08fd7,0x8dffbd39,0xaa14403b,0xc4efa84c .long 0x442b0f5c,0xa1b91b2a,0xcf997736,0xb748e317,0xcee90e16,0x8d1b62bf,0x0b2078c0,0x907ae271,0x0c9bcddd,0xdf31534b,0x39adce83,0x043fb054,0xd826846a,0x99031043,0xb144f393,0x61a9c0d6 .long 0x47718427,0xdab48046,0x6e830f8b,0xdf17ff9b,0xe49a1347,0x408d7ee8,0x91c1d4ae,0x6ac71e23,0x1defd73c,0xc8cbb9fd,0xbbbbfec5,0x19840657,0x9e7ef8ea,0x39db1cb5,0x64105f30,0x78aa8296 .long 0xa3738c29,0xa3d9b7f0,0xbc3250a3,0x0a2f235a,0x445e4caf,0x55e506f6,0x33475f7a,0x0974f73d,0x5ba2f5a8,0xd37dbba3,0x6af40066,0x542c6e63,0xc5d73e2c,0x26d99b53,0x6c3ca33e,0x06060d7d .long 0x065fef4a,0xcdbef1c2,0xfd5b92e3,0x77e60f7d,0x26708350,0xd7c549f0,0x34f121bf,0x201b3ad0,0x0334fc14,0x5fcac2a1,0x344552f6,0x8a9a9e09,0x97653082,0x7dd8a1d3,0x79d4f289,0x5fc0738f .long 0x17d2d8c3,0x787d244d,0x70830684,0xeffc6345,0xe4f73ae5,0x5ddb96dd,0x172549a5,0x8efb14b1,0x2245ae7a,0x6eb73eee,0xea11f13e,0xbca4061e,0x30b01f5d,0xb577421d,0x782e152c,0xaa688b24 .long 0xbd3502ba,0x67608e71,0xb4de75a0,0x4ef41f24,0xfd6125e5,0xb08dde5e,0xa409543f,0xde484825,0x65cc2295,0x1f198d98,0x6e0edfa2,0x428a3771,0xadf35fc7,0x4f9697a2,0xf7cac3c7,0x01a43c79 .long 0x0fd3659a,0xb05d7059,0xbb7f2d9a,0x8927f30c,0x8cf984d3,0x4023d1ac,0x02897a45,0x32125ed3,0x3d414205,0xfb572dad,0xe3fa82a9,0x73000ef2,0xf10a5581,0x4c0868e9,0x6b0b3ca5,0x5b61fc67 .long 0x7cae440c,0xc1258d5b,0x402b7531,0x21c08b41,0xde932321,0xf61a8955,0x2d1408af,0x3568faf8,0x9ecf965b,0x71b15e99,0xe917276f,0xf14ed248,0x820cf9e2,0xc6f4caa1,0x18d83c7e,0x681b20b2 .long 0xc6c01120,0x6cde738d,0xae70e0db,0x71db0813,0x74afe18c,0x95fc0644,0x129e2be7,0x34619053,0xdb2a3b15,0x80615cea,0xdb4c7073,0x0a49a19e,0x8fd2d367,0x0e1b84c8,0x033fb8aa,0xd74bf462 .long 0x533ef217,0x889f6d65,0xc3ca2e87,0x7158c7e4,0xdc2b4167,0xfb670dfb,0x844c257f,0x75910a01,0xcf88577d,0xf336bf07,0xe45e2ace,0x22245250,0x7ca23d85,0x2ed92e8d,0x2b812f58,0x29f8be4c .long 0x076fe12b,0xdd9ebaa7,0xae1537f9,0x3f2400cb,0x17bdfb46,0x1aa93528,0x67883b41,0xc0f98430,0x0170911d,0x5590ede1,0x34d4b17f,0x7562f5bb,0x1826b8d2,0xe1fa1df2,0x6bd80d59,0xb40b796a .long 0x3467ba92,0xd65bf197,0xf70954b0,0x8c9b46db,0x0e78f15d,0x97c8a0f3,0x85a4c961,0xa8f3a69a,0x61e4ce9b,0x4242660f,0x6ea6790c,0xbf06aab3,0xec986416,0xc6706f8e,0x9a9fc225,0x9e56dec1 .long 0x9a9898d9,0x527c46f4,0x5633cdef,0xd799e77b,0x7d9e4297,0x24eacc16,0x6b1cb734,0xabb61cea,0xf778443c,0xbee2e8a7,0x29de2fe6,0x3bb42bf1,0x3003bb6f,0xcbed86a1,0xd781cdf6,0xd3918e6c .long 0x9a5103f1,0x4bee3271,0xf50eac06,0x5243efc6,0x6adcc119,0xb8e122cb,0xc0b80a08,0x1b7faa84,0x6dfcd08c,0x32c3d1bd,0x0be427de,0x129dec4e,0x1d263c83,0x98ab679c,0xcef64eff,0xafc83cb7 .long 0x2fa6be76,0x85eb6088,0x1328cbfe,0x892585fb,0xcf618dda,0xc154d3ed,0x3abaf26e,0xc44f601b,0x2be1fdfd,0x7bf57d0b,0x21137fee,0xa833bd2d,0x2db591a8,0x9353af36,0x5562a056,0xc76f26dc .long 
0x3fdf5a51,0x1d87e47d,0x55c9cab0,0x7afb5f93,0x89e0586e,0x91bbf58f,0x0d843709,0x7c72c018,0x99b5c3dc,0xa9a5aafb,0x3844aeb0,0xa48a0f1d,0xb667e482,0x7178b7dd,0x6e23a59a,0x453985e9 .long 0x01b25dd8,0x4a54c860,0xfb897c8a,0x0dd37f48,0x0ea90cd9,0x5f8aa610,0x16d5830d,0xc8892c68,0xef514ca5,0xeb4befc0,0xe72c9ee6,0x478eb679,0xdbc40d5f,0x9bca20da,0xdde4f64a,0xf015de21 .long 0xeaf4b8a5,0xaa6a4de0,0x4bc60e32,0x68cfd9ca,0x7fd15e70,0x668a4b01,0xf27dc09d,0xd9f0694a,0xba708bcd,0xf6c3cad5,0x5bb95c2a,0x5cd2ba69,0x33c0a58f,0xaa28c1d3,0xabc77870,0x23e274e3 .long 0xdfd20a4a,0x44c3692d,0x81a66653,0x091c5fd3,0x09a0757d,0x6c0bb691,0x667343ea,0x9072e8b9,0x80848bec,0x31d40eb0,0x79fd36cc,0x95bd480a,0x65ed43f5,0x01a77c61,0x2e0d40bf,0xafccd127 .long 0x1cc1884b,0xeccfc82d,0x5d4753b4,0xc85ac201,0x658e099f,0xc7a6caac,0x04b27390,0xcf46369e,0x506467ea,0xe2e7d049,0x37cdeccc,0x481b63a2,0xed80143a,0x4029abd8,0xbcb00b88,0x28bfe3c7 .long 0x0643d84a,0x3bec1009,0xabd11041,0x885f3668,0xf83a34d6,0xdb02432c,0x719ceebe,0x32f7b360,0xdad1fe7a,0xf06c7837,0x5441a0b0,0x60a157a9,0xe2d47550,0x704970e9,0x271b9020,0xcd2bd553 .long 0x33e24a0b,0xff57f82f,0xf2565079,0x9cbee23f,0xeb5f5825,0x16353427,0xe948d662,0x276feec4,0xda10032b,0xd1b62bc6,0xf0e72a53,0x718351dd,0x2420e7ba,0x93452076,0x3a00118d,0x96368fff .long 0x150a49e4,0x00ce2d26,0x3f04706b,0x0c28b636,0x58b196d0,0xbad65a46,0xec9f8b7c,0x6c8455fc,0x2d71867e,0xe90c895f,0xedf9f38c,0x5c0be31b,0xd8f6ec04,0x2a37a15e,0x8cd85251,0x239639e7 .long 0x9c7c4c6b,0xd8975315,0xd7409af7,0x603aa3c0,0x007132fb,0xb8d53d0c,0xa6849238,0x68d12af7,0xbf5d9279,0xbe0607e7,0xaada74ce,0x9aa50055,0xba7e8ccb,0xe81079cb,0xa5f4ff5e,0x610c71d1 .long 0x5aa07093,0x9e2ee1a7,0xa75da47c,0xca84004b,0x3de75401,0x074d3951,0xbb311592,0xf938f756,0x00a43421,0x96197618,0x07bc78c8,0x39a25362,0x0a171276,0x278f710a,0x8d1a8f08,0xb28446ea .long 0xe3b6a661,0x184781bf,0xe6d279f7,0x7751cb1d,0xc59eb662,0xf8ff95d6,0x58d3dea7,0x186d90b7,0xdfb4f754,0x0e4bb6c1,0x2b2801dc,0x5c5cf56b,0x1f54564d,0xc561e452,0xf0dd7f13,0xb4fb8c60 .long 0x33ff98c7,0xf8849630,0xcf17769c,0x9619fffa,0x1bfdd80a,0xf8090bf6,0x422cfe63,0x14d9a149,0x6f6df9ea,0xb354c360,0x218f17ea,0xdbcf770d,0x79eb3480,0x207db7c8,0x559b6a26,0x213dbda8 .long 0x29fc81b3,0xac4c200b,0x171d87c1,0xebc3e09f,0x1481aa9e,0x91799530,0x92e114fa,0x051b92e1,0xecb5537f,0xdf8f92e9,0x290c7483,0x44b1b2cc,0x2adeb016,0xa711455a,0x81a10c2c,0x964b6856 .long 0xcec03623,0x4f159d99,0xef3271ea,0x05532225,0xc5ee4849,0xb231bea3,0x7094f103,0x57a54f50,0x9598b352,0x3e2d421d,0x67412ab4,0xe865a49c,0x1cc3a912,0xd2998a25,0x0c74d65d,0x5d092808 .long 0x4088567a,0x73f45908,0x1f214a61,0xeb6b280e,0xcaf0c13d,0x8c9adc34,0xf561fb80,0x39d12938,0xbc6edfb4,0xb2dc3a5e,0xfe4d210e,0x7485b1b1,0xe186ae72,0x062e0400,0x6eeb3b88,0x91e32d5c .long 0x4be59224,0x6df574d7,0x716d55f3,0xebc88ccc,0xcad6ed33,0x26c2e6d0,0x0d3e8b10,0xc6e21e7d,0x5bcc36bb,0x2cc5840e,0x7da74f69,0x9292445e,0x4e5193a8,0x8be8d321,0x8df06413,0x3ec23629 .long 0xb134defa,0xc7e9ae85,0x1bb2d475,0x6073b1d0,0x2863c00d,0xb9ad615e,0x525f4ac4,0x9e29493d,0x4e9acf4f,0xc32b1dea,0xa50db88d,0x3e1f01c8,0x04da916c,0xb05d70ea,0xd865803e,0x714b0d0a .long 0x9920cb5e,0x4bd493fc,0x92c7a3ac,0x5b44b1f7,0xbcec9235,0xa2a77293,0xcd378553,0x5ee06e87,0xda621607,0xceff8173,0x99f5d290,0x2bb03e4c,0xa6f734ac,0x2945106a,0xd25c4732,0xb5056604 .long 0xe079afee,0x5945920c,0x6789831f,0x686e17a0,0xb74a5ae5,0x5966bee8,0x1e258d46,0x38a673a2,0x83141c95,0xbd1cc1f2,0x0e96e486,0x3b2ecf4f,0x74e5fc78,0xcd3aa896,0x2482fa7a,0x415ec10c .long 
0x80503380,0x15234419,0xd314b392,0x513d917a,0x63caecae,0xb0b52f4e,0x2dc7780b,0x07bf22ad,0xe4306839,0xe761e8a1,0x5dd7feaa,0x1b3be962,0x74c778f1,0x4fe728de,0x5e0070f6,0xf1fa0bda .long 0x6ec3f510,0x85205a31,0xd2980475,0x2c7e4a14,0x6f30ebfd,0xde3c19c0,0xd4b7e644,0xdb1c1f38,0x5dce364a,0xfe291a75,0x058f5be3,0xb7b22a3c,0x37fea38c,0x2cd2c302,0x2e17be17,0x2930967a .long 0x0c061c65,0x87f009de,0xedc6ed44,0xcb014aac,0x3bafb1eb,0x49bd1cb4,0x282d3688,0x81bd8b5c,0xf01a17af,0x1cdab87e,0xe710063b,0x21f37ac4,0x42fc8193,0x5a6c5676,0x56a6015c,0xf4753e70 .long 0xa15b0a44,0x020f795e,0x8958a958,0x8f37c8d7,0xa4b675b5,0x63b7e89b,0x0fc31aea,0xb4fb0c0c,0xa7ff1f2e,0xed95e639,0x619614fb,0x9880f5a3,0x947151ab,0xdeb6ff02,0xa868dcdb,0x5bc5118c .long 0x4c20cea5,0xd8da2055,0x14c4d69a,0xcac2776e,0x622d599b,0xcccb22c1,0x68a9bb50,0xa4ddb653,0x1b4941b4,0x2c4ff151,0x6efba588,0xe1ff19b4,0xc48345e0,0x35034363,0x1e29dfc4,0x45542e3d .long 0x349f7aed,0xf197cb91,0x8fca8420,0x3b2b5a00,0x23aaf6d8,0x7c175ee8,0x35af32b6,0x54dcf421,0x27d6561e,0x0ba14307,0xd175b1e2,0x879d5ee4,0x99807db5,0xc7c43673,0x9cd55bcd,0x77a54455 .long 0x0105c072,0xe6c2ff13,0x8dda7da4,0x18f7a99f,0x0e2d35c1,0x4c301820,0xd9cc6c82,0x06a53ca0,0xf1aa1d9e,0xaa21cc1e,0x4a75b1e8,0x32414334,0x0ebe9fdc,0x2a6d1328,0x98a4755a,0x16bd173f .long 0x2133ffd9,0xfbb9b245,0x830f1a20,0x39a8b2f1,0xd5a1f52a,0x484bc97d,0xa40eddf8,0xd6aebf56,0x76ccdac6,0x32257acb,0x1586ff27,0xaf4d36ec,0xf8de7dd1,0x8eaa8863,0x88647c16,0x0045d5cf .long 0xc005979d,0xa6f3d574,0x6a40e350,0xc2072b42,0x8de2ecf9,0xfca5c156,0xa515344e,0xa8c8bf5b,0x114df14a,0x97aee555,0xfdc5ec6b,0xd4374a4d,0x2ca85418,0x754cc28f,0xd3c41f78,0x71cb9e27 .long 0x03605c39,0x89105079,0xa142c96c,0xf0843d9e,0x16923684,0xf3744934,0xfa0a2893,0x732caa2f,0x61160170,0xb2e8c270,0x437fbaa3,0xc32788cc,0xa6eda3ac,0x39cd818e,0x9e2b2e07,0xe2e94239 .long 0x0260e52a,0x6967d39b,0x90653325,0xd42585cc,0x21ca7954,0x0d9bd605,0x81ed57b3,0x4fa20877,0xe34a0bbe,0x60c1eff8,0x84f6ef64,0x56b0040c,0xb1af8483,0x28be2b24,0xf5531614,0xb2278163 .long 0x5922ac1c,0x8df27545,0xa52b3f63,0xa7b3ef5c,0x71de57c4,0x8e77b214,0x834c008b,0x31682c10,0x4bd55d31,0xc76824f0,0x17b61c71,0xb6d1c086,0xc2a5089d,0x31db0903,0x184e5d3f,0x9c092172 .long 0xc00cc638,0xdd7ced5b,0x61278fc2,0x1a2015eb,0x6a37f8d6,0x2e8e5288,0xe79933ad,0xc457786f,0x2c51211a,0xb3fe4cce,0x24c20498,0xad9b10b2,0xd28db5e5,0x90d87a4f,0x3aca2fc3,0x698cd105 .long 0xe91b536d,0x4f112d07,0x9eba09d6,0xceb982f2,0x197c396f,0x3c157b2c,0x7b66eb24,0xe23c2d41,0x3f330d37,0x480c57d9,0x79108deb,0xb3a4c8a1,0xcb199ce5,0x702388de,0xb944a8d4,0x0b019211 .long 0x840bb336,0x24f2a692,0xa669fa7b,0x7c353bdc,0xdec9c300,0xda20d6fc,0xa13a4f17,0x625fbe2f,0xdbc17328,0xa2b1b61a,0xa9515621,0x008965bf,0xc620ff46,0x49690939,0x8717e91c,0x182dd27d .long 0xea6c3997,0x5ace5035,0xc2610bef,0x54259aaa,0x3c80dd39,0xef18bb3f,0x5fc3fa39,0x6910b95b,0x43e09aee,0xfce2f510,0xa7675665,0xced56c9f,0xd872db61,0x10e265ac,0xae9fce69,0x6982812e .long 0xce800998,0x29be11c6,0xb90360d9,0x72bb1752,0x5a4ad590,0x2c193197,0x9fc1dbc0,0x2ba2f548,0xe490ebe0,0x7fe4eebb,0x7fae11c0,0x12a0a4cd,0xe903ba37,0x7197cf81,0xde1c6dd8,0xcf7d4aa8 .long 0x3fd5684c,0x92af6bf4,0x80360aa1,0x2b26eecf,0x00546a82,0xbd960f30,0xf59ad8fe,0x407b3c43,0x249c82ba,0x86cae5fe,0x2463744c,0x9e0faec7,0x94916272,0x87f551e8,0x6ceb0615,0x033f9344 .long 0x8be82e84,0x1e5eb0d1,0x7a582fef,0x89967f0e,0xa6e921fa,0xbcf687d5,0xd37a09ba,0xdfee4cf3,0xb493c465,0x94f06965,0x7635c030,0x638b9a1c,0x66f05e9f,0x76667864,0xc04da725,0xccaf6808 .long 
0x768fccfc,0xca2eb690,0xb835b362,0xf402d37d,0xe2fdfcce,0x0efac0d0,0xb638d990,0xefc9cdef,0xd1669a8b,0x2af12b72,0x5774ccbd,0x33c536bc,0xfb34870e,0x30b21909,0x7df25aca,0xc38fa2f7 .long 0xbf81f3f5,0x74c5f02b,0xaf7e4581,0x0525a5ae,0x433c54ae,0x88d2aaba,0x806a56c5,0xed9775db,0xc0edb37d,0xd320738a,0x66cc1f51,0x25fdb6ee,0x10600d76,0xac661d17,0xbdd1ed76,0x931ec1f3 .long 0x19ee43f1,0x65c11d62,0x60829d97,0x5cd57c3e,0x984be6e8,0xd26c91a3,0x8b0c53bd,0xf08d9309,0xc016e4ea,0x94bc9e5b,0x11d43d2b,0xd3916839,0x73701155,0x886c5ad7,0x20b00715,0xe0377626 .long 0xaa80ba59,0x7f01c9ec,0x68538e51,0x3083411a,0xe88128af,0x970370f1,0x91dec14b,0x625cc3db,0x01ac3107,0xfef9666c,0xd5057ac3,0xb2a8d577,0x92be5df7,0xb0f26299,0x00353924,0xf579c8e5 .long 0x1341ed7a,0xb8fa3d93,0xa7b59d49,0x4223272c,0x83b8c4a4,0x3dcb1947,0xed1302e4,0x4e413c01,0xe17e44ce,0x6d999127,0x33b3adfb,0xee86bf75,0x25aa96ca,0xf6902fe6,0xe5aae47d,0xb73540e4 .long 0x1b4a158c,0x32801d7b,0x27e2a369,0xe571c99e,0x10d9f197,0x40cb76c0,0x3167c0ae,0xc308c289,0xeb7958f2,0xa6ef9dd3,0x300879b1,0xa7226dfc,0x7edf0636,0x6cd0b362,0x7bc37eed,0x4efbce6c .long 0x8d699021,0x75f92a05,0x772566e3,0x586d4c79,0x761ad23a,0x378ca5f1,0x1465a8ac,0x650d86fc,0x842ba251,0x7a4ed457,0x42234933,0x6b65e3e6,0x31aad657,0xaf1543b7,0xcbfec369,0xa4cefe98 .long 0x9f47befb,0xb587da90,0x41312d13,0x6562e9fb,0xeff1cefe,0xa691ea59,0x05fc4cf6,0xcc30477a,0x0b0ffd3d,0xa1632461,0x5b355956,0xa1f16f3b,0x4224ec24,0x5b148d53,0xf977012a,0xdc834e7b .long 0xb2c69dbc,0x7bfc5e75,0x03c3da6c,0x3aa77a29,0xca910271,0xde0df03c,0x7806dc55,0xcbd5ca4a,0x6db476cb,0xe1ca5807,0x5f37a31e,0xfde15d62,0xf41af416,0xf49af520,0x7d342db5,0x96c5c5b1 .long 0xeb4ceb9b,0x155c43b7,0x4e77371a,0x2e993010,0x675d43af,0x1d2987da,0x8599fd72,0xef2bc1c0,0x9342f6b2,0x96894b7b,0x7c8e71f0,0x201eadf2,0x4a1f3efc,0xf3479d9f,0x702a9704,0xe0f8a742 .long 0xb3eba40c,0xeafd44b6,0xc1c1e0d0,0xf9739f29,0x619d505e,0x0091471a,0x9d7c263e,0xc15f9c96,0x83afbe33,0x5be47285,0x04f1e092,0xa3b6d6af,0x751a9d11,0xe76526b9,0x9a4ae4d2,0x2ec5b26d .long 0x02f6fb8d,0xeb66f4d9,0x96912164,0x4063c561,0x80ef3000,0xeb7050c1,0xeaa5b3f0,0x288d1c33,0x07806fd8,0xe87c68d6,0x4bbbf50f,0xb2f7f9d5,0xac8d6627,0x25972f3a,0x10e8c13b,0xf8547774 .long 0x872b4a60,0xcc50ef6c,0x4613521b,0xab2a34a4,0x983e15d1,0x39c5c190,0x59905512,0x61dde5df,0x9f2275f3,0xe417f621,0x451d894b,0x0750c8b6,0x78b0bdaa,0x75b04ab9,0x458589bd,0x3bfd9fd4 .long 0xee9120b6,0xf1013e30,0x23a4743e,0x2b51af93,0x48d14d9e,0xea96ffae,0x698a1d32,0x71dc0dbe,0x0180cca4,0x914962d2,0xc3568963,0x1ae60677,0x437bc444,0x8cf227b1,0xc9962c7a,0xc650c83b .long 0xfe7ccfc4,0x23c2c7dd,0x1b929d48,0xf925c89d,0x06783c33,0x4460f74b,0xa590475a,0xac2c8d49,0xb807bba0,0xfb40b407,0x69ff8f3a,0x9d1e362d,0xcbef64a4,0xa33e9681,0x332fb4b2,0x67ece5fa .long 0x739f10e3,0x6900a99b,0xff525925,0xc3341ca9,0xa9e2d041,0xee18a626,0x29580ddd,0xa5a83685,0x9d7de3cd,0xf3470c81,0x2062cf9c,0xedf02586,0xc010edb0,0xf43522fa,0x13a4b1ae,0x30314135 .long 0xdb22b94b,0xc792e02a,0xa1eaa45b,0x993d8ae9,0xcd1e1c63,0x8aad6cd3,0xc5ce688a,0x89529ca7,0xe572a253,0x2ccee3aa,0x02a21efb,0xe02b6438,0xc9430358,0xa7091b6e,0x9d7db504,0x06d1b1fa .long 0xc4744733,0x58846d32,0x379f9e34,0x40517c71,0x130ef6ca,0x2f65655f,0xf1f3503f,0x526e4488,0x7ee4a976,0x8467bd17,0x921363d1,0x1d9dc913,0xb069e041,0xd8d24c33,0x2cdf7f51,0x5eb5da0a .long 0x197b994f,0x1c0f3cb1,0x2843eae9,0x3c95a6c5,0xa6097ea5,0x7766ffc9,0xd723b867,0x7bea4093,0x4db378f9,0xb48e1f73,0xe37b77ac,0x70025b00,0xaf24ad46,0x943dc8e7,0x16d00a85,0xb98a15ac .long 
0x2743b004,0x3adc38ba,0x334415ee,0xb1c7f4f7,0x1e62d05a,0xea43df8f,0x9d76a3b6,0x32618905,0xa23a0f46,0x2fbd0bb5,0x6a01918c,0x5bc971db,0xb4743f94,0x7801d94a,0x676ae22b,0xb94df65e .long 0xaf95894c,0xaafcbfab,0x276b2241,0x7b9bdc07,0x5bdda48b,0xeaf98362,0xa3fcb4df,0x5977faf2,0x052c4b5b,0xbed042ef,0x067591f0,0x9fe87f71,0x22f24ec7,0xc89c73ca,0xe64a9f1b,0x7d37fa9e .long 0x15562627,0x2710841a,0xc243b034,0x2c01a613,0x2bc68609,0x1d135c56,0x8b03f1f6,0xc2ca1715,0x3eb81d82,0xc9966c2d,0x8f6df13e,0xc02abf4a,0x8f72b43b,0x77b34bd7,0x360c82b0,0xaff6218f .long 0x8d55b9d2,0x0aa5726c,0x99e9bffb,0xdc0adbe9,0xefb9e72a,0x9097549c,0x9dfb3111,0x16755712,0xf26847f9,0xdd8bf984,0xdfb30cb7,0xbcb8e387,0x5171ef9c,0xc1fd32a7,0x389b363f,0x977f3fc7 .long 0xf4babda0,0x116eaf2b,0xf7113c8e,0xfeab68bd,0xb7def526,0xd1e3f064,0xe0b3fa02,0x1ac30885,0x40142d9d,0x1c5a6e7b,0x30921c0b,0x839b5603,0x36a116a3,0x48f301fa,0xcfd9ee6d,0x380e1107 .long 0x58854be1,0x7945ead8,0xcbd4d49d,0x4111c12e,0x3a29c2ef,0xece3b1ec,0x8d3616f5,0x6356d404,0x594d320e,0x9f0d6a8f,0xf651ccd2,0x0989316d,0x0f8fdde4,0x6c32117a,0xa26a9bbc,0x9abe5cc5 .long 0x9723f671,0xcff560fb,0x7f3d593c,0x21b2a12d,0x24ba0696,0xe4cb18da,0xc3543384,0x186e2220,0x88312c29,0x722f64e0,0x17dc7752,0x94282a99,0x5a85ee89,0x62467bbf,0xf10076a0,0xf435c650 .long 0x43b3a50b,0xc9ff1539,0x1a53efbc,0x7132130c,0xf7b0c5b7,0x31bfe063,0x4ea994cc,0xb0179a7d,0xc85f455b,0x12d064b3,0x8f6e0062,0x47259328,0xb875d6d9,0xf64e590b,0xad92bcc7,0x22dd6225 .long 0xb9c3bd6d,0xb658038e,0xfbba27c8,0x00cdb0d6,0x1062c45d,0x0c681337,0x2d33407d,0xd8515b8c,0x8cbb5ecf,0xcb8f699e,0xc608d7d8,0x8c4347f8,0xbb3e00db,0x2c11850a,0xecb49d19,0x20a8dafd .long 0x45ee2f40,0xbd781480,0x416b60cf,0x75e354af,0x8d49a8c4,0xde0b58a1,0xfa359536,0xe40e94e2,0x62accd76,0xbd4fa59f,0x8c762837,0x05cf466a,0x448c277b,0xb5abda99,0x48b13740,0x5a9e01bf .long 0x326aad8d,0x9d457798,0xc396f7e7,0xbdef4954,0xc253e292,0x6fb274a2,0x1cfe53e7,0x2800bf0a,0x44438fd4,0x22426d31,0x5e259f9a,0xef233923,0x03f66264,0x4188503c,0x7f9fdfab,0x9e5e7f13 .long 0x5fcc1aba,0x565eb76c,0x59b5bff8,0xea632548,0xaab6d3fa,0x5587c087,0x6ce39c1b,0x92b639ea,0x953b135c,0x0706e782,0x425268ef,0x7308912e,0x090e7469,0x599e92c7,0x9bc35e75,0x83b90f52 .long 0x244975b3,0x4750b3d0,0x11965d72,0xf3a44358,0x9c8dc751,0x179c6774,0xd23d9ff0,0xff18cdfe,0x2028e247,0xc4013833,0xf3bfbc79,0x96e280e2,0xd0880a84,0xf60417bd,0x2a568151,0x263c9f3d .long 0x2d2ce811,0x36be15b3,0xf8291d21,0x846dc0c2,0x789fcfdb,0x5cfa0ecb,0xd7535b9a,0x45a0beed,0x96d69af1,0xec8e9f07,0x599ab6dc,0x31a7c5b8,0xf9e2e09f,0xd36d45ef,0xdcee954b,0x3cf49ef1 .long 0x086cff9b,0x6be34cf3,0x39a3360f,0x88dbd491,0x0dbfbd1d,0x1e96b8cc,0xcb7e2552,0xc1e5f7bf,0x28819d98,0x0547b214,0x7aea9dcb,0xc770dd9c,0x041d68c8,0xaef0d4c7,0x13cb9ba8,0xcc2b9818 .long 0xfe86c607,0x7fc7bc76,0x502a9a95,0x6b7b9337,0xd14dab63,0x1948dc27,0xdae047be,0x249dd198,0xa981a202,0xe8356584,0x3a893387,0x3531dd18,0xc85c7209,0x1be11f90,0xe2a52b5a,0x93d2fe1e .long 0xec6d6b97,0x8225bfe2,0xbd0aa5de,0x9cf6d6f4,0x54779f5f,0x911459cb,0x86aeb1f3,0x5649cddb,0x3f26ce5a,0x32133579,0x550f431e,0xc289a102,0x73b84c6f,0x559dcfda,0xee3ac4d7,0x84973819 .long 0xf2606a82,0xb51e55e6,0x90f2fb57,0xe25f7061,0xb1a4e37c,0xacef6c2a,0x5dcf2706,0x864e359d,0x7ce57316,0x479e6b18,0x3a96b23d,0x2cab2500,0x8ef16df7,0xed489862,0xef3758b5,0x2056538c .long 0xf15d3101,0xa7df865e,0x61b553d7,0x80c5533a,0x4ed14294,0x366e1997,0xb3c0bcd6,0x6620741f,0xedc45418,0x21d1d9c4,0xc1cc4a9d,0x005b859e,0xa1c462f0,0xdf01f630,0xf26820c7,0x15d06cf3 .long 
0x3484be47,0x9f7f24ee,0x4a0c902f,0x2ff33e96,0x5a0bc453,0x00bdf457,0x1aa238db,0x2378dfaf,0x856720f2,0x272420ec,0x96797291,0x2ad9d95b,0x768a1558,0xd1242cc6,0x5cc86aa8,0x2e287f8b .long 0x990cecaa,0x796873d0,0x675d4080,0xade55f81,0x21f0cd84,0x2645eea3,0xb4e17d02,0x7a1efa0f,0x037cc061,0xf6858420,0xd5d43e12,0x682e05f0,0x27218710,0x59c36994,0x3f7cd2fc,0x85cbba4d .long 0x7a3cd22a,0x726f9729,0x4a628397,0x9f8cd5dc,0xc23165ed,0x17b93ab9,0x122823d4,0xff5f5dbf,0x654a446d,0xc1e4e4b5,0x677257ba,0xd1a9496f,0xde766a56,0x6387ba94,0x521ec74a,0x23608bc8 .long 0x6688c4d4,0x16a522d7,0x07373abd,0x9d6b4282,0xb42efaa3,0xa62f07ac,0xe3b90180,0xf73e00f7,0x49421c3e,0x36175fec,0x3dcf2678,0xc4e44f9b,0x7220f09f,0x76df436b,0x3aa8b6cf,0x172755fb .long 0x446139cc,0xbab89d57,0x5fe0208f,0x0a0a6e02,0x11e5d399,0xcdbb63e2,0xa8977f0b,0x33ecaa12,0xf7c42664,0x59598b21,0xab65d08a,0xb3e91b32,0xf4502526,0x035822ee,0x720a82a9,0x1dcf0176 .long 0x3d589e02,0x50f8598f,0xb1d63d2c,0xdf0478ff,0x1571cd07,0x8b8068bd,0xd79670cd,0x30c3aa4f,0x941ade7f,0x25e8fd4b,0x32790011,0x3d1debdc,0x3a3f9ff0,0x65b6dcbd,0x793de69c,0x282736a4 .long 0xd41d3bd3,0xef69a0c3,0x07a26bde,0xb533b8c9,0xdb2edf9f,0xe2801d97,0xe1877af0,0xdc4a8269,0x3d590dbe,0x6c1c5851,0xee4e9357,0x84632f6b,0x79b33374,0xd36d36b7,0x9bbca2e6,0xb46833e3 .long 0xf7fc0586,0x37893913,0x66bf4719,0x385315f7,0xb31855dc,0x72c56293,0x849061fe,0xd1416d4e,0x51047213,0xbeb3ab78,0xf040c996,0x447f6e61,0x638b1d0c,0xd06d310d,0xbad1522e,0xe28a413f .long 0x82003f86,0x685a76cb,0x0bcdbca3,0x610d07f7,0x9ca4c455,0x6ff66021,0xcea10eec,0x7df39b87,0xe22db218,0xb9255f96,0x08a34c44,0x8cc6d9eb,0x859f9276,0xcd4ffb86,0x50d07335,0x8fa15eb2 .long 0xcf2c24b5,0xdf553845,0x52f9c3ba,0x89f66a9f,0xe4a7ceb3,0x8f22b5b9,0x0e134686,0xaffef809,0x8eb8fac2,0x3e53e1c6,0x28aec98e,0x93c1e4eb,0x32a43bcb,0xb6b91ec5,0xb2d74a51,0x2dbfa947 .long 0xca84bad7,0xe065d190,0xad58e65c,0xfb13919f,0xf1cb6e31,0x3c41718b,0x06d05c3f,0x688969f0,0x21264d45,0xd4f94ce7,0x7367532b,0xfdfb65e9,0x0945a39d,0x5b1be8b1,0x2b8baf3b,0x229f789c .long 0x6f49f15d,0xd8f41f3e,0x907f0792,0x678ce828,0xfca6e867,0xc69ace82,0xd01dcc89,0x106451ae,0x19fc32d2,0x1bb4f7f0,0xb00c52d2,0x64633dfc,0xad9ea445,0x8f13549a,0xfb323705,0x99a3bf50 .long 0x534d4dbc,0x0c9625a2,0xc2a2fea3,0x45b8f1d1,0xa530fc1a,0x76ec21a1,0x9e5bd734,0x4bac9c2a,0x7b4e3587,0x5996d76a,0x1182d9e3,0x0045cdee,0x1207f13d,0x1aee24b9,0x97345a41,0x66452e97 .long 0x9f950cd0,0x16e5b054,0xd7fdd075,0x9cc72fb1,0x66249663,0x6edd61e7,0xf043cccb,0xde4caa4d,0x55c7ac17,0x11b1f57a,0x1a85e24d,0x779cbd44,0xe46081e7,0x78030f86,0x8e20f643,0xfd4a6032 .long 0x0a750c0f,0xcc7a6488,0x4e548e83,0x39bacfe3,0x0c110f05,0x3d418c76,0xb1f11588,0x3e4daa4c,0x5ffc69ff,0x2733e7b5,0x92053127,0x46f147bc,0xd722df94,0x885b2434,0xe6fc6b7c,0x6a444f65 .long 0xc3f16ea8,0x7a1a465a,0xb2f1d11c,0x115a461d,0x6c68a172,0x4767dd95,0xd13a4698,0x3392f2eb,0xe526cdc7,0xc7a99ccd,0x22292b81,0x8e537fdc,0xa6d39198,0x76d8cf69,0x2446852d,0xffc5ff43 .long 0xa90567e6,0x97b14f7e,0xb6ae5cb7,0x513257b7,0x9f10903d,0x85454a3c,0x69bc3724,0xd8d2c9ad,0x6b29cb44,0x38da9324,0x77c8cbac,0xb540a21d,0x01918e42,0x9bbfe435,0x56c3614e,0xfffa707a .long 0xd4e353b7,0x0ce4e3f1,0xef46b0a0,0x062d8a14,0x574b73fd,0x6408d5ab,0xd3273ffd,0xbc41d1c9,0x6be77800,0x3538e1e7,0xc5655031,0x71fe8b37,0x6b9b331a,0x1cd91621,0xbb388f73,0xad825d0b .long 0x1cb76219,0x56c2e05b,0x71567e7e,0x0ec0bf91,0x61c4c910,0xe7076f86,0xbabc04d9,0xd67b085b,0x5e93a96a,0x9fb90459,0xfbdc249a,0x7526c1ea,0xecdd0bb7,0x0d44d367,0x9dc0d695,0x95399917 .long 
0x9e240d18,0x61360ee9,0xb4b94466,0x057cdcac,0x2fe5325c,0xe7667cd1,0x21974e3b,0x1fa297b5,0xdb083d76,0xfa4081e7,0xf206bd15,0x31993be6,0x14c19f8c,0x8949269b,0xa9d92357,0x21468d72 .long 0xa4c506ec,0x2ccbc583,0xd1acfe97,0x957ed188,0x12f1aea2,0x8baed833,0x8325362d,0xef2a6cb4,0x8e195c43,0x130dde42,0x0e6050c6,0xc842025a,0x08686a5d,0x2da972a7,0xe508b4a8,0xb52999a1 .long 0x10a5a8bd,0xd9f090b9,0x096864da,0xca91d249,0x3f67dbc1,0x8e6a93be,0xf5f4764c,0xacae6fba,0xd21411a0,0x1563c6e0,0xda0a4ad8,0x28fa787f,0x908c8030,0xd524491c,0x4c795f07,0x1257ba0e .long 0xceca9754,0x83f49167,0x4b7939a0,0x426d2cf6,0x723fd0bf,0x2555e355,0xc4f144e2,0xa96e6d06,0x87880e61,0x4768a8dd,0xe508e4d5,0x15543815,0xb1b65e15,0x09d7e772,0xac302fa0,0x63439dd6 .long 0xc14e35c2,0xb93f802f,0x4341333c,0x71735b7c,0x16d4f362,0x03a25104,0xbf433c8e,0x3f4d069b,0xf78f5a7c,0x0d83ae01,0x7c4eed07,0x50a8ffbe,0x76e10f83,0xc74f8906,0x9ddaf8e1,0x7d080966 .long 0x698e04cc,0xb11df8e1,0x169005c8,0x877be203,0x4f3c6179,0x32749e8c,0x7853fc05,0x2dbc9d0a,0x9454d937,0x187d4f93,0xb4800e1b,0xe682ce9d,0x165e68e8,0xa9129ad8,0xbe7f785b,0x0fe29735 .long 0x5b9e02b7,0x5303f40c,0x35ee04e8,0xa37c9692,0x34d6632b,0x5f46cc20,0x96ac545b,0x55ef72b2,0x7b91b062,0xabec5c1f,0xbb33e821,0x0a79e1c7,0x3a9f4117,0xbb04b428,0xfd2a475a,0x0de1f28f .long 0x3a4434b4,0x31019ccf,0x1a7954dc,0xa3458111,0xe34972a7,0xa9dac80d,0x74f6b8dd,0xb043d054,0x11137b1a,0x021c319e,0xed5cc03f,0x00a754ce,0xcbea5ad4,0x0aa2c794,0x70c015b6,0x093e67f4 .long 0xc97e3f6b,0x72cdfee9,0xb6da7461,0xc10bcab4,0xb59806b9,0x3b02d2fc,0xa1de6f47,0x85185e89,0x0eb6c4d4,0x39e6931f,0xd4fa5b04,0x4d4440bd,0x34be7eb8,0x5418786e,0x9d7259bc,0x6380e521 .long 0xd598d710,0x20ac0351,0xcb3a4da4,0x272c4166,0xca71de1f,0xdb82fe1a,0xd8f54b0f,0x746e79f2,0x4b573e9b,0x6e7fc736,0xfd4b5040,0x75d03f46,0x0b98d87b,0x5c1cc36d,0x1f472da1,0x513ba3f1 .long 0xabb177dd,0x79d0af26,0x7891d564,0xf82ab568,0x72232173,0x2b6768a9,0x8c1f6619,0xefbb3bb0,0xa6d18358,0xb29c11db,0xb0916d3a,0x519e2797,0x9188e290,0xd4dc18f0,0x98b0ca7f,0x648e86e3 .long 0x983c38b5,0x859d3145,0x637abc8b,0xb14f176c,0xcaff7be6,0x2793fb9d,0x35a66a5a,0xebe5a55f,0x9f87dc59,0x7cec1dcd,0xfbdbf560,0x7c595cd3,0x26eb3257,0x5b543b22,0xc4c935fd,0x69080646 .long 0x81e9ede3,0x7f2e4403,0xcaf6df0a,0x243c3894,0x1c073b11,0x7c605bb1,0xba6a4a62,0xcd06a541,0x49d4e2e5,0x29168949,0x4af66880,0x33649d07,0xe9a85035,0xbfc0c885,0xfc410f4b,0xb4e52113 .long 0x78a6513b,0xdca3b706,0x9edb1943,0x92ea4a2a,0xdb6e2dd8,0x02642216,0x9fd57894,0x9b45d0b4,0xc69d11ae,0x114e70db,0x4c57595f,0x1477dd19,0xec77c272,0xbc2208b4,0xdb68f59c,0x95c5b4d7 .long 0x42e532b7,0xb8c4fc63,0x9ae35290,0x386ba422,0xd201ecbc,0xfb5dda42,0xa0e38fd6,0x2353dc8b,0x68f7e978,0x9a0b85ea,0x2ad6d11f,0x96ec5682,0xe5f6886d,0x5e279d6c,0x3cb1914d,0xd3fe03cd .long 0x7ea67c77,0xfe541fa4,0xe3ea810c,0x952bd2af,0x8d01d374,0x791fef56,0x0f11336e,0xa3a1c621,0xc7ec6d79,0x5ad0d5a9,0x3225c342,0xff7038af,0xbc69601b,0x003c6689,0x45e8747d,0x25059bc7 .long 0xf2086fbf,0xfa4965b2,0x86916078,0xf6840ea6,0x70081d6c,0xd7ac7620,0xb5328645,0xe600da31,0x529b8a80,0x01916f63,0x2d7d6f3e,0xe80e4858,0xd664ca7c,0x29eb0fe8,0xe7b43b0c,0xf017637b .long 0x76cb2566,0x9a75c806,0xb24892d9,0x8f76acb1,0x1f08fe45,0x7ae7b9cc,0x6a4907d8,0x19ef7329,0x5f228bf0,0x2db4ab71,0x817032d7,0xf3cdea39,0xdcabe3c0,0x0b1f482e,0xbb86325c,0x3baf76b4 .long 0x10089465,0xd49065e0,0x8e77c596,0x3bab5d29,0x193dbd95,0x7636c3a6,0xb246e499,0xdef5d294,0x286b2475,0xb22c58b9,0xcd80862b,0xa0b93939,0xf0992388,0x3002c83a,0xeacbe14c,0x6de01f9b .long 
0xadd70482,0x6aac688e,0x7b4a4e8a,0x708de92a,0x758a6eef,0x75b6dd73,0x725b3c43,0xea4bf352,0x87912868,0x10041f2c,0xef09297a,0xb1b1be95,0xa9f3860a,0x19ae23c5,0x515dcf4b,0xc4f0f839 .long 0x97f6306a,0x3c7ecca3,0x68a3a4b0,0x744c44ae,0xb3a1d8a2,0x69cd13a0,0x5256b578,0x7cad0a1e,0x33791d9e,0xea653fcd,0x74b2e05f,0x9cc2a05d,0xfd7affa2,0x73b391dc,0xb6b05442,0xddb7091e .long 0x8538a5c6,0xc71e27bf,0x89abff17,0x195c63dd,0x1b71e3da,0xfd315285,0xfa680fa0,0x9cbdfda7,0x849d7eab,0x9db876ca,0x3c273271,0xebe2764b,0xf208dcea,0x663357e3,0x565b1b70,0x8c5bd833 .long 0x9837fc0d,0xccc3b4f5,0xa79cf00f,0x9b641ba8,0xdfdf3990,0x7428243d,0x020786b1,0x83a594c4,0x526c4502,0xb712451a,0x6adb3f93,0x9d39438e,0xe9ff0ccd,0xfdb261e3,0xe07af4c3,0x80344e3c .long 0x2fa4f126,0x75900d7c,0x5c99a232,0x08a3b865,0xdb25e0c3,0x2478b6bf,0x71db2edf,0x482cc2c2,0x5f321bb8,0x37df7e64,0x9a8005b4,0x8a93821b,0xcc8c1958,0x3fa2f10c,0x2c269d0a,0x0d332218 .long 0xe246b0e6,0x20ab8119,0xd349fd17,0xb39781e4,0xb31aa100,0xd293231e,0xbb032168,0x4b779c97,0xc8470500,0x4b3f19e1,0x0c4c869d,0x45b7efe9,0xa1a6bbcc,0xdb84f38a,0xb2fddbc1,0x3b59cb15 .long 0x3fd165e8,0xba5514df,0x061f8811,0x499fd6a9,0xbfef9f00,0x72cd1fe0,0x79ad7e8a,0x120a4bb9,0x5f4a5ac5,0xf2ffd095,0x95a7a2f0,0xcfd174f1,0x9d17baf1,0xd42301ba,0x77f22089,0xd2fa487a .long 0xb1dc77e1,0x9cb09efe,0x21c99682,0xe9566939,0x6c6067bb,0x8c546901,0x61c24456,0xfd378574,0x81796b33,0x2b6a6cbe,0x58e87f8b,0x62d550f6,0x7f1b01b4,0x1b763e1c,0x1b1b5e12,0x4b93cfea .long 0x1d531696,0xb9345238,0x88cdde69,0x57201c00,0x9a86afc7,0xdde92251,0xbd35cea8,0xe3043895,0x8555970d,0x7608c1e1,0x2535935e,0x8267dfa9,0x322ea38b,0xd4c60a57,0x804ef8b5,0xe0bf7977 .long 0xc06fece4,0x1a0dab28,0x94e7b49d,0xd405991e,0x706dab28,0xc542b6d2,0xa91618fb,0xcb228da3,0x107d1cea,0x224e4164,0xd0f5d8f1,0xeb9fdab3,0x0d6e41cd,0xc02ba386,0x9b1f7146,0x676a72c5 .long 0x4d6cb00b,0xffd6dd98,0xde2e8d7c,0xcef9c5ca,0x641c7936,0xa1bbf5d7,0xee8f772e,0x1b95b230,0xe8ac25b1,0xf765a92e,0x3a18b7c6,0xceb04cfc,0x0acc8966,0x27944cef,0x434c1004,0xcbb3c957 .long 0xa43ff93c,0x9c9971a1,0xa1e358a9,0x5bc2db17,0xa8d9bc82,0x45b4862e,0x2201e052,0x70ebfbfb,0x92871591,0xafdf64c7,0xb42d0219,0xea5bcae6,0x2ad8f03c,0xde536c55,0xa76aa33c,0xcd6c3f4d .long 0x0bca6de3,0xbeb5f623,0xb1e706fd,0xdd20dd99,0xac9059d4,0x90b3ff9d,0x7ccccc4e,0x2d7b2902,0xce98840f,0x8a090a59,0x8410680a,0xa5d947e0,0x923379a5,0x49ae346a,0xb28a3156,0x7dbc84f9 .long 0x54a1aff2,0xfd40d916,0x3a78fb9b,0xabf318ba,0x3029f95e,0x50152ed8,0xc58ad7fa,0x9fc1dd77,0x13595c17,0x5fa57915,0x8f62b3a9,0xb9504668,0xff3055b0,0x907b5b24,0x9a84f125,0x2e995e35 .long 0x7e9bbcfb,0x87dacf69,0xe86d96e3,0x95d0c1d6,0x2d95a75c,0x65726e3c,0xacd27f21,0x2c3c9001,0x6c973f57,0x1deab561,0xa5221643,0x108b7e2c,0xc4ef79d4,0x5fee9859,0x40d4b8c6,0xbd62b88a .long 0x197c75d6,0xb4dd29c4,0xb7076feb,0x266a6df2,0x4bf2df11,0x9512d0ea,0x6b0cc9ec,0x1320c24f,0x01a59596,0x6bb1e0e1,0xeff9aaac,0x8317c5bb,0x385aa6c9,0x65bb405e,0x8f07988f,0x613439c1 .long 0x16a66e91,0xd730049f,0xfa1b0e0d,0xe97f2820,0x304c28ea,0x4131e003,0x526bac62,0x820ab732,0x28714423,0xb2ac9ef9,0xadb10cb2,0x54ecfffa,0xf886a4cc,0x8781476e,0xdb2f8d49,0x4b2c87b5 .long 0x0a44295d,0xe857cd20,0x58c6b044,0x707d7d21,0xf596757c,0xae8521f9,0x67b2b714,0x87448f03,0x5ebcd58d,0x13a9bc45,0x9122d3c1,0x79bcced9,0x9e076642,0x3c644247,0x2df4767d,0x0cf22778 .long 0x71d444b6,0x5e61aee4,0xc5084a1d,0x211236bf,0x4fd3eaf6,0x7e15bc9a,0xab622bf5,0x68df2c34,0x59bf4f36,0x9e674f0f,0xd7f34d73,0xf883669b,0x31497b1d,0xc48ac1b8,0x5106703b,0x323b925d .long 
0x74082008,0x22156f42,0xc8482bcb,0xeffc521a,0x12173479,0x5c6831bf,0xc4739490,0xcaa2528f,0x8f1b3c4d,0x84d2102a,0x2d9bec0d,0xcf64dfc1,0x78a546ef,0x433febad,0x7b73cef1,0x1f621ec3 .long 0x37338615,0x6aecd627,0x01d8edf6,0x162082ab,0x19e86b66,0x833a8119,0xd299b5db,0x6023a251,0xbbf04b89,0xf5bb0c3a,0xae749a44,0x6735eb69,0x4713de3b,0xd0e058c5,0x2c3d4ccd,0xfdf2593e .long 0xfdd23667,0x1b8f414e,0xfa2015ee,0xdd52aaca,0xbd9625ff,0x3e31b517,0x8db5918c,0x5ec9322d,0xa96f5294,0xbc73ac85,0x61a0666a,0x82aa5bf3,0xbf08ac42,0x49755810,0x891cedfc,0xd21cdfd5 .long 0x67f8be10,0x918cb57b,0x56ffa726,0x365d1a7c,0x6532de93,0x2435c504,0x2674cd02,0xc0fc5e10,0x9cbbb142,0x6e51fcf8,0xafc50692,0x1d436e5a,0x3fbcae22,0x766bffff,0xfd55d3b8,0x3148c2fd .long 0x233222fa,0x52c7fdc9,0xe419fb6b,0x89ff1092,0x25254977,0x3cd6db99,0x1cf12ca7,0x2e85a161,0xdc810bc9,0xadd2547c,0x9d257c22,0xea3f458f,0x27d6b19b,0x642c1fbe,0x140481a6,0xed07e6b5 .long 0x86d2e0f8,0x6ada1d42,0x0e8a9fd5,0xe5920122,0x708c1b49,0x02c936af,0x2b4bfaff,0x60f30fee,0x858e6a61,0x6637ad06,0x3fd374d0,0xce4c7767,0x7188defb,0x39d54b2d,0xf56a6b66,0xa8c9d250 .long 0xb24fe1dc,0x58fc0f5e,0x6b73f24c,0x9eaf9dee,0x33650705,0xa90d588b,0xaf2ec729,0xde5b62c5,0xd3c2b36e,0x5c72cfae,0x034435da,0x868c19d5,0xe17ee145,0x88605f93,0x77a5d5b1,0xaa60c4ee .long 0x3b60c472,0xbcf5bfd2,0xeb1d3049,0xaf4ef13c,0xe13895c9,0x373f44fc,0x0cbc9822,0xf29b382f,0x73efaef6,0x1bfcb853,0xa8c96f40,0xcf56ac9c,0x7a191e24,0xd7adf109,0xbf8a8dc2,0x98035f44 .long 0x1e750c84,0xf40a71b9,0x5dc6c469,0xc57f7b0c,0x6fbc19c1,0x49a0e79c,0xa48ebdb8,0x6b0f5889,0xa07c4e9f,0x5d3fd084,0xab27de14,0xc3830111,0x33e08dcc,0x0e4929fe,0x40bb73a3,0xf4a5ad24 .long 0x490f97ca,0xde86c2bf,0x67a1ce18,0x288f09c6,0x1844478d,0x364bb886,0xceedb040,0x7840fa42,0x5a631b37,0x1269fdd2,0xa47c8b7d,0x94761f1e,0x481c6266,0xfc0c2e17,0x3daa5fa7,0x85e16ea2 .long 0x92491048,0xccd86033,0xf4d402d7,0x0c2f6963,0xdf6a865c,0x6336f7df,0xb5c02a87,0x0a2a463c,0xbf2f12ee,0xb0e29be7,0x66bad988,0xf0a22002,0x9123c1d7,0x27f87e03,0x328a8c98,0x21669c55 .long 0x92f14529,0x186b9803,0x63954df3,0xd3d056cc,0x175a46f6,0x2f03fd58,0x11558558,0x63e34ebe,0x5b80cfa5,0xe13fedee,0xd401dbd1,0xe872a120,0xe8a9d667,0x52657616,0xe08d6693,0xbc8da4b6 .long 0x1b703e75,0x370fb9bb,0xd4338363,0x6773b186,0xecef7bff,0x18dad378,0x995677da,0xaac787ed,0x0437164b,0x4801ea8b,0x73fe795e,0xf430ad20,0x8ee5eb73,0xb164154d,0x108f7c0e,0x0884ecd8 .long 0x5f520698,0x0e6ec096,0x44f7b8d9,0x640631fe,0xa35a68b9,0x92fd34fc,0x4d40cf4e,0x9c5a4b66,0x80b6783d,0x949454bf,0x3a320a10,0x80e701fe,0x1a0a39b2,0x8d1a564a,0x320587db,0x1436d53d .long 0x6556c362,0xf5096e6d,0xe2455d7e,0xbc23a3c0,0x807230f9,0x3a7aee54,0x22ae82fd,0x9ba1cfa6,0x99c5d706,0x833a057a,0x842315c9,0x8be85f4b,0x66a72f12,0xd083179a,0xcdcc73cd,0x2fc77d5d .long 0x5616ee30,0x22b88a80,0xe7ab1083,0xfb09548f,0x511270cd,0x8ad6ab0d,0x6924d9ab,0x61f6c57a,0x90aecb08,0xa0f7bf72,0x0df784a4,0x849f87c9,0xcfaf1d03,0x27c79c15,0xc463face,0xbbf9f675 .long 0x765ba543,0x91502c65,0x42ea60dd,0x18ce3cac,0x6e43ecb3,0xe5cee6ac,0x68f2aeeb,0x63e4e910,0xc85932ee,0x26234fa3,0x4c90c44d,0x96883e8b,0xa18a50f6,0x29b9e738,0x3f0420df,0xbfc62b2a .long 0x6d3e1fa9,0xd22a7d90,0xfe05b8a3,0x17115618,0xbb2b9c01,0x2a0c9926,0xe07e76a2,0xc739fcc6,0x165e439a,0x540e9157,0x6a9063d8,0x06353a62,0x61e927a3,0x84d95594,0xe2e0be7f,0x013b9b26 .long 0x973497f1,0x4feaec3b,0x093ebc2d,0x15c0f94e,0x33af0583,0x6af5f227,0xc61f3340,0x0c2af206,0x4457397c,0xd25dbdf1,0xcabcbae0,0x2e8ed017,0xc2815306,0xe3010938,0xe8c6cd68,0xbaa99337 .long 
0x3b0ec7de,0x08513182,0x58df05df,0x1e1b822b,0xa5c3b683,0x5c14842f,0x3eba34ce,0x98fe977e,0x0d5e8873,0xfd2316c2,0xbd0d427d,0xe48d839a,0x623fc961,0x495b2218,0xb46fba5e,0x24ee56e7 .long 0x91e4de58,0x9184a55b,0xdfdea288,0xa7488ca5,0xa8dcc943,0xa723862e,0x849dc0fc,0x92d762b2,0x091ff4a9,0x3c444a12,0x0cada274,0x581113fa,0x30d8eae2,0xb9de0a45,0xdf6b41ea,0x5e0fcd85 .long 0xc094dbb5,0x6233ea68,0xd968d410,0xb77d062e,0x58b3002d,0x3e719bbc,0x3dc49d58,0x68e7dd3d,0x013a5e58,0x8d825740,0x3c9e3c1b,0x21311747,0x7c99b6ab,0x0cb0a2a7,0xc2f888f2,0x5c48a3b3 .long 0x991724f3,0xc7913e91,0x39cbd686,0x5eda799c,0x63d4fc1e,0xddb595c7,0xac4fed54,0x6b63b80b,0x7e5fb516,0x6ea0fc69,0xd0f1c964,0x737708ba,0x11a92ca5,0x9628745f,0x9a86967a,0x61f37958 .long 0xaa665072,0x9af39b2c,0xefd324ef,0x78322fa4,0xc327bd31,0x3d153394,0x3129dab0,0x81d5f271,0xf48027f5,0xc72e0c42,0x8536e717,0xaa40cdbc,0x2d369d0f,0xf45a657a,0xea7f74e6,0xb03bbfc4 .long 0x0d738ded,0x46a8c418,0xe0de5729,0x6f1a5bb0,0x8ba81675,0xf10230b9,0x112b33d4,0x32c6f30c,0xd8fffb62,0x7559129d,0xb459bf05,0x6a281b47,0xfa3b6776,0x77c1bd3a,0x7829973a,0x0709b380 .long 0xa3326505,0x8c26b232,0xee1d41bf,0x38d69272,0xffe32afa,0x0459453e,0x7cb3ea87,0xce8143ad,0x7e6ab666,0x932ec1fa,0x22286264,0x6cd2d230,0x6736f8ed,0x459a46fe,0x9eca85bb,0x50bf0d00 .long 0x877a21ec,0x0b825852,0x0f537a94,0x300414a7,0x21a9a6a2,0x3f1cba40,0x76943c00,0x50824eee,0xf83cba5d,0xa0dbfcec,0x93b4f3c0,0xf9538148,0x48f24dd7,0x61744162,0xe4fb09dd,0x5322d64d .long 0x3d9325f3,0x57447384,0xf371cb84,0xa9bef2d0,0xa61e36c5,0x77d2188b,0xc602df72,0xbbd6a7d7,0x8f61bc0b,0xba3aa902,0x6ed0b6a1,0xf49085ed,0xae6e8298,0x8bc625d6,0xa2e9c01d,0x832b0b1d .long 0xf1f0ced1,0xa337c447,0x9492dd2b,0x800cc793,0xbea08efa,0x4b93151d,0xde0a741e,0x820cf3f8,0x1c0f7d13,0xff1982dc,0x84dde6ca,0xef921960,0x45f96ee3,0x1ad7d972,0x29dea0c7,0x319c8dbe .long 0x7b82b99b,0xd3ea3871,0x470eb624,0x75922d4d,0x3b95d466,0x8f66ec54,0xbee1e346,0x66e673cc,0xb5f2b89a,0x6afe67c4,0x290e5cd3,0x3de9c1e6,0x310a2ada,0x8c278bb6,0x0bdb323b,0x420fa384 .long 0x0eb919b0,0x0ae1d63b,0xa74b9620,0xd74ee51d,0xa674290c,0x395458d0,0x4620a510,0x324c930f,0xfbac27d4,0x2d1f4d19,0x9bedeeac,0x4086e8ca,0x9b679ab8,0x0cdd211b,0x7090fec4,0x5970167d .long 0xfaf1fc63,0x3420f2c9,0x328c8bb4,0x616d333a,0x57f1fe4a,0x7d65364c,0x55e5c73a,0x9343e877,0xe970e78c,0x5795176b,0x60533627,0xa36ccebf,0x09cdfc1b,0xfc7c7380,0xb3fec326,0xb39a2afe .long 0x6224408a,0xb7ff1ba1,0x247cfc5e,0xcc856e92,0xc18bc493,0x01f102e7,0x2091c727,0x4613ab74,0xc420bf2b,0xaa25e89c,0x90337ec2,0x00a53176,0x7d025fc7,0xd2be9f43,0x6e6fe3dc,0x3316fb85 .long 0x9ac50814,0x27520af5,0x9a8e4223,0xfdf95e78,0x56bec5a0,0xb7e7df2a,0xdf159e5d,0xf7022f7d,0xcac1fe8f,0x93eeeab1,0x37451168,0x8040188c,0xd967dce6,0x7ee8aa8a,0x3abc9299,0xfa0e79e7 .long 0x2064cfd1,0x67332cfc,0xb0651934,0x339c31de,0x2a3bcbea,0x719b28d5,0x9d6ae5c6,0xee74c82b,0xbaf28ee6,0x0927d05e,0x9d719028,0x82cecf2c,0xddb30289,0x0b0d353e,0xfddb2e29,0xfe4bb977 .long 0x640bfd9e,0xbb5bb990,0x82f62108,0xd226e277,0x02ffdd56,0x4bf00985,0x2ca1b1b5,0x7756758a,0x5285fe91,0xc32b62a3,0x8c9cd140,0xedbc546a,0xaf5cb008,0x1e47a013,0x073ce8f2,0xbca7e720 .long 0x17a91cae,0xe10b2ab8,0x08e27f63,0xb89aab65,0xdba3ddf9,0x7b3074a7,0x330c2972,0x1c20ce09,0x5fcf7e33,0x6b9917b4,0x945ceb42,0xe6793743,0x5c633d19,0x18fc2215,0xc7485474,0xad1adb3c .long 0x6424c49b,0x646f9679,0x67c241c9,0xf888dfe8,0x24f68b49,0xe12d4b93,0xa571df20,0x9a6b62d8,0x179483cb,0x81b4b26d,0x9511fae2,0x666f9632,0xd53aa51f,0xd281b3e4,0x7f3dbd16,0x7f96a765 .long 
0x074a30ce,0xa7f8b5bf,0x005a32e6,0xd7f52107,0x50237ed4,0x6f9e0907,0x8096fa2b,0x2f21da47,0xeec863a0,0xf3e19cb4,0x9527620a,0xd18f77fd,0x407c1cf8,0x9505c81c,0x1b6ec284,0x9998db4e .long 0xc247d44d,0x7e3389e5,0x3f4f3d80,0x12507141,0x4a78a6c7,0xd4ba0110,0x767720be,0x312874a0,0x75944370,0xded059a6,0x3b2c0bdd,0xd6123d90,0x51c108e3,0xa56b717b,0x070623e9,0x9bb7940e .long 0x84ac066c,0x794e2d59,0xe68c69a0,0xf5954a92,0x4fd99dcc,0x28c52458,0xb1012517,0x60e639fc,0x7de79248,0xc2e60125,0xf12fc6d7,0xe9ef6404,0x2a3b5d32,0x4c4f2808,0xc768eb8a,0x865ad32e .long 0x13fb70b6,0xac02331b,0x95599b27,0x037b44c1,0x60bd082c,0x1a860fc4,0xc980cd01,0xa2e25745,0x1da0263e,0xee3387a8,0x2d10f3d6,0x931bfb95,0xa1f24a32,0x5b687270,0xca494b86,0xf140e65d .long 0xb2f1ac7a,0x4f4ddf91,0x760fee27,0xf99eaabb,0x49c228e5,0x57f4008a,0x1cf713bb,0x090be440,0x5004f022,0xac91fbe4,0x569e1af6,0xd838c2c2,0x0f1daaa5,0xd6c7d20b,0x1bbb02c0,0xaa063ac1 .long 0x59558a78,0x0938a422,0x8435da2f,0x5343c669,0x034410dc,0x96f67b18,0x84510804,0x7cc1e424,0x16dfbb7d,0x86a1543f,0x5b5bd592,0x921fa942,0xb33dd03c,0x9dcccb6e,0xb843f51e,0x8581ddd9 .long 0x81d73c9e,0x54935fcb,0x0a5e97ab,0x6d07e979,0xcf3a6bab,0x4dc7b30a,0x170bee11,0x147ab1f3,0x9fafdee4,0x0aaf8e3d,0x538a8b95,0xfab3dbcb,0x6ef13871,0x405df4b3,0x088d5a49,0xf1f4e9cb .long 0x66b33f1d,0x9bcd24d3,0x5ce445c0,0x3b97b820,0xba93ff61,0xe2926549,0x4dafe616,0xd9c341ce,0x16efb6f3,0xfb30a76e,0x605b953c,0xdf24b8ca,0xc2fffb9f,0x8bd52afe,0xe19d0b96,0xbbac5ff7 .long 0x459afccd,0x43c01b87,0xb7432652,0x6bd45143,0x55b5d78e,0x84734530,0x1554ba7d,0x81088fdb,0x1e269375,0xada0a52c,0x2dc5ec10,0xf9f037c4,0x94bfbc11,0xc0660607,0xc9c40d2f,0xc0a630bb .long 0xab64c31e,0x5efc797e,0x74507144,0xffdb1dab,0x1ca6790c,0xf6124287,0xe69bf1bf,0xe9609d81,0x00d24fc9,0xdb898595,0xe51fb417,0x9c750333,0xfef7bbde,0x51830a91,0x945f585c,0x0ce67dc8 .long 0x4763eb50,0x9a730ed4,0xc1ab0d66,0x24a0e221,0x648748f3,0x643b6393,0x6d3c6291,0x1982daa1,0x8bbc5549,0x6f00a9f7,0x7f36384e,0x7a1783e1,0xde977f50,0xe8346323,0xb245502a,0x91ab688d .long 0x6d0bdd66,0x331ab6b5,0x64b71229,0x0a6ef32e,0xfe7c352f,0x1028150e,0xce7b39d3,0x27e04350,0xc1070c82,0x2a3c8acd,0x80c9feef,0xfb2034d3,0x709f3729,0x2d729621,0x62cb4549,0x8df290bf .long 0xfc2e4326,0x02f99f33,0x5eddf032,0x3b30076d,0x0c652fb5,0xbb21f8cf,0xed91cf7b,0x314fb49e,0x2f700750,0xa013eca5,0x712a4575,0x2b9e3c23,0xaf30fbb0,0xe5355557,0x7c77e771,0x1ada3516 .long 0x7b135670,0x45f6ecb2,0x7cfc202e,0xe85d19df,0x58d1be9f,0x0f1b50c7,0xead2e344,0x5ebf2c0a,0xabc199c9,0x1531fe4e,0x56bab0ae,0xc7032592,0x6c1fec54,0x16ab2e48,0x04280188,0x0f87fda8 .long 0x609e4a74,0xdc9f46fc,0xba667f91,0x2a44a143,0xb4d83436,0xbc3d8b95,0xc7bd2958,0xa01e4bd0,0x73483c90,0x7b182932,0xa7c7b598,0xa79c6aa1,0xeaaac07e,0xbf3983c6,0x96e0d4e6,0x8f18181e .long 0x051af62b,0x8553d37c,0x0bf94496,0xe9a998eb,0xb0d59aa1,0xe0844f9f,0xe6afb813,0x983fd558,0x65d69804,0x9670c0ca,0x6ea5ff2d,0x732b22de,0x5fd8623b,0xd7640ba9,0xa6351782,0x9f619163 .long 0xacee5043,0x0bfc27ee,0x2eb10f02,0xae419e73,0x8943fb05,0x19c028d1,0xff13aa2a,0x71f01cf7,0x8887a132,0x7790737e,0x66318410,0x67513309,0x7ddb795e,0x9819e8a3,0xdad100b2,0xfecb8ef5 .long 0x3021926a,0x59f74a22,0x6f9b4c1c,0xb7c28a49,0x912ad0ab,0xed1a733f,0x01a5659c,0x42a910af,0x7bd68cab,0x3842c6e0,0x76d70ac8,0x2b57fa38,0x3c53aaeb,0x8a6707a8,0x65b4db18,0x62c1c510 .long 0xb2d09dc7,0x8de2c1fb,0x266bd23b,0xc3dfed12,0xd5b27db6,0x927d039b,0x103243da,0x2fb2f0f1,0x80be7399,0xf855a07b,0x1f9f27a8,0xed9327ce,0x729bdef7,0xa0bd99c7,0x28250d88,0x2b67125e .long 
0x8670ced7,0x784b26e8,0xc31bd3b4,0xe3dfe41f,0xbcc85cbc,0x9e353a06,0x60178a9d,0x302e2909,0xa6eac16e,0x860abf11,0xaa2b3aac,0x76447000,0x850afdab,0x46ff9d19,0xfdb2d4c1,0x35bdd6a5 .long 0x7e5c9ce9,0xe82594b0,0x20af346e,0x0f379e53,0xbc65ad4a,0x608b31e3,0x267c4826,0x710c6b12,0x71954cf1,0x51c966f9,0x0d0aa215,0xb1cec793,0x86bd23a8,0x1f155989,0xf9452e86,0xae2ff99c .long 0x340ceaa2,0xd8dd953c,0x2e2e9333,0x26355275,0x8586f06d,0x15d4e5f9,0xf7cab546,0xd6bf94a8,0xb76a9af0,0x33c59a0a,0xba095af7,0x52740ab3,0x24389ca0,0xc444de8a,0x706da0cb,0xcc6f9863 .long 0x6b2515cf,0xb5a741a7,0x9585c749,0x71c41601,0xe683de97,0x78350d4f,0x63d0b5f5,0x31d61524,0xfbce090b,0x7a0cc5e1,0xfbcb2a5b,0xaac927ed,0x20d84c35,0xe920de49,0x22b4de26,0x8c06a0b6 .long 0xafe7ddf3,0xd34dd58b,0xc1e6e55b,0x55851fed,0x960696e7,0xd1395616,0x5f22705f,0x940304b2,0xb0a2a860,0x6f43f861,0x0e7cc981,0xcf121282,0x0ab64a96,0x12186212,0xb789383c,0x09215b9a .long 0x37387c09,0x311eb305,0xf03ee760,0xc5832fce,0x32f7ea19,0x30358f58,0x91d53551,0xe01d3c34,0xda48ea80,0x1ca5ee41,0xcf4fa4c1,0x34e71e8e,0x7af1e1c7,0x312abd25,0x2153f4a5,0xe3afcdeb .long 0x00235e9a,0x9d5c84d7,0x8c4c836f,0x0308d3f4,0x89332de5,0xc0a66b04,0x89e566ef,0x610dd399,0xd1ac1635,0xf8eea460,0x20a2c0df,0x84cbb3fb,0xe74a48c5,0x40afb488,0xd326b150,0x29738198 .long 0xa6d74081,0x2a17747f,0x55a26214,0x60ea4c05,0x1f88c5fe,0x53514bb4,0x7e83426c,0xedd64567,0x96460b25,0xd5d6cbec,0x68dc115e,0xa12fd0ce,0x697840ea,0xc5bc3ed2,0xa6331e31,0x969876a8 .long 0x472ff580,0x60c36217,0x4ad41393,0xf4229705,0xa03b8b92,0x4bd99ef0,0xc144f4f6,0x501c7317,0x18464945,0x159009b3,0x74c5c6be,0x6d5e594c,0x321a3660,0x2d587011,0x3898d022,0xd1e184b1 .long 0x4c6a7e04,0x5ba04752,0x45550b65,0x47fa1e2b,0x48c0a9a5,0x9419daf0,0x7c243236,0x66362953,0x5cb12a88,0xcd0744b1,0x2b646188,0x561b6f9a,0x66c2c0c0,0x599415a5,0x0f83f09a,0xbe3f0859 .long 0xb92041b8,0x9141c5be,0x26477d0d,0x01ae38c7,0xd12c7a94,0xca8b71f3,0x765c70db,0xfab5b31f,0x487443e9,0x76ae7492,0x990d1349,0x8595a310,0x7d460a37,0xf8dbeda8,0x1e45a38f,0x7f7ad082 .long 0x1059705a,0xed1d4db6,0xe6b9c697,0xa3dd492a,0x6eb38bd5,0x4b92ee3a,0x67cc0bb7,0xbab2609d,0x6e70ee82,0x7fc4fe89,0x13e6b7e3,0xeff2c56e,0x34d26fca,0x9b18959e,0x889d6b45,0x2517ab66 .long 0xbdefdd4f,0xf167b4e0,0xf366e401,0x69958465,0xa73bbec0,0x5aa368ab,0x7b240c21,0x12148709,0x18969006,0x378c3233,0xe1fe53d1,0xcb4d73ce,0x130c4361,0x5f50a80e,0x7ef5212b,0xd67f5951 .long 0x9e70c72e,0xf145e21e,0x5566d2fb,0xb2e52e29,0x032397f5,0x44eaba4a,0x7e31a7de,0x5e56937b,0x456c61e1,0x68dcf517,0xa8b0a388,0xbc2e954a,0x60a8b755,0xe3552fa7,0x73ad0cde,0x03442dae .long 0xceb26210,0x37ffe747,0x787baef9,0x983545e8,0x86a3de31,0x8b8c8535,0xfacd46db,0xc621dbcb,0x59266fbb,0x82e442e9,0x339d471c,0xa3514c37,0x62cdad96,0x3a11b771,0xecf9bdf0,0xf0cb3b3c .long 0x478e2135,0x3fcbdbce,0xbda35342,0x7547b5cf,0x8a677af6,0xa97e81f1,0x28817987,0xc8c2bf83,0x45580985,0xdf07eaaf,0xc93b45cb,0xc68d1f05,0xc77b4cac,0x106aa2fe,0x04a7ae86,0x4c1d8afc .long 0x9eb45ab2,0xdb41c3fd,0xd4b22e74,0x5b234b5b,0xf215958a,0xda253dec,0xa04edfa0,0x67e0606e,0xef751b11,0xabbbf070,0xf6f06dce,0xf352f175,0x6839f6b4,0xdfc4b6af,0x9959848e,0x53ddf9a8 .long 0xc21520b0,0xda49c379,0xdbd5d1b6,0x90864ff0,0x5f49c7f7,0x2f055d23,0xa796b2d8,0xe51e4e6a,0x5c9dc340,0xc361a67f,0xbca7c620,0x5ad53c37,0x32c756d0,0xda1d6588,0x8bb67e13,0xad60d911 .long 0x0eeec8c6,0xd6c47bdf,0x078a1821,0x4a27fec1,0xc3099524,0x081f7415,0x82cd8060,0x8effdf0b,0x65842df8,0xdb70ec1c,0xd319a901,0x8821b358,0xde42b529,0x72ee56ee,0x236e4286,0x5bb39592 .long 
0xfd6f7140,0xd1183316,0xbd8e81f7,0xf9fadb5b,0x5a02d962,0x701d5e0c,0x1b601324,0xfdee4dbf,0x35d7620e,0xbed17407,0xf48c0012,0x04e3c2c3,0x3455449a,0x9ee29da7,0x91a836c4,0x562cdef4 .long 0x47701097,0x8f682a5f,0xff88d0c2,0x617125d8,0x57bb86dd,0x948fda24,0x289f7286,0x348abb8f,0x99d94bbd,0xeb10eab5,0x4684d160,0xd51ba28e,0x30c8f41a,0xabe0e51c,0x13254f4a,0x66588b45 .long 0xfad097a5,0x147ebf01,0x610e815d,0x49883ea8,0x8a11de56,0xe44d60ba,0x827a7a6d,0xa970de6e,0x5e17fc19,0x2be41424,0x01214057,0xd833c657,0x363e723f,0x1375813b,0xe6a52e9b,0x6820bb88 .long 0xd875d56a,0x7e7f6970,0x51fbf6bf,0xd6a0a9ac,0xa3083c12,0x54ba8790,0x6ae7eb64,0xebaeb23d,0xb99a907a,0xa8685c3a,0x026bf40b,0xf1e74550,0xc802cd9e,0x7b73a027,0x4fef4635,0x9a8a927c .long 0x08191224,0xe1b6f60c,0xde4ec091,0xc4126ebb,0x4ae38d84,0xe1dff4dc,0x4f2ef985,0xde3f57db,0xd446a1dd,0x34964337,0x859e77f6,0x7bf217a0,0x8e1d13f5,0x8ff10527,0x74eeae27,0xa304ef03 .long 0xd19dfa5a,0xfc6f5e47,0x7fad982b,0xdb007de3,0x613715f5,0x28205ad1,0x7889529e,0x251e6729,0x1ae98e78,0x72705184,0x271cac32,0xf818537d,0xb7f410f5,0xc8a15b7e,0x81f62393,0xc474356f .long 0xc242316b,0x92dbdc5a,0xdbf4aff5,0xabe060ac,0x909a8ec6,0x6e8c38fe,0x6116cb94,0x43e514e5,0x07d784f9,0x2078fa38,0xf4b5b357,0x1161a880,0x13adea3d,0x5283ce79,0xcc6a910b,0x0756c3e6 .long 0xaaa79697,0x60bcfe01,0x56391db1,0x04a73b29,0x189b45a0,0xdd8dad47,0x48d5b8d9,0xbfac0dd0,0x7d3d2ec2,0x34ab3af5,0x207bd3af,0x6fa2fc2d,0x66550ded,0x9ff40092,0x1fd5b913,0x719b3e87 .long 0x6d17fbc7,0xa573a496,0x73d2b24e,0x0cd1a70a,0xb2676937,0x34e2c5ca,0xbf669f21,0xe7050b06,0x1ede9046,0xfbe948b6,0x97662659,0xa0530051,0xf10124c5,0x58cbd4ed,0xdd6c06c8,0xde2646e4 .long 0x8cad38c0,0x332f8108,0x6bd68ae2,0x471b7e90,0x0d8e27a3,0x56ac3fb2,0x136b4b0d,0xb54660db,0xa6fd8de4,0x123a1e11,0xa37799ef,0x44dbffea,0xce6ac17c,0x4540b977,0xaf60acef,0x495173a8 .long 0x391c2a82,0x9ebb284d,0x158308e8,0xbcdd4863,0x83f1edca,0x006f16ec,0x695dc6c8,0xa13e2c37,0x4a057a87,0x2ab756f0,0xa6b48f98,0xa8765500,0x68651c44,0x4252face,0xe1765e02,0xa52b540b .long 0x16a0d2bb,0x4f922fc5,0x1a623499,0x0d5cc16c,0x57c62c8b,0x9241cf3a,0xfd1b667f,0x2f5e6961,0xf5a01797,0x5c15c70b,0x60956192,0x3d20b44d,0x071fdb52,0x04911b37,0x8d6f0f7b,0xf648f916 .long 0xe60b7cf7,0x6dc1acaf,0x84a9d869,0x25860a50,0xe7ba8ac4,0x56fc6f09,0x6148d29e,0x828c5bd0,0xdc55ae5f,0xac6b435e,0xc0117411,0xa527f56c,0xfd24342c,0x94d5045e,0x70b67c0d,0x2c4c0a35 .long 0xfac61d9a,0x027cc8b8,0xe3c6fe8a,0x7d25e062,0xe5bff503,0xe08805bf,0x6ff632f7,0x13271e6c,0x232f76a5,0x55dca6c0,0x701ef426,0x8957c32d,0xa10a5178,0xee728bcb,0xb62c5173,0x5ea60411 .long 0xd0b8892b,0xfc4e964e,0x9301bb74,0x9ea17683,0xfcc48626,0x6265c5ae,0xbb3e9102,0xe60cf82e,0xd4df5531,0x57adf797,0x8deeefe2,0x235b59a1,0x3f306eb1,0x60adcf58,0x3d09492d,0x105c2753 .long 0xb5def996,0x4090914b,0x233dd1e7,0x1cb69c83,0x9b3d5e76,0xc1e9c1d3,0xfccf6012,0x1f3338ed,0x2f5378a8,0xb1e95d0d,0x2f00cd21,0xacf4c2c7,0xeb5fe290,0x6e984240,0x248088ae,0xd66c038d .long 0xf94d70cf,0x804d264a,0x7314bf7e,0xbdb802ef,0x4333ed02,0x8fb54de2,0x285635d9,0x740461e0,0x365e9383,0x4113b2c8,0x3fdef652,0xea762c83,0x47b956c1,0x4eec6e2e,0x65620fa4,0xa3d814be .long 0xb4d8bc50,0x9ad5462b,0xa9195770,0x181c0b16,0x78412a68,0xebd4fe1c,0xc0dff48c,0xae0341bc,0x7003e866,0xb6bc45cf,0x8a24a41b,0xf11a6dea,0xd04c24c2,0x5407151a,0xda5b7b68,0x62c9d27d .long 0x88cceff6,0x2e964235,0x8b07ed69,0x8594c54f,0xc84d0d0d,0x1578e73c,0xff532868,0x7b4e1055,0xb5ec995a,0xa348c0d5,0x14289a54,0xbf4b9d55,0x58fbd777,0x9ba155a6,0x1a84491d,0x186ed7a8 .long 
0x614c0900,0xd4992b30,0xbd00c24b,0xda98d121,0x7ec4bfa1,0x7f534dc8,0x37dc34bc,0x4a5ff674,0x1d7ea1d7,0x68c196b8,0x80a6d208,0x38cf2893,0xe3cbbd6e,0xfd56cd09,0x4205a5b6,0xec72e27e .long 0xa44f77f7,0x15ea68f5,0xb43c52bc,0x7aa5f9fd,0x94f0e609,0x86ff676f,0x2e2d432b,0xa4cde963,0xeee470af,0x8cafa0c0,0x8a3f5ec8,0x84137d0e,0xfaa31231,0xebb40411,0x6f7f7ccf,0xa239c13f .long 0xa8afd30b,0x32865719,0x8a826dce,0x86798328,0xc4a8fbe0,0xdf04e891,0xebf56ad3,0xbb6b6e1b,0x471f1ff0,0x0a695b11,0xbe15baf0,0xd76c3389,0xbe96c43e,0x018edb95,0x90794158,0xf2beaaf4 .long 0xc3076a27,0x152db09e,0xe416545d,0x5e82908e,0x356d6f2e,0xa2c41272,0x31fd74e1,0xdc9c9642,0x519bf615,0x66ceb88d,0x05a2274e,0xe29ecd76,0xbf5e2fa0,0x3a0473c4,0x64284e67,0x6b6eb671 .long 0xb88756dd,0xe8b97932,0xf17e3e61,0xed4e8652,0x3ee1c4a4,0xc2dd1499,0x597f8c0e,0xc0aaee17,0x6c168af3,0x15c4edb9,0xb39ae875,0x6563c7bf,0x20adb436,0xadfadb6f,0x9a042ac0,0xad55e8c9 .long 0xb76da1f5,0x975a1ed8,0xa58acb94,0x10dfa466,0xac060282,0x8dd7f7e3,0x572a051e,0x6813e66a,0x350cb901,0xb4ccae1e,0x50cb7822,0xb653d656,0xdfab3b87,0x42484710,0x9b670fd0,0xcd7ee537 .long 0x523b8bf6,0x0a50b12e,0x8f910c1b,0x8009eb5b,0x4a167588,0xf535af82,0xfb2a2abd,0x0f835f9c,0x2afceb62,0xf59b2931,0x169d383f,0xc797df2a,0x66ac02b0,0xeb3f5fb0,0xdaa2d0ca,0x029d4c6f .long 0xafab4bc5,0xd4059bc1,0x56783247,0x833f5c6f,0x8d2d3605,0xb5346630,0xd34d8433,0x83387891,0xadd9419a,0xd973b30f,0xafe3fce8,0xbcca1099,0x0809aac6,0x08178315,0x540f0f11,0x01b7f21a .long 0x909523c8,0x65c29219,0xa3a1c741,0xa62f648f,0x60c9e55a,0x88598d4f,0x0e4f347a,0xbce9141b,0x35f9b988,0x9af97d84,0x320475b6,0x0210da62,0x9191476c,0x3c076e22,0x44fc7834,0x7520dbd9 .long 0xc1ab1bbd,0x6a6b2cfe,0xdc650938,0xef8a65be,0x805d7bc4,0x72855540,0xed11fdfd,0xda389396,0x74660876,0xa9d5bd36,0xb45dff35,0x11d67c54,0xa4f5da94,0x6af7d148,0xc0bbeb31,0xbb8d4c3f .long 0xe0a1b12a,0x87a7ebd1,0x770ba95f,0x1e4ef88d,0xdc2ae9cb,0x8c33345c,0x01cc8403,0xcecf1276,0x1b39b80f,0x687c012e,0x35c33ba4,0xfd90d0ad,0x5c9661c2,0xa3ef5a67,0xe017429e,0x368fc88e .long 0x196a2fa2,0xd30c6761,0xbd5b312e,0x931b9817,0x72f54a31,0xba01000c,0x66eaa541,0xa203d2c8,0x98939db3,0xf2abdee0,0x3e606c02,0xe37d6c2c,0x521ff643,0xf2921574,0xd7e2fca3,0x2781b3c4 .long 0x7850ec06,0x664300b0,0x7d3a10cf,0xac5a38b9,0xe34ab39d,0x9233188d,0x5072cbb9,0xe77057e4,0xb59e78df,0xbcf0c042,0x1d97de52,0x4cfc91e8,0x3ee0ca4a,0x4661a26c,0xfb8507bc,0x5620a4c1 .long 0x049f842c,0x4b44d4aa,0x1540e82b,0xceabc5d5,0x15c6f156,0x306710fd,0x63db1d72,0xbe5ae52b,0x334957f1,0x06f1e7e6,0x31144a70,0x57e388f0,0xdf96447b,0xfb69bb2f,0x73e38a12,0x0f78ebd3 .long 0x2b7ce542,0xb8222605,0x7472bde1,0xe6d4ce99,0x09d2f4da,0x53e16ebe,0x53b92b2e,0x180ff42e,0x2c34a1c6,0xc59bcc02,0x422c46c2,0x3803d6f9,0x5c14a8a2,0x18aff74f,0x10a08b28,0x55aebf80 .long 0x7135593f,0x66097d58,0x2be570cd,0x32e6eff7,0x2a8c860d,0x584e6a10,0xa2eb4163,0xcd185890,0x6d97e134,0x7ceae99d,0xdd8447ce,0xd42c6b70,0xb8c50273,0x59ddbb4a,0x3cf34e1e,0x03c612df .long 0x04b6c5a0,0x84b9ca15,0x18f0e3a3,0x35216f39,0xbd986c00,0x3ec2d2bc,0xd19228fe,0x8bf546d9,0x4cd623c3,0xd1c655a4,0x502b8e5a,0x366ce718,0xeea0bfe7,0x2cfc84b4,0xcf443e8e,0xe01d5cee .long 0x036520f8,0x8ec045d9,0x92d40e98,0xdfb3c3d1,0xcc559a04,0x0bac4cce,0x240ea6b1,0x35eccae5,0xf8a5a0ac,0x180b32db,0xeb699700,0x547972a5,0xca26bca0,0xa3765801,0xa647f25a,0x57e09d0e .long 0x2fdd23cc,0xb956970e,0x5682e971,0xb80288bc,0x9ae86ebc,0xe6e6d91e,0x8c9f1939,0x0564c83f,0x39560368,0x551932a2,0x049c28e2,0xe893752b,0xa6a158c3,0x0b03cee5,0x04964263,0xe12d656b .long 
0x63e3bc1d,0x4b47554e,0x45044ff7,0xc719b6a2,0xe48daa07,0x4f24d30a,0xc8c1edc3,0xa3f37556,0x0700d360,0x9a47bf76,0x822ae4e2,0xbb1a1824,0x89f1fb4c,0x22e275a3,0x9968c5f5,0x72b1aa23 .long 0xbe063f64,0xa75feaca,0xbce47a09,0x9b392f43,0x1ad07aca,0xd4241509,0x8d26cd0f,0x4b0c591b,0x92f1169a,0x2d42ddfd,0x4cbf2392,0x63aeb1ac,0x0691a2af,0x1de9e877,0xd98021da,0xebe79af7 .long 0x40e50acf,0xcfdf2a4e,0xaf01d665,0xf0a98ad7,0x1831be1f,0xefb640bf,0x80e9ada0,0x6fe8bd2f,0x6cafbc91,0x94c103a1,0x8308e08c,0x170f8759,0x9780ff4f,0x5de2d2ab,0x45b201f2,0x666466bc .long 0xf5b343bc,0x58af2010,0xf2f142fe,0x0f2e400a,0xa85f4bdf,0x3483bfde,0x03bfeaa9,0xf0b1d093,0xc7081603,0x2ea01b95,0x3dba1097,0xe943e4c9,0xb438f3a6,0x47be92ad,0xe5bf6636,0x00bb7742 .long 0x824297b4,0x136b7083,0x5584455f,0x9d0e5580,0xf1c7d69e,0xab48cedc,0x2a256e76,0x53a9e481,0x65eb2413,0x0402b0e0,0x8fc407a7,0xdadbbb84,0x8d7f5492,0xa65cd5a4,0x74bae294,0x21d44293 .long 0x3b5f1cc4,0x66917ce6,0xce872e62,0x37ae52ea,0x2905f244,0xbb087b72,0x1e6af74f,0x12077086,0x1058edea,0x4b644e49,0xb638ca1d,0x827510e3,0x6038591c,0x8cf2b704,0xfe635063,0xffc8b47a .long 0x1b4d5e63,0x3ae220e6,0x9d961b4b,0xbd864742,0x9bd16bed,0x610c107e,0x1127147b,0x4270352a,0x64cfc50e,0x7d17ffe6,0x1e36cb42,0x50dee01a,0x35dc5f9a,0x068a7622,0xdf53f62c,0x9a08d536 .long 0x6be5f7de,0x4ed71457,0xc2263c9e,0xd93006f8,0xcacacb36,0xe073694c,0x3ae118ab,0x2ff7a5b4,0xcd871236,0x3cce53f1,0xc2aa6d52,0xf156a39d,0xb198d76d,0x9cc5f271,0x81383d39,0xbc615b6f .long 0xde3eee6b,0xa54538e8,0xab910d91,0x58c77538,0x58d278bd,0x31e5bdbc,0xb963acae,0x3cde4adf,0x5302169c,0xb1881fd2,0xa989ed8b,0x8ca60fa0,0xff96a0ee,0xa1999458,0xac6c283d,0xc1141f03 .long 0x6dfafed3,0x7677408d,0x39661588,0x33a01653,0x0b726fa0,0x3c9c15ec,0x6c9b56da,0x090cfd93,0xa3c40af5,0xe34f4bae,0xd21129f1,0x3469eadb,0x1e207ce8,0xcc51674a,0xc83b1ef9,0x1e293b24 .long 0x1e6c0bb4,0x17173d13,0x90776d35,0x19004695,0x6de6f922,0xe7980e34,0xf4dd9a22,0x873554cb,0xcbf18a51,0x0316c627,0x3032c081,0x4d93651b,0x3946834d,0x207f2771,0x30cdbf80,0x2c08d7b4 .long 0x86df2a61,0x137a4fb4,0xecf7b4a2,0xa1ed9c07,0x7bd042ff,0xb2e460e2,0x5f62f5ec,0xb7f5e2fa,0xcc2423b7,0x7aa6ec6b,0xba63eea7,0x75ce0a7f,0xf250a6e1,0x67a45fb1,0xe53cdc9f,0x93bc919c .long 0x871942df,0x9271f56f,0x7859ad66,0x2372ff6f,0x33cb1a78,0x5f4c2b96,0x5838aa83,0xe3e29101,0xe4e8110c,0xa7ed1611,0x330198ce,0x2a2d70d5,0x6720efe0,0xbdf132e8,0x66a471bf,0xe61a8962 .long 0x825808bd,0x796d3a85,0x3fd6e902,0x51dc3cb7,0x916219d1,0x643c768a,0xa2ad7d32,0x36cd7685,0xb22922a4,0xe3db9d05,0xdba29660,0x6494c87e,0xbcd2ebc7,0xf0ac91df,0x45107f8d,0x4deb57a0 .long 0xc3d12a73,0x42271f59,0xa5c2c51d,0x5f71687c,0x05797bcb,0xcb1f50c6,0xd6d34eb0,0x29ed0ed9,0x4683c2eb,0xe5fe5b47,0x97447c46,0x4956eeb5,0x71207167,0x5b163a43,0x0248c5ef,0x93fa2fed .long 0x31f63950,0x67930af2,0x14caa2c9,0xa77797c1,0x27ac7e62,0x526e80ee,0x58b28aec,0xe1e6e626,0xb3c9fef0,0x636178b0,0x6d5f90be,0xaf7752e0,0xeece51cf,0x94ecaf18,0xca806e1f,0x2864d0ed .long 0x97c69134,0x6de2e383,0xeb291293,0x5a42c316,0x6a60bae0,0xc7779219,0x6b7599d1,0xa24de346,0xb75d4941,0x49d374aa,0x2d501ff0,0x98900586,0xeb7974cf,0x9f16d40e,0xcdd8c115,0x1033860b .long 0x2094cec3,0xb6c69ac8,0x403b770c,0x9976fb88,0x4859590d,0x1dea026c,0x8562d1fd,0xb6acbb46,0x44569d85,0x7cd6c461,0x97f0891d,0xc3190a36,0x48d5a17d,0xc6f53195,0xd749abc8,0x7d919966 .long 0xdd1c8a20,0x65104837,0x2f683419,0x7e5410c8,0xbe94022e,0x958c3ca8,0x6145dac2,0x605c3197,0x01683d54,0x3fc07501,0x595b1234,0x1d7127c5,0x9481277f,0x10b8f87c,0xe65a1adb,0x677db2a8 .long 
0xddce3345,0xec2fccaa,0x012a4350,0x2a6811b7,0xac598bdc,0x96760ff1,0xd1bf4128,0x054d652a,0x92a21005,0x0a1151d4,0x33110fdf,0xad7f3971,0x1960100f,0x8c95928c,0x7bf03362,0x6c91c825 .long 0xce309f06,0xc8c8b2a2,0xca27204b,0xfdb27b59,0x0848e32e,0xd223eaa5,0xe7bfaf1e,0xb93e4b2e,0x44aa3ded,0xc5308ae6,0xc015d573,0x317a666a,0x1a979707,0xc888ce23,0x0d5c4958,0xf141c1e6 .long 0x61906373,0xb53b7de5,0xeb999595,0x858dbade,0xa59e5c36,0x8cbb47b2,0xdcf4e842,0x660318b3,0x12ba4b7a,0xbd161ccd,0xf8c8282a,0xf399daab,0xeeb2130d,0x1587633a,0xda38dd7d,0xa465311a .long 0x64d3779b,0x5f75eec8,0xad64c171,0x3c5d0476,0x2a914428,0x87410371,0x90e2fc29,0x8096a891,0x23b3ebc2,0xd3d2ae9d,0xa580cfd6,0x90bdd6db,0xc5b01f6c,0x52dbb7f3,0xe102a2dc,0xe68eded4 .long 0x99eb6df0,0x17785b77,0x7386b779,0x26c3cc51,0x6417a48e,0x345ed988,0x07d6ef31,0xe990b4e4,0x2586abba,0x0f456b7e,0x59c96e9a,0x239ca6a5,0xe2eb4206,0xe327459c,0xa002b90a,0x3a4c3313 .long 0xf6a3f6fb,0x2a114806,0x85c251dd,0xad5cad2f,0xf5a784d3,0x92c1f613,0x349766d5,0xec7bfacf,0x3e23cb3b,0x04b3cd33,0xc5a64b2d,0x3979fe84,0x7e589106,0x192e2720,0xa15b527f,0xa60c43d1 .long 0xbe7cf3a6,0x2dae9082,0xbc967274,0xcc86ba92,0xaea0a8a9,0xf28a2ce8,0x6ee988b3,0x404ca6d9,0x005921b8,0xfd7e9c5d,0x44e79bf9,0xf56297f1,0x0d75ddc2,0xa163b460,0xa1f2be87,0x30b23616 .long 0xbfe50e2b,0x4b070d21,0xe1bfede1,0x7ef8cfd0,0x2aac4ae0,0xadba0011,0xb9ebd033,0x2a3e7d01,0xe38d9d1c,0x995277ec,0x9c5d2de3,0xb500249e,0xf13ca8c9,0x8912b820,0x877793af,0xc8798114 .long 0xec3f1dec,0x19e6125d,0x911178da,0x07b1f040,0x904a6738,0xd93ededa,0x0bebedcd,0x55187a5a,0xeb329d41,0xf7d04722,0xf170b391,0xf449099e,0xca99f828,0xfd317a69,0x34a4976d,0x50c3db2b .long 0x3757b392,0xe9ba7784,0xaa3ca05a,0x326caefd,0xf1e593d4,0x78e5293b,0x0d98fd13,0x7842a937,0x5f96b10d,0xe694bf96,0x06a8cd05,0x373a9df6,0xe8f0c7fc,0x997d1e51,0x63fd972e,0x1d019790 .long 0x5499fb32,0x0064d858,0x77a8aeb7,0x7b67bad9,0x2d08eec5,0x1d3eb977,0xcbabae1d,0x5fc047a6,0xe54a64bb,0x0577d159,0xc43497e4,0x8862201b,0x2ce0608d,0xad6b4e28,0x0b167aac,0x8b687b7d .long 0x8b2ecfa9,0x6ed4d367,0xa90c3c38,0x24dfe62d,0x3fe5c42b,0xa1862e10,0xd5732a9f,0x1ca73dca,0x76bb87ad,0x35f038b7,0xf242b81f,0x674976ab,0xb0fd90cd,0x4f2bde7e,0xa7fdf092,0x6efc172e .long 0x92222f1f,0x3806b69b,0x6cf7ae70,0x5a2459ca,0xa85217ee,0x6789f69c,0xe3dc85ac,0x5f232b5e,0x48e9e516,0x660e3ec5,0x3197eb31,0x124b4e47,0xaafcca23,0x10a0cb13,0x8213224f,0x7bd63ba4 .long 0x290a7f4f,0xaffad7cc,0x0286b461,0x6b409c9e,0xffa407af,0x58ab809f,0xc68ac073,0xc3122eed,0x4ef24d7e,0x17bf9e50,0x3e2a5811,0x5d929794,0x02902e01,0x519bc867,0x39c8a851,0x76bba5da .long 0xda94951e,0xe9f9669c,0x66b8d418,0x4b6af58d,0x17d426a4,0xfa321074,0x9dde6027,0xc78e66a9,0x4a53b964,0x0516c083,0xff602330,0xfc659d38,0x58c5c897,0x0ab55e5c,0x838bc5df,0x985099b2 .long 0xc52fc238,0x061d9efc,0x6ac1da3f,0x712b2728,0x9283fe08,0xfb658149,0xb8aaa2f7,0x4954ac94,0x7fb2e74f,0x85c0ada4,0xb89926b0,0xee8ba98e,0x23d1af5b,0xe4f9d37d,0xba9b015e,0x14ccdbf9 .long 0x7bfe7178,0xb674481b,0x65405868,0x4e1debae,0xc48c867d,0x061b2821,0x513b30ea,0x69c15b35,0x36871088,0x3b4a1666,0x1220b1ff,0xe5e29f5d,0x233d9f4d,0x4b82bb35,0x18cdc675,0x4e076333 .long 0xa3e6fced,0x0d53f5c7,0xf45fbdeb,0xe8cbbdd5,0x13339a70,0xf85c01df,0x142ceb81,0x0ff71880,0xbd70437a,0x4c4e8774,0xba0bda6a,0x5fb32891,0xf18bd26e,0x1cdbebd2,0x03a9d522,0x2f9526f1 .long 0x92c4d684,0x40ce3051,0x7612efcd,0x8b04d725,0x6f9cae20,0xb9dcda36,0xf058856c,0x0edc4d24,0x85427900,0x64f2e6bf,0xdc09dfea,0x3de81295,0x379bf26c,0xd41b4487,0x6df135a9,0x50b62c6d .long 
0xc72dfe67,0xd4f8e3b4,0x90e19fdf,0xc416b0f6,0x4c13bd35,0x18b9098d,0x15b8cb9e,0xac11118a,0xf0062841,0xf598a318,0x89f356f4,0xbfe0602f,0x30177a0c,0x7ae3637e,0x61136537,0x34097747 .long 0xd005832a,0x0db2fb5e,0x91042e4f,0x5f5efd3b,0xed70f8ca,0x8c4ffdc6,0xb52da9cc,0xe4645d0b,0xc9001d1f,0x9596f58b,0x4e117205,0x52c8f0bc,0xe398a084,0xfd4aa0d2,0x104f49de,0x815bfe3a .long 0x23885e5f,0x97e5443f,0xe8433aab,0xf72f8f99,0xe4d4e604,0xbd00b154,0xe5e173ff,0xd0b35e6a,0x9164722d,0x57b2a048,0x88761ec8,0x3e3c665b,0x3da83832,0x6bdd1397,0x73dafe3b,0x3c8b1a1e .long 0x54317cac,0x4497ace6,0x521771b3,0xbe600ab9,0xb0dfe8b8,0xb42e409e,0x3942310f,0x386a67d7,0x4431cc28,0x25548d8d,0x985dc524,0xa7cff142,0x93c4be32,0x4d60f5a1,0xd071c6e1,0x83ebd5c8 .long 0xb1fd2b0b,0xba3a80a7,0x5bec33e8,0x9b3ad396,0x79743fb3,0xb3868d61,0xfdb462fa,0xcfd169fc,0x9ce0a6af,0xd3b499d7,0xe42d3ff8,0x55dc1cf1,0xc6c3e1b2,0x04fb9e6c,0x6f69a474,0x47e6961d .long 0xe548b37b,0x54eb3acc,0x84d40549,0xb38e7542,0x7b341b4f,0x8c3daa51,0x690bf7fa,0x2f6928ec,0x86ce6c41,0x0496b323,0x10adadcd,0x01be1c55,0x4bb5faf9,0xc04e67e7,0xe15c9985,0x3cbaf678 .long 0x50ca4247,0x8cd12145,0xe7dd30aa,0xba1aa47a,0xe58fee24,0x2f81ddf1,0xeec9b0e8,0x03452936,0x243aea96,0x8bdc3b81,0x15c3d0e5,0x9a2919af,0x10948361,0x9ea640ec,0x6e0bcccf,0x5ac86d5b .long 0xc36cf440,0xf892d918,0xc939719c,0xaed3e837,0xc0218b64,0xb07b08d2,0xce9790dd,0x6f1bcbba,0x60919b8e,0x4a84d6ed,0x8ac1f9eb,0xd8900791,0x0dd5daef,0xf84941aa,0x67fd62c5,0xb22fe40a .long 0x157f2db3,0x97e15ba2,0x8e28ca9c,0xbda2fc8f,0x37b9f454,0x5d050da4,0x2379d72e,0x3d57eb57,0xfb5ee997,0xe9b5eba2,0xe11538ca,0x01648ca2,0xf6327974,0x32bb76f6,0xff3f4bb7,0x338f14b8 .long 0xd7ab9a2d,0x524d226a,0x7dfae958,0x9c00090d,0x8751d8c2,0x0ba5f539,0x3ab8262d,0x8afcbcdd,0xe99d043b,0x57392729,0xaebc943a,0xef51263b,0x20862935,0x9feace93,0xb06c817b,0x639efc03 .long 0x66b4be7a,0x1fe054b3,0x84a37a1e,0x3f25a9de,0x78d75cd9,0xf39ef1ad,0x5062c1b5,0xd7b58f49,0xff563436,0x6f74f9a9,0xe8af51e7,0xf718ff29,0x15e97fec,0x5234d313,0x292f1c0a,0xb6a8e2b1 .long 0x327720c1,0xa7f53aa8,0xba092cc8,0x956ca322,0x28746c4d,0x8f03d64a,0x66d0d392,0x51fe1782,0x3c832c80,0xd19b34db,0x6da2e3b4,0x60dccc5c,0x0a104ccc,0x245dd62e,0x620b21fd,0xa7ab1de1 .long 0x3893d123,0xb293ae0b,0xb15ee71c,0xf7b75783,0x42a9468b,0x5aa3c614,0xdb15d744,0xd686123c,0xa7ab4116,0x8c616891,0xa4e6a459,0x6fcd72c8,0x77e5fad7,0xac219110,0x704fa46b,0xfb6a20e7 .long 0x341d81dc,0xe839be7d,0x32148379,0xcddb6889,0xf7026ead,0xda6211a1,0xf4d1cc5e,0xf3b2575f,0xa7a73ae6,0x40cfc8f6,0x61d5b483,0x83879a5e,0x41a50ebc,0xc5acb1ed,0x3c07d8fa,0x59a60cc8 .long 0xb1876262,0x1b73bdce,0x12af4ee9,0x2b0d79f0,0xd46e1d07,0x8bcf3b0b,0xe45d152f,0x17d6af9d,0x6d736451,0x73520461,0x56b0bf5a,0x43cbbd97,0xd5999b9d,0xb0833a5b,0xeb72e398,0x702614f0 .long 0x59c3e9f8,0x0aadf01a,0xce6b3d16,0x40200e77,0xdeddafad,0xda22bdd3,0x310d72e1,0x76dedaf4,0x4bc2e88f,0x49ef807c,0x146dd5a5,0x6ba81291,0x7d8d59e9,0xa1a4077a,0x802db349,0x87b6a2e7 .long 0x1b4e598e,0xd5679997,0x06fe4b1d,0xf499ef1f,0xfcb267c5,0x3978d3ae,0x235786d0,0xb582b557,0x1715cb07,0x32b3b2ca,0x8480241d,0x4c3de6a2,0xcb571ecd,0x63b5ffed,0xed2fe9a9,0xeaf53900 .long 0xc3b81990,0xdec98d4a,0x9e0cc8fe,0x1cb83722,0xd2b427b9,0xfe0b0491,0xe983a66c,0x0f2386ac,0xb3291213,0x930c4d1e,0x59a62ae4,0xa2f82b2e,0xf93e89e3,0x77233853,0x11777c7f,0x7f8063ac .long 0x59ad2877,0xff0eb567,0x9865c754,0x6f454642,0x236e9a84,0xe6fe701a,0x06e40fc3,0xc586ef16,0x24bafad9,0x3f62b6e0,0x64da906a,0xc8b42bd2,0xda3276a0,0xc98e1eb4,0x06cbf852,0x30d0e5fc .long 
0xe8b4dfd4,0x1b6b2ae1,0x8301cbac,0xd754d5c7,0x112a39ac,0x66097629,0x93ba4ab9,0xf86b5999,0x99f9d581,0x26c9dea7,0xc2fafeaa,0x0473b1a8,0x3b2505a5,0x1469af55,0xd6a43323,0x227d16d7 .long 0xad3d97f9,0x3316f73c,0x1f137455,0x52bf3bb5,0x09954e7c,0x953eafeb,0xdd732411,0xa721dfed,0x141d4579,0xb4929821,0xaa3bd435,0x3411321c,0x17fa6015,0xafb355aa,0x18e42f0e,0xb4e7ef4a .long 0x59371000,0x604ac97c,0x7f759c18,0xe1c48c70,0xa5db6b65,0x3f62ecc5,0x38a21495,0x0a78b173,0xbcc8ad94,0x6be1819d,0xd89c3400,0x70dc04f6,0xa6b4840a,0x462557b4,0x60bd21c0,0x544c6ade .long 0x907a544b,0x6a00f24e,0x313da210,0xa7520dcb,0x11e4994b,0xfe939b75,0xbc275d70,0x918b6ba6,0x644be892,0xd3e5e0fc,0xfdaf6c42,0x707a9816,0xf15c13fe,0x60145567,0xe130a54a,0x4818ebaa .long 0x58d2f767,0x28aad3ad,0xd7e7c773,0xdc5267fd,0xc3afcc98,0x4919cc88,0x2db8cd4b,0xaa2e6ab0,0xd0c63eaa,0xd46fec04,0x19ffa832,0xa1cb92c5,0xe43a631f,0x678dd178,0x3dc788b3,0xfb5ae1cd .long 0x6e77de04,0x68b4fb90,0xf06dbb97,0x7992bcf0,0xc417c01d,0x896e6a13,0xb956be01,0x8d96332c,0x413aa2b9,0x902fc93a,0xfc98c8a5,0x99a4d915,0x565f1137,0x52c29407,0x21e4f281,0x4072690f .long 0x02ff6072,0x36e607cf,0x8ad98cdc,0xa47d2ca9,0xf5f56609,0xbf471d1e,0xf264ada0,0xbcf86623,0xaa9e5cb6,0xb70c0687,0x17401c6c,0xc98124f2,0xd4a61435,0x8189635f,0xa9d98ea6,0xd28fb8af .long 0x40c251f8,0xb9a67c2a,0xa2da44be,0x88cd5d87,0xe09b5423,0x437deb96,0x64287dc1,0x150467db,0xcdabb839,0xe161debb,0xf1839a3e,0xa79e9742,0x652d202b,0xbb8dd3c2,0xe9f97d96,0x7b3e67f7 .long 0xb1cb6ac9,0x5aa5d78f,0xca1d0d45,0xffa13e8e,0x2ba5bf95,0x369295dd,0x39aff05e,0xd68bd1f8,0x26d783f2,0xaf0d86f9,0xfc3aafc1,0x543a59b3,0x7b7da97c,0x3fcf81d2,0xd25dee46,0xc990a056 .long 0x519cce2c,0x3e6775b8,0xae13d863,0xfc9af71f,0x47c1605c,0x774a4a6f,0x2fd205e8,0x46ba4245,0xd3fd524d,0xa06feea4,0x6de1acc2,0x1e724641,0x334e2b42,0xf53816f1,0x922f0024,0x49e5918e .long 0x65c7322d,0x439530b6,0xb3c1b3fb,0xcf12cc01,0x0172f685,0xc70b0186,0x1b58391d,0xb915ee22,0xa317db24,0x9afdf03b,0x17b8ffc4,0x87dec659,0xe4d3d050,0x7f46597b,0x006500e7,0x80a1c1ed .long 0x78bf030e,0x84902a96,0x50560148,0xfb5e9c9a,0x63362426,0x6dae0a92,0xa9e30c40,0xdcaeecf4,0x518d0c6b,0xc0d887bb,0xcb985b9d,0x99181152,0xef7bc381,0xad186898,0x9ee46201,0x18168ffb .long 0x2502753c,0x9a04cdaa,0x51407c41,0xbb279e26,0xf23564e5,0xeacb03aa,0x71e61016,0x18336582,0xeb809877,0x8684b8c4,0xea0e672e,0xb336e18d,0x34ee5867,0xefb601f0,0x1341cfd1,0x2733edbe .long 0x26025c3c,0xb15e809a,0x9350df88,0xe6e981a6,0x8502fd8e,0x92376237,0x0c12be9b,0x4791f216,0x25f02425,0xb7256789,0x7a974443,0xec863194,0xfb41cc52,0x7c0ce882,0xf25c07f2,0xc266ff7e .long 0x017025f3,0x3d4da8c3,0xfb9579b4,0xefcf628c,0x1f3716ec,0x5c4d0016,0x6801116e,0x9c27ebc4,0x1da1767e,0x5eba0ea1,0x47004c57,0xfe151452,0x8c2373b7,0x3ace6df6,0x5dbc37ac,0x75c3dffe .long 0xddc925fc,0x3dc32a73,0x2f65ee0b,0xb679c841,0x451cbfeb,0x715a3295,0xf76e9a29,0xd9889768,0xb28ad247,0xec20ce7f,0x00894d79,0xe99146c4,0x9f5e3ea7,0x71457d7c,0x38030031,0x097b2662 .long 0xcf9f82a8,0xdb7f6ae6,0x438f473a,0x319decb9,0x283856c3,0xa63ab386,0xb06a361b,0x13e3172f,0x7d5a006c,0x2959f8dc,0x75fba752,0x2dbc27c6,0x87c22c9e,0xc1227ab2,0x71a268b2,0x06f61f75 .long 0x04779ce2,0x1b6bb971,0x0aadcb1d,0xaca83812,0xaeaab2d5,0x297ae0bc,0x5bfb9f13,0xa5c14ee7,0xf17a62c7,0xaa00c583,0x173759f6,0x39eb962c,0x86c9a88f,0x1eeba1d4,0xdf016c5e,0x0ab6c37a .long 0xa28a0749,0xa2a147db,0xee519165,0x246c20d6,0xd3810715,0x5068d1b1,0x748160b9,0xb1e7018c,0xf380ff62,0x03f5b1fa,0xf3cb2c1e,0xef7fb1dd,0xfc91a7da,0xeab539a8,0xf3f9b561,0x83ddb707 .long 
0xfe7df7a4,0xc550e211,0x063f6f40,0xa7cd07f2,0x2976879c,0xb0de3635,0xe55741da,0xb5f83f85,0xf3d8ac3d,0x4ea9d25e,0x62819f02,0x6fe2066f,0xcef4a564,0x4ab2b9c2,0x5ffa2de3,0x1e155d96 .long 0xc3a72d00,0x0eb0a19b,0x8513c31b,0x4037665b,0x04c64637,0x2fb2b6bf,0x08cdc639,0x45c34d6e,0xf01fd796,0x56f1e10f,0xfe3667b8,0x4dfb8101,0x9021d0c0,0xe0eda253,0x8a06c6ab,0x7a94e9ff .long 0xbb9aa882,0x2d3bb0d9,0xec05fd10,0xea20e4e5,0x1a1ca64e,0xed7eeb5f,0xc6327cbd,0x2fa6b43c,0x3aa91121,0xb577e3cf,0x3a34079b,0x8c6bd5ea,0x60e02fc0,0xd7e5ba39,0x90141bf8,0xf16dd2c3 .long 0x80101b98,0xb57276d9,0xb82f0f66,0x760883fd,0x4bc3eff3,0x89d7de75,0x5dc2ab40,0x03b60643,0xe05beeac,0xcd6e53df,0xbc3325cd,0xf2f1e862,0x774f03c3,0xdd0f7921,0x4552cc1b,0x97ca7221 .long 0x1cd19f72,0x5a0d6afe,0xf183fbeb,0xa20915dc,0x832c403c,0x9fda4b40,0xbe425442,0x32738edd,0xb5eccf1a,0x469a1df6,0x28bbe1f0,0x4b5aff42,0x570dfc93,0x31359d7f,0xf0088628,0xa18be235 .long 0xb00ed3a9,0xa5b30fba,0x73cdf8be,0x34c61374,0xabc56797,0x2c5c5f46,0xb82a8ae2,0x5cecf93d,0xa968fbf0,0x7d3dbe41,0x1a5c7f3d,0xd23d4583,0xc087a9c7,0xf28f69a0,0x474471ca,0xc2d75471 .long 0x4eb732ec,0x36ec9f4a,0xb1ca6bed,0x6c943bbd,0xf2457892,0xd64535e1,0xf7e2ac06,0x8b84a8ea,0x2499dd5f,0xe0936cd3,0x0ed04e57,0x12053d7e,0xe4305d9d,0x4bdd0076,0x1f67f0a2,0x34a527b9 .long 0x9cec46ea,0xe79a4af0,0x658b9bc7,0xb15347a1,0x35af2f75,0x6bd2796f,0x4051c435,0xac957990,0xc33a655d,0x2669dda3,0x88514aa3,0x5d503c2e,0x3753dd41,0xdfa11337,0x0b754f78,0x3f054673 .long 0x496125bd,0xbf185677,0x3775006c,0xfb0023c8,0x3a037899,0xfa0f072f,0x0e4aea57,0x4222b6eb,0x7866d25a,0x3dde5e76,0x4837aa6f,0xb6eb04f8,0x2cf1cdb8,0x5315591a,0x2d4e683c,0x6dfb4f41 .long 0x48ee1f3a,0x7e923ea4,0x05a2afd5,0x9604d9f7,0x40ea4948,0xbe1d4a33,0xb44cbd2f,0x5b45f1f4,0x4acc757e,0x5faf8376,0x63d68ff7,0xa7cf9ab8,0xdf0e404b,0x8ad62f69,0x12bdafdf,0xd65f33c2 .long 0xa377b14e,0xc365de15,0x8e39f60c,0x6bf5463b,0x2ce68148,0x62030d2d,0xe6f843a8,0xd95867ef,0xef5ab017,0xd39a0244,0x4ab55d12,0x0bd2d8c1,0x41639169,0xc9503db3,0xf7660c8a,0x2d4e25b0 .long 0xe224c5d7,0x760cb3b5,0x68616919,0xfa3baf8c,0x8d142552,0x9fbca113,0x7669ebf5,0x1ab18bf1,0x9bdf25dd,0x55e6f53e,0xcb6cd154,0x04cc0bf3,0x95e89080,0x595bef49,0x104a9ac1,0xfe9459a8 .long 0xcce9bb32,0xad2d89ca,0xf7de8285,0xddea65e1,0xb351bd4b,0x62ed8c35,0x0c0e19a7,0x4150ff36,0x345f4e47,0x86e3c801,0x203a266c,0x3bf21f71,0x855b1f13,0x7ae110d4,0x07262517,0x5d6aaf6a .long 0x813d28f1,0x1e0f12e1,0x7ad7a523,0x6000e11d,0xc744a17b,0xc7d8deef,0x14c05a00,0x1e990b48,0x93e976d5,0x68fddaee,0x46610d63,0x696241d1,0x893dda88,0xb204e7c3,0x6a3a6946,0x8bccfa65 .long 0xc5cd1411,0xb59425b4,0xff3658b1,0x701b4042,0x4784cf93,0xe3e56bca,0x8fe68d60,0x27de5f15,0xf8d53f19,0x4ab9cfce,0xa40a730d,0xddb10311,0x4eee0a8a,0x6fa73cd1,0x5249719d,0xfd548748 .long 0xa8123ef0,0x49d66316,0xe7f95438,0x73c32db4,0x0d9e7854,0x2e2ed209,0x9d9f0507,0xf98a9329,0x0c6aa20a,0xc5d33cf6,0x75279bb2,0x9a32ba14,0x774a7307,0x7e3202cb,0xe8c42dbd,0x64ed4bc4 .long 0xd4caed0d,0xc20f1a06,0x171d22b3,0xb8021407,0xd13268d7,0xd426ca04,0x25f4d126,0x92377007,0x71f21a85,0x4204cbc3,0xf82369ba,0x18461b7a,0x3fc858f9,0xc0c07d31,0xe2bab569,0x5deb5a50 .long 0xd5eea89e,0xd5959d46,0x08437f4b,0xfdff8424,0x3cfe254f,0xf21071e4,0x95468321,0x72417696,0x102cae3e,0x5d8288b9,0xf1965dff,0x2d143e3d,0xa078d847,0x00c9a376,0x26028731,0x6fc0da31 .long 0xe45083a2,0xa2baeadf,0x5e5b4bcd,0x66bc7218,0xd04b8e7f,0x2c826442,0x6c4b586b,0xc19f5451,0x5b7eeed5,0x60182c49,0x7aa9dfa1,0xd9954ecd,0xc73884ad,0xa403a8ec,0x9bb39041,0x7fb17de2 .long 
0xabb020e8,0x694b64c5,0x19c4eec7,0x3d18c184,0x1c4793e5,0x9c4673ef,0x056092e6,0xc7b8aeb5,0xf0f8c16b,0x3aa1ca43,0xd679b2f6,0x224ed5ec,0x55a205c9,0x0d56eeaf,0x4b8e028b,0xbfe115ba .long 0x3927f4fe,0x97e60849,0x759aa7c5,0xf91fbf94,0x6be90a51,0x985af769,0x78ccb823,0xc1277b78,0xe7a75952,0x395b656e,0x928da5f5,0x00df7de0,0x4ca4454f,0x09c23175,0x7aa2d3c1,0x4ec971f4 .long 0xe75d9ccc,0x45c3c507,0x3dc90306,0x63b7be8a,0x5db44bdc,0x37e09c66,0x6841c6a2,0x50d60da1,0x08df1b12,0x6f9b65ee,0x7ff089df,0x38734879,0x3fe8013d,0x9c331a66,0x5f42fcc8,0x017f5de9 .long 0xe8e57567,0x43077866,0xf9fcdb18,0xc9f781ce,0x9b12e174,0x38131dda,0x8a03752a,0x25d84aa3,0x4d0c0ce2,0x45e09e09,0x92bebba5,0x1564008b,0xa87284c7,0xf7e8ad31,0x97e7bbaa,0xb7c4b46c .long 0x97acf4ec,0x3e22a7b3,0x5ea8b640,0x0426c400,0x4e969285,0x5e3295a6,0xa6a45670,0x22aabc59,0x5f5942bc,0xb929714c,0xfa3182ed,0x9a6168bd,0x104152ba,0x2216a665,0xb6926368,0x46908d03 .long 0x5a1251fb,0xa9f5d874,0xc72725c7,0x967747a8,0x31ffe89e,0x195c33e5,0xe964935e,0x609d210f,0x2fe12227,0xcafd6ca8,0x0426469d,0xaf9b5b96,0x5693183c,0x2e9ee04c,0xc8146fef,0x1084a333 .long 0xaed1d1f7,0x96649933,0x50563090,0x566eaff3,0xad2e39cf,0x345057f0,0x1f832124,0x148ff65b,0xcf94cf0d,0x042e89d4,0x520c58b3,0x319bec84,0x5361aa0d,0x2a267626,0x8fbc87ad,0xc86fa302 .long 0x5c8b06d5,0xfc83d2ab,0xfe4eac46,0xb1a785a2,0x846f7779,0xb99315bc,0xef9ea505,0xcf31d816,0x15d7dc85,0x2391fe6a,0xb4016b33,0x2f132b04,0x181cb4c7,0x29547fe3,0x650155a1,0xdb66d8a6 .long 0xadc1696f,0x6b66d7e1,0x0acd72d0,0x98ebe593,0xcc1b7435,0x65f24550,0xb4b9a5ec,0xce231393,0xdb067df9,0x234a22d4,0xcaff9b00,0x98dda095,0x6100c9c1,0x1bbc75a0,0x939cf695,0x1560a9c8 .long 0x99e0925f,0xcf006d3e,0x6322375a,0x2dd74a96,0xb56af5ba,0xc58b446a,0xe0b9b4f1,0x50292683,0x1aeaffa3,0xe2c34cb4,0x9b9587c1,0x8b17203f,0xead1350c,0x6d559207,0xfb7f9604,0x2b66a215 .long 0xfe51bf74,0x0850325e,0x5e460094,0x9c4f579e,0x76da2f25,0x5c87b92a,0x6febef33,0x889de4e0,0x646083ce,0x6900ec06,0xbfe12773,0xbe2a0335,0xc5344110,0xadd1da35,0xb802cd20,0x757568b7 .long 0x00f7e6c8,0x75559779,0x0facd2f0,0x38e8b94f,0x03fde375,0xfea1f3af,0x75881dfc,0x5e11a1d8,0xc1e2f2ef,0xb3a6b02e,0xc605a6c5,0x193d2bbb,0x339a0b2d,0x325ffeee,0x9e0c8846,0x27b6a724 .long 0xf1c367ca,0xe4050f1c,0xc90fbc7d,0x9bc85a9b,0xe1a11032,0xa373c4a2,0xad0393a9,0xb64232b7,0x167dad29,0xf5577eb0,0x94b78ab2,0x1604f301,0xe829348b,0x0baa94af,0x41654342,0x77fbd8dd .long 0xb964e39a,0xdab50ea5,0xd0d3c76e,0xd4c29e3c,0x56d11964,0x80dae67c,0xe5ffcc2f,0x7307a8bf,0x91708c3b,0x65bbc1aa,0x28bf0eeb,0xa151e62c,0x6fa34db7,0x6cb53381,0xa29403a8,0x5139e05c .long 0x94a7cd2e,0x6ff651b4,0x0699336c,0x5671ffd1,0x979a896a,0x6f5fd2cc,0xd8148cef,0x11e893a8,0x65cf7b10,0x988906a1,0xc50d8485,0x81b67178,0x8a35b3de,0x7c0deb35,0xc1d29799,0x423ac855 .long 0xdac50b74,0xaf580d87,0x5869734c,0x28b2b89f,0x874e28fb,0x99a3b936,0x25f3f73a,0xbb2c9190,0x84a9d5b7,0x199f6918,0x7e770374,0x7ebe2325,0x0738efe2,0xf442e107,0xcf9082d2,0xcf9f3f56 .long 0x09618708,0x719f69e1,0xc183f9b1,0xcc9e8364,0x366a21af,0xec203a95,0x068b141f,0x6aec5d6d,0x994f04e9,0xee2df78a,0x271245b0,0xb39ccae8,0x97e43f4f,0xb875a4a9,0xdb2cea98,0x507dfe11 .long 0x489b03e9,0x4fbf81cb,0x6ec414fa,0xdb86ec5b,0xf51b3ae5,0xfad444f9,0x1914e3fe,0xca7d33d6,0x0ae6c4d0,0xa9c32f5c,0x73969568,0xa9ca1d1e,0x1aa7467e,0x98043c31,0xe21b5ac6,0xe832e75c .long 0x5232123d,0x314b7aea,0x65ae86db,0x08307c8c,0xaa4668ed,0x06e7165c,0xb4d3ec39,0xb170458b,0xc19bb986,0x4d2e3ec6,0xae0304ed,0xc5f34846,0x6c9f9722,0x917695a0,0x4cab1c0a,0x6c7f7317 .long 
0x9d6d2e8b,0x6295940e,0x549f7c97,0xd318b8c1,0x97713885,0x22453204,0xa8a440fe,0x468d834b,0xbfba796e,0xd81fe5b2,0x6d71f116,0x152364db,0xb5b66e53,0xbb8c7c59,0x2641a192,0x0b12c61b .long 0xfcf0a7fd,0x31f14802,0x5488b01e,0x42fd0789,0x9952b498,0x71d78d6d,0x07ac5201,0x8eb572d9,0x4d194a88,0xe0a2a44c,0xba017e66,0xd2b63fd9,0xf888aefc,0x78efc6c8,0x4a881a11,0xb76f6bda .long 0xb46c2397,0x187f314b,0x5ded2819,0x004cf566,0x38764d34,0xa9ea5704,0x78084709,0xbba45217,0x1171121e,0x06474571,0xe7c9b671,0xad7b7eb1,0x730f7507,0xdacfbc40,0xc7ad7bd1,0x178cd8c6 .long 0xb2a67238,0xbf0be101,0xaf9c14f2,0x3556d367,0xa5662075,0x104b7831,0x79d9e60a,0x58ca59bb,0xa569a73b,0x4bc45392,0x5698f6c9,0x517a52e8,0xaeadd755,0x85643da5,0x2a581b84,0x1aed0cd5 .long 0x80af1372,0xb9b4ff84,0xf1ba5d1f,0x244c3113,0xf5f98d31,0x2a5dacbe,0x4375bc2a,0x2c3323e8,0x5594b1dd,0x17a3ab4a,0xceb4797e,0xa1928bfb,0xe4886a19,0xe83af245,0x72b5a74a,0x8979d546 .long 0x19f9e967,0xa0f726bc,0xe8fbbf4e,0xd9d03152,0xb7707d40,0xcfd6f51d,0x63f6e6e0,0x633084d9,0x55667eaf,0xedcd9cdc,0x2e44d56f,0x73b7f92b,0x4e962b14,0xfb2e39b6,0xf671fcbf,0x7d408f6e .long 0x164a89bb,0xcc634ddc,0x3ef3bd05,0x74a42bb2,0x428decbb,0x1280dbb2,0x402c8596,0x6103f6bb,0x355a5752,0xfa2bf581,0x00946674,0x562f96a8,0x6da0223b,0x4e4ca16d,0x28d3aa25,0xfe47819f .long 0xf8dfcf8a,0x9eea3075,0x95669825,0xa284f0aa,0x867d3fd8,0xb3fca250,0x269d691e,0x20757b5f,0x93b8a5de,0xf2c24020,0xebc06da6,0xd3f93359,0xb2739c33,0x1178293e,0xbcd686e5,0xd2a3e770 .long 0xcd941534,0xa76f49f4,0xe3c71c0e,0x0d37406b,0x3b97f7e3,0x172d9397,0xbd7fd0de,0xec17e239,0x6f496ba2,0xe3290551,0x36ad50e7,0x6a693172,0x83e7eff5,0xc4e539a2,0x18e1b4cf,0x752737e7 .long 0x68af43ee,0xa2f7932c,0x703d00bd,0x5502468e,0x2fb061f5,0xe5dc978f,0x28c815ad,0xc9a1904a,0x470c56a4,0xd3af538d,0x193d8ced,0x159abc5f,0x20108ef3,0x2a37245f,0x223f7178,0xfa17081e .long 0x10c8c0f5,0x27b0fb2b,0x40650547,0x2102c3ea,0x8ac3bfa7,0x594564df,0x509dad96,0x98102033,0xf1d18a13,0x6989643f,0xd7fc5af0,0x35eebd91,0xfaeaafd8,0x078d096a,0xdef3de98,0xb7a89341 .long 0xecf2a73a,0x2a206e8d,0x8e551994,0x066a6397,0xb98d53a2,0x3a6a088a,0x2d1124aa,0x0ce7c67c,0x759a113c,0x48cec671,0x4f6f67fa,0xe3b373d3,0xfd36727b,0x5455d479,0xa13c0d81,0xe5a428ee .long 0x1c86682b,0xb853dbc8,0xb8d02b2a,0xb78d2727,0x8ebc329a,0xaaf69bed,0x293b2148,0xdb6b40b3,0xb8c4961f,0xe42ea77d,0x20e5e0ab,0xb1a12f7c,0x79e8b05e,0xa0ec5274,0xfab60a80,0x68027391 .long 0x16b1bd5e,0x6bfeea5f,0x4de30ad3,0xf957e420,0x6a353b9e,0xcbaf664e,0x26d14feb,0x5c873312,0xb65f57cb,0x4e87f98c,0x5e0cdd41,0xdb60a621,0xa6881440,0x67c16865,0x46ab52aa,0x1093ef1a .long 0x3f4ece64,0xc095afb5,0x7604551a,0x6a6bb02e,0x0b26b8cd,0x55d44b4e,0xf971268a,0xe5f9a999,0x11a7de84,0xc08ec425,0xfda469dd,0x83568095,0x6c6c90a2,0x737bfba1,0xbe229831,0x1cb9c4a0 .long 0xbb2eec64,0x93bccbba,0xda03adbe,0xa0c23b64,0xe0e86ac4,0x5f7aa00a,0xfc1401e6,0x470b941e,0x9df43574,0x5ad8d679,0x0f65d810,0x4ccfb8a9,0xaa7fbd81,0x1bce80e3,0x9508d20a,0x273291ad .long 0x42a92806,0xf5c4b46b,0xa86ab44a,0x810684ec,0xca0bc9f8,0x4591640b,0x5c4b6054,0xb5efcdfc,0x6e9edd12,0x16fc8907,0xd4d792f9,0xe29d0b50,0x9b03116d,0xa45fd01c,0xc81765a4,0x85035235 .long 0xb4b4b67c,0x1fe2a9b2,0xe8020604,0xc1d10df0,0xbc8058d8,0x9d64abfc,0x712a0fbb,0x8943b9b2,0x3b3def04,0x90eed914,0x4ce775ff,0x85ab3aa2,0x7bbc9040,0x605fd4ca,0xe2c75dfb,0x8b34a564 .long 0x10358560,0x41ffc94a,0x9e5c28aa,0x2d8a5072,0x4cc7eb15,0xe915a0fc,0x8f6d0f5d,0xe9efab05,0xd19e9b91,0xdbab47a9,0x0276154c,0x8cfed745,0x2cfede0d,0x154357ae,0x19f5a4ef,0x520630df .long 
0xe382360f,0x25759f7c,0x88bf5857,0xb6db05c9,0x6c58d46c,0x2917d61d,0xfd20cb7a,0x14f8e491,0x11c20340,0xb68a727a,0xaf7ccbb6,0x0386f86f,0xfee09a20,0x5c8bc6cc,0xbb7eea35,0x7d76ff4a .long 0xdb15be7a,0xa7bdebe7,0xd89f0302,0x67a08054,0xc1193364,0x56bf0ea9,0x62837ebe,0xc8244467,0x20d841b8,0x32bd8e8b,0xdbb8a54f,0x127a0548,0x63b20236,0x83dd4ca6,0x203491fa,0x87714718 .long 0xaa8a5288,0x4dabcaaa,0xaf23a1c9,0x91cc0c8a,0x3f220e0c,0x34c72c6a,0x1232144a,0xbcc20bdf,0xa20ede1b,0x6e2f42da,0x74a00515,0xc441f00c,0x734b8c4b,0xbf46a5b6,0x7b56c9a4,0x57409503 .long 0xe4585d45,0x9f735261,0x6734e642,0x9231faed,0xbe70ee6c,0x1158a176,0x7c3501bf,0x35f1068d,0xa2d26115,0x6beef900,0xef0afee3,0x649406f2,0xbc2420a1,0x3f43a60a,0xd5aee4ac,0x509002a7 .long 0x3ff3571b,0xb46836a5,0x837927c1,0x24f98b78,0x4533c716,0x6254256a,0xd07ee196,0xf27abb0b,0x5c6d5bfd,0xd7cf64fc,0xf0cd7a77,0x6915c751,0x8798f534,0xd9f59012,0xf81d8b5f,0x772b0da8 .long 0x2e03fa69,0x1244260c,0x3be1a374,0x36cf0e3a,0xef06b960,0x6e7c1633,0x671f90f6,0xa71a4c55,0x33c673db,0x7a941251,0x73e8c131,0xc0bea510,0xd4f6c734,0x61a8a699,0x341ed001,0x25e78c88 .long 0x8e2f7d90,0x5c18acf8,0x77be32cd,0xfdbf33d7,0xd2eb5ee9,0x0a085cd7,0xb3201115,0x2d702cfb,0x85c88ce8,0xb6e0ebdb,0x1e01d617,0x23a3ce3c,0x567333ac,0x3041618e,0x157edb6b,0x9dd0fd8f .long 0xb57872b8,0x27f74702,0x657d5fe1,0x2ef26b4f,0x57cf3d40,0x95426f0a,0x65a6067a,0x847e2ad1,0x09996a74,0xd474d9a0,0x2a26115c,0x16a56acd,0xd16f4d43,0x02a615c3,0xaadb85b7,0xcc3fc965 .long 0xce07d1b0,0x386bda73,0x58ad4178,0xd82910c2,0xcd2617f4,0x124f82cf,0xef691770,0xcc2f5e8d,0xb8c30ccc,0x82702550,0x1a8e575a,0x7b856aea,0xb1ab9459,0xbb822fef,0xec24e38e,0x085928bc .long 0xba8f4b4d,0x5d0402ec,0x00b4d58b,0xc07cd4ba,0x29227e7a,0x5d8dffd5,0x31bf386f,0x61d44d0c,0x135e6f4d,0xe486dc2b,0xe79410ef,0x680962eb,0xf10088b5,0xa61bd343,0xe2e28686,0x6aa76076 .long 0x8fb98871,0x80463d11,0xbbc76aff,0xcb26f5c3,0xfbe03614,0xd4ab8edd,0xc0cf2dee,0xc8eb579b,0xc93bae41,0xcc004c15,0x3aeca3b2,0x46fbae5d,0x0f1e9ab1,0x671235cf,0x9ec285c1,0xadfba934 .long 0xf216c980,0x88ded013,0xf79e0bc1,0xc8ac4fb8,0xfb97a237,0xa29b89c6,0x9922d8e7,0xb697b780,0xddb945b5,0x3142c639,0xe094c3a9,0x447b06c7,0x72266c90,0xcdcb3642,0xa9385046,0x633aad08 .long 0xb57c6477,0xa36c936b,0xe94dbcc6,0x871f8b64,0xa591a67b,0x28d0fb62,0xc1d926f5,0x9d40e081,0xf2d84b5a,0x3111eaf6,0xa565b644,0x228993f9,0x2c83188b,0x0ccbf592,0x3df3e197,0xf87b30ab .long 0x7642bca8,0xb8658b31,0x52800f17,0x1a032d7f,0x79bf9445,0x051dcae5,0x54a2e253,0xeba6b8ee,0xd4485692,0x5c8b9cad,0x8986e9be,0x84bda40e,0x2f0db448,0xd16d16a4,0xa14d4188,0x8ec80050 .long 0x98fa7aaa,0xb2b26107,0xf073aa4e,0x41209ee4,0xf2d6b19b,0xf1570359,0xfc577caf,0xcbe6868c,0x32c04dd3,0x186c4bdc,0xcfeee397,0xa6c35fae,0xf086c0cf,0xb4a1b312,0xd9461fe2,0xe0a5ccc6 .long 0x1536189f,0xc32278aa,0xba6df571,0x1126c55f,0xb194560e,0x0f71a602,0x324bd6e1,0x8b2d7405,0x3738be71,0x8481939e,0x1a4d97a9,0xb5090b1a,0xf05ba915,0x116c65a3,0xaae448aa,0x21863ad3 .long 0xa7aae5d3,0xd24e2679,0x0de5c1c4,0x7076013d,0xbb05b629,0x2d50f8ba,0x6e66efbb,0x73c1abe2,0xf2488af7,0xefd4b422,0x663ba575,0xe4105d02,0x53a69457,0x7eb60a8b,0xc945973b,0x62210008 .long 0x77a50ec6,0xfb255478,0x0a37a72c,0xbf0392f7,0x4be18e7a,0xa0a7a19c,0x25b1e0af,0x90d8ea16,0xef953f57,0x7582a293,0xbdc5465a,0x90a64d05,0xe2510717,0xca79c497,0x18cb641f,0x560dbb7c .long 0x4b66abfb,0x1d8e3286,0x59030900,0xd26f52e5,0x5584941a,0x1ee3f643,0x569f5958,0x6d3b3730,0x4789dba5,0x9ff2a62f,0x72b5c9b7,0x91fcb815,0x6c8f9a0e,0xf446cb7d,0x39b7ecb5,0x48f625c1 .long 
0x1c6219b8,0xbabae801,0x28ac2f23,0xe7a562d9,0x26e20588,0xe1b48732,0x775af051,0x06ee1cad,0xfaff79f7,0xda29ae43,0x652ee9e0,0xc141a412,0x195f4bd0,0x1e127f6f,0x072f34f8,0x29c6ab4f .long 0x30448112,0x7b7c1477,0xe4a38656,0x82b51af1,0x2f315010,0x2bf2028a,0x6ea88cd4,0xc9a4a01f,0x257e5818,0xf63e95d8,0xb4519b16,0xdd8efa10,0x0da910bf,0xed8973e0,0x5c0fe4a9,0xed49d077 .long 0xb7caee1e,0xac3aac5e,0xa7f4da57,0x1033898d,0x5c6669b9,0x42145c0e,0xc1aa2aa0,0x42daa688,0x1a1d885a,0x629cc15c,0xf4b76817,0x25572ec0,0x9c8f8f28,0x8312e435,0x81965490,0x8107f8cd .long 0x6fa6110c,0x516ff3a3,0xfb93561f,0x74fb1eb1,0x8457522b,0x6c0c9047,0x6bb8bdc6,0xcfd32104,0xcc80ad57,0x2d6884a2,0x86a9b637,0x7c27fc35,0xadf4e8cd,0x3461baed,0x617242f0,0x1d56251a .long 0xc955bef4,0x0b80d209,0x06adb047,0xdf02cad2,0x5ec74fee,0xf0d7cb91,0x1111ba44,0xd2503375,0xdf53cb36,0x9671755e,0x3368551b,0x54dcb612,0xc8a025a4,0x66d69aac,0xe77ef445,0x6be946c6 .long 0xa995e094,0x719946d1,0xe51e04d8,0x65e848f6,0x6a1e3113,0xe62f3300,0x501de503,0x1541c7c1,0xf4acfade,0x4daac9fa,0x44cd0b71,0x0e585897,0x0a51cd77,0x544fd869,0x0031016d,0x60fc20ed .long 0xa4276867,0x58b404ec,0x34f34993,0x46f6c3cc,0xc636e5bd,0x477ca007,0x7c458b47,0x8018f5e5,0xe47b668f,0xa1202270,0xee14f203,0xcef48ccd,0x62ff9b4d,0x23f98bae,0xc589eddd,0x55acc035 .long 0x64db4444,0x3fe712af,0xbecdd480,0x19e9d634,0xa930978a,0xe08bc047,0xa1280733,0x2dbf24ec,0x2cd706b2,0x3c0ae38c,0x359017b9,0x5b012a5b,0x72e0f5ae,0x3943c38c,0x57176fa3,0x786167ea .long 0x594881dc,0xe5f9897d,0xcfb820c1,0x6b5efad8,0xd55018de,0xb2179093,0x0bac56ce,0x39ad7d32,0x2cfc0e81,0xb55122e0,0xf6d89daa,0x117c4661,0xcb64fa09,0x362d01e1,0x3e9c4ddd,0x6a309b4e .long 0xabea49b1,0xfa979fb7,0x10e2c6c5,0xb4b1d27d,0x23afde7a,0xbd61c2c4,0x9786d358,0xeb6614f8,0x7f6f7459,0x4a5d816b,0x09360e7b,0xe431a44f,0xc309914c,0x8c27a032,0xcaede3d8,0xcea5d68a .long 0x3a0a3f95,0x3668f665,0x7ceba27b,0x89369416,0xe4728fe9,0x89981fad,0x8a093562,0x7102c8a0,0x235d21c8,0xbb80310e,0xbefb7f7b,0x505e55d1,0x12958a67,0xa0a90811,0x4d851fef,0xd67e106a .long 0x431dd80e,0xb84011a9,0x73306cd9,0xeb7c7cca,0xd1b3b730,0x20fadd29,0xfe37b3d3,0x83858b5b,0xb6251d5c,0xbf4cd193,0x1352d952,0x1cca1fd3,0x90fbc051,0xc66157a4,0x89b98636,0x7990a638 .long 0x87dec0e1,0xe5aa692a,0xf7b39d00,0x010ded8d,0x54cfa0b5,0x7b1b80c8,0xa0f8ea28,0x66beb876,0x3476cd0e,0x50d7f531,0xb08d3949,0xa63d0e65,0x53479fc6,0x1a09eea9,0xf499e742,0x82ae9891 .long 0x5ca7d866,0xab58b910,0x3adb3b34,0x582967e2,0xcceac0bc,0x89ae4447,0x7bf56af5,0x919c667c,0x60f5dcd7,0x9aec17b1,0xddcaadbc,0xec697b9f,0x463467f5,0x0b98f341,0xa967132f,0xb187f1f7 .long 0x214aeb18,0x90fe7a1d,0x741432f7,0x1506af3c,0xe591a0c4,0xbb5565f9,0xb44f1bc3,0x10d41a77,0xa84bde96,0xa09d65e4,0xf20a6a1c,0x42f060d8,0xf27f9ce7,0x652a3bfd,0x3b3d739f,0xb6bdb65c .long 0xec7fae9f,0xeb5ddcb6,0xefb66e5a,0x995f2714,0x69445d52,0xdee95d8e,0x09e27620,0x1b6c2d46,0x8129d716,0x32621c31,0x0958c1aa,0xb03909f1,0x1af4af63,0x8c468ef9,0xfba5cdf6,0x162c429f .long 0x753b9371,0x2f682343,0x5f1f9cd7,0x29cab45a,0xb245db96,0x571623ab,0x3fd79999,0xc507db09,0xaf036c32,0x4e2ef652,0x05018e5c,0x86f0cc78,0xab8be350,0xc10a73d4,0x7e826327,0x6519b397 .long 0x9c053df7,0xe8cb5eef,0xb300ea6f,0x8de25b37,0xc849cffb,0xdb03fa92,0xe84169bb,0x242e43a7,0xdd6f958e,0xe4fa51f4,0xf4445a8d,0x6925a77f,0xe90d8949,0xe6e72a50,0x2b1f6390,0xc66648e3 .long 0x173e460c,0xb2ab1957,0x30704590,0x1bbbce75,0xdb1c7162,0xc0a90dbd,0x15cdd65d,0x505e399e,0x57797ab7,0x68434dcb,0x6a2ca8e8,0x60ad35ba,0xde3336c1,0x4bfdb1e0,0xd8b39015,0xbbef99eb .long 
0x1711ebec,0x6c3b96f3,0xce98fdc4,0x2da40f1f,0x57b4411f,0xb99774d3,0x15b65bb6,0x87c8bdf4,0xc2eef12d,0xda3a89e3,0x3c7471f3,0xde95bb9b,0xd812c594,0x600f225b,0x2b75a56b,0x54907c5d .long 0x8db60e35,0xa93cc5f0,0xfa833319,0x743e3cd6,0xf81683c9,0x7dad5c41,0x9c34107e,0x70c1e7d9,0xa6be0907,0x0edc4a39,0x86d0b7d3,0x36d47035,0x272bfa60,0x8c76da03,0x0f08a414,0x0b4a07ea .long 0x45c1dd53,0x699e4d29,0x231debb5,0xcadc5898,0xa77f00e0,0xdf49fcc7,0xa73e5a0e,0x93057bbf,0x027a4cd1,0x2f8b7ecd,0xc614011a,0x114734b3,0x67677c68,0xe7a01db7,0x7e273f4f,0x89d9be5e .long 0x089808ef,0xd225cb2e,0xd59e4107,0xf1f7a27d,0x8211b9c9,0x53afc761,0xe6819159,0x0361bc67,0x7f071426,0x2a865d0b,0xe7072567,0x6a3c1810,0x0d6bcabd,0x3e3bca1e,0x408591bc,0xa1b02bc1 .long 0x31fba239,0xe0deee59,0x98bd91d1,0xf47424d3,0x071a3c1d,0x0f8886f4,0xa819233b,0x3f7d41e8,0xcf6eb998,0x708623c2,0x609a287f,0x86bb49af,0x63c90762,0x942bb249,0x55a9654b,0x0ef6eea5 .long 0x36f5defe,0x5f6d2d72,0x56f99176,0xfa9922dc,0xf78ce0c7,0x6c8c5ece,0xbe09b55e,0x7b44589d,0x9ea83770,0xe11b3bca,0x2ab71547,0xd7fa2c7f,0x2a1ddcc0,0x2a3dd6fa,0x5a7b7707,0x09acb430 .long 0x649d4e57,0x4add4a2e,0x1917526e,0xcd53a2b0,0x20b44ac4,0xc5262330,0xbaa2c31d,0x4028746a,0x64291d4c,0x51318390,0xee5ad909,0xbf48f151,0x7b185681,0xcce57f59,0x4854d442,0x7c3ac1b0 .long 0xc093c171,0x65587dc3,0x24f42b65,0xae7acb24,0x955996cb,0x5a338adb,0x6051f91b,0xc8e65675,0x28b8d0b1,0x66711fba,0xb6c10a90,0x15d74137,0x3a232a80,0x70cdd7eb,0x6191ed24,0xc9e2f07f .long 0xf79588c0,0xa80d1db6,0xb55768cc,0xfa52fc69,0x7f54438a,0x0b4df1ae,0xf9b46a4f,0x0cadd1a7,0x1803dd6f,0xb40ea6b3,0x55eaae35,0x488e4fa5,0x382e4e16,0x9f047d55,0x2f6e0c98,0xc9b5b7e0 .long 0x95762649,0x6b1bd2d3,0xc7aea3f6,0xa9604ee7,0x6dc6f896,0x3646ff27,0x2860bad1,0x9bf0e7f5,0x7cb44b92,0x2d92c821,0xaea9c182,0xa2f5ce63,0x9154a5fd,0xd0a2afb1,0x95801da6,0x482e474c .long 0xb611c24b,0xc19972d0,0x60a8f351,0x1d468e65,0x7bcf6421,0xeb758069,0x88fbc491,0xec9dd0ee,0x956c2e32,0x5b59d2bf,0xdcddf94e,0x73dc6864,0xbcee7665,0xfd5e2321,0x5e9a06c4,0xa7b4f8ef .long 0x7280f855,0xfba918dd,0x8baec688,0xbbaac260,0x33400f42,0xa3b3f00f,0x66f2e6e4,0x3d2dba29,0x98509375,0xb6f71a94,0xcea423cc,0x8f33031f,0x4807e6fb,0x009b8dd0,0x5cdb954c,0x5163cfe5 .long 0xcf41c6e8,0x03cc8f17,0x037b925c,0xf1f03c2a,0x66d2427c,0xc39c19cc,0x7b6c18e4,0x823d24ba,0x901f0b4f,0x32ef9013,0xf8941c2e,0x684360f1,0x2c28092e,0x0ebaff52,0x256c932f,0x7891e4e3 .long 0xac445e3d,0x51264319,0x8ea74381,0x553432e7,0x67e9c50a,0xe6eeaa69,0x62e628c7,0x27ced284,0x7a4afa57,0x3f96d375,0xe484c150,0xde0a14c3,0x38bd9923,0x364a24eb,0xe5177422,0x1df18da0 .long 0xd8d38a9b,0x174e8f82,0xe7de1391,0x2e97c600,0xa1c175dd,0xc5709850,0x32ae5035,0x969041a0,0x76a2086b,0xcbfd533b,0xd7c2e8fe,0xd6bba71b,0x099dfb67,0xb2d58ee6,0x064a85d9,0x3a8b342d .long 0x522f9be3,0x3bc07649,0xdf1f49a8,0x690c075b,0x3854ec42,0x80e1aee8,0x17689dc7,0x2a7dbf44,0x3faf4078,0xc004fc0e,0xdf11862c,0xb2f02e9e,0xa0a1b7b3,0xf10a5e0f,0x8936ec80,0x30aca623 .long 0x02f40d9a,0xf83cbf05,0x2c318a4d,0x4681c468,0x0e9c2674,0x98575618,0x1847092e,0xbe79d046,0x78bd01e0,0xaf1e480a,0x72a51db9,0x6dd359e4,0xe3afbab6,0x62ce3821,0x17733199,0xc5cee5b6 .long 0x6ffd9fbb,0xe08b30d4,0x36c610b7,0x6e5bc699,0x9ce262cf,0xf343cff2,0x68b914c1,0xca2e4e35,0x16de36c5,0x011d64c0,0x42e2b829,0xe0b10fdd,0x6685aaf8,0x78942981,0x230ede97,0xe7511708 .long 0x3b922bf8,0x671ed8fc,0x4c29b133,0xe4d8c0a0,0x3b6e99c4,0x87eb1239,0x8793beba,0xaff3974c,0x2c18df9b,0x03749405,0x91007139,0xc5c3a293,0xe37a0b95,0x6a77234f,0xb661c96b,0x02c29a21 .long 
0x141ecf61,0xc3aaf1d6,0x3bb22f53,0x9195509e,0x22d51357,0x29597404,0x537bed60,0x1b083822,0xe07289f0,0xcd7d6e35,0x6dd86eff,0x1f94c48c,0xeb0f9cfa,0xc8bb1f82,0x1b2eb97d,0x9ee0b7e6 .long 0x34d74e31,0x5a52fe2e,0x3bf79ab6,0xa352c310,0xabfeeb8f,0x97ff6c5a,0xf5c97305,0xbfbe8fef,0xa7904608,0xd6081ce6,0xc4fca249,0x1f812f3a,0xb9e5e200,0x9b24bc9a,0x38012ee8,0x91022c67 .long 0x30a713a1,0xe83d9c5d,0x84ef0f93,0x4876e3f0,0xc1fbf928,0xc9777029,0xbce7d2a4,0xef7a6bb3,0xdfa2a659,0xb8067228,0xd877a48f,0xd5cd3398,0x025d0f3f,0xbea4fd8f,0x2eae7c2b,0xd67d2e35 .long 0xcc5f4394,0x184de7d7,0x4536e142,0xb5551b5c,0xd34aa60a,0x2e89b212,0xf50051d5,0x14a96fea,0x0d12bb0b,0x4e21ef74,0x60b9677e,0xc522f020,0x2df7731d,0x8b12e467,0x7b326d31,0x39f80382 .long 0x39024a94,0xdfb8630c,0x97319452,0xaacb96a8,0xeda3867c,0xd68a3961,0x77c4ffca,0x0c58e2b0,0x4da919fa,0x3d545d63,0xf15e2289,0xef79b69a,0x808bab10,0x54bc3d3d,0x45f82c37,0xc8ab3007 .long 0x7c4a658a,0xc12738b6,0x40e72182,0xb3c47639,0x8798e44f,0x3b77be46,0x17a7f85f,0xdc047df2,0x5e59d92d,0x2439d4c5,0xe8e64d8d,0xcedca475,0x87ca9b16,0xa724cd0d,0xa5540dfe,0x35e4fd59 .long 0xe4bcf6b1,0xf8c1ff18,0x295018fa,0x856d6285,0x3263c949,0x433f665c,0xa1f21409,0xa6a76dd6,0xcc7b4f79,0x17d32334,0x06720e4a,0xa1d03122,0x81d9bed5,0xadb6661d,0x11db15d1,0xf0d6fb02 .long 0x1fb747d2,0x7fd11ad5,0x3033762b,0xab50f959,0xfbefaf5a,0x2a7e711b,0x3fef2bbf,0xc7393278,0x0df6f9be,0xe29fa244,0x71efd215,0x9092757b,0x4f3d6fd9,0xee60e311,0x0acfb78b,0x338542d4 .long 0x38961a0f,0x44a23f08,0x986987ca,0x1426eade,0x4a863cc6,0x36e6ee2e,0x628b8b79,0x48059420,0x7396e1de,0x30303ad8,0x38c5aad1,0x5c8bdc48,0x5c8f5066,0x3e40e11f,0x8d246bbd,0xabd6e768 .long 0x23330a01,0x68aa40bb,0xc34eafa0,0xd23f5ee4,0x5de02c21,0x3bbee315,0xd1d8dd06,0x18dd4397,0x122d7b44,0x3ba1939a,0xa33870d6,0xe6d3b40a,0x1c4fe3f8,0x8e620f70,0xd3a50cbf,0xf6bba1a5 .long 0xcfc0aee0,0x4a78bde5,0xc08c50bd,0x847edc46,0xad63c9b2,0xbaa2439c,0x10fc2acb,0xceb4a728,0x26da033d,0xa419e40e,0x03e02683,0x6cc3889d,0xfdccf725,0x1cd28559,0x8d13d208,0x0fd7e0f1 .long 0x1f0df9d4,0x01b9733b,0xa2b5e4f3,0x8cc2c5f3,0x3a304fd4,0x43053bfa,0x0a9f1aa7,0x8e87665c,0xd73dc965,0x087f29ec,0x3e9023db,0x15ace455,0x2bce28b4,0x2370e309,0xb6b1e84a,0xf9723442 .long 0xb72d9f26,0xbeee662e,0xf0e47109,0xb19396de,0xe13289d0,0x85b1fa73,0x54e58e32,0x436cf77e,0xe990ef77,0x0ec833b3,0x1b11fc25,0x7373e3ed,0x0fc332ce,0xbe0eda87,0x8d7ea856,0xced04970 .long 0x7e977ca0,0xf85ff785,0xdfdd5d2b,0xb66ee8da,0x905af461,0xf5e37950,0x966d487c,0x587b9090,0x32ba0127,0x6a198a1b,0x141615ac,0xa7720e07,0x996ef2f2,0xa23f3499,0x470bcb3d,0xef5f64b4 .long 0x92b8c559,0xa526a962,0x69740a0f,0x0c14aac0,0xa6bdc0a5,0x0d41a9e3,0x9c48aef4,0x97d52106,0x3e7c253b,0xcf16bd30,0x47fdedc1,0xcc834b1a,0x373aab2e,0x7362c6e5,0xc5f590ff,0x264ed85e .long 0x66d41870,0x7a46d9c0,0x4787ba09,0xa50c20b1,0xe3d44635,0x185e7e51,0x31e2d8dc,0xb3b3e080,0xa179e9d9,0xbed1e558,0x74a76781,0x2daa3f79,0x3a40864f,0x4372baf2,0x4fe75cb5,0x46900c54 .long 0xf76765d0,0xb95f171e,0x95c87502,0x4ad726d2,0x4d7c99bd,0x2ec769da,0xc36cdfa8,0x5e2ddd19,0xa93e6dea,0xc22117fc,0x93771123,0xe8a2583b,0xfa08a3a2,0xbe2f6089,0x8f0e1112,0x4809d5ed .long 0xda7a095e,0x3b414aa3,0x26f5aadd,0x9049acf1,0x6be8b84a,0x78d46a4d,0xb732b9b3,0xd66b1963,0xde6e9555,0x5c2ac2a0,0xb5bd8770,0xcf52d098,0x0fd28921,0x15a15fa6,0x8b27536d,0x56ccb81e .long 0x9f4ccbb8,0x0f0d8ab8,0xdb221729,0xed5f44d2,0x00bed10c,0x43141988,0x1d735b8b,0xc94348a4,0x29ef8479,0x79f3e9c4,0x614c693f,0x4c13a4e3,0x8e143a14,0x32c9af56,0xe29ac5c4,0xbc517799 .long 
0x2774856f,0x05e17992,0x6c1bf55f,0x6e52fb05,0xe4f19e16,0xaeda4225,0xaf5ccb26,0x70f4728a,0xb2947f22,0x5d2118d1,0x281d6fb9,0xc827ea16,0x8cf0eabd,0x8412328d,0x03ef9dcf,0x45ee9fb2 .long 0xbb937d63,0x8e700421,0xcc4b37a6,0xdf8ff2d5,0x5ced7b68,0xa4c0d5b2,0xc7308f59,0x6537c1ef,0x3b37f8e8,0x25ce6a26,0xdeebc6ce,0x170e9a9b,0x8728d72c,0xdd037952,0x850154bc,0x445b0e55 .long 0x83a7337b,0x4b7d0e06,0xffecf249,0x1e3416d4,0x66a2b71f,0x24840eff,0xb37cc26d,0xd0d9a50a,0x6fe28ef7,0xe2198150,0x23324c7f,0x3cc5ef16,0x769b5263,0x220f3455,0xa10bf475,0xe2ade2f1 .long 0x458d3671,0x28cd20fa,0x2dc4847b,0x1549722c,0x591941e3,0x6dd01e55,0x27128ccb,0x0e6fbcea,0x3bef0262,0xae1a1e6b,0x8f54e103,0xfa8c472c,0x72c052ec,0x7539c0a8,0x5a3490e9,0xd7b27369 .long 0x71684349,0x143fe1f1,0x32e19b97,0x36b4722e,0x90980aff,0xdc059227,0x9e13d674,0x175c9c88,0x6e6bfdb1,0xa7de5b22,0xbedb4b46,0x5ea5b7b2,0xd34a6e44,0xd5570191,0xa24ff7e6,0xfcf60d2e .long 0x677819e1,0x614a392d,0xaa5a29e8,0x7be74c7e,0x63c85f3f,0xab50fece,0x46cab337,0xaca2e2a9,0x122a6fe3,0x7f700388,0x882a04a8,0xdb69f703,0xcf7aed57,0x9a77935d,0x8d91c86f,0xdf16207c .long 0x63ed9998,0x2fca49ab,0xa77ddf96,0xa3125c44,0x24344072,0x05dd8a86,0xfec3fb56,0xa023dda2,0x0c743032,0x421b41fc,0x5e438639,0x4f2120c1,0xc83c1b07,0xfb7cae51,0xcac2171a,0xb2370caa .long 0x6cc820fb,0x2eb2d962,0xb85a44bf,0x59feee5c,0x5b6598f0,0x94620fca,0x7e314051,0x6b922cae,0x106bed4e,0xff8745ad,0xdfa1e9ab,0x546e71f5,0x1ec29487,0x935c1e48,0x4d936530,0x9509216c .long 0x85c9a2db,0xc7ca3067,0x6be8606f,0xd6ae5152,0xe14c651d,0x09dbcae6,0x9bc32f96,0xc9536e23,0x34521b03,0xa90535a9,0x878756ff,0xf39c526c,0x8aedf03c,0x383172ec,0xefe0c034,0x20a8075e .long 0x64026422,0xf22f9c62,0x24b9d076,0x8dd10780,0x3bef2950,0x944c742a,0x88a2b00b,0x55b9502e,0x86a09817,0xa59e14b4,0x47bb4071,0xa39dd3ac,0x3be0592f,0x55137f66,0xc9e63f5b,0x07fcafd4 .long 0x346eb226,0x963652ee,0xec2facb7,0x7dfab085,0x691add26,0x273bf2b8,0xf2b46c44,0x30d74540,0xf2c2d065,0x05e8e73e,0xd42eeac9,0xff9b8a00,0x97209d22,0x2fcbd205,0xde14ea2c,0xeb740ffa .long 0xa8aef518,0xc71ff913,0xfff4cfa2,0x7bfc74bb,0xb6b36048,0x1716680c,0x9ef79af1,0x121b2cce,0xa01eb3d3,0xbff3c836,0x5f79077b,0x50eb1c6a,0xa004bbcf,0xa48c32d6,0x7d64f61d,0x47a59316 .long 0x93102016,0x6068147f,0x94d12576,0x12c5f654,0xc9bc6b91,0xefb071a7,0x6e23ea95,0x7c2da0c5,0xd4a1dd5d,0xf4fd45b6,0x9122b13c,0x3e7ad9b6,0xe6f57a48,0x342ca118,0x06f8288f,0x1c2e94a7 .long 0x5a97d231,0x99e68f07,0x4d838758,0x7c80de97,0x05872727,0xbce0f5d0,0x19c4d016,0xbe5d95c2,0x9c2492ee,0x921d5cb1,0x404d6fb3,0x42192dc1,0x32f988d3,0x4c84dcd1,0xa17b8e85,0xde26d61f .long 0x137c7408,0xc466dcb6,0x36a266da,0x9a38d7b6,0x83bebf1b,0x7ef5cb06,0x0fd014e3,0xe5cdcbbf,0xf65965a0,0x30aa376d,0xebb3e95e,0x60fe88c2,0x66ee6f20,0x33fd0b61,0x3f41f0a0,0x8827dcdb .long 0x0c56c690,0xbf8a9d24,0xddb7641d,0x40265dad,0x3a6b662b,0x522b05bf,0xb1478c9b,0x466d1dfe,0x1484469b,0xaa616962,0x02df8f9f,0x0db60549,0x3cb8bf51,0xc37bca02,0x21371ce8,0x5effe346 .long 0xff112c32,0xe8f65264,0x7b971fb2,0x8a9c736d,0x7b75080d,0xa4f19470,0x8839c59b,0xfc3f2c5a,0x5aeb49c2,0x1d6c777e,0xda1addfe,0xf3db034d,0x5535affc,0xd76fee5a,0xb92251fd,0x0853ac70 .long 0x8b2a29d5,0x37e3d594,0x4de00ddb,0x28f1f457,0xf42c328b,0x8083c1b5,0xe493c73b,0xd8ef1d8f,0x41dc61bd,0x96fb6260,0x27ee2f8a,0xf74e8a9d,0x2c946a5d,0x7c605a80,0x3839ccfd,0xeed48d65 .long 0x3a29467a,0x9894344f,0xc51eba6d,0xde81e949,0xa5e5c2f2,0xdaea066b,0x08c8c7b3,0x3fc8a614,0x06d0de9f,0x7adff88f,0x3b75ce0a,0xbbc11cf5,0xfbbc87d5,0x9fbb7acc,0x7badfde2,0xa1458e26 .long 
0xe039c256,0x1cb43668,0x7c17fd5d,0x5f26fb8b,0x79aa062b,0xeee426af,0xd78fbf04,0x072002d0,0xe84fb7e3,0x4c9ca237,0x0c82133d,0xb401d8a1,0x6d7e4181,0xaaa52592,0x73dbb152,0xe9430833 .long 0xbe24319a,0xf92dda31,0xe095a8e7,0x03f7d28b,0x98782185,0xa52fe840,0x29c24dbc,0x276ddafe,0x1d7a64eb,0x80cd5496,0x7f1dbe42,0xe4360889,0x8438d2d5,0x2f81a877,0x85169036,0x7e4d52a8 .long 0x1d59715d,0x19e3d5b1,0xd788983e,0xc7eaa762,0xabf1f248,0xe5a730b0,0xfae3fd83,0xfbab8084,0x53765b2f,0x65e50d21,0xfa127f3d,0xbdd4e083,0x397b1b10,0x9cf3c074,0xb1b59fd3,0x59f8090c .long 0x615faa8f,0x7b15fd9d,0x968554ed,0x8fa1eb40,0x7aa44882,0x7bb4447e,0x029fff32,0x2bb2d0d1,0x6caa6d2f,0x075e2a64,0x22e7351b,0x8eb879de,0x9a506c62,0xbcd5624e,0xa87e24dc,0x218eaef0 .long 0x44ddfa35,0x37e56847,0xdab3f747,0x9ccfc5c5,0x1ee96cf4,0x9ac1df3f,0x3b480b8f,0x0c0571a1,0x4b3a7b3c,0x2fbeb3d5,0x5dcdbb99,0x35c03669,0xb2415b3a,0x52a0f5dc,0x4413ed9a,0xd57759b4 .long 0x3d30a2c5,0x1fe647d8,0xf78a81dc,0x0857f77e,0x131a4a9b,0x11d5a334,0x29d393f5,0xc0a94af9,0xdaa6ec1a,0xbc3a5c0b,0x88d2d7ed,0xba9fe493,0xbb614797,0xbb4335b4,0x72f83533,0x991c4d68 .long 0xd2f01cb3,0x53258c28,0xd75db0b1,0x93d6eaa3,0xe87d0db4,0x419a2b0d,0xd8fe8493,0xa1e48f03,0xc508b23a,0xf747faf6,0x35d53549,0xf137571a,0xfcf9b838,0x9f5e58e2,0xa7fd3cf5,0xc7186cee .long 0xe978a1d3,0x77b868ce,0x7ab92d04,0xe3a68b33,0x87a5b862,0x51029794,0x3a61d41d,0x5f0606c3,0x6f9326f1,0x2814be27,0xc6fe3c2e,0x2f521c14,0xacdf7351,0x17464d7d,0x777f7e44,0x10f5f9d3 .long 0x269fb37d,0xce8e616b,0x7de62de5,0xaaf73804,0x4fdd4153,0xaba11175,0x3770b49b,0x515759ba,0xaa423a61,0x8b09ebf8,0xcd41fb92,0x592245a1,0x9b4c8936,0x1cba8ec1,0xaf36710e,0xa87e91e3 .long 0x3d34a2e3,0x1fd84ce4,0xb43b5d61,0xee3759ce,0x619186c7,0x895bc78c,0xcbb9725a,0xf19c3809,0xde744b1f,0xc0be21aa,0x60f8056b,0xa7d222b0,0xb23efe11,0x74be6157,0x0cd68253,0x6fab2b4f .long 0x4bf1d725,0xad33ea5f,0x4f6c950f,0x9c1d8ee2,0xa377af06,0x544ee78a,0x94a113e1,0x54f489bb,0x992fb7e8,0x8f11d634,0xa2a44347,0x0169a7aa,0x95020e00,0x1d49d4af,0xe08e120b,0x95945722 .long 0xa4d32282,0xb6e33878,0x48020ae7,0xe36e029d,0x37a9b750,0xe05847fb,0xb29e3819,0xf876812c,0xd23a17f0,0x84ad138e,0xf0b3950e,0x6d7b4480,0x2fd67ae0,0xdfa8aef4,0x52333af6,0x8d3eea24 .long 0xb15d5acc,0x0d052075,0xbd815bc4,0xc6d9c79f,0xdfa36cf2,0x8dcafd88,0x38aa9070,0x908ccbe2,0xba35afce,0x638722c4,0xfd6abf0b,0x5a3da8b0,0xc9c335c1,0x2dce252c,0x65aa799b,0x84e7f0de .long 0xb99a72cb,0x2101a522,0x87618016,0x06de6e67,0xe6f3653e,0x5ff8c7cd,0xc7a6754a,0x0a821ab5,0x7cb0b5a2,0x7e3fa52b,0xc9048790,0xa7fb121c,0x06ce053a,0x1a725020,0x04e929b0,0xb490a31f .long 0x62dd61ad,0xe17be47d,0x6be01371,0x781a961c,0xdae3cbba,0x1063bfd3,0x7f73c9ba,0x35647406,0x2736a129,0xf50e957b,0xed13f256,0xa6313702,0x3a19fcc5,0x9436ee65,0xe7a4c8b6,0xcf2bdb29 .long 0xc5f95cd8,0xb06b1244,0xf4ab95f4,0xda8c8af0,0xb9e5836d,0x1bae59c2,0x3acffffc,0x07d51e7e,0xc2ccbcda,0x01e15e6a,0x8528c3e0,0x3bc1923f,0xa49fead4,0x43324577,0x2aa7a711,0x61a1b884 .long 0x700230ef,0xf9a86e08,0xbd19adf8,0x0af585a1,0xf55ad8f2,0x7645f361,0x46c3614c,0x6e676223,0x4e774d3f,0x23cb257c,0xac102d1b,0x82a38513,0x7b126aa5,0x9bcddd88,0xeefd3ee4,0xe716998b .long 0xfb167583,0x4239d571,0xd16c8f8a,0xdd011c78,0x69a27519,0x271c2895,0xd2d64b6a,0x9ce0a3b7,0xd5ec6738,0x8c977289,0x8840ef6b,0xa3b49f9a,0x9a453419,0x808c14c9,0x0cf0a2d5,0x5c00295b .long 0x1d4bcc76,0x524414fb,0x459a88f1,0xb07691d2,0xf70d110f,0x77f43263,0xb7abf9f3,0x64ada5e0,0x5b544cf5,0xafd0f94e,0xfd2713fe,0xb4a13a15,0x250c74f4,0xb99b7d6e,0x20324e45,0x097f2f73 .long 
0xaffa8208,0x994b37d8,0xdc29aafc,0xc3c31b0b,0x7a3a607f,0x3da74651,0xfe6955d6,0xd8e1b8c1,0xc8418682,0x716e1815,0x7dc91d97,0x541d487f,0xc6996982,0x48a04669,0x83a6502e,0xf39cab15 .long 0xe68db055,0x025801a0,0xba3338d5,0xf3569758,0xee2afa84,0xb0c8c0aa,0xfb6562d1,0x4f6985d3,0x132ed17a,0x351f1f15,0xc04365fe,0x510ed0b4,0xe5b1f066,0xa3f98138,0x32df03dc,0xbc9d95d6 .long 0x19abd09e,0xa83ccf6e,0x4ff17edb,0x0b4097c1,0xd64a06ce,0x58a5c478,0x544a58fd,0x2ddcc3fd,0x9e8153b8,0xd449503d,0x7774179b,0x3324fd02,0xdbd9120c,0xaf5d47c8,0x34fa94db,0xeb860162 .long 0x972f07f4,0x5817bdd1,0xd27bbceb,0xe5579e2e,0x5f11e5a6,0x86847a1f,0x7c3cf048,0xb39ed255,0xa2f62e55,0xe1076417,0x1bcf82a2,0x6b9ab38f,0x7aeb29f9,0x4bb7c319,0x17227a46,0xf6d17da3 .long 0x0f968c00,0xab53ddbd,0x000c880b,0xa03da7ec,0x6a9ad24d,0x7b239624,0x01ec60d0,0x612c0401,0x109f5df1,0x70d10493,0x80af7550,0xfbda4030,0xc6b9a9b3,0x30b93f95,0x007d9418,0x0c74ec71 .long 0x6edb951f,0x94175564,0x7f22c282,0x5f4a9d78,0xb38d1196,0xb7870895,0xa228ce7c,0xbc593df3,0x6af3641a,0xc78c5bd4,0x3d9b3dcc,0x7802200b,0x8be33304,0x0dc73f32,0x61ffb79a,0x847ed87d .long 0x6d671192,0xf85c974e,0xde16f60f,0x1e14100a,0x95c38797,0x45cb0d5a,0x9b022da4,0x18923bba,0xbbe7e86e,0xef2be899,0x216067bf,0x4a1510ee,0x84d5ce3e,0xd98c8154,0xf92a2b90,0x1af777f0 .long 0x4ef65724,0x9fbcb400,0x3c0ca6fe,0x3e04a4c9,0x55002994,0xfb3e2cb5,0x5363ecab,0x1f3a93c5,0x3923555b,0x1fe00efe,0x1e1751ea,0x744bedd9,0x6ab69357,0x3fb2db59,0xf5e6618b,0x8dbd7365 .long 0xdf1ea40e,0x99d53099,0x57d61e64,0xb3f24a0b,0x596eb812,0xd088a198,0x5762940b,0x22c8361b,0xf9c0d95c,0x66f01f97,0x8e43cdae,0x88461172,0xb72b15c3,0x11599a7f,0x420d95cc,0x135a7536 .long 0x5f7ae2f6,0x2dcdf0f7,0xd7fa6da2,0x15fc6e1d,0xd1d441b6,0x81ca829a,0x04a106b6,0x84c10cf8,0xa73fbbd0,0xa9b26c95,0x4d8f6ee8,0x7f24e0cb,0x1e25a043,0x48b45937,0x036f3dfe,0xf8a74fca .long 0xc9f84296,0x1ed46585,0x3bc278b0,0x7fbaa8fb,0x6c4fcbd0,0xa8e96cd4,0x73b60a5f,0x940a1202,0x55a4aec8,0x34aae120,0xdbd742f0,0x550e9a74,0x228c68ab,0x794456d7,0xa4e25ec6,0x492f8868 .long 0xb2d8f398,0x682915ad,0x5b84c953,0xf13b51cc,0x5bb917d6,0xcda90ab8,0x4ea3dee1,0x4b615560,0x0a52c1c8,0x578b4e85,0x20b75fc4,0xeab1a695,0xaa0bb3c6,0x60c14f3c,0xb8216094,0x220f448a .long 0xb0e63d34,0x4fe7ee31,0xa9e54fab,0xf4600572,0xd5e7b5a4,0xc0493334,0x06d54831,0x8589fb92,0x6583553a,0xaa70f5cc,0xe25649e5,0x0879094a,0x10044652,0xcc904507,0x02541c4f,0xebb0696d .long 0xb9718710,0x5a171fde,0xf374a9f5,0x38f1bed8,0xba39bdc1,0xc8c582e1,0x908cc0ce,0xfc457b0a,0x883841e2,0x9a187fd4,0x38725381,0x8ec25b39,0x96f84395,0x2553ed05,0x6f6c6897,0x095c7661 .long 0x4bdc5610,0x917ac85c,0x179eb301,0xb2885fe4,0x8b78bdcc,0x5fc65547,0xe59e4699,0x4a9fc893,0x3ce299af,0xbb7ff0cd,0xadf38b20,0x195be9b3,0xd38ddb8f,0x6a929c87,0xb21a51b9,0x55fcc99c .long 0x721a4593,0x2b695b4c,0x768eaac2,0xed1e9a15,0x7489f914,0xfb63d71c,0x78118910,0xf98ba31c,0x9b128eb4,0x80291373,0xd448af4a,0x7801214e,0x55418dd3,0xdbd2e22b,0xd3998242,0xeffb3c0d .long 0xc7bf3827,0xdfa6077c,0x47f8238f,0xf2165bcb,0x8564d554,0xfe37cf68,0x0a81fb98,0xe5f825c4,0xffed4d6f,0x43cc4f67,0xb50a34b0,0xbc609578,0x5041faf1,0x8aa8fcf9,0x651773b6,0x5659f053 .long 0x6044d63b,0xe87582c3,0x0cdb0ca0,0xa6089409,0xbfb2bcf6,0x8c993e0f,0x45985cfc,0xfc64a719,0x83dbedba,0x15c4da80,0x2be67df7,0x804ae112,0xa23defde,0xda4c9658,0x5156e0d3,0x12002ddd .long 0x5dd21b96,0xe68eae89,0xcf44624d,0x8b99f28b,0x1ec8897a,0x0ae00808,0x6712f76e,0xdd0a9303,0x4e233de4,0x96237522,0x2b36a8a5,0x192445b1,0x023993d9,0xabf9ff74,0x2aad4a8f,0x21f37bf4 .long 
0xf8bd2bbd,0x340a4349,0x4868195d,0x1d902cd9,0xe5fdb6f1,0x3d27bbf1,0x124f9f1c,0x7a5ab088,0xf7a09e03,0xc466ab06,0x31f2c123,0x2f8a1977,0x041b6657,0xda355dc7,0x8ece2a7c,0xcb840d12 .long 0x7db32675,0xb600ad9f,0x07a06f1b,0x78fea133,0xb31f6094,0x5d032269,0x83ec37aa,0x07753ef5,0x9c0bea78,0x03485aed,0xbc3f4524,0x41bb3989,0x697f726d,0x09403761,0xdf394820,0x6109beb3 .long 0x3b6d1145,0x804111ea,0xa8582654,0xb6271ea9,0x24e66562,0x619615e6,0xd7b6ad9c,0xa2554945,0x99bfe35f,0xd9c4985e,0x7b51cdf6,0x9770ccc0,0x92881832,0x7c327013,0x286b26d1,0x8777d45f .long 0xd847999d,0x9bbeda22,0xc3525d32,0x03aa33b6,0x28a959a1,0x4b7b96d4,0x31e5d234,0xbb3786e5,0x6961f247,0xaeb5d3ce,0x02f93d3f,0x20aa85af,0xd7a7ae4f,0x9cd1ad3d,0x781adaa8,0xbf6688f0 .long 0x7469cead,0xb1b40e86,0x309fca48,0x1904c524,0x4b54bbc7,0x9b7312af,0x593affa2,0xbe24bf8f,0xbd98764b,0xbe5e0790,0xa26e299e,0xa0f45f17,0x6b8fe4c7,0x4af0d2c2,0x8ae8a3e6,0xef170db1 .long 0x29e0ccc1,0x0e8d61a0,0x60ad36ca,0xcd53e87e,0xc8173822,0x328c6623,0xa496be55,0x7ee1767d,0x648945af,0x89f13259,0x25c8009c,0x9e45a5fd,0x1f61ab8c,0xaf2febd9,0x8a275385,0x43f6bc86 .long 0xf2142e79,0x87792348,0xc6e6238a,0x17d89259,0x4a839d9b,0x7536d2f6,0x76a1fbdc,0x1f428fce,0x0db06dfe,0x1c109601,0x50a3a3cc,0xbfc16bc1,0x9b30f41b,0xf9cbd9ec,0x00138cce,0x5b5da0d6 .long 0x56ef96a7,0xec1d0a48,0x982bf842,0xb47eb848,0xec3f700d,0x66deae32,0xaa1181e0,0x4e43c42c,0xd1a4aa2a,0xa1d72a31,0xc004f3ce,0x440d4668,0x45fe8a7a,0x0d6a2d3b,0xfb128365,0x820e52e2 .long 0x25e51b09,0x29ac5fcf,0x2023d159,0x180cd2bf,0xa1ebf90e,0xa9892171,0x7c132181,0xf97c4c87,0xc03dbb7e,0x9f1dc724,0x018cbbe4,0xae043765,0x0767d153,0xfb0b2a36,0x249cbaeb,0xa8e2f4d6 .long 0xd95ea168,0x172a5247,0x2970764a,0x1758fada,0x1d978169,0xac803a51,0xde77e01b,0x299cfe2e,0xb0a98927,0x652a1e17,0x20014495,0x2e26e1d1,0x7175b56a,0x7ae0af9f,0xd64b9f95,0xc2e22a80 .long 0xd90a060a,0x4d0ff9fb,0xbaf38085,0x496a27db,0xda776bcf,0x32305401,0x725f209e,0xb8cdcef6,0x436a0bba,0x61ba0f37,0x76860049,0x263fa108,0xda3542cf,0x92beb98e,0xd5849538,0xa2d4d14a .long 0x12e9a1bc,0x989b9d68,0x5f6e3268,0x61d9075c,0x99ace638,0x352c6aa9,0x920f43ff,0xde4e4a55,0xd673c017,0xe5e4144a,0x6f6e05ea,0x667417ae,0xdcd1bd56,0x613416ae,0x86693711,0x5eb36201 .long 0x3a1aa914,0x2d7bc504,0x76dc5975,0x175a1299,0x3fc8125c,0xe900e0f2,0x11198875,0x569ef68c,0x63a113b4,0x9012db63,0x98835766,0xe3bd3f56,0x76412dea,0xa5c94a52,0xaa735e5c,0xad9e2a09 .long 0x508b65e9,0x405a984c,0x6df1a0d1,0xbde4a1d1,0xdfba80da,0x1a9433a1,0x9440ad2e,0xe9192ff9,0x5099fe92,0x9f649696,0x0b27a54a,0x25ddb65c,0xc590da61,0x178279dd,0xfbde681a,0x5479a999 .long 0x013fe162,0xd0e84e05,0x632d471b,0xbe11dc92,0xfc0e089f,0xdf0b0c45,0x4c144025,0x04fb15b0,0x13c99927,0xa61d5fc2,0x3de2eb35,0xa033e9e0,0xb8dacbb4,0xf8185d5c,0x8644549d,0x9a88e265 .long 0x54671ff6,0xf717af62,0x5fa58603,0x4bd4241b,0xe67773c0,0x06fba40b,0x6a2847e9,0xc1d933d2,0x689e2c70,0xf4f5acf3,0x46bafd31,0x92aab0e7,0x3473f6e5,0x798d76aa,0x93141934,0xcc6641db .long 0xd31e535e,0xcae27757,0x87c2ee11,0x04cc43b6,0x2e029ffa,0x8d1f9675,0xe4cc7a2c,0xc2150672,0x8d68b013,0x3b03c1e0,0xedf298f3,0xa9d6816f,0xa2804464,0x1bfbb529,0x5db22125,0x95a52fae .long 0x0e1cb64e,0x55b32160,0x7e7fc9fe,0x004828f6,0x1bb0fb93,0x13394b82,0x35f1a920,0xb6293a2d,0xd145d2d9,0xde35ef21,0xbb8fa603,0xbe6225b3,0x32cf252d,0x00fc8f6b,0x117cf8c2,0xa28e52e6 .long 0x4c371e6d,0x9d1dc89b,0x36ef0f28,0xcebe0675,0xa4292f81,0x5de05d09,0x353e3083,0xa8303593,0x7e37a9bb,0xa1715b0a,0x2b8faec3,0x8c56f61e,0x33c9b102,0x52507431,0xa44431f0,0x0130cefc .long 
0xbd865cfb,0x56039fa0,0xbc5f1dd7,0x4b03e578,0xbabe7224,0x40edf2e4,0x3a1988f6,0xc752496d,0x564beb6b,0xd1572d3b,0x39a1c608,0x0db1d110,0x16f60126,0x568d1934,0xf354af33,0x05ae9668 .long 0xc92544f2,0x19de6d37,0xa35837d5,0xcc084353,0x1a514ece,0xcbb6869c,0x2e1d1066,0xb633e728,0x936c581c,0xf15dd69f,0x7439c4f9,0x96e7b8ce,0x2e448a5b,0x5e676f48,0xfd916bbb,0xb2ca7d5b .long 0xf5024025,0xd55a2541,0xe4c2d937,0x47bc5769,0x0362189f,0x7d31b92a,0xef7816f9,0x83f3086e,0xb587579a,0xf9f46d94,0x30e76c5f,0xec2d22d8,0xb000ffcf,0x27d57461,0x364ffc2c,0xbb7e65f9 .long 0x6652a220,0x7c7c9477,0xd696c981,0x61618f89,0x89effff3,0x5021701d,0x7c314163,0xf2c8ff8e,0x8efb4d3e,0x2da413ad,0xce176d95,0x937b5adf,0x2a67d51c,0x22867d34,0x18eb3ac9,0x262b9b10 .long 0xc43ff28b,0x4e314fe4,0x6a664e7a,0x76476627,0xb7a565c2,0x3e90e40b,0xc1acf831,0x8588993a,0x8f938829,0xd7b501d6,0x3edd7d4c,0x996627ee,0x90cd34c7,0x37d44a62,0xf3833e8d,0xa8327499 .long 0x4bf50353,0x2e18917d,0x556765fb,0x85dd726b,0x93d5ab66,0x54fe65d6,0x915c25fe,0x3ddbaced,0x12f22e85,0xa799d9a4,0x6d06f6bc,0xe2a24867,0x43ca1637,0xf4f1ee56,0x61ece30a,0xfda2828b .long 0xa2dee7a6,0x758c1a3e,0x734b2284,0xdcde2f3c,0x4eaba6ad,0xaba445d2,0x76cee0a7,0x35aaf668,0xe5aa049a,0x7e0b04a9,0x91103e84,0xe74083ad,0x40afecc3,0xbeb183ce,0xea043f7a,0x6b89de9f .long 0xfe67ba66,0x0e299d23,0x93cf2f34,0x91450760,0x97fcf913,0xf45b5ea9,0x8bd7ddda,0x5be00843,0xd53ff04d,0x358c3e05,0x5de91ef7,0xbf7ccdc3,0xb69ec1a0,0xad684dbf,0x801fd997,0x367e7cf2 .long 0xb0dc8595,0x0ca1f3b7,0x9f1d9f2e,0x27de4608,0xbadd82a7,0x1af3bf39,0x65862448,0x79356a79,0xf5f9a052,0xc0602345,0x139a42f9,0x1a8b0f89,0x844d40fc,0xb53eee42,0x4e5b6368,0x93b0bfe5 .long 0xc024789c,0x5434dd02,0x41b57bfc,0x90dca9ea,0x243398df,0x8aa898e2,0x894a94bb,0xf607c834,0xc2c99b76,0xbb07be97,0x18c29302,0x6576ba67,0xe703a88c,0x3d79efcc,0xb6a0d106,0xf259ced7 .long 0xc8de610b,0x0f893a5d,0x67e223ce,0xe8c515fb,0x4ead6dc5,0x7774bfa6,0x925c728f,0x89d20f95,0x098583ce,0x7a1e0966,0x93f2a7d7,0xa2eedb94,0x4c304d4a,0x1b282097,0xc077282d,0x0842e3da .long 0x3b9e2d7b,0xe4d972a3,0xc48218ff,0x7cc60b27,0x84149d91,0x8fc70838,0x2f461ecc,0x5c04346f,0x614650a9,0xebe9fdf2,0xc1f666ac,0x5e35b537,0x88babc83,0x645613d1,0xc5e1c93e,0x88cace3a .long 0x3de92e23,0x209ca375,0x5fbbb6e3,0xccb03cc8,0xd7b1487e,0xccb90f03,0xc710941f,0xfa9c2a38,0x6724ceed,0x756c3823,0x192d0323,0x3a902258,0xea5e038e,0xb150e519,0xc7427591,0xdcba2865 .long 0x78890732,0xe549237f,0x53fcb4d9,0xc443bef9,0xeb3480d6,0x9884d8a6,0x3048b186,0x8a35b6a1,0x65e9a90a,0xb4e44716,0x653006c0,0x45bf380d,0x4fe9ae3b,0x8f3f820d,0x979a3b71,0x244a35a0 .long 0x74cd06ff,0xa1010e9d,0xaca3eeac,0x9c17c7df,0x8063aa2b,0x74c86cd3,0x734614ff,0x8595c4b3,0x990f62cc,0xa3de00ca,0xca0c3be5,0xd9bed213,0xdf8ce9f5,0x7886078a,0x5cd44444,0xddb27ce3 .long 0x58926ddd,0xed374a66,0x908015b8,0x138b2d49,0xde1f7ab8,0x886c6579,0xc3020b7a,0x888b9aa0,0x3a96e355,0xd3ec034e,0xf30fbe9a,0xba65b0b8,0xff21367a,0x064c8e50,0x0b04b46e,0x1f508ea4 .long 0x747c866c,0x98561a49,0x0518a062,0xbbb1e5fe,0xecdc3608,0x20ff4e8b,0x20184027,0x7f55cded,0xf38c85f0,0x8d73ec95,0x8bc3b8c3,0x5b589fdf,0x0f12b66f,0xbe95dd98,0x0e338e01,0xf5bd1a09 .long 0x5e915918,0x65163ae5,0x86f8a46b,0x6158d6d9,0xeeebf99c,0x8466b538,0xbca477ef,0xca8761f6,0x9ebbc601,0xaf3449c2,0xe0c3ae2f,0xef3b0f41,0x5de63752,0xaa6c577d,0x64682a51,0xe9166601 .long 0xfc15aa1e,0x5a3097be,0xb54b0745,0x40d12548,0x519a5f12,0x5bad4706,0xa439dee6,0xed03f717,0x4a02c499,0x0794bb6c,0xcffe71d2,0xf725083d,0x0f3adcaf,0x2cad7519,0x43729310,0x7f68ea1c .long 
0xb7ffd977,0xe747c8c7,0x80761a22,0xec104c35,0x5a3ffb83,0x8395ebaf,0xe4b63db7,0xfb3261f4,0xd883e544,0x53544960,0x8cc2eeb8,0x13520d70,0xd3d65f99,0x08f6337b,0x781cf95b,0x83997db2 .long 0x0dbd2c01,0xce6ff106,0x1f9ce934,0x4f8eea6b,0x0e993921,0x546f7c4b,0x5e753fc7,0x6236a324,0xa16022e9,0x65a41f84,0x43d1dbb2,0x0c18d878,0x2d4cef9c,0x73c55640,0x70444c74,0xa0428108 .long 0x9afdfb3c,0x68e4f15e,0x5bdfb6df,0x49a56143,0x5f823d97,0xa9bc1bd4,0xea111c2a,0xbceb5970,0xb269bbc4,0x366b455f,0xe9bc5d62,0x7cd85e1e,0x4f18b086,0xc743c41c,0x95294fb9,0xa4b40990 .long 0x26ee8382,0x9c7c581d,0x359d638e,0xcf17dcc5,0xb728ae3d,0xee8273ab,0xf821f047,0x1d112926,0x50491a74,0x11498477,0xfde0dfb9,0x687fa761,0x7ea435ab,0x2c258022,0x91ce7e3f,0x6b8bdb94 .long 0x3bf834aa,0x4c5b5dc9,0x4f6c7e4b,0x04371819,0x3736bcad,0xc284e00a,0x21ae8f8d,0x0d881118,0xf48c8e33,0xf9cf0f82,0xa1bf40db,0xa11fd075,0xdc2733e5,0xdceab0de,0x8e986bd7,0xc560a8b5 .long 0x3929d097,0x48dd1fe2,0x92f188f1,0x3885b290,0xda6fcdac,0x0f2ae613,0xb662a46c,0x9054303e,0x0738042a,0xb6871e44,0xbdaf6449,0x98e6a977,0xd1c9df1b,0xd8bc0650,0x36e098f9,0xef3d6451 .long 0xb6d72d28,0x03fbae82,0xf5d84080,0x77ca9db1,0xa58efc1c,0x8a112cff,0xc564cb4a,0x518d761c,0xf0d1b5ce,0x69b5740e,0xe9eb1785,0x717039cc,0x22f53382,0x3fe29f90,0x6bc7c95c,0x8e54ba56 .long 0xf7f91d0f,0x9c806d8a,0xa82a5728,0x3b61b0f1,0x94d76754,0x4640032d,0x47d834c6,0x273eb5de,0x7b4e4d53,0x2988abf7,0xde401777,0xb7ce66bf,0x715071b3,0x9fba6b32,0xad3a1a98,0x82413c24 .long 0xe0e8ad93,0x5b7fc8c4,0x5fab868d,0xb5679aee,0x2b3946f3,0xb1f9d2fa,0x5685b50a,0x458897dc,0x89d0caf3,0x1e98c930,0x78642e92,0x39564c5f,0x0dbdaf18,0x1b77729a,0x579e82e6,0xf9170722 .long 0xe4515fa5,0x680c0317,0xfb0c790f,0xf85cff84,0x6d2e0765,0xc7a82aab,0x35c82b32,0x7446bca9,0x6d63184f,0x5de607aa,0x262803a6,0x7c1a46a8,0xaebe8035,0xd218313d,0xc73c51f8,0x92113ffd .long 0x12e7e46c,0x4b38e083,0x56126bd5,0x69d0a37a,0x73c07e04,0xfb3f324b,0x8fda7267,0xa0c22f67,0x4d2c7d8f,0x8f2c0051,0xcbe2cae5,0xbc45ced3,0xa8f0f277,0xe1c6cf07,0x1eb99a98,0xbc392312 .long 0x3cc8ac85,0x75537b7e,0xdd02753b,0x8d725f57,0xb737df2f,0xfd05ff64,0xf6d2531d,0x55fe8712,0x6ab6b01c,0x57ce04a9,0x7cd93724,0x69a02a89,0xcf86699b,0x4f82ac35,0x9cb4b232,0x8242d3ad .long 0xd62105e5,0x713d0f65,0x2d29be61,0xbb222bfa,0x6cfbef09,0xf2f9a79e,0xd5d6782f,0xfc24d8d3,0xd4129967,0x5db77085,0xdc3c2a43,0xdb81c3cc,0x05d8d9a3,0x9d655fc0,0x54298026,0x3f5d057a .long 0x88c54694,0x1157f56d,0x9b09573e,0xb26baba5,0x22adffd1,0x2cab03b0,0xdd69f383,0x60a412c8,0x54b25039,0xed76e98b,0x687e714d,0xd4ee67d3,0x7b00b594,0x87739648,0xc9ef709b,0xce419775 .long 0x1c203a40,0x40f76f85,0xeafd8f91,0x30d352d6,0x95578dd2,0xaf196d3d,0x77cc3f3d,0xea4bb3d7,0xb98e782b,0x42a5bd03,0x0624920d,0xac958c40,0xfc56fcc8,0xb838134c,0x89572e5e,0x86ec4ccf .long 0x9be47be0,0x69c43526,0xcb28fea1,0x323b7dd8,0x3a6c67e5,0xfa5538ba,0x1d378e46,0xef921d70,0x3c4b880e,0xf92961fc,0x98940a67,0x3f6f914e,0xfef0ff39,0xa990eb0a,0xf0eeff9c,0xa6c2920f .long 0x51b8d9a3,0xca804166,0x0ffb0db1,0x42531bc9,0xaa82e7ce,0x72ce4718,0xdf574741,0x6e199913,0xd5d36946,0xd5f1b13d,0xf68f0194,0x8255dc65,0x8710d230,0xdc9df4cd,0x138c1988,0x3453c20f .long 0x89a6ef01,0x9af98dc0,0x9857df85,0x4dbcc3f0,0x5c1ad924,0x34805601,0xd0493046,0x40448da5,0x4ee343e2,0xf629926d,0x90e8a301,0x6343f1bd,0x40815b3f,0xefc93491,0xde8f66fb,0xf882a423 .long 0xe7db9f57,0x3a12d5f4,0x3c384c27,0x7dfba38a,0x6fc660b1,0x7a904bfd,0x2773b21c,0xeb6c5db3,0x1cdfe049,0xc350ee66,0x44540f29,0x9baac0ce,0xa5ec6aad,0xbc57b6ab,0x0a7c1baa,0x167ce8c3 .long 
0x53fb2b56,0xb23a03a5,0x4e057f78,0x6ce141e7,0x89e490d9,0x796525c3,0xa31a7e75,0x0bc95725,0x1220fd06,0x1ec56791,0x408b0bd6,0x716e3a3c,0xe8ebeba9,0x31cd6bf7,0xbee6b670,0xa7326ca6 .long 0xcd090c43,0x3d9f851c,0xf12c3988,0x561e8f13,0x904b7be4,0x50490b6a,0x0410737b,0x61690ce1,0x0f009052,0x299e9a37,0xf026092e,0x258758f0,0xfdfcdc0f,0x9fa255f3,0xc0e1bcd2,0xdbc9fb1f .long 0x24651840,0x35f9dd6e,0xa5c59abc,0xdca45a84,0xecca4938,0x103d396f,0xb97b3f29,0x4532da0a,0x1999a6bf,0xc4135ea5,0x5e6bf2ee,0x3aa9505a,0x3f5be093,0xf77cef06,0xa943152e,0x97d1a0f8 .long 0x2e1c21dd,0x2cb0ebba,0x2c6797c4,0xf41b29fc,0xb300101f,0xc6e17321,0xd0d79a89,0x4422b0e9,0x92f1bfc4,0x49e4901c,0xe1e10ed9,0x06ab1f8f,0xdb2926b8,0x84d35577,0x356e8ec2,0xca349d39 .long 0x343bf1a9,0x70b63d32,0x37d1a6b1,0x8fd3bd28,0x316865b4,0x0454879c,0xc458efa2,0xee959ff6,0x9706dc3f,0x0461dcf8,0x164e4b2e,0x737db0e2,0x2f8843c8,0x09262680,0x7745e6f6,0x54498bbc .long 0xa29e24af,0x359473fa,0x70aa87a1,0xfcc3c454,0x00573ace,0xfd2c4bf5,0x28dd1965,0xb65b514e,0x2193e393,0xe46ae7cf,0xf5444d97,0x60e9a4e1,0x00ff38ed,0xe7594e96,0x0a0e0f02,0x43d84d2f .long 0xee398a21,0x8b6db141,0xe3bcc5be,0xb88a56ae,0x373460ea,0x0a1aa52f,0x160bb19b,0x20da1a56,0x65bf0384,0xfb54999d,0x5d5a180e,0x71a14d24,0x21737b04,0xbc44db7b,0x01dd8e92,0xd84fcb18 .long 0xfa44b479,0x80de937b,0x5c98fd4f,0x53505499,0x28f08727,0x1edb12ab,0xa5f3ef53,0x4c58b582,0x8327f246,0xbfb236d8,0x4d7df320,0xc3a3bfaa,0xb96024f2,0xecd96c59,0x7f4e0433,0xfc293a53 .long 0x5acf6e10,0x5341352b,0xafe652c3,0xc50343fd,0x18577a7f,0x4af3792d,0xaf16823d,0xe1a4c617,0x33425d0a,0x9b26d0cd,0x9b7bc47f,0x306399ed,0x706bb20b,0x2a792f33,0x98111055,0x31219614 .long 0x87f5d28b,0x864ec064,0x962277fd,0x11392d91,0xbb6aed5f,0xb5aa7942,0x47e799d9,0x080094dc,0x208ba19b,0x4afa588c,0x8512f284,0xd3e7570f,0x02f5799a,0xcbae64e6,0x514b9492,0xdeebe7ef .long 0xe5c298ff,0x30300f98,0x3678361f,0x17f561be,0x98cb9a16,0xf52ff312,0x5562d490,0x6233c3bc,0x92e3a2cb,0x7bfa15a1,0xe6365119,0x961bcfd1,0x2c8c53b1,0x3bdd29bf,0x822844ba,0x739704df .long 0x7e7b754b,0x7dacfb58,0xa806c9b9,0x23360791,0x23504452,0xe7eb88c9,0x852c1783,0x2983e996,0x958d881d,0xdd4ae529,0x262c7b3c,0x026bae03,0x960b52d1,0x3a6f9193,0x92696cfb,0xd0980f90 .long 0xd5f30851,0x4c1f428c,0x2a4f6630,0x94dfed27,0xfc5d48a4,0x4df53772,0x933260ce,0xdd2d5a2f,0xd44cc7a5,0x574115bd,0xbd12533a,0x4ba6b20d,0x243057c9,0x30e93cb8,0x14de320e,0x794c486a .long 0xf21496e4,0xe925d4ce,0xec696331,0xf951d198,0x3e8d812f,0x9810e2de,0x389294ab,0xd0a47259,0x0e3bab66,0x513ba2b5,0xabad306f,0x462caff5,0xaf04c49e,0xe2dc6d59,0xe0b84b0b,0x1aeb8750 .long 0x2f7d0ca2,0xc034f12f,0xe06acf2f,0x6d2e8128,0x21facc2f,0x801f4f83,0xf40ef607,0xa1170c03,0x7805a99c,0xfe0a1d4f,0xcc26aba5,0xbde56a36,0x35531f40,0x5b1629d0,0x9afa6108,0xac212c2b .long 0x15697be5,0x30a06bf3,0x2c63c7c1,0x6f0545dc,0x7ccdadaf,0x5d8cb842,0xac7015bb,0xd52e379b,0xf462c23e,0xc4f56147,0x46bc24b0,0xd44a4298,0xe2856d4f,0xbc73d23a,0x0832bcdf,0x61cedd8c .long 0x99f241d7,0x60953556,0x001a349d,0xee4adbd7,0xaa89e491,0x0b35bf6a,0x136f7546,0x7f0076f4,0x9264da3d,0xd19a18ba,0x62a7a28b,0x6eb2d2cd,0x8761c971,0xcdba941f,0xa3be4a5d,0x1550518b .long 0x57d0b70c,0xd0e8e2f0,0xcd133ba3,0xeea8612e,0x44416aec,0x814670f0,0x30775061,0x424db6c3,0x16213fd1,0xd96039d1,0x18a3478f,0xc61e7fa5,0xcb0c5021,0xa805bdcc,0x0cc616dd,0xbdd6f3a8 .long 0x5d97f7e2,0x06009667,0xaf0bf4b6,0x31db0fc1,0x5491627a,0x23680ed4,0x7d741fb1,0xb99a3c66,0x36b1ff92,0xe9bb5f55,0x512b388d,0x29738577,0x50fcf263,0xdb8a2ce7,0x6c4f7b47,0x385346d4 .long 
0x31631f9e,0xbe86c5ef,0x03a57a29,0xbf91da21,0x7b23f821,0xc3b1f796,0x770db354,0x0f7d00d2,0xd8fe79da,0x8ffc6c3b,0xd525c996,0xcc5e8c40,0xcfff632a,0x4640991d,0x67112528,0x64d97e8c .long 0x02f1cd1e,0xc232d973,0x1dd212a4,0xce87eacb,0xe69802f7,0x6e4c8c73,0x1fffddbd,0x12ef0290,0x1bcea6e2,0x941ec74e,0x3cb92cbb,0xd0b54024,0x7e8f9d05,0x809fb9d4,0xf2992aae,0x3bf16159 .long 0xf8a7a838,0xad40f279,0x05615660,0x11aea631,0xa01f6fa1,0xbf52e6f1,0x3dc2aec9,0xef046995,0xd8080711,0x785dbec9,0x9fdedf76,0xe1aec60a,0xfa21c126,0xece797b5,0x05e52732,0xc66e898f .long 0x08811fdb,0x39bb69c4,0x2fc7f082,0x8bfe1ef8,0x174f4138,0xc8e7a393,0xd58d1f98,0xfba8ad1d,0xbfd2fd5b,0xbc21d0ce,0x6ee60d61,0x0b839a82,0xafd22253,0xaacf7658,0xaae396b3,0xb526bed8 .long 0x38564464,0xccc1bbc2,0x8c45bc73,0x9e3ff947,0x58188a78,0xcde9bca3,0xd73bf8f7,0x138b8ee0,0x4123c489,0x5c7e234c,0xfa643297,0x66e69368,0x39a15fa3,0x0629eeee,0xa9e2a927,0x95fab881 .long 0xeafbb1e1,0xb2497007,0xe75b7a93,0xd75c9ce6,0xefb68d78,0x3558352d,0x223f6396,0xa2f26699,0xe469b17a,0xeb911ecf,0xe72d3ec2,0x62545779,0x82cb113f,0x8ea47de7,0x4e1fa98d,0xebe4b086 .long 0x8cdfedb1,0xec2d5ed7,0xfe211a74,0xa535c077,0x11d244c5,0x9678109b,0xbe299a76,0xf17c8bfb,0xfb11fbc4,0xb651412e,0x94ab3f65,0xea0b5482,0x0cf78243,0xd8dffd95,0xce0361d4,0x2e719e57 .long 0x304ddc5b,0x9007f085,0x4daba2ea,0x095e8c6d,0x3f9d28a9,0x5a33cdb4,0xe2283003,0x85b95cd8,0xb9744733,0xbcd6c819,0xfc7f5783,0x29c5f538,0xd59038e4,0x6c49b2fa,0x3bbe1018,0x68349cc1 .long 0x21830ee5,0xcc490c1d,0xe9bfa297,0x36f9c4ee,0x48de1a94,0x58fd7294,0x4e8f2cdc,0xaadb13a8,0x81313dba,0x515eaaa0,0xc2152dd8,0xc76bb468,0xa653dbf8,0x357f8d75,0xb14ac143,0xe4d8c4d1 .long 0xb055cb40,0xbdb8e675,0x977b5167,0x898f8e7b,0xb82fb863,0xecc65651,0x6d88f01f,0x56544814,0x263a75a9,0xb0928e95,0x1a22fcda,0xcfb6836f,0x3f3bd37c,0x651d14db,0xb6ad4664,0x1d3837fb .long 0xff4f94ab,0x7c5fb538,0x6d7fb8f2,0x7243c712,0xa85c5287,0xef13d60c,0x4bb8dd1b,0x18cfb7c7,0x72908219,0x82f9bfe6,0x9d5144ab,0x35c4592b,0x9cf4b42f,0x52734f37,0x8c60ddc4,0x6bac55e7 .long 0x94dea0f6,0xb5cd811e,0xe18cc1a3,0x259ecae4,0x15e660f8,0x6a0e836e,0x0e02bff2,0x6c639ea6,0x7e1026fd,0x8721b8cb,0x63261942,0x9e73b50b,0x77f01da3,0xb8c70974,0x8268f57f,0x1839e6a6 .long 0x5150b805,0x571b9415,0xf92c7097,0x1892389e,0x4a084b95,0x8d69c18e,0xbe5b495c,0x7014c512,0x1b07523c,0x4780db36,0x2c1c64fa,0x2f6219ce,0x602c105a,0xc38b81b0,0x5dc8e360,0xab4f4f20 .long 0xcf7d62d2,0x20d3c982,0x23ba8150,0x1f36e29d,0x92763f9e,0x48ae0bf0,0x1d3a7007,0x7a527e6b,0x581a85e3,0xb4a89097,0xdc158be5,0x1f1a520f,0x167d726e,0xf98db37d,0x1113e862,0x8802786e .long 0x36f09ab0,0xefb2149e,0x4a10bb5b,0x03f163ca,0x06e20998,0xd0297045,0x1b5a3bab,0x56f0af00,0x70880e0d,0x7af4cfec,0xbe3d913f,0x7332a66f,0x7eceb4bd,0x32e6c84a,0x9c228f55,0xedc4a79a .long 0xc55c4496,0xc37c7dd0,0x25bbabd2,0xa6a96357,0xadd7f363,0x5b7e63f2,0x2e73f1df,0x9dce3782,0xb2b91f71,0xe1e5a16a,0x5ba0163c,0xe4489823,0xf6e515ad,0xf2759c32,0x8615eecf,0xa5e2f1f8 .long 0xabded551,0x74519be7,0xc8b74410,0x03d358b8,0x0e10d9a9,0x4d00b10b,0x28da52b7,0x6392b0b1,0x0b75c904,0x6744a298,0xa8f7f96c,0xc305b0ae,0x182cf932,0x042e421d,0x9e4636ca,0xf6fc5d50 .long 0xd64cc78c,0x795847c9,0x9b6cb27b,0x6c50621b,0xdf8022ab,0x07099bf8,0xc04eda1d,0x48f862eb,0xe1603c16,0xd12732ed,0x5c9a9450,0x19a80e0f,0xb429b4fc,0xe2257f54,0x45460515,0x66d3b2c6 .long 0x822e37be,0x6ca4f87e,0x253bda4e,0x73f237b4,0x41190aeb,0xf747f3a2,0x804cf284,0xf06fa36f,0xfc621c12,0x0a6bbb6e,0x40b80ec6,0x5d624b64,0x7ba556f3,0x4b072425,0x3e2d20a8,0x7fa0c354 .long 
0xe3229d41,0xe921fa31,0x94531bd4,0xa929c652,0xa6d38209,0x84156027,0x6bdb97bd,0xf3d69f73,0x16833631,0x8906d19a,0x03d51be3,0x68a34c2e,0x0e511cd8,0xcb59583b,0xfdc132a8,0x99ce6bfd .long 0xffcdb463,0x3facdaaa,0x34a38b08,0x658bbc1a,0xf1a9078d,0x12a801f8,0x6ab855de,0x1567bcf9,0x3572359b,0xe08498e0,0x8659e68b,0xcf0353e5,0x7d23807c,0xbb86e9c8,0x2198e8a2,0xbc08728d .long 0x453cadd6,0x8de2b7bc,0xbc0bc1f8,0x203900a7,0xa6abd3af,0xbcd86e47,0x8502effb,0x911cac12,0xec965469,0x2d550242,0x29e0017e,0x0e9f7692,0x65979885,0x633f078f,0x4cf751ef,0xfb87d449 .long 0xfc25419a,0xe1790e4b,0x4bff3cfd,0x36467203,0x25b6e83f,0xc8db6386,0x6cad6fd2,0x6cc69f23,0x6bc68bb9,0x0219e45a,0x297f7334,0xe43d79b6,0x465dc97c,0x7d445368,0x2a0b949a,0x4b9eea32 .long 0x6102d021,0x1b96c6ba,0x2f4461ea,0xeaafac78,0xc49f19a8,0xd4b85c41,0xcf538875,0x275c28e4,0xdd2e54e0,0x35451a9d,0x0605618b,0x6991adb5,0x7b36cd24,0x5b8b4bcd,0x56f37216,0x372a4f8c .long 0xa6a5da60,0xc890bd73,0xdc4c9ff0,0x6f083da0,0xf0536e57,0xf4e14d94,0xaaec8243,0xf9ee1eda,0x8bdcf8e7,0x571241ec,0x0b041e26,0xa5db8271,0xe3fff040,0x9a0b9a99,0x7c271202,0xcaaf21dd .long 0x4f0dd2e8,0xb4e2b2e1,0x0a377ac7,0xe77e7c4f,0x0d7a2198,0x69202c3f,0x28200eb8,0xf759b7ff,0xdcfe314e,0xc87526ed,0x53d5cf99,0xeb84c524,0x515138b6,0xb1b52ace,0x23fca3f4,0x5aa7ff8c .long 0xb9791a26,0xff0b13c3,0xcdd58b16,0x960022da,0x57aad2de,0xdbd55c92,0xf30fe619,0x3baaaaa3,0x0d881efd,0x9a4b2346,0x46325e2a,0x506416c0,0x035c18d4,0x91381e76,0xf27817b0,0xb3bb68be .long 0x5116f937,0x15bfb8bf,0xc1268943,0x7c64a586,0x8419a2c8,0x71e25cc3,0x8335f463,0x9fd6b0c4,0xe8ee0e0e,0x4bf0ba3c,0x298c21fa,0x6f6fba60,0xae66bee0,0x57d57b39,0x22672544,0x292d5130 .long 0xbab093b3,0xf451105d,0x02839986,0x012f59b9,0x3474a89c,0x8a915802,0x2de03e97,0x048c919c,0x91071cd5,0xc476a2b5,0x034970a5,0x791ed89a,0xe1b7994b,0x89bd9042,0xa1057ffd,0x8eaf5179 .long 0xd551ee10,0x6066e2a2,0x727e09a6,0x87a8f1d8,0x2c01148d,0x00d08bab,0x424f33fe,0x6da8e4f1,0xcf9a4e71,0x466d17f0,0x3bf5cb19,0xff502010,0xd062ecc0,0xdccf97d8,0x81d80ac4,0x80c0d9af .long 0x033f2876,0xe87771d8,0x7d5cc3db,0xb0186ec6,0x3bc9bc1d,0x58e8bb80,0x6f6ef60e,0x4d1395cc,0x186244a0,0xa73c62d6,0x110a5b53,0x918e5f23,0x741b7eab,0xed4878ca,0xdbe03e51,0x3038d71a .long 0xa93c3246,0x840204b7,0xa0b9b4cd,0x21ab6069,0xb1d64218,0xf5fa6e2b,0xf3d56191,0x1de6ad0e,0xff1929c7,0x570aaa88,0x640e87b5,0xc6df4c6b,0xc65f0ccc,0xde8a74f2,0xe6f6cc01,0x8b972fd5 .long 0x0b846531,0x3fff36b6,0x10a5e475,0xba7e45e6,0x4145b6c5,0x84a1d10e,0x5e046d9d,0xf1f7f91a,0x44de90d7,0x0317a692,0xf199c15e,0x951a1d4a,0xc9d73deb,0x91f78046,0xfab8224f,0x74c82828 .long 0xe7560b90,0xaa6778fc,0xa7e824ce,0xb4073e61,0xd642eba8,0xff0d693c,0x5dccef38,0x7ce2e57a,0x1df1ad46,0x89c2c789,0x098346fd,0x83a06922,0xda2fc177,0x2d715d72,0x85b6cf1d,0x7b6dd71d .long 0x73fa9cb0,0xc60a6d0a,0x328bf5a9,0xedd3992e,0x832c8c82,0xc380ddd0,0xa2a0bf50,0xd182d410,0xd9a528db,0x7d9d7438,0xcaf53994,0xe8b1a0e9,0x0e19987c,0xddd6e5fe,0x190b059d,0xacb8df03 .long 0x8300129f,0x53703a32,0x68c43bfd,0x1f637662,0x00e54051,0xbcbd1913,0x7bf5a8c5,0x812fcc62,0x29fb85da,0x3f969d5f,0x694759e8,0x72f4e00a,0x790726b7,0x426b6e52,0x3bdbb209,0x617bbc87 .long 0x97aee317,0x511f8bb9,0xe81536a8,0x812a4096,0x3ac09b9b,0x137dfe59,0xba8c9a7a,0x0682238f,0xaeccb4bd,0x7072ead6,0x692ba633,0x6a34e9aa,0x6fff9d33,0xc82eaec2,0x1d4d2b62,0xfb753512 .long 0x1d7aadab,0x1a0445ff,0xd5f6a67c,0x65d38260,0x91cfb26f,0x6e62fb08,0x5c7d91d6,0xef1e0fa5,0x33db72cd,0x47e7c7ba,0xfa7c74b2,0x017cbc09,0xf50a503c,0x3c931590,0x616baa42,0xcac54f60 .long 
0xb2369f0f,0x9b6cd380,0x23c76151,0x97d3a70d,0x9862a9c6,0x5f9dd6fc,0x12312f51,0x044c4ab2,0x834a2ddc,0x035ea0fd,0xcc7b826d,0x49e6b862,0x62fce490,0xb03d6883,0xb37e36e9,0x62f2497a .long 0xc6458293,0x04b005b6,0xe8d10af7,0x36bb5276,0x8ee617b8,0xacf2dc13,0xb004b3d4,0x470d2d35,0xfeeb1b77,0x06790832,0x85657f9c,0x2bb75c39,0xc0f60004,0xd70bd4ed,0x219b018b,0xfe797ecc .long 0x753aebcc,0x9b5bec2a,0xc939eca5,0xdaf9f3dc,0xd095ad09,0xd6bc6833,0xdaa4d2fc,0x98abdd51,0x8d168be5,0xd9840a31,0x2325a23c,0xcf7c10e0,0x7e6ecfaf,0xa5c02aa0,0xb5bfdf18,0x2462e7e6 .long 0xa0cc3f12,0xab2d8a8b,0xbc672a29,0x68dd485d,0x596f2cd3,0x72039752,0xa0cf3d8d,0x5d3eea67,0xe6602671,0x810a1a81,0x14026c0c,0x8f144a40,0x76b50f85,0xbc753a6d,0x645cd4a4,0xc4dc21e8 .long 0x521d0378,0xc5262dea,0x05011c6f,0x802b8e0e,0x0b4c19ea,0x1ba19cbb,0xebf0aaec,0x21db64b5,0x70342f9d,0x1f394ee9,0x1bc44a14,0x93a10aee,0x3efd0baa,0xa7eed31b,0x1d154e65,0x6e7c824e .long 0x9966e7ee,0xee23fa81,0x05b7920d,0x64ec4aa8,0x2d90aad4,0x2d44462d,0xdf277ad5,0xf44dd195,0xbb46b6a1,0x8d6471f1,0xfd885090,0x1e65d313,0x13a977b4,0x33a800f5,0x0797e1ef,0xaca9d721 .long 0xfcff6a17,0x9a5a85a0,0x1eca7cee,0x9970a3f3,0xc9504be3,0xbb9f0d6b,0xadd24ee2,0xe0c504be,0x77fcc2f4,0x7e09d956,0x65bb5fc4,0xef1a5227,0x8b9286aa,0x145d4fb1,0x6649028b,0x66fd0c5d .long 0x1bf4581c,0x98857ceb,0xaca7b166,0xe635e186,0x659722ac,0x278ddd22,0x1db68007,0xa0903c4c,0x48f21402,0x366e4589,0xb96abda2,0x31b49c14,0xe0403190,0x329c4b09,0xd29f43fe,0x97197ca3 .long 0x274983d8,0x8073dd1e,0x55717c8f,0xda1a3bde,0x0361f9d1,0xfd3d4da2,0x4c7de1ce,0x1332d081,0xaa6d0e10,0x9b7ef7a3,0xf54f1c4a,0x17db2e73,0x4cd35567,0xaf3dffae,0xe56f4e71,0xaaa2f406 .long 0x7ace3fc7,0x8966759e,0x45a8d8c6,0x9594eacf,0x91834e0e,0x8de3bd8b,0x548c0421,0xafe4ca53,0xe6ee81c6,0xfdd7e856,0x6b891a3a,0x8f671beb,0xfae63829,0xf7a58f2b,0x9c11ac9f,0x9ab186fb .long 0x10b5be76,0x8d6eb369,0xfb040bcd,0x046b7739,0xcb73de88,0xccb4529f,0xcf26be03,0x1df0fefc,0xbcfcd027,0xad7757a6,0xbb3165ca,0xa8786c75,0x7e99a4d9,0xe9db1e34,0xb06c504b,0x99ee86df .long 0xc15c9f0a,0x5b7c2ddd,0x4295989e,0xdf87a734,0x03d08fda,0x59ece47c,0xad5fc702,0xb074d3dd,0x51a03776,0x20407903,0x2a608007,0x2bb1f77b,0xe1153185,0x25c58f4f,0x766e6447,0xe6df62f6 .long 0xed51275a,0xefb3d1be,0x2f0f483f,0x5de47dc7,0x97c2bedf,0x7932d98e,0x0219f8a1,0xd5c11927,0xa73a294e,0x9d751200,0x9dc20172,0x5f88434a,0xa26f506a,0xd28d9fd3,0x9d1dcd48,0xa890cd31 .long 0x70f4d3b4,0x0aebaec1,0x0ffc8d00,0xfd1a1369,0x57d57838,0xb9d9c240,0x68bac361,0x45929d26,0x25b15ca6,0x5a2cd060,0x6e474446,0x4b3c83e1,0xee1e5134,0x1aac7578,0xc91e2f41,0xa418f5d6 .long 0x213ed68b,0x6936fc8a,0x510a5224,0x860ae7ed,0xdef09b53,0x63660335,0xcd79c98d,0x641b2897,0x01110f35,0x29bd38e1,0x648b1937,0x79c26f42,0x9d9164f4,0x64dae519,0x0265c273,0xd85a2310 .long 0x4b07e2b1,0x7173dd5d,0x8d9ea221,0xd144c4cb,0x1105ab14,0xe8b04ea4,0xfe80d8f1,0x92dda542,0xcf03dce6,0xe9982fa8,0x1a22cffc,0x8b5ea965,0x3fad88c4,0xf7f4ea7f,0x6a5ba95c,0x62db773e .long 0x93f24567,0xd20f02fb,0x315257ca,0xfd46c69a,0x8bcab987,0x0ac74cc7,0x5ceca2f5,0x46f31c01,0x888b219e,0x40aedb59,0xe1fccd02,0xe50ecc37,0x911f816c,0x1bcd9dad,0x8db9b00c,0x583cc1ec .long 0xa483bf11,0xf3cd2e66,0xb1b2c169,0xfa08a6f5,0x4be9fa28,0xf375e245,0x5b6d011f,0x99a7ffec,0xc4ae62da,0x6a3ebddb,0x374aef5d,0x6cea00ae,0x9d4d05bc,0xab5fb98d,0xd560f252,0x7cba1423 .long 0x208490de,0x49b2cc21,0xbcfb2879,0x1ca66ec3,0x1b6fb16f,0x7f1166b7,0x65fe5db3,0xfff63e08,0x8b2610be,0xb8345abe,0x39de3df4,0xb732ed80,0x211c32b4,0x0e24ed50,0x848ff27d,0xd10d8a69 .long 
0xed4de248,0xc1074398,0x10488927,0xd7cedace,0x85673e13,0xa4aa6bf8,0x6daf30af,0xb46bae91,0xfcef7ad8,0x07088472,0xd4b35e97,0x61151608,0xdde29986,0xbcfe8f26,0xd5a34c79,0xeb84c4c7 .long 0x164e1214,0xc1eec55c,0xa147bb03,0x891be86d,0x0ba96835,0x9fab4d10,0xa5c1ae9f,0xbf01e9b8,0xb186ebc0,0x6b4de139,0x85b91bca,0xd5c74c26,0xc2d93854,0x5086a99c,0xa7a9dfbc,0xeed62a7b .long 0x76b7618a,0x8778ed6f,0x03b66062,0xbff750a5,0xb65186db,0x4cb7be22,0xcc3a6d13,0x369dfbf0,0x7191a321,0xc7dab26c,0x40ed718e,0x9edac3f9,0xd0cfd183,0xbc142b36,0x7c991693,0xc8af82f6 .long 0x97ce0b2a,0xb3d1e4d8,0xc3a55cdf,0xe6d7c87f,0x68b81afe,0x35846b95,0xd3c239d8,0x018d12af,0x01206e15,0x2b2c6208,0xa3b882c6,0xe0e42453,0xa50162d5,0x854470a3,0x7017a62a,0x08157478 .long 0x820357c7,0x18bd3fb4,0x6f1458ad,0x992039ae,0x25b44aa1,0x9a1df3c5,0xed3d5281,0x2d780357,0xc77ad4d4,0x58cf7e4d,0xf9df4fc4,0xd49a7998,0x1d71205e,0x4465a8b5,0x649254aa,0xa0ee0ea6 .long 0xab7bd771,0x4b5eeecf,0x35c262b9,0x6c873073,0x3c9d61e7,0xdc5bd648,0x321460d2,0x233d6d54,0xfc195bcc,0xd20c5626,0x04d78b63,0x25445958,0x17ec8ef3,0xe03fcb3d,0x46b8f781,0x54b690d1 .long 0x21230646,0x82fa2c8a,0x084f418c,0xf51aabb9,0x1a30ba43,0xff4fbec1,0x743c9df7,0x6a5acf73,0xd635b4d5,0x1da2b357,0xecd5c1da,0xc3de68dd,0xd61af0dd,0xa689080b,0xd665bf99,0xdea5938a .long 0xfe637294,0x0231d71a,0xa5a81cd8,0x01968aa6,0x048e63b5,0x11252d50,0x6ca007e9,0xc446bc52,0x96d6134b,0xef8c50a6,0x9e09a05c,0x9361fbf5,0xdca3291a,0xf17f85a6,0xff251a21,0xb178d548 .long 0xa4df3915,0x87f6374b,0x2fd5d608,0x566ce1bf,0x7de35102,0x425cba4d,0x58c5d5e2,0x6b745f8f,0x63122edf,0x88402af6,0x3b989a89,0x3190f9ed,0xebba3156,0x4ad3d387,0xc7c469a5,0xef385ad9 .long 0x3f642c29,0xb08281de,0x910ffb88,0x20be0888,0xd5292546,0xf353dd4a,0x8377a262,0x3f1627de,0xeefcd638,0xa5faa013,0x74cc77c3,0x8f3bf626,0xa348f55e,0x32618f65,0x9fefeb9e,0x5787c0dc .long 0xd9a23e44,0xf1673aa2,0x4e10690d,0x88dfa993,0x2bf91108,0x1ced1b36,0x3af48649,0x9193ceca,0x2d738fc5,0xfb34327d,0x975fee6c,0x6697b037,0xc04079a5,0x2f485da0,0x2feaa1ac,0x2cdf5735 .long 0xbd55659e,0x76944420,0x4376090c,0x7973e32b,0x163b591a,0x86bb4fe1,0xc196f0ca,0x10441aed,0x045ad915,0x3b431f4a,0xa4afacb1,0x6c11b437,0x71fdbbd8,0x30b0c7db,0xeda65acd,0xb642931f .long 0x9c92b235,0x4baae6e8,0x6b3993a1,0xa73bbd0e,0x693dd031,0xd06d60ec,0x7156881c,0x03cab91b,0x1db3574b,0xd615862f,0x64bb061a,0x485b0185,0xa0181e06,0x27434988,0xc1c0c757,0x2cd61ad4 .long 0x2ff9f403,0x3effed5a,0x62239029,0x8dc98d8b,0x1f17b70d,0x2206021e,0xbf510015,0xafbec0ca,0x80130dfa,0x9fed7164,0x8a02dcf5,0x306dc2b5,0xfeb10fc0,0x48f06620,0x5a57cf51,0x78d1e1d5 .long 0x192ef710,0xadef8c5a,0x3b7431f9,0x88afbd4b,0x64250c9e,0x7e1f7407,0xb58bec07,0x6e31318d,0x24f89b4e,0xfd4fc4b8,0x48c36a2a,0x65a5dd88,0xf024baa7,0x4f1eccff,0xcba94650,0x22a21cf2 .long 0x42a554f7,0x95d29dee,0x002ec4ba,0x828983a5,0x8badb73d,0x8112a1f7,0xa27c1839,0x79ea8897,0xd065fd83,0x8969a5a7,0xb262a0bc,0xf49af791,0xaf2b5127,0xfcdea8b6,0x564c2dbc,0x10e913e1 .long 0xbc21ef51,0x51239d14,0x4ce57292,0xe51c3ceb,0x47bbcc3b,0x795ff068,0xbd7e11e6,0x86b46e1e,0x80041ef4,0x0ea6ba23,0x6262342e,0xd72fe505,0x31d294d4,0x8abc6dfd,0x1278c2c9,0xbbe017a2 .long 0xb389328a,0xb1fcfa09,0xd01771b5,0x322fbc62,0x60b045bf,0x04c0d063,0x10e52d01,0xdb652edc,0x03ec6627,0x50ef932c,0xc1ee50e3,0xde1b3b2d,0xdc37a90d,0x5ab7bdc5,0x31e33a96,0xfea67213 .long 0x4f2999aa,0x6482b5cb,0xb8cbf0dd,0x38476cc6,0x173405bb,0x93ebfacb,0xe52369ec,0x15cdafe7,0xd935b7db,0xd42d5ba4,0x1c99a4cd,0x648b6004,0xa3b5545b,0x785101bd,0x9dd67faf,0x4bf2c38a .long 
0x4442449c,0xb1aadc63,0x33ad4fb8,0xe0e9921a,0xaa686d82,0x5c552313,0x465d866c,0xdee635fa,0x18ee6e8a,0xbc3c224a,0xed42e02f,0xeed748a6,0xd474cd08,0xe70f930a,0xfff24adf,0x774ea6ec .long 0xf3480d4a,0x03e2de1c,0xbc8acf1a,0xf0d8edc7,0x68295a9c,0xf23e3303,0xc546a97d,0xfadd5f68,0x96f8acb1,0x895597ad,0x671bdae2,0xbddd49d5,0x21dd43f4,0x16fcd528,0x6619141a,0xa5a45412 .long 0xc360e25a,0x8ce9b6bf,0x075a1a78,0xe6425195,0x481732f4,0x9dc756a8,0x5432b57a,0x83c0440f,0xd720281f,0xc670b3f1,0xd135e051,0x2205910e,0xdb052be7,0xded14b0e,0xc568ea39,0x697b3d27 .long 0xfb3ff9ed,0x2e599b9a,0x17f6515c,0x28c2e0ab,0x474da449,0x1cbee4fd,0x4f364452,0x071279a4,0x01fbe855,0x97abff66,0x5fda51c4,0x3ee394e8,0x67597c0b,0x190385f6,0xa27ee34b,0x6e9fccc6 .long 0x14092ebb,0x0b89de93,0x428e240c,0xf17256bd,0x93d2f064,0xcf89a7f3,0xe1ed3b14,0x4f57841e,0xe708d855,0x4ee14405,0x03f1c3d0,0x856aae72,0xbdd7eed5,0xc8e5424f,0x73ab4270,0x3333e4ef .long 0xdda492f8,0x3bc77ade,0x78297205,0xc11a3aea,0x34931b4c,0x5e89a3e7,0x9f5694bb,0x17512e2e,0x177bf8b6,0x5dc349f3,0x08c7ff3e,0x232ea4ba,0xf511145d,0x9c4f9d16,0x33b379c3,0xccf109a3 .long 0xa1f25897,0xe75e7a88,0xa1b5d4d8,0x7ac6961f,0x08f3ed5c,0xe3e10773,0x0a892dfb,0x208a54ec,0x78660710,0xbe826e19,0x237df2c8,0x0cf70a97,0xed704da5,0x418a7340,0x08ca33fd,0xa3eeb9a9 .long 0x169bca96,0x49d96233,0x2da6aafb,0x04d286d4,0xa0c2fa94,0xc09606ec,0x23ff0fb3,0x8869d0d5,0xd0150d65,0xa99937e5,0x240c14c9,0xa92e2503,0x108e2d49,0x656bf945,0xa2f59e2b,0x152a733a .long 0x8434a920,0xb4323d58,0x622103c5,0xc0af8e93,0x938dbf9a,0x667518ef,0x83a9cdf2,0xa1843073,0x5447ab80,0x350a94aa,0xc75a3d61,0xe5e5a325,0x68411a9e,0x74ba507f,0x594f70c5,0x10581fc1 .long 0x80eb24a9,0x60e28570,0x488e0cfd,0x7bedfb4d,0xc259cdb8,0x721ebbd7,0xbc6390a9,0x0b0da855,0xde314c70,0x2b4d04db,0x6c32e846,0xcdbf1fbc,0xb162fc9e,0x33833eab,0xb0dd3ab7,0x9939b48b .long 0xcb0c9c8c,0x5aaa98a7,0x81c4375c,0x75105f30,0x5ef1c90f,0xceee5057,0xc23a17bf,0xb31e065f,0xd4b6d45a,0x5364d275,0x62ec8996,0xd363f3ad,0x4391c65b,0xb5d21239,0xebb41b47,0x84564765 .long 0x37107c78,0x20d18ecc,0x570c2a66,0xacff3b6b,0x9bd0d845,0x22f975d9,0xba178fa0,0xef0a0c46,0x76b6028e,0x1a419651,0x248612d4,0xc49ec674,0x7338af55,0x5b6ac4f2,0x7bee5a36,0x06145e62 .long 0xe75746b5,0x33e95d07,0xc40c78be,0x1c1e1f6d,0x222ff8e2,0x967833ef,0xb49180ad,0x4bedcf6a,0x3d7a4c8a,0x6b37e9c1,0x6ddfe760,0x2748887c,0xaa3a5bbc,0xf7055123,0x7bbb8e74,0x954ff225 .long 0x97c3dfb9,0xc42b8ab1,0xcf168154,0x55a549b0,0xc1b50692,0xad6748e7,0x6fc5cbcb,0x2775780f,0xe1c9d7c8,0x4eab80b8,0x3fdbcd56,0x8c69dae1,0x9969eace,0x47e6b4fb,0xa705cb5a,0x002f1085 .long 0x6d3fea55,0x4e23ca44,0xf4810568,0xb4ae9c86,0x2a62f27d,0x47bfb91b,0xd9bac28c,0x60deb4c9,0x7de6c34c,0xa892d894,0x4494587d,0x4ee68259,0x1a3f8a5b,0x914ee14e,0x28700385,0xbb113eaa .long 0x2115b4c9,0x81ca03b9,0x8908cad1,0x7c163d38,0xaa18179a,0xc912a118,0x886e3081,0xe09ed750,0x26f516ca,0xa676e3fa,0x8e732f91,0x753cacf7,0x833da8b4,0x51592aea,0x4cbea8aa,0xc626f42f .long 0xa7b56eaf,0xef9dc899,0x34ef7316,0x00c0e52c,0xfe818a86,0x5b1e4e24,0xc538be47,0x9d31e20d,0x3ed68974,0x22eb932d,0x7c4e87c4,0xe44bbc08,0x0dde9aef,0x4121086e,0x134f4345,0x8e6b9cff .long 0x711b0eb9,0x96892c1f,0x780ab954,0xb905f2c8,0xa20792db,0xace26309,0x0684e126,0xec8ac9b3,0xb40a2447,0x486ad8b6,0x9fe3fb24,0x60121fc1,0x1a8e3b3f,0x5626fccf,0x6ad1f394,0x4e568622 .long 0x196aa5a1,0xda7aae0d,0x1041b5fb,0xe0df8c77,0x26b318b7,0x451465d9,0x7ab136e9,0xc29b6e55,0x71148463,0x2c2ab48b,0x64454a76,0xb5738de3,0x5a03abe4,0x54ccf9a0,0x0427d58e,0x377c0296 .long 
0x2bb39c1f,0x73f5f0b9,0xe608d8c5,0x14373f2c,0x00fbb805,0xdcbfd314,0x83afdcfb,0xdf18fb20,0x42b3523f,0x81a57f42,0x87f650fb,0xe958532d,0x8b0a7d7c,0xaa8dc8b6,0x150166be,0x1b75dfb7 .long 0x2d7d1413,0x90e4f7c9,0x9834f597,0x67e2d6b5,0xa808c3e8,0x4fd4f4f9,0xd5281ec1,0xaf8237e0,0x84687cee,0x25ab5fdc,0xa5b26c09,0xc5ded6b1,0xc8ea7650,0x8e4a5aec,0x14cc417f,0x23b73e5c .long 0x3037bf52,0x2bfb4318,0x78c725d7,0xb61e6db5,0xbbb3e5d7,0x8efd4060,0xdbac488e,0x2e014701,0x360aa449,0xac75cf9a,0x79634d08,0xb70cfd05,0xfffb15ef,0xa591536d,0xd07c106c,0xb2c37582 .long 0xf50225f9,0xb4293fdc,0xb0e12b03,0xc52e175c,0xd0a8bf64,0xf649c3ba,0xeb8ae3c6,0x745a8fef,0x58321bc3,0x30d7e5a3,0x0bc4df48,0xb1732be7,0xe9ea5058,0x1f217993,0x3e4fd745,0xf7a71cde .long 0x894c5bbb,0x86cc533e,0x69d83082,0x6915c7d9,0x5815c244,0xa6aa2d05,0x49b22ce5,0xaeeee592,0x78135486,0x89e39d13,0x16b76f2f,0x3a275c1f,0xe036e8f5,0xdb6bcc1b,0x5e4709f5,0x4df69b21 .long 0x2d0f39aa,0xa188b250,0x15a85947,0x622118bb,0xfde0f4fa,0x2ebf520f,0x4860e539,0xa40e9f29,0x22b57f0f,0x7b6a51eb,0x7e80644a,0x849a33b9,0x1cf095fe,0x50e5d16f,0xec55f002,0xd754b54e .long 0x236f4a98,0x5cfbbb22,0x066800bb,0x0b0c59e9,0x5a9a7774,0x4ac69a8f,0xd6bec948,0x2b33f804,0x32e6c466,0xb3729295,0x4e599c73,0x68956d0f,0x155c31cc,0xa47a249f,0xe1ce284e,0x24d80f0d .long 0x988baf01,0xcd821dfb,0xdbb16647,0xe6331a7d,0x094cb960,0x1eb8ad33,0xc91bbca5,0x593cca38,0x26567456,0x384aac8d,0xc04b6490,0x40fa0309,0xdab6c8f6,0x97834cd6,0x3f91e55f,0x68a7318d .long 0xfc4d3157,0xa00fd04e,0x2bf3bdea,0xb56f8ab2,0x4fa57172,0x014f5648,0x450abdb3,0x948c5860,0x0ebd4f08,0x342b5df0,0x0e82938e,0x3e5168cd,0xb0df5dd0,0x7aedc1ce,0xe5732516,0x6bbbc6d9 .long 0x605daaa6,0xc7bfd486,0xbb9a6c9e,0x46fd72b7,0xa124fb89,0xe4847fb1,0xa2d8ffbc,0x75959cbd,0xc8a588ee,0x42579f65,0xb80b499d,0x368c92e6,0x999a5df1,0xea4ef6cd,0x936fe604,0xaa73bb7f .long 0x6457d188,0xf347a70d,0x8b7a388b,0x86eda86b,0x0ccd6013,0xb7cdff06,0xd0053fb2,0xbeb1b6c7,0x99240a9f,0x0b022387,0x776189b2,0x1bbb384f,0x9066193a,0x8695e71e,0x06ffac7e,0x2eb50097 .long 0x4a7d2caa,0x0654a9c0,0xa5aaa290,0x6f3fb3d1,0xff476e8f,0x835db041,0xc42295e4,0x540b8b0b,0x05e214f5,0xa5c73ac9,0x56a0b638,0x9a74075a,0xce9e680b,0x2e4b1090,0x6b8d9afa,0x57a5b479 .long 0x26bfe65c,0x0dca48e7,0x7290c307,0x097e391c,0x6669e72e,0x683c462e,0x062559ac,0xf505be1e,0xe3a3035a,0x5fbe3ea1,0x9cd50da8,0x6431ebf6,0x1f6407f2,0xfd169d5c,0x60fce6b8,0x8d838a95 .long 0x650006f0,0x2a2bfa7f,0x50c0fbb2,0xdfd7dad3,0xccf9ad96,0x92452495,0xd95635f9,0x183bf494,0x4a7bd989,0x02d5df43,0xa5431095,0x505385cc,0xfd43f53e,0xdd98e67d,0x500c34a9,0xd61e1a6c .long 0x4a8a3d62,0x5a4b46c6,0x247743d2,0x8469c4d0,0x88f7e433,0x2bb3a13d,0x01be5849,0x62b23a10,0xa63d1a4c,0xe83596b4,0x7d183f3e,0x454e7fea,0x17afb01c,0x643fce61,0x1c4c3638,0x4e65e5e6 .long 0xef74c45b,0x41d85ea1,0xae328506,0x2cfbfa66,0x3ada7da9,0x98b078f5,0xec752fbb,0xd985fe37,0x5a0148b4,0xeece68fe,0x2d78136d,0x6f9a55c7,0xd2b729ce,0x232dccc4,0x90aafbc4,0xa27e0dfd .long 0x12b4603e,0x96474452,0x6b706d14,0xa876c551,0x69a9d412,0xdf145fcf,0x2d479c34,0xe2ab75b7,0x1a23ff97,0x12df9a76,0x5d359d10,0xc6138992,0xfa835f22,0x6e51c7ae,0xc0fcc4d9,0x69a79cb1 .long 0x594cc7e1,0xf57f350d,0x3350ab79,0x3079ca63,0x9aff594a,0x226fb614,0x6d59a62b,0x35afec02,0x06ed2c6e,0x9bee46f4,0x7d939a57,0x58da1735,0x8fd1797e,0x44c50402,0x5ccea6ca,0xd8853e7c .long 0xa35fcd5f,0x4065508d,0x495ccaeb,0x8965df8c,0x12e1a962,0x0f2da850,0xc1cf1cc4,0xee471b94,0x0a08fb75,0xcef19bc8,0x81de3591,0x704958f5,0x3aef4f88,0x2867f8b2,0xea9f9a5f,0x8d749384 .long 
0x8c9049f4,0x1b385537,0x7b92d8b6,0x5be948f3,0xb6e2bd6b,0xd96f725d,0x958c454d,0x37a222bc,0x8809bf61,0xe7c61abb,0x1346f18d,0x46f07fbc,0xe87c0d1c,0xfb567a7a,0x7ef3d07a,0x84a461c8 .long 0xd9278d98,0x0a5adce6,0x9dfc73e1,0x24d94813,0x054321c3,0x4f3528b6,0x692ea706,0x2e03fdde,0x47b533c0,0x10e60619,0x2ca3c055,0x1a8bc73f,0x1bb62b8f,0xae58d4b2,0x584a24e3,0xb2045a73 .long 0xbd76e195,0x3ab3d5af,0x6938a810,0x478dd1ad,0x6ee3d5cb,0x6ffab393,0x22b361e4,0xdfb693db,0x51dbf1a7,0xf9694496,0x08a2e762,0xcab4b4ef,0xd39bba9a,0xe8c92f25,0xf1464d96,0x850e61bc .long 0xdc09508b,0xb7e830e3,0x74317655,0xfaf6d2cf,0xdf690355,0x72606ceb,0xd0c3ded6,0x48bb92b3,0x5c7cf892,0x65b75484,0xd5d5f01f,0xf6cd7ac9,0x96401d69,0xc2c30a59,0xed921878,0x91268650 .long 0xb78c558f,0x380bf913,0xc8afdaa9,0x43c0baeb,0x54f169d3,0x377f61d5,0xae5ff20b,0xf8da07e3,0xa8a90ea8,0xb676c49d,0x83a29b21,0x81c1ff2b,0x2ad8d276,0x383297ac,0xba89f982,0x3001122f .long 0x6718e448,0xe1d794be,0x7c3e6e13,0x246c1482,0x5d26b5ef,0x56646ef8,0x88069cdd,0x80f5091e,0x724bdd38,0xc5992e2f,0x8471e8c7,0x02e915b4,0x0d0ff2a9,0x96ff320a,0x4384d1a0,0xbf886487 .long 0xc93f72d6,0xbbe1e6a6,0xcad800ea,0xd5f75d12,0xe7acf117,0xfa40a09f,0x7581a355,0x32c8cdd5,0x7023c499,0x74221992,0x38ec3901,0xa8afe5d7,0xa90e83f0,0x5691afcb,0x0b8f8eac,0x41bcaa03 .long 0x8d2668d5,0xe38b5ff9,0x7ad81965,0x0715281a,0x03c6ce11,0x1bc8fc7c,0x8b650436,0xcbbee6e2,0x0cdb9808,0x06b00fe8,0xfe3ed315,0x17d6e066,0x4d0b5018,0x2e9d38c6,0x844dcaef,0xab8bfd56 .long 0x513aed8b,0x42894a59,0x314bd07a,0xf77f3b6d,0x8e42b582,0xbbdecb8f,0xd2390fe6,0xf10e2fa8,0x62a2f201,0xefb95022,0x50ee32b0,0x4d59ea50,0x6da789a8,0xd87f7728,0xf79492c4,0xcf98a2cf .long 0x720943c2,0xf9577239,0x3990b9d0,0xba044cf5,0x95f2884a,0x5aa8e823,0x0278a0af,0x834de6ed,0x5f25bd12,0xc8e1ee9a,0x6f7ab271,0x9259ceaa,0x77d00b76,0x7e6d97a2,0xa437832a,0x5c0c6eea .long 0x5606b81d,0x5232c20f,0x0d991ee5,0xabd7b375,0x8632d951,0x4d2bfe35,0x98ed9364,0x78f85146,0xf30c3282,0x951873f0,0xa789230b,0x0da8ac80,0x5398967f,0x3ac7789c,0xbdda0fb5,0xa69b8f7f .long 0x6add8545,0xe5db7717,0x72c49b66,0x1b71cb66,0x68421d77,0xd8560739,0x83e3afea,0x03840fe8,0x1ec69977,0xb391dad5,0x307f6726,0xae243fb9,0xe8ca160c,0xc88ac87b,0x4ce355f4,0x5174cced .long 0xe58ba37d,0x98a35966,0x7817335d,0xfdcc8da2,0x83fbc7bf,0x5b752830,0xd9c96984,0x68e419d4,0x02a40380,0x409a39f4,0x1fe977bc,0x88940faf,0x8f8edea6,0xc640a94b,0xed11547d,0x1e22cd17 .long 0x59ffc3e2,0xe28568ce,0xc1dee4e7,0x60aa1b55,0x837cb363,0xc67497c8,0x105a2bf2,0x06fb438a,0x500d8e20,0x30357ec4,0x0670db10,0x1ad9095d,0xc73b7cfd,0x7f589a05,0x880d6d28,0xf544607d .long 0xa20ef103,0x17ba93b1,0x6ba6577b,0xad859130,0x6fa214a0,0x65c91cf6,0x27990da5,0xd7d49c6c,0x20bb569d,0xecd9ec8d,0xeeffbc33,0xbd4b2502,0x6bed0467,0x2056ca5a,0x5b63728c,0x7916a1f7 .long 0x53a4f566,0xd4f9497d,0x97b56810,0x89734664,0x0494a621,0xf8e1da74,0x8d011c68,0x82546a93,0xc61ac162,0x1f3acb19,0xabad0d3e,0x52f8fa9c,0xb4b7ea43,0x15356523,0xae608125,0x5a16ad61 .long 0x4faed184,0xb0bcb87f,0x5029f45f,0x5f236b1d,0x0bc6b1fc,0xd42c7607,0x68aefce3,0xc644324e,0x5c5d8446,0x8e191d59,0x13ae1979,0xc0208077,0x3ba59cc7,0xadcaee55,0xa2cb81ba,0x20ed6d6b .long 0xb6efcffc,0x0952ba19,0x97c0b87c,0x60f12d68,0x9caa30bc,0x4ee2c7c4,0x97fbff4e,0x767238b7,0x501b5d92,0xebc73921,0xc2a37737,0x3279e3df,0x6d197543,0x9fc12bc8,0x0a40db4e,0xfa94dc6f .long 0x530ccbbd,0x7392b41a,0xea823525,0x87c82146,0x05d98d0c,0xa52f984c,0x5ef6974c,0x2ae57d73,0x3042a6dd,0x9377f7bf,0x19647a64,0xb1a007c0,0x0cca9767,0xfaa9079a,0xf68f72d5,0x3d81a25b .long 
0xff81578e,0x752067f8,0x9045447d,0x78622150,0x0505aa6f,0xc0c22fcf,0x6bed1c77,0x1030f0a6,0x1f0bd739,0x31f29f15,0xe6debe85,0x2d7989c7,0x8e677e98,0x5c070e72,0x06e81fd5,0x0a817bd3 .long 0xb0f2ac95,0xc110d830,0xab20e64e,0x48d0995a,0x7729cd9a,0x0f3e00e1,0xdd556946,0x2a570c20,0x4e86214d,0x912dbcfd,0xcf615498,0x2d014ee2,0x3530d76e,0x55e2b1e6,0xfd0fd6d1,0xc5135ae4 .long 0xd4f3049f,0x0066273a,0xe7087477,0xbb8e9893,0x14c6e5fd,0x2dba1ddb,0x51f57e6c,0xdba37886,0x5a72f2cf,0x5aaee0a6,0x7bea5642,0x1208bfbf,0x67872c37,0xf5c6aa3b,0x43f93224,0xd726e083 .long 0x061f1658,0x1854daa5,0xdf0cd2b3,0xc0016df1,0x833d50de,0xc2a3f23e,0xbbbd3017,0x73b681d2,0x3ac343c0,0x2f046dc4,0x85716421,0x9c847e7d,0x0917eed4,0xe1e13c91,0x63a1b9c6,0x3fc9eebd .long 0x7fe02299,0x0f816a72,0x294f3319,0x6335ccc2,0x4745c5be,0x3820179f,0x922f066e,0xe647b782,0x02cafb8a,0xc22e49de,0xfcc2eccc,0x299bc2ff,0x6e0e8282,0x9a8feea2,0xfe893205,0xa627278b .long 0x7933e47b,0xa7e19733,0x2e766402,0xf4ff6b13,0x98440d9f,0xa4d8be0a,0x38938808,0x658f5c2f,0xc95b3b3e,0x90b75677,0x3137b6ff,0xfa044269,0x43c47c29,0x077b039b,0x8a6445b2,0xcca95dd3 .long 0x2333fc4c,0x0b498ba4,0xf736a1b1,0x274f8e68,0x5f1d4b2e,0x6ca348fd,0xa8f10199,0x24d3be78,0xca14f530,0x8535f858,0x5b982e51,0xa6e7f163,0x36e1bf62,0x847c8512,0x03448418,0xf6a7c58e .long 0xf9374ab6,0x583f3703,0x6e564145,0x864f9195,0x22526d50,0x33bc3f48,0x1262a496,0x9f323c80,0x3f046a9a,0xaa97a7ae,0xdf8a039a,0x70da183e,0x52aa0ba6,0x5b68f71c,0x21459c2d,0x9be0fe51 .long 0xcbc613e5,0xc1e17eb6,0x497ea61c,0x33131d55,0xaf7eded5,0x2f69d39e,0xde6af11b,0x73c2f434,0xa4a375fa,0x4ca52493,0xb833c5c2,0x5f06787c,0x3e6e71cf,0x814e091f,0x8b746666,0x76451f57 .long 0x694db7e0,0x80f9bdef,0xb9fcddc6,0xedca8787,0x03b8dce1,0x51981c34,0x70e10ba1,0x4274dcf1,0x6def6d1a,0xf72743b8,0xebdb1866,0xd25b1670,0x050c6f58,0xc4491e8c,0x87fbd7f5,0x2be2b2ab .long 0xd111f8ec,0x3e0e5c9d,0xb7c4e760,0xbcc33f8d,0xbd392a51,0x702f9a91,0xc132e92d,0x7da4a795,0x0bb1151b,0x1a0b0ae3,0x02e32251,0x54febac8,0x694e9e78,0xea3a5082,0xe4fe40b8,0xe58ffec1 .long 0xd1e0cf9e,0xf85592fc,0xc0e7b2e8,0xdea75f0d,0xc135584e,0xc04215cf,0x2f57092a,0x174fc727,0xeb930bea,0xe7277877,0x5eb02a5a,0x504caccb,0xf5241b9b,0xf9fe08f7,0x8d5ca954,0xe7fb62f4 .long 0x29c4120b,0xfbb8349d,0xc0d0d915,0x9f94391f,0x5410ba51,0xc4074fa7,0x150a5911,0xa66adbf6,0x34bfca38,0xc164543c,0xb9e1ccfc,0xe0f27560,0xe820219c,0x99da0f53,0xc6b4997a,0xe8234498 .long 0x9d4c5423,0xcfb88b76,0xb0521c49,0x9e56eb10,0xbe8700a1,0x418e0b5e,0xf93cb58a,0x00cbaad6,0xd92a5e67,0xe923fbde,0x1f347f11,0xca4979ac,0x6bc0585b,0x89162d85,0xac3c70e3,0xdd6254af .long 0x516e19e4,0x7b23c513,0xc5c4d593,0x56e2e847,0x5ce71ef6,0x9f727d73,0xf79a44c5,0x5b6304a6,0x3ab7e433,0x6638a736,0xfe742f83,0x1adea470,0x5b7fc19f,0xe054b854,0xba1d0698,0xf935381a .long 0x799e9a74,0x546eab2d,0xa949f729,0x96239e0e,0x7090055a,0xca274c6b,0x9020c9b0,0x835142c3,0xa2e8807f,0xa405667a,0x1aa3d39e,0x29f2c085,0x42fc72f5,0xcc555d64,0xfbeacb3c,0xe856e0e7 .long 0x918e4936,0xb5504f9d,0xb2513982,0x65035ef6,0x6f4d9cb9,0x0553a0c2,0xbea85509,0x6cb10d56,0xa242da11,0x48d957b7,0x672b7268,0x16a4d3dd,0x8502a96b,0x3d7e637c,0x730d463b,0x27c7032b .long 0xe4136a14,0xbdc02b18,0x678e32bf,0xbacf969d,0xdd9c3c03,0xc98d89a3,0x23becc4f,0x7b92420a,0xc64d565c,0xd4b41f78,0x10f28295,0x9f969d00,0xb13d051a,0xec7f7f76,0xa92da585,0x08945e1e .long 0x5846426f,0x55366b7d,0x247d441d,0xe7d09e89,0x736fbf48,0x510b404d,0xe784bd7d,0x7fa003d0,0x17fd9596,0x25f7614f,0x35cb98db,0x49e0e0a1,0x2e83a76a,0x2c65957b,0xcddbe0f8,0x5d40da8d .long 
0x050bad24,0xf2b8c405,0xc2aa4823,0x8918426d,0xa38365a7,0x2aeab3dd,0x7c91b690,0x72031717,0x60a94120,0x8b00d699,0xe99eaeec,0x478a255d,0x6f60aafd,0xbf656a5f,0x5dee77b3,0xdfd7cb75 .long 0xa595939d,0x37f68bb4,0x28740217,0x03556479,0x84ad7612,0x8e740e7c,0x9044695f,0xd89bc843,0x85a9184d,0xf7f3da5d,0x9fc0b074,0x562563bb,0xf88a888e,0x06d2e6aa,0x161fbe7c,0x612d8643 .long 0xf64085e7,0x465edba7,0x29aa8511,0xb230f304,0xcda2d188,0x53388426,0x4b666649,0x90885735,0x652f54f6,0x6f02ff9a,0x5fae2bf0,0x65c82294,0x62f5eee3,0x7816ade0,0xfcc56d70,0xdcdbdf43 .long 0x54530bb2,0x9fb3bba3,0xcb0869ea,0xbde3ef77,0x0b431163,0x89bc9046,0xe4819a35,0x4d03d7d2,0x43b6a782,0x33ae4f9e,0x9c88a686,0x216db307,0x00ffedd9,0x91dd88e0,0x12bd4840,0xb280da9f .long 0x1635e741,0x32a7cb8a,0x78be02a7,0xfe14008a,0x1b7ae030,0x3fafb334,0x5add0ce9,0x7fd508e7,0xd607ad51,0x72c83219,0x8d40964a,0x0f229c0a,0x1c878da2,0x1be2c336,0xeab2ab86,0xe0c96742 .long 0x3e538cd7,0x458f8691,0x8e08ad53,0xa7001f6c,0xbf5d15ff,0x52b8c6e6,0x011215dd,0x548234a4,0x3d5b4045,0xff5a9d2d,0x4a904190,0xb0ffeeb6,0x48607f8b,0x55a3aca4,0x30a0672a,0x8cbd665c .long 0x42583068,0x87f834e0,0xf3f6e683,0x02da2aeb,0x05c12248,0x6b763e5d,0x65a8aefc,0x7230378f,0x71e8e5ca,0x93bd80b5,0xb3b62524,0x53ab041c,0x6c9c552e,0x1b860513,0xd5524e66,0xe84d402c .long 0xf37f5937,0xa37f3573,0xd1e4fca5,0xeb0f6c7d,0xac8ab0fc,0x2965a554,0x274676ac,0x17fbf56c,0xacf7d720,0x2e2f6bd9,0x10224766,0x41fc8f88,0x85d53bef,0x517a14b3,0x7d76a7d1,0xdae327a5 .long 0xc4818267,0x6ad0a065,0x37c1bbc1,0x33aa189b,0x27392a92,0x64970b52,0x2d1535ea,0x21699a1c,0xc2d7a7fd,0xcd20779c,0x99c83cf2,0xe3186059,0x72c0b8c7,0x9b69440b,0x7b9e0e4d,0xa81497d7 .long 0x1f5f82dc,0x515d5c89,0x6361079e,0x9a7f67d7,0x11a35330,0xa8da81e3,0x4b18be1b,0xe44990c4,0xaf103e59,0xc7d5ed95,0x8dac9261,0xece8aba7,0x9394b8d3,0xbe82b099,0x16adfe83,0x6830f09a .long 0x88172d01,0x250a29b4,0xcaff9e02,0x8b20bd65,0xe8a6329a,0xb8a7661e,0xd3fce920,0x4520304d,0x2b47f7ef,0xae45da1f,0x5bffc540,0xe07f5288,0x3464f874,0xf7997009,0xa6fa1f38,0x2244c2cd .long 0x94d7d9b1,0x43c41ac1,0xc82e7f17,0x5bafdd82,0x5fda0fca,0xdf0614c1,0xa8ae37ad,0x74b043a7,0x9e71734c,0x3ba6afa1,0x9c450f2e,0x15d5437e,0x67e242b1,0x4a5883fe,0x2c1953c2,0x5143bdc2 .long 0xfc5e8920,0x542b8b53,0x9a9cee08,0x363bf9a8,0xc3486e08,0x02375f10,0x8c5e70d2,0x2037543b,0x625640b4,0x7109bccc,0x8bc62c3b,0xcbc1051e,0x803f26ea,0xf8455fed,0xeb372424,0x6badceab .long 0x6b53f5f9,0xa2a9ce7c,0x1b176d99,0x64246595,0xb95c081b,0xb1298d36,0x1d9a9ee6,0x53505bb8,0xf2ba70b0,0x3f6f9e61,0x8afad453,0xd07e16c9,0xe7eb4a6a,0x9f1694bb,0x3cb0bc8e,0xdfebced9 .long 0x53868c8b,0x92d3dcdc,0x386107a6,0x174311a2,0x689b4e64,0x4109e07c,0x2df3dcb6,0x30e4587f,0x0811b3b2,0x841aea31,0x0cce43ea,0x6144d41d,0x2a9a7803,0x464c4581,0x3e158930,0xd03d371f .long 0xb1f3390b,0xc676d7f2,0xa5b61272,0x9f7a1b8c,0xc2e127a9,0x4ebebfc9,0x5dd997bf,0x4602500c,0x4711230f,0x7f09771c,0x020f09c1,0x058eb37c,0xfee5e38b,0xab693d4b,0x4653cbc0,0x9289eb1f .long 0xd51b9cf5,0xbecf46ab,0x9f0121af,0xd2aa9c02,0xe90dc274,0x36aaf7d2,0x48b95a3c,0x909e4ea0,0x6f32dbdb,0xe6b70496,0x8b030b3e,0x672188a0,0xcfb617e2,0xeeffe5b3,0x7c82709e,0x87e947de .long 0x1770f5a7,0xa44d2b39,0x0e44eb82,0xe4d4d791,0x3f69712a,0x42e69d1e,0xac6a820e,0xbf11c4d6,0x42c4224c,0xb5e7f3e5,0x449d941c,0xd6b4e81c,0x5450e878,0x5d72bd16,0xee25ac54,0x6a61e28a .long 0xe6f1cd95,0x33272094,0x0d18673f,0x7512f30d,0x5afc1464,0x32f7a4ca,0x6bbb977b,0x2f095656,0xa8226200,0x586f47ca,0x1ac07369,0x02c868ad,0xc613acbe,0x4ef2b845,0x0386054c,0x43d7563e .long 
0xab952578,0x54da9dc7,0x26e84d0b,0xb5423df2,0x9b872042,0xa8b64eeb,0x5990f6df,0xac205782,0x21f4c77a,0x4ff696eb,0xaab273af,0x1a79c3e4,0x9436b3f1,0x29bc922e,0xd6d9a27a,0xff807ef8 .long 0x778f22a0,0x82acea3d,0x5b5e7469,0xfb10b2e8,0x2818ee7d,0xc0b16980,0xc91c1a2f,0x011afff4,0xad124418,0x95a6d126,0xe72e295f,0x31c081a5,0xf2f4db75,0x36bb283a,0x7acef462,0xd115540f .long 0x33f6746c,0xc7f3a8f8,0xfea990ca,0x21e46f65,0xcaddb0a9,0x915fd5c5,0x78614555,0xbd41f016,0x426ffb58,0x346f4434,0x14dbc204,0x80559436,0x5a969b7f,0xf3dd20fe,0xe899a39a,0x9d59e956 .long 0x8ad4cf4b,0xf1b0971c,0x2ffb8fb8,0x03448860,0x65340ba4,0xf071ac3c,0xb27fd758,0x408d0596,0x98c364b0,0xe7c78ea4,0x051e8ab5,0xa4aac4a5,0x485d9002,0xb9e1d560,0x88844455,0x9acd518a .long 0xd06f56c0,0xe4ca688f,0xdf027972,0xa48af70d,0x5e9a609d,0x691f0f04,0xee61270e,0xa9dd82cd,0xa0ef18d3,0x8903ca63,0x3d6ca3bd,0x9fb7ee35,0xabf47d03,0xa7b4a09c,0x1c67de8e,0x4cdada01 .long 0x9355a244,0x52003749,0x4f2151a9,0xe77fd2b6,0x66b4efcb,0x695d6cf6,0xda2cfe25,0xc5a0cacf,0xef811865,0x104efe5c,0x9ea5cc3d,0xf52813e8,0x40b58dbc,0x855683dc,0x175fcb11,0x0338ecde .long 0x74921592,0xf9a05637,0xb9bb9d31,0xb4f1261d,0x4e9c5459,0x551429b7,0x6ea71f53,0xbe182e6f,0xdfc50573,0xd3a3b07c,0x62be8d44,0x9ba1afda,0x52ab65d3,0x9bcfd2cb,0xa9571802,0xdf11d547 .long 0x02a2404a,0x099403ee,0x21088a71,0x497406f4,0x5004ae71,0x99479409,0xa812c362,0xbdb42078,0xd8828442,0x2b72a30f,0xfcb5ed1c,0x283add27,0x66a40015,0xf7c0e200,0x08b295ef,0x3e3be641 .long 0xe038a675,0xac127dc1,0x8c5c6320,0x729deff3,0xa90d2c53,0xb7df8fd4,0x681e7cd3,0x9b74b0ec,0xdab407e5,0x5cb5a623,0x76b340c6,0xcdbd3615,0x7d28392c,0xa184415a,0xe96f7830,0xc184c1d8 .long 0x81d3a80f,0xc3204f19,0xc8e02432,0xfde0c841,0x8149e0c1,0x78203b3e,0x08053a73,0x5904bdbb,0x101b6805,0x30fc1dd1,0x49aa6d49,0x43c223bc,0x7a174087,0x9ed67141,0xd5997008,0x311469a0 .long 0x5e43fc61,0xb189b684,0xe0d3ab57,0xf3282375,0xb1181da8,0x4fa34b67,0x99ee52b8,0x621ed0b2,0xad990676,0x9b178de1,0x56d54065,0xd51de67b,0x7538c201,0x2a2c27c4,0x38a40f5c,0x33856ec8 .long 0xbe6cdcde,0x2522fc15,0x9f0c6f89,0x1e603f33,0x103e30a6,0x7994edc3,0x220c853e,0x033a00db,0xf7bb7fd7,0xd3cfa409,0x462d18f6,0x70f8781e,0x687fe295,0xbbd82980,0x595669f3,0x6eef4c32 .long 0x2f7e85c3,0x86a9303b,0x71988f9b,0x5fce4621,0xc138acb5,0x5b935bf6,0x25661212,0x30ea7d67,0xe51ab9a2,0xef1eb5f4,0xae067c78,0x0587c98a,0x77ca9ca6,0xb3ce1b3c,0x54b5f057,0x2a553d4d .long 0x4da29ec2,0xc7898236,0xb9c57316,0xdbdd5d13,0x2cd80d47,0xc57d6e6b,0xfe9e7391,0x80b460cf,0xf963c31e,0x98648cab,0xcc4d32fd,0x67f9f633,0xfdf7c687,0x0af42a9d,0x0b015ea7,0x55f292a3 .long 0xcd21ab3d,0x89e468b2,0xc393d392,0xe504f022,0xa5013af9,0xab21e1d4,0xc2c28acb,0xe3283f78,0x226bf99f,0xf38b35f6,0x0e291e69,0xe8354274,0xb20c162d,0x61673a15,0xb04fbdbe,0xc101dc75 .long 0x255bd617,0x8323b4c2,0x6c2a9154,0x6c969693,0x62679387,0xc6e65860,0xb8c88e23,0x8e01db0c,0x893a5559,0x33c42873,0x47a3e149,0x7630f04b,0xddcf35f8,0xb5d80805,0x77dfe732,0x582ca080 .long 0x0b1894a0,0x2c7156e1,0xd81c68c0,0x92034001,0xc8b115b5,0xed225d00,0x83b907f2,0x237f9c22,0x4470e2c0,0x0ea2f32f,0x58be4e95,0xb725f7c1,0xb1ae5463,0x0f1dcafa,0x1ba2fc04,0x59ed5187 .long 0xd0115d4d,0xf6e0f316,0xd3691599,0x5180b12f,0x527f0a41,0x157e32c9,0xa8e0ecc0,0x7b0b081d,0xbf4f0dd0,0x6dbaaa8a,0x4d252696,0x99b289c7,0xdbf864fe,0x79b7755e,0x76cad3ab,0x6974e2b1 .long 0x06ddd657,0x35dbbee2,0x2ff3a96d,0xe7cbdd11,0x076be758,0x88381968,0x08c91f5d,0x2d737e72,0x86ec3776,0x5f83ab62,0x945fa7a1,0x98aa649d,0x72ef0933,0xf477ec37,0x098c17b1,0x66f52b1e .long 
0xd803738b,0x9eec58fb,0xe4e86aa4,0x91aaade7,0xa5b51492,0x6b1ae617,0xbbc45974,0x63272121,0x862c5129,0x7e0e28f0,0x3321a4a0,0x0a8f79a9,0x5041c88f,0xe26d1664,0x53233e3a,0x0571b805 .long 0xc9520711,0xd1b0ccde,0x3c8b84bf,0x55a9e4ed,0xa1fef314,0x9426bd39,0x6eb93f2b,0x4f5f638e,0x2bf9341b,0xba2a1ed3,0x4d42d5a9,0xd63c1321,0x316dc7c5,0xd2964a89,0xca511851,0xd1759606 .long 0xf9e6ed35,0xd8a9201f,0x6736925a,0xb7b5ee45,0x99581af7,0x0a83fbbc,0x64eeb051,0x3076bc40,0x02dec312,0x5511c98c,0x238dcb78,0x270de898,0x539c08c9,0x2cf4cf9c,0x38d3b06e,0xa70cb65e .long 0xcfe57bbd,0xb12ec10e,0x35a0c2b5,0x82c7b656,0x161c67bd,0xddc7d5cd,0xae3a32cc,0xe32e8985,0xd11a5529,0x7aba9444,0x2427fa1a,0xe964ed02,0x24a1770a,0x1528392d,0x12c72fcd,0xa152ce2c .long 0x8ec07649,0x714553a4,0x459dd453,0x18b4c290,0x7b64b110,0xea32b714,0x2e6f07a2,0xb871bfa5,0x9e2e3c9b,0xb67112e5,0x44aa90f6,0xfbf250e5,0xbd539006,0xf77aedb8,0xd172a66f,0x3b0cdf9a .long 0xf8c51187,0xedf69fea,0x741e4da7,0x05bb67ec,0x08114345,0x47df0f32,0xbb9792b1,0x56facb07,0x8f6229e4,0xf3e007e9,0x526fba0f,0x62d103f4,0xb0339d79,0x4f33bef7,0xb59bfec1,0x9841357b .long 0xc34e6705,0xfa8dbb59,0x7fdaa84c,0xc3c7180b,0xa4108537,0xf95872fc,0x932a3e5a,0x8750cc3b,0xb7275d7d,0xb61cc69d,0x2e59b2e9,0xffa0168b,0x6ecbb493,0xca032abc,0x2c9082d8,0x1d86dbd3 .long 0xe28ef5ba,0xae1e0b67,0xcb18e169,0x2c9a4699,0x1e6bbd20,0x0ecd0e33,0xaf5e81d2,0x571b360e,0x101c1d45,0xcd9fea58,0x18880452,0x6651788e,0x1f8dd446,0xa9972635,0xe37281d0,0x44bed022 .long 0x33da525d,0x094b2b2d,0x13144fd8,0xf193678e,0xf4c1061d,0xb8ab5ba4,0xdccbe0f4,0x4343b5fa,0x63812713,0xa8702371,0xf7611d93,0x47bf6d2d,0xbd21e1d7,0x46729b8c,0xd629e77d,0x7484d4e0 .long 0x60dbac1f,0x830e6eea,0xda06a2f7,0x23d8c484,0x50ca535b,0x896714b0,0xebd97a9b,0xdc8d3644,0xb12177b4,0x106ef9fa,0x534d5d9c,0xf79bf464,0xa6ab360b,0x2537a349,0xa00c744f,0xc7c54253 .long 0xe5911a76,0xb3c7a047,0x647f1ee7,0x61ffa5c8,0x8f56ab42,0x15aed36f,0xa3ff9ac9,0x6a0d41b0,0xcc30d357,0x68f469f5,0x6b72be96,0xbe9adf81,0x903ad461,0x1cd926fe,0xcaca441b,0x7e89e38f .long 0xfacf69d4,0xf0f82de5,0x4775344c,0x363b7e76,0xb2e36d04,0x6894f312,0x11d1c9a5,0x3c6cb4fe,0x4008e1f2,0x85d9c339,0x249f326c,0x5e9a85ea,0x678c5e06,0xdc35c60a,0x9f86fba9,0xc08b944f .long 0x89f71f0f,0xde40c02c,0xff3da3c0,0xad8f3e31,0x42125ded,0x3ea5096b,0xa7379183,0x13879cbf,0x6b306a0b,0x6f4714a5,0x67646c5e,0x359c2ea6,0x07726368,0xfacf8943,0x65ff431e,0x07a58935 .long 0x68754ab0,0x24d661d1,0x6f429a76,0x801fce1d,0xa58ce769,0xc068a85f,0x5d5eca2b,0xedc35c54,0xa3f660d1,0xea31276f,0xb8fc7167,0xa0184ebe,0x1d8db0ae,0x0f20f21a,0x56c35e12,0xd96d095f .long 0xf8c2a25b,0xedf402b5,0x059204b6,0x1bb772b9,0x19b4e34c,0x50cbeae2,0x3fa0845a,0x93109d80,0x8ef59fb5,0x54f7ccf7,0x88070963,0x3b438fe2,0x31f3ba9b,0x9e28c659,0xead9da92,0x9cc31b46 .long 0xb733aa5f,0x3c2f0ba9,0xf05af235,0xdece47cb,0xa2ac82a5,0xf8e3f715,0x2203f18a,0xc97ba641,0x09c11060,0xc3af5504,0x46af512d,0x56ea2c05,0xf3f28146,0xfac28daf,0x959ef494,0x87fab43a .long 0xd4c5105f,0x09891641,0x6d7fbd65,0x1ae80f8e,0xbee6bdb0,0x9d67225f,0x7fc4d860,0x3b433b59,0x93e85638,0x44e66db6,0xe3e9862f,0xf7b59252,0x665c32ec,0xdb785157,0xae362f50,0x702fefd7 .long 0x0fefb0c3,0x3754475d,0x46d7c35d,0xd48fb56b,0x363798a4,0xa070b633,0x8fdb98e6,0xae89f3d2,0x6363d14c,0x970b89c8,0x67abd27d,0x89817521,0x44d5a021,0x9bf7d474,0xcac72aee,0xb3083baf .long 0xbe949a44,0x389741de,0x546a4fa5,0x638e9388,0xa0047bdc,0x3fe6419c,0xaaea57ca,0x7047f648,0x41fbab17,0x54e48a90,0x576bdba2,0xda8e0b28,0xc72afddc,0xe807eebc,0xf42577bf,0x07d3336d .long 
0xbfe20925,0x62a8c244,0x8fdce867,0x91c19ac3,0xdd387063,0x5a96a5d5,0x21d324f6,0x61d587d4,0xa37173ea,0xe87673a2,0x53778b65,0x23848008,0x05bab43e,0x10f8441e,0x4621efbe,0xfa11fe12 .long 0x81685d7b,0x047b772e,0xbf34a976,0x23f27d81,0x915f48ef,0xc27608e2,0xa521d5c3,0x3b0b43fa,0x63ca7284,0x7613fb26,0x1d4db837,0x7f5729b4,0x583b526b,0x87b14898,0xbbadd3d1,0x00b732a6 .long 0x2048e396,0x8e02f426,0x383d9de4,0x436b50b6,0x471e85ad,0xf78d3481,0xd005c8d6,0x8b01ea6a,0x97015c07,0xd3c7afee,0x4e3ba2ae,0x46cdf1a9,0x83d3a1d2,0x7a42e501,0xb541dff4,0xd54b5268 .long 0x4e23e9bc,0x3f24cf30,0x126e3624,0x4387f816,0x3b0b6d61,0x26a46a03,0x8b2d777c,0xaf1bc845,0x527de79c,0x25c401ba,0x4261bbb6,0x0e1346d4,0x287b4bc7,0x4b96c44b,0x5254562f,0x658493c7 .long 0xb8a24a20,0x23f949fe,0xf52ca53f,0x17ebfed1,0xbcfb4853,0x9b691bbe,0x6278a05d,0x5617ff6b,0xe3c99ebd,0x241b34c5,0x1784156a,0xfc64242e,0x695d67df,0x4206482f,0xee27c011,0xb967ce0e .long 0x21c80b5d,0x65db3751,0xa31ecca0,0x2e7a563c,0x5238a07e,0xe56ffc4e,0x32ced854,0x3d6c2966,0xaf70b885,0xe99d7d1a,0x2d686459,0xafc3bad9,0x0cc8ba5b,0x9c78bf46,0x18955aa3,0x5a439519 .long 0x5fe4e314,0xf8b517a8,0xfcb8906f,0xe60234d0,0xf2061b23,0xffe542ac,0x6b4cb59c,0x287e191f,0x09d877d8,0x21857ddc,0x14678941,0x1c23478c,0xb6e05ea4,0xbbf0c056,0xb01594fe,0x82da4b53 .long 0xfadb8608,0xf7526791,0x7b74cdf6,0x049e832d,0xc2b90a34,0xa43581cc,0x9360b10c,0x73639eb8,0xe1e4a71b,0x4fba331f,0x8072f919,0x6ffd6b93,0x65679032,0x6e53271c,0xf14272ce,0x67206444 .long 0xb2335834,0xc0f734a3,0x90ef6860,0x9526205a,0x04e2bb0d,0xcb8be717,0x02f383fa,0x2418871e,0x4082c157,0xd7177681,0x29c20073,0xcc914ad0,0xe587e728,0xf186c1eb,0x61bcd5fd,0x6fdb3c22 .long 0xf2f9f8e9,0x30d014a6,0x4fec49d2,0x963ece23,0x9605a8d9,0x862025c5,0x19f8929a,0x39874445,0x12bf476a,0x01b6ff65,0x09cf7d91,0x598a64d8,0x93be56ca,0xd7ec7749,0xcbb33615,0x10899785 .long 0x02eee3ad,0xb8a092fd,0x30145270,0xa86b3d35,0x8512b675,0x323d98c6,0x62ebb40f,0x4b8bc785,0x413f9cde,0x7d301f54,0x2bab5664,0xa5e4fb4f,0x1cbfec23,0x1d2b252d,0xe177120d,0xfcd576bb .long 0x83731a34,0x04427d3e,0xed836e8e,0x2bb9028e,0xb612ca7c,0xb36acff8,0xd3d9c73a,0xb88fe5ef,0xedea4eb3,0xbe2a6bc6,0x488eec77,0x43b93133,0xb17106e1,0xf41ff566,0x654efa32,0x469e9172 .long 0x41c23fa3,0xb4480f04,0xc1989a2e,0xb4712eb0,0x93a29ca7,0x3ccbba0f,0xd619428c,0x6e205c14,0xb3641686,0x90db7957,0x45ac8b4e,0x0432691d,0xf64e0350,0x07a759ac,0x9c972517,0x0514d89c .long 0xa8e67fc3,0x1701147f,0xab2085be,0x9e2e0b8b,0xac284e57,0xd5651824,0x74893664,0x890d4325,0xc55e68a3,0x8a7c5e6e,0x4339c85a,0xbf12e90b,0xf922b655,0x31846b85,0x0bf4d700,0x9a54ce4d .long 0xf1a14295,0xd7f4e83a,0xb285d4f9,0x916f955c,0x99ffdaba,0xe57bb0e0,0xeab0d152,0x28a43034,0xb8a9cef8,0x0a36ffa2,0xb9ec051a,0x5517407e,0xea68e672,0x9c796096,0xfb3c77fb,0x853db5fb .long 0xe864a51a,0x21474ba9,0x6e8a1b8b,0x6c267699,0x94120a28,0x7c823626,0x8383a5db,0xe61e9a48,0x9f84216d,0x7dd75003,0xad43cd85,0xab020d07,0xda12c659,0x9437ae48,0xe65452ad,0x6449c2eb .long 0x2cf9d7c1,0xcc7c4c1c,0xee95e5ab,0x1320886a,0xbeae170c,0xbb7b9056,0xdbc0d662,0xc8a5b250,0xc11d2303,0x4ed81432,0x1f03769f,0x7da66912,0x84539828,0x3ac7a5fd,0x3bccdd02,0x14dada94 .long 0x7ef6b0d1,0x8b84c321,0x7c933f22,0x52a9477a,0xfd440b82,0x5ef6728a,0x6ce4bd5e,0x5c3bd859,0xf22c2d3e,0x918b80f5,0xb7bb6cc5,0x368d5040,0x2695a11c,0xb66142a1,0xeb19ea70,0x60ac583a .long 0x0eab2437,0x317cbb98,0x5e2654c8,0x8cc08c55,0xe6d8307f,0xfe2d6520,0x57428993,0xe9f147f3,0xd2fd6cf1,0x5f9c7d14,0x2d4fcbb0,0xa3ecd064,0x8e7341f7,0xad83fef0,0x3a63115c,0x643f23a0 .long 
0xe65ab743,0xd38a78ab,0x35edc89c,0xbf7c75b1,0x530df568,0x3dd8752e,0xe308c682,0xf85c4a76,0xe68acf37,0x4c9955b2,0xab32af85,0xa544df3d,0xa25cf493,0x4b8ec3f5,0x1a622feb,0x4d8f2764 .long 0xf0dcbc49,0x7bb4f7aa,0x70bbb45b,0x7de551f9,0x9f2ca2e5,0xcfd0f3e4,0x1f5c76ef,0xece58709,0x167d79ae,0x32920edd,0xfa7d7ec1,0x039df8a2,0xbb30af91,0xf46206c0,0x22676b59,0x1ff5e2f5 .long 0x6ea51d66,0x11f4a039,0x807d7a26,0x506c1445,0x755a9b24,0x60da5705,0x1f1a319e,0x8fc8cc32,0x9433d67d,0x83642d4d,0x6a7dd296,0x7fa5cb8f,0x9b7bde07,0x576591db,0x419716fb,0x13173d25 .long 0xd5b340ff,0xea30599d,0xb0fe76c5,0xfc6b5297,0xab8f5adc,0x1c6968c8,0x901c928d,0xf723c7f5,0x9773d402,0x4203c321,0x1b51dd47,0xdf7c6aa3,0x552be23c,0x3d49e37a,0x0b5a6e87,0x57febee8 .long 0x7bd8e739,0xc5ecbee4,0xae63bf75,0x79d44994,0x38fb8923,0x168bd00f,0xd0533130,0x75d48ee4,0xdb5cdf33,0x554f77aa,0x3c696769,0x3396e896,0xd3fd674e,0x2fdddbf2,0x99d0e3e5,0xbbb8f6ee .long 0xcbae2f70,0x51b90651,0x93aaa8eb,0xefc4bc05,0xdd1df499,0x8ecd8689,0x22f367a5,0x1aee99a8,0xae8274c5,0x95d485b9,0x7d30b39c,0x6c14d445,0xbcc1ef81,0xbafea90b,0xa459a2ed,0x7c5f317a .long 0x4ef44227,0x01211075,0xdc20f496,0xa17bed6e,0x819853cd,0x0cdfe424,0xf71e2ce7,0x13793298,0xdbbe307b,0x3c1f3078,0x76ee9936,0x6dd1c20e,0x423caa20,0x23ee4b57,0x8efb840e,0x4ac3793b .long 0xed1f8ca0,0x934438eb,0x4ebb25a2,0x3e546658,0xc069896f,0xc415af0e,0x9a5aa43d,0xc13eddb0,0xd49eb8f6,0x7a04204f,0xd74f1670,0xd0d5bdfc,0x56fc0558,0x3697e286,0x01cebade,0x10207371 .long 0x0647a82b,0x5f87e690,0x8f40054f,0x908e0ed4,0x79853803,0xa9f633d4,0x4a28b252,0x8ed13c9a,0x1f460f64,0x3e2ef676,0x36d06336,0x53930b9b,0x8fc4979b,0x347073ac,0x5ecd5597,0x84380e0e .long 0xc4fe3c39,0xe3b22c6b,0x6c7bebdf,0xba4a8153,0x25693459,0xf23ab6b7,0x14922b11,0x53bc3770,0x5afc60db,0x4645c8ab,0x20b9f2a3,0xaa022355,0xce0fc507,0x52a2954c,0x7ce1c2e7,0x8c2731bb .long 0x18a0339d,0xf39608ab,0x3735436c,0xac7a658d,0xcd992b4f,0xb22c2b07,0xf40dcfd4,0x4e83daec,0x2f39ea3e,0x8a34c7be,0xb0a56d2e,0xef0c005f,0x6edd8038,0x62731f6a,0x4e3cb075,0x5721d740 .long 0xfbeeee1b,0x1ea41511,0xef1d0c05,0xd1ef5e73,0x73c07d35,0x42feefd1,0x8a329493,0xe530a00a,0xf15ebfb0,0x5d55b7fe,0xd322491a,0x549de03c,0x745b3237,0xf7b5f602,0x1ab6e2b6,0x3632a3a2 .long 0x0ef59f78,0x0d3bba89,0xc9e52b9a,0x0dfc6443,0x72631447,0x1dc79699,0xb3be20b1,0xef033917,0xb1383948,0x0c92735d,0xc0dd7d7d,0xc1fc29a2,0x403ed068,0x6485b697,0xaac93bdc,0x13bfaab3 .long 0x0deeaf52,0x410dc6a9,0x4c641c15,0xb003fb02,0x5bc504c4,0x1384978c,0x864a6a77,0x37640487,0x222a77da,0x05991bc6,0x5e47eb11,0x62260a57,0xf21b432c,0xc7af6613,0xab4953e9,0x22f3acc9 .long 0x8e41d155,0x52934922,0x3ac059ef,0x4d024568,0x4d884411,0xb0201755,0xa59a178f,0xce8055cf,0xf6204549,0xcd77d1af,0xc7066759,0xa0a00a3e,0x0272c229,0x471071ef,0xd3c4b6b0,0x009bcf6b .long 0x22305177,0x2a2638a8,0x41645bbf,0xd51d59df,0xc0a7a3c0,0xa81142fd,0x4c7063ee,0xa17eca6d,0x60d9dcec,0x0bb887ed,0x20ad2455,0xd6d28e51,0xa67102ba,0xebed6308,0x8bffa408,0x042c3114 .long 0x8aa68e30,0xfd099ac5,0x1483513e,0x7a6a3d7c,0xba2d8f0c,0xffcc6b75,0x1e78b954,0x54dacf96,0xa4a9af89,0xf645696f,0x06ac98ec,0x3a411940,0x22a67a20,0x41b8b3f6,0x99dec626,0x2d0b1e0f .long 0x40be34e8,0x27c89192,0x91907f35,0xc7162b37,0xa956702b,0x90188ec1,0xdf93769c,0xca132f7d,0x0e2025b4,0x3ece44f9,0x0c62f14c,0x67aaec69,0x22e3cc11,0xad741418,0x7ff9a50e,0xcf9b75c3 .long 0x4d348272,0x02fa2b16,0x9959d56d,0xbd99d61a,0x18762916,0xbc4f19db,0x49c1ac80,0xcc7cce50,0xd846bd83,0x4d59ebaa,0xa9202849,0x8775a9dc,0x6e1f4ca9,0x07ec4ae1,0xba893f11,0x27eb5875 .long 
0x662cc565,0x00284d51,0x0db4138d,0x82353a6b,0xaa32a594,0xd9c7aaaa,0xa5669c47,0xf5528b5e,0x2f23c5ff,0xf3220231,0x6affa3a1,0xe3e8147a,0x202ddda0,0xfb423d5c,0x6b871bd4,0x3d6414ac .long 0xa51a168a,0x586f82e1,0x48ae5448,0xb712c671,0x76233eb8,0x9a2e4bd1,0x78811ca9,0x0188223a,0xf7c18de1,0x553c5e21,0xb27bb286,0x7682e451,0x0e51e929,0x3ed036b3,0xec9cb34f,0xf487211b .long 0x0c24efc8,0x0d094277,0xbef737a4,0x0349fd04,0x514cdd28,0x6d1c9dd2,0x30da9521,0x29c135ff,0xf78b0b6f,0xea6e4508,0x678c143c,0x176f5dd2,0x4be21e65,0x08148418,0xe7df38c4,0x27f7525c .long 0x748ab1a4,0x1fb70e09,0x5efe4433,0x9cba50a0,0x15f75af2,0x7846c7a6,0x5ee73ea8,0x2a7c2c57,0x3f0a449a,0x42e566a4,0xad90fc3d,0x45474c3b,0x8b61d057,0x7447be3d,0x3a4ec092,0x3e9d1cf1 .long 0xf380a6e6,0x1603e453,0x9b1437c2,0x0b86e431,0xef29610a,0x7a4173f2,0xf03d57f7,0x8fa729a7,0x6c9c217e,0x3e186f6e,0x91919524,0xbe1d3079,0x153d4fb1,0x92a62a70,0xd68c2f71,0x32ed3e34 .long 0x9eb1a8b7,0xd785027f,0xc5b22fe8,0xbc37eb77,0xb9d6a191,0x466b34f0,0x9a05f816,0x008a89af,0x7d42c10a,0x19b028fb,0x49b3f6b8,0x7fe8c92f,0xa5a0ade3,0x58907cc0,0x559d1a7c,0xb3154f51 .long 0xd9790ed6,0x5066efb6,0xa6aa793b,0xa77a0cbc,0x223e042e,0x1a915f3c,0x69c5874b,0x1c5def04,0x73b6c1da,0x0e830078,0xfcd8557a,0x55cf85d2,0x0460f3b1,0x0f7c7c76,0x46e58063,0x87052acb .long 0x907eae66,0x09212b80,0x4d721c89,0x3cb068e0,0xdd45ac1c,0xa87941ae,0x0daa0dbb,0xde8d5c0d,0xe3502e6e,0xda421fdc,0x4d89a084,0xc8944201,0xf0c24bfb,0x7307ba5e,0x20bde0ef,0xda212beb .long 0xf82ce682,0xea2da24b,0x07f71fe4,0x058d3816,0x5ffad8de,0x35a02462,0xaadcefab,0xcd7b05dc,0x1d9f54ec,0xd442f8ed,0xb2d3b5ca,0x8be3d618,0xe06b2ce2,0xe2220ed0,0x1b0da4c0,0x82699a5f .long 0x71c0c3a7,0x3ff106f5,0x0d34180c,0x8f580f5a,0x22d7d375,0x4ebb120e,0xe9513675,0x5e5782cc,0x99c82a70,0x2275580c,0x15ea8c4c,0xe8359fbf,0x7b415e70,0x53b48db8,0x100c6014,0xaacf2240 .long 0xe4652f1d,0x9faaccf5,0xd56157b2,0xbd6fdd2a,0x6261ec50,0xa4f4fb1f,0x476bcd52,0x244e55ad,0x047d320b,0x881c9305,0x6181263f,0x1ca983d5,0x278fb8ee,0x354e9a44,0x396e4964,0xad2dbc0f .long 0x9268b3de,0x723f3aa2,0xe6e0609a,0x0d1ca29a,0x6cf44252,0x794866aa,0x01af87ed,0x0b59f3e3,0x7f4a6c51,0xe234e5ff,0x61dc2f7e,0xa8768fd2,0x0a94d81f,0xdafc7332,0x06938ce1,0xd7f84282 .long 0x0546063e,0xae0b3c0e,0x5d61abc6,0x7fbadcb2,0x369ac400,0xd5d7a2c9,0xae67d10c,0xa5978d09,0x4f85eaac,0x290f211e,0xfacac681,0xe61e2ad1,0x388384cd,0xae125225,0xccfde30f,0xa7fb68e9 .long 0x3daed4c2,0x7a59b936,0x2606f789,0x80a9aa40,0xf6a6d90a,0xb40c1ea5,0x514d5885,0x948364d3,0x70985182,0x062ebc60,0x33310895,0xa6db5b0e,0xe329c2f5,0x64a12175,0x90ea237e,0xc5f25bd2 .long 0x2d0a4c23,0x7915c524,0x6bb3cc52,0xeb5d26e4,0xc09e2c92,0x369a9116,0xcf182cf8,0x0c527f92,0x2aede0ac,0x9e591938,0x6cc34939,0xb2922208,0x99a34361,0x3c9d8962,0xc1905fe6,0x3c81836d .long 0xa001ec5a,0x4bfeb57f,0xa0dc5dba,0xe993f5bb,0x724a1380,0x47884109,0x32fe9a04,0x8a0369ab,0x8c927db8,0xea068d60,0x94655741,0xbf5f37cf,0x04b6c7ea,0x47d402a2,0x6af259cb,0x4551c295 .long 0xed77ee8b,0x698b71e7,0xf309d5c7,0xbddf7bd0,0x34e780ca,0x6201c22c,0x4c295ef4,0xab04f7d8,0x4313a8ce,0x1c947294,0x92ca4cfe,0xe532e4ac,0xd0a7a97a,0x89738f80,0xa580fd5b,0xec088c88 .long 0x42ce9e51,0x612b1ecc,0xb25fdd2a,0x8f9840fd,0x01e7f839,0x3cda78c0,0xece05480,0x546b3d3a,0x80d30916,0x271719a9,0x584c20c4,0x45497107,0x5bc78608,0xaf8f9478,0x277e2a4c,0x28c7d484 .long 0x88a2ffe4,0xfce01767,0x28e169a5,0xdc506a35,0x7af9c93a,0x0ea10861,0x03fa0e08,0x1ed24361,0xa3d694e7,0x96eaaa92,0xef50bc74,0xc0f43b4d,0x64114db4,0xce6aa58c,0x7c000fd4,0x8218e8ea .long 
0x185f8844,0xac815dfb,0x1557abfb,0xcd7e90cb,0xafbfecdf,0x23d16655,0x085cac4a,0x80f3271f,0xd0e62f47,0x7fc39aa7,0x460a48e5,0x88d519d1,0xd28f101e,0x59559ac4,0xca9ae816,0x7981d9e9 .long 0x9ac38203,0x5c38652c,0x57657fe5,0x86eaf87f,0xe21f5416,0x568fc472,0xe7e597b5,0x2afff39c,0x256d4eab,0x3adbbb07,0x8285ab89,0x22598692,0x041caefe,0x35f8112a,0xa5064c8b,0x95df02e3 .long 0xc7004bf3,0x4d63356e,0xdb83c7de,0x230a08f4,0x8709a7b7,0xca27b270,0xcb9abd2d,0x0d1c4cc4,0x7550fee8,0x8a0bc66e,0x9cf7247e,0x369cd4c7,0x92b5b7e7,0x75562e84,0x5802af7b,0x8fed0da0 .long 0xe48fb889,0x6a7091c2,0x7b8a9d06,0x26882c13,0x1b82a0e2,0xa2498663,0x3518152d,0x844ed736,0xd86e27c7,0x282f476f,0x04afefdc,0xa04edaca,0x6119e34d,0x8b256ebc,0x0787d78b,0x56a413e9 .long 0x5a74be50,0x82ee061d,0xdea16ff5,0xe41781c4,0x99bfc8a2,0xe0b0c81e,0x0b547e2d,0x624f4d69,0xbdcc9ae4,0x3a83545d,0x409b1e8e,0x2573dbb6,0xa6c93539,0x482960c4,0x5ae18798,0xf01059ad .long 0x3112795f,0x715c9f97,0x984e6ee1,0xe8244437,0xecb66bcd,0x55cb4858,0xabaffbee,0x7c136735,0x5dbec38e,0x54661595,0x388ad153,0x51c0782c,0xc6e0952f,0x9ba4c53a,0x1b21dfa8,0x27e6782a .long 0x4ed2dbc2,0x682f903d,0x7c3b2d83,0x0eba59c8,0x9c7e9335,0x8e9dc84d,0x0eb226d7,0x5f9b21b0,0xaf267bae,0xe33bd394,0xbe2e15ae,0xaa86cc25,0x6a8ec500,0x4f0bf67d,0xf9630658,0x5846aa44 .long 0xe2c2bf15,0xfeb09740,0xa9e99704,0x627a2205,0xc2fbc565,0xec8d73d0,0xc20c8de8,0x223eed8f,0xa8363b49,0x1ee32583,0xc9c2b0a6,0x1a0b6cb9,0x90dbc85c,0x49f7c3d2,0x1ef4c1ac,0xa8dfbb97 .long 0x65c7c2ab,0xafb34d4c,0xe2c5ea84,0x1d4610e7,0x973c4ab5,0x893f6d1b,0x945ba5c4,0xa3cdd7e9,0x064417ee,0x60514983,0xad6bdf2b,0x1459b23c,0x5cf726c3,0x23b2c341,0x32d6354a,0x3a829635 .long 0xab192c18,0x294f901f,0x7030164f,0xec5fcbfe,0xe2246ba6,0xe2e2fcb7,0x221a1a0c,0x1e7c88b3,0xc92d88c5,0x72c7dd93,0x1106fb59,0x41c2148e,0xa0f60f14,0x547dd4f5,0x63960f31,0xed9b52b2 .long 0xb0a5b358,0x6c8349eb,0x9e7e2ed6,0xb154c5c2,0xeda462db,0xcad5eccf,0x2de66b69,0xf2d6dbe4,0x8665e5b2,0x426aedf3,0x7b7f5723,0x488a8513,0x8bcbb386,0x15cc43b3,0xd791d879,0x27ad0af3 .long 0x846e364f,0xc16c236e,0xdea50ca0,0x7f33527c,0x0926b86d,0xc4810775,0x0598e70c,0x6c2a3609,0xf024e924,0xa6755e52,0x9db4afca,0xe0fa07a4,0x66831790,0x15c3ce7d,0xa6cbb0d6,0x5b4ef350 .long 0xb6205969,0x2c4aafc4,0xf6c7854f,0x42563f02,0x1d983b48,0x016aced5,0x99949755,0xfeb356d8,0xd1a39bd7,0x8c2a2c81,0xe6934ae9,0x8f44340f,0x447904da,0x148cf91c,0x0f51a926,0x7340185f .long 0x7409ab46,0x2f8f00fb,0x80e289b2,0x057e78e6,0xa888e5d1,0x03e5022c,0x9dede4e2,0x3c87111a,0x7809460b,0x5b9b0e1c,0x71c9abc7,0xe751c852,0xc7cc1dc9,0x8b944e28,0x1d3cfa08,0x4f201ffa .long 0x3e6721ce,0x02fc905c,0xd0b3674c,0xd52d70da,0x18810da4,0x5dc2e5ca,0x5c69dd99,0xa984b273,0x84de5ca4,0x63b92527,0xc852dec4,0x2f1c9872,0xc2e3de09,0x18b03593,0x9813dc2f,0x19d70b01 .long 0xa6dc1d29,0x42806b2d,0xf871e144,0xd3030009,0xaaf49276,0xa1feb333,0xc70bc04b,0xb5583b9e,0x95695f20,0x1db0be78,0x89d012b5,0xfc841811,0x05f61643,0x6409f272,0xd5883128,0x40d34174 .long 0x67419833,0xd79196f5,0x863b7b08,0x6059e252,0x1c56700c,0x84da1817,0xb28d3ec4,0x5758ee56,0x013b0ea6,0x7da2771d,0x54c5e9b9,0xfddf524b,0x24305d80,0x7df4faf8,0x3a97763f,0x58f5c1bf .long 0x7c696042,0xa5af37f1,0x4a2538de,0xd4cba22c,0x9ea42600,0x211cb995,0x7b069889,0xcd105f41,0xddb81e74,0xb1e1cf19,0x5157b8ca,0x472f2d89,0xee9db885,0x086fb008,0x0f26d131,0x365cd570 .long 0xa2be7053,0x284b02bb,0x7ab9a6d6,0xdcbbf7c6,0x20f7a530,0x4425559c,0x188767c8,0x961f2dfa,0x70dc80c4,0xe2fd9435,0xf0784120,0x104d6b63,0x53567122,0x7f592bc1,0xf688ad77,0xf6bc1246 .long 
0x0f15dde9,0x05214c05,0x0d5f2b82,0xa47a76a8,0x62e82b62,0xbb254d30,0x3ec955ee,0x11a05fe0,0x9d529b36,0x7eaff46e,0x8f9e3df6,0x55ab1301,0x99317698,0xc463e371,0xccda47ad,0xfd251438 .long 0x23d695ea,0xca9c3547,0x16e589b5,0x48ce626e,0xb187d086,0x6b5b64c7,0xb2207948,0xd02e1794,0x7198111d,0x8b58e98f,0xdcf9c3cc,0x90ca6305,0xf34089b0,0x5691fe72,0xfc7c80ff,0x60941af1 .long 0x22eb51e5,0xa09bc0a2,0xaa9cf09a,0xc0bb7244,0x80159f06,0x36a8077f,0xdddc560e,0x8b5c989e,0x512e1f43,0x19d2f316,0xad08ff62,0x02eac554,0x07d20b4e,0x012ab84c,0xd6d4e4e1,0x37d1e115 .long 0xab7b19a8,0xb6443e1a,0xdef8cd45,0xf08d067e,0x685e03da,0x63adf3e9,0x4792b916,0xcf15a10e,0xb738a425,0xf44bcce5,0x9636b2fd,0xebe131d5,0x7850d605,0x94068841,0xb40d749d,0x09684eaa .long 0x72ba075b,0x8c3c669c,0xba469015,0x89f78b55,0x3e9f8ba8,0x5706aade,0xb32d7ed7,0x6d8bd565,0x805f08d6,0x25f4e63b,0xc3bcc1b5,0x7f48200d,0xb025d847,0x4e801968,0x87cbe0a8,0x74afac04 .long 0x7e63d690,0x43ed2c2b,0x0223cdb8,0xefb6bbf0,0x2884d3fe,0x4fec3cae,0xd75e25a4,0x065ecce6,0x69f79071,0x6c2294ce,0x044b8666,0x0d9a8e5f,0x17b69d8f,0x5009f238,0xc5dfdaf7,0x3c29f8fe .long 0xebae68c4,0x9067528f,0x30c5ba21,0x5b385632,0x1fdd1aec,0x540df119,0xcfba4c78,0xcf37825b,0xbeb11454,0x77eff980,0x60c1b066,0x40a1a991,0xf889a1c7,0xe8018980,0x76c24be0,0xb9c52ae9 .long 0x45650ef4,0x05fbbcce,0x8aa29ac7,0xae000f10,0x4f04c470,0x884b7172,0x19bb5c25,0x7cd4fde2,0xe8840869,0x6477b22a,0x5fbd0686,0xa8868859,0x1116dfba,0xf23cc02e,0xd87d7776,0x76cd563f .long 0xa9d82abf,0xe2a37598,0xe6c170f5,0x5f188ccb,0x5066b087,0x81682200,0xc7155ada,0xda22c212,0xfbddb479,0x151e5d3a,0x6d715b99,0x4b606b84,0xf997cb2e,0x4a73b54b,0x3ecd8b66,0x9a1bfe43 .long 0x2a67d48a,0x1c312809,0x031fa9e2,0xcd6a671e,0x0e43a34a,0xbec3312a,0x55ef47d3,0x1d935639,0x8fea73ea,0x5ea02489,0xa035afb2,0x8247b364,0x5265b54c,0xb58300a6,0x722c7148,0x3286662f .long 0xb4ec4c20,0xb77fd76b,0x0f3fe3fd,0xf0a12fa7,0x41d8c7e8,0xf845bbf5,0x5ec10aa8,0xe4d969ca,0x43e232a3,0x4c0053b7,0x37f8a45a,0xdc7a3fac,0x20d81c8f,0x3c4261c5,0xb00eab00,0xfd4b3453 .long 0xd36e3062,0x76d48f86,0xa143ff02,0x626c5277,0xaf76f42e,0x538174de,0x6407ceac,0x2267aa86,0x72e572d5,0xfad76351,0xba7330eb,0xab861af7,0x418d8657,0xa0a1c8c7,0x20289a52,0x988821cb .long 0xcccc18ad,0x79732522,0xf1a6e027,0xaadf3f8d,0x17c2354d,0xf7382c93,0xd818b689,0x5ce1680c,0xd9ecbee9,0x359ebbfc,0x1cae62ac,0x4330689c,0xc51ac38a,0xb55ce5b4,0xfe238ee8,0x7921dfea .long 0x271d1ca5,0x3972bef8,0xe8aabd18,0x3e423bc7,0x44a3e5e3,0x57b09f3f,0x7b444d66,0x5da886ae,0xa9964375,0x68206634,0x699cd0ff,0x356a2fa3,0xdba515e9,0xaf0faa24,0xb321d79a,0x536e1f5c .long 0x5c04e4ea,0xd3b9913a,0xd6f11513,0xd549dcfe,0x79fd1d94,0xee227bf5,0xb43f2c67,0x9f35afee,0xf1314f53,0xd2638d24,0xcabcd822,0x62baf948,0x4ef48db0,0x5542de29,0xfc5f6bb2,0xb3eb6a04 .long 0x1208e16a,0x23c110ae,0xf8363e24,0x1a4d15b5,0x164be00b,0x30716844,0xf6f4690d,0xa8e24824,0x90b170cf,0x548773a2,0x42f191f4,0xa1bef331,0x9247aa97,0x70f418d0,0x48be9147,0xea06028e .long 0xdbfb894e,0xe13122f3,0xce274b18,0xbe9b79f6,0xca58aadf,0x85a49de5,0x11487351,0x24957758,0xbb939099,0x111def61,0x26d13694,0x1d6a974a,0xd3fc253b,0x4474b4ce,0x4c5db15e,0x3a1485e6 .long 0x147c15b4,0xe79667b4,0x7bc61301,0xe34f553b,0x17094381,0x032b80f8,0x723eaa21,0x55d8bafd,0xf1c0e74e,0x5a987995,0xebba289c,0x5a9b292e,0xeb4c8251,0x413cd4b2,0xd162db0a,0x98b5d243 .long 0x68342520,0xbb47bf66,0xbaa862d1,0x08d68949,0xe906abcd,0x11f349c7,0xed7bf00e,0x454ce985,0xb55b803b,0xacab5c9e,0x31e3c16d,0xb03468ea,0xd273bf12,0x5c24213d,0x71587887,0x211538eb .long 
0x731dea2d,0x198e4a2f,0x74ed7b2a,0xd5856cf2,0x13a664fe,0x86a632eb,0xbda41291,0x932cd909,0xc0c4ddc0,0x850e95d4,0x347fc2c9,0xc0f422f8,0x86076bcb,0xe68cbec4,0xcd6cd286,0xf9e7c0c0 .long 0x0f5f27ca,0x65994ddb,0xa80d59ff,0xe85461fb,0x66601023,0xff05481a,0xfc9ebbfb,0xc665427a,0x7587fd52,0xb0571a69,0x8d49efce,0x935289f8,0xea420688,0x61becc60,0x13a786af,0xb22639d9 .long 0x361ecf90,0x1a8e6220,0x25506463,0x001f23e0,0x0a5c2b79,0xe4ae9b5d,0xd8149db5,0xebc9cdad,0x934aa728,0xb33164a1,0xae9b60f3,0x750eb00e,0x9b9cfbfd,0x5a91615b,0xef45f7f6,0x97015cbf .long 0xbf5151df,0xb462c4a5,0xb07118f2,0x21adcc41,0x043fa42c,0xd60c545b,0xe96be1ab,0xfc21aa54,0x4e51ea80,0xe84bc32f,0x259b5d8d,0x3dae45f0,0xc38f1b5e,0xbb73c7eb,0xe8ae617d,0xe405a74a .long 0x9f1c56bd,0xbb1ae9c6,0x49f196a4,0x8c176b98,0x6875092b,0xc448f311,0x9f976033,0xb5afe3de,0x145813e5,0xa8dafd49,0xe2b34226,0x687fc4d9,0x4c7ff57f,0xf2dfc92d,0x401f1b46,0x004e3fc1 .long 0x1430c9ab,0x5afddab6,0x2238e997,0x0bdd41d3,0x418042ae,0xf0947430,0xcdddc4cb,0x71f9adda,0xc52dd907,0x7090c016,0x29e2047f,0xd9bdf44d,0x1b1011a6,0xe6f1fe80,0xd9acdc78,0xb63accbc .long 0x1272a95b,0xcfc7e235,0xa6276ac8,0x0c667717,0xe2d7eef7,0x3c0d3709,0x9a685b3e,0x5add2b06,0x14ea5d65,0x363ad32d,0x8d7dd506,0xf8e01f06,0x75b4aac6,0xc9ea2213,0x0d353466,0xed2a2bf9 .long 0xe9d3a7c3,0x439d79b5,0x81b7f34b,0x8e0ee5a6,0x1dc4ba75,0xcf3dacf5,0xeb3310c7,0x1d3d1773,0x7747ae83,0xa8e67112,0x197d6b40,0x31f43160,0xcd961400,0x0521ccee,0xf6535768,0x67246f11 .long 0xef0c3133,0x702fcc5a,0x7e16693b,0x247cc45d,0xc729b749,0xfd484e49,0xb218320f,0x522cef7d,0x59ab93b3,0xe56ef405,0x9f181071,0x225fba11,0x15330ed0,0x33bd6595,0x1ddb32f7,0xc4be69d5 .long 0x0448087c,0x264c7668,0x71432dae,0xac30903f,0x00f9bf47,0x3851b266,0x6cdd6d03,0x400ed311,0xf8fd2424,0x045e79fe,0xfa6da98b,0xfdfd974a,0x0c1e673a,0x45c9f641,0x5b2c5168,0x76f2e733 .long 0x2a601753,0x1adaebb5,0xc57c2d49,0xb286514c,0x1e0bfd24,0xd8769670,0x04478922,0x950c547e,0xe5d32bfe,0xd1d41969,0x750d6c3e,0x30bc1472,0xe0e27f3a,0x8f3679fe,0xa4a6ee0c,0x8f64a7dc .long 0x633dfb1f,0x2fe59937,0x977f2547,0xea82c395,0x661ea646,0xcbdfdf1a,0xb9085451,0xc7ccc591,0x81761e13,0x82177962,0x9196885c,0xda57596f,0x28ffbd70,0xbc17e849,0x2671d36f,0x1e6e0a41 .long 0x4152fcf5,0x61ae872c,0x9e77e754,0x441c87b0,0xa34dff09,0xd0799dd5,0x88a6b171,0x766b4e44,0x11f1c792,0xdc06a512,0x4be35c3e,0xea02ae93,0xe90c469e,0xe5ca4d6d,0x56e4ff5c,0x4df4368e .long 0x4baef62e,0x7817acab,0xa85b91e8,0x9f5a2202,0x6ce57610,0x9666ebe6,0xf73bfe03,0x32ad31f3,0x25bcf4d6,0x628330a4,0x515056e6,0xea950593,0xe1332156,0x59811c89,0x8c11b2d7,0xc89cf1fe .long 0x04e60cc0,0x75b63913,0x4625d375,0xce811e8d,0x2d26e562,0x030e43fc,0x608d36a0,0xfbb30b4b,0x48528118,0x634ff82c,0xcd285911,0x7c6fe085,0x99358f28,0x7f2830c0,0x665e6c09,0x2e60a95e .long 0x9b785dbf,0x08407d3d,0xa759bce7,0x530889ab,0x52f61239,0xf228e0e6,0x6879be3c,0x2b6d1461,0x51a7bbf7,0xe6902c04,0x76f24a64,0x30ad99f0,0x98bc6da0,0x66d9317a,0xcb596ac0,0xf4f877f3 .long 0x4c44f119,0xb05ff62d,0xe9b77416,0x4555f536,0x8caed63b,0xc7c0d059,0xc358b2a9,0x0cd2b7ce,0x46945fa3,0x3f33287b,0xd67c8791,0xf8785b20,0x9637bd08,0xc54a7a61,0x18be79d7,0x54d4598c .long 0xc46d7ce1,0x889e5acb,0x8b085877,0x9a515bb7,0x0b7a5050,0xfac1a03d,0xf2926035,0x7d3e738a,0x2a6cb0eb,0x861cc2ce,0x8f7adc79,0x6f2e2955,0x33016376,0x61c4d451,0x5ad59090,0xd9fd2c80 .long 0xb2b836a1,0xe5a83738,0x7c0d6622,0x855b41a0,0x7cc19af1,0x186fe317,0xfdd99acb,0x6465c1ff,0x6974b99e,0x46e5c23f,0xa2717cbe,0x75a7cf8b,0x062be658,0x4d2ebc3f,0x5f209c98,0x094b4447 .long 
0xb940cb5a,0x4af285ed,0x7cc82f10,0x6706d792,0x030526fa,0xc8c8776c,0xa0da9140,0xfa8e6f76,0x591ee4f0,0x77ea9d34,0x40274166,0x5f46e337,0xea671457,0x1bdf98bb,0x862a1fe2,0xd7c08b46 .long 0x1c08ad63,0x46cc303c,0x4c845e7b,0x99543440,0x48f36bf7,0x1b8fbdb5,0x8c8273a7,0x5b82c392,0x928435d5,0x08f712c4,0x79330380,0x071cf0f1,0xa8da054a,0xc74c2d24,0x43c46b5c,0xcb0e7201 .long 0xc0b7eff3,0x0ad7337a,0xc5e48b3c,0x8552225e,0x73f13a5f,0xe6f78b0c,0x82349cbe,0x5e70062e,0xe7073969,0x6b8d5048,0xc33cb3d2,0x392d2a29,0x4ecaa20f,0xee4f727c,0x2ccde707,0xa068c99e .long 0xb87a2913,0xfcd5651f,0x3cc252f0,0xea3e3c15,0x3b6cd3e4,0x777d92df,0xc5a732e7,0x7a414143,0xa71ff493,0xa895951a,0xbbd37cf6,0xfe980c92,0xdecfeeff,0x45bd5e64,0xa44c43e9,0x910dc2a9 .long 0xcca9f54d,0xcb403f26,0x9303f6db,0x928bbdfb,0xa9eee67c,0x3c37951e,0xf79961c3,0x3bd61a52,0x395c9a79,0x09a238e6,0x61eb352d,0x6940ca2d,0xc1875631,0x7d1e5c5e,0x1e1b20d1,0x1e19742c .long 0x23fc2e6e,0x4633d908,0x08959149,0xa76e29a9,0x84ed7da5,0x61069d9c,0x5dbcad51,0x0baa11cf,0x961849da,0xd01eec64,0xaf3d8c28,0x93b75f1f,0x1ca2ee44,0x57bc4f9f,0x00e00558,0x5a26322d .long 0x61a023ef,0x1888d658,0xb9e5246e,0x1d72aab4,0xe5563ec0,0xa9a26348,0xc3439a43,0xa0971963,0xadb9b5b7,0x567dd54b,0xc45a524b,0x73fac1a1,0xfe38e608,0x8fe97ef7,0x3f384f48,0x608748d2 .long 0xc486094f,0xb0571794,0x8bf3a8d6,0x869254a3,0x310b0e25,0x148a8dd1,0x9aa3f7d8,0x99ab9f3f,0x6706c02e,0x0927c68a,0x69790e6c,0x22b5e76c,0x6c71376c,0x6c325260,0x09ef6657,0x53a57690 .long 0xedffcf3a,0x8d63f852,0x3c0a6f55,0xb4d2ed04,0x12519b9e,0xdb3aa8de,0x1e0a569a,0x5d38e9c4,0x303747e2,0x871528bf,0xf5b5c18d,0xa208e77c,0xca6bf923,0x9d129c88,0xbf02839f,0xbcbf197f .long 0x27323194,0x9b9bf030,0x339ca59d,0x3b055a8b,0x0f669520,0xb46b2312,0x497e5f24,0x19789f1f,0xaaf01801,0x9c499468,0x8b69d59c,0x72ee1190,0xacf4c079,0x8bd39595,0x8e0cd048,0x3ee11ece .long 0x1ed66f18,0xebde86ec,0xd61fce43,0x225d906b,0xe8bed74d,0x5cab07d6,0x27855ab7,0x16e4617f,0xb2fbc3dd,0x6568aadd,0x8aeddf5b,0xedb5484f,0x6dcf2fad,0x878f20e8,0x615f5699,0x3516497c .long 0xfa181e69,0xef0a3fec,0x30d69a98,0x9ea02f81,0x66eab95d,0xb2e9cf8e,0x24720021,0x520f2beb,0x1df84361,0x621c540a,0x71fa6d5d,0x12037721,0x0ff5f6ff,0x6e3c7b51,0xabb2bef3,0x817a069b .long 0xb294cda6,0x83572fb6,0xb9039f34,0x6ce9bf75,0x095cbb21,0x20e012f0,0xd063f0da,0xa0aecc1b,0xf02909e5,0x57c21c3a,0x48ce9cdc,0xc7d59ecf,0x8ae336f8,0x2732b844,0x3f4f85f4,0x056e3723 .long 0x89e800ca,0x8a10b531,0x145208fd,0x50fe0c17,0xb714ba37,0x9e43c0d3,0x34189acc,0x427d200e,0xe616e2c0,0x05dee24f,0xee1854c1,0x9c25f4c8,0x8f342a73,0x4d3222a5,0xa027c952,0x0807804f .long 0x4f0d56f3,0xc222653a,0xca28b805,0x961e4047,0x4a73434b,0x2c03f8b0,0xab712a19,0x4c966787,0x864fee42,0xcc196c42,0x5b0ece5c,0xc1be93da,0xc131c159,0xa87d9f22,0xdce45655,0x2bb6d593 .long 0xb809b7ce,0x22c49ec9,0xe2c72c2c,0x8a41486b,0xfea0bf36,0x813b9420,0xa66dac69,0xb3d36ee9,0x328cc987,0x6fddc08a,0x3a326461,0x0a3bcd2c,0xd810dbba,0x7103c49d,0x4b78a4c4,0xf9d81a28 .long 0xe4d55941,0x3de865ad,0x30384087,0xdedafa5e,0x4ef18b9b,0x6f414abb,0xfaee5268,0x9ee9ea42,0x37a55a4a,0x260faa16,0x015f93b9,0xeb19a514,0x9e9c3598,0x51d7ebd2,0x1932178e,0x523fc56d .long 0xb98fe684,0x501d070c,0x124a1458,0xd60fbe9a,0x92bc6b3f,0xa45761c8,0xfe6f27cb,0xf5384858,0xb59e763b,0x4b0271f7,0x5b5a8e5e,0x3d4606a9,0x05a48292,0x1eda5d9b,0xe6fec446,0xda7731d0 .long 0x90d45871,0xa3e33693,0x06166d8d,0xe9764040,0x89a90403,0xb5c33682,0x72f1d637,0x4bd17983,0xd5d2c53a,0xa616679e,0xfdcf3b87,0x5ec4bcd8,0xb66a694e,0xae6d7613,0xe3fc27e5,0x7460fc76 .long 
0x95caabee,0x70469b82,0x889501e3,0xde024ca5,0x076ed265,0x6bdadc06,0x5a0ef8b2,0x0cb1236b,0x0972ebf9,0x4065ddbf,0x22aca432,0xf1dd3875,0x744aff76,0xa88b97cf,0xfe8e3d24,0xd1359afd .long 0x91502cf3,0x52a3ba2b,0x084db75d,0x2c3832a8,0xde30b1c9,0x04a12ddd,0xe31fd60c,0x7802eabc,0xa37fddab,0x33707327,0xfaafa973,0x65d6f2ab,0x11e6f91a,0x3525c5b8,0x5f46530b,0x76aeb0c9 .long 0x2f93a675,0xe8815ff6,0x05f48679,0xa6ec9684,0x358ae884,0x6dcbb556,0xe19e3873,0x0af61472,0xa5f696be,0x72334372,0x6f22fb70,0xc65e57ea,0x946cea90,0x268da30c,0x65681b2a,0x136a8a87 .long 0x0f9f44d4,0xad5e81dc,0x2c46585a,0xf09a6960,0xc447d1b1,0xd1649164,0x879dc8b1,0x3b4b36c8,0x3b6b234c,0x20d4177b,0x1730d9d0,0x096a2505,0xef80531d,0x0611b9b8,0x64bb495d,0xba904b3b .long 0x93a3147a,0x1192d9d4,0x9a565545,0x9f30a5dc,0x6ef07212,0x90b1f9cb,0x0d87fc13,0x29958546,0xc17db9ba,0xd3323eff,0xcb1644a8,0xcb18548c,0x4f49ffbc,0x18a306d4,0x4c2e8684,0x28d658f1 .long 0xa99f8c71,0x44ba60cd,0x4bf742ff,0x67b7abdb,0x914b3f99,0x66310f9c,0xf412c161,0xae430a32,0x88ace52f,0x1e6776d3,0x52d7067d,0x4bc0fa24,0x8f07cd1b,0x03c286aa,0xa985b2c1,0x4cb8f38c .long 0x8c3bff36,0x83ccbe80,0x5263e575,0x005a0bd2,0x259bdcd1,0x460d7dda,0xfa5cab6b,0x4a1c5642,0x9fe4fc88,0x2b7bdbb9,0xcc97bbb5,0x09418e28,0xa12321ae,0xd8274fb4,0x5c87b64e,0xb137007d .long 0xc63c4962,0x80531fe1,0x981fdb25,0x50541e89,0xfd4c2b6b,0xdc1291a1,0xa6df4fca,0xc0693a17,0x0117f203,0xb2c4604e,0x0a99b8d0,0x245f1963,0xc6212c44,0xaedc20aa,0x520f52a8,0xb1ed4e56 .long 0xf8547be3,0xfe48f575,0xa9e45f98,0x0a7033cd,0x18c50100,0x4b45d3a9,0xa61d41da,0xb2a6cd6a,0x57933c6b,0x60bbb4f5,0x2b0d7ffc,0xa7538ebd,0x8cd626b6,0x9ea3ab8d,0x3601625a,0x8273a484 .long 0x0168e508,0x88859845,0x99a94abd,0x8cbc9bb2,0xfab0a671,0x713ac792,0x6c9ebffc,0xa3995b19,0x1239e152,0xe711668e,0xbbb8dff4,0x56892558,0xdbf17963,0x8bfc7dab,0xb3de1253,0x5b59fe5a .long 0x34a9f7ae,0x7e3320eb,0xd751efe4,0xe5e8cf72,0xd9be2f37,0x7ea003bc,0xb6c08ef7,0xc0f551a0,0x038f6725,0x56606268,0x6d92d3b6,0x1dd38e35,0xc3cbd686,0x07dfce7c,0x651c5da8,0x4e549e04 .long 0x08b19340,0x4058f93b,0xcac6d89d,0xc2fae6f4,0x8f159cc7,0x4bad8a8c,0xcb0b601c,0x0ddba4b3,0x1dd95f8c,0xda4fc7b5,0xcea5c255,0x1d163cd7,0x274a8c4c,0x30707d06,0x2802e9ce,0x79d9e008 .long 0xe6ddd505,0x02a29ebf,0xb50bed1a,0x37064e74,0xa7327d57,0x3f6bae65,0xf83920bc,0x3846f5f1,0x60df1b9b,0x87c37491,0x2d1da29f,0x4cfb2895,0x4ed1743c,0x10a478ca,0x3edd47c6,0x390c6030 .long 0x8c0a78de,0x8f3e5312,0x1e85df70,0xccd02bda,0xa61b6582,0xd6c75c03,0xfc0eebd1,0x0762921c,0xd85010c0,0xd34d0823,0x0044cf1f,0xd73aaacb,0xa3b5e78a,0xfb4159bb,0xe5826f3f,0x2287c7f7 .long 0x580b1a01,0x4aeaf742,0x60423b79,0xf080415d,0xa7dea144,0xe12622cd,0x59d62472,0x49ea4996,0x571f3913,0xb42991ef,0xf5b25a8a,0x0610f214,0x30b79e8f,0x47adc585,0x07a065a2,0xf90e3df6 .long 0x43e2e034,0x5d0a5deb,0x444024aa,0x53fb5a34,0x6b0c9f7f,0xa8628c68,0xac563656,0x9c69c29c,0xbace47b6,0x5a231feb,0x9ea5a2ec,0xbdce0289,0x9463853e,0x05da1fac,0x509e78aa,0x96812c52 .long 0x57151692,0xd3fb5771,0xd98e1c44,0xeb2721f8,0x32399be1,0xc0506087,0xd979d8b8,0xda5a5511,0xc6f56780,0x737ed55d,0x0dc7a7f4,0xe20d3004,0xf5941a03,0x02ce7301,0xed30f83a,0x91ef5215 .long 0x4092d85f,0x28727fc1,0x5c49e41a,0x72d223c6,0xba6a4d81,0xa7cf30a2,0xb030d87d,0x7c086209,0xfc588b09,0x04844c7d,0x5874bbb0,0x728cd499,0xe84c0495,0xcc1281ee,0xec31958f,0x0769b5ba .long 0xf99c2471,0x665c228b,0x191eb110,0xf2d8a11b,0xd36d7024,0x4594f494,0xcdcb25a1,0x482ded8b,0xdadd4885,0xc958a9d8,0xf1d2b547,0x7004477e,0x2a0af550,0x0a45f6ef,0x2f8d6351,0x4fc739d6 .long 
0x786f08a9,0x75cdaf27,0x42c2737f,0x8700bb26,0x1c4e2670,0x855a7141,0x15076fef,0x810188c1,0xabcd3297,0xc251d0c9,0xf48108eb,0xae4c8967,0x18ceed30,0xbd146de7,0xc986bced,0xf9d4f07a .long 0x83fa1e08,0x5ad98ed5,0xbeabd1fb,0x7780d33e,0x903b1196,0xe330513c,0xa47bc8c4,0xba11de9e,0x02c2d064,0x684334da,0xa48de23b,0x7ecf360d,0x0a9089d8,0x57a1b474,0xff36734c,0xf28fa439 .long 0xea4570b3,0xf2a482cb,0xa5ebcee9,0xee65d68b,0xb9694cd5,0x988d0036,0x37885d32,0x53edd0e9,0xbeb9bc6d,0xe37e3307,0x9f5c6768,0xe9abb907,0x51f2160f,0x4396ccd5,0x47336da6,0x2500888c .long 0x926fce43,0x383f9ed9,0x04da2930,0x809dd1c7,0x8a4cb227,0x30f6f596,0x73a56b38,0x0d700c7f,0xab64a065,0x1825ea33,0x1338df80,0xaab9b735,0x9b63f57f,0x1516100d,0x27a6a634,0x2574395a .long 0x700a1acd,0xb5560fb6,0xfd999681,0xe823fd73,0x6cb4e1ba,0xda915d1f,0x6ebe00a3,0x0d030118,0x89fca8cd,0x744fb0c9,0xf9da0e0b,0x970d01db,0x7931d76f,0x0ad8c564,0xf659b96a,0xb15737bf .long 0xa8b484e7,0xdc9933e8,0x7a26dec7,0xb2fdbdf9,0x9f1f0136,0x2349e9a4,0x70fddddb,0x7860368e,0xf9ad3e18,0xd93d2c1c,0x689f4e79,0x6d6c5f17,0xb24ff1b6,0x7a544d91,0xfe16cd8c,0x3e12a5eb .long 0xa56b872f,0x543574e9,0xfcf68ea2,0xa1ad550c,0x3f560ef7,0x689e37d2,0xc9d47a8b,0x8c54b9ca,0x088ac342,0x46d40a4a,0x1576c6d0,0xec450c7c,0x1f9689e9,0xb589e31c,0xb8781718,0xdacf2602 .long 0xc8cb6b42,0xa89237c6,0xb96ef381,0x1326fc93,0xb5f07825,0x55d56c6d,0x7449e22d,0xacba2eea,0x633c3000,0x74e0887a,0xd7cbcf71,0xcb6cd172,0xc36cf1be,0x309e81de,0x60ae399b,0x07a18a6d .long 0x9edce57e,0xb36c2679,0xdf001d41,0x52b892f4,0x16a1f2c6,0xd884ae5d,0xefcc370a,0x9b329424,0xbd2e21df,0x3120daf2,0x02470a99,0x55298d2d,0xa05db32e,0x0b78af6c,0x601f5636,0x5c76a331 .long 0xf8a4f29c,0xaae861ff,0xd68f8d49,0x70dc9240,0x81b1321c,0x960e649f,0x8792e4ce,0x3d2c801b,0x42521876,0xf479f772,0x416c79b1,0x0bed93bc,0x263e5bc9,0xa67fbc05,0x521db049,0x01e8e630 .long 0xc6f3431e,0x76f26738,0xe3267541,0xe609cb02,0x818c877c,0xb10cff2d,0x786a13cb,0x1f0e75ce,0x1158544d,0xf4fdca64,0x6cb71ed0,0x5d777e89,0xa9aa4755,0x3c233737,0xe527ab40,0x7b453192 .long 0x39f05ffe,0xdb59f688,0x6d82574e,0x8f4f4be0,0xee292d1b,0xcce3450c,0x61ccd086,0xaa448a12,0xf7914967,0xabce91b3,0x1908a5ed,0x4537f09b,0xf51042e7,0xa812421e,0xec0b3a34,0xfaf5cebc .long 0x4ca6b39a,0x730ffd87,0x02efd342,0x70fb72ed,0xd75c8edb,0xeb4735f9,0xc278aa51,0xc11f2157,0xbf3bfebf,0xc459f635,0x6bd9601f,0x3a1ff0b4,0xc420cb73,0xc9d12823,0x3c2915a3,0x3e9af3e2 .long 0xb41c3440,0xe0c82c72,0xe3039a5f,0x175239e5,0x558795a3,0xe1084b8a,0xd01e5c60,0x328d0a1d,0xd3788a04,0x0a495f2e,0x66c11a9f,0x25d8ff16,0x9ed692d6,0xf5155f05,0x4f425fe4,0x954fa107 .long 0xe98aaa99,0xd16aabf2,0x96b0f88a,0x90cd8ba0,0xc154026a,0x957f4782,0x52af56d2,0x54ee0734,0x45b4147a,0xbcf89e54,0x9a52816c,0x3d102f21,0x39b62e77,0x6808517e,0x69169ad8,0x92e25421 .long 0xbb608558,0xd721d871,0xf6d4ff9b,0x60e4ebae,0x41f2763e,0x0ba10819,0x51ee3247,0xca2e45be,0x2bfd7a5f,0x66d172ec,0x74d0b12d,0x528a8f2f,0xdabe70dc,0xe17f1e38,0x9f93983c,0x1d5d7316 .long 0xdf423e31,0x51b2184a,0xaedb1a10,0xcb417291,0x625bcab9,0x2054ca93,0xa98998f0,0x54396860,0xa54ae57e,0x4e53f6c4,0xee648e9d,0x0ffeb590,0x6afaf6bc,0xfbbdaadc,0xaa3bfb8a,0xf88ae796 .long 0xd2359ed9,0x209f1d44,0xf3544ce2,0xac68dd03,0xfd51e569,0xf378da47,0x2cc80097,0xe1abd860,0x343b6e3a,0x23ca18d9,0xb40a1bae,0x480797e8,0x533f3e67,0xd1f0c717,0x06e6cdfc,0x44896970 .long 0x52a82e8d,0x8ca21055,0x78460cdc,0xb2caf785,0xe9037178,0x4c1b7b62,0xdb514b58,0xefc09d2c,0x9113be5c,0x5f2df9ee,0xb3f9271c,0x2fbda78f,0x8f83fc54,0xe09a81af,0x8afb5141,0x06b13866 .long 
0x43e3865d,0x38f6480f,0x1ddf47d9,0x72dd77a8,0x4c205ff7,0xf2a8e971,0x9d088ad8,0x46d449d8,0x185d706f,0x926619ea,0xc7dd7f62,0xe47e02eb,0x8cbc2031,0xe7f120a7,0x998d4ac9,0xc18bef00 .long 0x6bdf22da,0x18f37a9c,0x90dc82df,0xefbc432f,0x5d703651,0xc52cef8e,0xd99881a5,0x82887ba0,0xb920ec1d,0x7cec9dda,0xec3e8d3b,0xd0d7e8c3,0x4ca88747,0x445bc395,0x9fd53535,0xedeaa2e0 .long 0x6cc87475,0x461b1d93,0x6d2383bd,0xd92a52e2,0xd7903546,0xfabccb59,0x3d14b112,0x6111a761,0xb3d5f612,0x0ae584fe,0x60e828ec,0x5ea69b8d,0x54087030,0x6c078985,0xac4821fe,0x649cab04 .long 0x8bdce214,0x25ecedcf,0x86af7361,0xb5622f72,0x7038b9e2,0x0e1227aa,0xac20fa77,0xd0efb273,0x79df975b,0x817ff88b,0x1999503e,0x856bf286,0x5038ec46,0xb4d5351f,0xfc42af6e,0x740a52c5 .long 0x2cbb1a3f,0x2e38bb15,0x17a83429,0xc3eb99fe,0xdd66bb74,0xca4fcbf1,0xcde5e8fc,0x880784d6,0xb4e7a0be,0xddc84c1c,0xbd15a72f,0x8780510d,0x81ec30e1,0x44bcf1af,0x0a61073e,0x141e50a8 .long 0x47be87ae,0x0d955718,0xf76a4372,0x68a61417,0xc607c3d3,0xf57e7e87,0x5252f332,0x043afaf8,0x1552a4d2,0xcc14e121,0xbb4d4ab4,0xb6dee692,0xa03816a4,0xb6ab74c8,0x6f394a29,0x84001ae4 .long 0xd795fb45,0x5bed8344,0xb79f55a5,0x57326e7d,0x4accdffc,0xc9533ce0,0x3993fa04,0x53473caf,0xa13df4c8,0x7906eb93,0x97cbe46f,0xa73e51f6,0x0ae4ccf8,0xd1ab3ae1,0x8a5b3dbc,0x25614508 .long 0x11a71b27,0x61eff962,0x6bb7fa39,0xdf71412b,0x2bd7f3ef,0xb31ba6b8,0x69180d29,0xb0b9c415,0x014cdde5,0xeec14552,0x227b4bbb,0x702c624b,0xd3e988f3,0x2b15e8c2,0xa4f7fd04,0xee3bcc6d .long 0x42ac6c85,0x9d00822a,0x1df9f2b7,0x2db0cea6,0x42de1e58,0xd7cad2ab,0x2d6fbb61,0x346ed526,0x1a2faf09,0xb3962995,0x7c25612e,0x2fa8a580,0x7cf56490,0x30ae04da,0x0eea3961,0x75662908 .long 0x3d080847,0x3609f5c5,0x5241d4f6,0xcb081d39,0x77961a63,0xb4fb3810,0x2abb66fc,0xc20c5984,0xf902f245,0x3d40aa7c,0x4e536b1e,0x9cb12736,0x99b3134f,0x5eda24da,0x5cd011af,0xafbd9c69 .long 0xc7088c7d,0x9a16e30a,0x3207389f,0x5ab65710,0xe7407a53,0x1b09547f,0x4fdc6eab,0x2322f9d7,0x7430de4d,0xc0f2f22d,0xe68ca9a9,0x19382696,0x918e5868,0x17f1eff1,0x586f4204,0xe3b5b635 .long 0x3fbc4341,0x146ef980,0x5b5eed4e,0x359f2c80,0x7482e41d,0x9f35744e,0xf3b224c2,0x9a9ac3ec,0x91fc50ae,0x9161a6fe,0xc613fa7c,0x89ccc66b,0xc732f15a,0x89268b14,0xb467ed03,0x7cd6f4e2 .long 0xce56b40e,0xfbf79869,0xc02dde98,0xf93e094c,0xedee2cd7,0xefe0c3a8,0xb268fd42,0x90f3ffc0,0x08241aed,0x81a7fd56,0x00b1afe8,0x95ab7ad8,0x3e310d52,0x40127056,0x09d9fc43,0xd3ffdeb1 .long 0xd11a8594,0xc8f85c91,0x31cf6db8,0x2e74d258,0x02b5dfd0,0x829c7ca3,0x69143c86,0xe389cfbe,0x941768d8,0xd01b6405,0x03bf825d,0x45103995,0x56cd17e2,0xcc4ee166,0xba037e79,0xbea3c283 .long 0xd9a47520,0x4e1ac06e,0xaf852404,0xfbfe18aa,0x8087648a,0x5615f8e2,0xb9d150d9,0x7301e47e,0xb299b977,0x79f9f9dd,0xa5b78314,0x76697a7b,0x7d7c90e7,0x10d67468,0x937210b5,0x7afffe03 .long 0x28c22cee,0x5aef3e4b,0x09fd55ae,0xefb0ecd8,0x0d2a5d6a,0x4cea7132,0x01db6357,0x9cfb5fa1,0xf36e1ac5,0x395e0b57,0x36cafb7d,0x008fa9ad,0x5308c4db,0x8f6cdf70,0x95ed2477,0x51527a37 .long 0x5bd21311,0xba0dee30,0x909c90d7,0x6ed41b22,0x7c8696d3,0xc5f6b758,0x3ce83a80,0x0db8eaa8,0xb24b4b6f,0xd297fe37,0x522d1f0d,0xfe58afe8,0x8c98dbd9,0x97358736,0x9454a527,0x6bc226ca .long 0xce53c2d0,0xa12b384e,0x5e4606da,0x779d897d,0x73ec12b0,0xa53e47b0,0x5756f1ad,0x462dbbba,0xcafe37b6,0x69fe09f2,0xecce2e17,0x273d1ebf,0x3cf607fd,0x8ac1d538,0x12e10c25,0x8035f7ff .long 0x7e6c5520,0x854d34c7,0xdcb9ea58,0xc27df9ef,0xd686666d,0x405f2369,0x0417aa85,0x29d1febf,0x93470afe,0x9846819e,0xe2a27f9e,0x3e6a9669,0xe31e6504,0x24d008a2,0x9cb7680a,0xdba7cecf .long 
0x338d6e43,0xecaff541,0x4541d5cc,0x56f7dd73,0x96bc88ca,0xb5d426de,0x9ed3a2c3,0x48d94f6b,0x2ef8279c,0x6354a3bb,0x0b1867f2,0xd575465b,0x95225151,0xef99b0ff,0xf94500d8,0xf3e19d88 .long 0xe32dd620,0x92a83268,0x627849a2,0x913ec99f,0x2c378882,0xedd8fdfa,0xee6f8cfe,0xaf96f33e,0xdc3fa8a5,0xc06737e5,0xb0b03a1d,0x236bb531,0x89f037b0,0x33e59f29,0xd9a12a53,0x13f9b5a7 .long 0x51efb310,0x0d0df6ce,0x958df5be,0xcb5b2eb4,0x36158e59,0xd6459e29,0x1466e336,0x82aae2b9,0x411aa636,0xfb658a39,0xd4c0a933,0x7152ecc5,0x49f026b7,0xf10c758a,0xcb09311f,0xf4837f97 .long 0xc753c45f,0xddfb02c4,0xf9c840fe,0x18ca81b6,0xb0f8a3e6,0x846fd09a,0xe7733dbc,0xb1162add,0x236e3ab6,0x7070ad20,0xb2a56326,0xf88cdaf5,0x997cbc7a,0x05fc8719,0x4b665272,0x442cd452 .long 0xb71698f5,0x7807f364,0x9f7b605e,0x6ba418d2,0xa03b2cbb,0xfd20b00f,0xda54386f,0x883eca37,0xf3437f24,0xff0be43f,0xa48bb33c,0xe910b432,0x329df765,0x4963a128,0xbe2fe6f7,0xac1dd556 .long 0x24a0a3fc,0x557610f9,0xe881c3f9,0x38e17bf4,0xed0dac99,0x6ba84faf,0x59eeb918,0xd4a222c3,0x13f542b6,0xc79c1dbe,0xe425d457,0x1fc65e0d,0x1debb779,0xeffb754f,0x9e08af60,0x638d8fd0 .long 0x626332d5,0x994f523a,0x5561bb44,0x7bc38833,0x3d845ea2,0x005ed4b0,0xc2a1f08a,0xd39d3ee1,0xe7676b0d,0x6561fdd3,0xfb706017,0x620e35ff,0xf264f9a8,0x36ce424f,0xda2681f7,0xc4c3419f .long 0x69beb6e8,0xfb6afd2f,0x6d700d03,0x3a50b993,0x0c83a14f,0xc840b2ad,0x54085bef,0x573207be,0x09fe7e5b,0x5af882e3,0x3b40a7e1,0x957678a4,0x543056e2,0x172d4bdd,0x0df13c0a,0x9c1b26b4 .long 0xf405ff06,0x1c30861c,0x486e828b,0xebac86bd,0x636933fc,0xe791a971,0x7aeee947,0x50e7c2be,0xfa90d767,0xc3d4a095,0xe670ab7b,0xae60eb7b,0x397b056d,0x17633a64,0x105012aa,0x93a21f33 .long 0xabb88643,0x663c370b,0x22e21599,0x91df36d7,0x8b761671,0x183ba835,0x728f3bf1,0x381eea1d,0x39966e6c,0xb9b2f1ba,0xe7295492,0x7c464a28,0x09b26b7f,0x0fd5f70a,0xfbe009df,0xa9aba1f9 .long 0x369b87ad,0x857c1f22,0x32fca556,0x3c00e5d9,0x90b06466,0x1ad74cab,0x550faaf2,0xa7112386,0x6d9bd5f5,0x7435e198,0x59c3463f,0x2dcc7e38,0xca7bd4b2,0xdc7df748,0x9dec2f31,0x13cd4c08 .long 0xe3237710,0x0d3b5df8,0xcbd2f7b0,0x0dadb26e,0xe4aa082b,0x9f5966ab,0x350e966e,0x666ec8de,0xee524216,0x1bfd1ed5,0x41dab0b6,0xcd93c59b,0xd186d6ba,0x658a8435,0x159d1195,0x1b7d34d2 .long 0x22caf46b,0x5936e460,0x9a96fe4f,0x6a45dd8f,0xb98f474e,0xf7925434,0x0053ef15,0x41410412,0x41de97bf,0x71cf8d12,0xbd80bef4,0xb8547b61,0xc4db0037,0xb47d3970,0xfef20dff,0xf1bcd328 .long 0x10caad67,0x31a92e09,0x5531a1e1,0x1f591960,0x5f4fc840,0x3bb852e0,0x93a72c6c,0x63e297ca,0x49abad67,0x3c2b0b2e,0xed3db0d9,0x6ec405fc,0x7fef1d40,0xdc14a530,0x280896fc,0xccd19846 .long 0x9bb81648,0x00f83176,0x653120d0,0xd69eb485,0x4ccabc62,0xd17d75f4,0xb749fcb1,0x34a07f82,0xbbfb5554,0x2c3af787,0x62e283f8,0xb06ed4d0,0xa19213a0,0x5722889f,0xdcf3c7b4,0x162b085e .long 0xe0dd3eca,0xbcaecb31,0xe52f13a5,0xc6237fbc,0x27bac297,0xcc2b6b03,0xb917f54a,0x2ae1cac5,0x7845ae4f,0x474807d4,0xce5972e0,0xfec7dd92,0x1d7915bb,0xc3bd2541,0xd94907ca,0x66f85dc4 .long 0xbdbcf0ca,0xd981b888,0xdf279e9f,0xd75f5da6,0x7054e934,0x128bbf24,0x81db134b,0x3c6ff6e5,0x047d26e4,0x795b7cf4,0x5049ec37,0xf370f7b8,0xced945af,0xc6712d4d,0x095642bc,0xdf30b5ec .long 0x4896246e,0x9b034c62,0xee90bbd1,0x5652c016,0x87fedb73,0xeb38636f,0x0135a613,0x5e32f847,0xcf933c83,0x0703b312,0x1a7f47e6,0xd05bb76e,0x949c2415,0x825e4f0c,0x7250d6f8,0x569e5622 .long 0x6568013e,0xbbe9eb3a,0x22f243fc,0x8dbd203f,0xb342734a,0x9dbd7694,0x46afa984,0x8f6d12f8,0xc9eade29,0xb98610a2,0x47dd0f18,0xbab4f323,0x671c0d46,0x5779737b,0xd3e0a42a,0x10b6a7c6 .long 
0x3035b41c,0xfb19ddf3,0x99c45895,0xd336343f,0x54c857e5,0x61fe4938,0xae4e57d5,0xc4d506be,0xbbc33f75,0x3cd8c8cb,0x9262c77d,0x7281f08a,0xf11a2823,0x083f4ea6,0x9fba2e33,0x8895041e .long 0x9c438edf,0xfcdfea49,0x91edba44,0x7678dcc3,0xe2ba50f0,0xf07b3b87,0x43948c1b,0xc13888ef,0x1140af42,0xc2135ad4,0x926ed1a7,0x8e5104f3,0x88f6695f,0xf24430cb,0x6d73c120,0x0ce0637b .long 0xfe631e8f,0xb2db01e6,0xd7bdd24b,0x1c5563d7,0x369ad44f,0x8daea3ba,0x8187a9f9,0x000c81b6,0xaae1fd9a,0x5f48a951,0x8d5aed8a,0xe35626c7,0x0498c622,0x20952763,0x773aa504,0x76d17634 .long 0xeb300f7a,0x36d90dda,0xedb5e801,0x9dcf7dfc,0x74d5244c,0x645cb268,0x348e3aa2,0xa127ee79,0x575f1dbb,0x488acc53,0x80e6161e,0x95037e85,0x292650d0,0x57e59283,0x14938216,0xabe67d99 .long 0x3f8e1065,0x3c7f944b,0x330e8924,0xed908cb6,0x6f530136,0x08ee8fd5,0xd7ffc169,0x2227b7d5,0xb5cd6dd5,0x4f55c893,0xa62796e8,0x82225e11,0xcb18e12c,0x5c6cead1,0x84f5a51a,0x4381ae0c .long 0x7fafa4c8,0x345913d3,0x0491aac0,0x3d918082,0x3e69264c,0x9347871f,0xb4f4f0cd,0xbea9dd3c,0x3eadd3e7,0xbda5d067,0x0573bcd8,0x0033c1b8,0x5da2486c,0x25589379,0x86abbee7,0xcb89ee5b .long 0x22532e5d,0x8fe0a8f3,0x727dfc4c,0xb6410ff0,0x226726db,0x619b9d58,0x7a2b2dc7,0x5ec25669,0x4c3beb01,0xaf4d2e06,0x7acea556,0x852123d0,0xf783487a,0x0e9470fa,0x5664b3eb,0x75a7ea04 .long 0x6798e4ba,0x4ad78f35,0xc7d0e091,0x9214e6e5,0xb1290403,0xc420b488,0xfc295749,0x64049e0a,0x3ae9841f,0x03ef5af1,0xb0b662a6,0xdbe4ca19,0xfa453458,0x46845c5f,0x10b66722,0xf8dabf19 .long 0xcce2793b,0xb650f0aa,0xc5ec47c1,0x71db851e,0x3b234fa9,0x3eb78f3e,0xfc0106ce,0xb0c60f35,0x774eadbd,0x05427121,0xce323863,0x25367faf,0xcd086976,0x7541b5c9,0xdc507ad1,0x4ff069e2 .long 0x8776e667,0x74145256,0xb23c6bb5,0x6e76142c,0x1b3a8a87,0xdbf30712,0x98450836,0x60e7363e,0xb7366d80,0x5741450e,0x4837dbdf,0xe4ee14ca,0x69d4316f,0xa765eb9b,0x8ef43825,0x04548dca .long 0x5ae888eb,0x9c9f4e4c,0x56e9ac99,0x733abb51,0xba6ac029,0xdaad3c20,0x2ba3e38e,0x9b8dd3d3,0x0bc5d11a,0xa9bb4c92,0x9c5f88a3,0xf20127a7,0x161d3cb8,0x4f52b06e,0x6afaf0a6,0x26c1ff09 .long 0x7189e71f,0x32670d2f,0x5ecf91e7,0xc6438748,0xdb757a21,0x15758e57,0x290a9ce5,0x427d09f8,0x38384a7a,0x846a308f,0xb0732b99,0xaac3acb4,0x17845819,0x9e941009,0xa7ce5e03,0x95cba111 .long 0xb00009c4,0x6f3d4f7f,0x8ff28b5f,0xb8396c27,0x1c97975d,0xb1a9ae43,0xe5d9fed5,0x9d7ba8af,0x34f485b6,0x338cf09f,0x64122516,0xbc0ddacc,0x05d471fe,0xa450da12,0x628dd8c9,0x4c3a6250 .long 0xd1295837,0x69c7d103,0x3807eb2f,0xa2893e50,0xbdb41491,0xd6e1e1de,0x5e138235,0xc630745b,0x48661ae1,0xc892109e,0xea2b2674,0x8d17e7eb,0xc328d6b5,0x00ec0f87,0xf079ff9e,0x6d858645 .long 0x19115ead,0x6cdf243e,0x4bac4fcf,0x1ce1393e,0x9c29f25b,0x2c960ed0,0x9d388a05,0x59be4d8e,0xd0def72b,0x0d46e06c,0xe0342748,0xb923db5d,0x936d4a3d,0xf7d3aacd,0x0b0b099e,0x558519cc .long 0x827097ef,0x3ea8ebf8,0xd054f55d,0x259353db,0x6d2ed089,0x84c89abc,0x8e096a7c,0x5c548b69,0x994b995d,0xd587f616,0xa5845601,0x4d1531f6,0x451fd9f0,0x792ab31e,0x65adf6ca,0xc8b57bb2 .long 0x1cd5ad73,0x68440fcb,0x6144da4f,0xb9c860e6,0x8462beb8,0x2ab286aa,0xef46797f,0xcc6b8fff,0x20c8a471,0xac820da4,0x77ff7faf,0x69ae05a1,0xbfb5da77,0xb9163f39,0x2c73ab7a,0xbd03e590 .long 0xb2940d9e,0x7e862b5e,0x4b9af564,0x3c663d86,0xbde3033d,0xd8309031,0xd42c5bc6,0x298231b2,0x552ad093,0x42090d2c,0xff854695,0xa4799d1c,0xd31f0d00,0x0a88b5d6,0xa2f26b46,0xf8b40825 .long 0xf1bd7218,0xec29b1ed,0x4b24c86e,0xd491c53b,0x3395ea65,0xd2fe588f,0x4456ef15,0x6f3764f7,0xcdc34800,0xdb43116d,0xc1e33955,0xcdbcd456,0x74ab286b,0xefdb5540,0xd18c5d7c,0x948c7a51 .long 
0x7378058e,0xeb81aa37,0x04411154,0x41c746a1,0xfb828ac7,0xa10c73bc,0x9d972b29,0x6439be91,0x43a2fbad,0x4bf3b4b0,0x82b5e840,0x39e6dadf,0x6397bd4c,0x4f716408,0x7f1eeccb,0x0f7de568 .long 0xd2ffbfc1,0x5865c5a1,0x4ccb6451,0xf74211fa,0xc0b32558,0x66368a88,0x9ad7812e,0x5b539dc2,0x2f3af6f6,0x579483d0,0x99934ece,0x52132078,0xdcc9e983,0x50b9650f,0xaee42b8a,0xca989ec9 .long 0xd6f62f99,0x6a44c829,0x4c2a7c0c,0x8f06a309,0x98a0cb0a,0x4ea2b3a0,0xbeee8364,0x5c547b70,0x682afe11,0x461d40e1,0x7b41c0a8,0x9e0fc77a,0xe20d5d36,0x79e4aefd,0x32dd9f63,0x2916e520 .long 0x3f883faf,0xf59e52e8,0x2b868d35,0x396f9639,0x4ca19881,0xc902a9df,0xdb2401a6,0x0fc96822,0x66f1c68d,0x41237587,0xfb476c0d,0x10fc6de3,0x841f5d90,0xf8b6b579,0xfa24f44a,0x2ba8446c .long 0xef4a9975,0xa237b920,0x2330435f,0x60bb6004,0xcfb7e7b5,0xd6f4ab5a,0x83435391,0xb2ac5097,0xb0d1ea67,0xf036ee2f,0x74c56230,0xae779a6a,0xab838ae6,0x59bff8c8,0x9b38e6f0,0xcd83ca99 .long 0xe33deed3,0xbb27bef5,0x001892a8,0xe6356f6f,0x7adfbd3e,0xbf3be6cc,0x33d1ac9d,0xaecbc81c,0xe6e861dc,0xe4feb909,0x53f5f801,0x90a247a4,0x27346e57,0x01c50acb,0x461acc1b,0xce29242e .long 0x2f998a91,0x04dd214a,0xd4baf27b,0x271ee9b1,0xe8c26722,0x7e3027d1,0x1820dce5,0x21d1645c,0x7501779c,0x086f242c,0xfa0e8009,0xf0061407,0x60187129,0xf23ce477,0x0fde9bd0,0x05bbdedb .long 0x25d98473,0x682f4832,0x5c658427,0xf207fe85,0x4166ffa1,0xb6fdd7ba,0x9eed799d,0x0c314056,0x4107e28f,0x0db8048f,0x41216840,0x74ed3871,0x56a3c06e,0x74489f8f,0x12777134,0x1e1c005b .long 0xf37ec3c3,0xdb332a73,0xdd59eba0,0xc65259bd,0xdb4d3257,0x2291709c,0xbd389390,0x9a793b25,0xe43756f0,0xf39fe34b,0x9afb56c9,0x2f76bdce,0x61208b27,0x9f37867a,0x089972c3,0xea1d4307 .long 0x8bdf623a,0x8c595330,0x8441fb7d,0x5f5accda,0x32ddfd95,0xfafa9418,0x0fde9be7,0x6ad40c5a,0xaeca8709,0x43faba89,0x2c248a9d,0xc64a7cf1,0x72637a76,0x16620252,0x22b8d1bb,0xaee1c791 .long 0x21a843b2,0xf0f798fd,0x8d005cb1,0x56e4ed4d,0x1f0d8abe,0x355f7780,0x34522326,0x197b04cf,0xfd42c13f,0x41f9b31f,0xb40f933d,0x5ef7feb2,0x5d60bad4,0x27326f42,0x8c92cf89,0x027ecdb2 .long 0x4e3352fe,0x04aae4d1,0x73591b90,0x08414d2f,0xb7da7d60,0x5ed6124e,0x4d13d4ec,0xb985b931,0x96bf36f9,0xa592d3ab,0xbbdf51df,0x012dbed5,0xdf6c177d,0xa57963c0,0x87ca29cf,0x010ec869 .long 0xbf926dff,0xba1700f6,0xf4bf6bc2,0x7c9fdbd1,0x64da11f5,0xdc18dc8f,0xd938ae75,0xa6074b7a,0xe84f44a4,0x14270066,0xd27b954e,0x99998d38,0xb4f38e9a,0xc1be8ab2,0x15c01016,0x8bb55bbf .long 0x0ea2ab30,0xf73472b4,0xf73d68dd,0xd365a340,0x19c2e1eb,0xc01a7168,0x34061719,0x32f49e37,0x01d8b4d6,0xb73c57f1,0x26b47700,0x03c8423c,0xa4d8826a,0x321d0bc8,0x4bc0e638,0x6004213c .long 0xc1c06681,0xf78c64a1,0xef018e50,0x16e0a16f,0xdb42b2b3,0x31cbdf91,0xe0d36f58,0xf8f4ffce,0x4cc5e3e0,0xcdcc71cd,0xa129e3e0,0xd55c7cfa,0x0fb2cbf1,0xccdb6ba0,0xc4bce3cb,0x6aba0005 .long 0xd232cfc4,0x501cdb30,0xd58a3cef,0x9ddcf12e,0x87e09149,0x02d2cf9c,0x2c976257,0xdc5d7ec7,0x0b50d7dd,0x6447986e,0x807f112a,0x88fdbaf7,0xb00ae9f6,0x58c9822a,0x6d3d27e0,0x6abfb950 .long 0x8a429f4f,0xd0a74487,0xdb516609,0x0649712b,0xe769b5df,0xb826ba57,0x1fc7aaf2,0x82335df2,0x5c93d995,0x2389f067,0x68677be6,0x59ac367a,0x21d9951b,0xa77985ff,0x85011cce,0x038956fb .long 0xbb734e37,0x608e48cb,0x2be5b26f,0xc08c0bf2,0xf9b1a0d9,0x17bbdd3b,0x10483319,0xeac7d898,0xbc1a6dea,0xc95c4baf,0x172aafdb,0xfdd0e2bf,0x8235c41a,0x40373cbc,0xfb6f41d5,0x14303f21 .long 0x0408f237,0xba063621,0xecd2d1ed,0xcad3b09a,0x52abb6a2,0x4667855a,0xaa8b417b,0xba9157dc,0x4f013efb,0xfe7f3507,0xaa38c4a2,0x1b112c4b,0x9ba64345,0xa1406a60,0x6993c80b,0xe53cba33 .long 
0xded40d23,0x45466063,0x54908e25,0x3d5f1f4d,0x403c3c31,0x9ebefe62,0x0672a624,0x274ea0b5,0x451d1b71,0xff818d99,0x8f79cf79,0x80e82643,0x73ce37f5,0xa165df13,0xfe3a21fd,0xa744ef4f .long 0xcf551396,0x73f1e7f5,0x868c676b,0xc616898e,0x8c442c36,0x671c28c7,0x5e0a317d,0xcfe5e558,0x7051f476,0x1242d818,0x14f03442,0x56fad2a6,0x0a44d0f6,0x262068bc,0xce6edf4e,0xdfa2cd6e .long 0xd15d1517,0x0f43813a,0x377d44f5,0x61214cb2,0xc639b35f,0xd399aa29,0x54c51c19,0x42136d71,0x08417221,0x9774711b,0x52545a57,0x0a5546b3,0x1150582d,0x80624c41,0xfbc555bc,0x9ec5c418 .long 0x771849f1,0x2c87dcad,0x01d7bf6f,0xb0c932c5,0x89116eb2,0x6aa5cd3e,0x51ca7bd3,0xd378c25a,0x9e6e3e31,0xc612a0da,0xb68ad5d0,0x0417a54d,0x22c6edb8,0x00451e4a,0xb42827ce,0x9fbfe019 .long 0xba9384a2,0x2fa92505,0x64ad69c1,0x21b8596e,0x983b35a6,0x8f4fcc49,0x72754672,0xde093760,0xf7bffe6d,0x2f14ccc8,0x5d94263d,0x27566bff,0x2df3ec30,0xb5b4e9c6,0x3e6ea6ba,0x94f1d7d5 .long 0xaaca5e9b,0x97b7851a,0x56713b97,0x518aa521,0x150a61f6,0x3357e8c7,0xec2c2b69,0x7842e7e2,0x6868a548,0x8dffaf65,0xe068fc81,0xd963bd82,0x65917733,0x64da5c8b,0x7b247328,0x927090ff .long 0xd298c241,0x214bc9a7,0x56807cfd,0xe3b697ba,0x4564eadb,0xef1c7802,0xb48149c5,0xdde8cdcf,0x5a4d2604,0x946bf0a7,0x6c1538af,0x27154d7f,0xde5b1fcc,0x95cc9230,0x66864f82,0xd88519e9 .long 0x7cb1282c,0xb828dd1a,0xbe46973a,0xa08d7626,0xe708d6b2,0x6baf8d40,0x4daeb3f3,0x72571fa1,0xf22dfd98,0x85b1732f,0x0087108d,0x87ab01a7,0x5988207a,0xaaaafea8,0x69f00755,0xccc832f8 .long 0x36ff3bf0,0x964d950e,0xf0b34638,0x8ad20f6f,0xb5d7585f,0x4d9177b3,0xef3f019f,0xcf839760,0x8288c545,0x582fc5b3,0x13116bd1,0x2f8e4e9b,0x332120ef,0xf91e1b2f,0x2a17dd23,0xcf568724 .long 0xca8d9d1a,0x488f1185,0xd987ded2,0xadf2c77d,0x60c46124,0x5f3039f0,0x71e095f4,0xe5d70b75,0x6260e70f,0x82d58650,0xf750d105,0x39d75ea7,0x75bac364,0x8cf3d0b1,0x21d01329,0xf3a7564d .long 0x2f52d2a7,0x182f04cd,0xe2df565a,0x4fde149a,0xa79fb2f7,0xb80c5eec,0x22ddc897,0xab491d7b,0xc6312c7f,0x99d76c18,0x6aa41a57,0xca0d5f3d,0xd15363a0,0x71207325,0xbeb252c2,0xe82aa265 .long 0xec3128c2,0x94ab4700,0x8e383f49,0x6c76d862,0xc03024eb,0xdc36b150,0x53daac69,0xfb439477,0x8dc79623,0xfc68764a,0xb440fbb2,0x5b86995d,0xccc5ee0d,0xd66879bf,0x95aa8bd3,0x05228942 .long 0x1e6a75c1,0xb51a40a5,0x0ea7d817,0x24327c76,0x07774597,0x06630182,0x97fa7164,0xd6fdbec3,0x13c90f48,0x20c99dfb,0x686ef263,0xd6ac5273,0xfef64eeb,0xc6a50bdc,0x86fdfc32,0xcd87b281 .long 0x3fcd3efc,0xb24aa43e,0xb8088e9a,0xdd26c034,0xbd3d46ea,0xa5ef4dc9,0x8a4c6a6f,0xa2f99d58,0x2f1da46c,0xddabd355,0x1afacdd1,0x72c3f8ce,0x92d40578,0xd90c4eee,0xca623b94,0xd28bb41f .long 0x745edc11,0x50fc0711,0x3dc87558,0x9dd9ad7d,0xb49d1e64,0xce6931fb,0xc98bd0f9,0x6c77a0a2,0x6baf7cb1,0x62b9a629,0xccf72d22,0xcf065f91,0x79639071,0x7203cce9,0xf9cb732f,0x09ae4885 .long 0xee8314f3,0x5e7c3bec,0xdbea298f,0x1c068aed,0x7c80acec,0x08d381f1,0xe330495b,0x03b56be8,0x9222882d,0xaeffb8f2,0xc4af8bf7,0x95ff38f6,0x1fc57d8c,0x50e32d35,0x17b444f0,0x6635be52 .long 0xa5177900,0x04d15276,0xf6858752,0x4e1dbb47,0xc615796c,0x5b475622,0x691867bf,0xa6fa0387,0x2844c6d0,0xed7f5d56,0x03a2477d,0xc633cf9b,0x2d3721d6,0xf6be5c40,0xe9fd68e6,0xaf312eb7 .long 0xe7417ce1,0x242792d2,0x970ee7f5,0xff42bc71,0x5c67a41e,0x1ff4dc6d,0x20882a58,0x77709b7b,0xbe217f2c,0x3554731d,0x5bb72177,0x2af2a8cd,0x591dd059,0x58eee769,0x4bba6477,0xbb2930c9 .long 0x7d930cfc,0x863ee047,0x396fd1f4,0x4c262ad1,0x039af7e1,0xf4765bc8,0x5ba104f6,0x2519834b,0xd105f961,0x7cd61b4c,0xd63bca54,0xa5415da5,0x88a1f17c,0x778280a0,0x2329512c,0xc4968949 .long 
0xcecdaa7a,0x174a9126,0x0b13247b,0xfc8c7e0e,0x3484c1c4,0x29c110d2,0x831dfc3b,0xf8eb8757,0xc0067452,0x022f0212,0x7b9b926c,0x3f6f69ee,0xef42daf4,0x09032da0,0x83f80de4,0x79f00ade .long 0x81236c97,0x6210db71,0x3ee0781f,0x74f7685b,0xa3e41372,0x4df7da7b,0xb1a1553e,0x2aae38b1,0xf6dd9d1b,0x1688e222,0x5b8b6487,0x57695448,0x4b2edeaa,0x478d2127,0x1e85956a,0xb2818fa5 .long 0xf176f2c0,0x1e6addda,0xe2572658,0x01ca4604,0x85342ffb,0x0a404ded,0x441838d6,0x8cf60f96,0xc9071c4a,0x9bbc691c,0x34442803,0xfd588744,0x809c0d81,0x97101c85,0x8c456f7f,0xa7fb754c .long 0xd51805e1,0xc95f3c5c,0xb299dca8,0xab4ccd39,0x47eaf500,0x3e03d20b,0xd7b80893,0xfa3165c1,0xe160e552,0x005e8b54,0x9019d11f,0xdc4972ba,0x0c9a4a7a,0x21a6972e,0x37840fd7,0xa52c258f .long 0xc1e99d81,0xf8559ff4,0xa3c617c0,0x08e1a7d6,0x248c6ba7,0xb398fd43,0xd1283794,0x6ffedd91,0xd629d208,0x8a6a59d2,0x3490530e,0xa9d141d5,0x38505989,0x42f6fc18,0x479d94ee,0x09bf250d .long 0xb3822790,0x223ad3b1,0x93b8971c,0x6c5926c0,0x75f7fa62,0x609efc7e,0x1ec2d989,0x45d66a6d,0x987d2792,0x4422d663,0x3eb31d2b,0x4a73caad,0xa32cb9e6,0xf06c2ac1,0x91aeba84,0xd9445c5f .long 0xaf71013f,0x6af7a1d5,0x0bedc946,0xe68216e5,0xd27370a0,0xf4cba30b,0x870421cc,0x7981afbf,0x9449f0e1,0x02496a67,0x0a47edae,0x86cfc4be,0xb1feca22,0x3073c936,0x03f8f8fb,0xf5694612 .long 0x901515ea,0xd063b723,0x749cf038,0x4c6c77a5,0xab9e5059,0x6361e360,0xa76a37c0,0x596cf171,0x6530ae7a,0x800f53fa,0x0792a7a6,0x0f5e631e,0xefdb81c9,0x5cc29c24,0x3f9c40ba,0xa269e868 .long 0x2cb7191e,0xec14f9e1,0xe5b08ea6,0x78ea1bd8,0x46332bb9,0x3c65aa9b,0xbf80ce25,0x84cc22b3,0xd49d5bf1,0x0098e9e9,0x19087da4,0xcd4ec1c6,0xaef6e357,0x3c9d07c5,0x9f8f64b8,0x839a0268 .long 0xc6d8607f,0xc5e9eb62,0x6aa995e4,0x759689f5,0xbbb48317,0x70464669,0xe402417d,0x921474bf,0x2a354c8c,0xcabe135b,0x812fa4b5,0xd51e52d2,0x53311fe8,0xec741096,0xb864514b,0x4f774535 .long 0x5bde48f8,0xbcadd671,0x2189bc7d,0xc9703873,0xc709ee8a,0x5d45299e,0x845aaff8,0xd1287ee2,0xdb1dbf1f,0x7d1f8874,0x990c88d6,0xea46588b,0x84368313,0x60ba649a,0x60d543ae,0xd5fdcbce .long 0x810d5ab0,0x90b46d43,0x04d7e5cc,0x6739d8f9,0x0d337c33,0x021c1a58,0x68e67c40,0x00a61162,0x379f0a1f,0x95ef413b,0xe9e2ab95,0xfe126605,0x2f5f199c,0x67578b85,0x2cb84913,0xf5c00329 .long 0x37577dd8,0xf7956430,0x29c5fe88,0x83b82af4,0xcdbdc132,0x9c1bea26,0x9c04339e,0x589fa086,0xb13799df,0x033e9538,0xd295d034,0x85fa8b21,0xbd9ddcca,0xdf17f73f,0xddb66334,0xf32bd122 .long 0x858b044c,0x55ef88a7,0x5aa9e397,0x1f0d69c2,0x40d85559,0x55fd9cc3,0x7785ddb2,0xc774df72,0xd3bd2e1c,0x5dcce9f6,0xa85dfed0,0xeb30da20,0xd3ed09c4,0x5ed7f5bb,0x82a9c1bd,0x7d42a35c .long 0x9890272d,0xcf3de995,0x3e713a10,0x75f3432a,0xe28227b8,0x5e13479f,0xfefacdc8,0xb8561ea9,0x8332aafd,0xa6a297a0,0x73809b62,0x9b0d8bb5,0x0c63036f,0xd2fa1cfd,0xbd64bda8,0x7a16eb55 .long 0x78e62ddc,0x3f5cf5f6,0x07fd752b,0x2267c454,0x5e437bbe,0x5e361b6b,0x8354e075,0x95c59501,0xf2b254d9,0xec725f85,0x2cb52b4e,0x844b617d,0xcf425fb5,0xed8554f5,0x2af9f312,0xab67703e .long 0x3cf48283,0x4cc34ec1,0x9c8a705e,0xb09daa25,0x5b7d4f84,0xd1e9d0d0,0xdb38929d,0x4df6ef64,0xaa21ba46,0xe16b0763,0xa293f8fb,0xc6b1d178,0xd520aabf,0x0ff5b602,0xc339397a,0x94d671bd .long 0x4f5792fa,0x7c7d98cf,0x11215261,0x7c5e0d67,0xa7c5a6d4,0x9b19a631,0x7a45274d,0xc8511a62,0xa5a60d99,0x0c16621c,0xcf5e48cb,0xf7fbab88,0xf7ddee08,0xab1e6ca2,0xe7867f3c,0x83bd08ce .long 0x2ac13e27,0xf7e48e8a,0x4eb1a9f5,0x4494f6df,0x981f0a62,0xedbf84eb,0x536438f0,0x49badc32,0x004f7571,0x50bea541,0xdf1c94ee,0xbac67d10,0xb727bc31,0x253d73a1,0x30686e28,0xb3d01cf2 .long 
0x55fd0b8b,0x51b77b1b,0xfeec3173,0xa099d183,0x670e72b7,0x202b1fb7,0xa8e1635f,0xadc88b33,0xf989d905,0x34e8216a,0x29b58d01,0xc2e68d20,0x6fe55a93,0x11f81c92,0x8f296f40,0x15f1462a .long 0xea3d62f2,0x1915d375,0x01c8977d,0xa17765a3,0xe47b26f6,0x7559710a,0x535077a5,0xe0bd29c8,0x08d84858,0x615f976d,0x69ced5c1,0x370dfe85,0xa734fa56,0xbbc7503c,0x91ac4574,0xfbb9f1ec .long 0x060dd7ef,0x95d7ec53,0x6e657979,0xeef2dacd,0xe2a08235,0x54511af3,0x1f4aea3d,0x1e324aa4,0xe6e67671,0x550e7e71,0xbf52faf7,0xbccd5190,0x223cc62a,0xf880d316,0x2b32eb5d,0x0d402c7e .long 0x306a5a3b,0xa40bc039,0x96783a1b,0x4e0a41fd,0x0253cdd4,0xa1e8d39a,0xc7388638,0x6480be26,0x2285f382,0xee365e1d,0xec0b5c36,0x188d8d8f,0x1f0f4d82,0x34ef1a48,0xa487d29a,0x1a8f43e1 .long 0x77aefb3a,0x8168226d,0x1e72c253,0xf69a751e,0xe9594df1,0x8e04359a,0xd14c0467,0x475ffd7d,0x3844e95c,0xb5a2c2b1,0xdd12ef94,0x85caf647,0xf1063d00,0x1ecd2a9f,0x23843311,0x1dd2e229 .long 0x73d17244,0x38f0e09d,0x8fc653f1,0x3ede7746,0xdc20e21c,0xae4459f5,0x6a8599ea,0x00db2ffa,0x30cfd905,0x11682c39,0xa5c112a6,0x4934d074,0x568bfe95,0xbdf063c5,0x016c441a,0x779a440a .long 0x97d6fbdc,0x0c23f218,0xe0776aac,0xd3a5cd87,0xd712e8db,0xcee37f72,0x26f74e8d,0xfb28c70d,0xb61301a0,0xffe0c728,0xd3724354,0xa6282168,0x768ffedc,0x7ff4cb00,0x03b02de9,0xc51b3088 .long 0x3902dda5,0xa5a8147c,0xfe6973b4,0x35d2f706,0xc257457e,0x5ac2efcf,0x8700611b,0x933f48d4,0x4912beb2,0xc365af88,0x162edf94,0x7f5a4de6,0x0c32f34b,0xc646ba7c,0xb2091074,0x632c6af3 .long 0x753e43a9,0x58d4f2e3,0x24d4e23f,0x70e1d217,0xafede6a6,0xb24bf729,0x710c8b60,0x7f4a94d8,0x8d4faa6a,0xaad90a96,0xb066b690,0xd9ed0b32,0x78b6dbfd,0x52fcd37b,0x8bd2b431,0x0b64615e .long 0xcfb9fad5,0x228e2048,0x240b76bd,0xbeaa386d,0x90dad7bc,0x2d6681c8,0x06d38f5e,0x3e553fc3,0x9d5f9750,0xf27cdb9b,0xd28c5b0e,0x3e85c52a,0x5247c39b,0x190795af,0xbddd6828,0x547831eb .long 0x4a82f424,0xf327a227,0x7e47f89d,0x36919c78,0x43c7392c,0xe4783919,0x2316fefe,0xf101b9aa,0x1c5009d2,0xbcdc9e9c,0x9cd18345,0xfb55ea13,0xa3ce77c7,0xf5b5e231,0xd2f2cb3d,0xde6b4527 .long 0x9bb26f5f,0x10f6a333,0x044d85b6,0x1e85db8e,0x94197e54,0xc3697a08,0xa7cb4ea8,0x65e18cc0,0xa471fe6e,0xa38c4f50,0x2f13439c,0xf031747a,0xc007318b,0x53c4a6ba,0x1deccb3d,0xa8da3ee5 .long 0x558216b1,0x0555b31c,0x2f79e6c2,0x90c7810c,0xfe8eed3c,0x9b669f4d,0xe0fac126,0x70398ec8,0xf701b235,0xa96a449e,0xeb94f395,0x0ceecdb3,0xd0cb7431,0x285fc368,0x16a18c64,0x0d37bb52 .long 0xb880d2dd,0x05110d38,0x65930d57,0xa60f177b,0xf36235f5,0x7da34a67,0x183816b9,0x47f5e17c,0xdb394af4,0xc7664b57,0x7036f789,0x39ba215d,0x2f27b472,0x46d2ca0e,0xf73a84b7,0xc42647ee .long 0x64488f1d,0x44bc7545,0xf4cf85d5,0xaa922708,0x53e4df63,0x721a01d5,0x5db46ced,0x649c0c51,0x3cffcb6c,0x6bf0d64e,0x50f71d96,0xe3bf93fe,0xbcc194a0,0x75044558,0x6afdc554,0x16ae3372 .long 0x5ca48f3f,0xbfc01adf,0xe22a9b84,0x64352f06,0xc1099e4a,0xcee54da1,0xfa1b89c0,0xbbda54e8,0x6f6e55fb,0x166a3df5,0x20176f88,0x1ca44a24,0xdfb7b5ff,0x936afd88,0x8611d4a0,0xe34c2437 .long 0x86142103,0x7effbb75,0x1f34fc4d,0x6704ba1b,0x10c1b122,0x7c2a468f,0x8c6aace9,0x36b3a610,0x75a0d050,0xabfcc0a7,0x3ce33e32,0x066f9197,0x29fe09be,0xce905ef4,0xa8376351,0x89ee25ba .long 0xfd29dc76,0x2a3ede22,0x36f17260,0x7fd32ed9,0x284b4126,0x0cadcf68,0xa7951fc8,0x63422f08,0x0807e199,0x562b24f4,0x22ad4490,0xfe9ce5d1,0x0db2b1b4,0xc2f51b10,0xe4541d0d,0xeb3613ff .long 0x2680813b,0xbd2c4a05,0x561b08d6,0x527aa55d,0xa7205558,0xa9f8a40e,0x243d0bec,0xe3eea56f,0xa0ff58b3,0x7b853817,0x1a69e627,0xb67d3f65,0xa869b5d6,0x0b76bbb9,0x546723ed,0xa3afeb82 .long 
0x3e554892,0x5f24416d,0x430e2a45,0x8413b53d,0x9032a2a0,0x99c56aee,0xeec367b1,0x09432bf6,0xdaf0ecc1,0x552850c6,0x5bc92048,0x49ebce55,0x54811307,0xdfb66ba6,0x6f298597,0x1b84f797 .long 0x8d1d7a0d,0x79590481,0x3a6fa556,0xd9fabe03,0xba9e5d35,0xa40f9c59,0xf6247577,0xcb1771c1,0xe9a6312b,0x542a47ca,0x552dd8c5,0xa34b3560,0x0d794716,0xfdf94de0,0x9c623094,0xd46124a9 .long 0x68afe8b4,0x56b7435d,0x6c0d8ea1,0x27f20540,0x73186898,0x12b77e14,0x7479490f,0xdbc3dd46,0xc03b0c05,0x951a9842,0x7921bc96,0x8b1b3bb3,0x2b202e0a,0xa573b346,0x47254d56,0x77e4665d .long 0xd23e3984,0x08b70dfc,0xebd14236,0xab86e8bc,0x57114ba7,0xaa3e07f8,0xab0ef4f2,0x5ac71689,0x0139d9af,0x88fca384,0x76644af0,0x72733f88,0x65d74f4a,0xf122f72a,0xa5626c7a,0x13931577 .long 0x70f8d5a4,0xd5b5d9eb,0xd7bbb228,0x375adde7,0x0c1c0b32,0x31e88b86,0x173edbaa,0xd1f568c4,0x5459df02,0x1592fc83,0x0fcd9a7e,0x2beac0fb,0x1b473b0a,0xb0a6fdb8,0x0fe8fc48,0xe3224c6f .long 0xe87edf5b,0x680bd00e,0x20e77cf5,0x30385f02,0x4d42d1b2,0xe9ab98c0,0xd3816d77,0x72d191d2,0x0917d9e5,0x1564daca,0x1f8fed7f,0x394eab59,0x7fbb3896,0xa209aa8d,0xbe6ac98e,0x5564f3b9 .long 0xd73654ef,0xead21d05,0x13d78d74,0x68d1a9c4,0x6d4973a0,0x61e01708,0x46e6d32a,0x83da3500,0x68ae0118,0x6a3dfca4,0xd02da069,0xa1b9a4c9,0xebab8302,0x0b2ff9c7,0x944ba436,0x98af07c3 .long 0x995f0f9f,0x85997326,0x71b58bc6,0x467fade0,0xbd625a2b,0x47e4495a,0x33c3b8cd,0xfdd2d01d,0xc693f9fa,0x2c38ae28,0x348f7999,0x48622329,0x2161f583,0x97bf738e,0x565e8cc9,0x15ee2fa7 .long 0x5777e189,0xa1a5c845,0x456f2829,0xcc10bee0,0xda762bd5,0x8ad95c56,0xe9d91da8,0x152e2214,0x7cb23c74,0x975b0e72,0xa90c66df,0xfd5d7670,0x225ffc53,0xb5b5b8ad,0xfaded2ae,0xab6dff73 .long 0x6f4cbe9d,0xebd56781,0x6a574bd7,0x0ed8b249,0x81a881fa,0x41c246fe,0xc3db9c70,0x91564805,0x5b862809,0xd7c12b08,0x55858d7b,0x1facd1f1,0xaf09e92a,0x7693747c,0x189a425f,0x3b69dcba .long 0x967365ef,0x0be28e9f,0xe801f5c9,0x57300eb2,0xd583352f,0x93b8ac6a,0xcd05b2b7,0xa2cf1f89,0x4dcc40cc,0x7c0c9b74,0xada523fb,0xfee38c45,0x1099cc4d,0xb49a4dec,0x69f069c6,0x325c377f .long 0x476cc9ff,0xe12458ce,0xc6d4cb63,0x580e0b6c,0x9072289b,0xd561c8b7,0xa619e6da,0x0377f264,0x88e591a5,0x26685362,0x7523ca2b,0xa453a7bd,0xc1df4533,0x8a9536d2,0xbe972f79,0xc8e50f2f .long 0x6d3549cf,0xd433e50f,0xfacd665e,0x6f33696f,0xce11fcb4,0x695bfdac,0xaf7c9860,0x810ee252,0x7159bb2c,0x65450fe1,0x758b357b,0xf7dfbebe,0xd69fea72,0x2b057e74,0x92731745,0xd485717a .long 0xee36860c,0x896c42e8,0x4113c22d,0xdaf04dfd,0x44104213,0x1adbb7b7,0x1fd394ea,0xe5fd5fa1,0x1a4e0551,0x68235d94,0x18d10151,0x6772cfbe,0x09984523,0x276071e3,0x5a56ba98,0xe4e879de .long 0x285b9491,0xaaafafb0,0x1e4c705e,0x01a0be88,0x2ad9caab,0xff1d4f5d,0xc37a233f,0x6e349a4a,0x4a1c6a16,0xcf1c1246,0x29383260,0xd99e6b66,0x5f6d5471,0xea3d4366,0xff8cc89b,0x36974d04 .long 0xcfe89d80,0xc26c49a1,0xda9c8371,0xb42c026d,0xdad066d2,0xca6c013a,0x56a4f3ee,0xfb8f7228,0xd850935b,0x08b579ec,0xd631e1b3,0x34c1a74c,0xac198534,0xcb5fe596,0xe1f24f25,0x39ff21f6 .long 0x8f929057,0x27f29e14,0xc0c853df,0x7a64ae06,0x58e9c5ce,0x256cd183,0xded092a5,0x9d9cce82,0x6e93b7c7,0xcc6e5979,0x31bb9e27,0xe1e47092,0xaa9e29a0,0xb70b3083,0x3785e644,0xbf181a75 .long 0x8ead09f7,0xf53f2c65,0x9780d14d,0x1335e1d5,0xcd1b66bc,0x69cc20e0,0xbbe0bfc8,0x9b670a37,0x28efbeed,0xce53dc81,0x8326a6e5,0x0c74e77c,0xb88e9a63,0x3604e0d2,0x13dc2248,0xbab38fca .long 0x5c0a3f1e,0x8ed6e8c8,0x7c87c37f,0xbcad2492,0x9ee3b78d,0xfdfb62bb,0xcbceba46,0xeba8e477,0xeeaede4b,0x37d38cb0,0x7976deb6,0x0bc498e8,0x6b6147fb,0xb2944c04,0xf71f9609,0x8b123f35 .long 
0xde79dc24,0xa155dcc7,0x558f69cd,0xf1168a32,0x0d1850df,0xbac21595,0xb204c848,0x15c8295b,0x7d8184ff,0xf661aa36,0x30447bdb,0xc396228e,0xbde4a59e,0x11cd5143,0x6beab5e6,0xe3a26e3b .long 0x1402b9d0,0xd3b3a13f,0x2c7bc863,0x573441c3,0x578c3e6e,0x4b301ec4,0x0adaf57e,0xc26fc9c4,0x7493cea3,0x96e71bfd,0x1af81456,0xd05d4b3f,0x6a8c608f,0xdaca2a8a,0x0725b276,0x53ef07f6 .long 0x7824fc56,0x07a5fbd2,0x13289077,0x34675218,0xe0c48349,0x5bf69fd5,0xb6aa7875,0xa613ddd3,0x5450d866,0x7f78c19c,0x8f84a481,0x46f4409c,0x90fce239,0x9f1d1928,0xb2ce44b9,0x016c4168 .long 0xc7435978,0xbae023f0,0x20e30e19,0xb152c888,0xe3fa6faf,0x9c241645,0x84823e60,0x735d95c1,0x03955317,0x03197573,0xf03b4995,0x0b4b02a9,0x70274600,0x076bf559,0xaaf57508,0x32c5cc53 .long 0x60624129,0xe8af6d1f,0x9a5e2b5e,0xb7bc5d64,0x5f082d72,0x3814b048,0xce19677a,0x76f267f2,0xb36eed93,0x626c630f,0x3bf56803,0x55230cd7,0xce2736a0,0x78837949,0xaa6c55f1,0x0d792d60 .long 0xd5c7c5d2,0x0318dbfd,0x072b342d,0xb38f8da7,0x7b8de38a,0x3569bddc,0xa1c94842,0xf25b5887,0x2946ad60,0xb2d5b284,0xe9d1707e,0x854f29ad,0x2c6a4509,0xaa5159dc,0x57189837,0x899f94c0 .long 0xf4a55b03,0xcf6adc51,0x35e3b2d5,0x261762de,0x04827b51,0x4cc43012,0xc6021442,0xcd22a113,0x247c9569,0xce2fd61a,0xd152beca,0x59a50973,0x63a716d4,0x6c835a11,0x187dedcf,0xc26455ed .long 0x49ce89e7,0x27f536e0,0xcc890cb5,0x18908539,0xd83c2aa1,0x308909ab,0x1ab73bd3,0xecd3142b,0xb3f5ab84,0x6a85bf59,0xf2bea4c6,0x3c320a68,0x6da4541f,0xad8dc538,0xb7c41186,0xeaf34eb0 .long 0x977c97c4,0x1c780129,0xc57eb9fa,0x5ff9beeb,0xc822c478,0xa24d0524,0x461cd415,0xfd8eec2a,0xf027458c,0xfbde194e,0x1d1be115,0xb4ff5319,0x4866d6f4,0x63f874d9,0xb21ad0c9,0x35c75015 .long 0x46ac49d2,0xa6b5c9d6,0x83137aa9,0x42c77c0b,0x68225a38,0x24d000fc,0x2fe1e907,0x0f63cfc8,0xc6441f95,0x22d1b01b,0xec8e448f,0x7d38f719,0x787fb1ba,0x9b33fa5f,0x190158df,0x94dcfda1 .long 0x5f6d4a09,0xc47cb339,0xee52b826,0x6b4f355c,0xf51b930a,0x3d100f5d,0x9f668f69,0xf4512fac,0x206c4c74,0x546781d5,0xcb4d2e48,0xd021d4d4,0xca085c2d,0x494a54c2,0x520850a8,0xf1dbaca4 .long 0x490a1aca,0x63c79326,0x41526b02,0xcb64dd9c,0xa2979258,0xbb772591,0x48d97846,0x3f582970,0x7c213ba7,0xd66b70d1,0xe8a0ced4,0xc28febb5,0xc10338c1,0x6b911831,0xbf0126f3,0x0d54e389 .long 0x4af206ee,0x7048d460,0x77e97cb9,0x786c88f6,0xac64802e,0xd4375ae1,0xd53ec11c,0x469bcfe1,0x47062230,0xfc9b340d,0xc5b4a3ac,0xe743bb57,0x59ef45ac,0xfe00b4aa,0x59edf188,0x29a4ef23 .long 0xb483689b,0x40242efe,0x513ac262,0x2575d3f6,0x0ca6db72,0xf30037c8,0x98864be2,0xc9fcce82,0x0149362d,0x84a112ff,0x1c4ae971,0x95e57582,0x945cf86c,0x1fa4b1a8,0x0b024a2f,0x4525a734 .long 0x8f338360,0xe76c8b62,0x28edf32b,0x483ff593,0x298b1aec,0x67e8e90a,0x736d9a21,0x9caab338,0x66892709,0x5c09d2fd,0xb55a1d41,0x2496b4dc,0xe24a4394,0x93f5fb1a,0x6fa8f6c1,0x08c75049 .long 0xc905d85f,0xcaead1c2,0x0733ae57,0xe9d7f790,0xf07cdd94,0x24c9a65c,0xa4b55931,0x7389359c,0x367e45f7,0xf58709b7,0xcb7e7adc,0x1f203067,0xc7b72818,0x82444bff,0xbaac8033,0x07303b35 .long 0xd13b7ea1,0x1e1ee4e4,0xe0e74180,0xe6489b24,0x7e70ef70,0xa5f2c610,0xbdd10894,0xa1655412,0x7af4194e,0x555ebefb,0x8e89bd9c,0x533c1c3c,0x89895856,0x735b9b57,0x567f5c15,0x15fb3cd2 .long 0x526f09fd,0x057fed45,0x8128240a,0xe8a4f10c,0xff2bfd8d,0x9332efc4,0xbd35aa31,0x214e77a0,0x14faa40e,0x32896d73,0x01e5f186,0x767867ec,0x17a1813e,0xc9adf8f1,0x54741795,0xcb6cda78 .long 0x349d51aa,0xb7521b6d,0xe3c7b8e9,0xf56b5a9e,0x32a096df,0xc6f1e5c9,0xa3635024,0x083667c4,0x18087f2f,0x365ea135,0xd136e45d,0xf1b8eaac,0x73aec989,0xc8a0e484,0x142c9259,0xd75a324b .long 
0x01dae185,0xb7b4d001,0x9b7a94bc,0x45434e0b,0xfbd8cb0b,0xf54339af,0xe98ef49e,0xdcc4569e,0x09a51299,0x7789318a,0xb2b025d8,0x81b4d206,0xfae85792,0xf64aa418,0xacd7baf7,0x3e50258f .long 0x2996864b,0xdce84cdb,0x1f485fa4,0xa2e67089,0x534c6a5a,0xb28b2bb6,0xc94b9d39,0x31a7ec6b,0xd6bc20da,0x1d217766,0x86761190,0x4acdb5ec,0x73701063,0x68726328,0x2128c29b,0x4d24ee7c .long 0xa19fd868,0xc072ebd3,0xdb8ddd3b,0x612e481c,0x1a64d852,0xb4e1d754,0xc4c6c4ab,0x00ef95ac,0xaa0a6c46,0x1536d2ed,0x43774790,0x61294086,0x343fda10,0x54af25e8,0xfd25d6f2,0x9ff9d98d .long 0x468b8835,0x0746af7c,0x730ecea7,0x977a31cb,0xc2cf4a81,0xa5096b80,0x6458c37a,0xaa986833,0xa6bd9d34,0x6af29bf3,0x33c5d854,0x6a62fe9b,0xb7133b5e,0x50e6c304,0x7d6e6848,0x04b60159 .long 0x5579bea4,0x4cd296df,0x5ceedaf1,0x10e35ac8,0xe3bcc5b1,0x04c4c5fd,0x89412cf9,0x95f9ee8a,0x82b6eb0f,0x2c9459ee,0x95c2aadd,0x2e845765,0xd327fcfe,0x774a84ae,0x0368d476,0xd8c93722 .long 0xf83e8a3b,0x0dbd5748,0x8d2495f3,0xa579aa96,0xae496e9b,0x535996a0,0xb7f9bcc2,0x07afbfe9,0x5b7bd293,0x3ac1dc6d,0x7022323d,0x3b592cff,0x9c0a3e76,0xba0deb98,0x4b197acb,0x18e78e9f .long 0x296c36ef,0x211cde10,0x82c4da77,0x7ee89672,0xa57836da,0xb617d270,0x9cb7560b,0xf0cd9c31,0xe455fe90,0x01fdcbf7,0x7e7334f3,0x3fb53cbb,0x4e7de4ec,0x781e2ea4,0x0b384fd0,0x8adab3ad .long 0x53d64829,0x129eee2f,0xa261492b,0x7a471e17,0xe4cb4a2c,0xe4f9adb9,0x97ba2c2d,0x3d359f6f,0x0aacd697,0x346c6786,0x75c2f8a8,0x92b444c3,0xd85df44e,0xc79fa117,0x398ddf31,0x56782372 .long 0xbbbab3b8,0x60e690f2,0x8b04816b,0x4851f8ae,0x9c92e4d2,0xc72046ab,0x7cf3136b,0x518c74a1,0xf9877d4c,0xff4eb50a,0xa919cabb,0x14578d90,0xac5eb2b6,0x8218f8c4,0x542016e4,0xa3ccc547 .long 0x327f8349,0x025bf48e,0xf43cb641,0xf3e97346,0x500f1085,0xdc2bafdf,0x2f063055,0x57167876,0x411925a6,0x5bd914b9,0xa1123de5,0x7c078d48,0x182b165d,0xee6bf835,0xba519727,0xb11b5e5b .long 0x1eea7b85,0xe33ea76c,0x92d4f85e,0x2352b461,0xafe115bb,0xf101d334,0x889175a3,0xfabc1294,0x5233f925,0x7f6bcdc0,0xe77fec55,0xe0a802db,0x8069b659,0xbdb47b75,0xf98fbd74,0x1c5e12de .long 0x4b8457ee,0x869c58c6,0x4f7ea9f7,0xa5360f69,0xf460b38f,0xe576c09f,0x22b7fb36,0x6b70d548,0x3bfae315,0x3fd237f1,0xcbdff369,0x33797852,0x25b516f9,0x97df25f5,0xba38ad2d,0x46f388f2 .long 0x89d8ddbb,0x656c4658,0x70f38ee8,0x8830b26e,0xde1212b0,0x4320fd5c,0xe4a2edb2,0xc34f30cf,0x56ab64b8,0xabb131a3,0xd99c5d26,0x7f77f0cc,0xbf981d94,0x66856a37,0x738bd76e,0x19e76d09 .long 0x96238f39,0xe76c8ac3,0xa830b366,0xc0a482be,0x0b4eb499,0xb7b8eaff,0x4bfb4865,0x8ecd83bc,0xa2f3776f,0x971b2cb7,0xf4b88adf,0xb42176a4,0xbe1fa446,0xb9617df5,0xcd031bd2,0x8b32d508 .long 0x53b618c0,0x1c6bd47d,0x6a227923,0xc424f46c,0xdd92d964,0x7303ffde,0x71b5abf2,0xe9712878,0xf815561d,0x8f48a632,0xd3c055d1,0x85f48ff5,0x7525684f,0x222a1427,0x67360cc3,0xd0d841a0 .long 0x0b9267c6,0x4245a926,0xcf07f863,0xc78913f1,0x4d0d9e24,0xaa844c8e,0x3d5f9017,0xa42ad522,0xa2c989d5,0xbd371749,0xe1f5e78e,0x928292df,0x0a1ea6da,0x493b383e,0x13aee529,0x5136fd8d .long 0xf2c34a99,0x860c44b1,0xbf5855ac,0x3b00aca4,0xfaaf37be,0xabf6aaa0,0x2a53ec08,0x65f43682,0xa11b12e1,0x1d9a5801,0xe20ed475,0x78a7ab2c,0x9a41e0d5,0x0de1067e,0x305023ea,0x30473f5f .long 0x169c7d97,0xdd3ae09d,0xcfaef9cd,0x5cd5baa4,0x65a44803,0x5cd7440b,0x47f364de,0xdc13966a,0x2b8357c1,0x077b2be8,0xe9d57c2a,0x0cb1b4c5,0x05ff363e,0x7a4ceb32,0xca35a9ef,0xf310fa4d .long 0xf97f68c6,0xdbb7b352,0x0b02cf58,0x0c773b50,0x3c1f96d9,0xea2e4821,0xeee01815,0xffb357b0,0xe0f28039,0xb9c924cd,0x46a3fbe4,0x0b36c95a,0x5e46db6c,0x1faaaea4,0x1928aaff,0xcae575c3 .long 
0xa70dab86,0x7f671302,0x71c58cfc,0xfcbd12a9,0xbee0cb92,0xcbef9acf,0xf8c1b583,0x573da0b9,0x0d41d550,0x4752fcfe,0x2155cffe,0xe7eec0e3,0x545ae248,0x0fc39fcb,0x8065f44e,0x522cb8d1 .long 0x70cbb96c,0x263c962a,0xbcd124a9,0xe034362a,0x3c2ae58d,0xf120db28,0xfef6d507,0xb9a38d49,0x1ff140fd,0xb1fd2a82,0x20aee7e0,0xbd162f30,0xcb251949,0x4e17a5d4,0x4f7e1c3d,0x2aebcb83 .long 0x937b0527,0x608eb25f,0xeb7d9997,0xf42e1e47,0xb8a53a29,0xeba699c4,0xe091b536,0x1f921c71,0x5b26bbd5,0xcce29e7b,0x3b61a680,0x7a8ef5ed,0xba1f1c7e,0xe5ef8043,0x18158dda,0x16ea8217 .long 0x599ff0f9,0x01778a2b,0x8104fc6b,0x68a923d7,0xda694ff3,0x5bfa44df,0xf7667f12,0x4f7199db,0xe46f2a79,0xc06d8ff6,0xe9f8131d,0x08b5dead,0xabb4ce7c,0x02519a59,0xb42aec3e,0xc4f710bc .long 0x78bde41a,0x3d77b057,0xb4186b5a,0x6474bf80,0x88c65741,0x048b3f67,0x03c7c154,0xc64519de,0x0edfcc4f,0xdf073846,0x48f1aa6b,0x319aa737,0xca909f77,0x8b9f8a02,0x7580bfef,0x90258139 .long 0xc0c22719,0xd8bfd3ca,0xc9ca151e,0xc60209e4,0xd9a1a69c,0x7a744ab5,0x14937f8f,0x6de5048b,0xe115ac04,0x171938d8,0x1c6b16d2,0x7df70940,0x7f8e94e7,0xa6aeb663,0x2a2cf094,0xc130388e .long 0x77f54e6e,0x1850be84,0x65d60fe5,0x9f258a72,0x6c9146d6,0xff7ff0c0,0xe63a830b,0x039aaf90,0x9460342f,0x38f27a73,0x3f795f8a,0x4703148c,0x9681a97e,0x1bb5467b,0xecaeb594,0x00931ba5 .long 0x786f337c,0xcdb6719d,0xe704397d,0xd9c01cd2,0x555c2fef,0x0f4a3f20,0x7c0af223,0x00452509,0x84db8e76,0x54a58047,0x93c8aa06,0x3bacf1aa,0xf7919422,0x11ca957c,0x78cdaa40,0x50641053 .long 0x9f7144ae,0x7a303874,0x43d4acfd,0x170c963f,0x58ddd3ef,0x5e148149,0x9e72dba8,0xa7bde582,0x6fa68750,0x0769da8b,0x572e0249,0xfa64e532,0x2619ad31,0xfcaadf9d,0xa7b349cd,0x87882daa .long 0x6c67a775,0x9f6eb731,0xefc5d0b1,0xcb10471a,0xe1b806b2,0xb433750c,0x57b1ae7e,0x19c5714d,0xed03fd3f,0xc0dc8b7b,0x31bc194e,0xdd03344f,0x8c6320b5,0xa66c52a7,0xd0b6fd93,0x8bc82ce3 .long 0xb35f1341,0xf8e13501,0x25a43e42,0xe53156dd,0x4daeb85c,0xd3adf27e,0xbbeddeb5,0xb81d8379,0x2e435867,0x1b0b546e,0xeba5dd60,0x9020eb94,0x8210cb9d,0x37d91161,0x5c91f1cf,0x4c596b31 .long 0x0e0b040d,0xb228a90f,0x45ff897f,0xbaf02d82,0x00fa6122,0x2aac79e6,0x8e36f557,0x24828817,0x113ec356,0xb9521d31,0x15eff1f8,0x9e48861e,0xe0d41715,0x2aa1d412,0x53f131b8,0x71f86203 .long 0x3fd19408,0xf60da8da,0x278d9d99,0x4aa716dc,0xa8c51c90,0x394531f7,0xf59db51c,0xb560b0e8,0xfa34bdad,0xa28fc992,0x9cd4f8bd,0xf024fa14,0x23a9d0d3,0x5cf530f7,0xe28c9b56,0x615ca193 .long 0x6f73c51e,0x6d2a483d,0xea0dc2dd,0xa4cb2412,0x1eb917ff,0x50663c41,0xeade299e,0x3d3a74cf,0x4a7a9202,0x29b3990f,0xa7b15c3d,0xa9bccf59,0xa5df9208,0x66a3ccdc,0x43f2f929,0x48027c14 .long 0x40b557f0,0xd385377c,0xcd684660,0xe001c366,0xe2183a27,0x1b18ed6b,0x63210329,0x879738d8,0xbda94882,0xa687c74b,0xa684b299,0xd1bbcc48,0x863b3724,0xaf6f1112,0x2c8ce9f8,0x6943d1b4 .long 0x098cafb4,0xe044a3bb,0x60d48caf,0x27ed2310,0x3a31b84d,0x542b5675,0xfcddbed7,0xcbf3dd50,0x41b1d830,0x25031f16,0xcb0c1e27,0xa7ec851d,0xb5ae75db,0xac1c8fe0,0x08c52120,0xb24c7557 .long 0x1d4636c3,0x57f811dc,0x681a9939,0xf8436526,0x9c81adb3,0x1f6bc6d9,0x5b7d80d4,0x840f8ac3,0xf4387f1a,0x731a9811,0xb5156880,0x7c501cd3,0xdfe68867,0xa5ca4a07,0x5fcea120,0xf123d8f0 .long 0xd607039e,0x1fbb0e71,0xcd3a4546,0x2b70e215,0x53324091,0x32d2f01d,0x180ab19b,0xb796ff08,0x3c57c4aa,0x32d87a86,0xb7c49a27,0x2aed9caf,0x31630d98,0x9fb35eac,0x5c3e20a3,0x338e8cdf .long 0x66cde8db,0x80f16182,0x2d72fd36,0x4e159980,0x9b6e5072,0xd7b8f13b,0x3b7b5dc1,0xf5213907,0x8ce4396e,0x4d431f1d,0xa7ed2142,0x37a1a680,0xd01aaf6b,0xbf375696,0xe63aab66,0xaa1c0c54 .long 
0x4ed80940,0x3014368b,0x7a6fcedd,0x67e6d056,0xca97579f,0x7c208c49,0xa23597f6,0xfe3d7a81,0x7e096ae2,0x5e203202,0x24b39366,0xb1f3e1e7,0x2fdcdffc,0x26da26f3,0x6097be83,0x79422f1d .long 0x9db3b381,0x263a2cfb,0xd4df0a4b,0x9c3a2dee,0x7d04e61f,0x728d06e9,0x42449325,0x8b1adfbc,0x7e053a1b,0x6ec1d939,0x66daf707,0xee2be5c7,0x810ac7ab,0x80ba1e14,0xf530f174,0xdd2ae778 .long 0x205b9d8b,0x0435d97a,0x056756d4,0x6eb8f064,0xb6f8210e,0xd5e88a8b,0xec9fd9ea,0x070ef12d,0x3bcc876a,0x4d849505,0xa7404ce3,0x12a75338,0xb8a1db5e,0xd22b49e1,0x14bfa5ad,0xec1f2051 .long 0xb6828f36,0xadbaeb79,0x01bd5b9e,0x9d7a0258,0x1e844b0c,0xeda01e0d,0x887edfc9,0x4b625175,0x9669b621,0x14109fdd,0xf6f87b98,0x88a2ca56,0x170df6bc,0xfe2eb788,0xffa473f9,0x0cea06f4 .long 0xc4e83d33,0x43ed81b5,0x5efd488b,0xd9f35879,0x9deb4d0f,0x164a620f,0xac6a7394,0xc6927bdb,0x9f9e0f03,0x45c28df7,0xfcd7e1a9,0x2868661e,0xffa348f1,0x7cf4e8d0,0x398538e0,0x6bd4c284 .long 0x289a8619,0x2618a091,0x6671b173,0xef796e60,0x9090c632,0x664e46e5,0x1e66f8fb,0xa38062d4,0x0573274e,0x6c744a20,0xa9271394,0xd07b67e4,0x6bdc0e20,0x391223b2,0xeb0a05a7,0xbe2d93f1 .long 0x3f36d141,0xf23e2e53,0x4dfca442,0xe84bb3d4,0x6b7c023a,0xb804a48d,0x76431c3b,0x1e16a8fa,0xddd472e0,0x1b5452ad,0x0d1ee127,0x7d405ee7,0xffa27599,0x50fc6f1d,0xbf391b35,0x351ac53c .long 0x4444896b,0x7efa14b8,0xf94027fb,0x64974d2f,0xde84487d,0xefdcd0e8,0x2b48989b,0x8c45b260,0xd8463487,0xa8fcbbc2,0x3fbc476c,0xd1b2b3f7,0xc8f443c0,0x21d005b7,0x40c0139c,0x518f2e67 .long 0x06d75fc1,0x56036e8c,0x3249a89f,0x2dcf7bb7,0xe245e7dd,0x81dd1d3d,0xebd6e2a7,0xf578dc4b,0xdf2ce7a0,0x4c028903,0x9c39afac,0xaee36288,0x146404ab,0xdc847c31,0xa4e97818,0x6304c0d8 .long 0xa91f6791,0xae51dca2,0x9baa9efc,0x2abe4190,0x559c7ac1,0xd9d2e2f4,0xfc9f773a,0xe82f4b51,0x4073e81c,0xa7713027,0xfbb596fc,0xc0276fac,0xa684f70c,0x1d819fc9,0xc9f7b1e0,0x29b47fdd .long 0x459b1940,0x358de103,0x5b013e93,0xec881c59,0x49532ad3,0x51574c93,0xb37b46de,0x2db1d445,0xdf239fd8,0xc6445b87,0x151d24ee,0xc718af75,0xf43c6259,0xaea1c4a4,0x70be02f7,0x40c0e5d7 .long 0x721b33f2,0x6a4590f4,0xfedf04ea,0x2124f1fb,0x9745efe7,0xf8e53cde,0x65f046d9,0xe7e10432,0xe4d0c7e6,0xc3fca28e,0x87253b1b,0x847e339a,0x3743e643,0x9b595348,0x4fd12fc5,0xcb6a0a0b .long 0x27d02dcc,0xfb6836c3,0x7a68bcc2,0x5ad00982,0x005e912d,0x1b24b44c,0x811fdcfe,0xcc83d20f,0x666fba0c,0x36527ec1,0x14754635,0x69948197,0x556da9c2,0xfcdcb1a8,0x81a732b2,0xa5934267 .long 0xa714181d,0xec1214ed,0x6067b341,0x609ac13b,0xa545df1f,0xff4b4c97,0x34d2076b,0xa1240501,0x1409ca97,0x6efa0c23,0x20638c43,0x254cc1a8,0xdcfb46cd,0xd4e363af,0x03942a27,0x62c2adc3 .long 0x56e46483,0xc67b9df0,0x63736356,0xa55abb20,0xc551bc52,0xab93c098,0xb15fe64b,0x382b49f9,0x4dff8d47,0x9ec221ad,0x437df4d6,0x79caf615,0xbb456509,0x5f13dc64,0x191f0714,0xe4c589d9 .long 0x3fd40e09,0x27b6a8ab,0x77313ea9,0xe455842e,0x1f55988b,0x8b51d1e2,0x062bbbfc,0x5716dd73,0x4e8bf3de,0x633c11e5,0x1b85be3b,0x9a0e77b6,0x0911cca6,0x56510729,0xefa6590f,0x27e76495 .long 0x070d3aab,0xe4ac8b33,0x9a2cd5e5,0x2643672b,0x1cfc9173,0x52eff79b,0x90a7c13f,0x665ca49b,0xb3efb998,0x5a8dda59,0x052f1341,0x8a5b922d,0x3cf9a530,0xae9ebbab,0xf56da4d7,0x35986e7b .long 0xff3513cc,0x3a636b5c,0x3198f7dd,0xbb0cf8ba,0x41f16f86,0xb8d40522,0xde13a7bf,0x760575d8,0x9f7aa181,0x36f74e16,0xf509ed1c,0x163a3ecf,0x3c40a491,0x6aead61f,0xdfe8fcaa,0x158c95fc .long 0x13cda46f,0xa3991b6e,0x342faed0,0x79482415,0x666b5970,0xf3ba5bde,0xb26ab6dd,0x1d52e6bc,0x8608dd3d,0x768ba1e7,0xea076586,0x4930db2a,0xe7dc1afa,0xd9575714,0xf7c58817,0x1fc7bf7d .long 
0xd9eee96c,0x6b47accd,0xe58cec37,0x0ca277fb,0xe702c42a,0x113fe413,0xc47cbe51,0xdd1764ee,0x7b3ed739,0x041e7cde,0x5ce9e1c0,0x50cb7459,0x2925b212,0x35568513,0x001b081c,0x7cff95c4 .long 0x8088b454,0x63ee4cbd,0x9a9e0c8a,0xdb7f32f7,0x6b2447cb,0xb377d418,0xd370219b,0xe3e982aa,0xc2a2a593,0x06ccc1e4,0x0773f24f,0x72c36865,0x95859423,0xa13b4da7,0x75040c8f,0x8bbf1d33 .long 0xda50c991,0x726f0973,0x822d6ee2,0x48afcd5b,0x20fd7771,0xe5fc718b,0xfd0807a1,0xb9e8e77d,0x99a7703d,0x7f5e0f44,0x618e36f3,0x6972930e,0x23807bbe,0x2b7c77b8,0xcb27ff50,0xe5b82405 .long 0xbd379062,0xba8b8be3,0x2dce4a92,0xd64b7a1d,0xb2952e37,0x040a73c5,0xd438aeca,0x0a9e252e,0xc39d3bcb,0xdd43956b,0xb32b2d63,0x1a31ca00,0x5c417a18,0xd67133b8,0x2ef442c8,0xd08e4790 .long 0x255c0980,0x98cb1ae9,0x2b4a739f,0x4bd86381,0x1e4a45a1,0x5a5c31e1,0x9cb0db2f,0x1e5d55fe,0x8ff5cc29,0x74661b06,0x0eb8a4f4,0x026b389f,0x58848c24,0x536b21a4,0x81dc72b0,0x2e5bf8ec .long 0xad886aac,0x03c187d0,0xb771b645,0x5c16878a,0xc74045ab,0xb07dfc6f,0x7800caed,0x2c6360bf,0xb9c972a3,0x24295bb5,0x7c9a6dba,0xc9e6f88e,0x92a79aa6,0x90ffbf24,0x41c26ac2,0xde29d50a .long 0xd309cbe6,0x9f0af483,0xe0bced4f,0x5b020d8a,0xb38023e3,0x606e986d,0x1abc6933,0xad8f2c9d,0xe7400e93,0x19292e1d,0x52be5e4d,0xfe3e18a9,0x2e0680bf,0xe8e9771d,0xc54db063,0x8c5bec98 .long 0x74a55d1f,0x2af9662a,0x046f66d8,0xe3fbf28f,0xd4dc4794,0xa3a72ab4,0x5c7c2dd8,0x09779f45,0xc3d19d8d,0xd893bdaf,0x57d6a6df,0xd5a75094,0x952e6255,0x8cf8fef9,0xda9a8aff,0x3da67cfb .long 0x2c160dcd,0x4c23f62a,0x8f90eaef,0x34e6c5e3,0xa9a65d5a,0x35865519,0x8fd38a3d,0x07c48aae,0x50068527,0xb7e7aeda,0x1c90936a,0x2c09ef23,0xe879324c,0x31ecfeb6,0xfb0ec938,0xa0871f6b .long 0xd84d835d,0xb1f0fb68,0x861dc1e6,0xc90caf39,0x7594f8d7,0x12e5b046,0x65012b92,0x26897ae2,0xa4d6755d,0xbcf68a08,0x0991fbda,0x403ee41c,0x3bbf17e8,0x733e343e,0x679b3d65,0xd2c7980d .long 0xd2e11305,0x33056232,0xf3c07a6f,0x966be492,0xbb15509d,0x6a8878ff,0x0a9b59a4,0xff221101,0xabe30129,0x6c9f564a,0x336e64cf,0xc6f2c940,0x8b0c8022,0x0fe75262,0x6ae8db87,0xbe0267e9 .long 0x93bc042b,0x22e192f1,0xb237c458,0xf085b534,0x832c4168,0xa0d192bd,0xbdf6271d,0x7a76e9e3,0xb88911b5,0x52a882fa,0xb4db0eb5,0xc85345e4,0x81a7c3ff,0xa3be02a6,0xf0ec0469,0x51889c8c .long 0xa5e829e5,0x9d031369,0x1607aa41,0xcbb4c6fc,0x241d84c1,0x75ac59a6,0x8829e0ee,0xc043f2bf,0x8ea5e185,0x82a38f75,0xd87cbd9f,0x8bda40b9,0x2d8fc601,0x9e65e75e,0xa35690b3,0x3d515f74 .long 0xda79e5ac,0x534acf4f,0x8630215f,0x68b83b3a,0xd085756e,0x5c748b2e,0xe5d37cb2,0xb0317258,0xc5ccc2c4,0x6735841a,0x3d9d5069,0x7d7dc96b,0xfd1754bd,0xa147e410,0xd399ddd5,0x65296e94 .long 0xbc8fa5bc,0xf6b5b2d0,0x500c277b,0x8a5ead67,0xdfa08a5d,0x214625e6,0x959cf047,0x51fdfedc,0x289fca32,0x6bc9430b,0x9d9bdc3f,0xe36ff0cf,0x58ea0ede,0x2fe187cb,0x5a900b3f,0xed66af20 .long 0x5fa9f4d6,0x00e0968b,0x37a362e7,0x2d4066ce,0xbd07e772,0xa99a9748,0x06a4f1d0,0x710989c0,0xce40cbd8,0xd5dedf35,0x1743293d,0xab55c5f0,0x8aa24e2c,0x766f1144,0x605fbcb4,0x94d874f8 .long 0xa518001b,0xa365f0e8,0x9d04ef0f,0xee605eb6,0xba8d4d25,0x5a3915cd,0xb5113472,0x44c0e1b8,0x8b6740dc,0xcbb024e8,0xee1d4f0c,0x89087a53,0x1fc4e372,0xa88fa05c,0xaf8b3af2,0x8bf395cb .long 0xdeb8568b,0x1e71c9a1,0x80fb3d32,0xa35daea0,0x2cf8fb81,0xe8b6f266,0x9490696a,0x6d51afe8,0x51803a19,0x81beac6e,0x86219080,0xe3d24b7f,0xdf6f463c,0x727cfd9d,0x72284ee8,0x8c6865ca .long 0xb743f4ef,0x32c88b7d,0xe7d11dce,0x3793909b,0x2ff2ebe8,0xd398f922,0xe5e49796,0x2c70ca44,0xcb1131b1,0xdf4d9929,0x25888e79,0x7826f298,0xf1d8740a,0x4d3a112c,0x270afa8b,0x00384cb6 .long 
0x3ab48095,0xcb64125b,0x62d05106,0x3451c256,0xa4955845,0xd73d577d,0xbf9f4433,0x39570c16,0xadecf263,0xd7dfaad3,0xdc76e102,0xf1c3d8d1,0x54c6a836,0x5e774a58,0x3e92d47b,0xdad4b672 .long 0xf0d796a0,0xbe7e990f,0xdf0e8b02,0x5fc62478,0x030c00ad,0x8aae8bf4,0x9004ba0f,0x3d2db93b,0xd85d5ddc,0xe48c8a79,0x6bb07f34,0xe907caa7,0xa39eaed5,0x58db343a,0xadaf5724,0x0ea6e007 .long 0xd23233f3,0xe00df169,0x77cb637f,0x3e322796,0x1da0cf6c,0x1f897c0e,0x31d6bbdd,0xa651f5d8,0x1a230c76,0xdd61af19,0xcdaa5e4a,0xbd527272,0xd0abcd7e,0xca753636,0x370bd8dc,0x78bdd37c .long 0x17cd93fe,0xc23916c2,0xdadce6e2,0x65b97a4d,0x174e42f8,0xe04ed4eb,0xbb21480a,0x1491ccaa,0x23196332,0x145a8280,0x587b479a,0x3c3862d7,0x01dcd0ed,0x9f4a88a3,0x3ea12f1f,0x4da2b7ef .long 0xb126e48e,0xf8e7ae33,0xf494e237,0x404a0b32,0xc55acadb,0x9beac474,0xcbec9fd9,0x4ee5cf3b,0x7df3c8c3,0x336b33b9,0xb76808fd,0xbd905fe3,0xaa45c16a,0x8f436981,0x3dd27b62,0x255c5bfa .long 0xc3dd9b4d,0x71965cbf,0xfc068a87,0xce23edbf,0x745b029b,0xb78d4725,0xcefdd9bd,0x74610713,0x1266bf52,0x7116f75f,0x18e49bb6,0x02046722,0x3d6f19e3,0xdf43df9f,0xe685cb2f,0xef1bc7d0 .long 0x7078c432,0xcddb27c1,0xb77fedb7,0xe1961b9c,0xc2290570,0x1edc2f5c,0x19cbd886,0x2c3fefca,0xc2af389a,0xcf880a36,0xbda71cea,0x96c610fd,0x32aa8463,0xf03977a9,0x8586d90a,0x8eb7763f .long 0x2a296e77,0x3f342454,0x42837a35,0xc8718683,0x6a09c731,0x7dc71090,0x51b816db,0x54778ffb,0xaf06defd,0x6b33bfec,0x8592b70b,0xfe3c105f,0x61da6114,0xf937fda4,0x4c266ad7,0x3c13e651 .long 0x855938e8,0xe363a829,0x9de54b72,0x2eeb5d9e,0x20ccfab9,0xbeb93b0e,0x25e61a25,0x3dffbb5f,0x1acc093d,0x7f655e43,0x3964ce61,0x0cb6cc3d,0xe5e9b460,0x6ab283a1,0xa1c7e72d,0x55d787c5 .long 0xdeadbf02,0x4d2efd47,0xac459068,0x11e80219,0x71f311f0,0x810c7626,0x4ab6ef53,0xfa17ef8d,0x93e43bff,0xaf47fd25,0x0be40632,0x5cb5ff3f,0x8ee61da3,0x54687106,0xb08afd0f,0x7764196e .long 0xf0290a8f,0x831ab3ed,0xcb47c387,0xcae81966,0x184efb4f,0xaad7dece,0x4749110e,0xdcfc53b3,0x4cb632f9,0x6698f23c,0xb91f8067,0xc42a1ad6,0x6284180a,0xb116a81d,0xe901326f,0xebedf5f8 .long 0x97e3e044,0xf2274c9f,0x11d09fc9,0x42018520,0xd18e6e23,0x56a65f17,0x352b683c,0x2ea61e2a,0x575eaa94,0x27d291bc,0xb8ff522d,0x9e7bc721,0xa7f04d6f,0x5f7268bf,0xaba41748,0x5868c73f .long 0x7be0eead,0x9f85c2db,0xff719135,0x511e7842,0xc5ea90d7,0x5a06b1e9,0x26fab631,0x0c19e283,0xe9206c55,0x8af8f0cf,0x3553c06a,0x89389cb4,0xf65f8004,0x39dbed97,0xc508991d,0x0621b037 .long 0x96e78cc4,0x1c52e635,0x0c06b4a8,0x5385c8b2,0xb0e87d03,0xd84ddfdb,0x934bafad,0xc49dfb66,0x59f70772,0x7071e170,0x3a1db56b,0x3a073a84,0x3b8af190,0x03494903,0xd32920f0,0x7d882de3 .long 0xb2cf8940,0x91633f0a,0x6f948f51,0x72b0b178,0x782653c8,0x2d28dc30,0xdb903a05,0x88829849,0x6a19d2bb,0xb8095d0c,0x86f782cb,0x4b9e7f0c,0x2d907064,0x7af73988,0x8b32643c,0xd12be0fe .long 0x0e165dc3,0x358ed23d,0x4e2378ce,0x3d47ce62,0xfeb8a087,0x7e2bb0b9,0xe29e10b9,0x3246e8ae,0x03ce2b4d,0x459f4ec7,0xbbc077cf,0xe9b4ca1b,0x0e9940c1,0x2613b4f2,0x047d1eb1,0xfc598bb9 .long 0x45036099,0x9744c62b,0x167c65d8,0xa9dee742,0xdabe1943,0x0c511525,0x93c6c624,0xda110554,0x651a3be2,0xae00a52c,0x884449a6,0xcda5111d,0xff33bed1,0x063c06f4,0x0d3d76b4,0x73baaf9a .long 0x7fc63668,0x52fb0c9d,0x0c039cde,0x6886c9dd,0x55b22351,0x602bd599,0x360c7c13,0xb00cab02,0x81b69442,0x8cb616bc,0xb55c3cee,0x41486700,0xf49ba278,0x71093281,0x64a50710,0xad956d9c .long 0x638a7e81,0x9561f28b,0x5980ddc3,0x54155cdf,0xd26f247a,0xb2db4a96,0x4787d100,0x9d774e4e,0x078637d2,0x1a9e6e2e,0x5e0ae06a,0x1c363e2d,0xe9cfa354,0x7493483e,0x7f74b98d,0x76843cb3 .long 
0xd4b66947,0xbaca6591,0x04460a8c,0xb452ce98,0x43768f55,0x6830d246,0x7dff12df,0xf4197ed8,0x400dd0f7,0x6521b472,0x4b1e7093,0x59f5ca8f,0x080338ae,0x6feff11b,0xa29ca3c6,0x0ada31f6 .long 0x94a2c215,0x24794eb6,0x05a57ab4,0xd83a43ab,0x2a6f89fe,0x264a543a,0xdd5ec7c2,0x2c2a3868,0x8439d9b2,0xd3373940,0x0acd1f11,0x715ea672,0xe7e6cc19,0x42c1d235,0xb990585c,0x81ce6e96 .long 0xd809c7bd,0x04e5dfe0,0x8f1050ab,0xd7b2580c,0xd8a4176f,0x6d91ad78,0x4e2e897c,0x0af556ee,0x921de0ac,0x162a8b73,0x7ea78400,0x52ac9c22,0xefce2174,0xee2a4eea,0x6d637f79,0xbe61844e .long 0x789a283b,0x0491f1bc,0x880836f4,0x72d3ac3d,0x88e5402d,0xaa1c5ea3,0xd5cc473d,0x1b192421,0x9dc84cac,0x5c0b9998,0x9c6e75b8,0xb0a8482d,0x3a191ce2,0x639961d0,0x6d837930,0xda3bc865 .long 0x056e6f8f,0xca990653,0x64d133a7,0x84861c41,0x746abe40,0x8b403276,0xebf8e303,0xb7b4d51a,0x220a255d,0x05b43211,0x02419e6e,0xc997152c,0x630c2fea,0x76ff47b6,0x281fdade,0x50518677 .long 0xcf902b0b,0x3283b8ba,0x37db303b,0x8d4b4eb5,0x755011bc,0xcc89f42d,0xdd09d19b,0xb43d74bb,0x8adba350,0x65746bc9,0xb51c1927,0x364eaf8c,0x10ad72ec,0x13c76596,0xf8d40c20,0x30045121 .long 0xea7b979b,0x6d2d99b7,0xe6fb3bcd,0xcd78cd74,0x86cffbfe,0x11e45a9e,0x637024f6,0x78a61cf4,0x3d502295,0xd06bc872,0x458cb288,0xf1376854,0x342f8586,0xb9db26a1,0x4beee09e,0xf33effcf .long 0xb30cfb3a,0xd7e0c4cd,0x6c9db4c8,0x6d09b8c1,0x07c8d9df,0x40ba1a42,0x1c52c66d,0x6fd495f7,0x275264da,0xfb0e169f,0xe57d8362,0x80c2b746,0x49ad7222,0xedd987f7,0x4398ec7b,0xfdc229af .long 0x52666a58,0xb0d1ed84,0xe6a9c3c2,0x4bcb6e00,0x26906408,0x3c57411c,0x13556400,0xcfc20755,0x5294dba3,0xa08b1c50,0x8b7dd31e,0xa30ba286,0x991eca74,0xd70ba90e,0xe762c2b9,0x094e142c .long 0x979f3925,0xb81d783e,0xaf4c89a7,0x1efd130a,0xfd1bf7fa,0x525c2144,0x1b265a9e,0x4b296904,0xb9db65b6,0xed8e9634,0x03599d8a,0x35c82e32,0x403563f3,0xdaa7a54f,0x022c38ab,0x9df088ad .long 0xbb3fd30a,0xe5cfb066,0xeff0354e,0x429169da,0x3524e36c,0x809cf852,0x0155be1d,0x136f4fb3,0x1fbba712,0x4826af01,0x506ba1a1,0x6ef0f0b4,0x77aea73e,0xd9928b31,0x5eaa244e,0xe2bf6af2 .long 0x4237b64b,0x8d084f12,0xe3ecfd07,0x688ebe99,0xf6845dd8,0x57b8a70c,0x5da4a325,0x808fc59c,0xa3585862,0xa9032b2b,0xedf29386,0xb66825d5,0x431ec29b,0xb5a5a8db,0x3a1e8dc8,0xbb143a98 .long 0x12ae381b,0x35ee94ce,0x86ccda90,0x3a7f176c,0x4606eaca,0xc63a657e,0x43cd04df,0x9ae5a380,0xed251b46,0x9bec8d15,0xcaca5e64,0x1f5d6d30,0x9ff20f07,0x347b3b35,0xf7e4b286,0x4d65f034 .long 0xf111661e,0x9e93ba24,0xb105eb04,0xedced484,0xf424b578,0x96dc9ba1,0xe83e9069,0xbf8f66b7,0xd7ed8216,0x872d4df4,0x8e2cbecf,0xbf07f377,0x98e73754,0x4281d899,0x8aab8708,0xfec85fbb .long 0xa5ba5b0b,0x9a3c0dee,0x42d05299,0xe6a116ce,0xe9b02d42,0xae9775fe,0xa1545cb6,0x72b05200,0x31a3b4ea,0xbc506f7d,0x8bbd9b32,0xe5893078,0xe4b12a97,0xc8bc5f37,0x4a73b671,0x6b000c06 .long 0x765fa7d0,0x13b5bf22,0x1d6a5370,0x59805bf0,0x4280db98,0x67a5e29d,0x776b1ce3,0x4f53916f,0x33ddf626,0x714ff61f,0xa085d103,0x4206238e,0xe5809ee3,0x1c50d4b7,0x85f8eb1d,0x999f450d .long 0xe4c79e9b,0x658a6051,0xc66a9fea,0x1394cb73,0xc6be7b23,0x27f31ed5,0x5aa6f8fe,0xf4c88f36,0x4aaa499e,0x0fb0721f,0xe3fb2a6b,0x68b3a7d5,0x3a92851d,0xa788097d,0xe96f4913,0x060e7f8a .long 0x1a3a93bc,0x82eebe73,0xa21adc1a,0x42bbf465,0xef030efd,0xc10b6fa4,0x87b097bb,0x247aa4c7,0xf60c77da,0x8b8dc632,0xc223523e,0x6ffbc26a,0x344579cf,0xa4f6ff11,0x980250f6,0x5825653c .long 0xbc1aa2b9,0xb2dd097e,0x37a0333a,0x07889393,0x37a0db38,0x1cf55e71,0x792c1613,0x2648487f,0x3fcef261,0xdad01336,0x0eabf129,0x6239c81d,0x9d276be2,0x8ee761de,0x1eda6ad3,0x406a7a34 .long 
0x4a493b31,0x4bf367ba,0x9bf7f026,0x54f20a52,0x9795914b,0xb696e062,0x8bf236ac,0xcddab96d,0xed25ea13,0x4ff2c70a,0x81cbbbe7,0xfa1d09eb,0x468544c5,0x88fc8c87,0x696b3317,0x847a670d .long 0x64bcb626,0xf133421e,0x26dee0b5,0xaea638c8,0xb310346c,0xd6e7680b,0xd5d4ced3,0xe06f4097,0x7512a30b,0x09961452,0xe589a59a,0xf3d867fd,0x52d0c180,0x2e73254f,0x333c74ac,0x9063d8a3 .long 0xd314e7bc,0xeda6c595,0x467899ed,0x2ee7464b,0x0a1ed5d3,0x1cef423c,0x69cc7613,0x217e76ea,0xe7cda917,0x27ccce1f,0x8a893f16,0x12d8016b,0x9fc74f6b,0xbcd6de84,0xf3144e61,0xfa5817e2 .long 0x0821ee4c,0x1f354164,0x0bc61992,0x1583eab4,0x1d72879f,0x7490caf6,0xf76ae7b2,0x998ad9f3,0xa41157f7,0x1e181950,0xe8da3a7e,0xa9d7e1e6,0x8426b95f,0x963784eb,0x542e2a10,0x0ee4ed6e .long 0xac751e7b,0xb79d4cc5,0xfd4211bd,0x93f96472,0xc8de4fc6,0x8c72d3d2,0xdf44f064,0x7b69cbf5,0xf4bf94e1,0x3da90ca2,0xf12894e2,0x1a5325f8,0x7917d60b,0x0a437f6c,0x96c9cb5d,0x9be70486 .long 0xe1dc5c05,0xb4d880bf,0xeebeeb57,0xd738adda,0xdf0fe6a3,0x6f0119d3,0x66eaaf5a,0x5c686e55,0xdfd0b7ec,0x9cb10b50,0x6a497c21,0xbdd0264b,0x8c546c96,0xfc093514,0x79dbf42a,0x58a947fa .long 0x49ccd6d7,0xc0b48d4e,0x88bd5580,0xff8fb02c,0x07d473b2,0xc75235e9,0xa2188af3,0x4fab1ac5,0x97576ec0,0x030fa3bc,0x0b7e7d2f,0xe8c946e8,0x70305600,0x40a5c9cc,0xc8b013b4,0x6d8260a9 .long 0x70bba85c,0x0368304f,0xa4a0d311,0xad090da1,0x2415eec1,0x7170e870,0x8461ea47,0xbfba35fe,0xc1e91938,0x6279019a,0x1afc415f,0xa47638f3,0xbcba0e0f,0x36c65cbb,0x034e2c48,0x02160efb .long 0x615cd9e4,0xe6c51073,0xf1243c06,0x498ec047,0xb17b3d8c,0x3e5a8809,0x0cc565f1,0x5cd99e61,0x7851dafe,0x81e312df,0xa79061e2,0xf156f5ba,0x880c590e,0x80d62b71,0x0a39faa1,0xbec9746f .long 0xc8ed1f7a,0x1d98a9c1,0xa81d5ff2,0x09e43bb5,0x0da0794a,0xd5f00f68,0x661aa836,0x412050d9,0x90747e40,0xa89f7c4e,0xb62a3686,0x6dc05ebb,0x308e3353,0xdf4de847,0x9fb53bb9,0x53868fbb .long 0xcfdcf7dd,0x2b09d2c3,0x723fcab4,0x41a9fce3,0x07f57ca3,0x73d905f7,0xac8e1555,0x080f9fb1,0x9ba7a531,0x7c088e84,0xed9a147f,0x07d35586,0xaf48c336,0x602846ab,0x0ccf0e79,0x7320fd32 .long 0xb18bd1ff,0xaa780798,0xafdd2905,0x52c2e300,0x434267cd,0xf27ea3d6,0x15605b5f,0x8b96d16d,0x4b45706b,0x7bb31049,0x743d25f8,0xe7f58b8e,0x87f30076,0xe9b5e45b,0x5d053d5a,0xd19448d6 .long 0xd3210a04,0x1ecc8cb9,0xdafb5269,0x6bc7d463,0x67c3489f,0x3e59b10a,0x65641e1b,0x1769788c,0xbd6cb838,0x8a53b82d,0x236d5f22,0x7066d6e6,0x6908536e,0x03aa1c61,0x66ae9809,0xc971da0d .long 0xc49a2fac,0x01b3a86b,0x3092e77a,0x3b8420c0,0x7d6fb556,0x02057300,0xbff40a87,0x6941b2a1,0x0658ff2a,0x140b6308,0x3424ab36,0x87804363,0x5751e299,0x0253bd51,0x449c3e3a,0xc75bcd76 .long 0x7f8f875d,0x92eb4090,0x56c26bbf,0x9c9d754e,0x8110bbe7,0x158cea61,0x745f91ea,0x62a6b802,0xc6e7394b,0xa79c41aa,0xad57ef10,0x445b6a83,0x6ea6f40c,0x0c5277eb,0x88633365,0x319fe96b .long 0x385f63cb,0x0b0fc61f,0x22bdd127,0x41250c84,0x09e942c2,0x67d153f1,0xc021ad5d,0x60920d08,0x724d81a5,0x229f5746,0x5bba3299,0xb7ffb892,0xde413032,0x518c51a1,0x3c2fd94c,0x2a9bfe77 .long 0x3191f4fd,0xcbcde239,0xd3d6ada1,0x43093e16,0x58769606,0x184579f3,0xd236625c,0x2c94a8b3,0x5c437d8e,0x6922b9c0,0xd8d9f3c8,0x3d4ae423,0x2e7090a2,0xf72c31c1,0xd76a55bd,0x4ac3f5f3 .long 0x6b6af991,0x342508fc,0x1b5cebbd,0x0d527100,0xdd440dd7,0xb84740d0,0x780162fd,0x748ef841,0xdfc6fafb,0xa8dbfe0e,0xf7300f27,0xeadfdf05,0xfeba4ec9,0x7d06555f,0x9e25fa97,0x12c56f83 .long 0xd39b8c34,0x77f84203,0x3125eddb,0xed8b1be6,0xf6e39dc5,0x5bbf2441,0x6a5d678a,0xb00f6ee6,0x57d0ea99,0xba456ecf,0x17e06c43,0xdcae0f58,0x0f5b4baa,0x01643de4,0xd161b9be,0x2c324341 .long 
0xe126d468,0x80177f55,0x76748e09,0xed325f1f,0xcfa9bdc2,0x6116004a,0x3a9fb468,0x2d8607e6,0x6009d660,0x0e573e27,0x8d10c5a1,0x3a525d2e,0x3b9009a0,0xd26cb45c,0xde9d7448,0xb6b0cdc0 .long 0xe1337c26,0x949c9976,0xd73d68e5,0x6faadebd,0xf1b768d9,0x9e158614,0x9cc4f069,0x22dfa557,0xbe93c6d6,0xccd6da17,0xa504f5b9,0x24866c61,0x8d694da1,0x2121353c,0x0140b8c6,0x1c6ca580 .long 0xe964021e,0xc245ad8c,0x032b82b3,0xb83bffba,0x47ef9898,0xfaa220c6,0x982c948a,0x7e8d3ac6,0xbc2d124a,0x1faa2091,0x05b15ff4,0xbd54c3dd,0xc87c6fb7,0x386bf3ab,0xfdeb6f66,0xfb2b0563 .long 0x5b45afb4,0x4e77c557,0xefb8912d,0xe9ded649,0x42f6e557,0x7ec9bbf5,0x62671f00,0x2570dfff,0x88e084bd,0x2b3bfb78,0xf37fe5b4,0xa024b238,0x95649aee,0x44e7dc04,0x5e7ec1d8,0x498ca255 .long 0xaaa07e86,0x3bc766ea,0xf3608586,0x0db6facb,0xbdc259c8,0xbadd2549,0x041c649f,0x95af3c6e,0x02e30afb,0xb36a928c,0x008a88b8,0x9b5356ad,0xcf1d9e9d,0x4b67a5f1,0xa5d8d8ce,0xc6542e47 .long 0x7adfb6cc,0x73061fe8,0x98678141,0xcc826fd3,0x3c80515a,0x00e758b1,0x41485083,0x6afe3247,0xb6ae8a75,0x0fcb08b9,0x4acf51e1,0xb8cf388d,0x6961b9d6,0x344a5560,0x6a97fd0c,0x1a6778b8 .long 0xecc4c7e3,0xd840fdc1,0x16db68cc,0xde9fe47d,0xa3e216aa,0xe95f89de,0x9594a8be,0x84f1a6a4,0x5a7b162b,0x7ddc7d72,0xadc817a3,0xc5cfda19,0x78b58d46,0x80a5d350,0x82978f19,0x93365b13 .long 0x26a1fc90,0x2e44d225,0x4d70705d,0x0d6d10d2,0xd70c45f4,0xd94b6b10,0xb216c079,0x0f201022,0x658fde41,0xcec966c5,0x7e27601d,0xa8d2bc7d,0xff230be7,0xbfcce3e1,0x0033ffb5,0x3394ff6b .long 0x8132c9af,0xd890c509,0x361e7868,0xaac4b0eb,0xe82d15aa,0x5194ded3,0x23ae6b7d,0x4550bd2e,0xea5399d4,0x3fda318e,0x91638b80,0xd989bffa,0xa14aa12d,0x5ea124d0,0x3667b944,0x1fb1b899 .long 0x44c44d6a,0x95ec7969,0x57e86137,0x91df144a,0x73adac44,0x915fd620,0x59a83801,0x8f01732d,0x3aa0a633,0xec579d25,0xc9d6d59c,0x06de5e7c,0xb1ef8010,0xc132f958,0xe65c1a02,0x29476f96 .long 0xd34c3565,0x336a77c0,0x1b9f1e9e,0xef1105b2,0xf9e08002,0x63e6d08b,0xc613809e,0x9aff2f21,0x3a80e75d,0xb5754f85,0x6bbda681,0xde71853e,0x8197fd7a,0x86f041df,0x127817fa,0x8b332e08 .long 0xb9c20cda,0x05d99be8,0xd5cd0c98,0x89f7aad5,0x5bb94183,0x7ef936fe,0xb05cd7f2,0x92ca0753,0x74a1e035,0x9d65db11,0x13eaea92,0x02628cc8,0x49e4fbf2,0xf2d9e242,0xe384f8b7,0x94fdfd9b .long 0x63428c6b,0x65f56054,0x90b409a5,0x2f7205b2,0xff45ae11,0xf778bb78,0xc5ee53b2,0xa13045be,0x03ef77fe,0xe00a14ff,0xffef8bef,0x689cd59f,0x1e9ade22,0x3578f0ed,0x6268b6a8,0xe99f3ec0 .long 0xea1b3c3e,0xa2057d91,0xb8823a4a,0x2d1a7053,0x2cca451e,0xabbb336a,0x2218bb5d,0xcd2466e3,0xc8cb762d,0x3ac1f42f,0x7690211f,0x7e312aae,0x45d07450,0xebb9bd73,0x46c2213f,0x207c4b82 .long 0x375913ec,0x99d425c1,0x67908220,0x94e45e96,0xcd67dbf6,0xc08f3087,0xc0887056,0xa5670fbe,0x66f5b8fc,0x6717b64a,0x786fec28,0xd5a56aea,0xc0ff4952,0xa8c3f55f,0x457ac49b,0xa77fefae .long 0x98379d44,0x29882d7c,0x509edc8a,0xd000bdfb,0xe66fe464,0xc6f95979,0xfa61bde0,0x504a6115,0xeffea31a,0x56b3b871,0xf0c21a54,0x2d3de26d,0x834753bf,0x21dbff31,0x69269d86,0xe67ecf49 .long 0x151fe690,0x7a176952,0x7f2adb5f,0x03515804,0xd1b62a8d,0xee794b15,0xaae454e6,0xf004ceec,0xf0386fac,0x0897ea7c,0xd1fca751,0x3b62ff12,0x1b7a04ec,0x154181df,0xfb5847ec,0x2008e04a .long 0x41dbd772,0xd147148e,0x22942654,0x2b419f73,0xe9c544f7,0x669f30d3,0xc8540149,0x52a2c223,0x634dfb02,0x5da9ee14,0xf47869f3,0x5f074ff0,0xa3933acc,0x74ee878d,0x4fe35ed1,0xe6510651 .long 0xf1012e7a,0xb3eb9482,0xa8a566ae,0x51013cc0,0x47c00d3b,0xdd5e9243,0x946bb0e5,0x7fde089d,0xc731b4b3,0x030754fe,0x99fda062,0x12a136a4,0x5a1a35bc,0x7c1064b8,0x446c84ef,0xbf1f5763 .long 
0xa16d4b34,0xed29a56d,0xdca21c4f,0x7fba9d09,0x6d8de486,0x66d7ac00,0x73a2a5e1,0x60061987,0x9da28ff0,0x8b400f86,0x43c4599c,0x3133f708,0xee28cb0d,0x9911c9b8,0x8e0af61d,0xcd7e2874 .long 0x72ed91fc,0x5a85f0f2,0x9cd4a373,0x85214f31,0x1925253c,0x881fe5be,0x91e8bc76,0xd8dc98e0,0x585cc3a2,0x7120affe,0x735bf97a,0x724952ed,0x3eb34581,0x5581e7dc,0xe52ee57d,0x5cbff4f2 .long 0x87d8cc7b,0x8d320a0e,0xf1d280d0,0x9beaa7f3,0x9beec704,0x7a0b9571,0x5b7f0057,0x9126332e,0x8ed3bd6d,0x01fbc1b4,0xd945eb24,0x35bb2c12,0x9a8ae255,0x6404694e,0x8d6abfb3,0xb6092eec .long 0xcc058865,0x4d76143f,0x6e249922,0x7b0a5af2,0x6a50d353,0x8aef9440,0x64f0e07a,0xe11e4bcc,0xa14a90fa,0x4472993a,0xba0c51d4,0x7706e20c,0x1532672d,0xf403292f,0x21829382,0x52573bfa .long 0x3b5bdb83,0x6a7bb6a9,0xa4a72318,0x08da65c0,0x63eb065f,0xc58d22aa,0x1b15d685,0x1717596c,0xb266d88b,0x112df0d0,0x5941945a,0xf688ae97,0x7c292cac,0x487386e3,0x57d6985c,0x42f3b50d .long 0x6a90fc34,0x6da4f998,0x65ca8a8d,0xc8f257d3,0x6951f762,0xc2feabca,0x74c323ac,0xe1bc81d0,0x251a2a12,0x1bc68f67,0xbe8a70dc,0x10d86587,0xf0f84d2e,0xd648af7f,0x6a43ac92,0xf0aa9ebc .long 0x27596893,0x69e3be04,0x45bf452b,0xb6bb02a6,0xf4c698c8,0x0875c11a,0xbece3794,0x6652b5c7,0x4f5c0499,0x7b3755fd,0xb5532b38,0x6ea16558,0xa2e96ef7,0xd1c69889,0x61ed8f48,0x9c773c3a .long 0x9b323abc,0x2b653a40,0xf0e1d791,0xe26605e1,0x4a87157a,0x45d41064,0xcbbce616,0x8f9a78b7,0xc407eddd,0xcf1e44aa,0xa35b964f,0x81ddd1d8,0xfd083999,0x473e339e,0x8e796802,0x6c94bdde .long 0x8545d185,0x5a304ada,0x738bb8cb,0x82ae44ea,0xdf87e10e,0x628a35e3,0xa15b9fe3,0xd3624f3d,0x14be4254,0xcc44209b,0xbdbc2ea5,0x7d0efcbc,0x04c37bbe,0x1f603362,0x56a5852c,0x21f363f5 .long 0xa8501550,0xa1503d1c,0xd8ab10bb,0x2251e0e1,0x6961c51c,0xde129c96,0x81910f68,0x1f7246a4,0x5f2591f2,0x2eb744ee,0x5e627157,0x3c47d33f,0x22f3bd68,0x4d6d62c9,0xcb8df856,0x6120a64b .long 0x7b5d07df,0x3a9ac6c0,0x7ef39783,0xa92b9558,0xab3a9b4f,0xe128a134,0xb1252f05,0x41c18807,0x80ba9b1c,0xfc7ed089,0xc532a9dd,0xac8dc6de,0x55246809,0xbf829cef,0x5b4ee80f,0x101b784f .long 0xb6f11603,0xc09945bb,0x41d2801e,0x57b09dbe,0xa97534a8,0xfba5202f,0xc17b9614,0x7fd8ae5f,0x78308435,0xa50ba666,0xd3868c4d,0x9572f77c,0x2dd7aab0,0x0cef7bfd,0x2c7c79ff,0xe7958e08 .long 0x25346689,0x81262e42,0xb07c7004,0x716da290,0xb7950ee3,0x35f911ea,0x261d21b5,0x6fd72969,0x08b640d3,0x52389803,0x887f12a1,0x5b0026ee,0x742e9311,0x20e21660,0x5ff77ff7,0x0ef6d541 .long 0xf9c41135,0x969127f0,0x68a64993,0xf21d60c9,0xe541875c,0x656e5d0c,0xa1d3c233,0xf1e0f84e,0x06002d60,0x9bcca359,0x06191552,0xbe2da60c,0x61181ec3,0x5da8bbae,0x65806f19,0x9f04b823 .long 0xd4b79bb8,0xf1604a7d,0x52c878c8,0xaee806fb,0x8d47b8e8,0x34144f11,0x949f9054,0x72edf52b,0x2127015a,0xebfca84e,0x9cb7cef3,0x9051d0c0,0x296deec8,0x86e8fe58,0x41010d74,0x33b28188 .long 0x171b445f,0x01079383,0x8131ad4c,0x9bcf21e3,0xc93987e8,0x8cdfe205,0xc92e8c8f,0xe63f4152,0x30add43d,0x729462a9,0xc980f05a,0x62ebb143,0x3b06e968,0x4f3954e5,0x242cf6b1,0xfe1d75ad .long 0xaf8685c8,0x5f95c6c7,0x2f8f01aa,0xd4c1c8ce,0x2574692a,0xc44bbe32,0xd4a4a068,0xb8003478,0x2eca3cdb,0x7c8fc6e5,0xec04d399,0xea1db16b,0x8f2bc5cf,0xb05bc82e,0xf44793d2,0x763d517f .long 0x08bd98d0,0x4451c1b8,0x6575f240,0x644b1cd4,0x7375d270,0x6907eb33,0xfa2286bd,0x56c8bebd,0xc4632b46,0xc713d2ac,0xafd60242,0x17da427a,0xc95c7546,0x313065b7,0xbf17a3de,0xf8239898 .long 0x4c830320,0xf3b7963f,0x903203e3,0x842c7aa0,0xe7327afb,0xaf22ca0a,0x967609b6,0x38e13092,0x757558f1,0x73b8fb62,0xf7eca8c1,0x3cc3e831,0xf6331627,0xe4174474,0xc3c40234,0xa77989ca .long 
0x44a081e0,0xe5fd17a1,0xb70e296a,0xd797fb7d,0x481f719c,0x2b472b30,0xfe6f8c52,0x0e632a98,0xc5f0c284,0x89ccd116,0x2d987c62,0xf51088af,0x4c2de6cf,0x2a2bccda,0xf679f0f9,0x810f9efe .long 0x7ffe4b3e,0xb0f394b9,0xe5fa5d21,0x0b691d21,0x9dfbbc75,0xb0bd7747,0xfaf78b00,0xd2830fda,0x52434f57,0xf78c249c,0x98096dab,0x4b1f7545,0x8ff8c0b3,0x73bf6f94,0x454e134c,0x34aef03d .long 0xb7ac7ec5,0xf8d151f4,0xe50da7d5,0xd6ceb95a,0xdc3a0eb8,0xa1b492b0,0xb3dd2863,0x75157b69,0xc5413d62,0xe2c4c74e,0xbc5fc4c7,0xbe329ff7,0x60fa9dda,0x835a2aea,0x7445cb87,0xf117f5ad .long 0xb0166f7a,0xae8317f4,0xceec74e6,0xfbd3e3f7,0xe0874bfd,0xfdb516ac,0xc681f3a3,0x3d846019,0x7c1620b0,0x0b12ee5c,0x2b63c501,0xba68b4dd,0x6668c51e,0xac03cd32,0x4e0bcb5b,0x2a6279f7 .long 0x6ae85c10,0x17bd69b0,0x1dfdd3a6,0x72946979,0x2c078bec,0xd9a03268,0xbfd68a52,0x41c6a658,0x0e023900,0xcdea1024,0xb10d144d,0xbaeec121,0x058ab8dc,0x5a600e74,0xbb89ccdd,0x1333af21 .long 0x3aaba1f1,0xdf25eae0,0x3b7144cf,0x2cada16e,0x71ab98bc,0x657ee27d,0x7a6fc96e,0x99088b4c,0x3549dbd4,0x05d5c0a0,0xf158c3ac,0x42cbdf8f,0x87edd685,0x3fb6b3b0,0x86f064d0,0x22071cf6 .long 0xff2811e5,0xd2d6721f,0xfe7fae8c,0xdb81b703,0xd3f1f7bb,0x3cfb74ef,0x16cdeb5d,0x0cdbcd76,0x566a808c,0x4f39642a,0x340064d6,0x02b74454,0x0528fa6f,0xfabbadca,0xd3fc0bb6,0xe4c3074c .long 0xb796d219,0xb32cb8b0,0x34741dd9,0xc3e95f4f,0x68edf6f5,0x87212125,0xa2b9cb8e,0x7a03aee4,0xf53a89aa,0x0cd3c376,0x948a28dc,0x0d8af9b1,0x902ab04f,0xcf86a3f4,0x7f42002d,0x8aacb62a .long 0xf62ffd52,0x106985eb,0x5797bf10,0xe670b54e,0xc5e30aef,0x4b405209,0x4365b5e9,0x12c97a20,0x1fe32093,0x104646ce,0x3907a8c9,0x13cb4ff6,0xd46e726b,0x8b9f30d1,0xaba0f499,0xe1985e21 .long 0x10a230cd,0xc573dea9,0xcd30f947,0x24f46a93,0xabe2010a,0xf2623fcf,0x73f00e4f,0x3f278cb2,0x50b920eb,0xed55c67d,0x8e760571,0xf1cb9a2d,0x0895b709,0x7c50d109,0x190d4369,0x4207cf07 .long 0xc4127fe1,0x3b027e81,0x3ae9c566,0xa9f8b9ad,0xacbfbba5,0x5ab10851,0x569556f5,0xa747d648,0x2ba97bf7,0xcc172b5c,0xbcfa3324,0x15e0f77d,0x7686279d,0xa345b797,0xe38003d3,0x5a723480 .long 0x8f5fcda8,0xfd8e139f,0xbdee5bfd,0xf3e558c4,0xe33f9f77,0xd76cbaf4,0x71771969,0x3a4c97a4,0xf6dce6a7,0xda27e84b,0x13e6c2d1,0xff373d96,0xd759a6e9,0xf115193c,0x63d2262c,0x3f9b7025 .long 0x317cd062,0xd9764a31,0x199f8332,0x30779d8e,0x16b11b0b,0xd8074106,0x78aeaed8,0x7917ab9f,0x28fb1d8e,0xb67a9cbe,0x136eda33,0x2e313563,0xa371a86c,0x010b7069,0x6744e6b7,0x44d90fa2 .long 0xd6b3e243,0x68190867,0x59048c48,0x9fe6cd9d,0x95731538,0xb900b028,0x32cae04f,0xa012062f,0x9399d082,0x8107c8bc,0x41df12e2,0x47e8c54a,0xb6ef3f73,0x14ba5117,0x81362f0b,0x22260bea .long 0x1a18cc20,0x90ea261e,0x2321d636,0x2192999f,0xe311b6a0,0xef64d314,0x3b54a1f5,0xd7401e4c,0x6fbca2ba,0x19019983,0x8fbffc4b,0x46ad3293,0x3786bf40,0xa142d3f6,0xb67039fc,0xeb5cbc26 .long 0x252bd479,0x9cb0ae6c,0x12b5848f,0x05e0f88a,0xa5c97663,0x78f6d2b2,0xc162225c,0x6f6e149b,0xde601a89,0xe602235c,0xf373be1f,0xd17bbe98,0xa8471827,0xcaf49a5b,0x18aaa116,0x7e1a0a85 .long 0x270580c3,0x6c833196,0xf1c98a14,0x1e233839,0xae34e0a5,0x67b2f7b4,0xd8ce7289,0x47ac8745,0x100dd467,0x2b74779a,0x4ee50d09,0x274a4337,0x83608bc9,0x603dcf13,0xc89e8388,0xcd9da6c3 .long 0x355116ac,0x2660199f,0xb6d18eed,0xcc38bb59,0x2f4bc071,0x3075f31f,0x265dc57e,0x9774457f,0xc6db88bb,0x06a6a9c8,0x4ec98e04,0x6429d07f,0x05ecaa8b,0x8d05e57b,0x7872ea7b,0x20f140b1 .long 0xca494693,0xdf8c0f09,0xf252e909,0x48d3a020,0x57b14b12,0x4c5c29af,0xbf47ad1c,0x7e6fa37d,0x49a0c938,0x66e7b506,0x6be5f41f,0xb72c0d48,0xb2359412,0x6a6242b8,0x8e859480,0xcd35c774 .long 
0x87baa627,0x12536fea,0xf72aa680,0x58c1fec1,0x601e5dc9,0x6c29b637,0xde9e01b9,0x9e3c3c1c,0x2bcfe0b0,0xefc8127b,0x2a12f50d,0x35107102,0x4879b397,0x6ccd6cb1,0xf8a82f21,0xf792f804 .long 0xa9b46402,0x509d4804,0xc10f0850,0xedddf85d,0x4b6208aa,0x928410dc,0x391012dc,0xf6229c46,0x7727b9b6,0xc5a7c41e,0xaa444842,0x289e4e4b,0xe9a947ea,0x049ba1d9,0x83c8debc,0x44f9e47f .long 0x611f8b8e,0xfa77a1fe,0xf518f427,0xfd2e416a,0x114ebac3,0xc5fffa70,0x5d89697b,0xfe57c4e9,0xb1aaf613,0xfdd053ac,0xea585a45,0x31df210f,0x24985034,0x318cc10e,0x5f1d6130,0x1a38efd1 .long 0x0b1e9e21,0xbf86f237,0x1dbe88aa,0xb258514d,0x90c1baf9,0x1e38a588,0xbdb9b692,0x2936a01e,0x6dd5b20c,0xd576de98,0x70f98ecf,0xb586bf71,0xc42d2fd7,0xcccf0f12,0xfb35bd7b,0x8717e61c .long 0x35e6fc06,0x8b1e5722,0x0b3e13d5,0x3477728f,0xaa8a7372,0x150c294d,0x3bfa528a,0xc0291d43,0xcec5a196,0xc6c8bc67,0x5c2e8a7c,0xdeeb31e4,0xfb6e1c51,0xba93e244,0x2e28e156,0xb9f8b71b .long 0x968a2ab9,0xce65a287,0x46bbcb1f,0xe3c5ce69,0xe7ae3f30,0xf8c835b9,0xff72b82b,0x16bbee26,0xfd42cd22,0x665e2017,0xf8b1d2a0,0x1e139970,0x79204932,0x125cda29,0x49c3bee5,0x7aee94a5 .long 0x89821a66,0x68c70160,0x8f981669,0xf7c37678,0x48cc3645,0xd90829fc,0xd70addfc,0x346af049,0x370bf29c,0x2057b232,0x42e650ee,0xf90c73ce,0xa126ab90,0xe03386ea,0x975a087b,0x0e266e7e .long 0x0fca65d9,0x80578eb9,0x16af45b8,0x7e2989ea,0xcac75a4e,0x7438212d,0x4fef36b8,0x38c7ca39,0xd402676a,0x8650c494,0xf72c7c48,0x26ab5a66,0xce3a464e,0x4e6cb426,0x2b72f841,0xf8f99896 .long 0x1a335cc8,0x8c318491,0x6a5913e4,0x563459ba,0xc7b32919,0x1b920d61,0xa02425ad,0x805ab8b6,0x8d006086,0x2ac512da,0xbcf5c0fd,0x6ca4846a,0xac2138d7,0xafea51d8,0x344cd443,0xcb647545 .long 0xbd7d9040,0x0429ee8f,0x819b9c96,0xee66a2de,0xdea7d744,0x54f9ec25,0x671721bb,0x2ffea642,0x114344ea,0x4f19dbd1,0xfd0dbc8b,0x04304536,0x29ec7f91,0x014b50aa,0xbb06014d,0xb5fc22fe .long 0x1ee682e0,0x60d963a9,0xfe85c727,0xdf48abc0,0x2e707c2d,0x0cadba13,0xa645aeff,0xde608d3a,0xedafd883,0x05f1c28b,0xbd94de1f,0x3c362ede,0x13593e41,0x8dd0629d,0x766d6eaf,0x0a5e736f .long 0xf68cf9d1,0xbfa92311,0xc1797556,0xa4f9ef87,0x5601c209,0x10d75a1f,0x09b07361,0x651c374c,0x88b5cead,0x49950b58,0x6fa9dbaa,0x0ef00058,0x4e15f33a,0xf51ddc26,0x2ef46140,0x1f8b5ca6 .long 0xee9523f0,0x343ac0a3,0x975ea978,0xbb75eab2,0x107387f4,0x1bccf332,0x9ab0062e,0x790f9259,0x1e4f6a5f,0xf1a363ad,0x62519a50,0x06e08b84,0x7265f1ee,0x60915187,0x93ae985e,0x6a80ca34 .long 0xaaba4864,0x81b29768,0x8d52a7d6,0xb13cabf2,0x8ead03f1,0xb5c36348,0x81c7c1c0,0xc932ad95,0xcae1e27b,0x5452708e,0x1b0df648,0x9dac4269,0xdfcdb8bc,0x233e3f0c,0xec540174,0xe6ceccdf .long 0x95081181,0xbd0d845e,0x699355d5,0xcc8a7920,0xc3b375a8,0x111c0f6d,0xfd51e0dc,0xfd95bc6b,0x6888523a,0x4a106a26,0xcb01a06d,0x4d142bd6,0xadb9b397,0x79bfd289,0xe9863914,0x0bdbfb94 .long 0x1660f6a6,0x29d8a229,0x551c042d,0x7f6abcd6,0x0ac3ffe8,0x13039deb,0xec8523fb,0xa01be628,0x0ca1c328,0x6ea34103,0xb903928e,0xc74114bd,0x9e9144b0,0x8aa4ff4e,0x7f9a4b17,0x7064091f .long 0xe447f2c4,0xa3f4f521,0x604291f0,0x81b8da7a,0x7d5926de,0xd680bc46,0x34a1202f,0x84f21fd5,0x4e9df3d8,0x1d1e3181,0x39ab8d34,0x1ca4861a,0x5b19aa4a,0x809ddeec,0x4d329366,0x59f72f7e .long 0x386d5087,0xa2f93f41,0xdd67d64f,0x40bf739c,0x66702158,0xb4494205,0x73b1e178,0xc33c65be,0x38ca6153,0xcdcd657c,0xdc791976,0x97f4519a,0xcd6e1f39,0xcc7c7f29,0x7e3c3932,0x38de9cfb .long 0x7b793f85,0xe448eba3,0xf067e914,0xe9f8dbf9,0xf114ae87,0xc0390266,0xcd6a8e2a,0x39ed75a7,0x7ffba390,0xadb14848,0x6af9bc09,0x67f8cb8b,0x9c7476db,0x322c3848,0x52a538d6,0xa320fecf .long 
0xb2aced2b,0xe0493002,0x616bd430,0xdfba1809,0xc331be70,0x531c4644,0x90d2e450,0xbc04d32e,0x0f9f142d,0x1805a0d1,0x47ee5a23,0x2c44a0c5,0x3989b4e3,0x31875a43,0x0c063481,0x6b1949fd .long 0xbe0f4492,0x2dfb9e08,0xe9d5e517,0x3ff0da03,0xf79466a8,0x03dbe9a1,0x15ea9932,0x0b87bcd0,0xab1f58ab,0xeb64fc83,0x817edc8a,0x6d9598da,0x1d3b67e5,0x699cff66,0x92635853,0x645c0f29 .long 0xeabaf21c,0x253cdd82,0x2241659e,0x82b9602a,0x2d9f7091,0x2cae07ec,0x8b48cd9b,0xbe4c720c,0x6f08d6c9,0x6ce5bc03,0xaf10bf40,0x36e8a997,0x3e10ff12,0x83422d21,0xbcc12494,0x7b26d3eb .long 0xc9469ad6,0xb240d2d0,0x30afa05b,0xc4a11b4d,0xdd6ba286,0x4b604ace,0x3ee2864c,0x18486600,0x8d9ce5be,0x5869d6ba,0xff4bfb0d,0x0d8f68c5,0x5700cf73,0xb69f210b,0x6d37c135,0x61f6653a .long 0x5aff5a48,0xff3d432b,0x72ba3a69,0x0d81c4b9,0xfa1899ef,0xee879ae9,0x2d6acafd,0xbac7e2a0,0x1c664399,0xd6d93f6c,0x5bcb135d,0x4c288de1,0x9dab7cbf,0x83031dab,0x3abbf5f0,0xfe23feb0 .long 0xcdedca85,0x9f1b2466,0x1a09538c,0x140bb710,0x5e11115d,0xac8ae851,0x6f03f59e,0x0d63ff67,0x7d234afb,0x755e5551,0x7e208fc1,0x61c2db4e,0xf28a4b5d,0xaa9859ce,0x34af030f,0xbdd6d4fc .long 0x3be01cb1,0xd1c4a26d,0x243aa07c,0x9ba14ffc,0xb2503502,0xf95cd3a9,0x7d2a93ab,0xe379bc06,0xd4ca8d68,0x3efc18e9,0x80bb412a,0x083558ec,0x9645a968,0xd903b940,0x9ba6054f,0xa499f0b6 .long 0xb8349abe,0x208b573c,0x30b4fc1c,0x3baab3e5,0xcb524990,0x87e978ba,0xccdf0e80,0x3524194e,0x7d4bcc42,0x62711725,0xb90109ba,0xe90a3d9b,0x1323e1e0,0x3b1bdd57,0x5eae1599,0xb78e9bd5 .long 0x9e03d278,0x0794b746,0xd70e6297,0x80178605,0x99c97855,0x171792f8,0xf5a86b5c,0x11b393ee,0xd8884f27,0x48ef6582,0xbf19ba5f,0xbd44737a,0xa42062c6,0x8698de4c,0x61ce9c54,0x8975eb80 .long 0xd7fe71f3,0xd50e57c7,0xbc97ce38,0x15342190,0x4df07b63,0x51bda2de,0x200eb87d,0xba12aeae,0xa9b4f8f6,0xabe135d2,0xfad6d99c,0x04619d65,0x7994937c,0x4a6683a7,0x6f94f09a,0x7a778c8b .long 0x20a71b89,0x8c508623,0x1c229165,0x241a2aed,0xaaf83a99,0x352be595,0x1562bac8,0x9fbfee7f,0x5c4017e3,0xeaf658b9,0x15120b86,0x1dc7f9e0,0x4c034d6f,0xd84f13dd,0xeaea3038,0x283dd737 .long 0xcd85d6a2,0x197f2609,0xfae60177,0x6ebbc345,0x4e12fede,0xb80f031b,0x07a2186b,0xde55d0c2,0x24dcdd5a,0x1fb3e37f,0x7ed191fb,0x8d602da5,0x76023e0d,0x108fb056,0x459c20c0,0x70178c71 .long 0x3fe54cf0,0xfad5a386,0x02bbb475,0xa4a3ec4f,0x919d94d7,0x1aa5ec20,0xa81e4ab3,0x5d3b63b5,0x5ad3d2af,0x7fa733d8,0xd1ac7a37,0xfbc586dd,0x40779614,0x282925de,0xe74a242a,0xfe0ffffb .long 0x906151e5,0x3f39e67f,0x55e10649,0xcea27f5f,0xc17cf7b7,0xdca1d4e1,0x2fe2362d,0x0c326d12,0x7dd35df3,0x05f7ac33,0xc396dbdf,0x0c3b7639,0x03b7db1c,0x0912f5ac,0x5c9ed4a9,0x9dea4b70 .long 0xaae3f639,0x475e6e53,0xfc278bac,0xfaba0e7c,0x9490375f,0x16f9e221,0xa5a7ed0a,0xaebf9746,0xf41ad5d6,0x45f9af3f,0xb2e99224,0x03c4623c,0xb3cf56aa,0x82c5bb5c,0x34567ed3,0x64311819 .long 0x8be489ac,0xec57f211,0xb9a1104b,0x2821895d,0x6064e007,0x610dc875,0x5b20d0fe,0x8e526f3f,0x5b645aee,0x6e71ca77,0x800e10ff,0x3d1dcb9f,0x189cf6de,0x36b51162,0x6bb17353,0x2c5a3e30 .long 0x2a6c6fbf,0xc186cd3e,0x4bf97906,0xa74516fa,0x279d6901,0x5b4b8f4b,0x2b573743,0x0c4e57b4,0xb6e386b6,0x75fdb229,0x99deac27,0xb46793fd,0xcf712629,0xeeec47ea,0xcbc3b2dd,0xe965f3c4 .long 0x425c6559,0x8dd1fb83,0x0af06fda,0x7fc00ee6,0x33d956df,0xe98c9225,0x4fbdc8a2,0x0f1ef335,0xb79b8ea2,0x2abb5145,0xbdbff288,0x40fd2945,0xd7185db7,0x6a814ac4,0xc084609a,0xc4329d6f .long 0xed1be45d,0xc9ba7b52,0xe4cd2c74,0x891dd20d,0x824139b1,0x5a4d4a7f,0xb873c710,0x66c17716,0x2843c4e0,0x5e5bc141,0xb97eb5bf,0xd5ac4817,0x450c95c7,0xc0f8af54,0x318406c5,0xc91b3fa0 .long 
0xab9d97f8,0x360c340a,0x90a2d611,0xfb57bd07,0xa6a6f7e5,0x4339ae3c,0x2feb8a10,0x9c1fcd2a,0xc7ea7432,0x972bcca9,0x308076f6,0x1b0b924c,0x2a5b4ca5,0x80b2814a,0x61ef3b29,0x2f78f55b .long 0xc18a414f,0xf838744a,0x903d0a86,0xc611eaae,0x2a453f55,0x94dabc16,0x14efb279,0xe6f2e3da,0x9320dc3c,0x5b7a6017,0x8df6b5a4,0x692e382f,0x2d40fa90,0x3f5e15e0,0x643dd318,0xc87883ae .long 0x53544774,0x511053e4,0x3adba2bc,0x834d0ecc,0xbae371f5,0x4215d7f7,0x6c8663bc,0xfcfd57bf,0xd6901b1d,0xded2383d,0xb5587dc3,0x3b49fbb4,0x07625f62,0xfd44a08d,0x9de9b762,0x3ee4d65b .long 0x0d63d1fa,0x64e5137d,0x02a9d89f,0x658fc052,0x50436309,0x48894874,0xd598da61,0xe9ae30f8,0x818baf91,0x2ed710d1,0x8b6a0c20,0xe27e9e06,0x1c1a6b44,0x1e28dcfb,0xd6ac57dc,0x883acb64 .long 0xc2c6ff70,0x8735728d,0xc5dc2235,0x79d6122f,0x19e277f9,0x23f5d003,0xdded8cc7,0x7ee84e25,0x63cd880a,0x91a8afb0,0x3574af60,0x3f3ea7c6,0x02de7f42,0x0cfcdc84,0xb31aa152,0x62d0792f .long 0x8a5807ce,0x8e1b4e43,0xe4109a7e,0xad283893,0xafd59dda,0xc30cc9cb,0x3d8d8093,0xf65f36c6,0xa60d32b2,0xdf31469e,0x3e8191c8,0xee93df4b,0x355bdeb5,0x9c1017c5,0x8616aa28,0xd2623185 .long 0xdec31a21,0xb02c83f9,0x6ad9d573,0x988c8b23,0xa57be365,0x53e983ae,0x646f834e,0xe968734d,0x5da6309b,0x9137ea8f,0xc1f1ce16,0x10f3a624,0xca440921,0x782a9ea2,0x5b46f1b5,0xdf94739e .long 0xcce85c9b,0x9f9be006,0xa4c7c2d3,0x360e70d6,0xaefa1e60,0x2cd5beea,0x8c3d2b6d,0x64cf63c0,0xe1cf6f90,0xfb107fa3,0xd5e044e6,0xb7e937c6,0xce34db9f,0x74e8ca78,0x3e210bd0,0x4f8b36c1 .long 0x34a35ea8,0x1df165a4,0x4d4412f6,0x3418e0f7,0x518836c3,0x5af1f8af,0x130e1965,0x42ceef4d,0x543a1957,0x5560ca0b,0x886cb123,0xc33761e5,0xfe98ed30,0x66624b1f,0x1090997d,0xf772f4bf .long 0x4885d410,0xf4e540bb,0x9ba5f8d7,0x7287f810,0xde98dfb1,0x22d0d865,0xbcfbb8a3,0x49ff51a1,0x6bc3012e,0xb6b6fa53,0x170d541d,0x3d31fd72,0x4b0f4966,0x8018724f,0x87dbde07,0x79e7399f .long 0xf4f8b16a,0x56f8410e,0xc47b266a,0x97241afe,0x6d9c87c1,0x0a406b8e,0xcd42ab1b,0x803f3e02,0x04dbec69,0x7f0309a8,0x3bbad05f,0xa83b85f7,0xad8e197f,0xc6097273,0x5067adc1,0xc097440e .long 0x3524ff16,0x730eafb6,0x823fc6ce,0xd7f9b51e,0x443e4ac0,0x27bd0d32,0x4d66f217,0x40c59ad9,0x17c387a4,0x6c33136f,0xeb86804d,0x5043b8d5,0x675a73c9,0x74970312,0xf16669b6,0x838fdb31 .long 0x418e7ddd,0xc507b6dd,0x472f19d6,0x39888d93,0x0c27eb4d,0x7eae26be,0xfbabb884,0x17b53ed3,0x2b01ae4f,0xfc27021b,0xcf488682,0x88462e87,0x215e2d87,0xbee096ec,0xd242e29b,0xeb2fea9a .long 0xb821fc28,0x5d985b5f,0xdc1e2ad2,0x89d2e197,0x9030ba62,0x55b566b8,0x4f41b1c6,0xe3fd41b5,0xb9a96d61,0xb738ac2e,0x369443f4,0x7f8567ca,0xf803a440,0x8698622d,0x8fe2f4dc,0x2b586236 .long 0x56b95bce,0xbbcc00c7,0x616da680,0x5ec03906,0x72214252,0x79162ee6,0x86a892d2,0x43132b63,0x2f3263bf,0x4bdd3ff2,0x9cd0a142,0xd5b3733c,0x44415ccb,0x592eaa82,0x8d5474ea,0x663e8924 .long 0x5236344e,0x8058a25e,0xbda76ee6,0x82e8df9d,0x11cc3d22,0xdcf6efd8,0x3b4ab529,0x00089cda,0xbd38a3db,0x91d3a071,0xef72b925,0x4ea97fc0,0xea3edf75,0x0c9fc15b,0xa4348ed3,0x5a6297cd .long 0xce7c42d4,0x0d38ab35,0x82feab10,0x9fd493ef,0x82111b45,0x46056b6d,0x73efc5c3,0xda11dae1,0x5545a7fb,0xdc740278,0x40d507e6,0xbdb2601c,0x7066fa58,0x121dfeeb,0x39ae8c2a,0x214369a8 .long 0x06e0956c,0x195709cb,0x010cd34b,0x4c9d254f,0x0471a532,0xf51e13f7,0x1e73054d,0xe19d6791,0xdb5c7be3,0xf702a628,0xb24dde05,0xc7141218,0xf29b2e2e,0xdc18233c,0x85342dba,0x3a6bd1e8 .long 0xb311898c,0x3f747fa0,0xcd0eac65,0xe2a272e4,0xf914d0bc,0x4bba5851,0xc4a43ee3,0x7a1a9660,0xa1c8cde9,0xe5a367ce,0x7271abe3,0x9d958ba9,0x3d1615cd,0xf3ff7eb6,0xf5ae20b0,0xa2280dce .long 
0xcf640147,0x56dba5c1,0x5e83d118,0xea5a2e3d,0xda24c511,0x04cd6b6d,0xe854d214,0x1c0f4671,0x69565381,0x91a6b7a9,0xdecf1f5b,0xdc966240,0xfcf5d009,0x1b22d21c,0x9021dbd5,0x2a05f641 .long 0xd4312483,0x8c0ed566,0x643e216f,0x5179a95d,0x17044493,0xcc185fec,0x54991a21,0xb3063339,0x0081a726,0xd801ecdb,0x4fa89bbb,0x0149b0c6,0x4391b6b9,0xafe9065a,0xd633f3a3,0xedc92786 .long 0xae6a8e13,0xe408c24a,0x9f3897ab,0x85833fde,0xd81a0715,0x43800e7e,0xb44ffc5f,0xde08e346,0xcdeff2e0,0x7094184c,0x165eaed1,0x49f9387b,0x777c468a,0x635d6129,0x538c2dd8,0x8c0dcfd1 .long 0x7a6a308b,0xd6d9d9e3,0x4c2767d3,0x62375830,0xf38cbeb6,0x874a8bc6,0xccb6fd9e,0xd94d3f1a,0xba21f248,0x92a9735b,0x6cd1efb0,0x272ad0e5,0x05b03284,0x7437b69c,0x6948c225,0xe7f04702 .long 0xcba2ecec,0x8a56c04a,0xe3a73e41,0x0c181270,0x03e93725,0x6cb34e9d,0x496521a9,0xf77c8713,0xfa7f9f90,0x94569183,0x8c9707ad,0xf2e7aa4c,0x26c1c9a3,0xced2c9ba,0x40197507,0x9109fe96 .long 0xe9adfe1c,0x9ae868a9,0x314e39bb,0x3984403d,0xf2fe378f,0xb5875720,0xba44a628,0x33f901e0,0x3652438c,0xea1125fe,0x9dd1f20b,0xae9ec4e6,0xbebf7fbd,0x1e740d9e,0x42dbe79c,0x6dbd3ddc .long 0xedd36776,0x62082aec,0xe9859039,0xf612c478,0x032f7065,0xa493b201,0x4ff9b211,0xebd4d8f2,0xaac4cb32,0x3f23a0aa,0x15ed4005,0xea3aadb7,0xafa27e63,0xacf17ea4,0xc11fd66c,0x56125c1a .long 0x3794f8dc,0x266344a4,0x483c5c36,0xdcca923a,0x3f9d10a0,0x2d6b6bbf,0x81d9bdf3,0xb320c5ca,0x47b50a95,0x620e28ff,0xcef03371,0x933e3b01,0x99100153,0xf081bf85,0xc3a8c8d6,0x183be9a0 .long 0xd6bbe24d,0x4e3ddc5a,0x53843795,0xc6c74630,0x65ec2d4c,0x78193dd7,0xcd3c89b2,0xb8df26cc,0x5a483f8d,0x98dbe399,0x7dd3313a,0x72d8a957,0xab0bd375,0x65087294,0x7c259d16,0xfcd89248 .long 0x7613aa81,0x8a9443d7,0x85fe6584,0x80100800,0x7fb10288,0x70fc4dbc,0xe86beee8,0xf58280d3,0x7c978c38,0x14fdd82f,0x0de44d7b,0xdf1204c1,0x4160252f,0xa08a1c84,0xc17646a5,0x591554ca .long 0xa05bd525,0x214a37d6,0x07957b3c,0x48d5f09b,0xd7109bc9,0x0247cdcb,0x30599ce7,0x40f9e4bb,0xf46ad2ec,0xc325fa03,0xc3e3f9ee,0x00f766cf,0xd43a4577,0xab556668,0x3ee03b93,0x68d30a61 .long 0x77b46a08,0x7ddc81ea,0xc7480699,0xcf5a6477,0x6633f683,0x43a8cb34,0x92363c60,0x1b867e6b,0x1f60558e,0x43921114,0x2f41450e,0xcdbcdd63,0xcc630e8b,0x7fc04601,0x97038b43,0xea7c66d5 .long 0x04e99fd8,0x7259b8a5,0x4785549a,0x98a8dd12,0x840552e1,0x0e459a7c,0x4bb0909e,0xcdfcf4d0,0x53758da7,0x34a86db2,0xeac997e1,0xe643bb83,0x530c5b7e,0x96400bd7,0xb41c8b52,0x9f97af87 .long 0xfbeee3f9,0x34fc8820,0x49091afd,0x93e53490,0x9a31f35c,0x764b9be5,0x57e3d924,0x71f37864,0x943aa75e,0x02fb34e0,0xab8ff6e4,0xa18c9c58,0x33cf0d19,0x080f31b1,0x083518a7,0x5c9682db .long 0xb709c3de,0x873d4ca6,0x3575b8f0,0x64a84262,0x020154bb,0x6275da1f,0xd17cf1ab,0x97678caa,0x951a95c3,0x8779795f,0x50fccc08,0xdd35b163,0x33d8f031,0x32709627,0x498dd85c,0x3c5ab10a .long 0x41dca566,0xb6c185c3,0xd8622aa3,0x7de7feda,0x901b6dfb,0x99e84d92,0x7c4ad288,0x30a02b0e,0x2fd3cf36,0xc7c81daa,0xdf89e59f,0xd1319547,0xcd496733,0xb2be8184,0x93d3412b,0xd5f449eb .long 0x25fe531d,0x7ea41b1b,0x6a1d5646,0xf9797432,0x2bde501a,0x86067f72,0x0c85e89c,0xf91481c0,0xf8b05bc6,0xca8ee465,0x02e83cda,0x1844e1cf,0xb4dbe33b,0xca82114a,0x4eabfde2,0x0f9f8769 .long 0x38b27fe2,0x4936b1c0,0xaba402df,0x63b6359b,0x656bdbab,0x40c0ea2f,0x6580c39c,0x9c992a89,0x2a60aed1,0x600e8f15,0xe0bf49df,0xeb089ca4,0x2d42d99a,0x9c233d7d,0x4c6bc2fa,0x648d3f95 .long 0xe1add3f3,0xdcc383a8,0x4f64a348,0xf42c0c6a,0x0030dbdb,0x2abd176f,0x7d6c215e,0x4de501a3,0x4b9a64bc,0x4a107c1f,0x2496cd59,0xa77f0ad3,0x7688dffb,0xfb78ac62,0x67937d8e,0x7025a2ca .long 
0xd1a8f4e7,0xfde8b2d1,0x7354927c,0xf5b3da47,0xd9205735,0xe48606a3,0xe177b917,0xac477cc6,0xa883239a,0xfb1f73d2,0xcc8b8357,0xe12572f6,0xfb1f4f86,0x9d355e9c,0xd9f3ec6e,0x89b795f8 .long 0xb54398dc,0x27be56f1,0x3fedeed5,0x1890efd7,0x9c6d0140,0x62f77f1f,0x596f0ee4,0x7ef0e314,0xcc61dab3,0x50ca6631,0xf4866e4f,0x4a39801d,0xae363b39,0x66c8d032,0x2ead66aa,0x22c591e5 .long 0xde02a53e,0x954ba308,0xd389f357,0x2a6c060f,0xfbf40b66,0xe6cfcde8,0xc6340ce1,0x8e02fc56,0x73adb4ba,0xe4957795,0xa7b03805,0x7b86122c,0x0c8e6fa6,0x63f83512,0x057d7804,0x83660ea0 .long 0x21ba473c,0xbad79105,0xded5389d,0xb6c50bee,0xaa7c9bc0,0xee2caf4d,0x8c4e98a7,0xd97b8de4,0xab3bbddb,0xa9f63e70,0x2597815a,0x3898aabf,0xac15b3d9,0x7659af89,0x703ce784,0xedf7725b .long 0xe085116b,0x25470fab,0x87285310,0x04a43375,0xe2bfd52f,0x4e39187e,0x7d9ebc74,0x36166b44,0xfd4b322c,0x92ad433c,0xba79ab51,0x726aa817,0xc1db15eb,0xf96eacd8,0x0476be63,0xfaf71e91 .long 0x641fad98,0xdd69a640,0x29622559,0xb7995918,0xde4199dc,0x03c6daa5,0xad545eb4,0x92cadc97,0x256534e4,0x1028238b,0x8595409a,0x73e80ce6,0xd05dc59b,0x690d4c66,0x981dee80,0xc95f7b8f .long 0xd856ac25,0xf4337014,0xac524dca,0x441bd9dd,0x5f0499f5,0x640b3d85,0xd5fda182,0x39cf84a9,0xb2aa95a0,0x04e7b055,0x0ddf1860,0x29e33f0a,0x423f6b43,0x082e74b5,0x0aaa2b0f,0x217edeb9 .long 0x83cbea55,0x58b83f35,0xbc185d70,0xc485ee4d,0x1e5f6992,0x833ff03b,0xcf0c0dd5,0xb5b9b9cc,0x4e9e8a50,0x7caaee8e,0x6269dafd,0x462e907b,0xfbe791c6,0x6ed5cee9,0xed430790,0x68ca3259 .long 0x13b5ba88,0x2b72bdf2,0x35ef0ac4,0x60294c8a,0x19b99b08,0x9c3230ed,0x6c2589aa,0x560fff17,0xd6770374,0x552b8487,0x9a56f685,0xa373202d,0x45f175d9,0xd3e7f907,0xd080d810,0x3c2f315f .long 0x7b9520e8,0x1130e9dd,0x0af037b5,0xc078f9e2,0x1e9c104c,0x38cd2ec7,0xc472fe92,0x0f684368,0x6247e7ef,0xd3f1b5ed,0x396dfe21,0xb32d33a9,0x4a9aa2c2,0x46f59cf4,0xff0f7e41,0x69cd5168 .long 0x4b3234da,0x3f59da0f,0xb4579ebe,0xcf0b0235,0x6d2476c7,0x6d1cbb25,0x9dc30f08,0x4f0837e6,0x906f6e98,0x9a4075bb,0xc761e7d1,0x253bb434,0x6e73af10,0xde2e645f,0x0c5f131c,0xb89a4060 .long 0xb8cc037f,0xd12840c5,0x7405bb47,0x3d093a5b,0x206348b8,0x6202c253,0xc55a3ca7,0xbf5d57fc,0x8c3bef48,0x89f6c90c,0x5a0a960a,0x23ac7623,0x552b42ab,0xdfbd3d6b,0x132061f6,0x3ef22458 .long 0xc97e6516,0xd74e9bda,0xc230f49e,0x88779360,0x1e74ea49,0xa6ec1de3,0x3fb645a2,0x581dcee5,0x8f483f14,0xbaef2391,0xd137d13b,0x6d2dddfc,0xd2743a42,0x54cde50e,0xe4d97e67,0x89a34fc5 .long 0x12e08ce5,0x13f1f5b3,0xa7f0b2ca,0xa80540b8,0x01982805,0x854bcf77,0x233bea04,0xb8653ffd,0x02b0b4c9,0x8e7b8787,0x9acb170a,0x2675261f,0x930c14e5,0x061a9d90,0xdef0abea,0xb59b30e0 .long 0x0200ec7d,0x1dc19ea6,0x0bce132b,0xb6f4a3f9,0xf13e27e0,0xb8d5de90,0x1fade16f,0xbaee5ef0,0xe4c6cf38,0x6f406aaa,0xd1369815,0xab4cfe06,0xefd550c6,0x0dcffe87,0x75ff7d39,0x9d4f59c7 .long 0x51deb6ad,0xb02553b1,0xb1877749,0x812399a4,0xca6006e1,0xce90f71f,0xb02b6e77,0xc32363a6,0xdc36c64d,0x02284fbe,0xa7e1ae61,0x86c81e31,0xb909d94a,0x2576c7e5,0x818b2bb0,0x8b6f7d02 .long 0x56faa38a,0xeca3ed07,0x9305bb54,0xa3790e6c,0x7bc73061,0xd784eeda,0x6dd50614,0xbd56d369,0x229a8aa9,0xd6575949,0x4595ec28,0xdcca8f47,0x06ab4fe6,0x814305c1,0x24f43f16,0xc8c39768 .long 0x523f2b36,0xe2a45f36,0x920d93bb,0x995c6493,0x90f1632b,0xf8afdab7,0x1c295954,0x79ebbecd,0x79592f48,0xc7bb3ddb,0x5f88e998,0x67216a7b,0xbc01193e,0xd91f098b,0xb1db83fc,0xf7d928a5 .long 0xe991f600,0x55e38417,0x2981a934,0x2a91113e,0x06b13bde,0xcbc9d648,0x0755ff44,0xb011b6ac,0x045ec613,0x6f4cb518,0xc2f5930a,0x522d2d31,0x382e65de,0x5acae1af,0x27bc966f,0x57643067 .long 
0x1c7193f0,0x5e12705d,0x3be8858e,0xf0f32f47,0x96c6dfc7,0x785c3d7d,0xbf31795d,0xd75b4a20,0x342659d4,0x91acf17b,0x44f0378f,0xe596ea34,0xce52129d,0x4515708f,0x79f2f585,0x17387e1e .long 0x49dee168,0x72cfd2e9,0x3e2af239,0x1ae05223,0x1d94066a,0x009e75be,0x38abf413,0x6cca31c7,0x9bc49908,0xb50bd61d,0xf5e2bc1e,0x4a9b4a8c,0x946f83ac,0xeb6cc5f7,0xebffab28,0x27da93fc .long 0x4821c8c5,0xea314c96,0xa83c15f4,0x8de49ded,0x7af33004,0x7a64cf20,0xc9627e10,0x45f1bfeb,0x54b9df60,0x878b0626,0xa95c0b33,0x5e4fdc3c,0xc2035d8e,0xe54a37ca,0x80f20b8c,0x9087cda9 .long 0x8319ade4,0x36f61c23,0xde8cfdf8,0x766f287a,0x346f3705,0x48821948,0x16e4f4a2,0x49a7b853,0x5cedadfd,0xb9b3f8a7,0x8db2a815,0x8f562815,0x01f68f95,0xc0b7d554,0x688a208e,0x12971e27 .long 0xd0ff34fc,0xc9f8b696,0x1222718c,0x20824de2,0x0c95284d,0x7213cf9f,0xdc158240,0xe2ad741b,0x54043ccf,0x0ee3a6df,0xd84412b3,0x16ff479b,0xdfc98af0,0xf6c74ee0,0x52fcd2fb,0xa78a169f .long 0x99c930e9,0xd8ae8746,0x49e117a5,0x1d33e858,0x6624759f,0x7581fcb4,0x5bedc01d,0xde50644f,0xcaf3155e,0xbeec5d00,0xbc73e75f,0x672d66ac,0x270b01db,0x86b9d8c6,0x50f55b79,0xd249ef83 .long 0x73978fe3,0x6131d6d4,0x754b00a1,0xcc4e4542,0x57dfcfe9,0x4e05df05,0x51ef6bf0,0x94b29cdd,0x9bc7edf2,0xe4530cff,0xd3da65f3,0x8ac236fd,0xc8eb0b48,0x0faf7d5f,0x660eb039,0x4d2de14c .long 0x60430e54,0xc006bba7,0xda3289ab,0x10a2d0d6,0xd7979c59,0x9c037a5d,0xa116d944,0x04d1f3d3,0x8a0983cd,0x9ff22473,0xc883cabb,0x28e25b38,0x47a58995,0xe968dba5,0x774eebdf,0x2c80b505 .long 0x4a953beb,0xee763b71,0x1642e7f6,0x502e223f,0x61d5e722,0x6fe4b641,0xdbef5316,0x9d37c5b0,0xf8330bc7,0x0115ed70,0x75a72789,0x139850e6,0xffceccc2,0x27d7faec,0x4fd9f7f6,0x3016a860 .long 0x4cd8f64c,0xc492ec64,0x279d7b51,0x58a2d790,0x1fc75256,0x0ced1fc5,0x8f433017,0x3e658aed,0x05da59eb,0x0b61942e,0x0ddc3722,0xba3d60a3,0x742e7f87,0x7c311cd1,0xf6b01b6e,0x6473ffee .long 0x692ac542,0x8303604f,0x227b91d3,0xf079ffe1,0x15aaf9bd,0x19f63e63,0xf1f344fb,0xf99ee565,0xd6219199,0x8a1d661f,0xd48ce41c,0x8c883bc6,0x3c74d904,0x1065118f,0x0faf8b1b,0x713889ee .long 0x81a1b3be,0x972b3f8f,0xce2764a0,0x4f3ce145,0x28c4f5f7,0xe2d0f1cc,0xc7f3985b,0xdeee0c0d,0xd39e25c3,0x7df4adc0,0xc467a080,0x40619820,0x61cf5a58,0x440ebc93,0x422ad600,0x527729a6 .long 0xb1b76ba6,0xca6c0937,0x4d2026dc,0x1a2eab85,0x19d9ae0a,0xb1715e15,0xbac4a026,0xf1ad9199,0x07ea7b0e,0x35b3dfb8,0x3ed9eb89,0xedf5496f,0x2d6d08ab,0x8932e5ff,0x25bd2731,0xf314874e .long 0x3f73f449,0xefb26a75,0x8d44fc79,0x1d1c94f8,0x3bc0dc4d,0x49f0fbc5,0x3698a0d0,0xb747ea0b,0x228d291e,0x5218c3fe,0x43c129d6,0x35b804b5,0xd1acc516,0xfac859b8,0x95d6e668,0x6c10697d .long 0x0876fd4e,0xc38e438f,0x83d2f383,0x45f0c307,0xb10934cb,0x203cc2ec,0x2c9d46ee,0x6a8f2439,0x65ccde7b,0xf16b431b,0x27e76a6f,0x41e2cd18,0x4e3484d7,0xb9c8cf8f,0x8315244a,0x64426efd .long 0xfc94dea3,0x1c0a8e44,0xdad6a0b0,0x34c8cdbf,0x04113cef,0x919c3840,0x15490ffa,0xfd32fba4,0x795dcfb7,0x58d190f6,0x83588baf,0xfef01b03,0xca1fc1c0,0x9e6d1d63,0xf0a41ac9,0x53173f96 .long 0xba16f73b,0x2b1d402a,0x8cf9b9fc,0x2fb31014,0x446ef7bf,0x2d51e60e,0xb91e1745,0xc731021b,0x4fee99d4,0x9d3b4724,0xfac5c1ea,0x4bca48b6,0xbbea9af7,0x70f5f514,0x974c283a,0x751f55a5 .long 0xcb452fdb,0x6e30251a,0x50f30650,0x31ee6965,0x933548d9,0xb0b3e508,0xf4b0ef5b,0xb8949a4f,0x3c88f3bd,0x208b8326,0xdb1d9989,0xab147c30,0x44d4df03,0xed6515fd,0xe72eb0c5,0x17a12f75 .long 0x36cf69db,0x3b59796d,0x56670c18,0x1219eee9,0x7a070d8e,0xfe3341f7,0xa327f90c,0x9b70130b,0x0ae18e0e,0x36a32462,0x46c0a638,0x2021a623,0xc62eb0d4,0x251b5817,0x4c762293,0x87bfbcdf .long 
0xcdd61d64,0xf78ab505,0xc8c18857,0x8c7a53fc,0x16147515,0xa653ce6f,0xea7d52d5,0x9c923aa5,0x5c18871f,0xc24709cb,0x73b3cc74,0x7d53bec8,0xfdd1d4c4,0x59264aff,0x240da582,0x5555917e .long 0x548f5a0e,0xcae8bbda,0x3bbfbbe1,0x1910eaba,0x7677afc3,0xae579685,0x73ff0b5c,0x49ea61f1,0x4f7c3922,0x78655478,0x20c68eef,0x95d337cd,0xdf779ab9,0x68f1e1e5,0xb5cf69a8,0x14b491b0 .long 0x28e3fe89,0x7a6cbbe0,0xc5aac0eb,0xe7e1fee4,0x697e5140,0x7f47eda5,0xb454921f,0x4f450137,0x95cd8185,0xdb625f84,0xcdb2e583,0x74be0ba1,0xdd5e6de4,0xaee4fd7c,0xe8101739,0x4251437d .long 0xac620366,0x686d72a0,0xb6d59344,0x4be3fb9c,0xa1eb75b9,0x6e8b44e7,0x91a5c10c,0x84e39da3,0xb38f0409,0x37cc1490,0x2c2ade82,0x02951943,0x1190a2d8,0x9b688783,0x231182ba,0x25627d14 .long 0x658a6d87,0x6eb550aa,0xcf9c7325,0x1405aaa7,0x5c8748c9,0xd147142e,0x53ede0e0,0x7f637e4f,0x14ffad2c,0xf8ca2776,0xbafb6791,0xe58fb1bd,0xbf8f93fc,0x17158c23,0x0a4a4655,0x7f15b373 .long 0xd842ca72,0x39d4add2,0x3ed96305,0xa71e4391,0x6700be14,0x5bb09cbe,0xd8befcf6,0x68d69d54,0x37183bcf,0xa45f5367,0x3370dff7,0x7152b7bb,0xbf12525b,0xcf887baa,0xd6d1e3cd,0xe7ac7bdd .long 0x81fdad90,0x25914f78,0x0d2cf6ab,0xcf638f56,0xcc054de5,0xb90bc03f,0x18b06350,0x932811a7,0x9bbd11ff,0x2f00b330,0xb4044974,0x76108a6f,0xa851d266,0x801bb9e0,0xbf8990c1,0x0dd099be .long 0xabe32986,0x58c5aaaa,0x50d59c27,0x0fe9dd2a,0x8d307305,0x84951ff4,0x86529b78,0x6c23f829,0x0b136a79,0x50bb2218,0x77a20996,0x7e2174de,0xc0bb4da6,0x6f00a4b9,0xefdde8da,0x89a25a17 .long 0xc11ee01d,0xf728a27e,0xe5f10dfb,0xf900553a,0x02ec893c,0x189a83c8,0x23f66d77,0x3ca5bdc1,0x97eada9f,0x98781537,0x10256230,0x59c50ab3,0x323c69b3,0x346042d9,0x2c460449,0x1b715a6d .long 0x6ae06e0b,0xa41dd476,0x9d42e25f,0xcdd7888e,0x56b25a20,0x0f395f74,0x8700e27e,0xeadfe0ae,0x69950093,0xb09d52a9,0x327f8d40,0x3525d9cb,0x67df886a,0xb8235a94,0x035faec2,0x77e4b0dd .long 0x517d7061,0x115eb20a,0x6c2df683,0x77fe3433,0xcdc6fc67,0x6870ddc7,0x0b87de83,0xb1610588,0xd9c4ddbe,0x343584ca,0x3d754be2,0xb3164f1c,0xc1e6c894,0x0731ed3a,0x4f6b904c,0x26327dec .long 0x97b5cd32,0x9d49c6de,0xb5eceecd,0x40835dae,0xd9ded7fe,0xc66350ed,0x7a678804,0x8aeebb5c,0x5b8ee9ec,0x51d42fb7,0x8e3ca118,0xd7a17bdd,0x2ef4400e,0x40d7511a,0x875a66f4,0xc48990ac .long 0x2199e347,0x8de07d2a,0x2a39e051,0xbee75556,0x916e51dc,0x56918786,0x4a2d89ec,0xeb191313,0x37d341ed,0x6679610d,0x56d51c2b,0x434fbb41,0xd7492dba,0xe54b7ee7,0x59021493,0xaa33a79a .long 0xe4bd6d3d,0x49fc5054,0x5ab551d0,0x09540f04,0x4942d3a6,0x8acc9085,0x2d28323b,0x231af02f,0x0992c163,0x93458cac,0x888e3bb4,0x1fef8e71,0xbe8c268c,0x27578da5,0xe805ec00,0xcc8be792 .long 0xc61c3855,0x29267bae,0x58c1fd3b,0xebff429d,0x8c0b93b8,0x22d886c0,0x2ddb8953,0xca5e00b2,0xc3fed8b7,0xcf330117,0x819c01f6,0xd49ac6fa,0x3c0fbd54,0x6ddaa6bd,0x8049a2cf,0x91743068 .long 0xaff2ef81,0xd67f981e,0x2818ae80,0xc3654d35,0x1b2aa892,0x81d05044,0x3d099328,0x2db067bf,0x703dcc97,0xe7c79e86,0xe133e215,0xe66f9b37,0xe39a7a5c,0xcdf119a6,0x876f1b61,0x47c60de3 .long 0xd860f1b2,0x6e405939,0xf5ed4d4a,0x3e9a1dbc,0xc9b6bcbd,0x3f23619e,0x734e4497,0x5ee790cf,0x5bdaf9bb,0xf0a834b1,0x4ca295f0,0x02cedda7,0xcb8e378c,0x4619aa2b,0xcc987ea4,0xe5613244 .long 0x76b23a50,0x0bc022cc,0x0a6c21ce,0x4a2793ad,0x89cac3f5,0x38328780,0xcba26d56,0x29176f1b,0x4f6f59eb,0x06296187,0x8bdc658e,0x86e9bca9,0x57e30402,0x2ca9c4d3,0x516a09bb,0x5438b216 .long 0x7672765a,0x0a6a063c,0x0547b9bf,0x37a3ce64,0x98b1a633,0x42c099c8,0x05ee6961,0xb5ab800d,0x11a5acd6,0xf1963f59,0x46201063,0xbaee6157,0xa596210a,0x36d9a649,0x1ba7138c,0xaed04363 .long 
0xa4a82b76,0xcf817d1c,0xf3806be9,0x5586960e,0x09dc6bb5,0x7ab67c89,0x114fe7eb,0x52ace7a0,0xcbbc9b70,0xcd987618,0x604ca5e1,0x4f06fd5a,0x6dbde133,0x90af14ca,0x948a3264,0x1afe4322 .long 0xc44b2c6c,0xa70d2ca6,0x0ef87dfe,0xab726799,0x2e696377,0x310f64dc,0x4c8126a0,0x49b42e68,0xcea0b176,0x0ea444c3,0xcb269182,0x53a8ddf7,0xbbba9dcb,0xf3e674eb,0xd8669d33,0x0d2878a8 .long 0xd019b6a3,0x04b935d5,0x406f1e46,0xbb5cf88e,0x5b57c111,0xa1912d16,0x19ebfd78,0x9803fc21,0xc07764a9,0x4f231c9e,0xb75bd055,0xd93286ee,0x8ee6c9de,0x83a9457d,0x6087ec90,0x04695915 .long 0x58d6cd46,0x14c6dd8a,0x8e6634d2,0x9cb633b5,0xf81bc328,0xc1305047,0x26a177e5,0x12ede0e2,0x065a6f4f,0x332cca62,0x67be487b,0xc3a47ecd,0x0f47ed1c,0x741eb187,0xe7598b14,0x99e66e58 .long 0x63d0ff12,0x6f0544ca,0xb610a05f,0xe5efc784,0x7cad7b47,0xf72917b1,0xf2cac0c0,0x3ff6ea20,0xf21db8b7,0xcc23791b,0xd7d93565,0x7dac70b1,0x694bdaad,0x682cda1d,0x1023516d,0xeb88bb8c .long 0xdfdbeb1b,0xc4c634b4,0xb4ee4dea,0x22f5ca72,0xe6524821,0x1045a368,0x052b18b2,0xed9e8a3f,0xb961f49a,0x9b7f2cb1,0x7b009670,0x7fee2ec1,0x22507a6d,0x350d8754,0x4db55f1d,0x561bd711 .long 0x320bbcaf,0x4c189ccc,0xdf1de48c,0x568434cf,0x0fa8f128,0x6af1b00e,0x8907583c,0xf0ba9d02,0x32ff9f60,0x735a4004,0xc25dcf33,0x3dd8e4b6,0x42c74cef,0xf2230f16,0x013fa8ad,0xd8117623 .long 0xf51fe76e,0x36822876,0x11d62589,0x8a6811cc,0x46225718,0xc3fc7e65,0xc82fdbcd,0xb7df2c9f,0xdd7b205b,0x3b1d4e52,0x47a2e414,0xb6959478,0xefa91148,0x05e4d793,0xfd2e9675,0xb47ed446 .long 0x04c9d9bf,0x1a7098b9,0x1b793048,0x661e2881,0xb01ee461,0xb1a16966,0x2954746f,0xbc521308,0x2477de50,0xc909a0fc,0x7dbd51ef,0xd80bb41c,0x53294905,0xa85be7ec,0x83958f97,0x6d465b18 .long 0xfb6840fd,0x16f6f330,0x3401e6c8,0xfaaeb214,0xccb5b4f8,0xaf83d30f,0x266dec4b,0x22885739,0x7bc467df,0x51b4367c,0xd842d27a,0x926562e3,0x0fea14a6,0xdfcb6614,0xf2734cd9,0xeb394dae .long 0x11c0be98,0x3eeae5d2,0x814e8165,0xb1e6ed11,0xe52bce1c,0x191086bc,0xa75a04da,0x14b74cc6,0x8c060985,0x63cf1186,0x2dbd7f7c,0x071047de,0xce0942ca,0x4e433b8b,0xd8fec61d,0xecbac447 .long 0xebf3232f,0x8f0ed0e2,0xc52a2edd,0xfff80f9e,0x75b55fdb,0xad9ab433,0xe42e0c11,0x73ca7820,0xe6251b46,0x6dace0a0,0x4c0d932d,0x89bc6b5c,0x095da19a,0x3438cd77,0x8d48bdfb,0x2f24a939 .long 0x766561b7,0x99b47e46,0x0ed0322a,0x736600e6,0x638e1865,0x06a47cb1,0xcb136000,0x927c1c2d,0x0cc5df69,0x29542337,0x09d649a9,0x99b37c02,0x6aefdb27,0xc5f0043c,0x1be95c27,0x6cdd9987 .long 0x390420d2,0x69850931,0x0983efa4,0x299c40ac,0xaf39aead,0x3a05e778,0x43a45193,0x84274408,0x91a711a0,0x6bcd0fb9,0x9f52ab17,0x461592c8,0xda3c6ed6,0xb49302b4,0x330d7067,0xc51fddc7 .long 0xda50d531,0x94babeb6,0xa6a7b9da,0x521b840d,0x404bdc89,0x5305151e,0xd0d07449,0x1bcde201,0x3b76a59a,0xf427a78b,0x07791a1b,0xf84841ce,0xbf91ed1c,0xebd314be,0xbf172943,0x8e61d34c .long 0x5541b892,0x1d5dc451,0xfc9d9e54,0xb186ee41,0xd5bf610d,0x9d9f345e,0xf6acca9f,0x3e7ba65d,0xa8369486,0x9dda787a,0x8eb5ba53,0x09f9dab7,0xd6481bc3,0x5afb2033,0xafa62104,0x76f4ce30 .long 0xf4f066b5,0xa8fa00cf,0x461dafc2,0x89ab5143,0xa3389998,0x44339ed7,0xbc214903,0x2ff862f1,0xb05556e3,0x2c88f985,0x3467081e,0xcd96058e,0xedc637ea,0x7d6a4176,0x36a5acdc,0xe1743d09 .long 0x7eb37726,0x66fd72e2,0x1481a037,0xf7fa264e,0x45f4aa79,0x9fbd3bde,0x767c3e22,0xed1e0147,0x82e7abe2,0x7621f979,0x45f633f8,0x19eedc72,0x6137bf3a,0xe69b155e,0x414ee94e,0xa0ad13ce .long 0x1c0e651a,0x93e3d524,0x02ce227e,0xab1a6e2a,0x4ab27eca,0xe7af1797,0xbd444f39,0x245446de,0x56c07613,0x59e22a21,0xf4275498,0x43deafce,0x67fd0946,0x10834ccb,0x47406edf,0xa75841e5 .long 
0x7b0ac93d,0xebd6a677,0x78f5e0d7,0xa6e37b0d,0x76f5492b,0x2516c096,0x9ac05f3a,0x1e4bf888,0x4df0ba2b,0xcdb42ce0,0x5062341b,0x935d5cfd,0x82acac20,0x8a303333,0x5198b00e,0x429438c4 .long 0x049d33fa,0x1d083bc9,0x946f67ff,0x58b82dda,0x67a1d6a3,0xac3e2db8,0x1798aac8,0x62e6bead,0xde46c58c,0xfc85980f,0x69c8d7be,0xa7f69379,0x837b35ec,0x23557927,0xe0790c0c,0x06a933d8 .long 0x077ff55d,0x827c0e9b,0xbb26e680,0x53977798,0x1d9cb54f,0x59530874,0x4aac53ef,0xcca3f449,0xa07eda0f,0x11dc5c87,0xfd6400c8,0xc138bccf,0x13e5da72,0x549680d3,0x4540617e,0xc93eed82 .long 0x4d0b75c0,0xfd3db157,0x6386075b,0x9716eb42,0x817b2c16,0x0639605c,0xf1e4f201,0x09915109,0x5cca6c3b,0x35c9a928,0x3505c900,0xb25f7d1a,0x630480c4,0xeb9f7d20,0x2a1a501c,0xc3c7b8c6 .long 0x5a1f8e24,0x3f99183c,0x9dd255f0,0xfdb118fa,0xc27f62a6,0xb9b18b90,0x396ec191,0xe8f732f7,0x0be786ab,0x524a2d91,0x0ac5a0f5,0x5d32adef,0x9725f694,0x9b53d4d6,0x0510ba89,0x032a76c6 .long 0xebeb1544,0x840391a3,0x3ed73ac3,0x44b7b88c,0x256cb8b3,0xd24bae7a,0xe394cb12,0x7ceb151a,0x5bc1e6a8,0xbd6b66d0,0x090f07bf,0xec70cecb,0x7d937589,0x270644ed,0x5f1dccfe,0xee9e1a3d .long 0x745b98d2,0xb0d40a84,0x2556ed40,0xda429a21,0x85148cb9,0xf676eced,0xded18936,0x5a22d40c,0x70e8a4ce,0x3bc4b9e5,0x9eae0379,0xbfd1445b,0x1a0bd47e,0xf23f2c0c,0xe1845531,0xa9c0bb31 .long 0x0a4c3f6b,0x9ddc4d60,0x2c15ef44,0xbdfaad79,0x7f484acc,0xce55a236,0x055b1f15,0x08653ca7,0x538873a3,0x2efa8724,0xace1c7e7,0x09299e5d,0xade332ba,0x07afab66,0x92dd71b7,0x9be1fdf6 .long 0x5758b11c,0xa49b5d59,0xc8654f40,0x0b852893,0x52379447,0xb63ef6f4,0x105e690c,0xd4957d29,0x646559b0,0x7d484363,0x49788a8e,0xf4a8273c,0x34ce54a9,0xee406cb8,0xf86fda9b,0x1e1c260f .long 0xcf6a4a81,0xe150e228,0x1b488772,0x1fa3b6a3,0xc5a9c15b,0x1e6ff110,0x8ad6aa47,0xc6133b91,0x9dffa978,0x8ac5d55c,0x5f3965f2,0xba1d1c1d,0x7732b52f,0xf969f4e0,0xa5172a07,0xfceecdb5 .long 0x10f2b8f5,0xb0120a5f,0x5c4c2f63,0xc83a6cdf,0xf8f9c213,0x4d47a491,0xd3f1bbd5,0xd9e1cce5,0xaba7e372,0x0d91bc7c,0xdfd1a2db,0xfcdc74c8,0x374618e5,0x05efa800,0x15a7925e,0x11216969 .long 0xf6021c5d,0xd4c89823,0xeff14423,0x880d5e84,0x6dcd1396,0x6523bc5a,0x113c978b,0xd1acfdfc,0xbbb66840,0xb0c164e8,0x72b58459,0xf7f4301e,0xa638e8ec,0xc29ad4a6,0x46b78699,0xf5ab8961 .long 0x0e954750,0x9dbd7974,0x64f9d2c6,0x0121de88,0xd985232e,0x2e597b42,0x53451777,0x55b6c3c5,0x519cb9fb,0xbb53e547,0x8428600d,0xf134019f,0xe081791a,0x5a473176,0x35fb0c08,0x2f3e2263 .long 0x73d273b0,0xb28c3017,0x7721ef9a,0xccd21076,0xb650dc39,0x054cc292,0x6188045e,0x662246de,0x6b83c0d1,0x904b52fa,0x97e9cd46,0xa72df267,0x899725e4,0x886b43cd,0xd849ff22,0x2b651688 .long 0x02f34533,0x60479b79,0x0c77c148,0x5e354c14,0xa8537c78,0xb4bb7581,0xefe1495f,0x188043d7,0x8c1d5026,0x9ba12f42,0x93d4aaab,0x2e0c8a26,0xaa57c450,0xbdba7b8b,0x9bbdafef,0x140c9ad6 .long 0x25ac0f18,0x2067aa42,0x04d1fbf3,0xf7b1295b,0xa4b04824,0x14829111,0x33bd5e91,0x2ce3f192,0x8f2e1b72,0x9c7a1d55,0x302aa243,0xfe932286,0xd4be9554,0x497ca7b4,0xe0547a6e,0xb8e821b8 .long 0x67e573e0,0xfb2838be,0x4084c44b,0x05891db9,0x96c1c2c5,0x91311373,0xd958444b,0x6aebfa3f,0xe56e55c1,0xac9cdce9,0x2caa46d0,0x7148ced3,0xb61fe8eb,0x2e10c7ef,0xff97cf4d,0x9fd835da .long 0x081e9387,0xa36da109,0x8c935828,0xfb9780d7,0xe540b015,0xd5940332,0xe0f466fa,0xc9d7b51b,0xd6d9f671,0xfaadcd41,0xb1a2ac17,0xba6c1e28,0xed201e5f,0x066a7833,0xf90f462b,0x19d99719 .long 0x060b5f61,0xf431f462,0x7bd057c2,0xa56f46b4,0x47e1bf65,0x348dca6c,0x41bcf1ff,0x9a38783e,0xda710718,0x7a5d33a9,0x2e0aeaf6,0x5a779987,0x2d29d187,0xca87314d,0xc687d733,0xfa0edc3e .long 
0x6a31e09b,0x9df33621,0xc1350e35,0xde89e44d,0x4ca0cf52,0x29214871,0x0b88a538,0xdf379672,0x2591d61b,0xc92a510a,0x585b447b,0x79aa87d7,0xe5287f77,0xf67db604,0x5efe7a80,0x1697c8bf .long 0xcb198ac7,0x1c894849,0x0f264665,0xa884a93d,0x9b200678,0x2da964ef,0x009834e6,0x3c351b87,0xe2c4b44b,0xafb2ef9f,0x3326790c,0x580f6c47,0x0b02264a,0xb8480521,0x42a194e2,0x8ba6f9e2 .long 0x8fb54738,0xfc87975f,0x27c3ead3,0x35160788,0xb74a085a,0x834116d2,0xa62fe996,0x53c99a73,0x5b81c51b,0x87585be0,0xbe0852b7,0x925bafa8,0xa84d19a7,0x76a4fafd,0x585206d4,0x39a45982 .long 0x5eb03c0e,0x499b6ab6,0x72bc3fde,0xf19b7954,0x6e3a80d2,0xa86b5b9c,0x6d42819f,0xe4377508,0xbb3ee8a3,0xc1663650,0xb132075f,0x75eb14fc,0x7ad834f6,0xa8ccc906,0xe6e92ffd,0xea6a2474 .long 0x0f8d6758,0x9d72fd95,0x408c07dd,0xcb84e101,0xa5e23221,0xb9114bfd,0xe94e742c,0x358b5fe2,0x95f40e75,0x1c0577ec,0x3d73f3d6,0xf0155451,0xbd1b9b66,0x9d55cd67,0xaf8d63c7,0x63e86e78 .long 0xd3c095f1,0x39d934ab,0xe4b76d71,0x04b261be,0xe73e6984,0x1d2e6970,0x5e5fcb11,0x879fb23b,0xdfd75490,0x11506c72,0x61bcf1c1,0x3a97d085,0xbf5e7007,0x43201d82,0x798232a7,0x7f0ac52f .long 0x6eb564d4,0x2715cbc4,0x9e570e29,0x8d6c752c,0x9ef5fd5d,0xf80247c8,0xd53eb514,0xc3c66b46,0x0f87de56,0x9666b401,0xc6c603b5,0xce62c06f,0x7e4fc942,0xae7b4c60,0x663a9c19,0x38ac0b77 .long 0x4b049136,0xcb4d20ee,0x356a4613,0x8b63bf12,0x70e08128,0x1221aef6,0x4acb6b16,0xe62d8c51,0x379e7896,0x71f64a67,0xcafd7fa5,0xb25237a2,0x3841ba6a,0xf077bd98,0x3cd16e7e,0xc4ac0244 .long 0x21fea4ca,0x548ba869,0xf3dfdac1,0xd36d0817,0xf4685faf,0x09d8d71f,0xc52c459a,0x8eff66be,0x0b57235e,0x182faee7,0x0106712b,0xee3c39b1,0xc0fcdcb0,0x5107331f,0xa51054ba,0x669fb9dc .long 0x319d7682,0xb25101fb,0x0a982fee,0xb0293129,0x0261b344,0x51c1c9b9,0xbfd371fa,0x0e008c5b,0x0278ca33,0xd866dd1c,0xe5aa53b1,0x666f76a6,0x6013a2cf,0xe5cfb779,0xa3521836,0x1d3a1aad .long 0x73faa485,0xcedd2531,0xc0a76878,0xc8ee6c4f,0x2a11667d,0xddbccfc9,0x1c2f695a,0x1a418ea9,0x51f73971,0xdb11bd92,0xda2ed89f,0x3e4b3c82,0xe73e0319,0x9a44f3f4,0x303431af,0xd1e3de0f .long 0x50f75f9c,0x3c5604ff,0x7e752b22,0x1d8eddf3,0x3c9a1118,0x0ef074dd,0xccb86d7b,0xd0ffc172,0x037d90f2,0xabd1ece3,0x6055856c,0xe3f307d6,0x7e4c6daf,0x422f9328,0x334879a0,0x902aac66 .long 0x94cdfade,0xb6a1e7bf,0x7fc6d634,0x6c97e1ed,0xa2fb63f8,0x662ad24d,0xa5928405,0xf81be1b9,0xd14b4206,0x86d765e4,0x8fa0db65,0xbecc2e0e,0xb17fc76c,0xa28838e0,0xe37cf24e,0xe49a602a .long 0x567193ec,0x76b4131a,0xe5f6e70b,0xaf3c305a,0x031eebdd,0x9587bd39,0x71bbe831,0x5709def8,0x0eb2b669,0x57059983,0x875b7029,0x4d80ce1b,0x0364ac16,0x838a7da8,0xbe1c83ab,0x2f431d23 .long 0xf9294dd3,0xe56812a6,0x9b4b0d77,0xb448d01f,0x04e8305c,0xf3ae6061,0x94d8c63e,0x2bead645,0x84fd8b07,0x0a85434d,0xf7a9dee5,0x537b983f,0xef55bd85,0xedcc5f18,0x21c6cf8b,0x2041af62 .long 0xb940c71e,0x8e52874c,0xdb5f4b3a,0x211935a9,0x301b1dc3,0x94350492,0x29958620,0x33d2646d,0xef911404,0x16b0d64b,0x9a3c5ef4,0x9d1f25ea,0x4a352c78,0x20f200eb,0x4bd0b428,0x43929f2c .long 0xc7196e29,0xa5656667,0x9391be48,0x7992c2f0,0x9ee0cd6e,0xaaa97cbd,0x3dc8c9bf,0x51b0310c,0xdd9f22cb,0x237f8acf,0xb585d584,0xbb1d81a1,0x8c416388,0x8d5d85f5,0x42fe474f,0x0d6e5a5a .long 0x38235d4e,0xe7812766,0x496e3298,0x1c62bd67,0x3f175bc8,0x8378660c,0x17afdd4d,0x4d04e189,0x85a8068c,0x32a81601,0x92b29a85,0xdb58e4e1,0xc70d8a3b,0xe8a65b86,0x98a0403b,0x5f0e6f4e .long 0x69ed2370,0x08129684,0x0871ee26,0x34dc30bd,0x7c9c5b05,0x3a5ce948,0x43a90c87,0x7d487b80,0xdd0e7179,0x4089ba37,0xb4041811,0x45f80191,0x98747ba5,0x1c3e1058,0x6e1ae592,0x98c4e13a .long 
0xe82c9f9e,0xd44636e6,0xc33a1043,0x711db87c,0xaa8aec05,0x6f431263,0x2744a4aa,0x43ff120d,0xae77779b,0xd3bd892f,0x8cdc9f82,0xf0fe0cc9,0xf1c5b1bc,0xca5f7fe6,0x44929a72,0xcc63a682 .long 0x09dbe19a,0xc7eaba0c,0x6b5c73c2,0x2f3585ad,0x0ae50c30,0x8ab8924b,0x638b30ba,0x17fcd27a,0x10b3d5a5,0xaf414d34,0x2a9accf1,0x09c107d2,0x946a6242,0x15dac49f,0xd707d642,0xaec3df2a .long 0x3f894ae0,0x2c2492b7,0xb75f18ce,0xf59df3e5,0x8f53cad0,0x7cb740d2,0xc4f01294,0x3eb585fb,0x32c7f717,0x17da0c86,0xaf943f4c,0xeb8c795b,0xf67c51d2,0x4ee23fb5,0x68889949,0xef187575 .long 0x0389168b,0xa6b4bdb2,0xea577d03,0xc4ecd258,0x55743082,0x3a63782b,0xc72f08cd,0x6f678f4c,0x65e58dd8,0x553511cf,0xd402c0cd,0xd53b4e3e,0xa037c14c,0x37de3e29,0xc05712aa,0x86b6c516 .long 0xb38dff6f,0x2834da3e,0xea636be8,0xbe012c52,0x61dd37f8,0x292d238c,0x8f8142db,0x0e54523f,0x036a05d8,0xe31eb436,0x1e93c0ff,0x83e3cdff,0x50821ddf,0x3fd2fe0f,0xff9eb33b,0xc8e19b0d .long 0xb569a5fe,0xc8cc943f,0xd4342d75,0xad0090d4,0xcaeca000,0x82090b4b,0x1bd410eb,0xca39687f,0x65959d77,0xe7bb0df7,0x9c964999,0x39d78218,0xb2415451,0xd87f62e8,0xbed76108,0xe5efb774 .long 0xe822f0d0,0x3ea011a4,0x5a8704f8,0xbc647ad1,0x50c6820f,0xbb315b35,0xb7e76bec,0x863dec3d,0xf017bfc7,0x01ff5d3a,0x976b8229,0x20054439,0x0bbd0d3b,0x067fca37,0x7f5e3d0f,0xf63dde64 .long 0x2a4c94e9,0x22dbefb3,0x96f8278a,0xafbff0fe,0x3503793d,0x80aea0b1,0x5f06cd29,0xb2238029,0x8ec3feca,0x65703e57,0x393e7053,0x06c38314,0x7c6734c4,0xa0b751eb,0xc59f0f1e,0xd2e8a435 .long 0x5e9ca895,0x147d9052,0x972072df,0x2f4dd31e,0xe6c6755c,0xa16fda8e,0xcf196558,0xc66826ff,0x0cf43895,0x1f1a76a3,0x83c3097b,0xa9d604e0,0x66390e0e,0xe1908309,0xb3c85eff,0xa50bf753 .long 0xf6a70251,0x0696bdde,0x3c6ab16a,0x548b801b,0xa4d08762,0x37fcf704,0xdff76c4e,0x090b3def,0x69cb9158,0x87e8cb89,0x995ece43,0x44a90744,0x0ad9fbf5,0xf85395f4,0x4fb0c82d,0x49b0f6c5 .long 0xadf7cccf,0x75d9bc15,0xdfa1e1b0,0x81a3e5d6,0x249bc17e,0x8c39e444,0x8ea7fd43,0xf37dccb2,0x907fba12,0xda654873,0x4a372904,0x35daa6da,0x6283a6c5,0x0564cfc6,0x4a9395bf,0xd09fa4f6 .long 0xaeb19a36,0x688e9ec9,0xc7bfbfb4,0xd913f1ce,0x61c2faa6,0x797b9a3c,0x6a0a9c12,0x2f979bec,0x359679ec,0xb5969d0f,0x079b0460,0xebcf523d,0x10fab870,0xfd6b0008,0x9373a39c,0x3f2edcda .long 0x6f568431,0x0d64f9a7,0x02f8898c,0xf848c27c,0x260b5bd5,0xf418ade1,0x6973dee8,0xc1f3e323,0x26c185dd,0x46e9319c,0x546f0ac4,0x6d85b7d8,0x247f9d57,0x427965f2,0xb0035f48,0xb519b636 .long 0xab87d59c,0x6b6163a9,0x39caaa11,0xff9f58c3,0x3177387b,0x4ac39cde,0x873e77f9,0x5f6557c2,0x36a83041,0x67504006,0x75ef196c,0x9b1c96ca,0xb08c7940,0xf34283de,0x1128c316,0x7ea09644 .long 0x6aa39dff,0xb510b3b5,0x9f8e4d8c,0x59b43da2,0x9e4c4b9f,0xa8ce31fd,0xc1303c01,0x0e20be26,0xe8ee47c9,0x18187182,0x7db98101,0xd9687cdb,0xa1e14ff6,0x7a520e4d,0x8836d572,0x429808ba .long 0x4944b663,0xa37ca60d,0xa3f91ae5,0xf901f7a9,0x9e36e3b1,0xe4e3e76e,0x29d93250,0x9aa219cf,0x056a2512,0x347fe275,0xde65d95c,0xa4d643d9,0x699fc3ed,0x9669d396,0xcf8c6bbe,0xb598dee2 .long 0xdda9e5c6,0x682ac1e5,0xcaa9fc95,0x4e0d3c72,0x772bea44,0x17faaade,0xab0009c8,0x5ef8428c,0x460ff016,0xcc4ce47a,0x725281cb,0xda6d12bf,0x0223aad2,0x44c67848,0x36256e28,0x6e342afa .long 0x93a37c04,0x1400bb0b,0xdd10bd96,0x62b1bc9b,0x0dac46b7,0x7251adeb,0x7be4ef51,0x7d33b92e,0xe61fa29a,0x28b2a94b,0x06422233,0x4b2be13f,0x330d8d37,0x36d6d062,0xb28ca005,0x5ef80e1e .long 0x6d16768e,0x174d4699,0x628bf217,0x9fc4ff6a,0x154e490d,0x77705a94,0x8d2d997a,0x9d96dd28,0xce5d72c4,0x77e2d9d8,0xc11c714f,0x9d06c5a4,0x79e4a03e,0x02aa5136,0x030ff28b,0x1386b3c2 .long 
0xfb283f61,0xfe82e8a6,0xf3abc3fb,0x7df203e5,0x3a4d3622,0xeec7c351,0xdf762761,0xf7d17dbf,0x522055f0,0xc3956e44,0x8fa748db,0xde3012db,0xbf1dcc14,0xca9fcb63,0xbe4e2f3a,0xa56d9dcf .long 0x8bcec9c2,0xb86186b6,0x680b9f06,0x7cf24df9,0xc0d29281,0xc46b45ea,0x07b10e12,0xfff42bc5,0x4d289427,0x12263c40,0xb4848ec4,0x3d5f1899,0xd040800c,0x11f97010,0x300feb20,0xb4c5f529 .long 0xde94fdcb,0xcc543f8f,0xc7c2f05e,0xe96af739,0x882692e1,0xaa5e0036,0x950d4ae9,0x09c75b68,0xb5932a7a,0x62f63df2,0xde0979ad,0x2658252e,0xb5e69631,0x2a19343f,0x525b666b,0x718c7501 .long 0xea40dc3a,0x26a42d69,0xaecc018f,0xdc84ad22,0x3270f04a,0x25c36c7b,0x50fa72ed,0x46ba6d47,0x93e58a8e,0x6c37d1c5,0x120c088c,0xa2394731,0xcb6e86da,0xc3be4263,0x7126d038,0x2c417d36 .long 0x8b6f8efa,0x5b70f9c5,0x37718536,0x671a2faa,0xb539c92b,0xd3ced3c6,0xa31203c2,0xe56f1bd9,0x9ff3c8eb,0x8b096ec4,0x43491cea,0x2deae432,0x17943794,0x2465c6eb,0x20586843,0x5d267e66 .long 0xb07159d0,0x9d3d116d,0xc1896210,0xae07a67f,0xbb961579,0x8fc84d87,0x1c1f8dd6,0x30009e49,0xe3132819,0x8a8caf22,0xf23ab4ff,0xcffa197c,0x205dd687,0x58103a44,0x0ded67a2,0x57b796c3 .long 0xa1779ad7,0x0b9c3a6c,0x357c09c5,0xa33cfe2e,0x3db4a57e,0x2ea29315,0x8ebeb52e,0x91959695,0xe546c879,0x118db9a6,0x6295c8d6,0x8e996df4,0x55ec806b,0xdd990484,0x165c1035,0x24f291ca .long 0x440e2229,0xcca523bb,0x73ef4d04,0x324673a2,0x3e11ec39,0xaf3adf34,0xdc5968d3,0x6136d7f1,0xb053a927,0x7a7b2899,0xae067ecd,0x3eaa2661,0x02779cd9,0x8549b9c8,0xc53385ea,0x061d7940 .long 0xf06d18bd,0x3e0ba883,0xb2700843,0x4ba6de53,0x591a9e4d,0xb966b668,0x7f4fa0ed,0x93f67567,0x4347237b,0x5a02711b,0xe794608e,0xbc041e2f,0x70f73d8c,0x55af10f5,0xbb7564f7,0xd2d4d4f7 .long 0xb3e93ce7,0xd7d27a89,0x5d3a2c1b,0xf7b5a875,0x255b218a,0xb29e68a0,0x8af76754,0xb533837e,0x579fab2e,0xd1b05a73,0xecd74385,0xb41055a1,0x445e9115,0xb2369274,0xf520274e,0x2972a7c4 .long 0xf678e68a,0x6c08334e,0x99b057ed,0x4e4160f0,0x52ccb69a,0x3cfe11b8,0x21c8f772,0x2fd1823a,0x3298f055,0xdf7f072f,0xfec74a6e,0x8c0566f9,0x5bb4d041,0xe549e019,0x9208d850,0x7c3930ba .long 0xaaa2902b,0xe07141fc,0xe4f69ad3,0x539ad799,0x813f9ffd,0xa6453f94,0x375bc2f7,0xc58d3c48,0x5dc64e96,0xb3326fad,0xb240e354,0x3aafcaa9,0xaca1e7a9,0x1d1b0903,0x1211b8a0,0x4ceb9767 .long 0xe32a858e,0xeca83e49,0xae907bad,0x4c32892e,0x2eb9b494,0xd5b42ab6,0x1eabae1b,0x7fde3ee2,0xcaf54957,0x13b5ab09,0xe5f5d5d5,0xbfb028be,0x2003e2c0,0x928a0650,0x67476843,0x90793aac .long 0xc81710a0,0x5e942e79,0x27ccadd4,0x557e4a36,0x4bcf6d0c,0x72a2bc56,0x26d7b80c,0x09ee5f43,0xd4292f19,0x6b70dbe9,0x63f16b18,0x56f74c26,0x35fbb42a,0xc23db0f7,0x6ae10040,0xb606bdf6 .long 0x044573ac,0x1eb15d4d,0x556b0ba4,0x7dc3cf86,0xc60df6f7,0x97af9a33,0xa716ce8c,0x0b1ef85c,0xc96958be,0x2922f884,0x35690963,0x7c32fa94,0xeaa00061,0x2d7f667c,0x3547365c,0xeaaf7c17 .long 0x87032d58,0x1eb4de46,0x5e2c79e0,0xc54f3d83,0x5d04ef23,0x07818df4,0x673d41b4,0x55faa9c8,0x89b95355,0xced64f6f,0xb7415c84,0x4860d2ea,0x050ebad3,0x5fdb9bd2,0x6685a5bf,0xdb53e0cc .long 0x9feb6593,0xb830c031,0x6accff17,0xdd87f310,0x9f555c10,0x2303ebab,0x287e7065,0x94603695,0x2e83358c,0xf88311c3,0xeefb0178,0x508dd9b4,0x2dba8652,0x7ca23706,0x0047abe5,0x62aac5a3 .long 0x8b1ea7b3,0x9a61d2a0,0xae8b1485,0xd495ab63,0x87052f99,0x38740f84,0xb2974eea,0x178ebe5b,0x5b36d17f,0x030bbcca,0xaaf86eea,0xb5e4cce3,0x68f8e9e0,0xb51a0220,0x09eb3e75,0xa4348796 .long 0xeef1a752,0xbe592309,0x6f2aa1ed,0x5d7162d7,0x0f007dd2,0xaebfb5ed,0xc89edd22,0x255e14b2,0x0303b697,0xba85e072,0xf05720ff,0xc5d17e25,0x5128ebb6,0x02b58d6e,0xd754e113,0x2c80242d .long 
0xabfae1ca,0x919fca5f,0x1a21459b,0x937afaac,0x1f66a4d2,0x9e0ca91c,0x23ec1331,0x194cc7f3,0x8aa11690,0xad25143a,0x09b59e08,0xbe40ad8d,0xe750860a,0x37d60d9b,0xc6bf434c,0x6c53b008 .long 0x1356eb80,0xb572415d,0x9578ded8,0xb8bf9da3,0x5e8fb38b,0x22658e36,0x5af8cb22,0x9b70ce22,0x829a8180,0x7c00018a,0xb81ed295,0x84329f93,0x5f3cea83,0x7c343ea2,0x67586536,0x38f8655f .long 0x1d3ec517,0xa661a0d0,0x512321ae,0x98744652,0xeca92598,0x084ca591,0x1dcb3feb,0xa9bb9dc9,0x78b4c240,0x14c54355,0x610cafdc,0x5ed62a3b,0x1b38846b,0x07512f37,0xb0e38161,0x571bb70a .long 0x2da705d2,0xb556b95b,0xb1a08f98,0x3ef8ada6,0xddecfbe5,0x85302ca7,0x943105cd,0x0e530573,0x21a9255d,0x60554d55,0xf2f3802a,0x63a32fa1,0xcd477875,0x35c8c5b0,0x6ad42da1,0x97f458ea .long 0xeb6b242d,0x832d7080,0x3b71e246,0xd30bd023,0xbe31139d,0x7027991b,0x462e4e53,0x68797e91,0x6b4e185a,0x423fe20a,0x42d9b707,0x82f2c67e,0x4cf7811b,0x25c81768,0x045bb95d,0xbd53005e .long 0x9d8e68fd,0xe5f649be,0x1b044320,0xdb0f0533,0xe0c33398,0xf6fde9b3,0x66c8cfae,0x92f4209b,0x1a739d4b,0xe9d1afcc,0xa28ab8de,0x09aea75f,0xeac6f1d0,0x14375fb5,0x708f7aa5,0x6420b560 .long 0x6254dc41,0x9eae499c,0x7a837e7e,0x7e293924,0x090524a7,0x74aec08c,0x8d6f55f2,0xf82b9219,0x1402cec5,0x493c962e,0xfa2f30e7,0x9f17ca17,0xe9b879cb,0xbcd783e8,0x5a6f145f,0xea3d8c14 .long 0x5e0dee6e,0xdede15e7,0xdc628aa2,0x74f24872,0x7861bb93,0xd3e9c4fe,0x6187b2e0,0x56d4822a,0xc59826f9,0xb66417cf,0x2408169e,0xca260969,0xc79ef885,0xedf69d06,0xdc7d138f,0x00031f8a .long 0x0ebcf726,0x103c46e6,0x6231470e,0x4482b831,0x487c2109,0x6f6dfaca,0x62e666ef,0x2e0ace97,0x1f8d1f42,0x3246a9d3,0x574944d2,0x1b1e83f1,0xa57f334b,0x13dfa63a,0x9f025d81,0x0cf8daed .long 0x00ee11c1,0x30d78ea8,0xb5e3dd75,0xeb053cd4,0xd58c43c5,0x9b65b13e,0xbd151663,0xc3ad49bd,0xb6427990,0x99fd8e41,0x707eae1e,0x12cf15bd,0x1aabb71e,0x29ad4f1b,0x07545d0e,0x5143e74d .long 0xc88bdee1,0x30266336,0x5876767c,0x25f29306,0xc6731996,0x9c078571,0xed552951,0xc88690b2,0x852705b4,0x274f2c2d,0x4e09552d,0xb0bf8d44,0x986575d1,0x7628beeb,0x7f864651,0x407be238 .long 0xa639fc6b,0x0e5e3049,0x86003625,0xe75c35d9,0x5dcc1646,0x0cf35bd8,0x6c26273a,0x8bcaced2,0xb5536742,0xe22ecf1d,0x1a9e068b,0x013dd897,0x8a7909c5,0x17f411cb,0x861dd506,0x5757ac98 .long 0x1e935abb,0x85de1f0d,0x154de37a,0xdefd10b4,0x369cebb5,0xb8d9e392,0x761324be,0x54d5ef9b,0x74f17e26,0x4d6341ba,0x78c1dde4,0xc0a0e3c8,0x87d918fd,0xa6d77581,0x02ca3a13,0x66876015 .long 0xf36658f0,0xc7313e9c,0x71f8057e,0xc433ef1c,0x1b6a835a,0x85326246,0x7c86394c,0xc8f05398,0xe983c4a1,0xff398cdf,0x03b7b931,0xbf5e8162,0xb7b9045b,0x93193c46,0xa4a6e46b,0x1e4ebf5d .long 0x43a24fe7,0xf9942a60,0xffb3492b,0x29c1191e,0x902fde05,0x9f662449,0x6713c32d,0xc792a7ac,0xb737982c,0x2fd88ad8,0xa21e60e3,0x7e3a0319,0x7383591a,0x09b0de44,0x8310a456,0x6df141ee .long 0xe6d6f471,0xaec1a039,0x1198d12e,0x14b2ba0f,0x3aeee5ac,0xebc1a160,0xe0b964ce,0x401f4836,0x4fd03f66,0x2ee43796,0xdd8f3f12,0x3fdb4e49,0x29380f18,0x6ef267f6,0x8da64d16,0x3e8e9670 .long 0x207674f1,0xbc19180c,0x33ae8fdb,0x112e09a7,0x6aaeb71e,0x99667554,0xe101b1c7,0x79432af1,0xde2ddec6,0xd5eb558f,0x5357753f,0x81392d1f,0x3ae1158a,0xa7a76b97,0x4a899991,0x416fbbff .long 0x0d4a9dcf,0x9e65fdfd,0x944ddf12,0x7bc29e48,0x3c856866,0xbc1a92d9,0x6e98dfe2,0x273c6905,0xcdfaa6b8,0x69fce418,0x5061c69f,0x606bd823,0x6af75e27,0x42d495a0,0x6d873a1f,0x8ed3d505 .long 0x6ab25b6a,0xaf552841,0x2b1a4523,0xc6c0ffc7,0x21c99e03,0xab18827b,0x9034691b,0x060e8648,0x93c7f398,0x5207f90f,0x82f8d10b,0x9f4a96cb,0x3ad0f9e3,0xdd71cd79,0xfc3a54f5,0x84f435d2 .long 
0x8e33787f,0x4b03c55b,0xa6384673,0xef42f975,0x5051b9f0,0xff7304f7,0x741c87c2,0x18aca1dc,0x2d4bfe80,0x56f120a7,0x053e732c,0xfd823b3d,0x7537ca16,0x11bccfe4,0x1b5a996b,0xdf6c9c74 .long 0x904fc3fa,0xee7332c7,0xc7e3636a,0x14a23f45,0xf091d9aa,0xc38659c3,0xb12d8540,0x4a995e5d,0xf3a5598a,0x20a53bec,0xb1eaa995,0x56534b17,0xbf04e03c,0x9ed3dca4,0xd8d56268,0x716c563a .long 0x1d6178e7,0x27ba77a4,0x68a1ff8e,0xe4c80c40,0x0a13f63d,0x75011099,0xa61d46f3,0x7bf33521,0x10b365bb,0x0aff218e,0x0fd7ea75,0x81021804,0xa4b3a925,0x05a3fd8a,0x9b3db4e6,0xb829e75f .long 0x4d53e5fb,0x6bdc75a5,0xd52717e3,0x04a5dc02,0xe9a42ec2,0x86af502f,0x2630e382,0x8867e8fb,0xbec9889b,0xbf845c6e,0xcb47c98d,0x54f491f2,0x790c2a12,0xa3091fba,0xc20f708b,0xd7f6fd78 .long 0xacde5e17,0xa569ac30,0x6852b4d7,0xd0f996d0,0x4609ae54,0xe51d4bb5,0x0daed061,0x3fa37d17,0x34b8fb41,0x62a88684,0x9efb64f1,0x99a2acbd,0x6448e1f2,0xb75c1a5e,0x42b5a069,0xfa99951a .long 0x2f3b26e7,0x6d956e89,0xda875247,0xf4709860,0x2482dda3,0x3ad15179,0x017d82f0,0xd64110e3,0xfad414e4,0x14928d2c,0x2ed02b24,0x2b155f58,0xcb821bf1,0x481a141b,0x4f81f5da,0x12e3c770 .long 0x9fff8381,0xe49c5de5,0x5bbec894,0x11053232,0x454d88c4,0xa0d051cc,0x1f8e531b,0x4f6db89c,0xca563a44,0x34fe3fd6,0x58da8ab9,0x7f5c2215,0x9474f0a1,0x8445016d,0xcb7d8a0a,0x17d34d61 .long 0x1c474019,0x8e9d3910,0xd52ceefb,0xcaff2629,0xc1622c2b,0xf9cf3e32,0xe9071a05,0xd4b95e3c,0x1594438c,0xfbbca61f,0x04aadedf,0x1eb6e6a6,0x68e14940,0x853027f4,0xdfabda9c,0x221d322a .long 0xb7cb179a,0xed8ea9f6,0xb7934dcc,0xdc7b764d,0x5e09180d,0xfcb13940,0xb47dc2dd,0x6629a6bf,0x9f5a915e,0xbfc55e4e,0x6204441e,0xb1db9d37,0x930c5f53,0xf82d68cf,0xcbb605b1,0x17d3a142 .long 0x308780f2,0xdd5944ea,0x3845f5e4,0xdc8de761,0x7624d7a3,0x6beaba7d,0x304df11e,0x1e709afd,0x02170456,0x95364376,0xc8f94b64,0xbf204b3a,0x5680ca68,0x4e53af7c,0xe0c67574,0x0526074a .long 0xecd92af6,0x95d8cef8,0x6cd1745a,0xe6b9fa7a,0xa325c3e4,0x3d546d3d,0x9ae93aae,0x1f57691d,0x9d2e1a33,0xe891f3fe,0xac063d35,0xd430093f,0x5513a327,0xeda59b12,0x5536f18f,0xdc2134f3 .long 0x5c210286,0xaa51fe2c,0x1cab658c,0x3f68aaee,0xf9357292,0x5a23a00b,0x7efdabed,0x9a626f39,0x199d78e3,0xfe2b3bf3,0x71bbc345,0xb7a2af77,0x1e59802c,0x3d19827a,0xb487a51c,0x823bbc15 .long 0x99d0a422,0x856139f2,0xf456c6fb,0x9ac3df65,0x701f8bd6,0xaddf65c6,0x3758df87,0x149f321e,0x721b7eba,0xb1ecf714,0x31a3312a,0xe17df098,0xd5c4d581,0xdb2fd6ec,0x8fcea1b3,0xfd02996f .long 0x7882f14f,0xe29fa63e,0x07c6cadc,0xc9f6dc35,0xb882bed0,0x46f22d6f,0xd118e52c,0x1a45755b,0x7c4608cf,0x9f2c7c27,0x568012c2,0x7ccbdf32,0x61729b0e,0xfcb0aedd,0xf7d75dbf,0x7ca2ca9e .long 0x6f640f62,0xf58fecb1,0x39f51946,0xe274b92b,0x6288af44,0x7f4dfc04,0xeac329e5,0x0a91f32a,0xd6aaba31,0x43ad274b,0x0f6884f9,0x719a1640,0xdaf91e20,0x685d29f6,0x27e49d52,0x5ec1cc33 .long 0x3b54a059,0x38f4de96,0xefbcfdb3,0x0e0015e5,0x4dbb8da6,0x177d23d9,0x97a617ad,0x98724aa2,0xfdb6558e,0x30f0885b,0xc7899a96,0xf9f7a28a,0x872dc112,0xd2ae8ac8,0x73c3c459,0xfa0642ca .long 0xe7dfc8d6,0x15296981,0x1fb5b94a,0x67cd4450,0x0eddfd37,0x0ec71cf1,0x9a8eddc7,0xc7e5eeb3,0x81d95028,0x02ac8e3d,0x70b0e35d,0x0088f172,0xe1881fe3,0xec041fab,0xd99e7faa,0x62cf71b8 .long 0xe0f222c2,0x5043dea7,0x72e65142,0x309d42ac,0x9216cd30,0x94fe9ddd,0x0f87feec,0xd6539c7d,0x432ac7d7,0x03c5a57c,0x327fda10,0x72692cf0,0x280698de,0xec28c85f,0x7ec283b1,0x2331fb46 .long 0x2867e633,0xd34bfa32,0x0a9cc815,0x78709a82,0x875e2fa5,0xb7fe6964,0x9e98bfb5,0x25cc064f,0x493a65c5,0x9eb0151c,0x53182464,0x5fb5d941,0xf04618e2,0x69e6f130,0xf89c8ab6,0xa8ecec22 .long 
0xb96209bd,0xcd6ac88b,0xb3e1c9e0,0x65fa8cdb,0x4a8d8eac,0xa47d22f5,0x8d33f963,0x83895cdf,0xb56cd3d1,0xa8adca59,0xdaf38232,0x10c8350b,0xa5080a9f,0x2b161fb3,0x3af65b3a,0xbe7f5c64 .long 0x97403a11,0x2c754039,0x121b96af,0x94626cf7,0x6a983ec2,0x431de7c4,0x52cc3df7,0x3780dd3a,0x2baf8e3b,0xe28a0e46,0x51d299ae,0xabe68aad,0x647a2408,0x603eb8f9,0x5c750981,0x14c61ed6 .long 0xc53352e7,0x88b34414,0x1337d46e,0x5a34889c,0xf95f2bc8,0x612c1560,0xd4807a3a,0x8a3f8441,0x5224da68,0x680d9e97,0xc3eb00e9,0x60cd6e88,0x9a6bc375,0x3875a98e,0x4fd554c2,0xdc80f924 .long 0x6ac77407,0x6c4b3415,0x25420681,0xa1e5ea8f,0x4607a458,0x541bfa14,0x96d7fbf9,0x5dbc7e7a,0x31590a47,0x646a851b,0x15ee6df8,0x039e85ba,0xd7b43fc0,0xd19fa231,0x299a0e04,0x84bc8be8 .long 0xf20df03a,0x2b9d2936,0x8608d472,0x24054382,0x9149202a,0x76b6ba04,0x3670e7b7,0xb21c3831,0xd6fdee10,0xddd93059,0x78488e71,0x9da47ad3,0xa0fcfb25,0x99cc1dfd,0x64696954,0x42abde10 .long 0x17eab9fe,0x14cc15fc,0xd3e70972,0xd6e863e4,0x6432112c,0x29a7765c,0x5b0774d8,0x88660001,0x2c088eae,0x3729175a,0x8230b8d4,0x13afbcae,0x915f4379,0x44768151,0xd8d22812,0xf086431a .long 0xc298b974,0x37461955,0xf8711e04,0x905fb5f0,0xfe969d18,0x787abf3a,0x6f6a494e,0x392167c2,0x28c511da,0xfc7a0d2d,0xb66a262d,0xf127c7dc,0xfd63fdf0,0xf9c4bb95,0x3913ef46,0x90016589 .long 0x11aa600d,0x74d2a73c,0x9fb5ab52,0x2f5379bd,0x7fb70068,0xe49e53a4,0x404aa9a7,0x68dd39e5,0x2ecaa9c3,0xb9b0cf57,0xe824826b,0xba0e103b,0x4631a3c4,0x60c2198b,0xfa8966a2,0xc5ff84ab .long 0xac95aff8,0x2d6ebe22,0xb5a46d09,0x1c9bb6db,0x53ee4f8d,0x419062da,0xbb97efef,0x7b9042d0,0x830cf6bd,0x0f87f080,0x6ec8a6c6,0x4861d19a,0x202f01aa,0xd3a0daa1,0xf25afbd5,0xb0111674 .long 0x1afb20d9,0x6d00d6cf,0x40671bc5,0x13695000,0x2485ea9b,0x913ab0dc,0x9eef61ac,0x1f2bed06,0x6d799e20,0x850c8217,0x3271c2de,0x93415f37,0x6c4f5910,0x5afb06e9,0xc4e9e421,0x688a52df .long 0xe2a9a6db,0x30495ba3,0x58f9268b,0x4601303d,0x7eb0f04f,0xbe3b0dad,0x4456936d,0x4ea47250,0xd33fd3e7,0x8caf8798,0xeb433708,0x1ccd8a89,0x87fd50ad,0x9effe3e8,0x6b29c4df,0xbe240a56 .long 0xca0e7ebd,0xec4ffd98,0xe748616e,0xf586783a,0xc77baa99,0xa5b00d8f,0xb4f34c9c,0x0acada29,0x0fe723ac,0x36dad67d,0x39c36c1e,0x1d8e53a5,0x1f4bea41,0xe4dd342d,0xebc9e4e0,0x64fd5e35 .long 0x57908805,0x96f01f90,0x5ed480dd,0xb5b9ea3d,0x3efd2dd0,0x366c5dc2,0x6e9dfa27,0xed2fe305,0x6e9197e2,0x4575e892,0xab502a5d,0x11719c09,0xe81f213f,0x264c7bec,0x55f5c457,0x741b9241 .long 0x49a5f4f4,0x78ac7b68,0x9fc45b7d,0xf91d70a2,0xb0f5f355,0x39b05544,0xeef930d9,0x11f06bce,0x038d05e1,0xdb84d25d,0xbacc1d51,0x04838ee5,0x9e8ee00b,0x9da3ce86,0xc36eda1f,0xc3412057 .long 0x64d9c2f4,0xae80b913,0xa010a8ff,0x7468bac3,0x37359d41,0xdfd20037,0x15efeacc,0x1a0f5ab8,0x659d0ce0,0x7c25ad2f,0x6785cff1,0x4011bcbb,0x7e2192c7,0x128b9912,0x13ccb0e8,0xa549d8e1 .long 0xc85438b1,0x805588d8,0xbc25cb27,0x5680332d,0x1a4bfdf4,0xdcd1bc96,0x706f6566,0x779ff428,0xf059987a,0x8bbee998,0xcc686de7,0xf6ce8cf2,0x953cfdb2,0xf8ad3c4a,0x2205da36,0xd1d426d9 .long 0xc781a241,0xb3c0f13f,0xd75362a8,0x3e89360e,0xc8a91184,0xccd05863,0xefa8a7f4,0x9bd0c9b7,0x8a912a4b,0x97ee4d53,0xbcf518fd,0xde5e15f8,0xc467e1e0,0x6a055bf8,0x1587e256,0x10be4b4b .long 0x668621c9,0xd90c14f2,0xab9c92c1,0xd5518f51,0xd6d47b3c,0x8e6a0100,0x66716175,0xcbe980dd,0xddd83683,0x500d3f10,0x99cac73c,0x3b6cb35d,0x6083d550,0x53730c8b,0xdf0a1987,0xcf159767 .long 0x43ad73b3,0x84bfcf53,0x4f035a94,0x1b528c20,0x33eeac69,0x4294edf7,0x817f3240,0xb6283e83,0x0a5f25b1,0xc3fdc959,0x5844ee22,0xefaf8aa5,0xdbdde4de,0xde269ba5,0xc56133bf,0xe3347160 .long 
0x8d9ea9f8,0xc1184219,0xf3fc1ab5,0x090de5db,0x0bf22cda,0x404c37b1,0xf5618894,0x7de20ec8,0xecdaecab,0x754c588e,0x88342743,0x6ca4b0ed,0xf4a938ec,0x76f08bdd,0x91493ccb,0xd182de89 .long 0xc8a4186a,0xd652c53e,0x946d8e33,0xb3e878db,0x5f37663c,0x088453c0,0xb407748b,0x5cd9daaa,0x586d5e72,0xa1f5197f,0xc443ca59,0x47500be8,0xe2652424,0x78ef35b2,0x6dd7767d,0x09c5d26f .long 0xa74d3f7b,0x7175a79a,0xcf5ea459,0x0428fd8d,0xa5d1746d,0x511cb97c,0xe71d1278,0x36363939,0x10350bf4,0xcf2df955,0x60aae782,0xb3817439,0x3e688809,0xa748c0e4,0xd7a5a006,0x98021fbf .long 0x0e367a98,0x9076a70c,0x0f62b7c2,0xbea1bc15,0x30fe0343,0x2645a68c,0x699dc14f,0xacaffa78,0x457bf9c4,0xf4469964,0x0d2ead83,0x0db6407b,0xb2c6f3eb,0x68d56cad,0xf376356c,0x3b512e73 .long 0xfce10408,0xe43b0e1f,0x5a5e257d,0x89ddc003,0x0362e5b3,0xb0ae0d12,0xb0519161,0x07f983c7,0x5d5231e7,0xc2e94d15,0x0b4f9513,0xcff22aed,0x6ad0b0b5,0xb02588dd,0x11d0dcd5,0xb967d1ac .long 0xcf777b6c,0x8dac6bc6,0x4c6d1959,0x0062bdbd,0x0ef5cc85,0x53da71b5,0x4006f14f,0x07012c7d,0xac47800d,0x4617f962,0xc102ed75,0x53365f2b,0x4ab8c9d3,0xb422efcb,0x34af31c9,0x195cb26b .long 0x05f2c4ce,0x3a926e29,0x9856966c,0xbd2bdecb,0x85527015,0x5d16ab3a,0x4486c231,0x9f81609e,0xda350002,0xd8b96b2c,0xfa1b7d36,0xbd054690,0xe71d79bc,0xdc90ebf5,0x08964e4e,0xf241b6f9 .long 0x2fe3cd4c,0x7c838643,0xb4bc633c,0xe0f33acb,0x3d139f1f,0xb4a9ecec,0xdc4a1f49,0x05ce69cd,0xf5f98aaf,0xa19d1b16,0x6f23e0ef,0x45bb71d6,0x46cdfdd3,0x33789fcd,0xcee040ca,0x9b8e2978 .long 0xae0a6828,0x9c69b246,0x7078d5aa,0xba533d24,0x7bb4fbdb,0x7a2e42c0,0x7035385c,0xcfb4879a,0x3281705b,0x8c3dd30b,0x404fe081,0x7e361c6c,0x3f604edf,0x7b21649c,0xe52ffe47,0x5dbf6a3f .long 0x4b54d9bf,0xc41b7c23,0x3511c3d9,0x1374e681,0xc1b2b758,0x1863bf16,0x1e9e6a96,0x90e78507,0x5d86f174,0xab4bf98d,0x85e96fe4,0xd74e0bd3,0xcac5d344,0x8afde39f,0xbd91b847,0x90946dbc .long 0xfe1a838c,0xf5b42358,0x620ac9d8,0x05aae6c5,0xa1ce5a0b,0x8e193bd8,0x4dabfd72,0x8f710571,0x182caaac,0x8d8fdd48,0x040745cf,0x8c4aeefa,0xf3b93e6d,0x73c6c30a,0x16f42011,0x991241f3 .long 0xe457a477,0xa0158eea,0xee6ddc05,0xd19857db,0x18c41671,0xb3265224,0x3c2c0d58,0x3ffdfc7e,0x26ee7cda,0x3a3a5254,0xdf02c3a8,0x341b0869,0x723bbfc8,0xa023bf42,0x14452691,0x3d15002a .long 0x85edfa30,0x5ef7324c,0x87d4f3da,0x25976554,0xdcb50c86,0x352f5bc0,0x4832a96c,0x8f6927b0,0x55f2f94c,0xd08ee1ba,0x344b45fa,0x6a996f99,0xa8aa455d,0xe133cb8d,0x758dc1f7,0x5d0721ec .long 0x79e5fb67,0x6ba7a920,0x70aa725e,0xe1331feb,0x7df5d837,0x5080ccf5,0x7ff72e21,0xe4cae01d,0x0412a77d,0xd9243ee6,0xdf449025,0x06ff7cac,0x23ef5a31,0xbe75f7cd,0x0ddef7a8,0xbc957822 .long 0xb0ce1c55,0x8cf7230c,0x0bbfb607,0x5b534d05,0x0e16363b,0xee1ef113,0xb4999e82,0x27e0aa7a,0x79362c41,0xce1dac2d,0x91bb6cb0,0x67920c90,0x2223df24,0x1e648d63,0xe32e8f28,0x0f7d9eef .long 0xfa833834,0x6943f39a,0xa6328562,0x22951722,0x4170fc10,0x81d63dd5,0xaecc2e6d,0x9f5fa58f,0xe77d9a3b,0xb66c8725,0x6384ebe0,0x11235cea,0x5845e24a,0x06a8c118,0xebd093b1,0x0137b286 .long 0x44ace150,0xc589e1ce,0x4381e97c,0xe0f8d3d9,0x62c5a4b8,0x59e99b11,0xfd0ec9f9,0x90d262f7,0x283e13c9,0xfbc854c9,0xaedc7085,0x2d04fde7,0x47dcbecb,0x057d7765,0x9a76fa5f,0x8dbdf591 .long 0x0de1e578,0xd0150695,0xe9f72bc6,0x2e1463e7,0x1b39eca5,0xffa68441,0x7c037f2f,0x673c8530,0x747f91da,0xd0d6a600,0xc9cb78e9,0xb08d43e1,0x27b5cef5,0x0fc0c644,0xa60a2fd6,0x5c1d160a .long 0x28c8e13b,0xf98cae53,0xb2eddcd1,0x375f10c4,0x5cce06ad,0xd4eb8b7f,0x80a2e1ef,0xb4669f45,0x5bbd8699,0xd593f9d0,0xe7976d13,0x5528a4c9,0x1c7e28d3,0x3923e095,0x3f6bb577,0xb9293790 .long 
0xc42bd6d2,0xdb567d6a,0xbb1f96ae,0x6df86468,0x4843b28e,0x0efe5b1a,0x6379b240,0x961bbb05,0x70a6a26b,0xb6caf5f0,0x328e6e39,0x70686c0d,0x895fc8d3,0x80da06cf,0xb363fdc9,0x804d8810 .long 0x207f1670,0xbe22877b,0x4e615291,0x9b0dd188,0x97a3c2bf,0x625ae8dc,0x439b86e8,0x08584ef7,0xdcd898ff,0xde7190a5,0x2058ee3d,0x26286c40,0x5f87b1c1,0x3db0b217,0x102a6db5,0xcc334771 .long 0x2f770fb1,0xd99de954,0x4cd7535e,0x97c1c620,0x3f09cefc,0xd3b6c448,0x5a63b4f8,0xd725af15,0xc01e20ec,0x0c95d24f,0x9ae7121f,0xdfd37494,0xec77b7ec,0x7d6ddb72,0x0353a4ae,0xfe079d3b .long 0x2e6ac8d2,0x3066e70a,0x106e5c05,0x9c6b5a43,0xede59b8c,0x52d3c6f5,0xfccec9ae,0x30d6a5c3,0x4fc0a9ef,0xedec7c22,0x95c16ced,0x190ff083,0x94de0fde,0xbe12ec8f,0x852d3433,0x0d131ab8 .long 0x85701291,0x42ace07e,0x194061a8,0x94793ed9,0xd7f4a485,0x30e83ed6,0xf9eeff4d,0x9eec7269,0x0c9d8005,0x90acba59,0x1e79b9d1,0x5feca458,0x1d506a1e,0x8fbe5427,0x2439cfa7,0xa32b2c8e .long 0x73dd0b4e,0x1671c173,0x44a054c6,0x37a28214,0x4e8b53f1,0x81760a1b,0xf9f93b9e,0xa6c04224,0xcf671e3c,0x18784b34,0xcda9b994,0x81bbecd2,0xb2ab3848,0x38831979,0xf2e03c2d,0xef54feb7 .long 0xfb8088fa,0xcf197ca7,0x4ddc96c5,0x01427247,0x30777176,0xa2d2550a,0x4d0cf71d,0x53469898,0x3a2aaac6,0x6ce937b8,0x5af38d9b,0xe9f91dc3,0xc8bf2899,0x2598ad83,0xb5536c16,0x8e706ac9 .long 0xf688dc98,0x40dc7495,0x124c4afc,0x26490cd7,0x1f18775c,0xe651ec84,0xb4fdaf4a,0x393ea6c3,0x7f338e0d,0x1e1f3343,0x6053e7b5,0x39fb832b,0x619e14d5,0x46e702da,0xcdeef6e0,0x859cacd1 .long 0x4462007d,0x63b99ce7,0x4cb5f5b7,0xb8ab48a5,0xf55edde7,0x9ec673d2,0x8cfaefda,0xd1567f74,0x0887bcec,0x46381b6b,0xe178f3c2,0x694497ce,0x1e6266cb,0x5e6525e3,0x697d6413,0x5931de26 .long 0x0e58d493,0x87f8df7c,0x58b73f12,0xb1ae5ed0,0xdea0c34d,0xc368f784,0x859a91a0,0x9bd0a120,0xcc863c68,0xb00d88b7,0x3d1f4d65,0x3a1cc11e,0x0aa85593,0xea38e0e7,0x7dc4aee8,0x37f13e98 .long 0xbc947bad,0x10d38667,0x2a36ee2e,0x738e07ce,0xc577fcac,0xc93470cd,0x2782470d,0xdee1b616,0x2e793d12,0x36a25e67,0xe0f186da,0xd6aa6cae,0x80e07af7,0x474d0fd9,0xba8a5cd4,0xf7cdc47d .long 0xab15247f,0x28af6d9d,0x493a537f,0x7c789c10,0x23a334e7,0x7ac9b110,0x12c9c277,0x0236ac09,0x1d7a5144,0xa7e5bd25,0xf13ec4ec,0x098b9c2a,0xd3f0abca,0x3639daca,0xa23960f9,0x642da81a .long 0x4f7269b1,0x7d2e5c05,0xe287c385,0xfcf30777,0xf2a46f21,0x10edc84f,0x4f43fa36,0x35441757,0xfd703431,0xf1327899,0x16dd587a,0xa438d7a6,0xe9c8352d,0x65c34c57,0x5cc5a24e,0xa728edab .long 0x42531689,0xaed78abc,0x010963ef,0x0a51a0e8,0xd717d9b3,0x5776fa0a,0x7dd3428b,0xf356c239,0x8d3a3dac,0x29903fff,0x3d94491f,0x409597fa,0xbf4a56a4,0x4cd7a5ff,0x8adab462,0xe5096474 .long 0x5c3427b0,0xa97b5126,0xd282c9bd,0x6401405c,0x222c5c45,0x3629f8d7,0xe8d50aed,0xb1c02c16,0xd9635bc9,0xbea2ed75,0x6e24552f,0x226790c7,0x65f1d066,0x3c33f2a3,0x6dfccc2e,0x2a43463e .long 0xdb483761,0x8cc3453a,0x65d5672b,0xe7cc6085,0xde3efc87,0x277ed6cb,0x69234eaf,0x19f2f368,0x5c0b800b,0x9aaf4317,0x8b6da6e2,0x1f1e7c89,0xb94ec75e,0x6cfb4715,0x453118c2,0xd590dd5f .long 0x1f17a34c,0x14e49da1,0x235a1456,0x5420ab39,0x2f50363b,0xb7637241,0xc3fabb6e,0x7b15d623,0xe274e49c,0xa0ef40b1,0x96b1860a,0x5cf50744,0x66afe5a4,0xd6583fbf,0xf47e3e9a,0x44240510 .long 0x11b2d595,0x99254343,0xeec8df57,0xf1367499,0x3e73dd05,0x3cb12c61,0x7dac102a,0xd248c033,0xa77739f5,0xcf154f13,0x23d2af42,0xbf4288cb,0x32e4a1cf,0xaa64c9b6,0xc8a208f3,0xee8c07a8 .long 0x6fe8393f,0xe10d4999,0xe91f3a32,0x0f809a3f,0x802f63c8,0x61096d1c,0x57750d3d,0x289e1462,0x9889feea,0xed06167e,0xe0993909,0xd5c9c0e2,0x56508ac6,0x46fca0d8,0x4f1b8e83,0x91826047 .long 
0x9a4a2751,0x4f2c877a,0xcae6fead,0x71bd0072,0x06aa1941,0x38df8dcc,0x63beeaa8,0x5a074b4c,0xc1cec8ed,0xd6d65934,0xaabc03bd,0xa6ecb49e,0xde8a8415,0xaade91c2,0x691136e0,0xcfb0efdf .long 0x23ab3495,0x11af45ee,0x0b77463d,0xa132df88,0x815d06f4,0x8923c15c,0x0d61a436,0xc3ceb3f5,0xe88fb1da,0xaf52291d,0x1da12179,0xea057974,0xd2fef720,0xb0d7218c,0x8e1d8845,0x6c0899c9 .long 0x752ddad7,0x98157504,0xa1a68a97,0xd60bd74f,0xf658fb99,0x7047a3a9,0x5f8511e4,0x1f5d86d6,0x4b5a6d88,0xb8a4bc42,0x1abefa7d,0x69eb2c33,0x13c9c510,0x95bf39e8,0xd48aab43,0xf571960a .long 0x704e23c6,0x7e8cfbcf,0x28aaa65b,0xc71b7d22,0x245e3c83,0xa041b2bd,0xd21854ff,0x69b98834,0x963bfeec,0x89d227a3,0xde7da7cb,0x99947aaa,0xee68a9b1,0x1d9ee9db,0x698ec368,0x0a08f003 .long 0x78ef2487,0xe9ea4094,0x02cfec26,0xc8d2d415,0xb7dcf328,0xc52f9a6e,0x85b6a937,0x0ed489e3,0xbef3366e,0x9b94986b,0xedddddb8,0x0de59c70,0xeadddbe2,0xffdb748c,0x8266ea40,0x9b9784bb .long 0x1a93507a,0x142b5502,0x8d3c06cf,0xb4cd1187,0x91ec3f40,0xdf70e76a,0x4e7553c2,0x484e81ad,0x272e9d6e,0x830f87b5,0xc6ff514a,0xea1c93e5,0xc4192a8e,0x67cc2adc,0x42f4535a,0xc77e27e2 .long 0xd2b713c5,0x9cdbab36,0xcf7b0cd3,0x86274ea0,0x09af826b,0x784680f3,0x0c72dea3,0xbfcc837a,0xd6529b73,0xa8bdfe9d,0x63a88002,0x708aa228,0xc91d45b9,0x6c7a9a54,0xfd004f56,0xdf1a38bb .long 0xb8bad853,0x2e8c9a26,0x3723eae7,0x2d52cea3,0x56ca2830,0x054d6d81,0x9a8dc411,0xa3317d14,0xfd4ddeda,0xa08662fe,0xb55d792b,0xed2a153a,0xbfc6e944,0x7035c16a,0x00171cf3,0xb6bc5834 .long 0x83d102b6,0xe27152b3,0x0646b848,0xfe695a47,0x916e6d37,0xa5bb09d8,0x0d17015e,0xb4269d64,0x0a1d2285,0x8d8156a1,0x46d26d72,0xfeef6c51,0x4c5434a7,0x9dac57c8,0x59d39e31,0x0282e5be .long 0x721c486d,0xedfff181,0xbc58824e,0x301baf10,0x00570031,0x8136a6aa,0x1cddde68,0x55aaf78c,0x59c63952,0x26829371,0x8bc25baf,0x3a3bd274,0xb7e52dc3,0xecdf8657,0xfd78e6c8,0x2dd8c087 .long 0xf5531461,0x20553274,0x5d95499b,0x8b4a1281,0x1a80f9d2,0xe2c8763a,0x4ddec758,0xd1dbe32b,0x30c34169,0xaf12210d,0x78baa533,0xba74a953,0xa438f254,0x3d133c6e,0x201bef5b,0xa431531a .long 0xf669d7ec,0x15295e22,0x357fb515,0xca374f64,0xeaa3fdb3,0x8a8406ff,0xdf3f2da8,0x106ae448,0x33c8e9a1,0x8f9b0a90,0x71ad5885,0x234645e2,0x1c0aed14,0x3d083224,0x7a942d46,0xf10a7d3e .long 0x40d5c9be,0x7c11deee,0xba84ed98,0xb2bae7ff,0xaad58ddd,0x93e97139,0x3f6d1fa3,0x3d872796,0x8569ff13,0x483aca81,0x9a600f72,0x8b89a5fb,0xc06f2b86,0x4cbc27c3,0x63ad9c0b,0x22130713 .long 0x48ac2840,0xb5358b1e,0xecba9477,0x18311294,0xa6946b43,0xda58f990,0x9ab41819,0x3098baf9,0x4198da52,0x66c4c158,0x146bfd1b,0xab4fc17c,0xbf36a908,0x2f0a4c3c,0x58cf7838,0x2ae9e34b .long 0x3fa11b1f,0xf411529e,0x974af2b4,0x21e43677,0xc230793b,0x7c20958e,0x16e840f3,0x710ea885,0xc5dc67cf,0xfc0b21fc,0x88405718,0x08d51647,0xcfe49eb7,0xd955c21f,0x56dd4a1f,0x9722a5d5 .long 0xc861baa5,0xc9ef50e2,0x9505ac3e,0xc0c21a5d,0x8b7c063f,0xaf6b9a33,0x2f4779c1,0xc6370339,0x638167c3,0x22df99c7,0x795db30c,0xfe6ffe76,0xa4854989,0x2b822d33,0x30563aa5,0xfef031dd .long 0xd57c667f,0x16b09f82,0xcc0b76f1,0xc70312ce,0xc9118aec,0xbf04a9e6,0x3409d133,0x82fcb419,0xab45d44d,0x1a8ab385,0x617b83a3,0xfba07222,0x58e81b52,0xb05f50dd,0x21ce5aff,0x1d8db553 .long 0xe344a873,0x3097b8d4,0xfe36d53e,0x7d8d116d,0x7875e750,0x6db22f58,0x43e144ea,0x2dc5e373,0xe799eb95,0xc05f32e6,0x6899e6ec,0xe9e5f4df,0x1fab23d5,0xbdc3bd68,0x73af60e6,0xb72b8ab7 .long 0x2cecc84a,0x8db27ae0,0x7bdb871c,0x600016d8,0xd7c46f58,0x42a44b13,0xc3a77d39,0xb8919727,0xdafd6088,0xcfc6bbbd,0x6bd20d39,0x1a740146,0x98c41072,0x8c747abd,0xbdf68ea1,0x4c91e765 .long 
0x08819a78,0x7c95e5ca,0xc9587921,0xcf48b729,0xdebbcc7d,0x091c7c5f,0xf0e05149,0x6f287404,0x26cd44ec,0xf83b5ac2,0xcfea250e,0x88ae32a6,0x1d06ebc5,0x6ac5047a,0xd434f781,0xc7e550b4 .long 0x5c727bd2,0x61ab1cf2,0x1cf915b0,0x2e4badb1,0xf69d3920,0x1b4dadec,0xf14c1dfe,0xe61b1ca6,0xbd6bd51f,0x90b479cc,0x8045ec30,0x8024e401,0x25ef0e62,0xcab29ca3,0x49e4ebc0,0x4f2e9416 .long 0x0ccced58,0x45eb40ec,0x0da44f98,0x25cd4b9c,0x871812c6,0x43e06458,0x16cef651,0x99f80d55,0xce6dc153,0x571340c9,0xd8665521,0x138d5117,0x4e07014d,0xacdb45bc,0x84b60b91,0x2f34bb38 .long 0x2ae8921e,0xf44a4fd2,0x892ba1e2,0xb039288e,0xb1c180b2,0x9da50174,0x1693dc87,0x6b70ab66,0xe7057481,0x7e9babc9,0x9c80dc41,0x4581ddef,0x51294682,0x0c890da9,0x3f4736e5,0x0b5629d3 .long 0xb06f5b41,0x2340c79e,0x4e243469,0xa42e84ce,0x045a71a9,0xf9a20135,0xd27b6fb6,0xefbfb415,0x9d33cd6f,0x25ebea23,0xaa6c0af8,0x9caedb88,0xd9ce6f96,0x53dc7e9a,0x51e0b15a,0x3897f9fd .long 0x8e5d788e,0xf51cb1f8,0xe1d490ee,0x1aec7ba8,0xcc58cb3c,0x265991e0,0x9fc3ad31,0x9f306e8c,0x5040a0ac,0x5fed006e,0xfb476f2e,0xca9d5043,0xbeea7a23,0xa19c06e8,0x0edabb63,0xd2865801 .long 0x6967469a,0xdb92293f,0x8d8a8ed8,0x2894d839,0xbbc77122,0x87c9e406,0x2ea3a26a,0x8671c6f1,0xd7de9853,0xe42df8d6,0xb1f2bcc7,0x2e3ce346,0x899d50cf,0xda601dfc,0xfb1b598f,0xbfc913de .long 0xe61f7908,0x81c4909f,0x9bbc7b29,0x192e304f,0xc104b338,0xc3ed8738,0x783f5d61,0xedbe9e47,0x2db30660,0x0c06e9be,0xc0eb7d8e,0xda3e613f,0x322e096e,0xd8fa3e97,0xd336e247,0xfebd91e8 .long 0xdf655a49,0x8f13ccc4,0x5eb20210,0xa9e00dfc,0xc656b6ea,0x84631d0f,0xd8c0d947,0x93a058cd,0x67bd3448,0x6846904a,0xf394fd5c,0x4a3d4e1a,0xdb225f52,0xc102c1a5,0xfc4f5e9a,0xe3455bba .long 0x4b9ad1ce,0x6b36985b,0x5bb7f793,0xa9818536,0x48b1a416,0x6c25e1d0,0x3c81bee7,0x1381dd53,0x7a4a7620,0xd2a30d61,0x39b8944c,0xc8412926,0x7a97c33a,0x3c1c6fbe,0x938664e7,0x941e541d .long 0x4a34f239,0x417499e8,0xb90402d5,0x15fdb83c,0x433aa832,0xb75f46bf,0x63215db1,0xb61e15af,0xa127f89a,0xaabe59d4,0x07e816da,0x5d541e0c,0xa618b692,0xaaba0659,0x17266026,0x55327733 .long 0x95f57552,0xaf53a0fc,0x6cacb0c9,0x32947650,0xc821be01,0x253ff58d,0xa06f1146,0xb0309531,0x05c2e54d,0x59bbbdf5,0x26e8dd22,0x158f27ad,0x397e1e53,0xcc5b7ffb,0x7fc1e50d,0xae03f65b .long 0x9c95f0f9,0xa9784ebd,0x24640771,0x5ed9deb2,0x035561c4,0x31244af7,0x7ee857de,0x87332f3a,0x2b9e0d88,0x09e16e9e,0x56a06049,0x52d910f4,0xa9592f48,0x507ed477,0x2365d678,0x85cb917b .long 0x4c8998d1,0xf8511c93,0x730ea58f,0x2186a3f1,0xb2029db0,0x50189626,0x02ceb75a,0x9137a6d9,0x748bc82c,0x2fe17f37,0x80469f8c,0x87c2e931,0xbf891aa2,0x850f71cd,0x75ec3d8d,0x0ca1b89b .long 0x5e1cd3cd,0x516c43aa,0x9a887c28,0x89397808,0xddea1f9f,0x0059c699,0x8e6868f7,0x7737d6fa,0x60f1524b,0x6d93746a,0xba052aa7,0x36985e55,0xed923ea5,0x41b1d322,0x25852a11,0x3429759f .long 0x092e9f41,0xbeca6ec3,0x62256bbd,0x3a238c66,0x70ad487d,0xd82958ea,0x65610d93,0x4ac8aaf9,0x5e4ccab0,0x3fa101b1,0x9de14bfb,0x9bf430f2,0x6531899d,0xa10f5cc6,0xea8ce17d,0x590005fb .long 0x24544cb6,0xc437912f,0xd79ac2e3,0x9987b71a,0xc058a212,0x13e3d9dd,0xd2de9606,0x00075aac,0x6cac8369,0x80ab508b,0xf54f6c89,0x87842be7,0x6bc532a4,0xa7ad663d,0x78a91bc8,0x67813de7 .long 0xc3427239,0x5dcb61ce,0xc56934d9,0x5f3c7cf0,0xe3191591,0xc079e0fb,0xb01aada7,0xe40896bd,0x0492d25f,0x8d466791,0xe7408276,0x8aeb30c9,0x9287aacc,0xe9437495,0x79fe03d4,0x23d4708d .long 0xd0c05199,0x8cda9cf2,0xfae78454,0x502fbc22,0xf572a182,0xc0bda9df,0x6158b372,0x5f9b71b8,0x2b82dd07,0xe0f33a59,0x9523032e,0x76302735,0xc4505a32,0x7fe1a721,0xf796409f,0x7b6e3e82 .long 
0x35d0b34a,0xe3417bc0,0x8327c0a7,0x440b386b,0xac0362d1,0x8fb7262d,0xe0cdf943,0x2c41114c,0xad95a0b1,0x2ba5cef1,0x67d54362,0xc09b37a8,0x01e486c9,0x26d6cdd2,0x42ff9297,0x20477abf .long 0x292a9287,0xa004dcb3,0x77b092c7,0xddc15cf6,0x806c0605,0x083a8464,0x3db997b0,0x4a68df70,0x05bf7dd0,0x9c134e45,0x8ccf7f8c,0xa4e63d39,0x41b5f8af,0xa6e6517f,0xad7bc1cc,0xaa8b9342 .long 0x1e706ad9,0x126f35b5,0xc3a9ebdf,0xb99cebb4,0xbf608d90,0xa75389af,0xc6c89858,0x76113c4f,0x97e2b5aa,0x80de8eb0,0x63b91304,0x7e1022cc,0x6ccc066c,0x3bdab605,0xb2edf900,0x33cbb144 .long 0x7af715d2,0xc4176471,0xd0134a96,0xe2f7f594,0xa41ec956,0x2c1873ef,0x77821304,0xe4e7b4f6,0x88d5374a,0xe5c8ff97,0x80823d5b,0x2b915e63,0xb2ee8fe2,0xea6bc755,0xe7112651,0x6657624c .long 0xdace5aca,0x157af101,0x11a6a267,0xc4fdbcf2,0xc49c8609,0xdaddf340,0xe9604a65,0x97e49f52,0x937e2ad5,0x9be8e790,0x326e17f1,0x846e2508,0x0bbbc0dc,0x3f38007a,0xb11e16d6,0xcf03603f .long 0x7442f1d5,0xd6f800e0,0x66e0e3ab,0x475607d1,0xb7c64047,0x82807f16,0xa749883d,0x8858e1e3,0x8231ee10,0x5859120b,0x638a1ece,0x1b80e7eb,0xc6aa73a4,0xcb72525a,0x844423ac,0xa7cdea3d .long 0xf8ae7c38,0x5ed0c007,0x3d740192,0x6db07a5c,0x5fe36db3,0xbe5e9c2a,0x76e95046,0xd5b9d57a,0x8eba20f2,0x54ac32e7,0x71b9a352,0xef11ca8f,0xff98a658,0x305e373e,0x823eb667,0xffe5a100 .long 0xe51732d2,0x57477b11,0x2538fc0e,0xdfd6eb28,0x3b39eec5,0x5c43b0cc,0xcb36cc57,0x6af12778,0x06c425ae,0x70b0852d,0x5c221b9b,0x6df92f8c,0xce826d9c,0x6c8d4f9e,0xb49359c3,0xf59aba7b .long 0xda64309d,0x5c8ed8d5,0x91b30704,0x61a6de56,0x2f9b5808,0xd6b52f6a,0x98c958a7,0x0eee4194,0x771e4caa,0xcddd9aab,0x78bc21be,0x83965dfd,0xb3b504f5,0x02affce3,0x561c8291,0x30847a21 .long 0x52bfda05,0xd2eb2cf1,0x6197b98c,0xe0e4c4e9,0xf8a1726f,0x1d35076c,0x2db11e3d,0x6c06085b,0x4463ba14,0x15c0c4d7,0x0030238c,0x9d292f83,0x3727536d,0x1311ee8b,0xbeaedc1e,0xfeea86ef .long 0x66131e2e,0xb9d18cd3,0x80fe2682,0xf31d974f,0xe4160289,0xb6e49e0f,0x08e92799,0x7c48ec0b,0xd1989aa7,0x818111d8,0xebf926f9,0xb34fa0aa,0xa245474a,0xdb5fe2f5,0x3c7ca756,0xf80a6ebb .long 0xafa05dd8,0xa7f96054,0xfcaf119e,0x26dfcf21,0x0564bb59,0xe20ef2e3,0x61cb02b8,0xef4dca50,0x65d30672,0xcda7838a,0xfd657e86,0x8b08d534,0x46d595c8,0x4c5b4395,0x425cb836,0x39b58725 .long 0x3de9abe3,0x8ea61059,0x9cdc03be,0x40434881,0xcfedce8c,0x9b261245,0xcf5234a1,0x78c318b4,0xfde24c99,0x510bcf16,0xa2c2ff5d,0x2a77cb75,0x27960fb4,0x9c895c2b,0xb0eda42b,0xd30ce975 .long 0x1a62cc26,0xfda85393,0x50c0e052,0x23c69b96,0xbfc633f3,0xa227df15,0x1bae7d48,0x2ac78848,0x187d073d,0x487878f9,0x967f807d,0x6c2be919,0x336e6d8f,0x765861d8,0xce528a43,0x88b8974c .long 0xff57d051,0x09521177,0xfb6a1961,0x2ff38037,0xa3d76ad4,0xfc0aba74,0x25a7ec17,0x7c764803,0x48879bc8,0x7532d75f,0x58ce6bc1,0xea7eacc0,0x8e896c16,0xc82176b4,0x2c750fed,0x9a30e0b2 .long 0x421d3aa4,0xc37e2c2e,0xe84fa840,0xf926407c,0x1454e41c,0x18abc03d,0x3f7af644,0x26605ecd,0xd6a5eabf,0x242341a6,0x216b668e,0x1edb84f4,0x04010102,0xd836edb8,0x945e1d8c,0x5b337ce7 .long 0xc055dc14,0xd2075c77,0x81d89cdf,0x2a0ffa25,0x6ffdcbaf,0x8ce815ea,0xfb648867,0xa3428878,0x884655fb,0x277699cf,0x364d3e41,0xfa5b5bd6,0x441e1cb7,0x01f680c6,0xb70a7d67,0x3fd61e66 .long 0xcc78cf66,0x666ba2dc,0x6fdbff77,0xb3018174,0x168d4668,0x8d4dd0db,0x1dab3a2a,0x259455d0,0xcde3acec,0xf58564c5,0x13adb276,0x77141925,0x8a303f65,0x527d725d,0xe6f38f7b,0x55deb6c9 .long 0xb1fa70fb,0xfd5bb657,0xd8073a00,0xfa07f50f,0xbca02500,0xf72e3aa7,0x9975740d,0xf68f895d,0x5cae2a6a,0x30112060,0x02874842,0x01bd7218,0x7ce47bd3,0x3d423891,0x789544f6,0xa66663c1 .long 
0x3272d838,0x864d05d7,0xfa6295c5,0xe22924f9,0x6c2fda32,0x8189593f,0xb184b544,0x330d7189,0xbde1f714,0x79efa62c,0xe5cb1a63,0x35771c94,0x641c8332,0x2f4826b8,0xc8cee854,0x00a894fb .long 0x36194d40,0xb4b9a39b,0x77612601,0xe857a7c5,0x4ecf2f58,0xf4209dd2,0x5a033487,0x82b9e66d,0xe4e8b9dd,0xc1e36934,0xa42377d7,0xd2372c9d,0x0e3ae43b,0x51dc94c7,0x04474f6f,0x4c57761e .long 0x1058a318,0xdcdacd0a,0x78053a9a,0x369cf3f5,0x31c68de2,0xc6c3de50,0x3c4b6d9f,0x4653a576,0xaa4e5c97,0x1688dd5a,0xb7ab3c74,0x5be80aa1,0xbc65c283,0x70cefe7c,0x06867091,0x57f95f13 .long 0x4415503b,0xa39114e2,0x4cbb17e9,0xc08ff7c6,0xd7dec966,0x1eff674d,0x53376f63,0x6d4690af,0xea74237b,0xff6fe32e,0xcd57508e,0xc436d17e,0xedcc40fe,0x15aa28e1,0x581bbb44,0x0d769c04 .long 0x34eaacda,0xc240b6de,0x2ba0f1de,0xd9e116e8,0x79438e55,0xcbe45ec7,0x96f752d7,0x91787c9d,0xf129ac2f,0x897f532b,0x5a36e22c,0xd307b7c8,0x749fb8f3,0x91940675,0x157fdb28,0xd14f95d0 .long 0x6ae55043,0xfe51d029,0x44a87de1,0x8931e98f,0x09e4fee2,0xe57f1cc6,0x4e072d92,0x0d063b67,0xed0e4316,0x70a998b9,0x306aca46,0xe74a736b,0x4fda97c7,0xecf0fbf2,0x3e178d93,0xa40f65cb .long 0x16df4285,0x16253604,0xd0c56ae2,0xb0c9babb,0xcfc5cfc3,0x73032b19,0x09752056,0xe497e5c3,0x164bda96,0x12096bb4,0xa0b74da1,0x1ee42419,0x403826ba,0x8fc36243,0xdc09e660,0x0c8f0069 .long 0xc27253c9,0x8667e981,0x92b36a45,0x05a6aefb,0x9cb7bb46,0xa62c4b36,0x11f7027b,0x8394f375,0x5f109d0f,0x747bc79c,0x5b8cc60a,0xcad88a76,0x58f09e68,0x80c5a66b,0xf6127eac,0xe753d451 .long 0x5b0ec6f5,0xc44b74a1,0x5289b2b8,0x47989fe4,0x58d6fc73,0x745f8484,0xf61c70ab,0xec362a6f,0xb3a8ad41,0x070c98a7,0x7b63db51,0x73a20fc0,0xf44c35f4,0xed2c2173,0x9acc9dca,0x8a56149d .long 0x9ac6e0f4,0x98f17881,0xa413b5ed,0x360fdeaf,0xa300b0fd,0x0625b8f4,0x5b3222d3,0xf1f4d76a,0x587f76b8,0x9d6f5109,0x2317fdb5,0x8b4ee08d,0x8c68b095,0x88089bb7,0x5808d9b9,0x95570e9a .long 0x35d33ae7,0xa395c36f,0x50bb5a94,0x200ea123,0x0bafe84b,0x20c789bd,0x0919276a,0x243ef52d,0xe23ae233,0x3934c577,0xa460d1ec,0xb93807af,0xf8fa76a4,0xb72a53b1,0xc3ca4491,0xd8914cb0 .long 0x3fb42622,0x2e128494,0x500907d5,0x3b2700ac,0x1a95ec63,0xf370fb09,0x31b6dfbd,0xf8f30be2,0x69e55f15,0xf2b2f8d2,0xcc1323e9,0x1fead851,0xd9e5eef6,0xfa366010,0xe316107e,0x64d487b0 .long 0xd23ddc82,0x4c076b86,0x7e0143f0,0x03fd344c,0x317af2c5,0xa95362ff,0xe18b7a4f,0x0add3db7,0x8260e01b,0x9c673e3f,0x54a1cc91,0xfbeb49e5,0x92f2e433,0x91351bf2,0x851141eb,0xc755e7ec .long 0x29607745,0xc9a95139,0xa26f2b28,0x0ca07420,0x4bc6f9dd,0xcb2790e7,0xadcaffc0,0x345bbb58,0xbe0f27a2,0xc65ea38c,0x641fcb56,0x67c24d7c,0xa9e2c757,0x2c25f0a7,0x16f16c49,0x93f5cdb0 .long 0xc5ee30a1,0x2ca5a9d7,0xb909b729,0xd1593635,0xdadeff48,0x804ce9f3,0xb07c30c3,0xec464751,0x9e49af6a,0x89d65ff3,0x6f3d01bc,0xf2d6238a,0x0bced843,0x1095561e,0xc8a13fd8,0x51789e12 .long 0x763231df,0xd633f929,0xe7cbddef,0x46df9f7d,0xcb265da8,0x01c889c0,0xaf4336d2,0xfce1ad10,0xfc6a0a7e,0x8d110df6,0x6da425dc,0xdd431b98,0x1834aabe,0xcdc4aeab,0x8439b7fc,0x84deb124 .long 0x3c2a5998,0x8796f169,0x7947190d,0x9b9247b4,0x11597014,0x55b9d9a5,0x7b1566ee,0x7e9dd70d,0xcbcd5e64,0x94ad78f7,0x9bd4c032,0x0359ac17,0x7cc222ae,0x3b11baaf,0xba78e812,0xa6a6e284 .long 0x24cea1a0,0x8392053f,0x33621491,0xc97bce4a,0x35399ee9,0x7eb1db34,0xece81ad1,0x473f78ef,0xf63d3d0d,0x41d72fe0,0xafab62fc,0xe620b880,0x93158383,0x92096bc9,0x8f896f6c,0x41a21357 .long 0xc7dcfcab,0x1b5ee2fa,0x9546e007,0x650acfde,0xb1b02e07,0xc081b749,0xf9eca03d,0xda9e41a0,0x175a54ab,0x013ba727,0xea5d8d10,0xca0cd190,0x95fd96a9,0x85ea52c0,0xbc5c3940,0x2c591b9f .long 
0x2bad4d5f,0x6fb4d4e4,0xfef0059b,0xfa4c3590,0xf5122294,0x6a10218a,0xa85751d1,0x9a78a81a,0xa98e84e7,0x04f20579,0x4997e5b5,0xfe1242c0,0xca21e1e4,0xe77a273b,0x9411939d,0xfcc8b1ef .long 0x92d0487a,0xe20ea302,0x294b91fe,0x1442dbec,0xbb6b0e8f,0x1f7a4afe,0x6889c318,0x1700ef74,0x70f1fc62,0xf5bbffc3,0x69c79cca,0x3b31d4b6,0xa7f6340d,0xe8bc2aab,0xa725e10a,0xb0b08ab4 .long 0xae340050,0x44f05701,0x1cf0c569,0xba4b3016,0xfbe19a51,0x5aa29f83,0xb71d752e,0x1b9ed428,0xeb4819f5,0x1666e54e,0x9e18b75b,0x616cdfed,0x3ee27b0b,0x112ed5be,0x44c7de4d,0xfbf28319 .long 0xe0e60d84,0xd685ec85,0x1db7ee78,0x68037e30,0x003c4d6e,0x5b65bdcd,0x93e29a6a,0x33e7363a,0x08d0756c,0x995b3a61,0x2faf134b,0xd727f85c,0x1d337823,0xfac6edf7,0x0439b8b4,0x99b9aa50 .long 0xe2b4e075,0x722eb104,0x437c4926,0x49987295,0x46a9b82d,0xb1e4c0e4,0x57a006f5,0xd0cb3197,0xd7808c56,0xf3de0f7d,0x51f89772,0xb5c54d8f,0xadbd31aa,0x500a114a,0x295f6cab,0x9afaaaa6 .long 0x04cf667a,0x94705e21,0x9d3935d7,0xfc2a811b,0x6d09267c,0x560b0280,0xf780e53b,0xf19ed119,0x067b6269,0xf0227c09,0x5caef599,0x967b8533,0x68efeebc,0x155b9243,0xc497bae6,0xcd6d34f5 .long 0x6cceb370,0x1dd8d5d3,0xa78d7bf9,0x2aeac579,0x70b67a62,0x5d65017d,0x17c53f67,0x70c8e44f,0x86a34d09,0xd1fc0950,0xe7134907,0xe0fca256,0x80fdd315,0xe24fa29c,0xd87499ad,0x2c4acd03 .long 0x3b5a9ba6,0xbaaf7517,0x12e51a51,0xb9cbe1f6,0x5e154897,0xd88edae3,0x77b66ca0,0xe4309c3c,0xf67f3746,0xf5555805,0xa36401ff,0x85fc37ba,0xd9499a53,0xdf86e2ca,0xecbc955b,0x6270b2a3 .long 0x974ad33b,0xafae64f5,0xfe7b2df1,0x04d85977,0x4ab03f73,0x2a3db3ff,0x8702740a,0x0b87878a,0x5a061732,0x6d263f01,0xa32a1901,0xc25430ce,0xdb155018,0xf7ebab3d,0x63a9b78e,0x3a86f693 .long 0xda9f3804,0x349ae368,0xa164349c,0x470f07fe,0x8562baa5,0xd52f4cc9,0x2b290df3,0xc74a9e86,0x43471a24,0xd3a1aa35,0xb8194511,0x239446be,0x81dcd44d,0xbec2dd00,0xc42ac82d,0xca3d7f0f .long 0xfdaf4520,0x1f3db085,0x4549daf2,0xbb6d3e80,0x19ad5c42,0xf5969d8a,0xdbfd1511,0x7052b13d,0x682b9060,0x11890d1b,0xac34452c,0xa71d3883,0x783805b4,0xa438055b,0x4725b23e,0x43241277 .long 0x4901bbed,0xf20cf96e,0xf432a2bb,0x6419c710,0xdfa9cd7d,0x57a0fbb9,0x00daa249,0x589111e4,0x7b60554e,0x19809a33,0xede283a4,0xea5f8887,0x503bfd35,0x2d713802,0x585d2a53,0x151bb0af .long 0x43b30ca8,0x40b08f74,0xd9934583,0xe10b5bba,0xb51110ad,0xe8a546d6,0x28e0b6c5,0x1dd50e66,0xcff2b821,0x292e9d54,0x47281760,0x3882555d,0x3724d6e3,0x134838f8,0x22ddcda1,0xf2c679e0 .long 0x6d2a5768,0x40ee8815,0x1c1e7e2d,0x7f227bd2,0xd04ff443,0x487ba134,0xc614e54b,0x76e2ff3d,0xa3177ec7,0x36b88d6f,0x2328fff5,0xbf731d51,0x49ba158e,0x758caea2,0x02938188,0x5ab8ff4c .long 0x35edc56d,0x33e16056,0x7e940d79,0x5a69d349,0x03866dcb,0x6c4fd001,0x4893cdef,0x20a38f57,0xfac3a15b,0xfbf3e790,0x7a4f8e6b,0x6ed7ea2e,0xbc3aca86,0xa663eb4f,0x080d53f7,0x22061ea5 .long 0xf546783f,0x2480dfe6,0x5a0a641e,0xd38bc6da,0x2ede8965,0xfb093cd1,0xacb455cf,0x89654db4,0x26e1adee,0x413cbf9a,0x373294d4,0x291f3764,0x648083fe,0x00797257,0x208cc341,0x25f504d3 .long 0xc3a0ee43,0x635a8e5e,0x679898ff,0x70aaebca,0x5dc63d56,0x9ee9f547,0xffb34d00,0xce987966,0x5e26310a,0xf9f86b19,0x382a8ca8,0x9e435484,0xc2352fe4,0x253bcb81,0x4474b571,0xa4eac8b0 .long 0xc1ad8cf8,0xc1b97512,0x99e0b697,0x193b4e9e,0x01e85df0,0x939d2716,0xcd44eafd,0x4fb265b3,0xe51e1ae2,0x321e7dcd,0xe3d8b096,0x8e3a8ca6,0x52604998,0x8de46cb0,0x39072aa7,0x91099ad8 .long 0x93aa96b8,0x2617f91c,0x7fca2e13,0x0fc8716b,0x95328723,0xa7106f5e,0x262e6522,0xd1c9c40b,0x42b7c094,0xb9bafe86,0x1543c021,0x1873439d,0x5cbefd5d,0xe1baa5de,0x521e8aff,0xa363fc5e .long 
0xf862eaac,0xefe6320d,0x22c647dc,0x14419c63,0x4e46d428,0x0e06707c,0x4a178f8f,0xcb6c834f,0xd30f917c,0x0f993a45,0x9879afee,0xd4c4b049,0x70500063,0xb6142a1e,0xa5d9d605,0x7c9b41c3 .long 0x2f8ba2c7,0xbc00fc2f,0x7c67aa28,0x0966eb2f,0x5a786972,0x13f7b516,0x8a2fbba0,0x3bfb7557,0x5a2b9620,0x131c4f23,0x6faf46be,0xbff3ed27,0x7e172323,0x9b4473d1,0x339f6246,0x421e8878 .long 0x25a41632,0x0fa8587a,0xa35b6c93,0xc0814124,0x59ebb8db,0x2b18a9f5,0x76edb29c,0x264e3357,0xc87c51e2,0xaf245ccd,0x501e6214,0x16b3015b,0x0a3882ce,0xbb31c560,0xfec11e04,0x6961bb94 .long 0xeff7a3a0,0x3b825b8d,0xb1df7326,0xbec33738,0x99604a1f,0x68ad747c,0x9a3bd499,0xd154c934,0x1cc7a906,0xac33506f,0x6c560e8f,0x73bb5392,0x263e3944,0x6428fcbe,0x1c387434,0xc11828d5 .long 0x3e4b12ff,0x3cd04be1,0x2d88667c,0xc3aad9f9,0x248120cf,0xc52ddcf8,0x2a389532,0x985a892e,0x3bb85fa0,0xfbb4b21b,0x8dfc6269,0xf95375e0,0x7ee2acea,0xfb4fb06c,0x309c4d1f,0x6785426e .long 0xd8ceb147,0x659b17c8,0xb70a5554,0x9b649eee,0xac6bc634,0x6b7fa0b5,0x1d6e732f,0xd99fe2c7,0x8d3abba2,0x30e6e762,0xa797b799,0x18fee6e7,0xc696464d,0x5c9d360d,0x27bfde12,0xe3baeb48 .long 0xf23206d5,0x2bf5db47,0x1d260152,0x2f6d3420,0x3f8ff89a,0x17b87653,0x378fa458,0x5157c30c,0x2d4fb936,0x7517c5c5,0xe6518cdc,0xef22f7ac,0xbf847a64,0xdeb483e6,0x92e0fa89,0xf5084558 .long 0xdf7304d4,0xab9659d8,0xff210e8e,0xb71bcf1b,0xd73fbd60,0xa9a2438b,0x5d11b4de,0x4595cd1f,0x4835859d,0x9c0d329a,0x7dbb6e56,0x4a0f0d2d,0xdf928a4e,0xc6038e5e,0x8f5ad154,0xc9429621 .long 0xf23f2d92,0x91213462,0x60b94078,0x6cab71bd,0x176cde20,0x6bdd0a63,0xee4d54bc,0x54c9b20c,0x9f2ac02f,0x3cd2d8aa,0x206eedb0,0x03f8e617,0x93086434,0xc7f68e16,0x92dd3db9,0x831469c5 .long 0x8f981354,0x8521df24,0x3588a259,0x587e23ec,0xd7a0992c,0xcbedf281,0x38961407,0x06930a55,0xbe5bbe21,0x09320deb,0x2491817f,0xa7ffa5b5,0x09065160,0xe6c8b4d9,0xfff6d2a9,0xac4f3992 .long 0x3ae9c1bd,0x7aa7a158,0xe37ce240,0xe0af6d98,0x28ab38b4,0xe54342d9,0x0a1c98ca,0xe8b75007,0xe02358f2,0xefce86af,0xea921228,0x31b8b856,0x0a1c67fc,0x052a1912,0xe3aead59,0xb4069ea4 .long 0x7fa03cb3,0x3232d6e2,0x0fdd7d88,0xdb938e5b,0x2ccbfc5d,0x04c1d2cd,0xaf3a580f,0xd2f45c12,0x7883e614,0x592620b5,0xbe7c5f26,0x5fd27e68,0x1567e1e3,0x139e45a9,0x44d8aaaf,0x2cc71d2d .long 0xe36d0757,0x4a9090cd,0xd9a29382,0xf722d7b1,0x04b48ddf,0xfb7fb04c,0xebe16f43,0x628ad2a7,0x20226040,0xcd3fbfb5,0x5104b6c4,0x6c34ecb1,0xc903c188,0x30c0754e,0x2d23cab0,0xec336b08 .long 0x1e206ee5,0x473d62a2,0x8c49a633,0xf1e27480,0xe9f6b2c3,0x87ab956c,0x62b606ea,0x61830b48,0xe78e815f,0x67cd6846,0x4c02082a,0xfe40139f,0x952ec365,0x52bbbfcb,0x6b9836ab,0x74c11642 .long 0x558df019,0x9f51439e,0xac712b27,0x230da4ba,0x55185a24,0x518919e3,0x84b78f50,0x4dcefcdd,0xa47d4c5a,0xa7d90fb2,0xb30e009e,0x55ac9abf,0x74eed273,0xfd2fc359,0xdbea8faf,0xb72d824c .long 0x4513e2ca,0xce721a74,0x38240b2c,0x0b418612,0xd5baa450,0x05199968,0x2b0e8c25,0xeb1757ed,0x3dfac6d5,0x6ebc3e28,0x48a237f5,0xb2431e2e,0x52f61499,0x2acb5e23,0xe06c936b,0x5558a2a7 .long 0xcbb13d1b,0xd213f923,0x5bfb9bfe,0x98799f42,0x701144a9,0x1ae8ddc9,0x4c5595ee,0x0b8b3bb6,0x3ecebb21,0x0ea9ef2e,0x3671f9a7,0x17cb6c4b,0x726f1d1f,0x47ef464f,0x6943a276,0x171b9484 .long 0x7ef0329c,0x51a4ae2d,0x91c4402a,0x08509222,0xafd45bbc,0x64a61d35,0x3035a851,0x38f096fe,0xa1dec027,0xc7468b74,0x4fc7dcba,0xe8cf10e7,0xf4a06353,0xea35ff40,0x8b77dd66,0x0b4c0dfa .long 0xde7e5c19,0x779b8552,0xc1c0256c,0xfab28609,0xabd4743d,0x64f58eee,0x7b6cc93b,0x4e8ef838,0x4cb1bf3d,0xee650d26,0x73dedf61,0x4c1f9d09,0xbfb70ced,0xaef7c9d7,0x1641de1e,0x1ec0507e .long 
0xcde45079,0xcd7e5cc7,0x516ac9e4,0xde173c9a,0xc170315c,0x517a8494,0x91d8e8fb,0x438fd905,0xc7d9630b,0x5145c506,0xf47d4d75,0x6457a87b,0x0d9a80e8,0xd31646bf,0xcef3aabe,0x453add2b .long 0xa607419d,0xc9941109,0xbb6bca80,0xfaa71e62,0x07c431f3,0x34158c13,0x992bc47a,0x594abebc,0xeb78399f,0x6dfea691,0x3f42cba4,0x48aafb35,0x077c04f0,0xedcd65af,0xe884491a,0x1a29a366 .long 0x1c21f2bf,0x023a40e5,0xa5057aee,0xf99a513c,0xbcab072e,0xa3fe7e25,0x40e32bcf,0x8568d2e1,0xd3f69d9f,0x904594eb,0x07affab1,0x181a9733,0xb6e330f4,0xe4d68d76,0xc75a7fc1,0x87a6dafb .long 0xef7d9289,0x549db2b5,0x197f015a,0x2480d4a8,0xc40493b6,0x61d5590b,0x6f780331,0x3a55b52e,0x309eadb0,0x40eb8115,0x92e5c625,0xdea7de5a,0xcc6a3d5a,0x64d631f0,0x93e8dd61,0x9d5e9d7c .long 0x206d3ffc,0xf297bef5,0x7d808bd4,0x23d5e033,0xd24cf5ba,0x4a4f6912,0x09cdaa8a,0xe4d8163b,0xd3082e8e,0x0e0de9ef,0x0192f360,0x4fe1246c,0x4b8eee0a,0x1f900150,0xf1da391b,0x5219da81 .long 0xf7ea25aa,0x7bf6a5c1,0xfbb07d5f,0xd165e6bf,0x89e78671,0xe3539361,0x2bac4219,0xa3fcac89,0xf0baa8ab,0xdfab6fd4,0xe2c1c2e5,0x5a4adac1,0x40d85849,0x6cd75e31,0x19b39181,0xce263fea .long 0x07032c72,0xcb6803d3,0x790968c8,0x7f40d5ce,0xdce978f0,0xa6de86bd,0x368f751c,0x25547c4f,0x65fb2a9e,0xb1e685fd,0x1eb9179c,0xce69336f,0x12504442,0xb15d1c27,0xb911a06b,0xb7df465c .long 0x315980cd,0xb8d804a3,0xfa3bebf7,0x693bc492,0x2253c504,0x3578aeee,0xcd2474a2,0x158de498,0xcfda8368,0x1331f5c7,0x78d7177e,0xd2d7bbb3,0xf3c1e46e,0xdf61133a,0xd30e7be8,0x5836ce7d .long 0x94f834cb,0x83084f19,0x429ed782,0xd35653d4,0x59e58243,0xa542f16f,0x0470a22d,0xc2b52f65,0x18f23d96,0xe3b6221b,0x3f5252b4,0xcb05abac,0x87d61402,0xca00938b,0x411933e4,0x2f186cdd .long 0x9a29a5c5,0xe042ece5,0x3b6c8402,0xb19b3c07,0x19d92684,0xc97667c7,0xebc66372,0xb5624622,0x3c04fa02,0x0cb96e65,0x8eaa39aa,0x83a7176c,0xeaa1633f,0x2033561d,0x4533df73,0x45a9d086 .long 0x3dc090bc,0xe0542c1d,0xaa59c167,0x82c996ef,0x0ee7fc4d,0xe3f735e8,0x7c35db79,0x7b179393,0xf8c5dbfd,0xb6419e25,0x1f327b04,0x4d9d7a1e,0x298dfca8,0x979f6f9b,0x8de9366a,0xc7c5dff1 .long 0x04c82bdd,0x1b7a588d,0xf8319dfd,0x68005534,0xd8eb9580,0xde8a55b5,0x8d5bca81,0x5ea886da,0x252a0b4d,0xe8530a01,0x35eaa0a1,0x1bffb4fe,0xd8e99563,0x2ad828b1,0x95f9cd87,0x7de96ef5 .long 0xd77d970c,0x4abb2d0c,0xd33ef9cb,0x03cfb933,0x8b211fe9,0xb0547c01,0xa56ed1c6,0x2fe64809,0xc2ac98cc,0xcb7d5624,0x1a393e33,0x2a1372c0,0x29660521,0xc8d1ec1c,0xb37ac3e9,0xf3d31b04 .long 0x5ece6e7c,0xa29ae9df,0x0facfb55,0x0603ac8f,0xdda233a5,0xcfe85b7a,0xbd75f0b8,0xe618919f,0x99bf1603,0xf555a3d2,0xf184255a,0x1f43afc9,0x319a3e02,0xdcdaf341,0x03903a39,0xd3b117ef .long 0x65d1d131,0xe095da13,0xc37ad03e,0x86f16367,0x462cd8dd,0x5f37389e,0xd67a60e6,0xc103fa04,0xf4b478f0,0x57c34344,0xe117c98d,0xce91edd8,0x231fc12e,0x001777b0,0xb207bccb,0x11ae47f2 .long 0x20f8a242,0xd983cf8d,0xf22e1ad8,0x7aff5b1d,0x7fc4feb3,0x68fd11d0,0xb0f1c3e1,0x5d53ae90,0xec041803,0x50fb7905,0x14404888,0x85e3c977,0xac628d8f,0x0e67faed,0x6668532c,0x2e865150 .long 0x6a67a6b0,0x15acaaa4,0xb25cec41,0xf4cdee25,0xe4c6701e,0x49ee565a,0xfc7d63d8,0x2a04ca66,0xef0543fb,0xeb105018,0xd1b0d81d,0xf709a4f5,0x2915d333,0x5b906ee6,0x96f1f0ab,0xf4a87412 .long 0x4d82f4c2,0xb6b82fa7,0x6804efb3,0x90725a60,0xadc3425e,0xbc82ec46,0x2787843e,0xb7b80581,0xdd1fc74c,0xdf46d91c,0xe783a6c4,0xdc1c62cb,0x1a04cbba,0x59d1b9f3,0x95e40764,0xd87f6f72 .long 0x317f4a76,0x02b4cfc1,0x91036bce,0x8d2703eb,0xa5e72a56,0x98206cc6,0xcf53fb0f,0x57be9ed1,0xef0b17ac,0x09374571,0xd9181b38,0x74b2655e,0x89935d0e,0xc8f80ea8,0x91529936,0xc0d9e942 .long 
0x1e84e0e5,0x19686041,0xaea34c93,0xa5db84d3,0x7073a732,0xf9d5bb19,0x6bcfd7c0,0xb8d2fe56,0xf3eb82fa,0x45775f36,0xfdff8b58,0x8cb20ccc,0x8374c110,0x1659b65f,0x330c789a,0xb8b4a422 .long 0x6fe8208b,0x75e3c3ea,0x286e78fe,0xbd74b9e4,0xd7d93a1a,0x0be2e81b,0xdd0a5aae,0x7ed06e27,0x6be8b800,0x721f5a58,0xd846db28,0x428299d1,0x5be88ed3,0x95cb8e6b,0x1c034e11,0xc3186b23 .long 0x8977d99b,0xa6312c9e,0x83f531e7,0xbe944331,0x18d3b1d4,0x8232c0c2,0xe1247b73,0x617aae8b,0x282aec3b,0x40153fc4,0xf7b8f823,0xc6063d2f,0x3304f94c,0x68f10e58,0xee676346,0x31efae74 .long 0x40a9b97c,0xbadb6c6d,0x4f666256,0x14702c63,0x5184b2e3,0xdeb954f1,0x94b6ca40,0x5184a526,0x003c32ea,0xfff05337,0x205974c7,0x5aa374dd,0x4b0dd71a,0x9a763854,0xdeb947ec,0x459cd27f .long 0x459c2b92,0xa6e28161,0x75ee8ef5,0x2f020fa8,0x30b06310,0xb132ec2d,0xbc6a4530,0xc3e15899,0xaa3f451a,0xdc5f53fe,0xc2d9acac,0x3a3c7f23,0x6b27e58b,0x2ec2f892,0xd742799f,0x68466ee7 .long 0x1fa26613,0x98324dd4,0xbdc29d63,0xa2dc6dab,0xd712d657,0xf9675faa,0x21fd8d15,0x813994be,0xfd4f7553,0x5ccbb722,0xf3a36b20,0x5135ff8b,0x69559df5,0x44be28af,0x9d41bf30,0x40b65bed .long 0x3734e520,0xd98bf2a4,0x209bdcba,0x5e3abbe3,0xbc945b35,0x77c76553,0xc6ef14aa,0x5331c093,0x76b60c80,0x518ffe29,0x7ace16f8,0x2285593b,0xbe2b9784,0xab1f64cc,0xab2421b6,0xe8f2c0d9 .long 0xc1df065c,0x617d7174,0x5f6578fa,0xafeeb5ab,0x263b54a8,0x16ff1329,0xc990dce3,0x45c55808,0xecc8c177,0x42eab6c0,0x5982ecaa,0x799ea9b5,0xb607ef8e,0xf65da244,0x32a3fc2c,0x8ab226ce .long 0x7ea973dc,0x745741e5,0x20888f2e,0x5c00ca70,0x45fd9cf1,0x7cdce3cf,0x5507f872,0x8a741ef1,0x196b4cec,0x47c51c2f,0xc97ea618,0x70d08e43,0x15b18a2b,0x930da15c,0x2f610514,0x33b6c678 .long 0x07ac9794,0xc662e4f8,0xba06cb79,0x1eccf050,0xe7d954e5,0x1ff08623,0x24cf71c3,0x6ef2c5fb,0x67978453,0xb2c063d2,0x1d654af8,0xa0cf3796,0x7ebdaa37,0x7cb242ea,0xb86747e0,0x206e0b10 .long 0xd5ecfefc,0x481dae5f,0xc2bff8fc,0x07084fd8,0xea324596,0x8040a01a,0xd4de4036,0x4c646980,0xd65abfc3,0x9eb8ab4e,0x13541ec7,0xe01cb91f,0xfd695012,0x8f029adb,0x3c7569ec,0x9ae28483 .long 0xa66d80a1,0xa5614c9e,0x75f5f911,0x680a3e44,0xceba4fc1,0x0c07b14d,0xa13071c1,0x891c285b,0x799ece3c,0xcac67ceb,0x41e07e27,0x29b910a9,0xf2e43123,0x66bdb409,0x7ac9ecbe,0x06f8b137 .long 0x38547090,0x5981fafd,0x85e3415d,0x19ab8b9f,0xc7e31b27,0xfc28c194,0x6fbcbb42,0x843be0aa,0xa6db836c,0xf3b1ed43,0x01a45c05,0x2a1330e4,0x95c1a377,0x4f19f3c5,0x44b5ee33,0xa85f39d0 .long 0x4ae52834,0x3da18e6d,0x7423dcb0,0x5a403b39,0xf2374aef,0xbb555e0a,0x1e8ca111,0x2ad599c4,0x014b3bf8,0x1b3a2fb9,0xf66d5007,0x73092684,0xc4340102,0x079f1426,0x8fddf4de,0x1827cf81 .long 0xf10ff927,0xc83605f6,0x23739fc6,0xd3871451,0xcac1c2cc,0x6d163450,0xa2ec1ac5,0x6b521296,0x6e3cb4a5,0x0606c4f9,0x778abff7,0xe47d3f41,0xbe8e3a45,0x425a8d5e,0xa6102160,0x53ea9e97 .long 0x39cbb688,0x477a106e,0xf3386d32,0x532401d2,0xb1b9b421,0x8e564f64,0x81dad33f,0xca9b8388,0x2093913e,0xb1422b4e,0x69bc8112,0x533d2f92,0xebe7b2c7,0x3fa017be,0xcaf197c6,0xb2767c4a .long 0xaedbae9f,0xc925ff87,0x36880a54,0x7daf0eb9,0x9c4d0e71,0x9284ddf5,0x316f8cf5,0x1581cf93,0x3ac1f452,0x3eeca887,0xfb6aeffe,0xb417fce9,0xeefb8dc3,0xa5918046,0x02209400,0x73d318ac .long 0x728693e5,0xe800400f,0x339927ed,0xe87d814b,0x57ea9910,0x93e94d3b,0x2245fb69,0xff8a35b6,0x7f200d34,0x043853d7,0x0f653ce1,0x470f1e68,0x59a06379,0x81ac05bd,0x03930c29,0xa14052c2 .long 0x26bc2797,0x6b72fab5,0x99f16771,0x13670d16,0x1e3e48d1,0x00170052,0xb7adf678,0x978fe401,0xd41c5dd4,0x55ecfb92,0xc7b27da5,0x5ff8e247,0x013fb606,0xe7518272,0x2f547a3c,0x5768d7e5 .long 
0x60017a5f,0xbb24eaa3,0x9c64ce9b,0x6b18e6e4,0x103dde07,0xc225c655,0x7592f7ea,0xfc3672ae,0xd06283a1,0x9606ad77,0xe4d59d99,0x542fc650,0x2a40e7c2,0xabb57c49,0xa8db9f55,0xac948f13 .long 0xb04465c3,0x6d4c9682,0x6468bd15,0xe3d062fa,0x5f318d7e,0xa51729ac,0x9eb6fc95,0x1fc87df6,0x0591f652,0x63d146a8,0x589621aa,0xa861b8f7,0xce31348c,0x59f5f15a,0x440da6da,0x8f663391 .long 0xb591ffa3,0xcfa778ac,0x4cdfebce,0x027ca9c5,0x444ea6b3,0xbe8e05a5,0xa78d8254,0x8aab4e69,0xb474d6b8,0x2437f04f,0x045b3855,0x6597ffd4,0xca47ecaa,0xbb0aea4e,0x85c7ebfc,0x568aae83 .long 0xc73b2383,0x0e966e64,0xd17d8762,0x49eb3447,0x8da05dab,0xde107821,0x016b7236,0x443d8baa,0xea7610d6,0x163b63a5,0xce1ca979,0xe47e4185,0x80baa132,0xae648b65,0x0e0d5b64,0xebf53de2 .long 0xd3c8c1ca,0x8d3bfcb4,0x5d04b309,0x0d914ef3,0x3de7d395,0x55ef6415,0x26b850e8,0xbde1666f,0xd449ab19,0xdbe1ca6e,0xe89a2672,0x8902b322,0xdacb7a53,0xb1674b7e,0xf52523ff,0x8e9faf6e .long 0x9a85788b,0x6ba535da,0xbd0626d4,0xd21f03ae,0xe873dc64,0x099f8c47,0x018ec97e,0xcda8564d,0xde92c68c,0x3e8d7a5c,0x73323cc4,0x78e035a1,0xf880ff7c,0x3ef26275,0x273eedaa,0xa4ee3dff .long 0xaf4e18f8,0x58823507,0x0672f328,0x967ec9b5,0x559d3186,0x9ded19d9,0x6cdce39c,0x5e2ab3de,0x11c226df,0xabad6e4d,0x87723014,0xf9783f43,0x1a885719,0x9a49a0cf,0x90da9dbf,0xfc0c1a5a .long 0x571d92ac,0x8bbaec49,0x4692517f,0x569e85fe,0xa14ea4af,0x8333b014,0x12e5c5ad,0x32f2a62f,0x06d89b85,0x98c2ce3a,0x2ff77a08,0xb90741aa,0x01f795a2,0x2530defc,0x84b3c199,0xd6e5ba0b .long 0x12e4c936,0x7d8e8451,0xbd0be17b,0xae419f7d,0x22262bc9,0xa583fc8c,0x91bfe2bd,0x6b842ac7,0x440d6827,0x33cef4e9,0xef81fb14,0x5f69f4de,0x234fbb92,0xf16cf6f6,0xd9e7e158,0x76ae3fc3 .long 0xe9740b33,0x4e89f6c2,0x4962d6a1,0x677bc85d,0x68d10d15,0x6c6d8a7f,0x0257b1cd,0x5f9a7224,0x4ad85961,0x7096b916,0xe657ab4a,0x5f8c47f7,0xf7461d7e,0xde57d7d0,0x80ce5ee2,0x7eb6094d .long 0x34190547,0x0b1e1dfd,0xf05dd150,0x8a394f43,0x97df44e6,0x0a9eb24d,0x87675719,0x78ca06bf,0x6ffeec22,0x6f0b3462,0x36cdd8fb,0x9d91bcea,0xa105be47,0xac83363c,0x069710e3,0x81ba76c1 .long 0x28c682c6,0x3d1b24cb,0x8612575b,0x27f25228,0xe8e66e98,0xb587c779,0x405eb1fe,0x7b0c03e9,0x15b548e7,0xfdf0d030,0x38b36af7,0xa8be76e0,0x4f310c40,0x4cdab04a,0xf47ecaec,0x6287223e .long 0x8b399320,0x678e6055,0xc01e4646,0x61fe3fa6,0x03261a5e,0xc482866b,0x5c2f244a,0xdfcf45b8,0x2f684b43,0x8fab9a51,0xc7220a66,0xf796c654,0xf5afa58f,0x1d90707e,0x4fdbe0de,0x2c421d97 .long 0xaf2ebc2f,0xc4f4cda3,0xcb4efe24,0xa0af843d,0x9ccd10b1,0x53b857c1,0x914d3e04,0xddc9d1eb,0x62771deb,0x7bdec8bb,0x91c5aa81,0x829277aa,0x832391ae,0x7af18dd6,0xc71a84ca,0x1740f316 .long 0xeeaf8c49,0x8928e99a,0x6e24d728,0xee7aa73d,0xe72b156c,0x4c5007c2,0xed408a1d,0x5fcf57c5,0xb6057604,0x9f719e39,0xc2868bbf,0x7d343c01,0x7e103e2d,0x2cca254b,0xf131bea2,0xe6eb38a9 .long 0x8be762b4,0xb33e624f,0x058e3413,0x2a9ee4d1,0x67d805fa,0x968e6369,0x7db8bfd7,0x9848949b,0xd23a8417,0x5308d7e5,0xf3e29da5,0x892f3b1d,0x3dee471f,0xc95c139e,0xd757e089,0x8631594d .long 0xde918dcc,0xe0c82a3c,0x26fdcf4b,0x2e7b5994,0x32cb1b2d,0x82c50249,0x7657ae07,0xea613a9d,0xf1fdc9f7,0xc2eb5f6c,0x879fe682,0xb6eae8b8,0x591cbc7f,0x253dfee0,0x3e1290e6,0x000da713 .long 0x1f095615,0x1083e2ea,0x14e68c33,0x0a28ad77,0x3d8818be,0x6bfc0252,0xf35850cd,0xb585113a,0x30df8aa1,0x7d935f0b,0x4ab7e3ac,0xaddda07c,0x552f00cb,0x92c34299,0x2909df6c,0xc33ed1de .long 0x80e87766,0x22c2195d,0x9ddf4ac0,0x9e99e6d8,0x65e74934,0x09642e4e,0xff1ff241,0x2610ffa2,0x751c8159,0x4d1d47d4,0xaf3a9363,0x697b4985,0x87477c33,0x0318ca46,0x9441eff3,0xa90cb565 .long 
0x36f024cb,0x58bb3848,0x36016168,0x85be1f77,0xdc7e07f1,0x6c59587c,0xaf1d8f02,0x191be071,0xcca5e55c,0xbf169fa5,0xf7d04eac,0x3864ba3c,0x8d7d05db,0x915e367f,0xa6549e5d,0xb48a876d .long 0x580e40a2,0xef89c656,0x728068bc,0xf194ed8c,0xa47990c9,0x74528045,0x5e1a4649,0xf53fc7d7,0x78593e7d,0xbec5ae9b,0x41db65d7,0x2cac4ee3,0x04a3d39b,0xa8c1eb24,0x03f8f3ef,0x53b7d634 .long 0x3e07113c,0x2dc40d48,0x7d8b63ae,0x6e4a5d39,0x79684c2b,0x5582a94b,0x622da26c,0x932b33d4,0x0dbbf08d,0xf534f651,0x64c23a52,0x211d07c9,0xee5bdc9b,0x0eeece0f,0xf7015558,0xdf178168 .long 0x0a712229,0xd4294635,0x09273f8c,0x93cbe448,0x8f13bc83,0x00b095ef,0x8798978c,0xbb741972,0x56dbe6e7,0x9d7309a2,0x5a5d39ec,0xe578ec56,0x851f9a31,0x3961151b,0xe5709eb4,0x2da7715d .long 0x53dfabf0,0x867f3017,0xb8e39259,0x728d2078,0x815d9958,0x5c75a0cd,0x16603be1,0xf84867a6,0x70e35b1c,0xc865b13d,0x19b03e2c,0x02414468,0xac1f3121,0xe46041da,0x6f028a7c,0x7c9017ad .long 0x0a482873,0xabc96de9,0xb77e54d4,0x4265d6b1,0xa57d88e7,0x68c38e79,0x9ce82de3,0xd461d766,0x64a7e489,0x817a9ec5,0xa0def5f2,0xcc5675cd,0x985d494e,0x9a00e785,0x1b03514a,0xc626833f .long 0x83cdd60e,0xabe7905a,0xa1170184,0x50602fb5,0xb023642a,0x689886cd,0xa6e1fb00,0xd568d090,0x0259217f,0x5b1922c7,0xc43141e4,0x93831cd9,0x0c95f86e,0xdfca3587,0x568ae828,0xdec2057a .long 0xf98a759a,0xc44ea599,0xf7c23c1d,0x55a0a7a2,0x94c4f687,0xd5ffb6e6,0x12848478,0x3563cce2,0xe7b1fbe1,0x812b3517,0x4f7338e0,0x8a7dc979,0x52d048db,0x211ecee9,0xc86ea3b8,0x2eea4056 .long 0xba772b34,0xd8cb68a7,0x5f4e2541,0xe16ed341,0x0fec14db,0x9b32f6a6,0x391698be,0xeee376f7,0x83674c02,0xe9a7aa17,0x5843022a,0x65832f97,0x5ba4990f,0x29f3a8da,0xfb8e3216,0x79a59c3a .long 0xbd19bb16,0x9cdc4d2e,0xb3262d86,0xc6c7cfd0,0x969c0b47,0xd4ce14d0,0x13e56128,0x1fa352b7,0x973db6d3,0x383d55b8,0xe8e5b7bf,0x71836850,0xe6bb571f,0xc7714596,0x2d5b2dd2,0x259df31f .long 0x913cc16d,0x568f8925,0xe1a26f5a,0x18bc5b6d,0xf5f499ae,0xdfa413be,0xc3f0ae84,0xf8835dec,0x65a40ab0,0xb6e60bd8,0x194b377e,0x65596439,0x92084a69,0xbcd85625,0x4f23ede0,0x5ce433b9 .long 0x6ad65143,0xe8e8f04f,0xd6e14af6,0x11511827,0x8295c0c7,0x3d390a10,0x621eba16,0x71e29ee4,0x63717b46,0xa588fc09,0xe06ad4a2,0x02be02fe,0x04c22b22,0x931558c6,0x12f3c849,0xbb4d4bd6 .long 0x20efd662,0x54a4f496,0xc5952d14,0x92ba6d20,0xcc9784c2,0x2db8ea1e,0x4b353644,0x81cc10ca,0x4b4d7f6c,0x40b570ad,0x84a1dcd2,0x5c9f1d96,0x3147e797,0x01379f81,0x2bd499f5,0xe5c6097b .long 0x328e5e20,0x40dcafa6,0x54815550,0xf7b5244a,0x47bfc978,0xb9a4f118,0xd25825b1,0x0ea0e79f,0x646c7ecf,0xa50f96eb,0x446dea9d,0xeb811493,0xdfabcf69,0x2af04677,0xc713f6e8,0xbe3a068f .long 0x42e06189,0x860d523d,0x4e3aff13,0xbf077941,0xc1b20650,0x0b616dca,0x2131300d,0xe66dd6d1,0xff99abde,0xd4a0fd67,0xc7aac50d,0xc9903550,0x7c46b2d7,0x022ecf8b,0x3abf92af,0x3333b1e8 .long 0x6c491c14,0x11cc113c,0x80dd3f88,0x05976688,0x29d932ed,0xf5b4d9e7,0xa2c38b6d,0xe982aad8,0x8be0dcf0,0x6f925347,0x65ca53f2,0x700080ae,0x443ca77f,0xd8131156,0xec51f984,0xe92d6942 .long 0x85dfe9ae,0xd2a08af8,0x4d2a86ca,0xd825d9a5,0x39dff020,0x2c53988d,0x430cdc40,0xf38b135a,0x62a7150b,0x0c918ae0,0x0c340e9b,0xf31fd8de,0x4dbbf02e,0xafa0e7ae,0x5eba6239,0x5847fb2a .long 0xdccbac8b,0x6b1647dc,0x06f485c8,0xb642aa78,0x7038ecdf,0x873f3765,0xfa49d3fe,0x2ce5e865,0xc98c4400,0xea223788,0xf1fa5279,0x8104a8cd,0x06becfd7,0xbcf7cc7a,0xc8f974ae,0x49424316 .long 0x84d6365d,0xc0da65e7,0x8f759fb8,0xbcb7443f,0x7ae81930,0x35c712b1,0x4c6e08ab,0x80428dff,0xa4faf843,0xf19dafef,0xffa9855f,0xced8538d,0xbe3ac7ce,0x20ac409c,0x882da71e,0x358c1fb6 .long 
0xfd349961,0xafa9c0e5,0x8421c2fc,0x2b2cfa51,0xf3a28d38,0x2a80db17,0x5d138e7e,0xa8aba539,0x6e96eb8d,0x52012d1d,0xcbaf9622,0x65d8dea0,0xb264f56c,0x57735447,0x1b6c8da2,0xbeebef3f .long 0xce785254,0xfc346d98,0xbb64a161,0xd50e8d72,0x49794add,0xc03567c7,0x752c7ef6,0x15a76065,0x961f23d6,0x59f3a222,0x73ecc0b0,0x378e4438,0x5a82fde4,0xc74be434,0xd8b9cf34,0xae509af2 .long 0x577f44a1,0x4a61ee46,0xb611deeb,0xe09b748c,0xf5f7b884,0xc0481b2c,0x61acfa6b,0x35626678,0xbf8d21e6,0x37f4c518,0xb205a76d,0x22d96531,0x954073c0,0x37fb85e1,0x65b3a567,0xbceafe4f .long 0xbe42a582,0xefecdef7,0x65046be6,0xd3fc6080,0x09e8dba9,0xc9af13c8,0x641491ff,0x1e6c9847,0xd30c31f7,0x3b574925,0xac2a2122,0xb7eb72ba,0xef0859e7,0x776a0dac,0x21900942,0x06fec314 .long 0xf8c22049,0x2464bc10,0x875ebf69,0x9bfbcce7,0x4336326b,0xd7a88e2a,0x5bc2acfa,0xda05261c,0xeba7efc8,0xc29f5bdc,0x25dbbf2e,0x471237ca,0x2975f127,0xa72773f2,0x04d0b326,0xdc744e8e .long 0xa56edb73,0x38a7ed16,0x2c007e70,0x64357e37,0x5080b400,0xa167d15b,0x23de4be1,0x07b41164,0x74c89883,0xb2d91e32,0x2882e7ed,0x3c162821,0x7503e482,0xad6b36ba,0x0ea34331,0x48434e8e .long 0x2c7ae0b9,0x79f4f24f,0x1939b44a,0xc46fbf81,0x56595eb1,0x76fefae8,0xcd5f29c7,0x417b66ab,0xc5ceec20,0x5f2332b2,0xe1a1cae2,0xd69661ff,0x9b0286e6,0x5ede7e52,0xe276b993,0x9d062529 .long 0x7e50122b,0x324794b0,0x4af07ca5,0xdd744f8b,0xd63fc97b,0x30a12f08,0x76626d9d,0x39650f1a,0x1fa38477,0x101b47f7,0xd4dc124f,0x3d815f19,0xb26eb58a,0x1569ae95,0x95fb1887,0xc3cde188 .long 0xf9539a48,0x54e9f37b,0x7408c1a5,0xb0100e06,0xea580cbb,0x821d9811,0x86e50c56,0x8af52d35,0xdbbf698b,0xdfbd9d47,0x03dc1c73,0x2961a1ea,0xe76a5df8,0x203d38f8,0x6def707a,0x08a53a68 .long 0x1bee45d4,0x26eefb48,0x3c688036,0xb3cee346,0xc42f2469,0x463c5315,0x81378162,0x19d84d2e,0x1c4d349f,0x22d7c3c5,0x163d59c5,0x65965844,0xb8abceae,0xcf198c56,0x628559d5,0x6fb1fb1b .long 0x07bf8fe3,0x8bbffd06,0x3467734b,0x46259c58,0x35f7f0d3,0xd8953cea,0xd65b0ff1,0x1f0bece2,0xf3c72914,0xf7d5b4b3,0x3cb53389,0x29e8ea95,0x836b6d46,0x4a365626,0xea174fde,0xe849f910 .long 0xf4737f21,0x7ec62fbb,0x6209f5ac,0xd8dba5ab,0xa5f9adbe,0x24b5d7a9,0xa61dc768,0x707d28f7,0xcaa999ea,0x7711460b,0x1c92e4cc,0xba7b174d,0x18d4bf2d,0x3c4bab66,0xeb8bd279,0xb8f0c980 .long 0x324b4737,0x024bea9a,0x32a83bca,0xfba9e423,0xa232dced,0x6e635643,0x2571c8ba,0x99619367,0x54b7032b,0xe8c9f357,0x2442d54a,0xf936b3ba,0x8290c65a,0x2263f0f0,0xee2c7fdb,0x48989780 .long 0x13d4f95e,0xadc5d55a,0xad9b8500,0x737cff85,0x8a73f43d,0x271c557b,0xe18bc476,0xbed617a4,0x7dfd8ab2,0x66245401,0x3a2870aa,0xae7b89ae,0x23a7e545,0x1b555f53,0xbe057e4c,0x6791e247 .long 0x324fa34d,0x860136ad,0x4cbeae28,0xea111447,0xbedd3299,0x023a4270,0xc1c35c34,0x3d5c3a7f,0x8d0412d2,0xb0f6db67,0xfcdc6b9a,0xd92625e2,0x4e28a982,0x92ae5ccc,0x47a3ce7e,0xea251c36 .long 0x790691bf,0x9d658932,0x06b736ae,0xed610589,0xc0d63b6e,0x712c2f04,0xc63d488f,0x5cf06fd5,0xd9588e41,0x97363fac,0x2b93257e,0x1f9bf762,0x667acace,0xa9d1ffc4,0x0a061ecf,0x1cf4a1aa .long 0xdc1818d0,0x40e48a49,0xa3621ab0,0x0643ff39,0xe39ef639,0x5768640c,0x04d86854,0x1fc099ea,0xeccd28fd,0x9130b9c3,0x7eec54ab,0xd743cbd2,0xe5b475b6,0x052b146f,0x900a7d1f,0x058d9a82 .long 0x91262b72,0x65e02292,0xbb0edf03,0x96f924f9,0xfe206842,0x5cfa59c8,0x5eafa720,0xf6037004,0x18d7dd96,0x5f30699e,0xcbab2495,0x381e8782,0xdd8be949,0x91669b46,0x26aae8ef,0xb40606f5 .long 0xfc6751a4,0x2812b839,0xfba800ef,0x16196214,0x4c1a2875,0x4398d5ca,0x653d8349,0x720c00ee,0xd820007c,0xc2699eb0,0xa39b5825,0x880ee660,0x471f6984,0x70694694,0xe3dda99a,0xf7d16ea8 .long 
0xc0519a23,0x28d675b2,0x4f6952e3,0x9ebf94fe,0xa2294a8a,0xf28bb767,0xfe0af3f5,0x85512b4d,0x99b16a0d,0x18958ba8,0xba7548a7,0x95c2430c,0xa16be615,0xb30d1b10,0x85bfb74c,0xe3ebbb97 .long 0x18549fdb,0xa3273cfe,0x4fcdb792,0xf6e200bf,0x83aba56c,0x54a76e18,0x89ef6aa2,0x73ec66f6,0xd1b9a305,0x8d17add7,0xb7ae1b9d,0xa959c5b9,0x6bcc094a,0x88643522,0xd7d429b9,0xcc5616c4 .long 0xe6a33f7c,0xa6dada01,0x9d4e70ad,0xc6217a07,0x09c15b7c,0xd619a818,0x0e80c854,0xea06b329,0xa5f5e7b9,0x174811ce,0x787c65f4,0x66dfc310,0x3316ab54,0x4ea7bd69,0x1dcc0f70,0xc12c4acb .long 0x1e407dd9,0xe4308d1a,0x91afa997,0xe8a3587c,0xab77b7a5,0xea296c12,0x673c0d52,0xb5ad49e4,0x7006085a,0x40f9b2b2,0x87bf6ec2,0xa88ff340,0x4e3066a6,0x978603b1,0xb5e486e2,0xb3f99fc2 .long 0xb2e63645,0x07b53f5e,0x84c84232,0xbe57e547,0x7214d5cf,0xd779c216,0x029a3aca,0x617969cd,0x8a7017a0,0xd17668cd,0xbe9b7ee8,0x77b4d19a,0x9c161776,0x58fd0e93,0xd5968a72,0xa8c4f4ef .long 0x67b3de77,0x296071cc,0x634f7905,0xae3c0b8e,0x8a7100c9,0x67e440c2,0xeb4b9b42,0xbb8c3c1b,0xc51b3583,0x6d71e8ea,0x9525e642,0x7591f5af,0x13f509f3,0xf73a2f7b,0x5619ac9b,0x618487aa .long 0x9d61718a,0x3a72e5f7,0x7592d28c,0x00413bcc,0x963c35cf,0x7d9b11d3,0xb90a46ed,0x77623bcf,0xdcdd2a50,0xdeef273b,0x0601846e,0x4a741f9b,0x0ec6e929,0x33b89e51,0x8b7f22cd,0xcb02319f .long 0x084bae24,0xbbe1500d,0x343d2693,0x2f0ae8d7,0x7cdef811,0xacffb5f2,0x263fb94f,0xaa0c030a,0xa0f442de,0x6eef0d61,0x27b139d3,0xf92e1817,0x0ad8bc28,0x1ae6deb7,0xc0514130,0xa89e38dc .long 0xd2fdca23,0x81eeb865,0xcc8ef895,0x5a15ee08,0x01905614,0x768fa10a,0x880ee19b,0xeff5b8ef,0xcb1c8a0e,0xf0c0cabb,0xb8c838f9,0x2e1ee9cd,0x8a4a14c0,0x0587d8b8,0x2ff698e5,0xf6f27896 .long 0x89ee6256,0xed38ef1c,0x6b353b45,0xf44ee1fe,0x70e903b3,0x9115c0c7,0x818f31df,0xc78ec0a1,0xb7dccbc6,0x6c003324,0x163bbc25,0xd96dd1f3,0x5cedd805,0x33aa82dd,0x7f7eb2f1,0x123aae4f .long 0xa26262cd,0x1723fcf5,0x0060ebd5,0x1f7f4d5d,0xb2eaa3af,0xf19c5c01,0x9790accf,0x2ccb9b14,0x52324aa6,0x1f9c1cad,0x7247df54,0x63200526,0xbac96f82,0x5732fe42,0x01a1c384,0x52fe771f .long 0xb1001684,0x546ca13d,0xa1709f75,0xb56b4eee,0xd5db8672,0x266545a9,0x1e8f3cfb,0xed971c90,0xe3a07b29,0x4e7d8691,0xe4b696b9,0x7570d9ec,0x7bc7e9ae,0xdc5fa067,0xc82c4844,0x68b44caf .long 0xbf44da80,0x519d34b3,0x5ab32e66,0x283834f9,0x6278a000,0x6e608797,0x627312f6,0x1e62960e,0xe6901c55,0x9b87b27b,0x24fdbc1f,0x80e78538,0x2facc27d,0xbbbc0951,0xac143b5a,0x06394239 .long 0x376c1944,0x35bb4a40,0x63da1511,0x7cb62694,0xb7148a3b,0xafd29161,0x4e2ea2ee,0xa6f9d9ed,0x880dd212,0x15dc2ca2,0xa61139a9,0x903c3813,0x6c0f8785,0x2aa7b46d,0x901c60ff,0x36ce2871 .long 0xe10d9c12,0xc683b028,0x032f33d3,0x7573baa2,0x67a31b58,0x87a9b1f6,0xf4ffae12,0xfd3ed11a,0x0cb2748e,0x83dcaa9a,0x5d6fdf16,0x8239f018,0x72753941,0xba67b49c,0xc321cb36,0x2beec455 .long 0x3f8b84ce,0x88015606,0x8d38c86f,0x76417083,0x598953dd,0x054f1ca7,0x4e8e7429,0xc939e110,0x5a914f2f,0x9b1ac2b3,0xe74b8f9c,0x39e35ed3,0x781b2fb0,0xd0debdb2,0x2d997ba2,0x1585638f .long 0x9e2fce99,0x9c4b646e,0x1e80857f,0x68a21081,0x3643b52a,0x06d54e44,0x0d8eb843,0xde8d6d63,0x42146a0a,0x70321563,0x5eaa3622,0x8ba826f2,0x86138787,0x227a58bd,0x10281d37,0x43b6c03c .long 0xb54dde39,0x6326afbb,0xdb6f2d5f,0x744e5e8a,0xcff158e1,0x48b2a99a,0xef87918f,0xa93c8fa0,0xde058c5c,0x2182f956,0x936f9e7a,0x216235d2,0xd2e31e67,0xace0c0db,0xf23ac3e7,0xc96449bf .long 0x170693bd,0x7e9a2874,0xa45e6335,0xa28e14fd,0x56427344,0x5757f6b3,0xacf8edf9,0x822e4556,0xe6a285cd,0x2b7a6ee2,0xa9df3af0,0x5866f211,0xf845b844,0x40dde2dd,0x110e5e49,0x986c3726 .long 
0xf7172277,0x73680c2a,0x0cccb244,0x57b94f0f,0x2d438ca7,0xbdff7267,0xcf4663fd,0xbad1ce11,0xd8f71cae,0x9813ed9d,0x961fdaa6,0xf43272a6,0xbd6d1637,0xbeff0119,0x30361978,0xfebc4f91 .long 0x2f41deff,0x02b37a95,0xe63b89b7,0x0e44a59a,0x143ff951,0x673257dc,0xd752baf4,0x19c02205,0xc4b7d692,0x46c23069,0xfd1502ac,0x2e6392c3,0x1b220846,0x6057b1a2,0x0c1b5b63,0xe51ff946 .long 0x566c5c43,0x6e85cb51,0x3597f046,0xcff9c919,0x4994d94a,0x9354e90c,0x2147927d,0xe0a39332,0x0dc1eb2b,0x8427fac1,0x2ff319fa,0x88cfd8c2,0x01965274,0xe2d4e684,0x67aaa746,0xfa2e067d .long 0x3e5f9f11,0xb6d92a7f,0xd6cb3b8e,0x9afe153a,0xddf800bd,0x4d1a6dd7,0xcaf17e19,0xf6c13cc0,0x325fc3ee,0x15f6c58e,0xa31dc3b2,0x71095400,0xafa3d3e7,0x168e7c07,0x94c7ae2d,0x3f8417a1 .long 0x813b230d,0xec234772,0x17344427,0x634d0f5f,0xd77fc56a,0x11548ab1,0xce06af77,0x7fab1750,0x4f7c4f83,0xb62c10a7,0x220a67d9,0xa7d2edc4,0x921209a0,0x1c404170,0xface59f0,0x0b9815a0 .long 0x319540c3,0x2842589b,0xa283d6f8,0x18490f59,0xdaae9fcb,0xa2731f84,0xc3683ba0,0x3db6d960,0x14611069,0xc85c63bb,0x0788bf05,0xb19436af,0x347460d2,0x905459df,0xe11a7db1,0x73f6e094 .long 0xb6357f37,0xdc7f938e,0x2bd8aa62,0xc5d00f79,0x2ca979fc,0xc878dcb9,0xeb023a99,0x37e83ed9,0x1560bf3d,0x6b23e273,0x1d0fae61,0x1086e459,0x9a9414bd,0x78248316,0xf0ea9ea1,0x1b956bc0 .long 0xc31b9c38,0x7b85bb91,0x48ef57b5,0x0c5aa90b,0xaf3bab6f,0xdedeb169,0x2d373685,0xe610ad73,0x02ba8e15,0xf13870df,0x8ca7f771,0x0337edb6,0xb62c036c,0xe4acf747,0xb6b94e81,0xd921d576 .long 0x2c422f7a,0xdbc86439,0xed348898,0xfb635362,0xc45bfcd1,0x83084668,0x2b315e11,0xc357c9e3,0x5b2e5b8c,0xb173b540,0xe102b9a4,0x7e946931,0x7b0fb199,0x17c890eb,0xd61b662b,0xec225a83 .long 0xee3c76cb,0xf306a3c8,0xd32a1f6e,0x3cf11623,0x6863e956,0xe6d5ab64,0x5c005c26,0x3b8a4cbe,0x9ce6bb27,0xdcd529a5,0x04d4b16f,0xc4afaa52,0x7923798d,0xb0624a26,0x6b307fab,0x85e56df6 .long 0x2bf29698,0x0281893c,0xd7ce7603,0x91fc19a4,0xad9a558f,0x75a5dca3,0x4d50bf77,0x40ceb3fa,0xbc9ba369,0x1baf6060,0x597888c2,0x927e1037,0x86a34c07,0xd936bf19,0xc34ae980,0xd4cf10c1 .long 0x859dd614,0x3a3e5334,0x18d0c8ee,0x9c475b5b,0x07cd51d5,0x63080d1f,0xb88b4326,0xc9c0d0a6,0xc234296f,0x1ac98691,0x94887fb6,0x2a0a83a4,0x0cea9cf2,0x56511427,0xa24802f5,0x5230a6e8 .long 0x72e3d5c1,0xf7a2bf0f,0x4f21439e,0x37717446,0x9ce30334,0xfedcbf25,0x7ce202f9,0xe0030a78,0x1202e9ca,0x6f2d9ebf,0x75e6e591,0xe79dde6c,0xf1dac4f8,0xf52072af,0xbb9b404d,0x6c8d087e .long 0xbce913af,0xad0fc73d,0x458a07cb,0x909e587b,0xd4f00c8a,0x1300da84,0xb54466ac,0x425cd048,0x90e9d8bf,0xb59cb9be,0x3e431b0e,0x991616db,0x531aecff,0xd3aa117a,0x59f4dc3b,0x91af92d3 .long 0xe93fda29,0x9b1ec292,0xe97d91bc,0x76bb6c17,0xaface1e6,0x7509d95f,0xbe855ae3,0x3653fe47,0x0f680e75,0x73180b28,0xeeb6c26c,0x75eefd1b,0xb66d4236,0xa4cdf29f,0x6b5821d8,0x2d70a997 .long 0x20445c36,0x7a3ee207,0x59877174,0x71d1ac82,0x949f73e9,0x0fc539f7,0x982e3081,0xd05cf3d7,0x7b1c7129,0x8758e20b,0x569e61f2,0xffadcc20,0x59544c2d,0xb05d3a2f,0x9fff5e53,0xbe16f5c1 .long 0xaad58135,0x73cf65b8,0x037aa5be,0x622c2119,0x646fd6a0,0x79373b3f,0x0d3978cf,0x0e029db5,0x94fba037,0x8bdfc437,0x620797a6,0xaefbd687,0xbd30d38e,0x3fa5382b,0x585d7464,0x7627cfbf .long 0x4e4ca463,0xb2330fef,0x3566cc63,0xbcef7287,0xcf780900,0xd161d2ca,0x5b54827d,0x135dc539,0x27bf1bc6,0x638f052e,0x07dfa06c,0x10a224f0,0x6d3321da,0xe973586d,0x26152c8f,0x8b0c5738 .long 0x34606074,0x07ef4f2a,0xa0f7047a,0x80fe7fe8,0xe1a0e306,0x3d1a8152,0x88da5222,0x32cf43d8,0x5f02ffe6,0xbf89a95f,0x806ad3ea,0x3d9eb9a4,0x79c8e55e,0x012c17bb,0x99c81dac,0xfdcd1a74 .long 
0xb9556098,0x7043178b,0x801c3886,0x4090a1df,0x9b67b912,0x759800ff,0x232620c8,0x3e5c0304,0x70dceeca,0x4b9d3c4b,0x181f648e,0xbb2d3c15,0x6e33345c,0xf981d837,0x0cf2297a,0xb626289b .long 0x8baebdcf,0x766ac659,0x75df01e5,0x1a28ae09,0x375876d8,0xb71283da,0x607b9800,0x4865a96d,0x237936b2,0x25dd1bcd,0x60417494,0x332f4f4b,0x370a2147,0xd0923d68,0xdc842203,0x497f5dfb .long 0x32be5e0f,0x9dc74cbd,0x17a01375,0x7475bcb7,0x50d872b1,0x438477c9,0xffe1d63d,0xcec67879,0xd8578c70,0x9b006014,0x78bb6b8b,0xc9ad99a8,0x11fb3806,0x6799008e,0xcd44cab3,0xcfe81435 .long 0x2f4fb344,0xa2ee1582,0x483fa6eb,0xb8823450,0x652c7749,0x622d323d,0xbeb0a15b,0xd8474a98,0x5d1c00d0,0xe43c154d,0x0e3e7aac,0x7fd581d9,0x2525ddf8,0x2b44c619,0xb8ae9739,0x67a033eb .long 0x9ef2d2e4,0x113ffec1,0xd5a0ea7f,0x1bf6767e,0x03714c0a,0x57fff75e,0x0a23e9ee,0xa23c422e,0x540f83af,0xdd5f6b2d,0x55ea46a7,0xc2c2c27e,0x672a1208,0xeb6b4246,0xae634f7a,0xd13599f7 .long 0xd7b32c6e,0xcf914b5c,0xeaf61814,0x61a5a640,0x208a1bbb,0x8dc3df8b,0xb6d79aa5,0xef627fd6,0xc4c86bc8,0x44232ffc,0x061539fe,0xe6f9231b,0x958b9533,0x1d04f25a,0x49e8c885,0x180cf934 .long 0x9884aaf7,0x89689595,0x07b348a6,0xb1959be3,0x3c147c87,0x96250e57,0xdd0c61f8,0xae0efb3a,0xca8c325e,0xed00745e,0xecff3f70,0x3c911696,0x319ad41d,0x73acbc65,0xf0b1c7ef,0x7b01a020 .long 0x63a1483f,0xea32b293,0x7a248f96,0x89eabe71,0x343157e5,0x9c6231d3,0xdf3c546d,0x93a375e5,0x6a2afe69,0xe76e9343,0xe166c88e,0xc4f89100,0x4f872093,0x248efd0d,0x8fe0ea61,0xae0eb3ea .long 0x9d79046e,0xaf89790d,0x6cee0976,0x4d650f2d,0x43071eca,0xa3935d9a,0x283b0bfe,0x66fcd2c9,0x696605f1,0x0e665eb5,0xa54cd38d,0xe77e5d07,0x43d950cf,0x90ee050a,0xd32e69b5,0x86ddebda .long 0xfddf7415,0x6ad94a3d,0x3f6e8d5a,0xf7fa1309,0xe9957f75,0xc4831d1d,0xd5817447,0x7de28501,0x9e2aeb6b,0x6f1d7078,0xf67a53c2,0xba2b9ff4,0xdf9defc3,0x36963767,0x0d38022c,0x479deed3 .long 0x3a8631e8,0xd2edb89b,0x7a213746,0x8de855de,0xb00c5f11,0xb2056cb7,0x2c9b85e4,0xdeaefbd0,0xd150892d,0x03f39a8d,0x218b7985,0x37b84686,0xb7375f1a,0x36296dd8,0xb78e898e,0x472cd4b1 .long 0xe9f05de9,0x15dff651,0x2ce98ba9,0xd4045069,0x9b38024c,0x8466a7ae,0xe5a6b5ef,0xb910e700,0xb3aa8f0d,0xae1c56ea,0x7eee74a6,0xbab2a507,0x4b4c4620,0x0dca11e2,0x4c47d1f4,0xfd896e2e .long 0x308fbd93,0xeb45ae53,0x02c36fda,0x46cd5a2e,0xbaa48385,0x6a3d4e90,0x9dbe9960,0xdd55e62e,0x2a81ede7,0xa1406aa0,0xf9274ea7,0x6860dd14,0x80414f86,0xcfdcb0c2,0x22f94327,0xff410b10 .long 0x49ad467b,0x5a33cc38,0x0a7335f1,0xefb48b6c,0xb153a360,0x14fb54a4,0xb52469cc,0x604aa9d2,0x754e48e9,0x5e9dc486,0x37471e8e,0x693cb455,0x8d3b37b6,0xfb2fd7cd,0xcf09ff07,0x63345e16 .long 0x23a5d896,0x9910ba6b,0x7fe4364e,0x1fe19e35,0x9a33c677,0x6e1da8c3,0x29fd9fd0,0x15b4488b,0x1a1f22bf,0x1f439254,0xab8163e8,0x920a8a70,0x07e5658e,0x3fd1b249,0xb6ec839b,0xf2c4f79c .long 0x4aa38d1b,0x1abbc3d0,0xb5d9510e,0x3b0db35c,0x3e60dec0,0x1754ac78,0xea099b33,0x53272fd7,0x07a8e107,0x5fb0494f,0x6a8191fa,0x4a89e137,0x3c4ad544,0xa113b7f6,0x6cb9897b,0x88a2e909 .long 0xb44a3f84,0x17d55de3,0x17c6c690,0xacb2f344,0x10232390,0x32088168,0x6c733bf7,0xf2e8a61f,0x9c2d7652,0xa774aab6,0xed95c5bc,0xfb5307e3,0x4981f110,0xa05c73c2,0xa39458c9,0x1baae31c .long 0xcbea62e7,0x1def185b,0xeaf63059,0xe8ac9eae,0x9921851c,0x098a8cfd,0x3abe2f5b,0xd959c3f1,0x20e40ae5,0xa4f19525,0x07a24aa1,0x320789e3,0x7392b2bc,0x259e6927,0x1918668b,0x58f6c667 .long 0xc55d2d8b,0xce1db2bb,0xf4f6ca56,0x41d58bb7,0x8f877614,0x7650b680,0xf4c349ed,0x905e16ba,0xf661acac,0xed415140,0xcb2270af,0x3b8784f0,0x8a402cba,0x3bc280ac,0x0937921a,0xd53f7146 .long 
0xe5681e83,0xc03c8ee5,0xf6ac9e4a,0x62126105,0x936b1a38,0x9503a53f,0x782fecbd,0x3d45e2d4,0x76e8ae98,0x69a5c439,0xbfb4b00e,0xb53b2eeb,0x72386c89,0xf1674712,0x4268bce4,0x30ca34a2 .long 0x78341730,0x7f1ed86c,0xb525e248,0x8ef5beb8,0xb74fbf38,0xbbc489fd,0x91a0b382,0x38a92a0e,0x22433ccf,0x7a77ba3f,0xa29f05a9,0xde8362d6,0x61189afc,0x7f6a30ea,0x59ef114f,0x693b5505 .long 0xcd1797a1,0x50266bc0,0xf4b7af2d,0xea17b47e,0x3df9483e,0xd6c4025c,0xa37b18c9,0x8cbb9d9f,0x4d8424cf,0x91cbfd9c,0xab1c3506,0xdb7048f1,0x028206a3,0x9eaf641f,0x25bdf6ce,0xf986f3f9 .long 0x224c08dc,0x262143b5,0x81b50c91,0x2bbb09b4,0xaca8c84f,0xc16ed709,0xb2850ca8,0xa6210d9d,0x09cb54d6,0x6d8df67a,0x500919a4,0x91eef6e0,0x0f132857,0x90f61381,0xf8d5028b,0x9acede47 .long 0x90b771c3,0x844d1b71,0xba6426be,0x563b71e4,0xbdb802ff,0x2efa2e83,0xab5b4a41,0x3410cbab,0x30da84dd,0x555b2d26,0xee1cc29a,0xd0711ae9,0x2f547792,0xcf3e8c60,0xdc678b35,0x03d7d5de .long 0xced806b8,0x071a2fa8,0x697f1478,0x222e6134,0xabfcdbbf,0xdc16fd5d,0x121b53b8,0x44912ebf,0x2496c27c,0xac943674,0x1ffc26b0,0x8ea3176c,0x13debf2c,0xb6e224ac,0xf372a832,0x524cc235 .long 0x9f6f1b18,0xd706e1d8,0x44cce35b,0x2552f005,0xa88e31fc,0x8c8326c2,0xf9552047,0xb5468b2c,0x3ff90f2b,0xce683e88,0x2f0a5423,0x77947bdf,0xed56e328,0xd0a1b28b,0xc20134ac,0xaee35253 .long 0x3567962f,0x7e98367d,0x8188bffb,0x379ed61f,0xfaf130a1,0x73bba348,0x904ed734,0x6c1f75e1,0x3b4a79fc,0x18956642,0x54ef4493,0xf20bc83d,0x9111eca1,0x836d425d,0x009a8dcf,0xe5b5c318 .long 0x13221bc5,0x3360b25d,0x6b3eeaf7,0x707baad2,0x743a95a1,0xd7279ed8,0x969e809f,0x7450a875,0xe5d0338f,0x32b6bd53,0x2b883bbc,0x1e77f7af,0x1063ecd0,0x90da12cc,0xc315be47,0xe2697b58 .long 0xda85d534,0x2771a5bd,0xff980eea,0x53e78c1f,0x900385e7,0xadf1cf84,0xc9387b62,0x7d3b14f6,0xcb8f2bd2,0x170e74b0,0x827fa993,0x2d50b486,0xf6f32bab,0xcdbe8c9a,0xc3b93ab8,0x55e906b0 .long 0x8fe280d1,0x747f22fc,0xb2e114ab,0xcd8e0de5,0xe10b68b0,0x5ab7dbeb,0xa480d4b2,0x9dc63a9c,0x4be1495f,0x78d4bc3b,0x9359122d,0x25eb3db8,0x0809cbdc,0x3f8ac05b,0xd37c702f,0xbf4187bb .long 0x1416a6a5,0x84cea069,0x43ef881c,0x8f860c79,0x38038a5d,0x41311f8a,0xfc612067,0xe78c2ec0,0x5ad73581,0x494d2e81,0x59604097,0xb4cc9e00,0xf3612cba,0xff558aec,0x9e36c39e,0x35beef7a .long 0xdbcf41b9,0x1845c7cf,0xaea997c0,0x5703662a,0xe402f6d8,0x8b925afe,0x4dd72162,0xd0a1b1ae,0x03c41c4b,0x9f47b375,0x0391d042,0xa023829b,0x503b8b0a,0x5f5045c3,0x98c010e5,0x123c2688 .long 0x36ba06ee,0x324ec0cc,0x3dd2cc0c,0xface3115,0xf333e91f,0xb364f3be,0x28e832b0,0xef8aff73,0x2d05841b,0x1e9bad04,0x356a21e2,0x42f0e3df,0x4add627e,0xa3270bcb,0xd322e711,0xb09a8158 .long 0x0fee104a,0x86e326a1,0x3703f65d,0xad7788f8,0x47bc4833,0x7e765430,0x2b9b893a,0x6cee582b,0xe8f55a7b,0x9cd2a167,0xd9e4190d,0xefbee3c6,0xd40c2e9d,0x33ee7185,0xa380b548,0x844cc9c5 .long 0x66926e04,0x323f8ecd,0x8110c1ba,0x0001e38f,0xfc6a7f07,0x8dbcac12,0x0cec0827,0xd65e1d58,0xbe76ca2d,0xd2cd4141,0xe892f33a,0x7895cf5c,0x367139d2,0x956d230d,0xd012c4c1,0xa91abd3e .long 0x87eb36bf,0x34fa4883,0x914b8fb4,0xc5f07102,0xadb9c95f,0x90f0e579,0x28888195,0xfe6ea8cb,0xedfa9284,0x7b9b5065,0x2b8c8d65,0x6c510bd2,0xcbe8aafd,0xd7b8ebef,0x96b1da07,0xedb3af98 .long 0x6295d426,0x28ff779d,0x3fa3ad7b,0x0c4f6ac7,0x8b8e2604,0xec44d054,0x8b0050e1,0x9b32a66d,0xf0476ce2,0x1f943366,0xa602c7b4,0x7554d953,0x524f2809,0xbe35aca6,0xfd4edbea,0xb6881229 .long 0x508efb63,0xe8cd0c8f,0x6abcefc7,0x9eb5b5c8,0xb441ab4f,0xf5621f5f,0xb76a2b22,0x79e6c046,0xe37a1f69,0x74a4792c,0x03542b60,0xcbd252cb,0xb3c20bd3,0x785f65d5,0x4fabc60c,0x8dea6143 .long 
0xde673629,0x45e21446,0x703c2d21,0x57f7aa1e,0x98c868c7,0xa0e99b7f,0x8b641676,0x4e42f66d,0x91077896,0x602884dc,0xc2c9885b,0xa0d690cf,0x3b9a5187,0xfeb4da33,0x153c87ee,0x5f789598 .long 0x52b16dba,0x2192dd47,0x3524c1b1,0xdeefc0e6,0xe4383693,0x465ea76e,0x361b8d98,0x79401711,0xf21a15cb,0xa5f9ace9,0xefee9aeb,0x73d26163,0xe677016c,0xcca844b3,0x57eaee06,0x6c122b07 .long 0x15f09690,0xb782dce7,0x2dfc0fc9,0x508b9b12,0x65d89fc6,0x9015ab4b,0xd6d5bb0f,0x5e79dab7,0x6c775aa2,0x64f021f0,0x37c7eca1,0xdf09d8cc,0xef2fa506,0x9a761367,0x5b81eec6,0xed4ca476 .long 0x10bbb8b5,0x262ede36,0x0641ada3,0x0737ce83,0xe9831ccc,0x4c94288a,0x8065e635,0x487fc1ce,0xb8bb3659,0xb13d7ab3,0x855e4120,0xdea5df3e,0x85eb0244,0xb9a18573,0xa7cfe0a3,0x1a1b8ea3 .long 0x67b0867c,0x3b837119,0x9d364520,0x8d5e0d08,0xd930f0e3,0x52dccc1e,0xbf20bbaf,0xefbbcec7,0x0263ad10,0x99cffcab,0xfcd18f8a,0xd8199e6d,0xe9f10617,0x64e2773f,0x08704848,0x0079e8e1 .long 0x8a342283,0x1169989f,0xa83012e6,0x8097799c,0x8a6a9001,0xece966cb,0x072ac7fc,0x93b3afef,0x2db3d5ba,0xe6893a2a,0x89bf4fdc,0x263dc462,0xe0396673,0x8852dfc9,0x3af362b6,0x7ac70895 .long 0x5c2f342b,0xbb9cce4d,0xb52d7aae,0xbf80907a,0x2161bcd0,0x97f3d3cd,0x0962744d,0xb25b0834,0x6c3a1dda,0xc5b18ea5,0x06c92317,0xfe4ec7eb,0xad1c4afe,0xb787b890,0x0ede801a,0xdccd9a92 .long 0xdb58da1f,0x9ac6ddda,0xb8cae6ee,0x22bbc12f,0x815c4a43,0xc6f8bced,0xf96480c7,0x8105a92c,0x7a859d51,0x0dc3dbf3,0x3041196b,0xe3ec7ce6,0x0d1067c9,0xd9f64b25,0x3d1f8dd8,0xf2321321 .long 0x76497ee8,0x8b5c619c,0xc717370e,0x5d2b0ac6,0x4fcf68e1,0x98204cb6,0x62bc6792,0x0bdec211,0xa63b1011,0x6973ccef,0xe0de1ac5,0xf9e3fa97,0x3d0e0c8b,0x5efb693e,0xd2d4fcb4,0x037248e9 .long 0x1ec34f9e,0x80802dc9,0x33810603,0xd8772d35,0x530cb4f3,0x3f06d66c,0xc475c129,0x7be5ed0d,0x31e82b10,0xcb9e3c19,0xc9ff6b4c,0xc63d2857,0x92a1b45e,0xb92118c6,0x7285bbca,0x0aec4414 .long 0x1e29a3ef,0xfc189ae7,0x4c93302e,0xcbe906f0,0xceaae10e,0xd0107914,0xb68e19f8,0xb7a23f34,0xefd2119d,0xe9d875c2,0xfcadc9c8,0x03198c6e,0x4da17113,0x65591bf6,0x3d443038,0x3cf0bbf8 .long 0x2b724759,0xae485bb7,0xb2d4c63a,0x945353e1,0xde7d6f2c,0x82159d07,0x4ec5b109,0x389caef3,0xdb65ef14,0x4a8ebb53,0xdd99de43,0x2dc2cb7e,0x83f2405f,0x816fa3ed,0xc14208a3,0x73429bb9 .long 0xb01e6e27,0xb618d590,0xe180b2dc,0x047e2ccd,0x04aea4a9,0xd1b299b5,0x9fa403a4,0x412c9e1e,0x79407552,0x88d28a36,0xf332b8e3,0x49c50136,0xe668de19,0x3a1b6fcc,0x75122b97,0x178851bc .long 0xfb85fa4c,0xb1e13752,0x383c8ce9,0xd61257ce,0xd2f74dae,0xd43da670,0xbf846bbb,0xa35aa23f,0x4421fc83,0x5e74235d,0xc363473b,0xf6df8ee0,0x3c4aa158,0x34d7f52a,0x9bc6d22e,0x50d05aab .long 0xa64785f4,0x8c56e735,0x5f29cd07,0xbc56637b,0x3ee35067,0x53b2bb80,0xdc919270,0x50235a0f,0xf2c4aa65,0x191ab6d8,0x8396023b,0xc3475831,0xf0f805ba,0x80400ba5,0x5ec0f80f,0x8881065b .long 0xcc1b5e83,0xc370e522,0x860b8bfb,0xde2d4ad1,0x67b256df,0xad364df0,0xe0138997,0x8f12502e,0x7783920a,0x503fa0dc,0xc0bc866a,0xe80014ad,0xd3064ba6,0x3f89b744,0xcba5dba5,0x03511dcd .long 0x95a7b1a2,0x197dd46d,0x3c6341fb,0x9c4e7ad6,0x484c2ece,0x426eca29,0xde7f4f8a,0x9211e489,0xc78ef1f4,0x14997f6e,0x06574586,0x2b2c0910,0x1c3eede8,0x17286a6e,0x0f60e018,0x25f92e47 .long 0x31890a36,0x805c5646,0x57feea5b,0x703ef600,0xaf3c3030,0x389f747c,0x54dd3739,0xe0e5daeb,0xc9c9f155,0xfe24a4c3,0xb5393962,0x7e4bf176,0xaf20bf29,0x37183de2,0xf95a8c3b,0x4a1bd7b5 .long 0x46191d3d,0xa83b9699,0x7b87f257,0x281fc8dd,0x54107588,0xb18e2c13,0x9b2bafe8,0x6372def7,0x0d8972ca,0xdaf4bb48,0x56167a3f,0x3f2dd4b7,0x84310cf4,0x1eace32d,0xe42700aa,0xe3bcefaf .long 
0xd785e73d,0x5fe5691e,0x2ea60467,0xa5db5ab6,0xdfc6514a,0x02e23d41,0xe03c3665,0x35e8048e,0x1adaa0f8,0x3f8b118f,0x84ce1a5a,0x28ec3b45,0x2c6646b8,0xe8cacc6e,0xdbd0e40f,0x1343d185 .long 0xcaaa358c,0xe5d7f844,0x9924182a,0x1a1db7e4,0x9c875d9a,0xd64cd42d,0x042eeec8,0xb37b515f,0x7b165fbe,0x4d4dd409,0xe206eff3,0xfc322ed9,0x59b7e17e,0x7dee4102,0x8236ca00,0x55a481c0 .long 0xc23fc975,0x8c885312,0x05d6297b,0x15715806,0xf78edd39,0xa078868e,0x03c45e52,0x956b31e0,0xff7b33a6,0x470275d5,0x0c7e673f,0xc8d5dc3a,0x7e2f2598,0x419227b4,0x4c14a975,0x8b37b634 .long 0x8b11888c,0xd0667ed6,0x803e25dc,0x5e0e8c3e,0xb987a24a,0x34e5d0dc,0xae920323,0x9f40ac3b,0x34e0f63a,0x5463de95,0x6b6328f9,0xa128bf92,0xda64f1b7,0x491ccd7c,0xc47bde35,0x7ef1ec27 .long 0xa36a2737,0xa857240f,0x63621bc1,0x35dc1366,0xd4fb6897,0x7a3a6453,0xc929319d,0x80f1a439,0xf8cb0ba0,0xfc18274b,0x8078c5eb,0xb0b53766,0x1e01d0ef,0xfb0d4924,0x372ab09c,0x50d7c67d .long 0x3aeac968,0xb4e370af,0xc4b63266,0xe4f7fee9,0xe3ac5664,0xb4acd4c2,0xceb38cbf,0xf8910bd2,0xc9c0726e,0x1c3ae50c,0xd97b40bf,0x15309569,0xfd5a5a1b,0x70884b7f,0xef8314cd,0x3890896a .long 0xa5618c93,0x58e1515c,0x77d942d1,0xe665432b,0xb6f767a8,0xb32181bf,0x3a604110,0x753794e8,0xe8c0dbcc,0x09afeb7c,0x598673a3,0x31e02613,0x7d46db00,0x5d98e557,0x9d985b28,0xfc21fb8c .long 0xb0843e0b,0xc9040116,0x69b04531,0x53b1b3a8,0x85d7d830,0xdd1649f0,0xcb7427e8,0xbb3bcc87,0xc93dce83,0x77261100,0xa1922a2a,0x7e79da61,0xf3149ce8,0x587a2b02,0xde92ec83,0x147e1384 .long 0xaf077f30,0x484c83d3,0x0658b53a,0xea78f844,0x027aec53,0x912076c2,0x93c8177d,0xf34714e3,0xc2376c84,0x37ef5d15,0x3d1aa783,0x8315b659,0xef852a90,0x3a75c484,0x16086bd4,0x0ba0c58a .long 0x529a6d48,0x29688d7a,0xc2f19203,0x9c7f250d,0x682e2df9,0x123042fb,0xad8121bc,0x2b7587e7,0xe0182a65,0x30fc0233,0xe3e1128a,0xb82ecf87,0x93fb098f,0x71682861,0x85e9e6a7,0x043e21ae .long 0x66c834ea,0xab5b49d6,0x47414287,0x3be43e18,0x219a2a47,0xf40fb859,0xcc58df3c,0x0e6559e9,0x0c6615b4,0xfe1dfe8e,0x56459d70,0x14abc8fd,0x05de0386,0x7be0fa8e,0xe9035c7c,0x8e63ef68 .long 0x53b31e91,0x116401b4,0x4436b4d8,0x0cba7ad4,0x107afd66,0x9151f9a0,0x1f0ee4c4,0xafaca8d0,0x9ee9761c,0x75fe5c1d,0xf0c0588f,0x3497a16b,0x0304804c,0x3ee2bebd,0xc2c990b9,0xa8fb9a60 .long 0x39251114,0xd14d32fe,0xcac73366,0x36bf25bc,0xdba7495c,0xc9562c66,0x46ad348b,0x324d301b,0xd670407e,0x9f46620c,0xe3733a01,0x0ea8d4f1,0xb0c324e0,0xd396d532,0x03c317cd,0x5b211a0e .long 0x5ffe7b37,0x090d7d20,0x1747d2da,0x3b7f3efb,0xb54fc519,0xa2cb525f,0xf66a971e,0x6e220932,0xb486d440,0xddc160df,0x3fe13465,0x7fcfec46,0x76e4c151,0x83da7e4e,0xd8d302b5,0xd6fa48a1 .long 0x5872cd88,0xc6304f26,0x278b90a1,0x806c1d3c,0xcaf0bc1c,0x3553e725,0xbb9d8d5c,0xff59e603,0x7a0b85dd,0xa4550f32,0x93ecc217,0xdec5720a,0x69d62213,0x0b88b741,0x5b365955,0x7212f245 .long 0xb5cae787,0x20764111,0x1dfd3124,0x13cb7f58,0x1175aefb,0x2dca77da,0xffaae775,0xeb75466b,0xdb6cff32,0x74d76f3b,0x61fcda9a,0x7440f37a,0xb525028b,0x1bb3ac92,0xa1975f29,0x20fbf8f7 .long 0xdf83097f,0x982692e1,0x554b0800,0x28738f6c,0xa2ce2f2f,0xdc703717,0x40814194,0x7913b93c,0x1fe89636,0x04924593,0xf78834a6,0x7b98443f,0x5114a5a1,0x11c6ab01,0xffba5f4c,0x60deb383 .long 0x01a982e6,0x4caa54c6,0x3491cd26,0x1dd35e11,0x7cbd6b05,0x973c315f,0x52494724,0xcab00775,0x6565e15a,0x04659b1f,0x8c8fb026,0xbf30f529,0xa8a0de37,0xfc21641b,0xfa5e5114,0xe9c7a366 .long 0x52f03ad8,0xdb849ca5,0x024e35c0,0xc7e8dbe9,0xcfc3c789,0xa1a2bbac,0x9c26f262,0xbf733e7d,0xb8444823,0x882ffbf5,0x6bf8483b,0xb7224e88,0x65bef640,0x53023b8b,0xd4d5f8cd,0xaabfec91 .long 
0x079ea1bd,0xa40e1510,0xd05d5d26,0x1ad9addc,0x13e68d4f,0xdb3f2eab,0x640f803f,0x1cff1ae2,0xd4cee117,0xe0e7b749,0x4036d909,0x8e9f275b,0x8f4d4c38,0xce34e31d,0xd75130fc,0x22b37f69 .long 0xb4014604,0x83e0f1fd,0x89415078,0xa8ce9919,0x41792efe,0x82375b75,0x97d4515b,0x4f59bf5c,0x923a277d,0xac4f324f,0x650f3406,0xd9bc9b7d,0x8a39bc51,0xc6fa87d1,0x5ccc108f,0x82588530 .long 0x82e4c634,0x5ced3c9f,0x3a4464f8,0x8efb8314,0x7a1dca25,0xe706381b,0x5a2a412b,0x6cd15a3c,0xbfcd8fb5,0x9347a8fd,0x6e54cd22,0x31db2eef,0xf8d8932f,0xc4aeb11e,0x344411af,0x11e7c1ed .long 0xdc9a151e,0x2653050c,0x3bb0a859,0x9edbfc08,0xfd5691e7,0x926c81c7,0x6f39019a,0x9c1b2342,0x7f8474b9,0x64a81c8b,0x01761819,0x90657c07,0x55e0375a,0x390b3331,0xb6ebc47d,0xc676c626 .long 0xb7d6dee8,0x51623247,0x79659313,0x0948d927,0xe9ab35ed,0x99700161,0x8ddde408,0x06cc32b4,0x061ef338,0x6f2fd664,0xc202e9ed,0x1606fa02,0x929ba99b,0x55388bc1,0x1e81df69,0xc4428c5e .long 0xf91b0b2a,0xce2028ae,0xf03dfd3f,0xce870a23,0x0affe8ed,0x66ec2c87,0x284d0c00,0xb205fb46,0x44cefa48,0xbf5dffe7,0xa19876d7,0xb6fc37a8,0x08b72863,0xbecfa84c,0x2576374f,0xd7205ff5 .long 0x8887de41,0x80330d32,0x869ea534,0x5de0df0c,0x3c56ea17,0x13f42753,0x452b1a78,0xeb1f6069,0xe30ea15c,0x50474396,0xc1494125,0x575816a1,0xfe6bb38f,0xbe1ce55b,0x96ae30f7,0xb901a948 .long 0xd8fc3548,0xe5af0f08,0xd73bfd08,0x5010b5d0,0x53fe655a,0x993d2880,0x1c1309fd,0x99f2630b,0xb4e3b76f,0xd8677baf,0xb840784b,0x14e51ddc,0xbf0092ce,0x326c750c,0xf528320f,0xc83d306b .long 0x77d4715c,0xc4456715,0x6b703235,0xd30019f9,0xd669e986,0x207ccb2e,0xf6dbfc28,0x57c824af,0xd8f92a23,0xf0eb532f,0x9bb98fd2,0x4a557fd4,0xc1e6199a,0xa57acea7,0x8b94b1ed,0x0c663820 .long 0xf83a9266,0x9b42be8f,0x0101bd45,0xc7741c97,0x07bd9ceb,0x95770c11,0x8b2e0744,0x1f50250a,0x1477b654,0xf762eec8,0x15efe59a,0xc65b900e,0x9546a897,0x88c96148,0xc30b4d7c,0x7e8025b3 .long 0x12045cf9,0xae4065ef,0x9ccce8bd,0x6fcb2caf,0xf2cf6525,0x1fa0ba4e,0xcb72c312,0xf683125d,0xe312410e,0xa01da4ea,0x6cd8e830,0x67e28677,0x98fb3f07,0xabd95752,0xeef649a5,0x05f11e11 .long 0x9d3472c2,0xba47faef,0xc77d1345,0x3adff697,0xdd15afee,0x4761fa04,0xb9e69462,0x64f1f61a,0x9bfb9093,0xfa691fab,0xa1133dfe,0x3df8ae8f,0x58cc710d,0xcd5f8967,0x16c7fe79,0xfbb88d50 .long 0xe88c50d1,0x8e011b4c,0xa8771c4f,0x7532e807,0xe2278ee4,0x64c78a48,0x3845072a,0x0b283e83,0x49e69274,0x98a6f291,0x1868b21c,0xb96e9668,0xb1a8908e,0x38f0adc2,0x1feb829d,0x90afcff7 .long 0x210b0856,0x9915a383,0xdef04889,0xa5a80602,0x7c64d509,0x800e9af9,0xb8996f6f,0x81382d0b,0x81927e27,0x490eba53,0x4af50182,0x46c63b32,0xd3ad62ce,0x784c5fd9,0xf8ae8736,0xe4fa1870 .long 0xd7466b25,0x4ec9d0bc,0xdb235c65,0x84ddbe1a,0x163c1688,0x5e2645ee,0x00eba747,0x570bd00e,0x128bfa0f,0xfa51b629,0x6c1d3b68,0x92fce1bd,0xb66778b1,0x3e7361dc,0x5561d2bb,0x9c7d249d .long 0x0bbc6229,0xa40b28bf,0xdfd91497,0x1c83c05e,0xf083df05,0x5f9f5154,0xeee66c9d,0xbac38b3c,0xec0dfcfd,0xf71db7e3,0x8b0a8416,0xf2ecda8e,0x7812aa66,0x52fddd86,0x4e6f4272,0x2896ef10 .long 0x0fe9a745,0xff27186a,0x49ca70db,0x08249fcd,0x441cac49,0x7425a2e6,0xece5ff57,0xf4a0885a,0x7d7ead58,0x6e2cb731,0x1898d104,0xf96cf7d6,0x4f2c9a89,0xafe67c9d,0x1c7bf5bc,0x89895a50 .long 0x573cecfa,0xdc7cb8e5,0xd15f03e6,0x66497eae,0x3f084420,0x6bc0de69,0xacd532b0,0x323b9b36,0x0115a3c1,0xcfed390a,0x2d65ca0e,0x9414c40b,0x2f530c78,0x641406bd,0x833438f2,0x29369a44 .long 0x903fa271,0x996884f5,0xb9da921e,0xe6da0fd2,0x5db01e54,0xa6f2f269,0x6876214e,0x1ee3e9bd,0xe27a9497,0xa26e181c,0x8e215e04,0x36d254e4,0x252cabca,0x42f32a6c,0x80b57614,0x99481487 .long 
0x40d9cae1,0x4c4dfe69,0x11a10f09,0x05869580,0x3491b64b,0xca287b57,0x3fd4a53b,0x77862d5d,0x50349126,0xbf94856e,0x71c5268f,0x2be30bd1,0xcbb650a6,0x10393f19,0x778cf9fd,0x639531fe .long 0xb2935359,0x02556a11,0xaf8c126e,0xda38aa96,0x0960167f,0x47dbe6c2,0x501901cd,0x37bbabb6,0x2c947778,0xb6e979e0,0x7a1a1dc6,0xd69a5175,0x9d9faf0c,0xc3ed5095,0x1d5fa5f0,0x4dd9c096 .long 0x64f16ea8,0xa0c4304d,0x7e718623,0x8b1cac16,0x7c67f03e,0x0b576546,0xcbd88c01,0x559cf5ad,0x0e2af19a,0x074877bb,0xa1228c92,0x1f717ec1,0x326e8920,0x70bcb800,0x4f312804,0xec6e2c5c .long 0x3fca4752,0x426aea7d,0x2211f62a,0xf12c0949,0x7be7b6b5,0x24beecd8,0x36d7a27d,0xb77eaf4c,0xfda78fd3,0x154c2781,0x264eeabe,0x848a83b0,0x4ffe2bc4,0x81287ef0,0xb6b6fc2a,0x7b6d88c6 .long 0xce417d99,0x805fb947,0x8b916cc4,0x4b93dcc3,0x21273323,0x72e65bb3,0x6ea9886e,0xbcc1badd,0x4bc5ee85,0x0e223011,0xc18ee1e4,0xa561be74,0xa6bcf1f1,0x762fd2d4,0x95231489,0x50e6a5a4 .long 0xa00b500b,0xca96001f,0x5d7dcdf5,0x5c098cfc,0x8c446a85,0xa64e2d2e,0x971f3c62,0xbae9bcf1,0x8435a2c5,0x4ec22683,0x4bad4643,0x8ceaed6c,0xccccf4e3,0xe9f8fb47,0x1ce3b21e,0xbd4f3fa4 .long 0xa3db3292,0xd79fb110,0xb536c66a,0xe28a37da,0x8e49e6a9,0x279ce87b,0xfdcec8e3,0x70ccfe8d,0x3ba464b2,0x2193e4e0,0xaca9a398,0x0f39d60e,0xf82c12ab,0x7d7932af,0x91e7e0f7,0xd8ff50ed .long 0xfa28a7e0,0xea961058,0x0bf5ec74,0xc726cf25,0xdb229666,0xe74d55c8,0xa57f5799,0x0bd9abbf,0x4dfc47b3,0x7479ef07,0x0c52f91d,0xd9c65fc3,0x36a8bde2,0x8e0283fe,0x7d4b7280,0xa32a8b5e .long 0x12e83233,0x6a677c61,0xdcc9bf28,0x0fbb3512,0x0d780f61,0x562e8ea5,0x1dc4e89c,0x0db8b22b,0x89be0144,0x0a6fd1fb,0xca57113b,0x8c77d246,0xff09c91c,0x4639075d,0x5060824c,0x5b47b17f .long 0x16287b52,0x58aea2b0,0xd0cd8eb0,0xa1343520,0xc5d58573,0x6148b4d0,0x291c68ae,0xdd2b6170,0x1da3b3b7,0xa61b3929,0x08c4ac10,0x5f946d79,0x7217d583,0x4105d4a5,0x25e6de5e,0x5061da3d .long 0xec1b4991,0x3113940d,0x36f485ae,0xf12195e1,0x731a2ee0,0xa7507fb2,0x6e9e196e,0x95057a8e,0x2e130136,0xa3c2c911,0x33c60d15,0x97dfbb36,0xb300ee2b,0xcaf3c581,0xf4bac8b8,0x77f25d90 .long 0x6d840cd6,0xdb1c4f98,0xe634288c,0x471d62c0,0xcec8a161,0x8ec2f85e,0xfa6f4ae2,0x41f37cbc,0x4b709985,0x6793a20f,0xefa8985b,0x7a7bd33b,0x938e6446,0x2c6a3fbd,0x2a8d47c1,0x19042619 .long 0xcc36975f,0x16848667,0x9d5f1dfb,0x02acf168,0x613baa94,0x62d41ad4,0x9f684670,0xb56fbb92,0xe9e40569,0xce610d0d,0x35489fef,0x7b99c65f,0x3df18b97,0x0c88ad1b,0x5d0e9edb,0x81b7d9be .long 0xc716cc0a,0xd85218c0,0x85691c49,0xf4b5ff90,0xce356ac6,0xa4fd666b,0x4b327a7a,0x17c72895,0xda6be7de,0xf93d5085,0x3301d34e,0xff71530e,0xd8f448e8,0x4cd96442,0x2ed18ffa,0x9283d331 .long 0x2a849870,0x4d33dd99,0x41576335,0xa716964b,0x179be0e5,0xff5e3a9b,0x83b13632,0x5b9d6b1b,0xa52f313b,0x3b8bd7d4,0x637a4660,0xc9dd95a0,0x0b3e218f,0x30035962,0xc7b28a3c,0xce1481a3 .long 0x43228d83,0xab41b43a,0x4ad63f99,0x24ae1c30,0x46a51229,0x8e525f1a,0xcd26d2b4,0x14af860f,0x3f714aa1,0xd6baef61,0xeb78795e,0xf51865ad,0xe6a9d694,0xd3e21fce,0x8a37b527,0x82ceb1dd .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed .text .align 64 .Lpoly: .quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 .LRR: .quad 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, 0x00000004fffffffd .LOne: .long 1,1,1,1,1,1,1,1 .LTwo: .long 2,2,2,2,2,2,2,2 .LThree: .long 3,3,3,3,3,3,3,3 .LONE_mont: .quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe .Lord: .quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 .LordK: .quad 0xccd1c8aaee00bc4f .globl ecp_nistz256_mul_by_2 .type ecp_nistz256_mul_by_2,@function .align 64 
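# Constant pool above: .Lpoly holds the NIST P-256 prime p, .LRR holds R^2 mod p
# (loaded by ecp_nistz256_to_mont before falling through to .Lmul_mont),
# .LONE_mont is 1 in Montgomery form, .Lord is the group order n, and .LordK
# appears to be the per-limb Montgomery constant for reduction mod n, judging
# by its use in the ord_mul/ord_sqr routines below.
# ecp_nistz256_mul_by_2(r, a): doubles a mod p -- add-with-carry into r8..r11,
# subtract p, then branchlessly (cmovc) keep the pre-subtraction value on borrow.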
ecp_nistz256_mul_by_2: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-16 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-24 .Lmul_by_2_body: movq 0(%rsi),%r8 xorq %r13,%r13 movq 8(%rsi),%r9 addq %r8,%r8 movq 16(%rsi),%r10 adcq %r9,%r9 movq 24(%rsi),%r11 leaq .Lpoly(%rip),%rsi movq %r8,%rax adcq %r10,%r10 adcq %r11,%r11 movq %r9,%rdx adcq $0,%r13 subq 0(%rsi),%r8 movq %r10,%rcx sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 movq %r11,%r12 sbbq 24(%rsi),%r11 sbbq $0,%r13 cmovcq %rax,%r8 cmovcq %rdx,%r9 movq %r8,0(%rdi) cmovcq %rcx,%r10 movq %r9,8(%rdi) cmovcq %r12,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 0(%rsp),%r13 .cfi_restore %r13 movq 8(%rsp),%r12 .cfi_restore %r12 leaq 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lmul_by_2_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 .globl ecp_nistz256_div_by_2 .type ecp_nistz256_div_by_2,@function .align 32 ecp_nistz256_div_by_2: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-16 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-24 .Ldiv_by_2_body: movq 0(%rsi),%r8 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq %r8,%rax movq 24(%rsi),%r11 leaq .Lpoly(%rip),%rsi movq %r9,%rdx xorq %r13,%r13 addq 0(%rsi),%r8 movq %r10,%rcx adcq 8(%rsi),%r9 adcq 16(%rsi),%r10 movq %r11,%r12 adcq 24(%rsi),%r11 adcq $0,%r13 xorq %rsi,%rsi testq $1,%rax cmovzq %rax,%r8 cmovzq %rdx,%r9 cmovzq %rcx,%r10 cmovzq %r12,%r11 cmovzq %rsi,%r13 movq %r9,%rax shrq $1,%r8 shlq $63,%rax movq %r10,%rdx shrq $1,%r9 orq %rax,%r8 shlq $63,%rdx movq %r11,%rcx shrq $1,%r10 orq %rdx,%r9 shlq $63,%rcx shrq $1,%r11 shlq $63,%r13 orq %rcx,%r10 orq %r13,%r11 movq %r8,0(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq 0(%rsp),%r13 .cfi_restore %r13 movq 8(%rsp),%r12 .cfi_restore %r12 leaq 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Ldiv_by_2_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 .globl ecp_nistz256_mul_by_3 .type ecp_nistz256_mul_by_3,@function .align 32 ecp_nistz256_mul_by_3: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-16 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-24 .Lmul_by_3_body: movq 0(%rsi),%r8 xorq %r13,%r13 movq 8(%rsi),%r9 addq %r8,%r8 movq 16(%rsi),%r10 adcq %r9,%r9 movq 24(%rsi),%r11 movq %r8,%rax adcq %r10,%r10 adcq %r11,%r11 movq %r9,%rdx adcq $0,%r13 subq $-1,%r8 movq %r10,%rcx sbbq .Lpoly+8(%rip),%r9 sbbq $0,%r10 movq %r11,%r12 sbbq .Lpoly+24(%rip),%r11 sbbq $0,%r13 cmovcq %rax,%r8 cmovcq %rdx,%r9 cmovcq %rcx,%r10 cmovcq %r12,%r11 xorq %r13,%r13 addq 0(%rsi),%r8 adcq 8(%rsi),%r9 movq %r8,%rax adcq 16(%rsi),%r10 adcq 24(%rsi),%r11 movq %r9,%rdx adcq $0,%r13 subq $-1,%r8 movq %r10,%rcx sbbq .Lpoly+8(%rip),%r9 sbbq $0,%r10 movq %r11,%r12 sbbq .Lpoly+24(%rip),%r11 sbbq $0,%r13 cmovcq %rax,%r8 cmovcq %rdx,%r9 movq %r8,0(%rdi) cmovcq %rcx,%r10 movq %r9,8(%rdi) cmovcq %r12,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 0(%rsp),%r13 .cfi_restore %r13 movq 8(%rsp),%r12 .cfi_restore %r12 leaq 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lmul_by_3_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 .globl ecp_nistz256_add .type ecp_nistz256_add,@function .align 32 ecp_nistz256_add: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-16 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-24 .Ladd_body: movq 0(%rsi),%r8 xorq %r13,%r13 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 leaq .Lpoly(%rip),%rsi addq 0(%rdx),%r8 adcq 8(%rdx),%r9 movq 
%r8,%rax adcq 16(%rdx),%r10 adcq 24(%rdx),%r11 movq %r9,%rdx adcq $0,%r13 subq 0(%rsi),%r8 movq %r10,%rcx sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 movq %r11,%r12 sbbq 24(%rsi),%r11 sbbq $0,%r13 cmovcq %rax,%r8 cmovcq %rdx,%r9 movq %r8,0(%rdi) cmovcq %rcx,%r10 movq %r9,8(%rdi) cmovcq %r12,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 0(%rsp),%r13 .cfi_restore %r13 movq 8(%rsp),%r12 .cfi_restore %r12 leaq 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Ladd_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_add,.-ecp_nistz256_add .globl ecp_nistz256_sub .type ecp_nistz256_sub,@function .align 32 ecp_nistz256_sub: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-16 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-24 .Lsub_body: movq 0(%rsi),%r8 xorq %r13,%r13 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 leaq .Lpoly(%rip),%rsi subq 0(%rdx),%r8 sbbq 8(%rdx),%r9 movq %r8,%rax sbbq 16(%rdx),%r10 sbbq 24(%rdx),%r11 movq %r9,%rdx sbbq $0,%r13 addq 0(%rsi),%r8 movq %r10,%rcx adcq 8(%rsi),%r9 adcq 16(%rsi),%r10 movq %r11,%r12 adcq 24(%rsi),%r11 testq %r13,%r13 cmovzq %rax,%r8 cmovzq %rdx,%r9 movq %r8,0(%rdi) cmovzq %rcx,%r10 movq %r9,8(%rdi) cmovzq %r12,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 0(%rsp),%r13 .cfi_restore %r13 movq 8(%rsp),%r12 .cfi_restore %r12 leaq 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lsub_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_sub,.-ecp_nistz256_sub .globl ecp_nistz256_neg .type ecp_nistz256_neg,@function .align 32 ecp_nistz256_neg: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-16 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-24 .Lneg_body: xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 xorq %r11,%r11 xorq %r13,%r13 subq 0(%rsi),%r8 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 movq %r8,%rax sbbq 24(%rsi),%r11 leaq .Lpoly(%rip),%rsi movq %r9,%rdx sbbq $0,%r13 addq 0(%rsi),%r8 movq %r10,%rcx adcq 8(%rsi),%r9 adcq 16(%rsi),%r10 movq %r11,%r12 adcq 24(%rsi),%r11 testq %r13,%r13 cmovzq %rax,%r8 cmovzq %rdx,%r9 movq %r8,0(%rdi) cmovzq %rcx,%r10 movq %r9,8(%rdi) cmovzq %r12,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 0(%rsp),%r13 .cfi_restore %r13 movq 8(%rsp),%r12 .cfi_restore %r12 leaq 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lneg_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_neg,.-ecp_nistz256_neg .globl ecp_nistz256_ord_mul_mont .type ecp_nistz256_ord_mul_mont,@function .align 32 ecp_nistz256_ord_mul_mont: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je .Lecp_nistz256_ord_mul_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lord_mul_body: movq 0(%rdx),%rax movq %rdx,%rbx leaq .Lord(%rip),%r14 movq .LordK(%rip),%r15 movq %rax,%rcx mulq 0(%rsi) movq %rax,%r8 movq %rcx,%rax movq %rdx,%r9 mulq 8(%rsi) addq %rax,%r9 movq %rcx,%rax adcq $0,%rdx movq %rdx,%r10 mulq 16(%rsi) addq %rax,%r10 movq %rcx,%rax adcq $0,%rdx movq %r8,%r13 imulq %r15,%r8 movq %rdx,%r11 mulq 24(%rsi) addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %rdx,%r12 mulq 0(%r14) movq %r8,%rbp addq %rax,%r13 movq %r8,%rax adcq $0,%rdx movq %rdx,%rcx subq %r8,%r10 sbbq $0,%r8 mulq 8(%r14) addq %rcx,%r9 adcq $0,%rdx addq %rax,%r9 movq %rbp,%rax adcq %rdx,%r10 movq %rbp,%rdx adcq $0,%r8 shlq 
$32,%rax shrq $32,%rdx subq %rax,%r11 movq 8(%rbx),%rax sbbq %rdx,%rbp addq %r8,%r11 adcq %rbp,%r12 adcq $0,%r13 movq %rax,%rcx mulq 0(%rsi) addq %rax,%r9 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rbp,%r10 adcq $0,%rdx addq %rax,%r10 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rbp,%r11 adcq $0,%rdx addq %rax,%r11 movq %rcx,%rax adcq $0,%rdx movq %r9,%rcx imulq %r15,%r9 movq %rdx,%rbp mulq 24(%rsi) addq %rbp,%r12 adcq $0,%rdx xorq %r8,%r8 addq %rax,%r12 movq %r9,%rax adcq %rdx,%r13 adcq $0,%r8 mulq 0(%r14) movq %r9,%rbp addq %rax,%rcx movq %r9,%rax adcq %rdx,%rcx subq %r9,%r11 sbbq $0,%r9 mulq 8(%r14) addq %rcx,%r10 adcq $0,%rdx addq %rax,%r10 movq %rbp,%rax adcq %rdx,%r11 movq %rbp,%rdx adcq $0,%r9 shlq $32,%rax shrq $32,%rdx subq %rax,%r12 movq 16(%rbx),%rax sbbq %rdx,%rbp addq %r9,%r12 adcq %rbp,%r13 adcq $0,%r8 movq %rax,%rcx mulq 0(%rsi) addq %rax,%r10 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rbp,%r11 adcq $0,%rdx addq %rax,%r11 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rbp,%r12 adcq $0,%rdx addq %rax,%r12 movq %rcx,%rax adcq $0,%rdx movq %r10,%rcx imulq %r15,%r10 movq %rdx,%rbp mulq 24(%rsi) addq %rbp,%r13 adcq $0,%rdx xorq %r9,%r9 addq %rax,%r13 movq %r10,%rax adcq %rdx,%r8 adcq $0,%r9 mulq 0(%r14) movq %r10,%rbp addq %rax,%rcx movq %r10,%rax adcq %rdx,%rcx subq %r10,%r12 sbbq $0,%r10 mulq 8(%r14) addq %rcx,%r11 adcq $0,%rdx addq %rax,%r11 movq %rbp,%rax adcq %rdx,%r12 movq %rbp,%rdx adcq $0,%r10 shlq $32,%rax shrq $32,%rdx subq %rax,%r13 movq 24(%rbx),%rax sbbq %rdx,%rbp addq %r10,%r13 adcq %rbp,%r8 adcq $0,%r9 movq %rax,%rcx mulq 0(%rsi) addq %rax,%r11 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 8(%rsi) addq %rbp,%r12 adcq $0,%rdx addq %rax,%r12 movq %rcx,%rax adcq $0,%rdx movq %rdx,%rbp mulq 16(%rsi) addq %rbp,%r13 adcq $0,%rdx addq %rax,%r13 movq %rcx,%rax adcq $0,%rdx movq %r11,%rcx imulq %r15,%r11 movq %rdx,%rbp mulq 24(%rsi) addq %rbp,%r8 adcq $0,%rdx xorq %r10,%r10 addq %rax,%r8 movq %r11,%rax adcq %rdx,%r9 adcq $0,%r10 mulq 0(%r14) movq %r11,%rbp addq %rax,%rcx movq %r11,%rax adcq %rdx,%rcx subq %r11,%r13 sbbq $0,%r11 mulq 8(%r14) addq %rcx,%r12 adcq $0,%rdx addq %rax,%r12 movq %rbp,%rax adcq %rdx,%r13 movq %rbp,%rdx adcq $0,%r11 shlq $32,%rax shrq $32,%rdx subq %rax,%r8 sbbq %rdx,%rbp addq %r11,%r8 adcq %rbp,%r9 adcq $0,%r10 movq %r12,%rsi subq 0(%r14),%r12 movq %r13,%r11 sbbq 8(%r14),%r13 movq %r8,%rcx sbbq 16(%r14),%r8 movq %r9,%rbp sbbq 24(%r14),%r9 sbbq $0,%r10 cmovcq %rsi,%r12 cmovcq %r11,%r13 cmovcq %rcx,%r8 cmovcq %rbp,%r9 movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_mul_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont .globl ecp_nistz256_ord_sqr_mont .type ecp_nistz256_ord_sqr_mont,@function .align 32 ecp_nistz256_ord_sqr_mont: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je .Lecp_nistz256_ord_sqr_montx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq 
%r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lord_sqr_body: movq 0(%rsi),%r8 movq 8(%rsi),%rax movq 16(%rsi),%r14 movq 24(%rsi),%r15 leaq .Lord(%rip),%rsi movq %rdx,%rbx jmp .Loop_ord_sqr .align 32 .Loop_ord_sqr: movq %rax,%rbp mulq %r8 movq %rax,%r9 .byte 102,72,15,110,205 movq %r14,%rax movq %rdx,%r10 mulq %r8 addq %rax,%r10 movq %r15,%rax .byte 102,73,15,110,214 adcq $0,%rdx movq %rdx,%r11 mulq %r8 addq %rax,%r11 movq %r15,%rax .byte 102,73,15,110,223 adcq $0,%rdx movq %rdx,%r12 mulq %r14 movq %rax,%r13 movq %r14,%rax movq %rdx,%r14 mulq %rbp addq %rax,%r11 movq %r15,%rax adcq $0,%rdx movq %rdx,%r15 mulq %rbp addq %rax,%r12 adcq $0,%rdx addq %r15,%r12 adcq %rdx,%r13 adcq $0,%r14 xorq %r15,%r15 movq %r8,%rax addq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %r12,%r12 adcq %r13,%r13 adcq %r14,%r14 adcq $0,%r15 mulq %rax movq %rax,%r8 .byte 102,72,15,126,200 movq %rdx,%rbp mulq %rax addq %rbp,%r9 adcq %rax,%r10 .byte 102,72,15,126,208 adcq $0,%rdx movq %rdx,%rbp mulq %rax addq %rbp,%r11 adcq %rax,%r12 .byte 102,72,15,126,216 adcq $0,%rdx movq %rdx,%rbp movq %r8,%rcx imulq 32(%rsi),%r8 mulq %rax addq %rbp,%r13 adcq %rax,%r14 movq 0(%rsi),%rax adcq %rdx,%r15 mulq %r8 movq %r8,%rbp addq %rax,%rcx movq 8(%rsi),%rax adcq %rdx,%rcx subq %r8,%r10 sbbq $0,%rbp mulq %r8 addq %rcx,%r9 adcq $0,%rdx addq %rax,%r9 movq %r8,%rax adcq %rdx,%r10 movq %r8,%rdx adcq $0,%rbp movq %r9,%rcx imulq 32(%rsi),%r9 shlq $32,%rax shrq $32,%rdx subq %rax,%r11 movq 0(%rsi),%rax sbbq %rdx,%r8 addq %rbp,%r11 adcq $0,%r8 mulq %r9 movq %r9,%rbp addq %rax,%rcx movq 8(%rsi),%rax adcq %rdx,%rcx subq %r9,%r11 sbbq $0,%rbp mulq %r9 addq %rcx,%r10 adcq $0,%rdx addq %rax,%r10 movq %r9,%rax adcq %rdx,%r11 movq %r9,%rdx adcq $0,%rbp movq %r10,%rcx imulq 32(%rsi),%r10 shlq $32,%rax shrq $32,%rdx subq %rax,%r8 movq 0(%rsi),%rax sbbq %rdx,%r9 addq %rbp,%r8 adcq $0,%r9 mulq %r10 movq %r10,%rbp addq %rax,%rcx movq 8(%rsi),%rax adcq %rdx,%rcx subq %r10,%r8 sbbq $0,%rbp mulq %r10 addq %rcx,%r11 adcq $0,%rdx addq %rax,%r11 movq %r10,%rax adcq %rdx,%r8 movq %r10,%rdx adcq $0,%rbp movq %r11,%rcx imulq 32(%rsi),%r11 shlq $32,%rax shrq $32,%rdx subq %rax,%r9 movq 0(%rsi),%rax sbbq %rdx,%r10 addq %rbp,%r9 adcq $0,%r10 mulq %r11 movq %r11,%rbp addq %rax,%rcx movq 8(%rsi),%rax adcq %rdx,%rcx subq %r11,%r9 sbbq $0,%rbp mulq %r11 addq %rcx,%r8 adcq $0,%rdx addq %rax,%r8 movq %r11,%rax adcq %rdx,%r9 movq %r11,%rdx adcq $0,%rbp shlq $32,%rax shrq $32,%rdx subq %rax,%r10 sbbq %rdx,%r11 addq %rbp,%r10 adcq $0,%r11 xorq %rdx,%rdx addq %r12,%r8 adcq %r13,%r9 movq %r8,%r12 adcq %r14,%r10 adcq %r15,%r11 movq %r9,%rax adcq $0,%rdx subq 0(%rsi),%r8 movq %r10,%r14 sbbq 8(%rsi),%r9 sbbq 16(%rsi),%r10 movq %r11,%r15 sbbq 24(%rsi),%r11 sbbq $0,%rdx cmovcq %r12,%r8 cmovncq %r9,%rax cmovncq %r10,%r14 cmovncq %r11,%r15 decq %rbx jnz .Loop_ord_sqr movq %r8,0(%rdi) movq %rax,8(%rdi) pxor %xmm1,%xmm1 movq %r14,16(%rdi) pxor %xmm2,%xmm2 movq %r15,24(%rdi) pxor %xmm3,%xmm3 movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lord_sqr_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_mul_montx,@function +.align 32 +ecp_nistz256_ord_mul_montx: +.cfi_startproc +.Lecp_nistz256_ord_mul_montx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 
8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mulx_body: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq .Lord-128(%rip),%r14 + movq .LordK(%rip),%r15 + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore 
%r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx + +.type ecp_nistz256_ord_sqr_montx,@function +.align 32 +ecp_nistz256_ord_sqr_montx: +.cfi_startproc +.Lecp_nistz256_ord_sqr_montx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq .Lord(%rip),%rsi + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax +.byte 102,73,15,110,206 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 +.byte 102,73,15,110,215 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx +.byte 102,73,15,110,216 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp +.byte 102,72,15,126,202 + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax +.byte 102,72,15,126,210 + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 +.byte 102,72,15,126,218 + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq 
%rbx + jnz .Loop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx + + + + .globl ecp_nistz256_to_mont .type ecp_nistz256_to_mont,@function .align 32 ecp_nistz256_to_mont: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx leaq .LRR(%rip),%rdx jmp .Lmul_mont .cfi_endproc .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont .globl ecp_nistz256_mul_mont .type ecp_nistz256_mul_mont,@function .align 32 ecp_nistz256_mul_mont: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx .Lmul_mont: pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lmul_body: + cmpl $0x80100,%ecx + je .Lmul_montx movq %rdx,%rbx movq 0(%rdx),%rax movq 0(%rsi),%r9 movq 8(%rsi),%r10 movq 16(%rsi),%r11 movq 24(%rsi),%r12 call __ecp_nistz256_mul_montq + jmp .Lmul_mont_done + +.align 32 +.Lmul_montx: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx .Lmul_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lmul_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont .type __ecp_nistz256_mul_montq,@function .align 32 __ecp_nistz256_mul_montq: .cfi_startproc movq %rax,%rbp mulq %r9 movq .Lpoly+8(%rip),%r14 movq %rax,%r8 movq %rbp,%rax movq %rdx,%r9 mulq %r10 movq .Lpoly+24(%rip),%r15 addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r10 mulq %r11 addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%r11 mulq %r12 addq %rax,%r11 movq %r8,%rax adcq $0,%rdx xorq %r13,%r13 movq %rdx,%r12 movq %r8,%rbp shlq $32,%r8 mulq %r15 shrq $32,%rbp addq %r8,%r9 adcq %rbp,%r10 adcq %rax,%r11 movq 8(%rbx),%rax adcq %rdx,%r12 adcq $0,%r13 xorq %r8,%r8 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r9 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 8(%rsi) addq %rcx,%r10 adcq $0,%rdx addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 16(%rsi) addq %rcx,%r11 adcq $0,%rdx addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 24(%rsi) addq %rcx,%r12 adcq $0,%rdx addq %rax,%r12 movq %r9,%rax adcq %rdx,%r13 adcq $0,%r8 movq %r9,%rbp shlq $32,%r9 mulq %r15 shrq $32,%rbp addq %r9,%r10 adcq %rbp,%r11 adcq %rax,%r12 movq 16(%rbx),%rax adcq %rdx,%r13 adcq $0,%r8 xorq %r9,%r9 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r10 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 8(%rsi) addq %rcx,%r11 adcq $0,%rdx addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 16(%rsi) addq %rcx,%r12 adcq $0,%rdx addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx 
movq %rdx,%rcx mulq 24(%rsi) addq %rcx,%r13 adcq $0,%rdx addq %rax,%r13 movq %r10,%rax adcq %rdx,%r8 adcq $0,%r9 movq %r10,%rbp shlq $32,%r10 mulq %r15 shrq $32,%rbp addq %r10,%r11 adcq %rbp,%r12 adcq %rax,%r13 movq 24(%rbx),%rax adcq %rdx,%r8 adcq $0,%r9 xorq %r10,%r10 movq %rax,%rbp mulq 0(%rsi) addq %rax,%r11 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 8(%rsi) addq %rcx,%r12 adcq $0,%rdx addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 16(%rsi) addq %rcx,%r13 adcq $0,%rdx addq %rax,%r13 movq %rbp,%rax adcq $0,%rdx movq %rdx,%rcx mulq 24(%rsi) addq %rcx,%r8 adcq $0,%rdx addq %rax,%r8 movq %r11,%rax adcq %rdx,%r9 adcq $0,%r10 movq %r11,%rbp shlq $32,%r11 mulq %r15 shrq $32,%rbp addq %r11,%r12 adcq %rbp,%r13 movq %r12,%rcx adcq %rax,%r8 adcq %rdx,%r9 movq %r13,%rbp adcq $0,%r10 subq $-1,%r12 movq %r8,%rbx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%rdx sbbq %r15,%r9 sbbq $0,%r10 cmovcq %rcx,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rbx,%r8 movq %r13,8(%rdi) cmovcq %rdx,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) .byte 0xf3,0xc3 .cfi_endproc .size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq .globl ecp_nistz256_sqr_mont .type ecp_nistz256_sqr_mont,@function .align 32 ecp_nistz256_sqr_mont: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lsqr_body: + cmpl $0x80100,%ecx + je .Lsqr_montx movq 0(%rsi),%rax movq 8(%rsi),%r14 movq 16(%rsi),%r15 movq 24(%rsi),%r8 call __ecp_nistz256_sqr_montq + jmp .Lsqr_mont_done + +.align 32 +.Lsqr_montx: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx .Lsqr_mont_done: movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbx .cfi_restore %rbx movq 40(%rsp),%rbp .cfi_restore %rbp leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lsqr_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont .type __ecp_nistz256_sqr_montq,@function .align 32 __ecp_nistz256_sqr_montq: .cfi_startproc movq %rax,%r13 mulq %r14 movq %rax,%r9 movq %r15,%rax movq %rdx,%r10 mulq %r13 addq %rax,%r10 movq %r8,%rax adcq $0,%rdx movq %rdx,%r11 mulq %r13 addq %rax,%r11 movq %r15,%rax adcq $0,%rdx movq %rdx,%r12 mulq %r14 addq %rax,%r11 movq %r8,%rax adcq $0,%rdx movq %rdx,%rbp mulq %r14 addq %rax,%r12 movq %r8,%rax adcq $0,%rdx addq %rbp,%r12 movq %rdx,%r13 adcq $0,%r13 mulq %r15 xorq %r15,%r15 addq %rax,%r13 movq 0(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 addq %r9,%r9 adcq %r10,%r10 adcq %r11,%r11 adcq %r12,%r12 adcq %r13,%r13 adcq %r14,%r14 adcq $0,%r15 mulq %rax movq %rax,%r8 movq 8(%rsi),%rax movq %rdx,%rcx mulq %rax addq %rcx,%r9 adcq %rax,%r10 movq 16(%rsi),%rax adcq $0,%rdx movq %rdx,%rcx mulq %rax addq %rcx,%r11 adcq %rax,%r12 movq 24(%rsi),%rax adcq $0,%rdx movq %rdx,%rcx mulq %rax addq %rcx,%r13 adcq %rax,%r14 movq %r8,%rax adcq %rdx,%r15 movq .Lpoly+8(%rip),%rsi movq .Lpoly+24(%rip),%rbp movq %r8,%rcx shlq $32,%r8 mulq %rbp shrq $32,%rcx addq %r8,%r9 adcq %rcx,%r10 adcq %rax,%r11 movq %r9,%rax adcq $0,%rdx movq %r9,%rcx shlq $32,%r9 movq %rdx,%r8 mulq %rbp shrq $32,%rcx addq 
%r9,%r10 adcq %rcx,%r11 adcq %rax,%r8 movq %r10,%rax adcq $0,%rdx movq %r10,%rcx shlq $32,%r10 movq %rdx,%r9 mulq %rbp shrq $32,%rcx addq %r10,%r11 adcq %rcx,%r8 adcq %rax,%r9 movq %r11,%rax adcq $0,%rdx movq %r11,%rcx shlq $32,%r11 movq %rdx,%r10 mulq %rbp shrq $32,%rcx addq %r11,%r8 adcq %rcx,%r9 adcq %rax,%r10 adcq $0,%rdx xorq %r11,%r11 addq %r8,%r12 adcq %r9,%r13 movq %r12,%r8 adcq %r10,%r14 adcq %rdx,%r15 movq %r13,%r9 adcq $0,%r11 subq $-1,%r12 movq %r14,%r10 sbbq %rsi,%r13 sbbq $0,%r14 movq %r15,%rcx sbbq %rbp,%r15 sbbq $0,%r11 cmovcq %r8,%r12 cmovcq %r9,%r13 movq %r12,0(%rdi) cmovcq %r10,%r14 movq %r13,8(%rdi) cmovcq %rcx,%r15 movq %r14,16(%rdi) movq %r15,24(%rdi) .byte 0xf3,0xc3 .cfi_endproc .size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +.type __ecp_nistz256_mul_montx,@function +.align 32 +__ecp_nistz256_mul_montx: +.cfi_startproc + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq .Lpoly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + addq %rbp,%r9 + adcq %rcx,%r10 + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq %rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq .Lpoly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq %r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx + +.type __ecp_nistz256_sqr_montx,@function +.align 32 +__ecp_nistz256_sqr_montx: +.cfi_startproc + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq 
%r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + + mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 0x67,0x67 + mulxq %rdx,%rcx,%rax + movq .Lpoly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq .Lpoly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 + adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx + + + + + + .globl ecp_nistz256_from_mont .type ecp_nistz256_from_mont,@function .align 32 ecp_nistz256_from_mont: .cfi_startproc pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-16 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-24 .Lfrom_body: movq 0(%rsi),%rax movq .Lpoly+24(%rip),%r13 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq %rax,%r8 movq .Lpoly+8(%rip),%r12 movq %rax,%rcx shlq $32,%r8 mulq %r13 shrq $32,%rcx addq %r8,%r9 adcq %rcx,%r10 adcq %rax,%r11 movq %r9,%rax adcq $0,%rdx movq %r9,%rcx shlq $32,%r9 movq %rdx,%r8 mulq %r13 shrq $32,%rcx addq %r9,%r10 adcq %rcx,%r11 adcq %rax,%r8 movq %r10,%rax adcq $0,%rdx movq %r10,%rcx shlq $32,%r10 movq %rdx,%r9 mulq %r13 shrq $32,%rcx addq %r10,%r11 adcq %rcx,%r8 adcq %rax,%r9 movq %r11,%rax adcq $0,%rdx movq %r11,%rcx shlq $32,%r11 movq %rdx,%r10 mulq %r13 shrq $32,%rcx addq %r11,%r8 adcq %rcx,%r9 movq %r8,%rcx adcq %rax,%r10 movq %r9,%rsi adcq $0,%rdx subq $-1,%r8 movq %r10,%rax sbbq %r12,%r9 sbbq $0,%r10 movq %rdx,%r11 sbbq %r13,%rdx sbbq %r13,%r13 cmovnzq %rcx,%r8 cmovnzq %rsi,%r9 movq %r8,0(%rdi) cmovnzq %rax,%r10 movq %r9,8(%rdi) cmovzq %rdx,%r11 movq %r10,16(%rdi) movq %r11,24(%rdi) movq 0(%rsp),%r13 .cfi_restore %r13 movq 8(%rsp),%r12 .cfi_restore %r12 leaq 16(%rsp),%rsp .cfi_adjust_cfa_offset -16 .Lfrom_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont .globl ecp_nistz256_scatter_w5 .type ecp_nistz256_scatter_w5,@function .align 32 ecp_nistz256_scatter_w5: .cfi_startproc leal -3(%rdx,%rdx,2),%edx movdqa 0(%rsi),%xmm0 shll $5,%edx movdqa 16(%rsi),%xmm1 movdqa 32(%rsi),%xmm2 movdqa 48(%rsi),%xmm3 movdqa 64(%rsi),%xmm4 movdqa 80(%rsi),%xmm5 movdqa 
%xmm0,0(%rdi,%rdx,1) movdqa %xmm1,16(%rdi,%rdx,1) movdqa %xmm2,32(%rdi,%rdx,1) movdqa %xmm3,48(%rdi,%rdx,1) movdqa %xmm4,64(%rdi,%rdx,1) movdqa %xmm5,80(%rdi,%rdx,1) .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 .globl ecp_nistz256_gather_w5 .type ecp_nistz256_gather_w5,@function .align 32 ecp_nistz256_gather_w5: .cfi_startproc + movl OPENSSL_ia32cap_P+8(%rip),%eax + testl $32,%eax + jnz .Lavx2_gather_w5 movdqa .LOne(%rip),%xmm0 movd %edx,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 pxor %xmm6,%xmm6 pxor %xmm7,%xmm7 movdqa %xmm0,%xmm8 pshufd $0,%xmm1,%xmm1 movq $16,%rax .Lselect_loop_sse_w5: movdqa %xmm8,%xmm15 paddd %xmm0,%xmm8 pcmpeqd %xmm1,%xmm15 movdqa 0(%rsi),%xmm9 movdqa 16(%rsi),%xmm10 movdqa 32(%rsi),%xmm11 movdqa 48(%rsi),%xmm12 movdqa 64(%rsi),%xmm13 movdqa 80(%rsi),%xmm14 leaq 96(%rsi),%rsi pand %xmm15,%xmm9 pand %xmm15,%xmm10 por %xmm9,%xmm2 pand %xmm15,%xmm11 por %xmm10,%xmm3 pand %xmm15,%xmm12 por %xmm11,%xmm4 pand %xmm15,%xmm13 por %xmm12,%xmm5 pand %xmm15,%xmm14 por %xmm13,%xmm6 por %xmm14,%xmm7 decq %rax jnz .Lselect_loop_sse_w5 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqu %xmm4,32(%rdi) movdqu %xmm5,48(%rdi) movdqu %xmm6,64(%rdi) movdqu %xmm7,80(%rdi) .byte 0xf3,0xc3 .cfi_endproc .LSEH_end_ecp_nistz256_gather_w5: .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 .globl ecp_nistz256_scatter_w7 .type ecp_nistz256_scatter_w7,@function .align 32 ecp_nistz256_scatter_w7: .cfi_startproc movdqu 0(%rsi),%xmm0 shll $6,%edx movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm3 movdqa %xmm0,0(%rdi,%rdx,1) movdqa %xmm1,16(%rdi,%rdx,1) movdqa %xmm2,32(%rdi,%rdx,1) movdqa %xmm3,48(%rdi,%rdx,1) .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 .globl ecp_nistz256_gather_w7 .type ecp_nistz256_gather_w7,@function .align 32 ecp_nistz256_gather_w7: .cfi_startproc + movl OPENSSL_ia32cap_P+8(%rip),%eax + testl $32,%eax + jnz .Lavx2_gather_w7 movdqa .LOne(%rip),%xmm8 movd %edx,%xmm1 pxor %xmm2,%xmm2 pxor %xmm3,%xmm3 pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 movdqa %xmm8,%xmm0 pshufd $0,%xmm1,%xmm1 movq $64,%rax .Lselect_loop_sse_w7: movdqa %xmm8,%xmm15 paddd %xmm0,%xmm8 movdqa 0(%rsi),%xmm9 movdqa 16(%rsi),%xmm10 pcmpeqd %xmm1,%xmm15 movdqa 32(%rsi),%xmm11 movdqa 48(%rsi),%xmm12 leaq 64(%rsi),%rsi pand %xmm15,%xmm9 pand %xmm15,%xmm10 por %xmm9,%xmm2 pand %xmm15,%xmm11 por %xmm10,%xmm3 pand %xmm15,%xmm12 por %xmm11,%xmm4 prefetcht0 255(%rsi) por %xmm12,%xmm5 decq %rax jnz .Lselect_loop_sse_w7 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqu %xmm4,32(%rdi) movdqu %xmm5,48(%rdi) .byte 0xf3,0xc3 .cfi_endproc .LSEH_end_ecp_nistz256_gather_w7: .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 + + +.type ecp_nistz256_avx2_gather_w5,@function +.align 32 +ecp_nistz256_avx2_gather_w5: +.cfi_startproc +.Lavx2_gather_w5: + vzeroupper + vmovdqa .LTwo(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + vpxor %ymm4,%ymm4,%ymm4 + + vmovdqa .LOne(%rip),%ymm5 + vmovdqa .LTwo(%rip),%ymm10 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + movq $8,%rax +.Lselect_loop_avx2_w5: + + vmovdqa 0(%rsi),%ymm6 + vmovdqa 32(%rsi),%ymm7 + vmovdqa 64(%rsi),%ymm8 + + vmovdqa 96(%rsi),%ymm11 + vmovdqa 128(%rsi),%ymm12 + vmovdqa 160(%rsi),%ymm13 + + vpcmpeqd %ymm1,%ymm5,%ymm9 + vpcmpeqd %ymm1,%ymm10,%ymm14 + + vpaddd %ymm0,%ymm5,%ymm5 + vpaddd %ymm0,%ymm10,%ymm10 + leaq 192(%rsi),%rsi + + vpand %ymm9,%ymm6,%ymm6 + vpand %ymm9,%ymm7,%ymm7 + vpand %ymm9,%ymm8,%ymm8 + vpand %ymm14,%ymm11,%ymm11 
+ vpand %ymm14,%ymm12,%ymm12 + vpand %ymm14,%ymm13,%ymm13 + + vpxor %ymm6,%ymm2,%ymm2 + vpxor %ymm7,%ymm3,%ymm3 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm11,%ymm2,%ymm2 + vpxor %ymm12,%ymm3,%ymm3 + vpxor %ymm13,%ymm4,%ymm4 + + decq %rax + jnz .Lselect_loop_avx2_w5 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,64(%rdi) + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.LSEH_end_ecp_nistz256_avx2_gather_w5: +.size ecp_nistz256_avx2_gather_w5,.-ecp_nistz256_avx2_gather_w5 + + + .globl ecp_nistz256_avx2_gather_w7 .type ecp_nistz256_avx2_gather_w7,@function .align 32 ecp_nistz256_avx2_gather_w7: .cfi_startproc -.byte 0x0f,0x0b +.Lavx2_gather_w7: + vzeroupper + vmovdqa .LThree(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + + vmovdqa .LOne(%rip),%ymm4 + vmovdqa .LTwo(%rip),%ymm8 + vmovdqa .LThree(%rip),%ymm12 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + + movq $21,%rax +.Lselect_loop_avx2_w7: + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vmovdqa 64(%rsi),%ymm9 + vmovdqa 96(%rsi),%ymm10 + + vmovdqa 128(%rsi),%ymm13 + vmovdqa 160(%rsi),%ymm14 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + vpcmpeqd %ymm1,%ymm8,%ymm11 + vpcmpeqd %ymm1,%ymm12,%ymm15 + + vpaddd %ymm0,%ymm4,%ymm4 + vpaddd %ymm0,%ymm8,%ymm8 + vpaddd %ymm0,%ymm12,%ymm12 + leaq 192(%rsi),%rsi + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm11,%ymm9,%ymm9 + vpand %ymm11,%ymm10,%ymm10 + vpand %ymm15,%ymm13,%ymm13 + vpand %ymm15,%ymm14,%ymm14 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm13,%ymm2,%ymm2 + vpxor %ymm14,%ymm3,%ymm3 + + decq %rax + jnz .Lselect_loop_avx2_w7 + + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vzeroupper .byte 0xf3,0xc3 .cfi_endproc +.LSEH_end_ecp_nistz256_avx2_gather_w7: .size ecp_nistz256_avx2_gather_w7,.-ecp_nistz256_avx2_gather_w7 .type __ecp_nistz256_add_toq,@function .align 32 __ecp_nistz256_add_toq: .cfi_startproc xorq %r11,%r11 addq 0(%rbx),%r12 adcq 8(%rbx),%r13 movq %r12,%rax adcq 16(%rbx),%r8 adcq 24(%rbx),%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rcx,%r8 movq %r13,8(%rdi) cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) .byte 0xf3,0xc3 .cfi_endproc .size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq .type __ecp_nistz256_sub_fromq,@function .align 32 __ecp_nistz256_sub_fromq: .cfi_startproc subq 0(%rbx),%r12 sbbq 8(%rbx),%r13 movq %r12,%rax sbbq 16(%rbx),%r8 sbbq 24(%rbx),%r9 movq %r13,%rbp sbbq %r11,%r11 addq $-1,%r12 movq %r8,%rcx adcq %r14,%r13 adcq $0,%r8 movq %r9,%r10 adcq %r15,%r9 testq %r11,%r11 cmovzq %rax,%r12 cmovzq %rbp,%r13 movq %r12,0(%rdi) cmovzq %rcx,%r8 movq %r13,8(%rdi) cmovzq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) .byte 0xf3,0xc3 .cfi_endproc .size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq .type __ecp_nistz256_subq,@function .align 32 __ecp_nistz256_subq: .cfi_startproc subq %r12,%rax sbbq %r13,%rbp movq %rax,%r12 sbbq %r8,%rcx sbbq %r9,%r10 movq %rbp,%r13 sbbq %r11,%r11 addq $-1,%rax movq %rcx,%r8 adcq %r14,%rbp adcq $0,%rcx movq %r10,%r9 adcq %r15,%r10 testq %r11,%r11 cmovnzq %rax,%r12 cmovnzq %rbp,%r13 cmovnzq %rcx,%r8 cmovnzq %r10,%r9 .byte 0xf3,0xc3 .cfi_endproc .size 
__ecp_nistz256_subq,.-__ecp_nistz256_subq .type __ecp_nistz256_mul_by_2q,@function .align 32 __ecp_nistz256_mul_by_2q: .cfi_startproc xorq %r11,%r11 addq %r12,%r12 adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 cmovcq %rbp,%r13 movq %r12,0(%rdi) cmovcq %rcx,%r8 movq %r13,8(%rdi) cmovcq %r10,%r9 movq %r8,16(%rdi) movq %r9,24(%rdi) .byte 0xf3,0xc3 .cfi_endproc .size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q .globl ecp_nistz256_point_double .type ecp_nistz256_point_double,@function .align 32 ecp_nistz256_point_double: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je .Lpoint_doublex pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $160+8,%rsp .cfi_adjust_cfa_offset 32*5+8 .Lpoint_doubleq_body: .Lpoint_double_shortcutq: movdqu 0(%rsi),%xmm0 movq %rsi,%rbx movdqu 16(%rsi),%xmm1 movq 32+0(%rsi),%r12 movq 32+8(%rsi),%r13 movq 32+16(%rsi),%r8 movq 32+24(%rsi),%r9 movq .Lpoly+8(%rip),%r14 movq .Lpoly+24(%rip),%r15 movdqa %xmm0,96(%rsp) movdqa %xmm1,96+16(%rsp) leaq 32(%rdi),%r10 leaq 64(%rdi),%r11 .byte 102,72,15,110,199 .byte 102,73,15,110,202 .byte 102,73,15,110,211 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_by_2q movq 64+0(%rsi),%rax movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 leaq 64-0(%rsi),%rsi leaq 64(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 0+0(%rsp),%rax movq 8+0(%rsp),%r14 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 0(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 32(%rbx),%rax movq 64+0(%rbx),%r9 movq 64+8(%rbx),%r10 movq 64+16(%rbx),%r11 movq 64+24(%rbx),%r12 leaq 64-0(%rbx),%rsi leaq 32(%rbx),%rbx .byte 102,72,15,126,215 call __ecp_nistz256_mul_montq call __ecp_nistz256_mul_by_2q movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 leaq 64(%rsp),%rbx movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 32(%rsp),%rdi call __ecp_nistz256_add_toq movq 96+0(%rsp),%r12 movq 96+8(%rsp),%r13 leaq 64(%rsp),%rbx movq 96+16(%rsp),%r8 movq 96+24(%rsp),%r9 leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 0+0(%rsp),%rax movq 8+0(%rsp),%r14 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 .byte 102,72,15,126,207 call __ecp_nistz256_sqr_montq xorq %r9,%r9 movq %r12,%rax addq $-1,%r12 movq %r13,%r10 adcq %rsi,%r13 movq %r14,%rcx adcq $0,%r14 movq %r15,%r8 adcq %rbp,%r15 adcq $0,%r9 xorq %rsi,%rsi testq $1,%rax cmovzq %rax,%r12 cmovzq %r10,%r13 cmovzq %rcx,%r14 cmovzq %r8,%r15 cmovzq %rsi,%r9 movq %r13,%rax shrq $1,%r12 shlq $63,%rax movq %r14,%r10 shrq $1,%r13 orq %rax,%r12 shlq $63,%r10 movq %r15,%rcx shrq $1,%r14 orq %r10,%r13 shlq $63,%rcx movq %r12,0(%rdi) shrq $1,%r15 movq %r13,8(%rdi) shlq $63,%r9 orq %rcx,%r14 orq %r9,%r15 movq %r14,16(%rdi) movq %r15,24(%rdi) movq 64(%rsp),%rax leaq 64(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 128(%rsp),%rdi call __ecp_nistz256_mul_by_2q leaq 32(%rsp),%rbx leaq 32(%rsp),%rdi call __ecp_nistz256_add_toq movq 96(%rsp),%rax leaq 96(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 leaq 
0+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 128(%rsp),%rdi call __ecp_nistz256_mul_by_2q movq 0+32(%rsp),%rax movq 8+32(%rsp),%r14 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r15 movq 24+32(%rsp),%r8 .byte 102,72,15,126,199 call __ecp_nistz256_sqr_montq leaq 128(%rsp),%rbx movq %r14,%r8 movq %r15,%r9 movq %rsi,%r14 movq %rbp,%r15 call __ecp_nistz256_sub_fromq movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp movq 0+16(%rsp),%rcx movq 0+24(%rsp),%r10 leaq 0(%rsp),%rdi call __ecp_nistz256_subq movq 32(%rsp),%rax leaq 32(%rsp),%rbx movq %r12,%r14 xorl %ecx,%ecx movq %r12,0+0(%rsp) movq %r13,%r10 movq %r13,0+8(%rsp) cmovzq %r8,%r11 movq %r8,0+16(%rsp) leaq 0-0(%rsp),%rsi cmovzq %r9,%r12 movq %r9,0+24(%rsp) movq %r14,%r9 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq .byte 102,72,15,126,203 .byte 102,72,15,126,207 call __ecp_nistz256_sub_fromq leaq 160+56(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbx .cfi_restore %rbx movq -8(%rsi),%rbp .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpoint_doubleq_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_point_double,.-ecp_nistz256_point_double .globl ecp_nistz256_point_add .type ecp_nistz256_point_add,@function .align 32 ecp_nistz256_point_add: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je .Lpoint_addx pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $576+8,%rsp .cfi_adjust_cfa_offset 32*18+8 .Lpoint_addq_body: movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm3 movdqu 64(%rsi),%xmm4 movdqu 80(%rsi),%xmm5 movq %rsi,%rbx movq %rdx,%rsi movdqa %xmm0,384(%rsp) movdqa %xmm1,384+16(%rsp) movdqa %xmm2,416(%rsp) movdqa %xmm3,416+16(%rsp) movdqa %xmm4,448(%rsp) movdqa %xmm5,448+16(%rsp) por %xmm4,%xmm5 movdqu 0(%rsi),%xmm0 pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 por %xmm3,%xmm5 movdqu 48(%rsi),%xmm3 movq 64+0(%rsi),%rax movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,480(%rsp) pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,480+16(%rsp) movdqu 64(%rsi),%xmm0 movdqu 80(%rsi),%xmm1 movdqa %xmm2,512(%rsp) movdqa %xmm3,512+16(%rsp) por %xmm4,%xmm5 pxor %xmm4,%xmm4 por %xmm0,%xmm1 .byte 102,72,15,110,199 leaq 64-0(%rsi),%rsi movq %rax,544+0(%rsp) movq %r14,544+8(%rsp) movq %r15,544+16(%rsp) movq %r8,544+24(%rsp) leaq 96(%rsp),%rdi call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm1,%xmm4 por %xmm1,%xmm4 pshufd $0,%xmm5,%xmm5 pshufd $0x1e,%xmm4,%xmm3 por %xmm3,%xmm4 pxor %xmm3,%xmm3 pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 movq 64+0(%rbx),%rax movq 64+8(%rbx),%r14 movq 64+16(%rbx),%r15 movq 64+24(%rbx),%r8 .byte 102,72,15,110,203 leaq 64-0(%rbx),%rsi leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 544(%rsp),%rax leaq 544(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 leaq 0+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 224(%rsp),%rdi call __ecp_nistz256_mul_montq movq 448(%rsp),%rax leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi 
movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montq movq 416(%rsp),%rax leaq 416(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 leaq 0+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 224(%rsp),%rdi call __ecp_nistz256_mul_montq movq 512(%rsp),%rax leaq 512(%rsp),%rbx movq 0+256(%rsp),%r9 movq 8+256(%rsp),%r10 leaq 0+256(%rsp),%rsi movq 16+256(%rsp),%r11 movq 24+256(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 224(%rsp),%rbx leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromq orq %r13,%r12 movdqa %xmm4,%xmm2 orq %r8,%r12 orq %r9,%r12 por %xmm5,%xmm2 .byte 102,73,15,110,220 movq 384(%rsp),%rax leaq 384(%rsp),%rbx movq 0+96(%rsp),%r9 movq 8+96(%rsp),%r10 leaq 0+96(%rsp),%rsi movq 16+96(%rsp),%r11 movq 24+96(%rsp),%r12 leaq 160(%rsp),%rdi call __ecp_nistz256_mul_montq movq 480(%rsp),%rax leaq 480(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 160(%rsp),%rbx leaq 0(%rsp),%rdi call __ecp_nistz256_sub_fromq orq %r13,%r12 orq %r8,%r12 orq %r9,%r12 .byte 102,73,15,126,208 .byte 102,73,15,126,217 orq %r8,%r12 orq %r9,%r12 .byte 0x3e jnz .Ladd_proceedq .Ladd_doubleq: .byte 102,72,15,126,206 .byte 102,72,15,126,199 addq $416,%rsp .cfi_adjust_cfa_offset -416 jmp .Lpoint_double_shortcutq .cfi_adjust_cfa_offset 416 .align 32 .Ladd_proceedq: movq 0+64(%rsp),%rax movq 8+64(%rsp),%r14 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 96(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 448(%rsp),%rax leaq 448(%rsp),%rbx movq 0+0(%rsp),%r9 movq 8+0(%rsp),%r10 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r11 movq 24+0(%rsp),%r12 leaq 352(%rsp),%rdi call __ecp_nistz256_mul_montq movq 0+0(%rsp),%rax movq 8+0(%rsp),%r14 leaq 0+0(%rsp),%rsi movq 16+0(%rsp),%r15 movq 24+0(%rsp),%r8 leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 544(%rsp),%rax leaq 544(%rsp),%rbx movq 0+352(%rsp),%r9 movq 8+352(%rsp),%r10 leaq 0+352(%rsp),%rsi movq 16+352(%rsp),%r11 movq 24+352(%rsp),%r12 leaq 352(%rsp),%rdi call __ecp_nistz256_mul_montq movq 0(%rsp),%rax leaq 0(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 128(%rsp),%rdi call __ecp_nistz256_mul_montq movq 160(%rsp),%rax leaq 160(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 192(%rsp),%rdi call __ecp_nistz256_mul_montq xorq %r11,%r11 addq %r12,%r12 leaq 96(%rsp),%rsi adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 movq 0(%rsi),%rax cmovcq %rbp,%r13 movq 8(%rsi),%rbp cmovcq %rcx,%r8 movq 16(%rsi),%rcx cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subq leaq 128(%rsp),%rbx leaq 288(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 192+0(%rsp),%rax movq 192+8(%rsp),%rbp movq 192+16(%rsp),%rcx movq 192+24(%rsp),%r10 leaq 320(%rsp),%rdi call __ecp_nistz256_subq movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 128(%rsp),%rax leaq 128(%rsp),%rbx movq 0+224(%rsp),%r9 movq 8+224(%rsp),%r10 leaq 0+224(%rsp),%rsi movq 16+224(%rsp),%r11 movq 24+224(%rsp),%r12 leaq 256(%rsp),%rdi call __ecp_nistz256_mul_montq movq 320(%rsp),%rax leaq 320(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 
24+64(%rsp),%r12 leaq 320(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 256(%rsp),%rbx leaq 320(%rsp),%rdi call __ecp_nistz256_sub_fromq .byte 102,72,15,126,199 movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 352(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 352+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 544(%rsp),%xmm2 pand 544+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 448(%rsp),%xmm2 pand 448+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,64(%rdi) movdqu %xmm3,80(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 288(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 288+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 480(%rsp),%xmm2 pand 480+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 384(%rsp),%xmm2 pand 384+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 320(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 320+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 512(%rsp),%xmm2 pand 512+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 416(%rsp),%xmm2 pand 416+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) .Ladd_doneq: leaq 576+56(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbx .cfi_restore %rbx movq -8(%rsi),%rbp .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpoint_addq_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_point_add,.-ecp_nistz256_point_add .globl ecp_nistz256_point_add_affine .type ecp_nistz256_point_add_affine,@function .align 32 ecp_nistz256_point_add_affine: .cfi_startproc + movl $0x80100,%ecx + andl OPENSSL_ia32cap_P+8(%rip),%ecx + cmpl $0x80100,%ecx + je .Lpoint_add_affinex pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $480+8,%rsp .cfi_adjust_cfa_offset 32*15+8 .Ladd_affineq_body: movdqu 0(%rsi),%xmm0 movq %rdx,%rbx movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 movdqu 48(%rsi),%xmm3 movdqu 64(%rsi),%xmm4 movdqu 80(%rsi),%xmm5 movq 64+0(%rsi),%rax movq 64+8(%rsi),%r14 movq 64+16(%rsi),%r15 movq 64+24(%rsi),%r8 movdqa %xmm0,320(%rsp) movdqa %xmm1,320+16(%rsp) movdqa %xmm2,352(%rsp) movdqa %xmm3,352+16(%rsp) movdqa %xmm4,384(%rsp) movdqa %xmm5,384+16(%rsp) por %xmm4,%xmm5 movdqu 0(%rbx),%xmm0 pshufd $0xb1,%xmm5,%xmm3 movdqu 16(%rbx),%xmm1 movdqu 32(%rbx),%xmm2 por %xmm3,%xmm5 movdqu 48(%rbx),%xmm3 movdqa %xmm0,416(%rsp) pshufd $0x1e,%xmm5,%xmm4 movdqa %xmm1,416+16(%rsp) por %xmm0,%xmm1 .byte 102,72,15,110,199 movdqa %xmm2,448(%rsp) movdqa %xmm3,448+16(%rsp) por %xmm2,%xmm3 por %xmm4,%xmm5 pxor %xmm4,%xmm4 por %xmm1,%xmm3 leaq 64-0(%rsi),%rsi leaq 32(%rsp),%rdi call __ecp_nistz256_sqr_montq pcmpeqd %xmm4,%xmm5 pshufd $0xb1,%xmm3,%xmm4 movq 0(%rbx),%rax movq %r12,%r9 por %xmm3,%xmm4 pshufd $0,%xmm5,%xmm5 pshufd $0x1e,%xmm4,%xmm3 movq %r13,%r10 por %xmm3,%xmm4 pxor %xmm3,%xmm3 
movq %r14,%r11 pcmpeqd %xmm3,%xmm4 pshufd $0,%xmm4,%xmm4 leaq 32-0(%rsp),%rsi movq %r15,%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 320(%rsp),%rbx leaq 64(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 384(%rsp),%rax leaq 384(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montq movq 384(%rsp),%rax leaq 384(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 288(%rsp),%rdi call __ecp_nistz256_mul_montq movq 448(%rsp),%rax leaq 448(%rsp),%rbx movq 0+32(%rsp),%r9 movq 8+32(%rsp),%r10 leaq 0+32(%rsp),%rsi movq 16+32(%rsp),%r11 movq 24+32(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 352(%rsp),%rbx leaq 96(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 0+64(%rsp),%rax movq 8+64(%rsp),%r14 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r15 movq 24+64(%rsp),%r8 leaq 128(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 0+96(%rsp),%rax movq 8+96(%rsp),%r14 leaq 0+96(%rsp),%rsi movq 16+96(%rsp),%r15 movq 24+96(%rsp),%r8 leaq 192(%rsp),%rdi call __ecp_nistz256_sqr_montq movq 128(%rsp),%rax leaq 128(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 160(%rsp),%rdi call __ecp_nistz256_mul_montq movq 320(%rsp),%rax leaq 320(%rsp),%rbx movq 0+128(%rsp),%r9 movq 8+128(%rsp),%r10 leaq 0+128(%rsp),%rsi movq 16+128(%rsp),%r11 movq 24+128(%rsp),%r12 leaq 0(%rsp),%rdi call __ecp_nistz256_mul_montq xorq %r11,%r11 addq %r12,%r12 leaq 192(%rsp),%rsi adcq %r13,%r13 movq %r12,%rax adcq %r8,%r8 adcq %r9,%r9 movq %r13,%rbp adcq $0,%r11 subq $-1,%r12 movq %r8,%rcx sbbq %r14,%r13 sbbq $0,%r8 movq %r9,%r10 sbbq %r15,%r9 sbbq $0,%r11 cmovcq %rax,%r12 movq 0(%rsi),%rax cmovcq %rbp,%r13 movq 8(%rsi),%rbp cmovcq %rcx,%r8 movq 16(%rsi),%rcx cmovcq %r10,%r9 movq 24(%rsi),%r10 call __ecp_nistz256_subq leaq 160(%rsp),%rbx leaq 224(%rsp),%rdi call __ecp_nistz256_sub_fromq movq 0+0(%rsp),%rax movq 0+8(%rsp),%rbp movq 0+16(%rsp),%rcx movq 0+24(%rsp),%r10 leaq 64(%rsp),%rdi call __ecp_nistz256_subq movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r8,16(%rdi) movq %r9,24(%rdi) movq 352(%rsp),%rax leaq 352(%rsp),%rbx movq 0+160(%rsp),%r9 movq 8+160(%rsp),%r10 leaq 0+160(%rsp),%rsi movq 16+160(%rsp),%r11 movq 24+160(%rsp),%r12 leaq 32(%rsp),%rdi call __ecp_nistz256_mul_montq movq 96(%rsp),%rax leaq 96(%rsp),%rbx movq 0+64(%rsp),%r9 movq 8+64(%rsp),%r10 leaq 0+64(%rsp),%rsi movq 16+64(%rsp),%r11 movq 24+64(%rsp),%r12 leaq 64(%rsp),%rdi call __ecp_nistz256_mul_montq leaq 32(%rsp),%rbx leaq 256(%rsp),%rdi call __ecp_nistz256_sub_fromq .byte 102,72,15,126,199 movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 288(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 288+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand .LONE_mont(%rip),%xmm2 pand .LONE_mont+16(%rip),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 384(%rsp),%xmm2 pand 384+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,64(%rdi) movdqu %xmm3,80(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 224(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 224+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 416(%rsp),%xmm2 pand 416+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 320(%rsp),%xmm2 pand 320+16(%rsp),%xmm3 por %xmm0,%xmm2 por 
%xmm1,%xmm3 movdqu %xmm2,0(%rdi) movdqu %xmm3,16(%rdi) movdqa %xmm5,%xmm0 movdqa %xmm5,%xmm1 pandn 256(%rsp),%xmm0 movdqa %xmm5,%xmm2 pandn 256+16(%rsp),%xmm1 movdqa %xmm5,%xmm3 pand 448(%rsp),%xmm2 pand 448+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 pandn %xmm2,%xmm0 movdqa %xmm4,%xmm2 pandn %xmm3,%xmm1 movdqa %xmm4,%xmm3 pand 352(%rsp),%xmm2 pand 352+16(%rsp),%xmm3 por %xmm0,%xmm2 por %xmm1,%xmm3 movdqu %xmm2,32(%rdi) movdqu %xmm3,48(%rdi) leaq 480+56(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbx .cfi_restore %rbx movq -8(%rsi),%rbp .cfi_restore %rbp leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Ladd_affineq_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +.type __ecp_nistz256_add_tox,@function +.align 32 +__ecp_nistz256_add_tox: +.cfi_startproc + xorq %r11,%r11 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox + +.type __ecp_nistz256_sub_fromx,@function +.align 32 +__ecp_nistz256_sub_fromx: +.cfi_startproc + xorq %r11,%r11 + sbbq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq $0,%r11 + + xorq %r10,%r10 + adcq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 + movq %r12,0(%rdi) + cmovncq %rcx,%r8 + movq %r13,8(%rdi) + cmovncq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx + +.type __ecp_nistz256_subx,@function +.align 32 +__ecp_nistz256_subx: +.cfi_startproc + xorq %r11,%r11 + sbbq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq $0,%r11 + + xorq %r9,%r9 + adcq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_subx,.-__ecp_nistz256_subx + +.type __ecp_nistz256_mul_by_2x,@function +.align 32 +__ecp_nistz256_mul_by_2x: +.cfi_startproc + xorq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x +.type ecp_nistz256_point_doublex,@function +.align 32 +ecp_nistz256_point_doublex: +.cfi_startproc +.Lpoint_doublex: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 
+.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doublex_body: + +.Lpoint_double_shortcutx: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 +.byte 102,72,15,110,199 +.byte 102,73,15,110,202 +.byte 102,73,15,110,211 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-128(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 32(%rbx),%rdx + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-128(%rbx),%rsi + leaq 32(%rbx),%rbx +.byte 102,72,15,126,215 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 +.byte 102,72,15,126,207 + call __ecp_nistz256_sqr_montx + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rdx + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 0+32(%rsp),%rdx + movq 8+32(%rsp),%r14 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 +.byte 102,72,15,126,199 + call __ecp_nistz256_sqr_montx + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subx + + movq 32(%rsp),%rdx + leaq 32(%rsp),%rbx + movq 
%r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-128(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + +.byte 102,72,15,126,203 +.byte 102,72,15,126,207 + call __ecp_nistz256_sub_fromx + + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doublex_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex +.type ecp_nistz256_point_addx,@function +.align 32 +ecp_nistz256_point_addx: +.cfi_startproc +.Lpoint_addx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addx_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rdx + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 +.byte 102,72,15,110,203 + + leaq 64-128(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 416(%rsp),%rdx + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 512(%rsp),%rdx + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 
-128+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 +.byte 102,73,15,110,220 + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 480(%rsp),%rdx + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + +.byte 102,73,15,126,208 +.byte 102,73,15,126,217 + + orq %r8,%r12 + orq %r9,%r12 + + +.byte 0x3e + jnz .Ladd_proceedx + +.Ladd_doublex: +.byte 102,72,15,126,206 +.byte 102,72,15,126,199 + addq $416,%rsp +.cfi_adjust_cfa_offset -416 + jmp .Lpoint_double_shortcutx +.cfi_adjust_cfa_offset 416 + +.align 32 +.Ladd_proceedx: + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq -128+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%rdx + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 160(%rsp),%rdx + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 
24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_donex: + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx +.type ecp_nistz256_point_add_affinex,@function +.align 32 +ecp_nistz256_point_add_affinex: +.cfi_startproc +.Lpoint_add_affinex: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affinex_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 +.byte 102,72,15,110,199 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-128(%rsi),%rsi + leaq 32(%rsp),%rdi + call 
__ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rdx + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-128(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+96(%rsp),%rdx + movq 8+96(%rsp),%r14 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq -128+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rdx + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq -128+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromx + +.byte 102,72,15,126,199 + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa 
%xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affinex_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex Index: stable/12/secure/lib/libcrypto/amd64/ghash-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/ghash-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/ghash-x86_64.S (revision 364963) @@ -1,1378 +1,1849 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from ghash-x86_64.pl. 
*/ .text .globl gcm_gmult_4bit .type gcm_gmult_4bit,@function .align 16 gcm_gmult_4bit: .cfi_startproc pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $280,%rsp .cfi_adjust_cfa_offset 280 .Lgmult_prologue: movzbq 15(%rdi),%r8 leaq .Lrem_4bit(%rip),%r11 xorq %rax,%rax xorq %rbx,%rbx movb %r8b,%al movb %r8b,%bl shlb $4,%al movq $14,%rcx movq 8(%rsi,%rax,1),%r8 movq (%rsi,%rax,1),%r9 andb $0xf0,%bl movq %r8,%rdx jmp .Loop1 .align 16 .Loop1: shrq $4,%r8 andq $0xf,%rdx movq %r9,%r10 movb (%rdi,%rcx,1),%al shrq $4,%r9 xorq 8(%rsi,%rbx,1),%r8 shlq $60,%r10 xorq (%rsi,%rbx,1),%r9 movb %al,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx shlb $4,%al xorq %r10,%r8 decq %rcx js .Lbreak1 shrq $4,%r8 andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 jmp .Loop1 .align 16 .Lbreak1: shrq $4,%r8 andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rax,1),%r8 shlq $60,%r10 xorq (%rsi,%rax,1),%r9 andb $0xf0,%bl xorq (%r11,%rdx,8),%r9 movq %r8,%rdx xorq %r10,%r8 shrq $4,%r8 andq $0xf,%rdx movq %r9,%r10 shrq $4,%r9 xorq 8(%rsi,%rbx,1),%r8 shlq $60,%r10 xorq (%rsi,%rbx,1),%r9 xorq %r10,%r8 xorq (%r11,%rdx,8),%r9 bswapq %r8 bswapq %r9 movq %r8,8(%rdi) movq %r9,(%rdi) leaq 280+48(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lgmult_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size gcm_gmult_4bit,.-gcm_gmult_4bit .globl gcm_ghash_4bit .type gcm_ghash_4bit,@function .align 16 gcm_ghash_4bit: .cfi_startproc pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $280,%rsp .cfi_adjust_cfa_offset 280 .Lghash_prologue: movq %rdx,%r14 movq %rcx,%r15 subq $-128,%rsi leaq 16+128(%rsp),%rbp xorl %edx,%edx movq 0+0-128(%rsi),%r8 movq 0+8-128(%rsi),%rax movb %al,%dl shrq $4,%rax movq %r8,%r10 shrq $4,%r8 movq 16+0-128(%rsi),%r9 shlb $4,%dl movq 16+8-128(%rsi),%rbx shlq $60,%r10 movb %dl,0(%rsp) orq %r10,%rax movb %bl,%dl shrq $4,%rbx movq %r9,%r10 shrq $4,%r9 movq %r8,0(%rbp) movq 32+0-128(%rsi),%r8 shlb $4,%dl movq %rax,0-128(%rbp) movq 32+8-128(%rsi),%rax shlq $60,%r10 movb %dl,1(%rsp) orq %r10,%rbx movb %al,%dl shrq $4,%rax movq %r8,%r10 shrq $4,%r8 movq %r9,8(%rbp) movq 48+0-128(%rsi),%r9 shlb $4,%dl movq %rbx,8-128(%rbp) movq 48+8-128(%rsi),%rbx shlq $60,%r10 movb %dl,2(%rsp) orq %r10,%rax movb %bl,%dl shrq $4,%rbx movq %r9,%r10 shrq $4,%r9 movq %r8,16(%rbp) movq 64+0-128(%rsi),%r8 shlb $4,%dl movq %rax,16-128(%rbp) movq 64+8-128(%rsi),%rax shlq $60,%r10 movb %dl,3(%rsp) orq %r10,%rbx movb %al,%dl shrq $4,%rax movq %r8,%r10 shrq $4,%r8 movq %r9,24(%rbp) movq 80+0-128(%rsi),%r9 shlb $4,%dl movq %rbx,24-128(%rbp) movq 80+8-128(%rsi),%rbx shlq $60,%r10 movb %dl,4(%rsp) orq %r10,%rax movb %bl,%dl shrq $4,%rbx movq %r9,%r10 shrq $4,%r9 movq %r8,32(%rbp) movq 96+0-128(%rsi),%r8 shlb $4,%dl movq %rax,32-128(%rbp) movq 96+8-128(%rsi),%rax shlq $60,%r10 movb %dl,5(%rsp) orq 
%r10,%rbx movb %al,%dl shrq $4,%rax movq %r8,%r10 shrq $4,%r8 movq %r9,40(%rbp) movq 112+0-128(%rsi),%r9 shlb $4,%dl movq %rbx,40-128(%rbp) movq 112+8-128(%rsi),%rbx shlq $60,%r10 movb %dl,6(%rsp) orq %r10,%rax movb %bl,%dl shrq $4,%rbx movq %r9,%r10 shrq $4,%r9 movq %r8,48(%rbp) movq 128+0-128(%rsi),%r8 shlb $4,%dl movq %rax,48-128(%rbp) movq 128+8-128(%rsi),%rax shlq $60,%r10 movb %dl,7(%rsp) orq %r10,%rbx movb %al,%dl shrq $4,%rax movq %r8,%r10 shrq $4,%r8 movq %r9,56(%rbp) movq 144+0-128(%rsi),%r9 shlb $4,%dl movq %rbx,56-128(%rbp) movq 144+8-128(%rsi),%rbx shlq $60,%r10 movb %dl,8(%rsp) orq %r10,%rax movb %bl,%dl shrq $4,%rbx movq %r9,%r10 shrq $4,%r9 movq %r8,64(%rbp) movq 160+0-128(%rsi),%r8 shlb $4,%dl movq %rax,64-128(%rbp) movq 160+8-128(%rsi),%rax shlq $60,%r10 movb %dl,9(%rsp) orq %r10,%rbx movb %al,%dl shrq $4,%rax movq %r8,%r10 shrq $4,%r8 movq %r9,72(%rbp) movq 176+0-128(%rsi),%r9 shlb $4,%dl movq %rbx,72-128(%rbp) movq 176+8-128(%rsi),%rbx shlq $60,%r10 movb %dl,10(%rsp) orq %r10,%rax movb %bl,%dl shrq $4,%rbx movq %r9,%r10 shrq $4,%r9 movq %r8,80(%rbp) movq 192+0-128(%rsi),%r8 shlb $4,%dl movq %rax,80-128(%rbp) movq 192+8-128(%rsi),%rax shlq $60,%r10 movb %dl,11(%rsp) orq %r10,%rbx movb %al,%dl shrq $4,%rax movq %r8,%r10 shrq $4,%r8 movq %r9,88(%rbp) movq 208+0-128(%rsi),%r9 shlb $4,%dl movq %rbx,88-128(%rbp) movq 208+8-128(%rsi),%rbx shlq $60,%r10 movb %dl,12(%rsp) orq %r10,%rax movb %bl,%dl shrq $4,%rbx movq %r9,%r10 shrq $4,%r9 movq %r8,96(%rbp) movq 224+0-128(%rsi),%r8 shlb $4,%dl movq %rax,96-128(%rbp) movq 224+8-128(%rsi),%rax shlq $60,%r10 movb %dl,13(%rsp) orq %r10,%rbx movb %al,%dl shrq $4,%rax movq %r8,%r10 shrq $4,%r8 movq %r9,104(%rbp) movq 240+0-128(%rsi),%r9 shlb $4,%dl movq %rbx,104-128(%rbp) movq 240+8-128(%rsi),%rbx shlq $60,%r10 movb %dl,14(%rsp) orq %r10,%rax movb %bl,%dl shrq $4,%rbx movq %r9,%r10 shrq $4,%r9 movq %r8,112(%rbp) shlb $4,%dl movq %rax,112-128(%rbp) shlq $60,%r10 movb %dl,15(%rsp) orq %r10,%rbx movq %r9,120(%rbp) movq %rbx,120-128(%rbp) addq $-128,%rsi movq 8(%rdi),%r8 movq 0(%rdi),%r9 addq %r14,%r15 leaq .Lrem_8bit(%rip),%r11 jmp .Louter_loop .align 16 .Louter_loop: xorq (%r14),%r9 movq 8(%r14),%rdx leaq 16(%r14),%r14 xorq %r8,%rdx movq %r9,(%rdi) movq %rdx,8(%rdi) shrq $32,%rdx xorq %rax,%rax roll $8,%edx movb %dl,%al movzbl %dl,%ebx shlb $4,%al shrl $4,%ebx roll $8,%edx movq 8(%rsi,%rax,1),%r8 movq (%rsi,%rax,1),%r9 movb %dl,%al movzbl %dl,%ecx shlb $4,%al movzbq (%rsp,%rbx,1),%r12 shrl $4,%ecx xorq %r8,%r12 movq %r9,%r10 shrq $8,%r8 movzbq %r12b,%r12 shrq $8,%r9 xorq -128(%rbp,%rbx,8),%r8 shlq $56,%r10 xorq (%rbp,%rbx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r12,2),%r12 movzbl %dl,%ebx shlb $4,%al movzbq (%rsp,%rcx,1),%r13 shrl $4,%ebx shlq $48,%r12 xorq %r8,%r13 movq %r9,%r10 xorq %r12,%r9 shrq $8,%r8 movzbq %r13b,%r13 shrq $8,%r9 xorq -128(%rbp,%rcx,8),%r8 shlq $56,%r10 xorq (%rbp,%rcx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r13,2),%r13 movzbl %dl,%ecx shlb $4,%al movzbq (%rsp,%rbx,1),%r12 shrl $4,%ecx shlq $48,%r13 xorq %r8,%r12 movq %r9,%r10 xorq %r13,%r9 shrq $8,%r8 movzbq %r12b,%r12 movl 8(%rdi),%edx shrq $8,%r9 xorq -128(%rbp,%rbx,8),%r8 shlq $56,%r10 xorq (%rbp,%rbx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r12,2),%r12 movzbl %dl,%ebx shlb $4,%al movzbq (%rsp,%rcx,1),%r13 shrl $4,%ebx shlq $48,%r12 xorq %r8,%r13 movq %r9,%r10 xorq 
%r12,%r9 shrq $8,%r8 movzbq %r13b,%r13 shrq $8,%r9 xorq -128(%rbp,%rcx,8),%r8 shlq $56,%r10 xorq (%rbp,%rcx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r13,2),%r13 movzbl %dl,%ecx shlb $4,%al movzbq (%rsp,%rbx,1),%r12 shrl $4,%ecx shlq $48,%r13 xorq %r8,%r12 movq %r9,%r10 xorq %r13,%r9 shrq $8,%r8 movzbq %r12b,%r12 shrq $8,%r9 xorq -128(%rbp,%rbx,8),%r8 shlq $56,%r10 xorq (%rbp,%rbx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r12,2),%r12 movzbl %dl,%ebx shlb $4,%al movzbq (%rsp,%rcx,1),%r13 shrl $4,%ebx shlq $48,%r12 xorq %r8,%r13 movq %r9,%r10 xorq %r12,%r9 shrq $8,%r8 movzbq %r13b,%r13 shrq $8,%r9 xorq -128(%rbp,%rcx,8),%r8 shlq $56,%r10 xorq (%rbp,%rcx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r13,2),%r13 movzbl %dl,%ecx shlb $4,%al movzbq (%rsp,%rbx,1),%r12 shrl $4,%ecx shlq $48,%r13 xorq %r8,%r12 movq %r9,%r10 xorq %r13,%r9 shrq $8,%r8 movzbq %r12b,%r12 movl 4(%rdi),%edx shrq $8,%r9 xorq -128(%rbp,%rbx,8),%r8 shlq $56,%r10 xorq (%rbp,%rbx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r12,2),%r12 movzbl %dl,%ebx shlb $4,%al movzbq (%rsp,%rcx,1),%r13 shrl $4,%ebx shlq $48,%r12 xorq %r8,%r13 movq %r9,%r10 xorq %r12,%r9 shrq $8,%r8 movzbq %r13b,%r13 shrq $8,%r9 xorq -128(%rbp,%rcx,8),%r8 shlq $56,%r10 xorq (%rbp,%rcx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r13,2),%r13 movzbl %dl,%ecx shlb $4,%al movzbq (%rsp,%rbx,1),%r12 shrl $4,%ecx shlq $48,%r13 xorq %r8,%r12 movq %r9,%r10 xorq %r13,%r9 shrq $8,%r8 movzbq %r12b,%r12 shrq $8,%r9 xorq -128(%rbp,%rbx,8),%r8 shlq $56,%r10 xorq (%rbp,%rbx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r12,2),%r12 movzbl %dl,%ebx shlb $4,%al movzbq (%rsp,%rcx,1),%r13 shrl $4,%ebx shlq $48,%r12 xorq %r8,%r13 movq %r9,%r10 xorq %r12,%r9 shrq $8,%r8 movzbq %r13b,%r13 shrq $8,%r9 xorq -128(%rbp,%rcx,8),%r8 shlq $56,%r10 xorq (%rbp,%rcx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r13,2),%r13 movzbl %dl,%ecx shlb $4,%al movzbq (%rsp,%rbx,1),%r12 shrl $4,%ecx shlq $48,%r13 xorq %r8,%r12 movq %r9,%r10 xorq %r13,%r9 shrq $8,%r8 movzbq %r12b,%r12 movl 0(%rdi),%edx shrq $8,%r9 xorq -128(%rbp,%rbx,8),%r8 shlq $56,%r10 xorq (%rbp,%rbx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r12,2),%r12 movzbl %dl,%ebx shlb $4,%al movzbq (%rsp,%rcx,1),%r13 shrl $4,%ebx shlq $48,%r12 xorq %r8,%r13 movq %r9,%r10 xorq %r12,%r9 shrq $8,%r8 movzbq %r13b,%r13 shrq $8,%r9 xorq -128(%rbp,%rcx,8),%r8 shlq $56,%r10 xorq (%rbp,%rcx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r13,2),%r13 movzbl %dl,%ecx shlb $4,%al movzbq (%rsp,%rbx,1),%r12 shrl $4,%ecx shlq $48,%r13 xorq %r8,%r12 movq %r9,%r10 xorq %r13,%r9 shrq $8,%r8 movzbq %r12b,%r12 shrq $8,%r9 xorq -128(%rbp,%rbx,8),%r8 shlq $56,%r10 xorq (%rbp,%rbx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r12,2),%r12 movzbl %dl,%ebx shlb $4,%al movzbq (%rsp,%rcx,1),%r13 shrl $4,%ebx shlq $48,%r12 xorq %r8,%r13 movq %r9,%r10 xorq %r12,%r9 shrq $8,%r8 movzbq %r13b,%r13 shrq $8,%r9 xorq -128(%rbp,%rcx,8),%r8 shlq $56,%r10 xorq 
(%rbp,%rcx,8),%r9 roll $8,%edx xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 movb %dl,%al xorq %r10,%r8 movzwq (%r11,%r13,2),%r13 movzbl %dl,%ecx shlb $4,%al movzbq (%rsp,%rbx,1),%r12 andl $240,%ecx shlq $48,%r13 xorq %r8,%r12 movq %r9,%r10 xorq %r13,%r9 shrq $8,%r8 movzbq %r12b,%r12 movl -4(%rdi),%edx shrq $8,%r9 xorq -128(%rbp,%rbx,8),%r8 shlq $56,%r10 xorq (%rbp,%rbx,8),%r9 movzwq (%r11,%r12,2),%r12 xorq 8(%rsi,%rax,1),%r8 xorq (%rsi,%rax,1),%r9 shlq $48,%r12 xorq %r10,%r8 xorq %r12,%r9 movzbq %r8b,%r13 shrq $4,%r8 movq %r9,%r10 shlb $4,%r13b shrq $4,%r9 xorq 8(%rsi,%rcx,1),%r8 movzwq (%r11,%r13,2),%r13 shlq $60,%r10 xorq (%rsi,%rcx,1),%r9 xorq %r10,%r8 shlq $48,%r13 bswapq %r8 xorq %r13,%r9 bswapq %r9 cmpq %r15,%r14 jb .Louter_loop movq %r8,8(%rdi) movq %r9,(%rdi) leaq 280+48(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq 0(%rsi),%rsp .cfi_def_cfa_register %rsp .Lghash_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size gcm_ghash_4bit,.-gcm_ghash_4bit .globl gcm_init_clmul .type gcm_init_clmul,@function .align 16 gcm_init_clmul: .cfi_startproc .L_init_clmul: movdqu (%rsi),%xmm2 pshufd $78,%xmm2,%xmm2 pshufd $255,%xmm2,%xmm4 movdqa %xmm2,%xmm3 psllq $1,%xmm2 pxor %xmm5,%xmm5 psrlq $63,%xmm3 pcmpgtd %xmm4,%xmm5 pslldq $8,%xmm3 por %xmm3,%xmm2 pand .L0x1c2_polynomial(%rip),%xmm5 pxor %xmm5,%xmm2 pshufd $78,%xmm2,%xmm6 movdqa %xmm2,%xmm0 pxor %xmm2,%xmm6 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 pshufd $78,%xmm2,%xmm3 pshufd $78,%xmm0,%xmm4 pxor %xmm2,%xmm3 movdqu %xmm2,0(%rdi) pxor %xmm0,%xmm4 movdqu %xmm0,16(%rdi) .byte 102,15,58,15,227,8 movdqu %xmm4,32(%rdi) movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 movdqa %xmm0,%xmm5 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,222,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 pshufd 
$78,%xmm5,%xmm3 pshufd $78,%xmm0,%xmm4 pxor %xmm5,%xmm3 movdqu %xmm5,48(%rdi) pxor %xmm0,%xmm4 movdqu %xmm0,64(%rdi) .byte 102,15,58,15,227,8 movdqu %xmm4,80(%rdi) .byte 0xf3,0xc3 .cfi_endproc .size gcm_init_clmul,.-gcm_init_clmul .globl gcm_gmult_clmul .type gcm_gmult_clmul,@function .align 16 gcm_gmult_clmul: .cfi_startproc .L_gmult_clmul: movdqu (%rdi),%xmm0 movdqa .Lbswap_mask(%rip),%xmm5 movdqu (%rsi),%xmm2 movdqu 32(%rsi),%xmm4 .byte 102,15,56,0,197 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,220,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 .byte 102,15,56,0,197 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 .cfi_endproc .size gcm_gmult_clmul,.-gcm_gmult_clmul .globl gcm_ghash_clmul .type gcm_ghash_clmul,@function .align 32 gcm_ghash_clmul: .cfi_startproc .L_ghash_clmul: movdqa .Lbswap_mask(%rip),%xmm10 movdqu (%rdi),%xmm0 movdqu (%rsi),%xmm2 movdqu 32(%rsi),%xmm7 .byte 102,65,15,56,0,194 subq $0x10,%rcx jz .Lodd_tail movdqu 16(%rsi),%xmm6 movl OPENSSL_ia32cap_P+4(%rip),%eax cmpq $0x30,%rcx jb .Lskip4x andl $71303168,%eax cmpl $4194304,%eax je .Lskip4x subq $0x30,%rcx movq $0xA040608020C0E000,%rax movdqu 48(%rsi),%xmm14 movdqu 64(%rsi),%xmm15 movdqu 48(%rdx),%xmm3 movdqu 32(%rdx),%xmm11 .byte 102,65,15,56,0,218 .byte 102,69,15,56,0,218 movdqa %xmm3,%xmm5 pshufd $78,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 .byte 102,15,58,68,231,0 movdqa %xmm11,%xmm13 pshufd $78,%xmm11,%xmm12 pxor %xmm11,%xmm12 .byte 102,68,15,58,68,222,0 .byte 102,68,15,58,68,238,17 .byte 102,68,15,58,68,231,16 xorps %xmm11,%xmm3 xorps %xmm13,%xmm5 movups 80(%rsi),%xmm7 xorps %xmm12,%xmm4 movdqu 16(%rdx),%xmm11 movdqu 0(%rdx),%xmm8 .byte 102,69,15,56,0,218 .byte 102,69,15,56,0,194 movdqa %xmm11,%xmm13 pshufd $78,%xmm11,%xmm12 pxor %xmm8,%xmm0 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,69,15,58,68,238,17 .byte 102,68,15,58,68,231,0 xorps %xmm11,%xmm3 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx subq $0x40,%rcx jc .Ltail4x jmp .Lmod4_loop .align 32 .Lmod4_loop: .byte 102,65,15,58,68,199,0 xorps %xmm12,%xmm4 movdqu 48(%rdx),%xmm11 .byte 102,69,15,56,0,218 .byte 102,65,15,58,68,207,17 xorps %xmm3,%xmm0 movdqu 32(%rdx),%xmm3 movdqa %xmm11,%xmm13 .byte 102,68,15,58,68,199,16 pshufd $78,%xmm11,%xmm12 xorps %xmm5,%xmm1 pxor %xmm11,%xmm12 .byte 102,65,15,56,0,218 movups 32(%rsi),%xmm7 xorps %xmm4,%xmm8 .byte 102,68,15,58,68,218,0 pshufd $78,%xmm3,%xmm4 pxor %xmm0,%xmm8 movdqa %xmm3,%xmm5 pxor %xmm1,%xmm8 pxor %xmm3,%xmm4 movdqa %xmm8,%xmm9 .byte 102,68,15,58,68,234,17 pslldq $8,%xmm8 psrldq $8,%xmm9 pxor %xmm8,%xmm0 movdqa .L7_mask(%rip),%xmm8 pxor %xmm9,%xmm1 .byte 102,76,15,110,200 pand %xmm0,%xmm8 .byte 102,69,15,56,0,200 pxor %xmm0,%xmm9 .byte 102,68,15,58,68,231,0 psllq $57,%xmm9 movdqa %xmm9,%xmm8 pslldq $8,%xmm9 .byte 102,15,58,68,222,0 psrldq $8,%xmm8 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 movdqu 0(%rdx),%xmm8 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 .byte 102,15,58,68,238,17 xorps %xmm11,%xmm3 movdqu 16(%rdx),%xmm11 .byte 102,69,15,56,0,218 .byte 
102,15,58,68,231,16 xorps %xmm13,%xmm5 movups 80(%rsi),%xmm7 .byte 102,69,15,56,0,194 pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 movdqa %xmm11,%xmm13 pxor %xmm12,%xmm4 pshufd $78,%xmm11,%xmm12 pxor %xmm9,%xmm0 pxor %xmm8,%xmm1 pxor %xmm11,%xmm12 .byte 102,69,15,58,68,222,0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 movdqa %xmm0,%xmm1 .byte 102,69,15,58,68,238,17 xorps %xmm11,%xmm3 pshufd $78,%xmm0,%xmm8 pxor %xmm0,%xmm8 .byte 102,68,15,58,68,231,0 xorps %xmm13,%xmm5 leaq 64(%rdx),%rdx subq $0x40,%rcx jnc .Lmod4_loop .Ltail4x: .byte 102,65,15,58,68,199,0 .byte 102,65,15,58,68,207,17 .byte 102,68,15,58,68,199,16 xorps %xmm12,%xmm4 xorps %xmm3,%xmm0 xorps %xmm5,%xmm1 pxor %xmm0,%xmm1 pxor %xmm4,%xmm8 pxor %xmm1,%xmm8 pxor %xmm0,%xmm1 movdqa %xmm8,%xmm9 psrldq $8,%xmm8 pslldq $8,%xmm9 pxor %xmm8,%xmm1 pxor %xmm9,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 addq $0x40,%rcx jz .Ldone movdqu 32(%rsi),%xmm7 subq $0x10,%rcx jz .Lodd_tail .Lskip4x: movdqu (%rdx),%xmm8 movdqu 16(%rdx),%xmm3 .byte 102,69,15,56,0,194 .byte 102,65,15,56,0,218 pxor %xmm8,%xmm0 movdqa %xmm3,%xmm5 pshufd $78,%xmm3,%xmm4 pxor %xmm3,%xmm4 .byte 102,15,58,68,218,0 .byte 102,15,58,68,234,17 .byte 102,15,58,68,231,0 leaq 32(%rdx),%rdx nop subq $0x20,%rcx jbe .Leven_tail nop jmp .Lmod_loop .align 32 .Lmod_loop: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 pshufd $78,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 .byte 102,15,58,68,206,17 .byte 102,15,58,68,231,16 pxor %xmm3,%xmm0 pxor %xmm5,%xmm1 movdqu (%rdx),%xmm9 pxor %xmm0,%xmm8 .byte 102,69,15,56,0,202 movdqu 16(%rdx),%xmm3 pxor %xmm1,%xmm8 pxor %xmm9,%xmm1 pxor %xmm8,%xmm4 .byte 102,65,15,56,0,218 movdqa %xmm4,%xmm8 psrldq $8,%xmm8 pslldq $8,%xmm4 pxor %xmm8,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm3,%xmm5 movdqa %xmm0,%xmm9 movdqa %xmm0,%xmm8 psllq $5,%xmm0 pxor %xmm0,%xmm8 .byte 102,15,58,68,218,0 psllq $1,%xmm0 pxor %xmm8,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm8 pslldq $8,%xmm0 psrldq $8,%xmm8 pxor %xmm9,%xmm0 pshufd $78,%xmm5,%xmm4 pxor %xmm8,%xmm1 pxor %xmm5,%xmm4 movdqa %xmm0,%xmm9 psrlq $1,%xmm0 .byte 102,15,58,68,234,17 pxor %xmm9,%xmm1 pxor %xmm0,%xmm9 psrlq $5,%xmm0 pxor %xmm9,%xmm0 leaq 32(%rdx),%rdx psrlq $1,%xmm0 .byte 102,15,58,68,231,0 pxor %xmm1,%xmm0 subq $0x20,%rcx ja .Lmod_loop .Leven_tail: movdqa %xmm0,%xmm1 movdqa %xmm4,%xmm8 pshufd $78,%xmm0,%xmm4 pxor %xmm0,%xmm4 .byte 102,15,58,68,198,0 .byte 102,15,58,68,206,17 .byte 102,15,58,68,231,16 pxor %xmm3,%xmm0 pxor %xmm5,%xmm1 pxor %xmm0,%xmm8 pxor %xmm1,%xmm8 pxor %xmm8,%xmm4 movdqa %xmm4,%xmm8 psrldq $8,%xmm8 pslldq $8,%xmm4 pxor %xmm8,%xmm1 pxor %xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 testq %rcx,%rcx jnz .Ldone .Lodd_tail: movdqu (%rdx),%xmm8 .byte 102,69,15,56,0,194 pxor %xmm8,%xmm0 movdqa %xmm0,%xmm1 pshufd $78,%xmm0,%xmm3 pxor %xmm0,%xmm3 .byte 102,15,58,68,194,0 .byte 102,15,58,68,202,17 .byte 102,15,58,68,223,0 pxor %xmm0,%xmm3 pxor %xmm1,%xmm3 movdqa %xmm3,%xmm4 psrldq $8,%xmm3 pslldq $8,%xmm4 pxor %xmm3,%xmm1 pxor 
%xmm4,%xmm0 movdqa %xmm0,%xmm4 movdqa %xmm0,%xmm3 psllq $5,%xmm0 pxor %xmm0,%xmm3 psllq $1,%xmm0 pxor %xmm3,%xmm0 psllq $57,%xmm0 movdqa %xmm0,%xmm3 pslldq $8,%xmm0 psrldq $8,%xmm3 pxor %xmm4,%xmm0 pxor %xmm3,%xmm1 movdqa %xmm0,%xmm4 psrlq $1,%xmm0 pxor %xmm4,%xmm1 pxor %xmm0,%xmm4 psrlq $5,%xmm0 pxor %xmm4,%xmm0 psrlq $1,%xmm0 pxor %xmm1,%xmm0 .Ldone: .byte 102,65,15,56,0,194 movdqu %xmm0,(%rdi) .byte 0xf3,0xc3 .cfi_endproc .size gcm_ghash_clmul,.-gcm_ghash_clmul .globl gcm_init_avx .type gcm_init_avx,@function .align 32 gcm_init_avx: .cfi_startproc - jmp .L_init_clmul + vzeroupper + + vmovdqu (%rsi),%xmm2 + vpshufd $78,%xmm2,%xmm2 + + + vpshufd $255,%xmm2,%xmm4 + vpsrlq $63,%xmm2,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpxor %xmm5,%xmm5,%xmm5 + vpcmpgtd %xmm4,%xmm5,%xmm5 + vpslldq $8,%xmm3,%xmm3 + vpor %xmm3,%xmm2,%xmm2 + + + vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 + vpxor %xmm5,%xmm2,%xmm2 + + vpunpckhqdq %xmm2,%xmm2,%xmm6 + vmovdqa %xmm2,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + movq $4,%r10 + jmp .Linit_start_avx +.align 32 +.Linit_loop_avx: + vpalignr $8,%xmm3,%xmm4,%xmm5 + vmovdqu %xmm5,-16(%rdi) + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 +.Linit_start_avx: + vmovdqa %xmm0,%xmm5 + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + vpshufd $78,%xmm5,%xmm3 + vpshufd $78,%xmm0,%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqu %xmm5,0(%rdi) + vpxor %xmm0,%xmm4,%xmm4 + vmovdqu %xmm0,16(%rdi) + leaq 48(%rdi),%rdi + subq $1,%r10 + jnz .Linit_loop_avx + + vpalignr $8,%xmm4,%xmm3,%xmm5 + vmovdqu %xmm5,-16(%rdi) + + vzeroupper + .byte 0xf3,0xc3 .cfi_endproc .size gcm_init_avx,.-gcm_init_avx .globl gcm_gmult_avx .type gcm_gmult_avx,@function .align 32 gcm_gmult_avx: .cfi_startproc jmp .L_gmult_clmul .cfi_endproc .size gcm_gmult_avx,.-gcm_gmult_avx .globl gcm_ghash_avx .type gcm_ghash_avx,@function .align 32 gcm_ghash_avx: .cfi_startproc - jmp .L_ghash_clmul + vzeroupper + + vmovdqu (%rdi),%xmm10 + leaq .L0x1c2_polynomial(%rip),%r10 + leaq 64(%rsi),%rsi + vmovdqu .Lbswap_mask(%rip),%xmm13 + vpshufb %xmm13,%xmm10,%xmm10 + cmpq $0x80,%rcx + jb .Lshort_avx + subq $0x80,%rcx + + vmovdqu 112(%rdx),%xmm14 + vmovdqu 0-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vmovdqu 
32-64(%rsi),%xmm7 + + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm14,%xmm9,%xmm9 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 80(%rdx),%xmm14 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 48-64(%rsi),%xmm6 + vpxor %xmm14,%xmm9,%xmm9 + vmovdqu 64(%rdx),%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 48(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 32(%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 16(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu (%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + + leaq 128(%rdx),%rdx + cmpq $0x80,%rcx + jb .Ltail_avx + + vpxor %xmm10,%xmm15,%xmm15 + subq $0x80,%rcx + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 112(%rdx),%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpxor %xmm15,%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 + vmovdqu 0-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 + vmovdqu 32-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm3,%xmm10,%xmm10 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vxorps %xmm4,%xmm11,%xmm11 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm5,%xmm12,%xmm12 + vxorps %xmm15,%xmm8,%xmm8 + + vmovdqu 80(%rdx),%xmm14 + vpxor %xmm10,%xmm12,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm11,%xmm12,%xmm12 + vpslldq $8,%xmm12,%xmm9 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vpsrldq $8,%xmm12,%xmm12 + vpxor %xmm9,%xmm10,%xmm10 + vmovdqu 48-64(%rsi),%xmm6 + 
vpshufb %xmm13,%xmm14,%xmm14 + vxorps %xmm12,%xmm11,%xmm11 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 64(%rdx),%xmm15 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vxorps %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + + vmovdqu 48(%rdx),%xmm14 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 32(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + vxorps %xmm12,%xmm10,%xmm10 + + vmovdqu 16(%rdx),%xmm14 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vxorps %xmm11,%xmm12,%xmm12 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu (%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm12,%xmm15,%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + vpxor %xmm10,%xmm15,%xmm15 + + leaq 128(%rdx),%rdx + subq $0x80,%rcx + jnc .Loop8x_avx + + addq $0x80,%rcx + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -16(%rdx,%rcx,1),%xmm14 + leaq (%rdx,%rcx,1),%rdx + vmovdqu 0-64(%rsi),%xmm6 + vmovdqu 32-64(%rsi),%xmm7 + vpshufb %xmm13,%xmm14,%xmm15 + + vmovdqa %xmm0,%xmm3 + vmovdqa %xmm1,%xmm4 + vmovdqa %xmm2,%xmm5 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -32(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -48(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 80-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -64(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + 
vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -80(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 96-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 128-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -96(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -112(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 144-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovq 184-64(%rsi),%xmm7 + subq $0x10,%rcx + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor %xmm10,%xmm15,%xmm15 +.Ltail_no_xor_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + + vmovdqu (%r10),%xmm12 + + vpxor %xmm0,%xmm3,%xmm10 + vpxor %xmm1,%xmm4,%xmm11 + vpxor %xmm2,%xmm5,%xmm5 + + vpxor %xmm10,%xmm5,%xmm5 + vpxor %xmm11,%xmm5,%xmm5 + vpslldq $8,%xmm5,%xmm9 + vpsrldq $8,%xmm5,%xmm5 + vpxor %xmm9,%xmm10,%xmm10 + vpxor %xmm5,%xmm11,%xmm11 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm11,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + cmpq $0,%rcx + jne .Lshort_avx + + vpshufb %xmm13,%xmm10,%xmm10 + vmovdqu %xmm10,(%rdi) + vzeroupper + .byte 0xf3,0xc3 .cfi_endproc .size gcm_ghash_avx,.-gcm_ghash_avx .align 64 .Lbswap_mask: .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .L0x1c2_polynomial: .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 .L7_mask: .long 7,0,7,0 .L7_mask_poly: .long 7,0,450,0 .align 64 .type .Lrem_4bit,@object .Lrem_4bit: .long 0,0,0,471859200,0,943718400,0,610271232 .long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 .long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 .long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 .type .Lrem_8bit,@object .Lrem_8bit: .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE .value 
0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE .byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 Index: stable/12/secure/lib/libcrypto/amd64/poly1305-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/poly1305-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/poly1305-x86_64.S (revision 364963) @@ -1,285 +1,2070 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from poly1305-x86_64.pl. 
*/ .text .globl poly1305_init .hidden poly1305_init .globl poly1305_blocks .hidden poly1305_blocks .globl poly1305_emit .hidden poly1305_emit .type poly1305_init,@function .align 32 poly1305_init: .cfi_startproc xorq %rax,%rax movq %rax,0(%rdi) movq %rax,8(%rdi) movq %rax,16(%rdi) cmpq $0,%rsi je .Lno_key leaq poly1305_blocks(%rip),%r10 leaq poly1305_emit(%rip),%r11 + movq OPENSSL_ia32cap_P+4(%rip),%r9 + leaq poly1305_blocks_avx(%rip),%rax + leaq poly1305_emit_avx(%rip),%rcx + btq $28,%r9 + cmovcq %rax,%r10 + cmovcq %rcx,%r11 + leaq poly1305_blocks_avx2(%rip),%rax + btq $37,%r9 + cmovcq %rax,%r10 movq $0x0ffffffc0fffffff,%rax movq $0x0ffffffc0ffffffc,%rcx andq 0(%rsi),%rax andq 8(%rsi),%rcx movq %rax,24(%rdi) movq %rcx,32(%rdi) movq %r10,0(%rdx) movq %r11,8(%rdx) movl $1,%eax .Lno_key: .byte 0xf3,0xc3 .cfi_endproc .size poly1305_init,.-poly1305_init .type poly1305_blocks,@function .align 32 poly1305_blocks: .cfi_startproc .Lblocks: shrq $4,%rdx jz .Lno_data pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 .Lblocks_body: movq %rdx,%r15 movq 24(%rdi),%r11 movq 32(%rdi),%r13 movq 0(%rdi),%r14 movq 8(%rdi),%rbx movq 16(%rdi),%rbp movq %r13,%r12 shrq $2,%r13 movq %r12,%rax addq %r12,%r13 jmp .Loop .align 32 .Loop: addq 0(%rsi),%r14 adcq 8(%rsi),%rbx leaq 16(%rsi),%rsi adcq %rcx,%rbp mulq %r14 movq %rax,%r9 movq %r11,%rax movq %rdx,%r10 mulq %r14 movq %rax,%r14 movq %r11,%rax movq %rdx,%r8 mulq %rbx addq %rax,%r9 movq %r13,%rax adcq %rdx,%r10 mulq %rbx movq %rbp,%rbx addq %rax,%r14 adcq %rdx,%r8 imulq %r13,%rbx addq %rbx,%r9 movq %r8,%rbx adcq $0,%r10 imulq %r11,%rbp addq %r9,%rbx movq $-4,%rax adcq %rbp,%r10 andq %r10,%rax movq %r10,%rbp shrq $2,%r10 andq $3,%rbp addq %r10,%rax addq %rax,%r14 adcq $0,%rbx adcq $0,%rbp movq %r12,%rax decq %r15 jnz .Loop movq %r14,0(%rdi) movq %rbx,8(%rdi) movq %rbp,16(%rdi) movq 0(%rsp),%r15 .cfi_restore %r15 movq 8(%rsp),%r14 .cfi_restore %r14 movq 16(%rsp),%r13 .cfi_restore %r13 movq 24(%rsp),%r12 .cfi_restore %r12 movq 32(%rsp),%rbp .cfi_restore %rbp movq 40(%rsp),%rbx .cfi_restore %rbx leaq 48(%rsp),%rsp .cfi_adjust_cfa_offset -48 .Lno_data: .Lblocks_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size poly1305_blocks,.-poly1305_blocks .type poly1305_emit,@function .align 32 poly1305_emit: .cfi_startproc .Lemit: movq 0(%rdi),%r8 movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq %r8,%rax addq $5,%r8 movq %r9,%rcx adcq $0,%r9 adcq $0,%r10 shrq $2,%r10 cmovnzq %r8,%rax cmovnzq %r9,%rcx addq 0(%rdx),%rax adcq 8(%rdx),%rcx movq %rax,0(%rsi) movq %rcx,8(%rsi) .byte 0xf3,0xc3 .cfi_endproc .size poly1305_emit,.-poly1305_emit +.type __poly1305_block,@function +.align 32 +__poly1305_block: +.cfi_startproc + mulq %r14 + movq %rax,%r9 + movq %r11,%rax + movq %rdx,%r10 + + mulq %r14 + movq %rax,%r14 + movq %r11,%rax + movq %rdx,%r8 + + mulq %rbx + addq %rax,%r9 + movq %r13,%rax + adcq %rdx,%r10 + + mulq %rbx + movq %rbp,%rbx + addq %rax,%r14 + adcq %rdx,%r8 + + imulq %r13,%rbx + addq %rbx,%r9 + movq %r8,%rbx + adcq $0,%r10 + + imulq %r11,%rbp + addq %r9,%rbx + movq $-4,%rax + adcq %rbp,%r10 + + andq %r10,%rax + movq %r10,%rbp + shrq $2,%r10 + andq $3,%rbp + addq %r10,%rax + addq %rax,%r14 + adcq $0,%rbx + adcq $0,%rbp + .byte 0xf3,0xc3 +.cfi_endproc +.size __poly1305_block,.-__poly1305_block + +.type 
__poly1305_init_avx,@function +.align 32 +__poly1305_init_avx: +.cfi_startproc + movq %r11,%r14 + movq %r12,%rbx + xorq %rbp,%rbp + + leaq 48+64(%rdi),%rdi + + movq %r12,%rax + call __poly1305_block + + movl $0x3ffffff,%eax + movl $0x3ffffff,%edx + movq %r14,%r8 + andl %r14d,%eax + movq %r11,%r9 + andl %r11d,%edx + movl %eax,-64(%rdi) + shrq $26,%r8 + movl %edx,-60(%rdi) + shrq $26,%r9 + + movl $0x3ffffff,%eax + movl $0x3ffffff,%edx + andl %r8d,%eax + andl %r9d,%edx + movl %eax,-48(%rdi) + leal (%rax,%rax,4),%eax + movl %edx,-44(%rdi) + leal (%rdx,%rdx,4),%edx + movl %eax,-32(%rdi) + shrq $26,%r8 + movl %edx,-28(%rdi) + shrq $26,%r9 + + movq %rbx,%rax + movq %r12,%rdx + shlq $12,%rax + shlq $12,%rdx + orq %r8,%rax + orq %r9,%rdx + andl $0x3ffffff,%eax + andl $0x3ffffff,%edx + movl %eax,-16(%rdi) + leal (%rax,%rax,4),%eax + movl %edx,-12(%rdi) + leal (%rdx,%rdx,4),%edx + movl %eax,0(%rdi) + movq %rbx,%r8 + movl %edx,4(%rdi) + movq %r12,%r9 + + movl $0x3ffffff,%eax + movl $0x3ffffff,%edx + shrq $14,%r8 + shrq $14,%r9 + andl %r8d,%eax + andl %r9d,%edx + movl %eax,16(%rdi) + leal (%rax,%rax,4),%eax + movl %edx,20(%rdi) + leal (%rdx,%rdx,4),%edx + movl %eax,32(%rdi) + shrq $26,%r8 + movl %edx,36(%rdi) + shrq $26,%r9 + + movq %rbp,%rax + shlq $24,%rax + orq %rax,%r8 + movl %r8d,48(%rdi) + leaq (%r8,%r8,4),%r8 + movl %r9d,52(%rdi) + leaq (%r9,%r9,4),%r9 + movl %r8d,64(%rdi) + movl %r9d,68(%rdi) + + movq %r12,%rax + call __poly1305_block + + movl $0x3ffffff,%eax + movq %r14,%r8 + andl %r14d,%eax + shrq $26,%r8 + movl %eax,-52(%rdi) + + movl $0x3ffffff,%edx + andl %r8d,%edx + movl %edx,-36(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,-20(%rdi) + + movq %rbx,%rax + shlq $12,%rax + orq %r8,%rax + andl $0x3ffffff,%eax + movl %eax,-4(%rdi) + leal (%rax,%rax,4),%eax + movq %rbx,%r8 + movl %eax,12(%rdi) + + movl $0x3ffffff,%edx + shrq $14,%r8 + andl %r8d,%edx + movl %edx,28(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,44(%rdi) + + movq %rbp,%rax + shlq $24,%rax + orq %rax,%r8 + movl %r8d,60(%rdi) + leaq (%r8,%r8,4),%r8 + movl %r8d,76(%rdi) + + movq %r12,%rax + call __poly1305_block + + movl $0x3ffffff,%eax + movq %r14,%r8 + andl %r14d,%eax + shrq $26,%r8 + movl %eax,-56(%rdi) + + movl $0x3ffffff,%edx + andl %r8d,%edx + movl %edx,-40(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,-24(%rdi) + + movq %rbx,%rax + shlq $12,%rax + orq %r8,%rax + andl $0x3ffffff,%eax + movl %eax,-8(%rdi) + leal (%rax,%rax,4),%eax + movq %rbx,%r8 + movl %eax,8(%rdi) + + movl $0x3ffffff,%edx + shrq $14,%r8 + andl %r8d,%edx + movl %edx,24(%rdi) + leal (%rdx,%rdx,4),%edx + shrq $26,%r8 + movl %edx,40(%rdi) + + movq %rbp,%rax + shlq $24,%rax + orq %rax,%r8 + movl %r8d,56(%rdi) + leaq (%r8,%r8,4),%r8 + movl %r8d,72(%rdi) + + leaq -48-64(%rdi),%rdi + .byte 0xf3,0xc3 +.cfi_endproc +.size __poly1305_init_avx,.-__poly1305_init_avx + +.type poly1305_blocks_avx,@function +.align 32 +poly1305_blocks_avx: +.cfi_startproc + movl 20(%rdi),%r8d + cmpq $128,%rdx + jae .Lblocks_avx + testl %r8d,%r8d + jz .Lblocks + +.Lblocks_avx: + andq $-16,%rdx + jz .Lno_data_avx + + vzeroupper + + testl %r8d,%r8d + jz .Lbase2_64_avx + + testq $31,%rdx + jz .Leven_avx + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset 
%r15,-56 +.Lblocks_avx_body: + + movq %rdx,%r15 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movl 16(%rdi),%ebp + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + + movl %r8d,%r14d + andq $-2147483648,%r8 + movq %r9,%r12 + movl %r9d,%ebx + andq $-2147483648,%r9 + + shrq $6,%r8 + shlq $52,%r12 + addq %r8,%r14 + shrq $12,%rbx + shrq $18,%r9 + addq %r12,%r14 + adcq %r9,%rbx + + movq %rbp,%r8 + shlq $40,%r8 + shrq $24,%rbp + addq %r8,%rbx + adcq $0,%rbp + + movq $-4,%r9 + movq %rbp,%r8 + andq %rbp,%r9 + shrq $2,%r8 + andq $3,%rbp + addq %r9,%r8 + addq %r8,%r14 + adcq $0,%rbx + adcq $0,%rbp + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%rbp + + call __poly1305_block + + testq %rcx,%rcx + jz .Lstore_base2_64_avx + + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r11 + movq %rbx,%r12 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r11 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r11,%r14 + shlq $24,%rbp + andq $0x3ffffff,%r14 + shrq $40,%r12 + andq $0x3ffffff,%rbx + orq %r12,%rbp + + subq $16,%r15 + jz .Lstore_base2_26_avx + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %ebp,%xmm4 + jmp .Lproceed_avx + +.align 32 +.Lstore_base2_64_avx: + movq %r14,0(%rdi) + movq %rbx,8(%rdi) + movq %rbp,16(%rdi) + jmp .Ldone_avx + +.align 16 +.Lstore_base2_26_avx: + movl %eax,0(%rdi) + movl %edx,4(%rdi) + movl %r14d,8(%rdi) + movl %ebx,12(%rdi) + movl %ebp,16(%rdi) +.align 16 +.Ldone_avx: + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data_avx: +.Lblocks_avx_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc + +.align 32 +.Lbase2_64_avx: +.cfi_startproc + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lbase2_64_avx_body: + + movq %rdx,%r15 + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + movq 0(%rdi),%r14 + movq 8(%rdi),%rbx + movl 16(%rdi),%ebp + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + testq $31,%rdx + jz .Linit_avx + + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%rbp + subq $16,%r15 + + call __poly1305_block + +.Linit_avx: + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r8 + movq %rbx,%r9 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r8 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r8,%r14 + shlq $24,%rbp + andq $0x3ffffff,%r14 + shrq $40,%r9 + andq $0x3ffffff,%rbx + orq %r9,%rbp + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %ebp,%xmm4 + movl $1,20(%rdi) + + call __poly1305_init_avx + +.Lproceed_avx: + movq %r15,%rdx + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rax + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lbase2_64_avx_epilogue: + jmp .Ldo_avx +.cfi_endproc + +.align 32 +.Leven_avx: 
+.cfi_startproc + vmovd 0(%rdi),%xmm0 + vmovd 4(%rdi),%xmm1 + vmovd 8(%rdi),%xmm2 + vmovd 12(%rdi),%xmm3 + vmovd 16(%rdi),%xmm4 + +.Ldo_avx: + leaq -88(%rsp),%r11 +.cfi_def_cfa %r11,0x60 + subq $0x178,%rsp + subq $64,%rdx + leaq -32(%rsi),%rax + cmovcq %rax,%rsi + + vmovdqu 48(%rdi),%xmm14 + leaq 112(%rdi),%rdi + leaq .Lconst(%rip),%rcx + + + + vmovdqu 32(%rsi),%xmm5 + vmovdqu 48(%rsi),%xmm6 + vmovdqa 64(%rcx),%xmm15 + + vpsrldq $6,%xmm5,%xmm7 + vpsrldq $6,%xmm6,%xmm8 + vpunpckhqdq %xmm6,%xmm5,%xmm9 + vpunpcklqdq %xmm6,%xmm5,%xmm5 + vpunpcklqdq %xmm8,%xmm7,%xmm8 + + vpsrlq $40,%xmm9,%xmm9 + vpsrlq $26,%xmm5,%xmm6 + vpand %xmm15,%xmm5,%xmm5 + vpsrlq $4,%xmm8,%xmm7 + vpand %xmm15,%xmm6,%xmm6 + vpsrlq $30,%xmm8,%xmm8 + vpand %xmm15,%xmm7,%xmm7 + vpand %xmm15,%xmm8,%xmm8 + vpor 32(%rcx),%xmm9,%xmm9 + + jbe .Lskip_loop_avx + + + vmovdqu -48(%rdi),%xmm11 + vmovdqu -32(%rdi),%xmm12 + vpshufd $0xEE,%xmm14,%xmm13 + vpshufd $0x44,%xmm14,%xmm10 + vmovdqa %xmm13,-144(%r11) + vmovdqa %xmm10,0(%rsp) + vpshufd $0xEE,%xmm11,%xmm14 + vmovdqu -16(%rdi),%xmm10 + vpshufd $0x44,%xmm11,%xmm11 + vmovdqa %xmm14,-128(%r11) + vmovdqa %xmm11,16(%rsp) + vpshufd $0xEE,%xmm12,%xmm13 + vmovdqu 0(%rdi),%xmm11 + vpshufd $0x44,%xmm12,%xmm12 + vmovdqa %xmm13,-112(%r11) + vmovdqa %xmm12,32(%rsp) + vpshufd $0xEE,%xmm10,%xmm14 + vmovdqu 16(%rdi),%xmm12 + vpshufd $0x44,%xmm10,%xmm10 + vmovdqa %xmm14,-96(%r11) + vmovdqa %xmm10,48(%rsp) + vpshufd $0xEE,%xmm11,%xmm13 + vmovdqu 32(%rdi),%xmm10 + vpshufd $0x44,%xmm11,%xmm11 + vmovdqa %xmm13,-80(%r11) + vmovdqa %xmm11,64(%rsp) + vpshufd $0xEE,%xmm12,%xmm14 + vmovdqu 48(%rdi),%xmm11 + vpshufd $0x44,%xmm12,%xmm12 + vmovdqa %xmm14,-64(%r11) + vmovdqa %xmm12,80(%rsp) + vpshufd $0xEE,%xmm10,%xmm13 + vmovdqu 64(%rdi),%xmm12 + vpshufd $0x44,%xmm10,%xmm10 + vmovdqa %xmm13,-48(%r11) + vmovdqa %xmm10,96(%rsp) + vpshufd $0xEE,%xmm11,%xmm14 + vpshufd $0x44,%xmm11,%xmm11 + vmovdqa %xmm14,-32(%r11) + vmovdqa %xmm11,112(%rsp) + vpshufd $0xEE,%xmm12,%xmm13 + vmovdqa 0(%rsp),%xmm14 + vpshufd $0x44,%xmm12,%xmm12 + vmovdqa %xmm13,-16(%r11) + vmovdqa %xmm12,128(%rsp) + + jmp .Loop_avx + +.align 32 +.Loop_avx: + + + + + + + + + + + + + + + + + + + + + vpmuludq %xmm5,%xmm14,%xmm10 + vpmuludq %xmm6,%xmm14,%xmm11 + vmovdqa %xmm2,32(%r11) + vpmuludq %xmm7,%xmm14,%xmm12 + vmovdqa 16(%rsp),%xmm2 + vpmuludq %xmm8,%xmm14,%xmm13 + vpmuludq %xmm9,%xmm14,%xmm14 + + vmovdqa %xmm0,0(%r11) + vpmuludq 32(%rsp),%xmm9,%xmm0 + vmovdqa %xmm1,16(%r11) + vpmuludq %xmm8,%xmm2,%xmm1 + vpaddq %xmm0,%xmm10,%xmm10 + vpaddq %xmm1,%xmm14,%xmm14 + vmovdqa %xmm3,48(%r11) + vpmuludq %xmm7,%xmm2,%xmm0 + vpmuludq %xmm6,%xmm2,%xmm1 + vpaddq %xmm0,%xmm13,%xmm13 + vmovdqa 48(%rsp),%xmm3 + vpaddq %xmm1,%xmm12,%xmm12 + vmovdqa %xmm4,64(%r11) + vpmuludq %xmm5,%xmm2,%xmm2 + vpmuludq %xmm7,%xmm3,%xmm0 + vpaddq %xmm2,%xmm11,%xmm11 + + vmovdqa 64(%rsp),%xmm4 + vpaddq %xmm0,%xmm14,%xmm14 + vpmuludq %xmm6,%xmm3,%xmm1 + vpmuludq %xmm5,%xmm3,%xmm3 + vpaddq %xmm1,%xmm13,%xmm13 + vmovdqa 80(%rsp),%xmm2 + vpaddq %xmm3,%xmm12,%xmm12 + vpmuludq %xmm9,%xmm4,%xmm0 + vpmuludq %xmm8,%xmm4,%xmm4 + vpaddq %xmm0,%xmm11,%xmm11 + vmovdqa 96(%rsp),%xmm3 + vpaddq %xmm4,%xmm10,%xmm10 + + vmovdqa 128(%rsp),%xmm4 + vpmuludq %xmm6,%xmm2,%xmm1 + vpmuludq %xmm5,%xmm2,%xmm2 + vpaddq %xmm1,%xmm14,%xmm14 + vpaddq %xmm2,%xmm13,%xmm13 + vpmuludq %xmm9,%xmm3,%xmm0 + vpmuludq %xmm8,%xmm3,%xmm1 + vpaddq %xmm0,%xmm12,%xmm12 + vmovdqu 0(%rsi),%xmm0 + vpaddq %xmm1,%xmm11,%xmm11 + vpmuludq %xmm7,%xmm3,%xmm3 + vpmuludq %xmm7,%xmm4,%xmm7 + vpaddq %xmm3,%xmm10,%xmm10 + + vmovdqu 
16(%rsi),%xmm1 + vpaddq %xmm7,%xmm11,%xmm11 + vpmuludq %xmm8,%xmm4,%xmm8 + vpmuludq %xmm9,%xmm4,%xmm9 + vpsrldq $6,%xmm0,%xmm2 + vpaddq %xmm8,%xmm12,%xmm12 + vpaddq %xmm9,%xmm13,%xmm13 + vpsrldq $6,%xmm1,%xmm3 + vpmuludq 112(%rsp),%xmm5,%xmm9 + vpmuludq %xmm6,%xmm4,%xmm5 + vpunpckhqdq %xmm1,%xmm0,%xmm4 + vpaddq %xmm9,%xmm14,%xmm14 + vmovdqa -144(%r11),%xmm9 + vpaddq %xmm5,%xmm10,%xmm10 + + vpunpcklqdq %xmm1,%xmm0,%xmm0 + vpunpcklqdq %xmm3,%xmm2,%xmm3 + + + vpsrldq $5,%xmm4,%xmm4 + vpsrlq $26,%xmm0,%xmm1 + vpand %xmm15,%xmm0,%xmm0 + vpsrlq $4,%xmm3,%xmm2 + vpand %xmm15,%xmm1,%xmm1 + vpand 0(%rcx),%xmm4,%xmm4 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm15,%xmm2,%xmm2 + vpand %xmm15,%xmm3,%xmm3 + vpor 32(%rcx),%xmm4,%xmm4 + + vpaddq 0(%r11),%xmm0,%xmm0 + vpaddq 16(%r11),%xmm1,%xmm1 + vpaddq 32(%r11),%xmm2,%xmm2 + vpaddq 48(%r11),%xmm3,%xmm3 + vpaddq 64(%r11),%xmm4,%xmm4 + + leaq 32(%rsi),%rax + leaq 64(%rsi),%rsi + subq $64,%rdx + cmovcq %rax,%rsi + + + + + + + + + + + vpmuludq %xmm0,%xmm9,%xmm5 + vpmuludq %xmm1,%xmm9,%xmm6 + vpaddq %xmm5,%xmm10,%xmm10 + vpaddq %xmm6,%xmm11,%xmm11 + vmovdqa -128(%r11),%xmm7 + vpmuludq %xmm2,%xmm9,%xmm5 + vpmuludq %xmm3,%xmm9,%xmm6 + vpaddq %xmm5,%xmm12,%xmm12 + vpaddq %xmm6,%xmm13,%xmm13 + vpmuludq %xmm4,%xmm9,%xmm9 + vpmuludq -112(%r11),%xmm4,%xmm5 + vpaddq %xmm9,%xmm14,%xmm14 + + vpaddq %xmm5,%xmm10,%xmm10 + vpmuludq %xmm2,%xmm7,%xmm6 + vpmuludq %xmm3,%xmm7,%xmm5 + vpaddq %xmm6,%xmm13,%xmm13 + vmovdqa -96(%r11),%xmm8 + vpaddq %xmm5,%xmm14,%xmm14 + vpmuludq %xmm1,%xmm7,%xmm6 + vpmuludq %xmm0,%xmm7,%xmm7 + vpaddq %xmm6,%xmm12,%xmm12 + vpaddq %xmm7,%xmm11,%xmm11 + + vmovdqa -80(%r11),%xmm9 + vpmuludq %xmm2,%xmm8,%xmm5 + vpmuludq %xmm1,%xmm8,%xmm6 + vpaddq %xmm5,%xmm14,%xmm14 + vpaddq %xmm6,%xmm13,%xmm13 + vmovdqa -64(%r11),%xmm7 + vpmuludq %xmm0,%xmm8,%xmm8 + vpmuludq %xmm4,%xmm9,%xmm5 + vpaddq %xmm8,%xmm12,%xmm12 + vpaddq %xmm5,%xmm11,%xmm11 + vmovdqa -48(%r11),%xmm8 + vpmuludq %xmm3,%xmm9,%xmm9 + vpmuludq %xmm1,%xmm7,%xmm6 + vpaddq %xmm9,%xmm10,%xmm10 + + vmovdqa -16(%r11),%xmm9 + vpaddq %xmm6,%xmm14,%xmm14 + vpmuludq %xmm0,%xmm7,%xmm7 + vpmuludq %xmm4,%xmm8,%xmm5 + vpaddq %xmm7,%xmm13,%xmm13 + vpaddq %xmm5,%xmm12,%xmm12 + vmovdqu 32(%rsi),%xmm5 + vpmuludq %xmm3,%xmm8,%xmm7 + vpmuludq %xmm2,%xmm8,%xmm8 + vpaddq %xmm7,%xmm11,%xmm11 + vmovdqu 48(%rsi),%xmm6 + vpaddq %xmm8,%xmm10,%xmm10 + + vpmuludq %xmm2,%xmm9,%xmm2 + vpmuludq %xmm3,%xmm9,%xmm3 + vpsrldq $6,%xmm5,%xmm7 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq %xmm4,%xmm9,%xmm4 + vpsrldq $6,%xmm6,%xmm8 + vpaddq %xmm3,%xmm12,%xmm2 + vpaddq %xmm4,%xmm13,%xmm3 + vpmuludq -32(%r11),%xmm0,%xmm4 + vpmuludq %xmm1,%xmm9,%xmm0 + vpunpckhqdq %xmm6,%xmm5,%xmm9 + vpaddq %xmm4,%xmm14,%xmm4 + vpaddq %xmm0,%xmm10,%xmm0 + + vpunpcklqdq %xmm6,%xmm5,%xmm5 + vpunpcklqdq %xmm8,%xmm7,%xmm8 + + + vpsrldq $5,%xmm9,%xmm9 + vpsrlq $26,%xmm5,%xmm6 + vmovdqa 0(%rsp),%xmm14 + vpand %xmm15,%xmm5,%xmm5 + vpsrlq $4,%xmm8,%xmm7 + vpand %xmm15,%xmm6,%xmm6 + vpand 0(%rcx),%xmm9,%xmm9 + vpsrlq $30,%xmm8,%xmm8 + vpand %xmm15,%xmm7,%xmm7 + vpand %xmm15,%xmm8,%xmm8 + vpor 32(%rcx),%xmm9,%xmm9 + + + + + + vpsrlq $26,%xmm3,%xmm13 + vpand %xmm15,%xmm3,%xmm3 + vpaddq %xmm13,%xmm4,%xmm4 + + vpsrlq $26,%xmm0,%xmm10 + vpand %xmm15,%xmm0,%xmm0 + vpaddq %xmm10,%xmm11,%xmm1 + + vpsrlq $26,%xmm4,%xmm10 + vpand %xmm15,%xmm4,%xmm4 + + vpsrlq $26,%xmm1,%xmm11 + vpand %xmm15,%xmm1,%xmm1 + vpaddq %xmm11,%xmm2,%xmm2 + + vpaddq %xmm10,%xmm0,%xmm0 + vpsllq $2,%xmm10,%xmm10 + vpaddq %xmm10,%xmm0,%xmm0 + + vpsrlq $26,%xmm2,%xmm12 + vpand %xmm15,%xmm2,%xmm2 + vpaddq 
%xmm12,%xmm3,%xmm3 + + vpsrlq $26,%xmm0,%xmm10 + vpand %xmm15,%xmm0,%xmm0 + vpaddq %xmm10,%xmm1,%xmm1 + + vpsrlq $26,%xmm3,%xmm13 + vpand %xmm15,%xmm3,%xmm3 + vpaddq %xmm13,%xmm4,%xmm4 + + ja .Loop_avx + +.Lskip_loop_avx: + + + + vpshufd $0x10,%xmm14,%xmm14 + addq $32,%rdx + jnz .Long_tail_avx + + vpaddq %xmm2,%xmm7,%xmm7 + vpaddq %xmm0,%xmm5,%xmm5 + vpaddq %xmm1,%xmm6,%xmm6 + vpaddq %xmm3,%xmm8,%xmm8 + vpaddq %xmm4,%xmm9,%xmm9 + +.Long_tail_avx: + vmovdqa %xmm2,32(%r11) + vmovdqa %xmm0,0(%r11) + vmovdqa %xmm1,16(%r11) + vmovdqa %xmm3,48(%r11) + vmovdqa %xmm4,64(%r11) + + + + + + + + vpmuludq %xmm7,%xmm14,%xmm12 + vpmuludq %xmm5,%xmm14,%xmm10 + vpshufd $0x10,-48(%rdi),%xmm2 + vpmuludq %xmm6,%xmm14,%xmm11 + vpmuludq %xmm8,%xmm14,%xmm13 + vpmuludq %xmm9,%xmm14,%xmm14 + + vpmuludq %xmm8,%xmm2,%xmm0 + vpaddq %xmm0,%xmm14,%xmm14 + vpshufd $0x10,-32(%rdi),%xmm3 + vpmuludq %xmm7,%xmm2,%xmm1 + vpaddq %xmm1,%xmm13,%xmm13 + vpshufd $0x10,-16(%rdi),%xmm4 + vpmuludq %xmm6,%xmm2,%xmm0 + vpaddq %xmm0,%xmm12,%xmm12 + vpmuludq %xmm5,%xmm2,%xmm2 + vpaddq %xmm2,%xmm11,%xmm11 + vpmuludq %xmm9,%xmm3,%xmm3 + vpaddq %xmm3,%xmm10,%xmm10 + + vpshufd $0x10,0(%rdi),%xmm2 + vpmuludq %xmm7,%xmm4,%xmm1 + vpaddq %xmm1,%xmm14,%xmm14 + vpmuludq %xmm6,%xmm4,%xmm0 + vpaddq %xmm0,%xmm13,%xmm13 + vpshufd $0x10,16(%rdi),%xmm3 + vpmuludq %xmm5,%xmm4,%xmm4 + vpaddq %xmm4,%xmm12,%xmm12 + vpmuludq %xmm9,%xmm2,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vpshufd $0x10,32(%rdi),%xmm4 + vpmuludq %xmm8,%xmm2,%xmm2 + vpaddq %xmm2,%xmm10,%xmm10 + + vpmuludq %xmm6,%xmm3,%xmm0 + vpaddq %xmm0,%xmm14,%xmm14 + vpmuludq %xmm5,%xmm3,%xmm3 + vpaddq %xmm3,%xmm13,%xmm13 + vpshufd $0x10,48(%rdi),%xmm2 + vpmuludq %xmm9,%xmm4,%xmm1 + vpaddq %xmm1,%xmm12,%xmm12 + vpshufd $0x10,64(%rdi),%xmm3 + vpmuludq %xmm8,%xmm4,%xmm0 + vpaddq %xmm0,%xmm11,%xmm11 + vpmuludq %xmm7,%xmm4,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + + vpmuludq %xmm5,%xmm2,%xmm2 + vpaddq %xmm2,%xmm14,%xmm14 + vpmuludq %xmm9,%xmm3,%xmm1 + vpaddq %xmm1,%xmm13,%xmm13 + vpmuludq %xmm8,%xmm3,%xmm0 + vpaddq %xmm0,%xmm12,%xmm12 + vpmuludq %xmm7,%xmm3,%xmm1 + vpaddq %xmm1,%xmm11,%xmm11 + vpmuludq %xmm6,%xmm3,%xmm3 + vpaddq %xmm3,%xmm10,%xmm10 + + jz .Lshort_tail_avx + + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + + vpsrldq $6,%xmm0,%xmm2 + vpsrldq $6,%xmm1,%xmm3 + vpunpckhqdq %xmm1,%xmm0,%xmm4 + vpunpcklqdq %xmm1,%xmm0,%xmm0 + vpunpcklqdq %xmm3,%xmm2,%xmm3 + + vpsrlq $40,%xmm4,%xmm4 + vpsrlq $26,%xmm0,%xmm1 + vpand %xmm15,%xmm0,%xmm0 + vpsrlq $4,%xmm3,%xmm2 + vpand %xmm15,%xmm1,%xmm1 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm15,%xmm2,%xmm2 + vpand %xmm15,%xmm3,%xmm3 + vpor 32(%rcx),%xmm4,%xmm4 + + vpshufd $0x32,-64(%rdi),%xmm9 + vpaddq 0(%r11),%xmm0,%xmm0 + vpaddq 16(%r11),%xmm1,%xmm1 + vpaddq 32(%r11),%xmm2,%xmm2 + vpaddq 48(%r11),%xmm3,%xmm3 + vpaddq 64(%r11),%xmm4,%xmm4 + + + + + vpmuludq %xmm0,%xmm9,%xmm5 + vpaddq %xmm5,%xmm10,%xmm10 + vpmuludq %xmm1,%xmm9,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpmuludq %xmm2,%xmm9,%xmm5 + vpaddq %xmm5,%xmm12,%xmm12 + vpshufd $0x32,-48(%rdi),%xmm7 + vpmuludq %xmm3,%xmm9,%xmm6 + vpaddq %xmm6,%xmm13,%xmm13 + vpmuludq %xmm4,%xmm9,%xmm9 + vpaddq %xmm9,%xmm14,%xmm14 + + vpmuludq %xmm3,%xmm7,%xmm5 + vpaddq %xmm5,%xmm14,%xmm14 + vpshufd $0x32,-32(%rdi),%xmm8 + vpmuludq %xmm2,%xmm7,%xmm6 + vpaddq %xmm6,%xmm13,%xmm13 + vpshufd $0x32,-16(%rdi),%xmm9 + vpmuludq %xmm1,%xmm7,%xmm5 + vpaddq %xmm5,%xmm12,%xmm12 + vpmuludq %xmm0,%xmm7,%xmm7 + vpaddq %xmm7,%xmm11,%xmm11 + vpmuludq %xmm4,%xmm8,%xmm8 + vpaddq %xmm8,%xmm10,%xmm10 + + vpshufd $0x32,0(%rdi),%xmm7 + vpmuludq %xmm2,%xmm9,%xmm6 + 
vpaddq %xmm6,%xmm14,%xmm14 + vpmuludq %xmm1,%xmm9,%xmm5 + vpaddq %xmm5,%xmm13,%xmm13 + vpshufd $0x32,16(%rdi),%xmm8 + vpmuludq %xmm0,%xmm9,%xmm9 + vpaddq %xmm9,%xmm12,%xmm12 + vpmuludq %xmm4,%xmm7,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpshufd $0x32,32(%rdi),%xmm9 + vpmuludq %xmm3,%xmm7,%xmm7 + vpaddq %xmm7,%xmm10,%xmm10 + + vpmuludq %xmm1,%xmm8,%xmm5 + vpaddq %xmm5,%xmm14,%xmm14 + vpmuludq %xmm0,%xmm8,%xmm8 + vpaddq %xmm8,%xmm13,%xmm13 + vpshufd $0x32,48(%rdi),%xmm7 + vpmuludq %xmm4,%xmm9,%xmm6 + vpaddq %xmm6,%xmm12,%xmm12 + vpshufd $0x32,64(%rdi),%xmm8 + vpmuludq %xmm3,%xmm9,%xmm5 + vpaddq %xmm5,%xmm11,%xmm11 + vpmuludq %xmm2,%xmm9,%xmm9 + vpaddq %xmm9,%xmm10,%xmm10 + + vpmuludq %xmm0,%xmm7,%xmm7 + vpaddq %xmm7,%xmm14,%xmm14 + vpmuludq %xmm4,%xmm8,%xmm6 + vpaddq %xmm6,%xmm13,%xmm13 + vpmuludq %xmm3,%xmm8,%xmm5 + vpaddq %xmm5,%xmm12,%xmm12 + vpmuludq %xmm2,%xmm8,%xmm6 + vpaddq %xmm6,%xmm11,%xmm11 + vpmuludq %xmm1,%xmm8,%xmm8 + vpaddq %xmm8,%xmm10,%xmm10 + +.Lshort_tail_avx: + + + + vpsrldq $8,%xmm14,%xmm9 + vpsrldq $8,%xmm13,%xmm8 + vpsrldq $8,%xmm11,%xmm6 + vpsrldq $8,%xmm10,%xmm5 + vpsrldq $8,%xmm12,%xmm7 + vpaddq %xmm8,%xmm13,%xmm13 + vpaddq %xmm9,%xmm14,%xmm14 + vpaddq %xmm5,%xmm10,%xmm10 + vpaddq %xmm6,%xmm11,%xmm11 + vpaddq %xmm7,%xmm12,%xmm12 + + + + + vpsrlq $26,%xmm13,%xmm3 + vpand %xmm15,%xmm13,%xmm13 + vpaddq %xmm3,%xmm14,%xmm14 + + vpsrlq $26,%xmm10,%xmm0 + vpand %xmm15,%xmm10,%xmm10 + vpaddq %xmm0,%xmm11,%xmm11 + + vpsrlq $26,%xmm14,%xmm4 + vpand %xmm15,%xmm14,%xmm14 + + vpsrlq $26,%xmm11,%xmm1 + vpand %xmm15,%xmm11,%xmm11 + vpaddq %xmm1,%xmm12,%xmm12 + + vpaddq %xmm4,%xmm10,%xmm10 + vpsllq $2,%xmm4,%xmm4 + vpaddq %xmm4,%xmm10,%xmm10 + + vpsrlq $26,%xmm12,%xmm2 + vpand %xmm15,%xmm12,%xmm12 + vpaddq %xmm2,%xmm13,%xmm13 + + vpsrlq $26,%xmm10,%xmm0 + vpand %xmm15,%xmm10,%xmm10 + vpaddq %xmm0,%xmm11,%xmm11 + + vpsrlq $26,%xmm13,%xmm3 + vpand %xmm15,%xmm13,%xmm13 + vpaddq %xmm3,%xmm14,%xmm14 + + vmovd %xmm10,-112(%rdi) + vmovd %xmm11,-108(%rdi) + vmovd %xmm12,-104(%rdi) + vmovd %xmm13,-100(%rdi) + vmovd %xmm14,-96(%rdi) + leaq 88(%r11),%rsp +.cfi_def_cfa %rsp,8 + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.size poly1305_blocks_avx,.-poly1305_blocks_avx + +.type poly1305_emit_avx,@function +.align 32 +poly1305_emit_avx: +.cfi_startproc + cmpl $0,20(%rdi) + je .Lemit + + movl 0(%rdi),%eax + movl 4(%rdi),%ecx + movl 8(%rdi),%r8d + movl 12(%rdi),%r11d + movl 16(%rdi),%r10d + + shlq $26,%rcx + movq %r8,%r9 + shlq $52,%r8 + addq %rcx,%rax + shrq $12,%r9 + addq %rax,%r8 + adcq $0,%r9 + + shlq $14,%r11 + movq %r10,%rax + shrq $24,%r10 + addq %r11,%r9 + shlq $40,%rax + addq %rax,%r9 + adcq $0,%r10 + + movq %r10,%rax + movq %r10,%rcx + andq $3,%r10 + shrq $2,%rax + andq $-4,%rcx + addq %rcx,%rax + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + + movq %r8,%rax + addq $5,%r8 + movq %r9,%rcx + adcq $0,%r9 + adcq $0,%r10 + shrq $2,%r10 + cmovnzq %r8,%rax + cmovnzq %r9,%rcx + + addq 0(%rdx),%rax + adcq 8(%rdx),%rcx + movq %rax,0(%rsi) + movq %rcx,8(%rsi) + + .byte 0xf3,0xc3 +.cfi_endproc +.size poly1305_emit_avx,.-poly1305_emit_avx +.type poly1305_blocks_avx2,@function +.align 32 +poly1305_blocks_avx2: +.cfi_startproc + movl 20(%rdi),%r8d + cmpq $128,%rdx + jae .Lblocks_avx2 + testl %r8d,%r8d + jz .Lblocks + +.Lblocks_avx2: + andq $-16,%rdx + jz .Lno_data_avx2 + + vzeroupper + + testl %r8d,%r8d + jz .Lbase2_64_avx2 + + testq $63,%rdx + jz .Leven_avx2 + + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lblocks_avx2_body: + + movq %rdx,%r15 + + movq 0(%rdi),%r8 + movq 8(%rdi),%r9 + movl 16(%rdi),%ebp + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + + movl %r8d,%r14d + andq $-2147483648,%r8 + movq %r9,%r12 + movl %r9d,%ebx + andq $-2147483648,%r9 + + shrq $6,%r8 + shlq $52,%r12 + addq %r8,%r14 + shrq $12,%rbx + shrq $18,%r9 + addq %r12,%r14 + adcq %r9,%rbx + + movq %rbp,%r8 + shlq $40,%r8 + shrq $24,%rbp + addq %r8,%rbx + adcq $0,%rbp + + movq $-4,%r9 + movq %rbp,%r8 + andq %rbp,%r9 + shrq $2,%r8 + andq $3,%rbp + addq %r9,%r8 + addq %r8,%r14 + adcq $0,%rbx + adcq $0,%rbp + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + +.Lbase2_26_pre_avx2: + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%rbp + subq $16,%r15 + + call __poly1305_block + movq %r12,%rax + + testq $63,%r15 + jnz .Lbase2_26_pre_avx2 + + testq %rcx,%rcx + jz .Lstore_base2_64_avx2 + + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r11 + movq %rbx,%r12 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r11 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r11,%r14 + shlq $24,%rbp + andq $0x3ffffff,%r14 + shrq $40,%r12 + andq $0x3ffffff,%rbx + orq %r12,%rbp + + testq %r15,%r15 + jz .Lstore_base2_26_avx2 + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %ebp,%xmm4 + jmp .Lproceed_avx2 + +.align 32 +.Lstore_base2_64_avx2: + movq %r14,0(%rdi) + movq %rbx,8(%rdi) + movq %rbp,16(%rdi) + jmp .Ldone_avx2 + +.align 16 +.Lstore_base2_26_avx2: + movl %eax,0(%rdi) + movl %edx,4(%rdi) + movl %r14d,8(%rdi) + movl %ebx,12(%rdi) + movl %ebp,16(%rdi) +.align 16 +.Ldone_avx2: + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lno_data_avx2: +.Lblocks_avx2_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc + +.align 32 +.Lbase2_64_avx2: +.cfi_startproc + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lbase2_64_avx2_body: + + movq %rdx,%r15 + + movq 24(%rdi),%r11 + movq 32(%rdi),%r13 + + movq 0(%rdi),%r14 + movq 8(%rdi),%rbx + movl 16(%rdi),%ebp + + movq %r13,%r12 + movq %r13,%rax + shrq $2,%r13 + addq %r12,%r13 + + testq $63,%rdx + jz .Linit_avx2 + +.Lbase2_64_pre_avx2: + addq 0(%rsi),%r14 + adcq 8(%rsi),%rbx + leaq 16(%rsi),%rsi + adcq %rcx,%rbp + subq $16,%r15 + + call __poly1305_block + movq %r12,%rax + + testq $63,%r15 + jnz .Lbase2_64_pre_avx2 + +.Linit_avx2: + + movq %r14,%rax + movq %r14,%rdx + shrq $52,%r14 + movq %rbx,%r8 + movq %rbx,%r9 + shrq $26,%rdx + andq $0x3ffffff,%rax + shlq $12,%r8 + andq $0x3ffffff,%rdx + shrq $14,%rbx + orq %r8,%r14 + shlq $24,%rbp + andq $0x3ffffff,%r14 + shrq $40,%r9 + andq $0x3ffffff,%rbx + orq %r9,%rbp + + vmovd %eax,%xmm0 + vmovd %edx,%xmm1 + vmovd %r14d,%xmm2 + vmovd %ebx,%xmm3 + vmovd %ebp,%xmm4 + movl $1,20(%rdi) + + call __poly1305_init_avx + 
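+# __poly1305_init_avx has precomputed the powers of r in base 2^26; restore
+# the callee-saved registers and continue in the vectorized AVX2 path.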
+.Lproceed_avx2: + movq %r15,%rdx + movl OPENSSL_ia32cap_P+8(%rip),%r10d + movl $3221291008,%r11d + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbp +.cfi_restore %rbp + movq 40(%rsp),%rbx +.cfi_restore %rbx + leaq 48(%rsp),%rax + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lbase2_64_avx2_epilogue: + jmp .Ldo_avx2 +.cfi_endproc + +.align 32 +.Leven_avx2: +.cfi_startproc + movl OPENSSL_ia32cap_P+8(%rip),%r10d + vmovd 0(%rdi),%xmm0 + vmovd 4(%rdi),%xmm1 + vmovd 8(%rdi),%xmm2 + vmovd 12(%rdi),%xmm3 + vmovd 16(%rdi),%xmm4 + +.Ldo_avx2: + leaq -8(%rsp),%r11 +.cfi_def_cfa %r11,16 + subq $0x128,%rsp + leaq .Lconst(%rip),%rcx + leaq 48+64(%rdi),%rdi + vmovdqa 96(%rcx),%ymm7 + + + vmovdqu -64(%rdi),%xmm9 + andq $-512,%rsp + vmovdqu -48(%rdi),%xmm10 + vmovdqu -32(%rdi),%xmm6 + vmovdqu -16(%rdi),%xmm11 + vmovdqu 0(%rdi),%xmm12 + vmovdqu 16(%rdi),%xmm13 + leaq 144(%rsp),%rax + vmovdqu 32(%rdi),%xmm14 + vpermd %ymm9,%ymm7,%ymm9 + vmovdqu 48(%rdi),%xmm15 + vpermd %ymm10,%ymm7,%ymm10 + vmovdqu 64(%rdi),%xmm5 + vpermd %ymm6,%ymm7,%ymm6 + vmovdqa %ymm9,0(%rsp) + vpermd %ymm11,%ymm7,%ymm11 + vmovdqa %ymm10,32-144(%rax) + vpermd %ymm12,%ymm7,%ymm12 + vmovdqa %ymm6,64-144(%rax) + vpermd %ymm13,%ymm7,%ymm13 + vmovdqa %ymm11,96-144(%rax) + vpermd %ymm14,%ymm7,%ymm14 + vmovdqa %ymm12,128-144(%rax) + vpermd %ymm15,%ymm7,%ymm15 + vmovdqa %ymm13,160-144(%rax) + vpermd %ymm5,%ymm7,%ymm5 + vmovdqa %ymm14,192-144(%rax) + vmovdqa %ymm15,224-144(%rax) + vmovdqa %ymm5,256-144(%rax) + vmovdqa 64(%rcx),%ymm5 + + + + vmovdqu 0(%rsi),%xmm7 + vmovdqu 16(%rsi),%xmm8 + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + vinserti128 $1,48(%rsi),%ymm8,%ymm8 + leaq 64(%rsi),%rsi + + vpsrldq $6,%ymm7,%ymm9 + vpsrldq $6,%ymm8,%ymm10 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + vpunpcklqdq %ymm10,%ymm9,%ymm9 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + + vpsrlq $30,%ymm9,%ymm10 + vpsrlq $4,%ymm9,%ymm9 + vpsrlq $26,%ymm7,%ymm8 + vpsrlq $40,%ymm6,%ymm6 + vpand %ymm5,%ymm9,%ymm9 + vpand %ymm5,%ymm7,%ymm7 + vpand %ymm5,%ymm8,%ymm8 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + + vpaddq %ymm2,%ymm9,%ymm2 + subq $64,%rdx + jz .Ltail_avx2 + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + + + + + + + + + vpaddq %ymm0,%ymm7,%ymm0 + vmovdqa 0(%rsp),%ymm7 + vpaddq %ymm1,%ymm8,%ymm1 + vmovdqa 32(%rsp),%ymm8 + vpaddq %ymm3,%ymm10,%ymm3 + vmovdqa 96(%rsp),%ymm9 + vpaddq %ymm4,%ymm6,%ymm4 + vmovdqa 48(%rax),%ymm10 + vmovdqa 112(%rax),%ymm5 + + + + + + + + + + + + + + + + + vpmuludq %ymm2,%ymm7,%ymm13 + vpmuludq %ymm2,%ymm8,%ymm14 + vpmuludq %ymm2,%ymm9,%ymm15 + vpmuludq %ymm2,%ymm10,%ymm11 + vpmuludq %ymm2,%ymm5,%ymm12 + + vpmuludq %ymm0,%ymm8,%ymm6 + vpmuludq %ymm1,%ymm8,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq 64(%rsp),%ymm4,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm11,%ymm11 + vmovdqa -16(%rax),%ymm8 + + vpmuludq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm1,%ymm7,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vpmuludq %ymm3,%ymm7,%ymm6 + vpmuludq %ymm4,%ymm7,%ymm2 + vmovdqu 0(%rsi),%xmm7 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + vinserti128 $1,32(%rsi),%ymm7,%ymm7 + + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq %ymm4,%ymm8,%ymm2 + vmovdqu 16(%rsi),%xmm8 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vmovdqa 16(%rax),%ymm2 + vpmuludq %ymm1,%ymm9,%ymm6 + vpmuludq %ymm0,%ymm9,%ymm9 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm9,%ymm13,%ymm13 
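+# Loading of the next 64 bytes of input is interleaved with the
+# multiply-and-reduce of the current four blocks.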
+ vinserti128 $1,48(%rsi),%ymm8,%ymm8 + leaq 64(%rsi),%rsi + + vpmuludq %ymm1,%ymm2,%ymm6 + vpmuludq %ymm0,%ymm2,%ymm2 + vpsrldq $6,%ymm7,%ymm9 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm14,%ymm14 + vpmuludq %ymm3,%ymm10,%ymm6 + vpmuludq %ymm4,%ymm10,%ymm2 + vpsrldq $6,%ymm8,%ymm10 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpunpckhqdq %ymm8,%ymm7,%ymm6 + + vpmuludq %ymm3,%ymm5,%ymm3 + vpmuludq %ymm4,%ymm5,%ymm4 + vpunpcklqdq %ymm8,%ymm7,%ymm7 + vpaddq %ymm3,%ymm13,%ymm2 + vpaddq %ymm4,%ymm14,%ymm3 + vpunpcklqdq %ymm10,%ymm9,%ymm10 + vpmuludq 80(%rax),%ymm0,%ymm4 + vpmuludq %ymm1,%ymm5,%ymm0 + vmovdqa 64(%rcx),%ymm5 + vpaddq %ymm4,%ymm15,%ymm4 + vpaddq %ymm0,%ymm11,%ymm0 + + + + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm12,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $4,%ymm10,%ymm9 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpaddq %ymm15,%ymm0,%ymm0 + + vpand %ymm5,%ymm9,%ymm9 + vpsrlq $26,%ymm7,%ymm8 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpaddq %ymm13,%ymm3,%ymm3 + + vpaddq %ymm9,%ymm2,%ymm2 + vpsrlq $30,%ymm10,%ymm10 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $40,%ymm6,%ymm6 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpand %ymm5,%ymm7,%ymm7 + vpand %ymm5,%ymm8,%ymm8 + vpand %ymm5,%ymm10,%ymm10 + vpor 32(%rcx),%ymm6,%ymm6 + + subq $64,%rdx + jnz .Loop_avx2 + +.byte 0x66,0x90 +.Ltail_avx2: + + + + + + + + vpaddq %ymm0,%ymm7,%ymm0 + vmovdqu 4(%rsp),%ymm7 + vpaddq %ymm1,%ymm8,%ymm1 + vmovdqu 36(%rsp),%ymm8 + vpaddq %ymm3,%ymm10,%ymm3 + vmovdqu 100(%rsp),%ymm9 + vpaddq %ymm4,%ymm6,%ymm4 + vmovdqu 52(%rax),%ymm10 + vmovdqu 116(%rax),%ymm5 + + vpmuludq %ymm2,%ymm7,%ymm13 + vpmuludq %ymm2,%ymm8,%ymm14 + vpmuludq %ymm2,%ymm9,%ymm15 + vpmuludq %ymm2,%ymm10,%ymm11 + vpmuludq %ymm2,%ymm5,%ymm12 + + vpmuludq %ymm0,%ymm8,%ymm6 + vpmuludq %ymm1,%ymm8,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq 68(%rsp),%ymm4,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm11,%ymm11 + + vpmuludq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm1,%ymm7,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vmovdqu -12(%rax),%ymm8 + vpaddq %ymm2,%ymm12,%ymm12 + vpmuludq %ymm3,%ymm7,%ymm6 + vpmuludq %ymm4,%ymm7,%ymm2 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm2,%ymm15,%ymm15 + + vpmuludq %ymm3,%ymm8,%ymm6 + vpmuludq %ymm4,%ymm8,%ymm2 + vpaddq %ymm6,%ymm11,%ymm11 + vpaddq %ymm2,%ymm12,%ymm12 + vmovdqu 20(%rax),%ymm2 + vpmuludq %ymm1,%ymm9,%ymm6 + vpmuludq %ymm0,%ymm9,%ymm9 + vpaddq %ymm6,%ymm14,%ymm14 + vpaddq %ymm9,%ymm13,%ymm13 + + vpmuludq %ymm1,%ymm2,%ymm6 + vpmuludq %ymm0,%ymm2,%ymm2 + vpaddq %ymm6,%ymm15,%ymm15 + vpaddq %ymm2,%ymm14,%ymm14 + vpmuludq %ymm3,%ymm10,%ymm6 + vpmuludq %ymm4,%ymm10,%ymm2 + vpaddq %ymm6,%ymm12,%ymm12 + vpaddq %ymm2,%ymm13,%ymm13 + + vpmuludq %ymm3,%ymm5,%ymm3 + vpmuludq %ymm4,%ymm5,%ymm4 + vpaddq %ymm3,%ymm13,%ymm2 + vpaddq %ymm4,%ymm14,%ymm3 + vpmuludq 84(%rax),%ymm0,%ymm4 + vpmuludq %ymm1,%ymm5,%ymm0 + vmovdqa 64(%rcx),%ymm5 + vpaddq %ymm4,%ymm15,%ymm4 + vpaddq %ymm0,%ymm11,%ymm0 + + + + + vpsrldq $8,%ymm12,%ymm8 + vpsrldq $8,%ymm2,%ymm9 + vpsrldq $8,%ymm3,%ymm10 + vpsrldq $8,%ymm4,%ymm6 + vpsrldq $8,%ymm0,%ymm7 + vpaddq %ymm8,%ymm12,%ymm12 + vpaddq %ymm9,%ymm2,%ymm2 + vpaddq %ymm10,%ymm3,%ymm3 + 
vpaddq %ymm6,%ymm4,%ymm4 + vpaddq %ymm7,%ymm0,%ymm0 + + vpermq $0x2,%ymm3,%ymm10 + vpermq $0x2,%ymm4,%ymm6 + vpermq $0x2,%ymm0,%ymm7 + vpermq $0x2,%ymm12,%ymm8 + vpermq $0x2,%ymm2,%ymm9 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm6,%ymm4,%ymm4 + vpaddq %ymm7,%ymm0,%ymm0 + vpaddq %ymm8,%ymm12,%ymm12 + vpaddq %ymm9,%ymm2,%ymm2 + + + + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm12,%ymm1 + + vpsrlq $26,%ymm4,%ymm15 + vpand %ymm5,%ymm4,%ymm4 + + vpsrlq $26,%ymm1,%ymm12 + vpand %ymm5,%ymm1,%ymm1 + vpaddq %ymm12,%ymm2,%ymm2 + + vpaddq %ymm15,%ymm0,%ymm0 + vpsllq $2,%ymm15,%ymm15 + vpaddq %ymm15,%ymm0,%ymm0 + + vpsrlq $26,%ymm2,%ymm13 + vpand %ymm5,%ymm2,%ymm2 + vpaddq %ymm13,%ymm3,%ymm3 + + vpsrlq $26,%ymm0,%ymm11 + vpand %ymm5,%ymm0,%ymm0 + vpaddq %ymm11,%ymm1,%ymm1 + + vpsrlq $26,%ymm3,%ymm14 + vpand %ymm5,%ymm3,%ymm3 + vpaddq %ymm14,%ymm4,%ymm4 + + vmovd %xmm0,-112(%rdi) + vmovd %xmm1,-108(%rdi) + vmovd %xmm2,-104(%rdi) + vmovd %xmm3,-100(%rdi) + vmovd %xmm4,-96(%rdi) + leaq 8(%r11),%rsp +.cfi_def_cfa %rsp,8 + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.size poly1305_blocks_avx2,.-poly1305_blocks_avx2 +.align 64 +.Lconst: +.Lmask24: +.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0 +.L129: +.long 16777216,0,16777216,0,16777216,0,16777216,0 +.Lmask26: +.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0 +.Lpermd_avx2: +.long 2,2,2,3,2,0,2,1 +.Lpermd_avx512: +.long 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 + +.L2_44_inp_permd: +.long 0,1,1,2,2,3,7,7 +.L2_44_inp_shift: +.quad 0,12,24,64 +.L2_44_mask: +.quad 0xfffffffffff,0xfffffffffff,0x3ffffffffff,0xffffffffffffffff +.L2_44_shift_rgt: +.quad 44,44,42,64 +.L2_44_shift_lft: +.quad 8,8,10,64 + +.align 64 +.Lx_mask44: +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.quad 0xfffffffffff,0xfffffffffff,0xfffffffffff,0xfffffffffff +.Lx_mask42: +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff +.quad 0x3ffffffffff,0x3ffffffffff,0x3ffffffffff,0x3ffffffffff .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 .globl xor128_encrypt_n_pad .type xor128_encrypt_n_pad,@function .align 16 xor128_encrypt_n_pad: .cfi_startproc subq %rdx,%rsi subq %rdx,%rdi movq %rcx,%r10 shrq $4,%rcx jz .Ltail_enc nop .Loop_enc_xmm: movdqu (%rsi,%rdx,1),%xmm0 pxor (%rdx),%xmm0 movdqu %xmm0,(%rdi,%rdx,1) movdqa %xmm0,(%rdx) leaq 16(%rdx),%rdx decq %rcx jnz .Loop_enc_xmm andq $15,%r10 jz .Ldone_enc .Ltail_enc: movq $16,%rcx subq %r10,%rcx xorl %eax,%eax .Loop_enc_byte: movb (%rsi,%rdx,1),%al xorb (%rdx),%al movb %al,(%rdi,%rdx,1) movb %al,(%rdx) leaq 1(%rdx),%rdx decq %r10 jnz .Loop_enc_byte xorl %eax,%eax .Loop_enc_pad: movb %al,(%rdx) leaq 1(%rdx),%rdx decq %rcx jnz .Loop_enc_pad .Ldone_enc: movq %rdx,%rax .byte 0xf3,0xc3 .cfi_endproc .size xor128_encrypt_n_pad,.-xor128_encrypt_n_pad .globl xor128_decrypt_n_pad .type xor128_decrypt_n_pad,@function .align 16 xor128_decrypt_n_pad: .cfi_startproc subq %rdx,%rsi subq %rdx,%rdi movq %rcx,%r10 shrq $4,%rcx jz .Ltail_dec nop .Loop_dec_xmm: movdqu (%rsi,%rdx,1),%xmm0 movdqa (%rdx),%xmm1 pxor %xmm0,%xmm1 movdqu %xmm1,(%rdi,%rdx,1) movdqa %xmm0,(%rdx) leaq 16(%rdx),%rdx decq %rcx jnz .Loop_dec_xmm pxor %xmm1,%xmm1 andq $15,%r10 jz .Ldone_dec .Ltail_dec: movq $16,%rcx subq %r10,%rcx xorl %eax,%eax xorq %r11,%r11 .Loop_dec_byte: movb (%rsi,%rdx,1),%r11b movb 
(%rdx),%al xorb %r11b,%al movb %al,(%rdi,%rdx,1) movb %r11b,(%rdx) leaq 1(%rdx),%rdx decq %r10 jnz .Loop_dec_byte xorl %eax,%eax .Loop_dec_pad: movb %al,(%rdx) leaq 1(%rdx),%rdx decq %rcx jnz .Loop_dec_pad .Ldone_dec: movq %rdx,%rax .byte 0xf3,0xc3 .cfi_endproc .size xor128_decrypt_n_pad,.-xor128_decrypt_n_pad Index: stable/12/secure/lib/libcrypto/amd64/rsaz-avx2.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/rsaz-avx2.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/rsaz-avx2.S (revision 364963) @@ -1,27 +1,1746 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from rsaz-avx2.pl. */ .text -.globl rsaz_avx2_eligible -.type rsaz_avx2_eligible,@function -rsaz_avx2_eligible: - xorl %eax,%eax - .byte 0xf3,0xc3 -.size rsaz_avx2_eligible,.-rsaz_avx2_eligible - .globl rsaz_1024_sqr_avx2 -.globl rsaz_1024_mul_avx2 -.globl rsaz_1024_norm2red_avx2 -.globl rsaz_1024_red2norm_avx2 -.globl rsaz_1024_scatter5_avx2 -.globl rsaz_1024_gather5_avx2 .type rsaz_1024_sqr_avx2,@function +.align 64 rsaz_1024_sqr_avx2: +.cfi_startproc + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + movq %rax,%rbp +.cfi_def_cfa_register %rbp + movq %rdx,%r13 + subq $832,%rsp + movq %r13,%r15 + subq $-128,%rdi + subq $-128,%rsi + subq $-128,%r13 + + andq $4095,%r15 + addq $320,%r15 + shrq $12,%r15 + vpxor %ymm9,%ymm9,%ymm9 + jz .Lsqr_1024_no_n_copy + + + + + + subq $320,%rsp + vmovdqu 0-128(%r13),%ymm0 + andq $-2048,%rsp + vmovdqu 32-128(%r13),%ymm1 + vmovdqu 64-128(%r13),%ymm2 + vmovdqu 96-128(%r13),%ymm3 + vmovdqu 128-128(%r13),%ymm4 + vmovdqu 160-128(%r13),%ymm5 + vmovdqu 192-128(%r13),%ymm6 + vmovdqu 224-128(%r13),%ymm7 + vmovdqu 256-128(%r13),%ymm8 + leaq 832+128(%rsp),%r13 + vmovdqu %ymm0,0-128(%r13) + vmovdqu %ymm1,32-128(%r13) + vmovdqu %ymm2,64-128(%r13) + vmovdqu %ymm3,96-128(%r13) + vmovdqu %ymm4,128-128(%r13) + vmovdqu %ymm5,160-128(%r13) + vmovdqu %ymm6,192-128(%r13) + vmovdqu %ymm7,224-128(%r13) + vmovdqu %ymm8,256-128(%r13) + vmovdqu %ymm9,288-128(%r13) + +.Lsqr_1024_no_n_copy: + andq $-1024,%rsp + + vmovdqu 32-128(%rsi),%ymm1 + vmovdqu 64-128(%rsi),%ymm2 + vmovdqu 96-128(%rsi),%ymm3 + vmovdqu 128-128(%rsi),%ymm4 + vmovdqu 160-128(%rsi),%ymm5 + vmovdqu 192-128(%rsi),%ymm6 + vmovdqu 224-128(%rsi),%ymm7 + vmovdqu 256-128(%rsi),%ymm8 + + leaq 192(%rsp),%rbx + vmovdqu .Land_mask(%rip),%ymm15 + jmp .LOOP_GRANDE_SQR_1024 + +.align 32 +.LOOP_GRANDE_SQR_1024: + leaq 576+128(%rsp),%r9 + leaq 448(%rsp),%r12 + + + + + vpaddq %ymm1,%ymm1,%ymm1 + vpbroadcastq 0-128(%rsi),%ymm10 + vpaddq %ymm2,%ymm2,%ymm2 + vmovdqa %ymm1,0-128(%r9) + vpaddq %ymm3,%ymm3,%ymm3 + vmovdqa %ymm2,32-128(%r9) + vpaddq %ymm4,%ymm4,%ymm4 + vmovdqa %ymm3,64-128(%r9) + vpaddq %ymm5,%ymm5,%ymm5 + vmovdqa %ymm4,96-128(%r9) + vpaddq %ymm6,%ymm6,%ymm6 + vmovdqa %ymm5,128-128(%r9) + vpaddq %ymm7,%ymm7,%ymm7 + vmovdqa %ymm6,160-128(%r9) + vpaddq %ymm8,%ymm8,%ymm8 + vmovdqa %ymm7,192-128(%r9) + vpxor %ymm9,%ymm9,%ymm9 + vmovdqa %ymm8,224-128(%r9) + + vpmuludq 0-128(%rsi),%ymm10,%ymm0 + vpbroadcastq 32-128(%rsi),%ymm11 + vmovdqu %ymm9,288-192(%rbx) + vpmuludq %ymm10,%ymm1,%ymm1 + vmovdqu %ymm9,320-448(%r12) + vpmuludq %ymm10,%ymm2,%ymm2 + vmovdqu %ymm9,352-448(%r12) + vpmuludq %ymm10,%ymm3,%ymm3 + vmovdqu %ymm9,384-448(%r12) + vpmuludq %ymm10,%ymm4,%ymm4 + 
vmovdqu %ymm9,416-448(%r12) + vpmuludq %ymm10,%ymm5,%ymm5 + vmovdqu %ymm9,448-448(%r12) + vpmuludq %ymm10,%ymm6,%ymm6 + vmovdqu %ymm9,480-448(%r12) + vpmuludq %ymm10,%ymm7,%ymm7 + vmovdqu %ymm9,512-448(%r12) + vpmuludq %ymm10,%ymm8,%ymm8 + vpbroadcastq 64-128(%rsi),%ymm10 + vmovdqu %ymm9,544-448(%r12) + + movq %rsi,%r15 + movl $4,%r14d + jmp .Lsqr_entry_1024 +.align 32 +.LOOP_SQR_1024: + vpbroadcastq 32-128(%r15),%ymm11 + vpmuludq 0-128(%rsi),%ymm10,%ymm0 + vpaddq 0-192(%rbx),%ymm0,%ymm0 + vpmuludq 0-128(%r9),%ymm10,%ymm1 + vpaddq 32-192(%rbx),%ymm1,%ymm1 + vpmuludq 32-128(%r9),%ymm10,%ymm2 + vpaddq 64-192(%rbx),%ymm2,%ymm2 + vpmuludq 64-128(%r9),%ymm10,%ymm3 + vpaddq 96-192(%rbx),%ymm3,%ymm3 + vpmuludq 96-128(%r9),%ymm10,%ymm4 + vpaddq 128-192(%rbx),%ymm4,%ymm4 + vpmuludq 128-128(%r9),%ymm10,%ymm5 + vpaddq 160-192(%rbx),%ymm5,%ymm5 + vpmuludq 160-128(%r9),%ymm10,%ymm6 + vpaddq 192-192(%rbx),%ymm6,%ymm6 + vpmuludq 192-128(%r9),%ymm10,%ymm7 + vpaddq 224-192(%rbx),%ymm7,%ymm7 + vpmuludq 224-128(%r9),%ymm10,%ymm8 + vpbroadcastq 64-128(%r15),%ymm10 + vpaddq 256-192(%rbx),%ymm8,%ymm8 +.Lsqr_entry_1024: + vmovdqu %ymm0,0-192(%rbx) + vmovdqu %ymm1,32-192(%rbx) + + vpmuludq 32-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 32-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq 64-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 96-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 128-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 160-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 192-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 224-128(%r9),%ymm11,%ymm0 + vpbroadcastq 96-128(%r15),%ymm11 + vpaddq 288-192(%rbx),%ymm0,%ymm0 + + vmovdqu %ymm2,64-192(%rbx) + vmovdqu %ymm3,96-192(%rbx) + + vpmuludq 64-128(%rsi),%ymm10,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 64-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 96-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 128-128(%r9),%ymm10,%ymm13 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 160-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 192-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm0,%ymm0 + vpmuludq 224-128(%r9),%ymm10,%ymm1 + vpbroadcastq 128-128(%r15),%ymm10 + vpaddq 320-448(%r12),%ymm1,%ymm1 + + vmovdqu %ymm4,128-192(%rbx) + vmovdqu %ymm5,160-192(%rbx) + + vpmuludq 96-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm6,%ymm6 + vpmuludq 96-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm7,%ymm7 + vpmuludq 128-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm8,%ymm8 + vpmuludq 160-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm0,%ymm0 + vpmuludq 192-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm1,%ymm1 + vpmuludq 224-128(%r9),%ymm11,%ymm2 + vpbroadcastq 160-128(%r15),%ymm11 + vpaddq 352-448(%r12),%ymm2,%ymm2 + + vmovdqu %ymm6,192-192(%rbx) + vmovdqu %ymm7,224-192(%rbx) + + vpmuludq 128-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 128-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm0,%ymm0 + vpmuludq 160-128(%r9),%ymm10,%ymm13 + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 192-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 224-128(%r9),%ymm10,%ymm3 + vpbroadcastq 192-128(%r15),%ymm10 + vpaddq 384-448(%r12),%ymm3,%ymm3 + + vmovdqu %ymm8,256-192(%rbx) + vmovdqu %ymm0,288-192(%rbx) + leaq 8(%rbx),%rbx + + vpmuludq 160-128(%rsi),%ymm11,%ymm13 + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 160-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 192-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq 
224-128(%r9),%ymm11,%ymm4 + vpbroadcastq 224-128(%r15),%ymm11 + vpaddq 416-448(%r12),%ymm4,%ymm4 + + vmovdqu %ymm1,320-448(%r12) + vmovdqu %ymm2,352-448(%r12) + + vpmuludq 192-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm3,%ymm3 + vpmuludq 192-128(%r9),%ymm10,%ymm14 + vpbroadcastq 256-128(%r15),%ymm0 + vpaddq %ymm14,%ymm4,%ymm4 + vpmuludq 224-128(%r9),%ymm10,%ymm5 + vpbroadcastq 0+8-128(%r15),%ymm10 + vpaddq 448-448(%r12),%ymm5,%ymm5 + + vmovdqu %ymm3,384-448(%r12) + vmovdqu %ymm4,416-448(%r12) + leaq 8(%r15),%r15 + + vpmuludq 224-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 224-128(%r9),%ymm11,%ymm6 + vpaddq 480-448(%r12),%ymm6,%ymm6 + + vpmuludq 256-128(%rsi),%ymm0,%ymm7 + vmovdqu %ymm5,448-448(%r12) + vpaddq 512-448(%r12),%ymm7,%ymm7 + vmovdqu %ymm6,480-448(%r12) + vmovdqu %ymm7,512-448(%r12) + leaq 8(%r12),%r12 + + decl %r14d + jnz .LOOP_SQR_1024 + + vmovdqu 256(%rsp),%ymm8 + vmovdqu 288(%rsp),%ymm1 + vmovdqu 320(%rsp),%ymm2 + leaq 192(%rsp),%rbx + + vpsrlq $29,%ymm8,%ymm14 + vpand %ymm15,%ymm8,%ymm8 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + + vpermq $0x93,%ymm14,%ymm14 + vpxor %ymm9,%ymm9,%ymm9 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm8,%ymm8 + vpblendd $3,%ymm11,%ymm9,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vpaddq %ymm11,%ymm2,%ymm2 + vmovdqu %ymm1,288-192(%rbx) + vmovdqu %ymm2,320-192(%rbx) + + movq (%rsp),%rax + movq 8(%rsp),%r10 + movq 16(%rsp),%r11 + movq 24(%rsp),%r12 + vmovdqu 32(%rsp),%ymm1 + vmovdqu 64-192(%rbx),%ymm2 + vmovdqu 96-192(%rbx),%ymm3 + vmovdqu 128-192(%rbx),%ymm4 + vmovdqu 160-192(%rbx),%ymm5 + vmovdqu 192-192(%rbx),%ymm6 + vmovdqu 224-192(%rbx),%ymm7 + + movq %rax,%r9 + imull %ecx,%eax + andl $0x1fffffff,%eax + vmovd %eax,%xmm12 + + movq %rax,%rdx + imulq -128(%r13),%rax + vpbroadcastq %xmm12,%ymm12 + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%r13),%rax + shrq $29,%r9 + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%r13),%rax + addq %r9,%r10 + addq %rax,%r11 + imulq 24-128(%r13),%rdx + addq %rdx,%r12 + + movq %r10,%rax + imull %ecx,%eax + andl $0x1fffffff,%eax + + movl $9,%r14d + jmp .LOOP_REDUCE_1024 + +.align 32 +.LOOP_REDUCE_1024: + vmovd %eax,%xmm13 + vpbroadcastq %xmm13,%ymm13 + + vpmuludq 32-128(%r13),%ymm12,%ymm10 + movq %rax,%rdx + imulq -128(%r13),%rax + vpaddq %ymm10,%ymm1,%ymm1 + addq %rax,%r10 + vpmuludq 64-128(%r13),%ymm12,%ymm14 + movq %rdx,%rax + imulq 8-128(%r13),%rax + vpaddq %ymm14,%ymm2,%ymm2 + vpmuludq 96-128(%r13),%ymm12,%ymm11 +.byte 0x67 + addq %rax,%r11 +.byte 0x67 + movq %rdx,%rax + imulq 16-128(%r13),%rax + shrq $29,%r10 + vpaddq %ymm11,%ymm3,%ymm3 + vpmuludq 128-128(%r13),%ymm12,%ymm10 + addq %rax,%r12 + addq %r10,%r11 + vpaddq %ymm10,%ymm4,%ymm4 + vpmuludq 160-128(%r13),%ymm12,%ymm14 + movq %r11,%rax + imull %ecx,%eax + vpaddq %ymm14,%ymm5,%ymm5 + vpmuludq 192-128(%r13),%ymm12,%ymm11 + andl $0x1fffffff,%eax + vpaddq %ymm11,%ymm6,%ymm6 + vpmuludq 224-128(%r13),%ymm12,%ymm10 + vpaddq %ymm10,%ymm7,%ymm7 + vpmuludq 256-128(%r13),%ymm12,%ymm14 + vmovd %eax,%xmm12 + + vpaddq %ymm14,%ymm8,%ymm8 + + vpbroadcastq %xmm12,%ymm12 + + vpmuludq 32-8-128(%r13),%ymm13,%ymm11 + vmovdqu 96-8-128(%r13),%ymm14 + movq %rax,%rdx + imulq -128(%r13),%rax + vpaddq %ymm11,%ymm1,%ymm1 + vpmuludq 64-8-128(%r13),%ymm13,%ymm10 + vmovdqu 128-8-128(%r13),%ymm11 + addq %rax,%r11 + movq %rdx,%rax + imulq 8-128(%r13),%rax + vpaddq %ymm10,%ymm2,%ymm2 + addq %r12,%rax + shrq $29,%r11 + vpmuludq %ymm13,%ymm14,%ymm14 + vmovdqu 160-8-128(%r13),%ymm10 + addq 
%r11,%rax + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq %ymm13,%ymm11,%ymm11 + vmovdqu 192-8-128(%r13),%ymm14 +.byte 0x67 + movq %rax,%r12 + imull %ecx,%eax + vpaddq %ymm11,%ymm4,%ymm4 + vpmuludq %ymm13,%ymm10,%ymm10 +.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 + andl $0x1fffffff,%eax + vpaddq %ymm10,%ymm5,%ymm5 + vpmuludq %ymm13,%ymm14,%ymm14 + vmovdqu 256-8-128(%r13),%ymm10 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq %ymm13,%ymm11,%ymm11 + vmovdqu 288-8-128(%r13),%ymm9 + vmovd %eax,%xmm0 + imulq -128(%r13),%rax + vpaddq %ymm11,%ymm7,%ymm7 + vpmuludq %ymm13,%ymm10,%ymm10 + vmovdqu 32-16-128(%r13),%ymm14 + vpbroadcastq %xmm0,%ymm0 + vpaddq %ymm10,%ymm8,%ymm8 + vpmuludq %ymm13,%ymm9,%ymm9 + vmovdqu 64-16-128(%r13),%ymm11 + addq %rax,%r12 + + vmovdqu 32-24-128(%r13),%ymm13 + vpmuludq %ymm12,%ymm14,%ymm14 + vmovdqu 96-16-128(%r13),%ymm10 + vpaddq %ymm14,%ymm1,%ymm1 + vpmuludq %ymm0,%ymm13,%ymm13 + vpmuludq %ymm12,%ymm11,%ymm11 +.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff + vpaddq %ymm1,%ymm13,%ymm13 + vpaddq %ymm11,%ymm2,%ymm2 + vpmuludq %ymm12,%ymm10,%ymm10 + vmovdqu 160-16-128(%r13),%ymm11 +.byte 0x67 + vmovq %xmm13,%rax + vmovdqu %ymm13,(%rsp) + vpaddq %ymm10,%ymm3,%ymm3 + vpmuludq %ymm12,%ymm14,%ymm14 + vmovdqu 192-16-128(%r13),%ymm10 + vpaddq %ymm14,%ymm4,%ymm4 + vpmuludq %ymm12,%ymm11,%ymm11 + vmovdqu 224-16-128(%r13),%ymm14 + vpaddq %ymm11,%ymm5,%ymm5 + vpmuludq %ymm12,%ymm10,%ymm10 + vmovdqu 256-16-128(%r13),%ymm11 + vpaddq %ymm10,%ymm6,%ymm6 + vpmuludq %ymm12,%ymm14,%ymm14 + shrq $29,%r12 + vmovdqu 288-16-128(%r13),%ymm10 + addq %r12,%rax + vpaddq %ymm14,%ymm7,%ymm7 + vpmuludq %ymm12,%ymm11,%ymm11 + + movq %rax,%r9 + imull %ecx,%eax + vpaddq %ymm11,%ymm8,%ymm8 + vpmuludq %ymm12,%ymm10,%ymm10 + andl $0x1fffffff,%eax + vmovd %eax,%xmm12 + vmovdqu 96-24-128(%r13),%ymm11 +.byte 0x67 + vpaddq %ymm10,%ymm9,%ymm9 + vpbroadcastq %xmm12,%ymm12 + + vpmuludq 64-24-128(%r13),%ymm0,%ymm14 + vmovdqu 128-24-128(%r13),%ymm10 + movq %rax,%rdx + imulq -128(%r13),%rax + movq 8(%rsp),%r10 + vpaddq %ymm14,%ymm2,%ymm1 + vpmuludq %ymm0,%ymm11,%ymm11 + vmovdqu 160-24-128(%r13),%ymm14 + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%r13),%rax +.byte 0x67 + shrq $29,%r9 + movq 16(%rsp),%r11 + vpaddq %ymm11,%ymm3,%ymm2 + vpmuludq %ymm0,%ymm10,%ymm10 + vmovdqu 192-24-128(%r13),%ymm11 + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%r13),%rax + vpaddq %ymm10,%ymm4,%ymm3 + vpmuludq %ymm0,%ymm14,%ymm14 + vmovdqu 224-24-128(%r13),%ymm10 + imulq 24-128(%r13),%rdx + addq %rax,%r11 + leaq (%r9,%r10,1),%rax + vpaddq %ymm14,%ymm5,%ymm4 + vpmuludq %ymm0,%ymm11,%ymm11 + vmovdqu 256-24-128(%r13),%ymm14 + movq %rax,%r10 + imull %ecx,%eax + vpmuludq %ymm0,%ymm10,%ymm10 + vpaddq %ymm11,%ymm6,%ymm5 + vmovdqu 288-24-128(%r13),%ymm11 + andl $0x1fffffff,%eax + vpaddq %ymm10,%ymm7,%ymm6 + vpmuludq %ymm0,%ymm14,%ymm14 + addq 24(%rsp),%rdx + vpaddq %ymm14,%ymm8,%ymm7 + vpmuludq %ymm0,%ymm11,%ymm11 + vpaddq %ymm11,%ymm9,%ymm8 + vmovq %r12,%xmm9 + movq %rdx,%r12 + + decl %r14d + jnz .LOOP_REDUCE_1024 + leaq 448(%rsp),%r12 + vpaddq %ymm9,%ymm13,%ymm0 + vpxor %ymm9,%ymm9,%ymm9 + + vpaddq 288-192(%rbx),%ymm0,%ymm0 + vpaddq 320-448(%r12),%ymm1,%ymm1 + vpaddq 352-448(%r12),%ymm2,%ymm2 + vpaddq 384-448(%r12),%ymm3,%ymm3 + vpaddq 416-448(%r12),%ymm4,%ymm4 + vpaddq 448-448(%r12),%ymm5,%ymm5 + vpaddq 480-448(%r12),%ymm6,%ymm6 + vpaddq 512-448(%r12),%ymm7,%ymm7 + vpaddq 544-448(%r12),%ymm8,%ymm8 + + vpsrlq $29,%ymm0,%ymm14 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm12 + vpermq 
$0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm12,%ymm12 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm13,%ymm13 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm0,%ymm0 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm2,%ymm2 + vpblendd $3,%ymm13,%ymm9,%ymm13 + vpaddq %ymm12,%ymm3,%ymm3 + vpaddq %ymm13,%ymm4,%ymm4 + + vpsrlq $29,%ymm0,%ymm14 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm12,%ymm12 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm13,%ymm13 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm0,%ymm0 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vmovdqu %ymm0,0-128(%rdi) + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm2,%ymm2 + vmovdqu %ymm1,32-128(%rdi) + vpblendd $3,%ymm13,%ymm9,%ymm13 + vpaddq %ymm12,%ymm3,%ymm3 + vmovdqu %ymm2,64-128(%rdi) + vpaddq %ymm13,%ymm4,%ymm4 + vmovdqu %ymm3,96-128(%rdi) + vpsrlq $29,%ymm4,%ymm14 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm11 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm13,%ymm13 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm4,%ymm4 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm5,%ymm5 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm6,%ymm6 + vpblendd $3,%ymm13,%ymm0,%ymm13 + vpaddq %ymm12,%ymm7,%ymm7 + vpaddq %ymm13,%ymm8,%ymm8 + + vpsrlq $29,%ymm4,%ymm14 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm11 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm13,%ymm13 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm4,%ymm4 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm5,%ymm5 + vmovdqu %ymm4,128-128(%rdi) + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm6,%ymm6 + vmovdqu %ymm5,160-128(%rdi) + vpblendd $3,%ymm13,%ymm0,%ymm13 + vpaddq %ymm12,%ymm7,%ymm7 + vmovdqu %ymm6,192-128(%rdi) + vpaddq %ymm13,%ymm8,%ymm8 + vmovdqu %ymm7,224-128(%rdi) + vmovdqu %ymm8,256-128(%rdi) + + movq %rdi,%rsi + decl %r8d + jne .LOOP_GRANDE_SQR_1024 + + vzeroall + movq %rbp,%rax +.cfi_def_cfa_register %rax + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lsqr_1024_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 +.globl rsaz_1024_mul_avx2 +.type rsaz_1024_mul_avx2,@function +.align 64 rsaz_1024_mul_avx2: -rsaz_1024_norm2red_avx2: +.cfi_startproc + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + 
pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + movq %rax,%rbp +.cfi_def_cfa_register %rbp + vzeroall + movq %rdx,%r13 + subq $64,%rsp + + + + + + +.byte 0x67,0x67 + movq %rsi,%r15 + andq $4095,%r15 + addq $320,%r15 + shrq $12,%r15 + movq %rsi,%r15 + cmovnzq %r13,%rsi + cmovnzq %r15,%r13 + + movq %rcx,%r15 + subq $-128,%rsi + subq $-128,%rcx + subq $-128,%rdi + + andq $4095,%r15 + addq $320,%r15 +.byte 0x67,0x67 + shrq $12,%r15 + jz .Lmul_1024_no_n_copy + + + + + + subq $320,%rsp + vmovdqu 0-128(%rcx),%ymm0 + andq $-512,%rsp + vmovdqu 32-128(%rcx),%ymm1 + vmovdqu 64-128(%rcx),%ymm2 + vmovdqu 96-128(%rcx),%ymm3 + vmovdqu 128-128(%rcx),%ymm4 + vmovdqu 160-128(%rcx),%ymm5 + vmovdqu 192-128(%rcx),%ymm6 + vmovdqu 224-128(%rcx),%ymm7 + vmovdqu 256-128(%rcx),%ymm8 + leaq 64+128(%rsp),%rcx + vmovdqu %ymm0,0-128(%rcx) + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm1,32-128(%rcx) + vpxor %ymm1,%ymm1,%ymm1 + vmovdqu %ymm2,64-128(%rcx) + vpxor %ymm2,%ymm2,%ymm2 + vmovdqu %ymm3,96-128(%rcx) + vpxor %ymm3,%ymm3,%ymm3 + vmovdqu %ymm4,128-128(%rcx) + vpxor %ymm4,%ymm4,%ymm4 + vmovdqu %ymm5,160-128(%rcx) + vpxor %ymm5,%ymm5,%ymm5 + vmovdqu %ymm6,192-128(%rcx) + vpxor %ymm6,%ymm6,%ymm6 + vmovdqu %ymm7,224-128(%rcx) + vpxor %ymm7,%ymm7,%ymm7 + vmovdqu %ymm8,256-128(%rcx) + vmovdqa %ymm0,%ymm8 + vmovdqu %ymm9,288-128(%rcx) +.Lmul_1024_no_n_copy: + andq $-64,%rsp + + movq (%r13),%rbx + vpbroadcastq (%r13),%ymm10 + vmovdqu %ymm0,(%rsp) + xorq %r9,%r9 +.byte 0x67 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + + vmovdqu .Land_mask(%rip),%ymm15 + movl $9,%r14d + vmovdqu %ymm9,288-128(%rdi) + jmp .Loop_mul_1024 + +.align 32 +.Loop_mul_1024: + vpsrlq $29,%ymm3,%ymm9 + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %r9,%rax + movq %rbx,%r10 + imulq 8-128(%rsi),%r10 + addq 8(%rsp),%r10 + + movq %rax,%r9 + imull %r8d,%eax + andl $0x1fffffff,%eax + + movq %rbx,%r11 + imulq 16-128(%rsi),%r11 + addq 16(%rsp),%r11 + + movq %rbx,%r12 + imulq 24-128(%rsi),%r12 + addq 24(%rsp),%r12 + vpmuludq 32-128(%rsi),%ymm10,%ymm0 + vmovd %eax,%xmm11 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq 64-128(%rsi),%ymm10,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 96-128(%rsi),%ymm10,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq 128-128(%rsi),%ymm10,%ymm0 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq 160-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 192-128(%rsi),%ymm10,%ymm13 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq 224-128(%rsi),%ymm10,%ymm0 + vpermq $0x93,%ymm9,%ymm9 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq 256-128(%rsi),%ymm10,%ymm12 + vpbroadcastq 8(%r13),%ymm10 + vpaddq %ymm12,%ymm8,%ymm8 + + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%rcx),%rax + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%rcx),%rax + addq %rax,%r11 + shrq $29,%r9 + imulq 24-128(%rcx),%rdx + addq %rdx,%r12 + addq %r9,%r10 + + vpmuludq 32-128(%rcx),%ymm11,%ymm13 + vmovq %xmm10,%rbx + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 64-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm0,%ymm2,%ymm2 + vpmuludq 96-128(%rcx),%ymm11,%ymm12 + vpaddq %ymm12,%ymm3,%ymm3 + vpmuludq 128-128(%rcx),%ymm11,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 160-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm0,%ymm5,%ymm5 + vpmuludq 192-128(%rcx),%ymm11,%ymm12 + vpaddq %ymm12,%ymm6,%ymm6 + vpmuludq 224-128(%rcx),%ymm11,%ymm13 + vpblendd $3,%ymm14,%ymm9,%ymm12 + vpaddq 
%ymm13,%ymm7,%ymm7 + vpmuludq 256-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm12,%ymm3,%ymm3 + vpaddq %ymm0,%ymm8,%ymm8 + + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %rax,%r10 + vmovdqu -8+32-128(%rsi),%ymm12 + movq %rbx,%rax + imulq 8-128(%rsi),%rax + addq %rax,%r11 + vmovdqu -8+64-128(%rsi),%ymm13 + + movq %r10,%rax + vpblendd $0xfc,%ymm14,%ymm9,%ymm9 + imull %r8d,%eax + vpaddq %ymm9,%ymm4,%ymm4 + andl $0x1fffffff,%eax + + imulq 16-128(%rsi),%rbx + addq %rbx,%r12 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovd %eax,%xmm11 + vmovdqu -8+96-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -8+128-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -8+160-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -8+192-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -8+224-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -8+256-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -8+288-128(%rsi),%ymm9 + vpaddq %ymm12,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm13,%ymm13 + vpaddq %ymm13,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm9,%ymm9 + vpbroadcastq 16(%r13),%ymm10 + + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r10 + vmovdqu -8+32-128(%rcx),%ymm0 + movq %rdx,%rax + imulq 8-128(%rcx),%rax + addq %rax,%r11 + vmovdqu -8+64-128(%rcx),%ymm12 + shrq $29,%r10 + imulq 16-128(%rcx),%rdx + addq %rdx,%r12 + addq %r10,%r11 + + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -8+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -8+128-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -8+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -8+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -8+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -8+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -8+288-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm11,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm11,%ymm13,%ymm13 + vpaddq %ymm13,%ymm9,%ymm9 + + vmovdqu -16+32-128(%rsi),%ymm0 + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %r11,%rax + + vmovdqu -16+64-128(%rsi),%ymm12 + movq %rax,%r11 + imull %r8d,%eax + andl $0x1fffffff,%eax + + imulq 8-128(%rsi),%rbx + addq %rbx,%r12 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovd %eax,%xmm11 + vmovdqu -16+96-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm12,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -16+128-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -16+160-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -16+192-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -16+224-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -16+256-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -16+288-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq 24(%r13),%ymm10 + vpaddq %ymm13,%ymm9,%ymm9 + + vmovdqu -16+32-128(%rcx),%ymm0 + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r11 + vmovdqu -16+64-128(%rcx),%ymm12 + imulq 
8-128(%rcx),%rdx + addq %rdx,%r12 + shrq $29,%r11 + + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -16+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -16+128-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -16+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -16+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -16+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -16+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -16+288-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -24+32-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+64-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm9,%ymm9 + + addq %r11,%r12 + imulq -128(%rsi),%rbx + addq %rbx,%r12 + + movq %r12,%rax + imull %r8d,%eax + andl $0x1fffffff,%eax + + vpmuludq %ymm10,%ymm0,%ymm0 + vmovd %eax,%xmm11 + vmovdqu -24+96-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm12,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -24+128-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -24+160-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -24+192-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -24+224-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -24+256-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -24+288-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq 32(%r13),%ymm10 + vpaddq %ymm13,%ymm9,%ymm9 + addq $32,%r13 + + vmovdqu -24+32-128(%rcx),%ymm0 + imulq -128(%rcx),%rax + addq %rax,%r12 + shrq $29,%r12 + + vmovdqu -24+64-128(%rcx),%ymm12 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -24+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm0 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu %ymm0,(%rsp) + vpaddq %ymm12,%ymm2,%ymm1 + vmovdqu -24+128-128(%rcx),%ymm0 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm2 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -24+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm3 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -24+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm4 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm5 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -24+288-128(%rcx),%ymm13 + movq %r12,%r9 + vpaddq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm11,%ymm12,%ymm12 + addq (%rsp),%r9 + vpaddq %ymm12,%ymm8,%ymm7 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovq %r12,%xmm12 + vpaddq %ymm13,%ymm9,%ymm8 + + decl %r14d + jnz .Loop_mul_1024 + vpaddq (%rsp),%ymm12,%ymm0 + + vpsrlq $29,%ymm0,%ymm12 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm13 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm10,%ymm10 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpermq $0x93,%ymm11,%ymm11 + vpaddq %ymm9,%ymm0,%ymm0 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm1,%ymm1 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm2,%ymm2 + vpblendd 
$3,%ymm11,%ymm14,%ymm11 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm11,%ymm4,%ymm4 + + vpsrlq $29,%ymm0,%ymm12 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm13 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm10,%ymm10 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm11,%ymm11 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm0,%ymm0 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm1,%ymm1 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm2,%ymm2 + vpblendd $3,%ymm11,%ymm14,%ymm11 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm11,%ymm4,%ymm4 + + vmovdqu %ymm0,0-128(%rdi) + vmovdqu %ymm1,32-128(%rdi) + vmovdqu %ymm2,64-128(%rdi) + vmovdqu %ymm3,96-128(%rdi) + vpsrlq $29,%ymm4,%ymm12 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm13 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm10,%ymm10 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm4,%ymm4 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm5,%ymm5 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm6,%ymm6 + vpblendd $3,%ymm11,%ymm0,%ymm11 + vpaddq %ymm10,%ymm7,%ymm7 + vpaddq %ymm11,%ymm8,%ymm8 + + vpsrlq $29,%ymm4,%ymm12 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm13 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm10,%ymm10 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm4,%ymm4 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm5,%ymm5 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm6,%ymm6 + vpblendd $3,%ymm11,%ymm0,%ymm11 + vpaddq %ymm10,%ymm7,%ymm7 + vpaddq %ymm11,%ymm8,%ymm8 + + vmovdqu %ymm4,128-128(%rdi) + vmovdqu %ymm5,160-128(%rdi) + vmovdqu %ymm6,192-128(%rdi) + vmovdqu %ymm7,224-128(%rdi) + vmovdqu %ymm8,256-128(%rdi) + vzeroupper + + movq %rbp,%rax +.cfi_def_cfa_register %rax + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lmul_1024_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 +.globl rsaz_1024_red2norm_avx2 +.type rsaz_1024_red2norm_avx2,@function +.align 32 rsaz_1024_red2norm_avx2: +.cfi_startproc + subq $-128,%rsi + xorq %rax,%rax + movq -128(%rsi),%r8 + movq -120(%rsi),%r9 + movq -112(%rsi),%r10 + shlq $0,%r8 + shlq $29,%r9 + movq %r10,%r11 + shlq $58,%r10 + shrq $6,%r11 + addq %r8,%rax + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,0(%rdi) + movq %r11,%rax + movq -104(%rsi),%r8 + movq -96(%rsi),%r9 + shlq $23,%r8 + movq %r9,%r10 + shlq $52,%r9 + shrq $12,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,8(%rdi) + movq %r10,%rax + movq -88(%rsi),%r11 + movq -80(%rsi),%r8 + shlq $17,%r11 + movq 
%r8,%r9 + shlq $46,%r8 + shrq $18,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,16(%rdi) + movq %r9,%rax + movq -72(%rsi),%r10 + movq -64(%rsi),%r11 + shlq $11,%r10 + movq %r11,%r8 + shlq $40,%r11 + shrq $24,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,24(%rdi) + movq %r8,%rax + movq -56(%rsi),%r9 + movq -48(%rsi),%r10 + movq -40(%rsi),%r11 + shlq $5,%r9 + shlq $34,%r10 + movq %r11,%r8 + shlq $63,%r11 + shrq $1,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,32(%rdi) + movq %r8,%rax + movq -32(%rsi),%r9 + movq -24(%rsi),%r10 + shlq $28,%r9 + movq %r10,%r11 + shlq $57,%r10 + shrq $7,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,40(%rdi) + movq %r11,%rax + movq -16(%rsi),%r8 + movq -8(%rsi),%r9 + shlq $22,%r8 + movq %r9,%r10 + shlq $51,%r9 + shrq $13,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,48(%rdi) + movq %r10,%rax + movq 0(%rsi),%r11 + movq 8(%rsi),%r8 + shlq $16,%r11 + movq %r8,%r9 + shlq $45,%r8 + shrq $19,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,56(%rdi) + movq %r9,%rax + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + shlq $10,%r10 + movq %r11,%r8 + shlq $39,%r11 + shrq $25,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,64(%rdi) + movq %r8,%rax + movq 32(%rsi),%r9 + movq 40(%rsi),%r10 + movq 48(%rsi),%r11 + shlq $4,%r9 + shlq $33,%r10 + movq %r11,%r8 + shlq $62,%r11 + shrq $2,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,72(%rdi) + movq %r8,%rax + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + shlq $27,%r9 + movq %r10,%r11 + shlq $56,%r10 + shrq $8,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,80(%rdi) + movq %r11,%rax + movq 72(%rsi),%r8 + movq 80(%rsi),%r9 + shlq $21,%r8 + movq %r9,%r10 + shlq $50,%r9 + shrq $14,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,88(%rdi) + movq %r10,%rax + movq 88(%rsi),%r11 + movq 96(%rsi),%r8 + shlq $15,%r11 + movq %r8,%r9 + shlq $44,%r8 + shrq $20,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,96(%rdi) + movq %r9,%rax + movq 104(%rsi),%r10 + movq 112(%rsi),%r11 + shlq $9,%r10 + movq %r11,%r8 + shlq $38,%r11 + shrq $26,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,104(%rdi) + movq %r8,%rax + movq 120(%rsi),%r9 + movq 128(%rsi),%r10 + movq 136(%rsi),%r11 + shlq $3,%r9 + shlq $32,%r10 + movq %r11,%r8 + shlq $61,%r11 + shrq $3,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,112(%rdi) + movq %r8,%rax + movq 144(%rsi),%r9 + movq 152(%rsi),%r10 + shlq $26,%r9 + movq %r10,%r11 + shlq $55,%r10 + shrq $9,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,120(%rdi) + movq %r11,%rax + .byte 0xf3,0xc3 +.cfi_endproc +.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 + +.globl rsaz_1024_norm2red_avx2 +.type rsaz_1024_norm2red_avx2,@function +.align 32 +rsaz_1024_norm2red_avx2: +.cfi_startproc + subq $-128,%rdi + movq (%rsi),%r8 + movl $0x1fffffff,%eax + movq 8(%rsi),%r9 + movq %r8,%r11 + shrq $0,%r11 + andq %rax,%r11 + movq %r11,-128(%rdi) + movq %r8,%r10 + shrq $29,%r10 + andq %rax,%r10 + movq %r10,-120(%rdi) + shrdq $58,%r9,%r8 + andq %rax,%r8 + movq %r8,-112(%rdi) + movq 16(%rsi),%r10 + movq %r9,%r8 + shrq $23,%r8 + andq %rax,%r8 + movq %r8,-104(%rdi) + shrdq $52,%r10,%r9 + andq %rax,%r9 + movq %r9,-96(%rdi) + movq 24(%rsi),%r11 + movq %r10,%r9 + shrq $17,%r9 + andq %rax,%r9 + movq %r9,-88(%rdi) + shrdq $46,%r11,%r10 + andq %rax,%r10 + movq %r10,-80(%rdi) + movq 32(%rsi),%r8 
+ movq %r11,%r10 + shrq $11,%r10 + andq %rax,%r10 + movq %r10,-72(%rdi) + shrdq $40,%r8,%r11 + andq %rax,%r11 + movq %r11,-64(%rdi) + movq 40(%rsi),%r9 + movq %r8,%r11 + shrq $5,%r11 + andq %rax,%r11 + movq %r11,-56(%rdi) + movq %r8,%r10 + shrq $34,%r10 + andq %rax,%r10 + movq %r10,-48(%rdi) + shrdq $63,%r9,%r8 + andq %rax,%r8 + movq %r8,-40(%rdi) + movq 48(%rsi),%r10 + movq %r9,%r8 + shrq $28,%r8 + andq %rax,%r8 + movq %r8,-32(%rdi) + shrdq $57,%r10,%r9 + andq %rax,%r9 + movq %r9,-24(%rdi) + movq 56(%rsi),%r11 + movq %r10,%r9 + shrq $22,%r9 + andq %rax,%r9 + movq %r9,-16(%rdi) + shrdq $51,%r11,%r10 + andq %rax,%r10 + movq %r10,-8(%rdi) + movq 64(%rsi),%r8 + movq %r11,%r10 + shrq $16,%r10 + andq %rax,%r10 + movq %r10,0(%rdi) + shrdq $45,%r8,%r11 + andq %rax,%r11 + movq %r11,8(%rdi) + movq 72(%rsi),%r9 + movq %r8,%r11 + shrq $10,%r11 + andq %rax,%r11 + movq %r11,16(%rdi) + shrdq $39,%r9,%r8 + andq %rax,%r8 + movq %r8,24(%rdi) + movq 80(%rsi),%r10 + movq %r9,%r8 + shrq $4,%r8 + andq %rax,%r8 + movq %r8,32(%rdi) + movq %r9,%r11 + shrq $33,%r11 + andq %rax,%r11 + movq %r11,40(%rdi) + shrdq $62,%r10,%r9 + andq %rax,%r9 + movq %r9,48(%rdi) + movq 88(%rsi),%r11 + movq %r10,%r9 + shrq $27,%r9 + andq %rax,%r9 + movq %r9,56(%rdi) + shrdq $56,%r11,%r10 + andq %rax,%r10 + movq %r10,64(%rdi) + movq 96(%rsi),%r8 + movq %r11,%r10 + shrq $21,%r10 + andq %rax,%r10 + movq %r10,72(%rdi) + shrdq $50,%r8,%r11 + andq %rax,%r11 + movq %r11,80(%rdi) + movq 104(%rsi),%r9 + movq %r8,%r11 + shrq $15,%r11 + andq %rax,%r11 + movq %r11,88(%rdi) + shrdq $44,%r9,%r8 + andq %rax,%r8 + movq %r8,96(%rdi) + movq 112(%rsi),%r10 + movq %r9,%r8 + shrq $9,%r8 + andq %rax,%r8 + movq %r8,104(%rdi) + shrdq $38,%r10,%r9 + andq %rax,%r9 + movq %r9,112(%rdi) + movq 120(%rsi),%r11 + movq %r10,%r9 + shrq $3,%r9 + andq %rax,%r9 + movq %r9,120(%rdi) + movq %r10,%r8 + shrq $32,%r8 + andq %rax,%r8 + movq %r8,128(%rdi) + shrdq $61,%r11,%r10 + andq %rax,%r10 + movq %r10,136(%rdi) + xorq %r8,%r8 + movq %r11,%r10 + shrq $26,%r10 + andq %rax,%r10 + movq %r10,144(%rdi) + shrdq $55,%r8,%r11 + andq %rax,%r11 + movq %r11,152(%rdi) + movq %r8,160(%rdi) + movq %r8,168(%rdi) + movq %r8,176(%rdi) + movq %r8,184(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 +.globl rsaz_1024_scatter5_avx2 +.type rsaz_1024_scatter5_avx2,@function +.align 32 rsaz_1024_scatter5_avx2: +.cfi_startproc + vzeroupper + vmovdqu .Lscatter_permd(%rip),%ymm5 + shll $4,%edx + leaq (%rdi,%rdx,1),%rdi + movl $9,%eax + jmp .Loop_scatter_1024 + +.align 32 +.Loop_scatter_1024: + vmovdqu (%rsi),%ymm0 + leaq 32(%rsi),%rsi + vpermd %ymm0,%ymm5,%ymm0 + vmovdqu %xmm0,(%rdi) + leaq 512(%rdi),%rdi + decl %eax + jnz .Loop_scatter_1024 + + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 + +.globl rsaz_1024_gather5_avx2 +.type rsaz_1024_gather5_avx2,@function +.align 32 rsaz_1024_gather5_avx2: -.byte 0x0f,0x0b +.cfi_startproc + vzeroupper + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + leaq -256(%rsp),%rsp + andq $-32,%rsp + leaq .Linc(%rip),%r10 + leaq -128(%rsp),%rax + + vmovd %edx,%xmm4 + vmovdqa (%r10),%ymm0 + vmovdqa 32(%r10),%ymm1 + vmovdqa 64(%r10),%ymm5 + vpbroadcastd %xmm4,%ymm4 + + vpaddd %ymm5,%ymm0,%ymm2 + vpcmpeqd %ymm4,%ymm0,%ymm0 + vpaddd %ymm5,%ymm1,%ymm3 + vpcmpeqd %ymm4,%ymm1,%ymm1 + vmovdqa %ymm0,0+128(%rax) + vpaddd %ymm5,%ymm2,%ymm0 + vpcmpeqd %ymm4,%ymm2,%ymm2 + vmovdqa %ymm1,32+128(%rax) + vpaddd %ymm5,%ymm3,%ymm1 + vpcmpeqd %ymm4,%ymm3,%ymm3 + vmovdqa %ymm2,64+128(%rax) + 
vpaddd %ymm5,%ymm0,%ymm2 + vpcmpeqd %ymm4,%ymm0,%ymm0 + vmovdqa %ymm3,96+128(%rax) + vpaddd %ymm5,%ymm1,%ymm3 + vpcmpeqd %ymm4,%ymm1,%ymm1 + vmovdqa %ymm0,128+128(%rax) + vpaddd %ymm5,%ymm2,%ymm8 + vpcmpeqd %ymm4,%ymm2,%ymm2 + vmovdqa %ymm1,160+128(%rax) + vpaddd %ymm5,%ymm3,%ymm9 + vpcmpeqd %ymm4,%ymm3,%ymm3 + vmovdqa %ymm2,192+128(%rax) + vpaddd %ymm5,%ymm8,%ymm10 + vpcmpeqd %ymm4,%ymm8,%ymm8 + vmovdqa %ymm3,224+128(%rax) + vpaddd %ymm5,%ymm9,%ymm11 + vpcmpeqd %ymm4,%ymm9,%ymm9 + vpaddd %ymm5,%ymm10,%ymm12 + vpcmpeqd %ymm4,%ymm10,%ymm10 + vpaddd %ymm5,%ymm11,%ymm13 + vpcmpeqd %ymm4,%ymm11,%ymm11 + vpaddd %ymm5,%ymm12,%ymm14 + vpcmpeqd %ymm4,%ymm12,%ymm12 + vpaddd %ymm5,%ymm13,%ymm15 + vpcmpeqd %ymm4,%ymm13,%ymm13 + vpcmpeqd %ymm4,%ymm14,%ymm14 + vpcmpeqd %ymm4,%ymm15,%ymm15 + + vmovdqa -32(%r10),%ymm7 + leaq 128(%rsi),%rsi + movl $9,%edx + +.Loop_gather_1024: + vmovdqa 0-128(%rsi),%ymm0 + vmovdqa 32-128(%rsi),%ymm1 + vmovdqa 64-128(%rsi),%ymm2 + vmovdqa 96-128(%rsi),%ymm3 + vpand 0+128(%rax),%ymm0,%ymm0 + vpand 32+128(%rax),%ymm1,%ymm1 + vpand 64+128(%rax),%ymm2,%ymm2 + vpor %ymm0,%ymm1,%ymm4 + vpand 96+128(%rax),%ymm3,%ymm3 + vmovdqa 128-128(%rsi),%ymm0 + vmovdqa 160-128(%rsi),%ymm1 + vpor %ymm2,%ymm3,%ymm5 + vmovdqa 192-128(%rsi),%ymm2 + vmovdqa 224-128(%rsi),%ymm3 + vpand 128+128(%rax),%ymm0,%ymm0 + vpand 160+128(%rax),%ymm1,%ymm1 + vpand 192+128(%rax),%ymm2,%ymm2 + vpor %ymm0,%ymm4,%ymm4 + vpand 224+128(%rax),%ymm3,%ymm3 + vpand 256-128(%rsi),%ymm8,%ymm0 + vpor %ymm1,%ymm5,%ymm5 + vpand 288-128(%rsi),%ymm9,%ymm1 + vpor %ymm2,%ymm4,%ymm4 + vpand 320-128(%rsi),%ymm10,%ymm2 + vpor %ymm3,%ymm5,%ymm5 + vpand 352-128(%rsi),%ymm11,%ymm3 + vpor %ymm0,%ymm4,%ymm4 + vpand 384-128(%rsi),%ymm12,%ymm0 + vpor %ymm1,%ymm5,%ymm5 + vpand 416-128(%rsi),%ymm13,%ymm1 + vpor %ymm2,%ymm4,%ymm4 + vpand 448-128(%rsi),%ymm14,%ymm2 + vpor %ymm3,%ymm5,%ymm5 + vpand 480-128(%rsi),%ymm15,%ymm3 + leaq 512(%rsi),%rsi + vpor %ymm0,%ymm4,%ymm4 + vpor %ymm1,%ymm5,%ymm5 + vpor %ymm2,%ymm4,%ymm4 + vpor %ymm3,%ymm5,%ymm5 + + vpor %ymm5,%ymm4,%ymm4 + vextracti128 $1,%ymm4,%xmm5 + vpor %xmm4,%xmm5,%xmm5 + vpermd %ymm5,%ymm7,%ymm5 + vmovdqu %ymm5,(%rdi) + leaq 32(%rdi),%rdi + decl %edx + jnz .Loop_gather_1024 + + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + vzeroupper + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp .byte 0xf3,0xc3 -.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 +.cfi_endproc +.LSEH_end_rsaz_1024_gather5: +.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 + +.globl rsaz_avx2_eligible +.type rsaz_avx2_eligible,@function +.align 32 +rsaz_avx2_eligible: + movl OPENSSL_ia32cap_P+8(%rip),%eax + movl $524544,%ecx + movl $0,%edx + andl %eax,%ecx + cmpl $524544,%ecx + cmovel %edx,%eax + andl $32,%eax + shrl $5,%eax + .byte 0xf3,0xc3 +.size rsaz_avx2_eligible,.-rsaz_avx2_eligible + +.align 64 +.Land_mask: +.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff +.Lscatter_permd: +.long 0,2,4,6,7,7,7,7 +.Lgather_permd: +.long 0,7,1,7,2,7,3,7 +.Linc: +.long 0,0,0,0, 1,1,1,1 +.long 2,2,2,2, 3,3,3,3 +.long 4,4,4,4, 4,4,4,4 +.align 64 Index: stable/12/secure/lib/libcrypto/amd64/rsaz-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/rsaz-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/rsaz-x86_64.S (revision 364963) @@ -1,1353 +1,2017 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from rsaz-x86_64.pl. 
*/ .text .globl rsaz_512_sqr .type rsaz_512_sqr,@function .align 32 rsaz_512_sqr: .cfi_startproc pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $128+24,%rsp .cfi_adjust_cfa_offset 128+24 .Lsqr_body: .byte 102,72,15,110,202 movq (%rsi),%rdx movq 8(%rsi),%rax movq %rcx,128(%rsp) + movl $0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl $0x80100,%r11d + je .Loop_sqrx jmp .Loop_sqr .align 32 .Loop_sqr: movl %r8d,128+8(%rsp) movq %rdx,%rbx movq %rax,%rbp mulq %rdx movq %rax,%r8 movq 16(%rsi),%rax movq %rdx,%r9 mulq %rbx addq %rax,%r9 movq 24(%rsi),%rax movq %rdx,%r10 adcq $0,%r10 mulq %rbx addq %rax,%r10 movq 32(%rsi),%rax movq %rdx,%r11 adcq $0,%r11 mulq %rbx addq %rax,%r11 movq 40(%rsi),%rax movq %rdx,%r12 adcq $0,%r12 mulq %rbx addq %rax,%r12 movq 48(%rsi),%rax movq %rdx,%r13 adcq $0,%r13 mulq %rbx addq %rax,%r13 movq 56(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 mulq %rbx addq %rax,%r14 movq %rbx,%rax adcq $0,%rdx xorq %rcx,%rcx addq %r8,%r8 movq %rdx,%r15 adcq $0,%rcx mulq %rax addq %r8,%rdx adcq $0,%rcx movq %rax,(%rsp) movq %rdx,8(%rsp) movq 16(%rsi),%rax mulq %rbp addq %rax,%r10 movq 24(%rsi),%rax movq %rdx,%rbx adcq $0,%rbx mulq %rbp addq %rax,%r11 movq 32(%rsi),%rax adcq $0,%rdx addq %rbx,%r11 movq %rdx,%rbx adcq $0,%rbx mulq %rbp addq %rax,%r12 movq 40(%rsi),%rax adcq $0,%rdx addq %rbx,%r12 movq %rdx,%rbx adcq $0,%rbx mulq %rbp addq %rax,%r13 movq 48(%rsi),%rax adcq $0,%rdx addq %rbx,%r13 movq %rdx,%rbx adcq $0,%rbx mulq %rbp addq %rax,%r14 movq 56(%rsi),%rax adcq $0,%rdx addq %rbx,%r14 movq %rdx,%rbx adcq $0,%rbx mulq %rbp addq %rax,%r15 movq %rbp,%rax adcq $0,%rdx addq %rbx,%r15 adcq $0,%rdx xorq %rbx,%rbx addq %r9,%r9 movq %rdx,%r8 adcq %r10,%r10 adcq $0,%rbx mulq %rax addq %rcx,%rax movq 16(%rsi),%rbp addq %rax,%r9 movq 24(%rsi),%rax adcq %rdx,%r10 adcq $0,%rbx movq %r9,16(%rsp) movq %r10,24(%rsp) mulq %rbp addq %rax,%r12 movq 32(%rsi),%rax movq %rdx,%rcx adcq $0,%rcx mulq %rbp addq %rax,%r13 movq 40(%rsi),%rax adcq $0,%rdx addq %rcx,%r13 movq %rdx,%rcx adcq $0,%rcx mulq %rbp addq %rax,%r14 movq 48(%rsi),%rax adcq $0,%rdx addq %rcx,%r14 movq %rdx,%rcx adcq $0,%rcx mulq %rbp addq %rax,%r15 movq 56(%rsi),%rax adcq $0,%rdx addq %rcx,%r15 movq %rdx,%rcx adcq $0,%rcx mulq %rbp addq %rax,%r8 movq %rbp,%rax adcq $0,%rdx addq %rcx,%r8 adcq $0,%rdx xorq %rcx,%rcx addq %r11,%r11 movq %rdx,%r9 adcq %r12,%r12 adcq $0,%rcx mulq %rax addq %rbx,%rax movq 24(%rsi),%r10 addq %rax,%r11 movq 32(%rsi),%rax adcq %rdx,%r12 adcq $0,%rcx movq %r11,32(%rsp) movq %r12,40(%rsp) movq %rax,%r11 mulq %r10 addq %rax,%r14 movq 40(%rsi),%rax movq %rdx,%rbx adcq $0,%rbx movq %rax,%r12 mulq %r10 addq %rax,%r15 movq 48(%rsi),%rax adcq $0,%rdx addq %rbx,%r15 movq %rdx,%rbx adcq $0,%rbx movq %rax,%rbp mulq %r10 addq %rax,%r8 movq 56(%rsi),%rax adcq $0,%rdx addq %rbx,%r8 movq %rdx,%rbx adcq $0,%rbx mulq %r10 addq %rax,%r9 movq %r10,%rax adcq $0,%rdx addq %rbx,%r9 adcq $0,%rdx xorq %rbx,%rbx addq %r13,%r13 movq %rdx,%r10 adcq %r14,%r14 adcq $0,%rbx mulq %rax addq %rcx,%rax addq %rax,%r13 movq %r12,%rax adcq %rdx,%r14 adcq $0,%rbx movq %r13,48(%rsp) movq %r14,56(%rsp) mulq %r11 addq %rax,%r8 movq %rbp,%rax movq %rdx,%rcx adcq $0,%rcx mulq %r11 addq %rax,%r9 movq 56(%rsi),%rax adcq $0,%rdx addq %rcx,%r9 movq %rdx,%rcx 
adcq $0,%rcx movq %rax,%r14 mulq %r11 addq %rax,%r10 movq %r11,%rax adcq $0,%rdx addq %rcx,%r10 adcq $0,%rdx xorq %rcx,%rcx addq %r15,%r15 movq %rdx,%r11 adcq %r8,%r8 adcq $0,%rcx mulq %rax addq %rbx,%rax addq %rax,%r15 movq %rbp,%rax adcq %rdx,%r8 adcq $0,%rcx movq %r15,64(%rsp) movq %r8,72(%rsp) mulq %r12 addq %rax,%r10 movq %r14,%rax movq %rdx,%rbx adcq $0,%rbx mulq %r12 addq %rax,%r11 movq %r12,%rax adcq $0,%rdx addq %rbx,%r11 adcq $0,%rdx xorq %rbx,%rbx addq %r9,%r9 movq %rdx,%r12 adcq %r10,%r10 adcq $0,%rbx mulq %rax addq %rcx,%rax addq %rax,%r9 movq %r14,%rax adcq %rdx,%r10 adcq $0,%rbx movq %r9,80(%rsp) movq %r10,88(%rsp) mulq %rbp addq %rax,%r12 movq %rbp,%rax adcq $0,%rdx xorq %rcx,%rcx addq %r11,%r11 movq %rdx,%r13 adcq %r12,%r12 adcq $0,%rcx mulq %rax addq %rbx,%rax addq %rax,%r11 movq %r14,%rax adcq %rdx,%r12 adcq $0,%rcx movq %r11,96(%rsp) movq %r12,104(%rsp) xorq %rbx,%rbx addq %r13,%r13 adcq $0,%rbx mulq %rax addq %rcx,%rax addq %r13,%rax adcq %rbx,%rdx movq (%rsp),%r8 movq 8(%rsp),%r9 movq 16(%rsp),%r10 movq 24(%rsp),%r11 movq 32(%rsp),%r12 movq 40(%rsp),%r13 movq 48(%rsp),%r14 movq 56(%rsp),%r15 .byte 102,72,15,126,205 movq %rax,112(%rsp) movq %rdx,120(%rsp) call __rsaz_512_reduce addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 adcq 88(%rsp),%r11 adcq 96(%rsp),%r12 adcq 104(%rsp),%r13 adcq 112(%rsp),%r14 adcq 120(%rsp),%r15 sbbq %rcx,%rcx call __rsaz_512_subtract movq %r8,%rdx movq %r9,%rax movl 128+8(%rsp),%r8d movq %rdi,%rsi decl %r8d jnz .Loop_sqr + jmp .Lsqr_tail +.align 32 +.Loop_sqrx: + movl %r8d,128+8(%rsp) +.byte 102,72,15,110,199 + + mulxq %rax,%r8,%r9 + movq %rax,%rbx + + mulxq 16(%rsi),%rcx,%r10 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r11 + adcxq %rcx,%r9 + +.byte 0xc4,0x62,0xf3,0xf6,0xa6,0x20,0x00,0x00,0x00 + adcxq %rax,%r10 + +.byte 0xc4,0x62,0xfb,0xf6,0xae,0x28,0x00,0x00,0x00 + adcxq %rcx,%r11 + + mulxq 48(%rsi),%rcx,%r14 + adcxq %rax,%r12 + adcxq %rcx,%r13 + + mulxq 56(%rsi),%rax,%r15 + adcxq %rax,%r14 + adcxq %rbp,%r15 + + mulxq %rdx,%rax,%rdi + movq %rbx,%rdx + xorq %rcx,%rcx + adoxq %r8,%r8 + adcxq %rdi,%r8 + adoxq %rbp,%rcx + adcxq %rbp,%rcx + + movq %rax,(%rsp) + movq %r8,8(%rsp) + + +.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x10,0x00,0x00,0x00 + adoxq %rax,%r10 + adcxq %rbx,%r11 + + mulxq 24(%rsi),%rdi,%r8 + adoxq %rdi,%r11 +.byte 0x66 + adcxq %r8,%r12 + + mulxq 32(%rsi),%rax,%rbx + adoxq %rax,%r12 + adcxq %rbx,%r13 + + mulxq 40(%rsi),%rdi,%r8 + adoxq %rdi,%r13 + adcxq %r8,%r14 + +.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 + adoxq %rax,%r14 + adcxq %rbx,%r15 + +.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 + adoxq %rdi,%r15 + adcxq %rbp,%r8 + mulxq %rdx,%rax,%rdi + adoxq %rbp,%r8 +.byte 0x48,0x8b,0x96,0x10,0x00,0x00,0x00 + + xorq %rbx,%rbx + adoxq %r9,%r9 + + adcxq %rcx,%rax + adoxq %r10,%r10 + adcxq %rax,%r9 + adoxq %rbp,%rbx + adcxq %rdi,%r10 + adcxq %rbp,%rbx + + movq %r9,16(%rsp) +.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 + + + mulxq 24(%rsi),%rdi,%r9 + adoxq %rdi,%r12 + adcxq %r9,%r13 + + mulxq 32(%rsi),%rax,%rcx + adoxq %rax,%r13 + adcxq %rcx,%r14 + +.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x28,0x00,0x00,0x00 + adoxq %rdi,%r14 + adcxq %r9,%r15 + +.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 + adoxq %rax,%r15 + adcxq %rcx,%r8 + + mulxq 56(%rsi),%rdi,%r9 + adoxq %rdi,%r8 + adcxq %rbp,%r9 + mulxq %rdx,%rax,%rdi + adoxq %rbp,%r9 + movq 24(%rsi),%rdx + + xorq %rcx,%rcx + adoxq %r11,%r11 + + adcxq %rbx,%rax + adoxq %r12,%r12 + adcxq %rax,%r11 + adoxq %rbp,%rcx + adcxq %rdi,%r12 + adcxq %rbp,%rcx + + movq %r11,32(%rsp) + movq %r12,40(%rsp) 
+ + + mulxq 32(%rsi),%rax,%rbx + adoxq %rax,%r14 + adcxq %rbx,%r15 + + mulxq 40(%rsi),%rdi,%r10 + adoxq %rdi,%r15 + adcxq %r10,%r8 + + mulxq 48(%rsi),%rax,%rbx + adoxq %rax,%r8 + adcxq %rbx,%r9 + + mulxq 56(%rsi),%rdi,%r10 + adoxq %rdi,%r9 + adcxq %rbp,%r10 + mulxq %rdx,%rax,%rdi + adoxq %rbp,%r10 + movq 32(%rsi),%rdx + + xorq %rbx,%rbx + adoxq %r13,%r13 + + adcxq %rcx,%rax + adoxq %r14,%r14 + adcxq %rax,%r13 + adoxq %rbp,%rbx + adcxq %rdi,%r14 + adcxq %rbp,%rbx + + movq %r13,48(%rsp) + movq %r14,56(%rsp) + + + mulxq 40(%rsi),%rdi,%r11 + adoxq %rdi,%r8 + adcxq %r11,%r9 + + mulxq 48(%rsi),%rax,%rcx + adoxq %rax,%r9 + adcxq %rcx,%r10 + + mulxq 56(%rsi),%rdi,%r11 + adoxq %rdi,%r10 + adcxq %rbp,%r11 + mulxq %rdx,%rax,%rdi + movq 40(%rsi),%rdx + adoxq %rbp,%r11 + + xorq %rcx,%rcx + adoxq %r15,%r15 + + adcxq %rbx,%rax + adoxq %r8,%r8 + adcxq %rax,%r15 + adoxq %rbp,%rcx + adcxq %rdi,%r8 + adcxq %rbp,%rcx + + movq %r15,64(%rsp) + movq %r8,72(%rsp) + + +.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 + adoxq %rax,%r10 + adcxq %rbx,%r11 + +.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 + adoxq %rdi,%r11 + adcxq %rbp,%r12 + mulxq %rdx,%rax,%rdi + adoxq %rbp,%r12 + movq 48(%rsi),%rdx + + xorq %rbx,%rbx + adoxq %r9,%r9 + + adcxq %rcx,%rax + adoxq %r10,%r10 + adcxq %rax,%r9 + adcxq %rdi,%r10 + adoxq %rbp,%rbx + adcxq %rbp,%rbx + + movq %r9,80(%rsp) + movq %r10,88(%rsp) + + +.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 + adoxq %rax,%r12 + adoxq %rbp,%r13 + + mulxq %rdx,%rax,%rdi + xorq %rcx,%rcx + movq 56(%rsi),%rdx + adoxq %r11,%r11 + + adcxq %rbx,%rax + adoxq %r12,%r12 + adcxq %rax,%r11 + adoxq %rbp,%rcx + adcxq %rdi,%r12 + adcxq %rbp,%rcx + +.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 +.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 + + + mulxq %rdx,%rax,%rdx + xorq %rbx,%rbx + adoxq %r13,%r13 + + adcxq %rcx,%rax + adoxq %rbp,%rbx + adcxq %r13,%rax + adcxq %rdx,%rbx + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq 128(%rsp),%rdx + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + movq %rax,112(%rsp) + movq %rbx,120(%rsp) + + call __rsaz_512_reducex + + addq 64(%rsp),%r8 + adcq 72(%rsp),%r9 + adcq 80(%rsp),%r10 + adcq 88(%rsp),%r11 + adcq 96(%rsp),%r12 + adcq 104(%rsp),%r13 + adcq 112(%rsp),%r14 + adcq 120(%rsp),%r15 + sbbq %rcx,%rcx + + call __rsaz_512_subtract + + movq %r8,%rdx + movq %r9,%rax + movl 128+8(%rsp),%r8d + movq %rdi,%rsi + + decl %r8d + jnz .Loop_sqrx + +.Lsqr_tail: + leaq 128+24+48(%rsp),%rax .cfi_def_cfa %rax,8 movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lsqr_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size rsaz_512_sqr,.-rsaz_512_sqr .globl rsaz_512_mul .type rsaz_512_mul,@function .align 32 rsaz_512_mul: .cfi_startproc pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $128+24,%rsp .cfi_adjust_cfa_offset 128+24 .Lmul_body: .byte 102,72,15,110,199 .byte 102,72,15,110,201 movq %r8,128(%rsp) + movl $0x80100,%r11d + andl 
OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl $0x80100,%r11d + je .Lmulx movq (%rdx),%rbx movq %rdx,%rbp call __rsaz_512_mul .byte 102,72,15,126,199 .byte 102,72,15,126,205 movq (%rsp),%r8 movq 8(%rsp),%r9 movq 16(%rsp),%r10 movq 24(%rsp),%r11 movq 32(%rsp),%r12 movq 40(%rsp),%r13 movq 48(%rsp),%r14 movq 56(%rsp),%r15 call __rsaz_512_reduce + jmp .Lmul_tail + +.align 32 +.Lmulx: + movq %rdx,%rbp + movq (%rdx),%rdx + call __rsaz_512_mulx + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq 128(%rsp),%rdx + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reducex +.Lmul_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 adcq 88(%rsp),%r11 adcq 96(%rsp),%r12 adcq 104(%rsp),%r13 adcq 112(%rsp),%r14 adcq 120(%rsp),%r15 sbbq %rcx,%rcx call __rsaz_512_subtract leaq 128+24+48(%rsp),%rax .cfi_def_cfa %rax,8 movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lmul_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size rsaz_512_mul,.-rsaz_512_mul .globl rsaz_512_mul_gather4 .type rsaz_512_mul_gather4,@function .align 32 rsaz_512_mul_gather4: .cfi_startproc pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $152,%rsp .cfi_adjust_cfa_offset 152 .Lmul_gather4_body: movd %r9d,%xmm8 movdqa .Linc+16(%rip),%xmm1 movdqa .Linc(%rip),%xmm0 pshufd $0,%xmm8,%xmm8 movdqa %xmm1,%xmm7 movdqa %xmm1,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm8,%xmm0 movdqa %xmm7,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm8,%xmm1 movdqa %xmm7,%xmm4 paddd %xmm2,%xmm3 pcmpeqd %xmm8,%xmm2 movdqa %xmm7,%xmm5 paddd %xmm3,%xmm4 pcmpeqd %xmm8,%xmm3 movdqa %xmm7,%xmm6 paddd %xmm4,%xmm5 pcmpeqd %xmm8,%xmm4 paddd %xmm5,%xmm6 pcmpeqd %xmm8,%xmm5 paddd %xmm6,%xmm7 pcmpeqd %xmm8,%xmm6 pcmpeqd %xmm8,%xmm7 movdqa 0(%rdx),%xmm8 movdqa 16(%rdx),%xmm9 movdqa 32(%rdx),%xmm10 movdqa 48(%rdx),%xmm11 pand %xmm0,%xmm8 movdqa 64(%rdx),%xmm12 pand %xmm1,%xmm9 movdqa 80(%rdx),%xmm13 pand %xmm2,%xmm10 movdqa 96(%rdx),%xmm14 pand %xmm3,%xmm11 movdqa 112(%rdx),%xmm15 leaq 128(%rdx),%rbp pand %xmm4,%xmm12 pand %xmm5,%xmm13 pand %xmm6,%xmm14 pand %xmm7,%xmm15 por %xmm10,%xmm8 por %xmm11,%xmm9 por %xmm12,%xmm8 por %xmm13,%xmm9 por %xmm14,%xmm8 por %xmm15,%xmm9 por %xmm9,%xmm8 pshufd $0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 + movl $0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl $0x80100,%r11d + je .Lmulx_gather .byte 102,76,15,126,195 movq %r8,128(%rsp) movq %rdi,128+8(%rsp) movq %rcx,128+16(%rsp) movq (%rsi),%rax movq 8(%rsi),%rcx mulq %rbx movq %rax,(%rsp) movq %rcx,%rax movq %rdx,%r8 mulq %rbx addq %rax,%r8 movq 16(%rsi),%rax movq %rdx,%r9 adcq $0,%r9 mulq %rbx addq %rax,%r9 movq 24(%rsi),%rax movq %rdx,%r10 adcq $0,%r10 mulq %rbx addq %rax,%r10 movq 32(%rsi),%rax movq %rdx,%r11 adcq $0,%r11 mulq %rbx addq %rax,%r11 movq 40(%rsi),%rax movq %rdx,%r12 adcq $0,%r12 mulq %rbx addq %rax,%r12 movq 48(%rsi),%rax movq %rdx,%r13 adcq $0,%r13 mulq %rbx addq %rax,%r13 movq 56(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 mulq %rbx addq %rax,%r14 movq 
(%rsi),%rax movq %rdx,%r15 adcq $0,%r15 leaq 8(%rsp),%rdi movl $7,%ecx jmp .Loop_mul_gather .align 32 .Loop_mul_gather: movdqa 0(%rbp),%xmm8 movdqa 16(%rbp),%xmm9 movdqa 32(%rbp),%xmm10 movdqa 48(%rbp),%xmm11 pand %xmm0,%xmm8 movdqa 64(%rbp),%xmm12 pand %xmm1,%xmm9 movdqa 80(%rbp),%xmm13 pand %xmm2,%xmm10 movdqa 96(%rbp),%xmm14 pand %xmm3,%xmm11 movdqa 112(%rbp),%xmm15 leaq 128(%rbp),%rbp pand %xmm4,%xmm12 pand %xmm5,%xmm13 pand %xmm6,%xmm14 pand %xmm7,%xmm15 por %xmm10,%xmm8 por %xmm11,%xmm9 por %xmm12,%xmm8 por %xmm13,%xmm9 por %xmm14,%xmm8 por %xmm15,%xmm9 por %xmm9,%xmm8 pshufd $0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 .byte 102,76,15,126,195 mulq %rbx addq %rax,%r8 movq 8(%rsi),%rax movq %r8,(%rdi) movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 movq 16(%rsi),%rax adcq $0,%rdx addq %r9,%r8 movq %rdx,%r9 adcq $0,%r9 mulq %rbx addq %rax,%r10 movq 24(%rsi),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 adcq $0,%r10 mulq %rbx addq %rax,%r11 movq 32(%rsi),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 adcq $0,%r11 mulq %rbx addq %rax,%r12 movq 40(%rsi),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 adcq $0,%r12 mulq %rbx addq %rax,%r13 movq 48(%rsi),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 adcq $0,%r13 mulq %rbx addq %rax,%r14 movq 56(%rsi),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 adcq $0,%r14 mulq %rbx addq %rax,%r15 movq (%rsi),%rax adcq $0,%rdx addq %r15,%r14 movq %rdx,%r15 adcq $0,%r15 leaq 8(%rdi),%rdi decl %ecx jnz .Loop_mul_gather movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) movq 128+8(%rsp),%rdi movq 128+16(%rsp),%rbp movq (%rsp),%r8 movq 8(%rsp),%r9 movq 16(%rsp),%r10 movq 24(%rsp),%r11 movq 32(%rsp),%r12 movq 40(%rsp),%r13 movq 48(%rsp),%r14 movq 56(%rsp),%r15 call __rsaz_512_reduce + jmp .Lmul_gather_tail + +.align 32 +.Lmulx_gather: +.byte 102,76,15,126,194 + + movq %r8,128(%rsp) + movq %rdi,128+8(%rsp) + movq %rcx,128+16(%rsp) + + mulxq (%rsi),%rbx,%r8 + movq %rbx,(%rsp) + xorl %edi,%edi + + mulxq 8(%rsi),%rax,%r9 + + mulxq 16(%rsi),%rbx,%r10 + adcxq %rax,%r8 + + mulxq 24(%rsi),%rax,%r11 + adcxq %rbx,%r9 + + mulxq 32(%rsi),%rbx,%r12 + adcxq %rax,%r10 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rbx,%r11 + + mulxq 48(%rsi),%rbx,%r14 + adcxq %rax,%r12 + + mulxq 56(%rsi),%rax,%r15 + adcxq %rbx,%r13 + adcxq %rax,%r14 +.byte 0x67 + movq %r8,%rbx + adcxq %rdi,%r15 + + movq $-7,%rcx + jmp .Loop_mulx_gather + +.align 32 +.Loop_mulx_gather: + movdqa 0(%rbp),%xmm8 + movdqa 16(%rbp),%xmm9 + movdqa 32(%rbp),%xmm10 + movdqa 48(%rbp),%xmm11 + pand %xmm0,%xmm8 + movdqa 64(%rbp),%xmm12 + pand %xmm1,%xmm9 + movdqa 80(%rbp),%xmm13 + pand %xmm2,%xmm10 + movdqa 96(%rbp),%xmm14 + pand %xmm3,%xmm11 + movdqa 112(%rbp),%xmm15 + leaq 128(%rbp),%rbp + pand %xmm4,%xmm12 + pand %xmm5,%xmm13 + pand %xmm6,%xmm14 + pand %xmm7,%xmm15 + por %xmm10,%xmm8 + por %xmm11,%xmm9 + por %xmm12,%xmm8 + por %xmm13,%xmm9 + por %xmm14,%xmm8 + por %xmm15,%xmm9 + + por %xmm9,%xmm8 + pshufd $0x4e,%xmm8,%xmm9 + por %xmm9,%xmm8 +.byte 102,76,15,126,194 + +.byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rsi),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rsi),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + +.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 + adcxq %rax,%r10 + adoxq %r12,%r11 + + mulxq 32(%rsi),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + +.byte 
0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 + adcxq %rax,%r13 +.byte 0x67 + adoxq %r15,%r14 + + mulxq 56(%rsi),%rax,%r15 + movq %rbx,64(%rsp,%rcx,8) + adcxq %rax,%r14 + adoxq %rdi,%r15 + movq %r8,%rbx + adcxq %rdi,%r15 + + incq %rcx + jnz .Loop_mulx_gather + + movq %r8,64(%rsp) + movq %r9,64+8(%rsp) + movq %r10,64+16(%rsp) + movq %r11,64+24(%rsp) + movq %r12,64+32(%rsp) + movq %r13,64+40(%rsp) + movq %r14,64+48(%rsp) + movq %r15,64+56(%rsp) + + movq 128(%rsp),%rdx + movq 128+8(%rsp),%rdi + movq 128+16(%rsp),%rbp + + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reducex + +.Lmul_gather_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 adcq 88(%rsp),%r11 adcq 96(%rsp),%r12 adcq 104(%rsp),%r13 adcq 112(%rsp),%r14 adcq 120(%rsp),%r15 sbbq %rcx,%rcx call __rsaz_512_subtract leaq 128+24+48(%rsp),%rax .cfi_def_cfa %rax,8 movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lmul_gather4_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4 .globl rsaz_512_mul_scatter4 .type rsaz_512_mul_scatter4,@function .align 32 rsaz_512_mul_scatter4: .cfi_startproc pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 movl %r9d,%r9d subq $128+24,%rsp .cfi_adjust_cfa_offset 128+24 .Lmul_scatter4_body: leaq (%r8,%r9,8),%r8 .byte 102,72,15,110,199 .byte 102,72,15,110,202 .byte 102,73,15,110,208 movq %rcx,128(%rsp) movq %rdi,%rbp + movl $0x80100,%r11d + andl OPENSSL_ia32cap_P+8(%rip),%r11d + cmpl $0x80100,%r11d + je .Lmulx_scatter movq (%rdi),%rbx call __rsaz_512_mul .byte 102,72,15,126,199 .byte 102,72,15,126,205 movq (%rsp),%r8 movq 8(%rsp),%r9 movq 16(%rsp),%r10 movq 24(%rsp),%r11 movq 32(%rsp),%r12 movq 40(%rsp),%r13 movq 48(%rsp),%r14 movq 56(%rsp),%r15 call __rsaz_512_reduce + jmp .Lmul_scatter_tail + +.align 32 +.Lmulx_scatter: + movq (%rdi),%rdx + call __rsaz_512_mulx + +.byte 102,72,15,126,199 +.byte 102,72,15,126,205 + + movq 128(%rsp),%rdx + movq (%rsp),%r8 + movq 8(%rsp),%r9 + movq 16(%rsp),%r10 + movq 24(%rsp),%r11 + movq 32(%rsp),%r12 + movq 40(%rsp),%r13 + movq 48(%rsp),%r14 + movq 56(%rsp),%r15 + + call __rsaz_512_reducex + +.Lmul_scatter_tail: addq 64(%rsp),%r8 adcq 72(%rsp),%r9 adcq 80(%rsp),%r10 adcq 88(%rsp),%r11 adcq 96(%rsp),%r12 adcq 104(%rsp),%r13 adcq 112(%rsp),%r14 adcq 120(%rsp),%r15 .byte 102,72,15,126,214 sbbq %rcx,%rcx call __rsaz_512_subtract movq %r8,0(%rsi) movq %r9,128(%rsi) movq %r10,256(%rsi) movq %r11,384(%rsi) movq %r12,512(%rsi) movq %r13,640(%rsi) movq %r14,768(%rsi) movq %r15,896(%rsi) leaq 128+24+48(%rsp),%rax .cfi_def_cfa %rax,8 movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lmul_scatter4_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size 
rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4 .globl rsaz_512_mul_by_one .type rsaz_512_mul_by_one,@function .align 32 rsaz_512_mul_by_one: .cfi_startproc pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-16 pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 subq $128+24,%rsp .cfi_adjust_cfa_offset 128+24 .Lmul_by_one_body: + movl OPENSSL_ia32cap_P+8(%rip),%eax movq %rdx,%rbp movq %rcx,128(%rsp) movq (%rsi),%r8 pxor %xmm0,%xmm0 movq 8(%rsi),%r9 movq 16(%rsi),%r10 movq 24(%rsi),%r11 movq 32(%rsi),%r12 movq 40(%rsi),%r13 movq 48(%rsi),%r14 movq 56(%rsi),%r15 movdqa %xmm0,(%rsp) movdqa %xmm0,16(%rsp) movdqa %xmm0,32(%rsp) movdqa %xmm0,48(%rsp) movdqa %xmm0,64(%rsp) movdqa %xmm0,80(%rsp) movdqa %xmm0,96(%rsp) + andl $0x80100,%eax + cmpl $0x80100,%eax + je .Lby_one_callx call __rsaz_512_reduce + jmp .Lby_one_tail +.align 32 +.Lby_one_callx: + movq 128(%rsp),%rdx + call __rsaz_512_reducex +.Lby_one_tail: movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) leaq 128+24+48(%rsp),%rax .cfi_def_cfa %rax,8 movq -48(%rax),%r15 .cfi_restore %r15 movq -40(%rax),%r14 .cfi_restore %r14 movq -32(%rax),%r13 .cfi_restore %r13 movq -24(%rax),%r12 .cfi_restore %r12 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lmul_by_one_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one .type __rsaz_512_reduce,@function .align 32 __rsaz_512_reduce: .cfi_startproc movq %r8,%rbx imulq 128+8(%rsp),%rbx movq 0(%rbp),%rax movl $8,%ecx jmp .Lreduction_loop .align 32 .Lreduction_loop: mulq %rbx movq 8(%rbp),%rax negq %r8 movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 movq %rdx,%r9 adcq $0,%r9 mulq %rbx addq %rax,%r10 movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 adcq $0,%r10 mulq %rbx addq %rax,%r11 movq 32(%rbp),%rax adcq $0,%rdx addq %r11,%r10 movq 128+8(%rsp),%rsi adcq $0,%rdx movq %rdx,%r11 mulq %rbx addq %rax,%r12 movq 40(%rbp),%rax adcq $0,%rdx imulq %r8,%rsi addq %r12,%r11 movq %rdx,%r12 adcq $0,%r12 mulq %rbx addq %rax,%r13 movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 adcq $0,%r13 mulq %rbx addq %rax,%r14 movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 adcq $0,%r14 mulq %rbx movq %rsi,%rbx addq %rax,%r15 movq 0(%rbp),%rax adcq $0,%rdx addq %r15,%r14 movq %rdx,%r15 adcq $0,%r15 decl %ecx jne .Lreduction_loop .byte 0xf3,0xc3 .cfi_endproc .size __rsaz_512_reduce,.-__rsaz_512_reduce +.type __rsaz_512_reducex,@function +.align 32 +__rsaz_512_reducex: +.cfi_startproc + + imulq %r8,%rdx + xorq %rsi,%rsi + movl $8,%ecx + jmp .Lreduction_loopx + +.align 32 +.Lreduction_loopx: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 128+8(%rsp),%rbx,%rdx + movq %rax,%rdx + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + +.byte 
0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + + decl %ecx + jne .Lreduction_loopx + + .byte 0xf3,0xc3 +.cfi_endproc +.size __rsaz_512_reducex,.-__rsaz_512_reducex .type __rsaz_512_subtract,@function .align 32 __rsaz_512_subtract: .cfi_startproc movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) movq 0(%rbp),%r8 movq 8(%rbp),%r9 negq %r8 notq %r9 andq %rcx,%r8 movq 16(%rbp),%r10 andq %rcx,%r9 notq %r10 movq 24(%rbp),%r11 andq %rcx,%r10 notq %r11 movq 32(%rbp),%r12 andq %rcx,%r11 notq %r12 movq 40(%rbp),%r13 andq %rcx,%r12 notq %r13 movq 48(%rbp),%r14 andq %rcx,%r13 notq %r14 movq 56(%rbp),%r15 andq %rcx,%r14 notq %r15 andq %rcx,%r15 addq (%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) .byte 0xf3,0xc3 .cfi_endproc .size __rsaz_512_subtract,.-__rsaz_512_subtract .type __rsaz_512_mul,@function .align 32 __rsaz_512_mul: .cfi_startproc leaq 8(%rsp),%rdi movq (%rsi),%rax mulq %rbx movq %rax,(%rdi) movq 8(%rsi),%rax movq %rdx,%r8 mulq %rbx addq %rax,%r8 movq 16(%rsi),%rax movq %rdx,%r9 adcq $0,%r9 mulq %rbx addq %rax,%r9 movq 24(%rsi),%rax movq %rdx,%r10 adcq $0,%r10 mulq %rbx addq %rax,%r10 movq 32(%rsi),%rax movq %rdx,%r11 adcq $0,%r11 mulq %rbx addq %rax,%r11 movq 40(%rsi),%rax movq %rdx,%r12 adcq $0,%r12 mulq %rbx addq %rax,%r12 movq 48(%rsi),%rax movq %rdx,%r13 adcq $0,%r13 mulq %rbx addq %rax,%r13 movq 56(%rsi),%rax movq %rdx,%r14 adcq $0,%r14 mulq %rbx addq %rax,%r14 movq (%rsi),%rax movq %rdx,%r15 adcq $0,%r15 leaq 8(%rbp),%rbp leaq 8(%rdi),%rdi movl $7,%ecx jmp .Loop_mul .align 32 .Loop_mul: movq (%rbp),%rbx mulq %rbx addq %rax,%r8 movq 8(%rsi),%rax movq %r8,(%rdi) movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 movq 16(%rsi),%rax adcq $0,%rdx addq %r9,%r8 movq %rdx,%r9 adcq $0,%r9 mulq %rbx addq %rax,%r10 movq 24(%rsi),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 adcq $0,%r10 mulq %rbx addq %rax,%r11 movq 32(%rsi),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 adcq $0,%r11 mulq %rbx addq %rax,%r12 movq 40(%rsi),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 adcq $0,%r12 mulq %rbx addq %rax,%r13 movq 48(%rsi),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 adcq $0,%r13 mulq %rbx addq %rax,%r14 movq 56(%rsi),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 leaq 8(%rbp),%rbp adcq $0,%r14 mulq %rbx addq %rax,%r15 movq (%rsi),%rax adcq $0,%rdx addq %r15,%r14 movq %rdx,%r15 adcq $0,%r15 leaq 8(%rdi),%rdi decl %ecx jnz .Loop_mul movq %r8,(%rdi) movq %r9,8(%rdi) movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) .byte 0xf3,0xc3 .cfi_endproc .size __rsaz_512_mul,.-__rsaz_512_mul +.type __rsaz_512_mulx,@function +.align 32 +__rsaz_512_mulx: +.cfi_startproc + mulxq (%rsi),%rbx,%r8 + movq $-6,%rcx + + mulxq 8(%rsi),%rax,%r9 + movq %rbx,8(%rsp) + + mulxq 16(%rsi),%rbx,%r10 + adcq %rax,%r8 + + mulxq 24(%rsi),%rax,%r11 + adcq %rbx,%r9 + + mulxq 32(%rsi),%rbx,%r12 + adcq %rax,%r10 + + mulxq 40(%rsi),%rax,%r13 + adcq %rbx,%r11 + + mulxq 48(%rsi),%rbx,%r14 + adcq %rax,%r12 + + mulxq 56(%rsi),%rax,%r15 + movq 8(%rbp),%rdx + adcq %rbx,%r13 + adcq %rax,%r14 + adcq $0,%r15 + + 
xorq %rdi,%rdi + jmp .Loop_mulx + +.align 32 +.Loop_mulx: + movq %r8,%rbx + mulxq (%rsi),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rsi),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rsi),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rsi),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rsi),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rsi),%rax,%r15 + movq 64(%rbp,%rcx,8),%rdx + movq %rbx,8+64-8(%rsp,%rcx,8) + adcxq %rax,%r14 + adoxq %rdi,%r15 + adcxq %rdi,%r15 + + incq %rcx + jnz .Loop_mulx + + movq %r8,%rbx + mulxq (%rsi),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + +.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 + adcxq %rax,%r8 + adoxq %r10,%r9 + +.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rsi),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + + mulxq 32(%rsi),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rsi),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + +.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 + adcxq %rax,%r14 + adoxq %rdi,%r15 + adcxq %rdi,%r15 + + movq %rbx,8+64-8(%rsp) + movq %r8,8+64(%rsp) + movq %r9,8+64+8(%rsp) + movq %r10,8+64+16(%rsp) + movq %r11,8+64+24(%rsp) + movq %r12,8+64+32(%rsp) + movq %r13,8+64+40(%rsp) + movq %r14,8+64+48(%rsp) + movq %r15,8+64+56(%rsp) + + .byte 0xf3,0xc3 +.cfi_endproc +.size __rsaz_512_mulx,.-__rsaz_512_mulx .globl rsaz_512_scatter4 .type rsaz_512_scatter4,@function .align 16 rsaz_512_scatter4: .cfi_startproc leaq (%rdi,%rdx,8),%rdi movl $8,%r9d jmp .Loop_scatter .align 16 .Loop_scatter: movq (%rsi),%rax leaq 8(%rsi),%rsi movq %rax,(%rdi) leaq 128(%rdi),%rdi decl %r9d jnz .Loop_scatter .byte 0xf3,0xc3 .cfi_endproc .size rsaz_512_scatter4,.-rsaz_512_scatter4 .globl rsaz_512_gather4 .type rsaz_512_gather4,@function .align 16 rsaz_512_gather4: .cfi_startproc movd %edx,%xmm8 movdqa .Linc+16(%rip),%xmm1 movdqa .Linc(%rip),%xmm0 pshufd $0,%xmm8,%xmm8 movdqa %xmm1,%xmm7 movdqa %xmm1,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm8,%xmm0 movdqa %xmm7,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm8,%xmm1 movdqa %xmm7,%xmm4 paddd %xmm2,%xmm3 pcmpeqd %xmm8,%xmm2 movdqa %xmm7,%xmm5 paddd %xmm3,%xmm4 pcmpeqd %xmm8,%xmm3 movdqa %xmm7,%xmm6 paddd %xmm4,%xmm5 pcmpeqd %xmm8,%xmm4 paddd %xmm5,%xmm6 pcmpeqd %xmm8,%xmm5 paddd %xmm6,%xmm7 pcmpeqd %xmm8,%xmm6 pcmpeqd %xmm8,%xmm7 movl $8,%r9d jmp .Loop_gather .align 16 .Loop_gather: movdqa 0(%rsi),%xmm8 movdqa 16(%rsi),%xmm9 movdqa 32(%rsi),%xmm10 movdqa 48(%rsi),%xmm11 pand %xmm0,%xmm8 movdqa 64(%rsi),%xmm12 pand %xmm1,%xmm9 movdqa 80(%rsi),%xmm13 pand %xmm2,%xmm10 movdqa 96(%rsi),%xmm14 pand %xmm3,%xmm11 movdqa 112(%rsi),%xmm15 leaq 128(%rsi),%rsi pand %xmm4,%xmm12 pand %xmm5,%xmm13 pand %xmm6,%xmm14 pand %xmm7,%xmm15 por %xmm10,%xmm8 por %xmm11,%xmm9 por %xmm12,%xmm8 por %xmm13,%xmm9 por %xmm14,%xmm8 por %xmm15,%xmm9 por %xmm9,%xmm8 pshufd $0x4e,%xmm8,%xmm9 por %xmm9,%xmm8 movq %xmm8,(%rdi) leaq 8(%rdi),%rdi decl %r9d jnz .Loop_gather .byte 0xf3,0xc3 .LSEH_end_rsaz_512_gather4: .cfi_endproc .size rsaz_512_gather4,.-rsaz_512_gather4 .align 64 .Linc: .long 0,0, 1,1 .long 2,2, 2,2 Index: stable/12/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S =================================================================== --- 
stable/12/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/sha1-mb-x86_64.S (revision 364963) @@ -1,2954 +1,7269 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from sha1-mb-x86_64.pl. */ .text .globl sha1_multi_block .type sha1_multi_block,@function .align 32 sha1_multi_block: .cfi_startproc movq OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut + testl $268435456,%ecx + jnz _avx_shortcut movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbx,-24 subq $288,%rsp andq $-256,%rsp movq %rax,272(%rsp) .cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 .Lbody: leaq K_XX_XX(%rip),%rbp leaq 256(%rsp),%rbx .Loop_grande: movl %edx,280(%rsp) xorl %edx,%edx movq 0(%rsi),%r8 movl 8(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,0(%rbx) cmovleq %rbp,%r8 movq 16(%rsi),%r9 movl 24(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,4(%rbx) cmovleq %rbp,%r9 movq 32(%rsi),%r10 movl 40(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,8(%rbx) cmovleq %rbp,%r10 movq 48(%rsi),%r11 movl 56(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,12(%rbx) cmovleq %rbp,%r11 testl %edx,%edx jz .Ldone movdqu 0(%rdi),%xmm10 leaq 128(%rsp),%rax movdqu 32(%rdi),%xmm11 movdqu 64(%rdi),%xmm12 movdqu 96(%rdi),%xmm13 movdqu 128(%rdi),%xmm14 movdqa 96(%rbp),%xmm5 movdqa -32(%rbp),%xmm15 jmp .Loop .align 32 .Loop: movd (%r8),%xmm0 leaq 64(%r8),%r8 movd (%r9),%xmm2 leaq 64(%r9),%r9 movd (%r10),%xmm3 leaq 64(%r10),%r10 movd (%r11),%xmm4 leaq 64(%r11),%r11 punpckldq %xmm3,%xmm0 movd -60(%r8),%xmm1 punpckldq %xmm4,%xmm2 movd -60(%r9),%xmm9 punpckldq %xmm2,%xmm0 movd -60(%r10),%xmm8 .byte 102,15,56,0,197 movd -60(%r11),%xmm7 punpckldq %xmm8,%xmm1 movdqa %xmm10,%xmm8 paddd %xmm15,%xmm14 punpckldq %xmm7,%xmm9 movdqa %xmm11,%xmm7 movdqa %xmm11,%xmm6 pslld $5,%xmm8 pandn %xmm13,%xmm7 pand %xmm12,%xmm6 punpckldq %xmm9,%xmm1 movdqa %xmm10,%xmm9 movdqa %xmm0,0-128(%rax) paddd %xmm0,%xmm14 movd -56(%r8),%xmm2 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm11,%xmm7 por %xmm9,%xmm8 movd -56(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm14 psrld $2,%xmm11 paddd %xmm8,%xmm14 .byte 102,15,56,0,205 movd -56(%r10),%xmm8 por %xmm7,%xmm11 movd -56(%r11),%xmm7 punpckldq %xmm8,%xmm2 movdqa %xmm14,%xmm8 paddd %xmm15,%xmm13 punpckldq %xmm7,%xmm9 movdqa %xmm10,%xmm7 movdqa %xmm10,%xmm6 pslld $5,%xmm8 pandn %xmm12,%xmm7 pand %xmm11,%xmm6 punpckldq %xmm9,%xmm2 movdqa %xmm14,%xmm9 movdqa %xmm1,16-128(%rax) paddd %xmm1,%xmm13 movd -52(%r8),%xmm3 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm10,%xmm7 por %xmm9,%xmm8 movd -52(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm13 psrld $2,%xmm10 paddd %xmm8,%xmm13 .byte 102,15,56,0,213 movd -52(%r10),%xmm8 por %xmm7,%xmm10 movd -52(%r11),%xmm7 punpckldq %xmm8,%xmm3 movdqa %xmm13,%xmm8 paddd %xmm15,%xmm12 punpckldq %xmm7,%xmm9 movdqa %xmm14,%xmm7 movdqa %xmm14,%xmm6 pslld $5,%xmm8 pandn %xmm11,%xmm7 pand %xmm10,%xmm6 punpckldq %xmm9,%xmm3 movdqa %xmm13,%xmm9 movdqa %xmm2,32-128(%rax) paddd %xmm2,%xmm12 movd -48(%r8),%xmm4 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm14,%xmm7 por %xmm9,%xmm8 movd -48(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm12 psrld $2,%xmm14 paddd %xmm8,%xmm12 .byte 102,15,56,0,221 movd -48(%r10),%xmm8 por %xmm7,%xmm14 movd -48(%r11),%xmm7 punpckldq %xmm8,%xmm4 movdqa %xmm12,%xmm8 paddd %xmm15,%xmm11 punpckldq %xmm7,%xmm9 movdqa %xmm13,%xmm7 movdqa %xmm13,%xmm6 pslld $5,%xmm8 pandn %xmm10,%xmm7 pand 
%xmm14,%xmm6 punpckldq %xmm9,%xmm4 movdqa %xmm12,%xmm9 movdqa %xmm3,48-128(%rax) paddd %xmm3,%xmm11 movd -44(%r8),%xmm0 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm13,%xmm7 por %xmm9,%xmm8 movd -44(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm11 psrld $2,%xmm13 paddd %xmm8,%xmm11 .byte 102,15,56,0,229 movd -44(%r10),%xmm8 por %xmm7,%xmm13 movd -44(%r11),%xmm7 punpckldq %xmm8,%xmm0 movdqa %xmm11,%xmm8 paddd %xmm15,%xmm10 punpckldq %xmm7,%xmm9 movdqa %xmm12,%xmm7 movdqa %xmm12,%xmm6 pslld $5,%xmm8 pandn %xmm14,%xmm7 pand %xmm13,%xmm6 punpckldq %xmm9,%xmm0 movdqa %xmm11,%xmm9 movdqa %xmm4,64-128(%rax) paddd %xmm4,%xmm10 movd -40(%r8),%xmm1 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm12,%xmm7 por %xmm9,%xmm8 movd -40(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm10 psrld $2,%xmm12 paddd %xmm8,%xmm10 .byte 102,15,56,0,197 movd -40(%r10),%xmm8 por %xmm7,%xmm12 movd -40(%r11),%xmm7 punpckldq %xmm8,%xmm1 movdqa %xmm10,%xmm8 paddd %xmm15,%xmm14 punpckldq %xmm7,%xmm9 movdqa %xmm11,%xmm7 movdqa %xmm11,%xmm6 pslld $5,%xmm8 pandn %xmm13,%xmm7 pand %xmm12,%xmm6 punpckldq %xmm9,%xmm1 movdqa %xmm10,%xmm9 movdqa %xmm0,80-128(%rax) paddd %xmm0,%xmm14 movd -36(%r8),%xmm2 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm11,%xmm7 por %xmm9,%xmm8 movd -36(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm14 psrld $2,%xmm11 paddd %xmm8,%xmm14 .byte 102,15,56,0,205 movd -36(%r10),%xmm8 por %xmm7,%xmm11 movd -36(%r11),%xmm7 punpckldq %xmm8,%xmm2 movdqa %xmm14,%xmm8 paddd %xmm15,%xmm13 punpckldq %xmm7,%xmm9 movdqa %xmm10,%xmm7 movdqa %xmm10,%xmm6 pslld $5,%xmm8 pandn %xmm12,%xmm7 pand %xmm11,%xmm6 punpckldq %xmm9,%xmm2 movdqa %xmm14,%xmm9 movdqa %xmm1,96-128(%rax) paddd %xmm1,%xmm13 movd -32(%r8),%xmm3 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm10,%xmm7 por %xmm9,%xmm8 movd -32(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm13 psrld $2,%xmm10 paddd %xmm8,%xmm13 .byte 102,15,56,0,213 movd -32(%r10),%xmm8 por %xmm7,%xmm10 movd -32(%r11),%xmm7 punpckldq %xmm8,%xmm3 movdqa %xmm13,%xmm8 paddd %xmm15,%xmm12 punpckldq %xmm7,%xmm9 movdqa %xmm14,%xmm7 movdqa %xmm14,%xmm6 pslld $5,%xmm8 pandn %xmm11,%xmm7 pand %xmm10,%xmm6 punpckldq %xmm9,%xmm3 movdqa %xmm13,%xmm9 movdqa %xmm2,112-128(%rax) paddd %xmm2,%xmm12 movd -28(%r8),%xmm4 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm14,%xmm7 por %xmm9,%xmm8 movd -28(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm12 psrld $2,%xmm14 paddd %xmm8,%xmm12 .byte 102,15,56,0,221 movd -28(%r10),%xmm8 por %xmm7,%xmm14 movd -28(%r11),%xmm7 punpckldq %xmm8,%xmm4 movdqa %xmm12,%xmm8 paddd %xmm15,%xmm11 punpckldq %xmm7,%xmm9 movdqa %xmm13,%xmm7 movdqa %xmm13,%xmm6 pslld $5,%xmm8 pandn %xmm10,%xmm7 pand %xmm14,%xmm6 punpckldq %xmm9,%xmm4 movdqa %xmm12,%xmm9 movdqa %xmm3,128-128(%rax) paddd %xmm3,%xmm11 movd -24(%r8),%xmm0 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm13,%xmm7 por %xmm9,%xmm8 movd -24(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm11 psrld $2,%xmm13 paddd %xmm8,%xmm11 .byte 102,15,56,0,229 movd -24(%r10),%xmm8 por %xmm7,%xmm13 movd -24(%r11),%xmm7 punpckldq %xmm8,%xmm0 movdqa %xmm11,%xmm8 paddd %xmm15,%xmm10 punpckldq %xmm7,%xmm9 movdqa %xmm12,%xmm7 movdqa %xmm12,%xmm6 pslld $5,%xmm8 pandn %xmm14,%xmm7 pand %xmm13,%xmm6 punpckldq %xmm9,%xmm0 movdqa %xmm11,%xmm9 movdqa %xmm4,144-128(%rax) paddd %xmm4,%xmm10 movd -20(%r8),%xmm1 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm12,%xmm7 por %xmm9,%xmm8 movd -20(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm10 psrld $2,%xmm12 paddd %xmm8,%xmm10 .byte 102,15,56,0,197 movd -20(%r10),%xmm8 por %xmm7,%xmm12 movd -20(%r11),%xmm7 punpckldq %xmm8,%xmm1 movdqa %xmm10,%xmm8 paddd 
%xmm15,%xmm14 punpckldq %xmm7,%xmm9 movdqa %xmm11,%xmm7 movdqa %xmm11,%xmm6 pslld $5,%xmm8 pandn %xmm13,%xmm7 pand %xmm12,%xmm6 punpckldq %xmm9,%xmm1 movdqa %xmm10,%xmm9 movdqa %xmm0,160-128(%rax) paddd %xmm0,%xmm14 movd -16(%r8),%xmm2 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm11,%xmm7 por %xmm9,%xmm8 movd -16(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm14 psrld $2,%xmm11 paddd %xmm8,%xmm14 .byte 102,15,56,0,205 movd -16(%r10),%xmm8 por %xmm7,%xmm11 movd -16(%r11),%xmm7 punpckldq %xmm8,%xmm2 movdqa %xmm14,%xmm8 paddd %xmm15,%xmm13 punpckldq %xmm7,%xmm9 movdqa %xmm10,%xmm7 movdqa %xmm10,%xmm6 pslld $5,%xmm8 pandn %xmm12,%xmm7 pand %xmm11,%xmm6 punpckldq %xmm9,%xmm2 movdqa %xmm14,%xmm9 movdqa %xmm1,176-128(%rax) paddd %xmm1,%xmm13 movd -12(%r8),%xmm3 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm10,%xmm7 por %xmm9,%xmm8 movd -12(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm13 psrld $2,%xmm10 paddd %xmm8,%xmm13 .byte 102,15,56,0,213 movd -12(%r10),%xmm8 por %xmm7,%xmm10 movd -12(%r11),%xmm7 punpckldq %xmm8,%xmm3 movdqa %xmm13,%xmm8 paddd %xmm15,%xmm12 punpckldq %xmm7,%xmm9 movdqa %xmm14,%xmm7 movdqa %xmm14,%xmm6 pslld $5,%xmm8 pandn %xmm11,%xmm7 pand %xmm10,%xmm6 punpckldq %xmm9,%xmm3 movdqa %xmm13,%xmm9 movdqa %xmm2,192-128(%rax) paddd %xmm2,%xmm12 movd -8(%r8),%xmm4 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm14,%xmm7 por %xmm9,%xmm8 movd -8(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm12 psrld $2,%xmm14 paddd %xmm8,%xmm12 .byte 102,15,56,0,221 movd -8(%r10),%xmm8 por %xmm7,%xmm14 movd -8(%r11),%xmm7 punpckldq %xmm8,%xmm4 movdqa %xmm12,%xmm8 paddd %xmm15,%xmm11 punpckldq %xmm7,%xmm9 movdqa %xmm13,%xmm7 movdqa %xmm13,%xmm6 pslld $5,%xmm8 pandn %xmm10,%xmm7 pand %xmm14,%xmm6 punpckldq %xmm9,%xmm4 movdqa %xmm12,%xmm9 movdqa %xmm3,208-128(%rax) paddd %xmm3,%xmm11 movd -4(%r8),%xmm0 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm13,%xmm7 por %xmm9,%xmm8 movd -4(%r9),%xmm9 pslld $30,%xmm7 paddd %xmm6,%xmm11 psrld $2,%xmm13 paddd %xmm8,%xmm11 .byte 102,15,56,0,229 movd -4(%r10),%xmm8 por %xmm7,%xmm13 movdqa 0-128(%rax),%xmm1 movd -4(%r11),%xmm7 punpckldq %xmm8,%xmm0 movdqa %xmm11,%xmm8 paddd %xmm15,%xmm10 punpckldq %xmm7,%xmm9 movdqa %xmm12,%xmm7 movdqa %xmm12,%xmm6 pslld $5,%xmm8 prefetcht0 63(%r8) pandn %xmm14,%xmm7 pand %xmm13,%xmm6 punpckldq %xmm9,%xmm0 movdqa %xmm11,%xmm9 movdqa %xmm4,224-128(%rax) paddd %xmm4,%xmm10 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm12,%xmm7 prefetcht0 63(%r9) por %xmm9,%xmm8 pslld $30,%xmm7 paddd %xmm6,%xmm10 prefetcht0 63(%r10) psrld $2,%xmm12 paddd %xmm8,%xmm10 .byte 102,15,56,0,197 prefetcht0 63(%r11) por %xmm7,%xmm12 movdqa 16-128(%rax),%xmm2 pxor %xmm3,%xmm1 movdqa 32-128(%rax),%xmm3 movdqa %xmm10,%xmm8 pxor 128-128(%rax),%xmm1 paddd %xmm15,%xmm14 movdqa %xmm11,%xmm7 pslld $5,%xmm8 pxor %xmm3,%xmm1 movdqa %xmm11,%xmm6 pandn %xmm13,%xmm7 movdqa %xmm1,%xmm5 pand %xmm12,%xmm6 movdqa %xmm10,%xmm9 psrld $31,%xmm5 paddd %xmm1,%xmm1 movdqa %xmm0,240-128(%rax) paddd %xmm0,%xmm14 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm11,%xmm7 por %xmm9,%xmm8 pslld $30,%xmm7 paddd %xmm6,%xmm14 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor %xmm4,%xmm2 movdqa 48-128(%rax),%xmm4 movdqa %xmm14,%xmm8 pxor 144-128(%rax),%xmm2 paddd %xmm15,%xmm13 movdqa %xmm10,%xmm7 pslld $5,%xmm8 pxor %xmm4,%xmm2 movdqa %xmm10,%xmm6 pandn %xmm12,%xmm7 movdqa %xmm2,%xmm5 pand %xmm11,%xmm6 movdqa %xmm14,%xmm9 psrld $31,%xmm5 paddd %xmm2,%xmm2 movdqa %xmm1,0-128(%rax) paddd %xmm1,%xmm13 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm10,%xmm7 por %xmm9,%xmm8 pslld $30,%xmm7 paddd 
%xmm6,%xmm13 psrld $2,%xmm10 paddd %xmm8,%xmm13 por %xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 64-128(%rax),%xmm0 movdqa %xmm13,%xmm8 pxor 160-128(%rax),%xmm3 paddd %xmm15,%xmm12 movdqa %xmm14,%xmm7 pslld $5,%xmm8 pxor %xmm0,%xmm3 movdqa %xmm14,%xmm6 pandn %xmm11,%xmm7 movdqa %xmm3,%xmm5 pand %xmm10,%xmm6 movdqa %xmm13,%xmm9 psrld $31,%xmm5 paddd %xmm3,%xmm3 movdqa %xmm2,16-128(%rax) paddd %xmm2,%xmm12 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm14,%xmm7 por %xmm9,%xmm8 pslld $30,%xmm7 paddd %xmm6,%xmm12 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 80-128(%rax),%xmm1 movdqa %xmm12,%xmm8 pxor 176-128(%rax),%xmm4 paddd %xmm15,%xmm11 movdqa %xmm13,%xmm7 pslld $5,%xmm8 pxor %xmm1,%xmm4 movdqa %xmm13,%xmm6 pandn %xmm10,%xmm7 movdqa %xmm4,%xmm5 pand %xmm14,%xmm6 movdqa %xmm12,%xmm9 psrld $31,%xmm5 paddd %xmm4,%xmm4 movdqa %xmm3,32-128(%rax) paddd %xmm3,%xmm11 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm13,%xmm7 por %xmm9,%xmm8 pslld $30,%xmm7 paddd %xmm6,%xmm11 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 pxor %xmm2,%xmm0 movdqa 96-128(%rax),%xmm2 movdqa %xmm11,%xmm8 pxor 192-128(%rax),%xmm0 paddd %xmm15,%xmm10 movdqa %xmm12,%xmm7 pslld $5,%xmm8 pxor %xmm2,%xmm0 movdqa %xmm12,%xmm6 pandn %xmm14,%xmm7 movdqa %xmm0,%xmm5 pand %xmm13,%xmm6 movdqa %xmm11,%xmm9 psrld $31,%xmm5 paddd %xmm0,%xmm0 movdqa %xmm4,48-128(%rax) paddd %xmm4,%xmm10 psrld $27,%xmm9 pxor %xmm7,%xmm6 movdqa %xmm12,%xmm7 por %xmm9,%xmm8 pslld $30,%xmm7 paddd %xmm6,%xmm10 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm5,%xmm0 por %xmm7,%xmm12 movdqa 0(%rbp),%xmm15 pxor %xmm3,%xmm1 movdqa 112-128(%rax),%xmm3 movdqa %xmm10,%xmm8 movdqa %xmm13,%xmm6 pxor 208-128(%rax),%xmm1 paddd %xmm15,%xmm14 pslld $5,%xmm8 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm9 movdqa %xmm0,64-128(%rax) paddd %xmm0,%xmm14 pxor %xmm3,%xmm1 psrld $27,%xmm9 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm7 pslld $30,%xmm7 movdqa %xmm1,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm14 paddd %xmm1,%xmm1 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor %xmm4,%xmm2 movdqa 128-128(%rax),%xmm4 movdqa %xmm14,%xmm8 movdqa %xmm12,%xmm6 pxor 224-128(%rax),%xmm2 paddd %xmm15,%xmm13 pslld $5,%xmm8 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm9 movdqa %xmm1,80-128(%rax) paddd %xmm1,%xmm13 pxor %xmm4,%xmm2 psrld $27,%xmm9 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm7 pslld $30,%xmm7 movdqa %xmm2,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm13 paddd %xmm2,%xmm2 psrld $2,%xmm10 paddd %xmm8,%xmm13 por %xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 144-128(%rax),%xmm0 movdqa %xmm13,%xmm8 movdqa %xmm11,%xmm6 pxor 240-128(%rax),%xmm3 paddd %xmm15,%xmm12 pslld $5,%xmm8 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm9 movdqa %xmm2,96-128(%rax) paddd %xmm2,%xmm12 pxor %xmm0,%xmm3 psrld $27,%xmm9 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm7 pslld $30,%xmm7 movdqa %xmm3,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm12 paddd %xmm3,%xmm3 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 160-128(%rax),%xmm1 movdqa %xmm12,%xmm8 movdqa %xmm10,%xmm6 pxor 0-128(%rax),%xmm4 paddd %xmm15,%xmm11 pslld $5,%xmm8 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm9 movdqa %xmm3,112-128(%rax) paddd %xmm3,%xmm11 pxor %xmm1,%xmm4 psrld $27,%xmm9 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm7 pslld $30,%xmm7 movdqa %xmm4,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm11 paddd %xmm4,%xmm4 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 pxor %xmm2,%xmm0 movdqa 176-128(%rax),%xmm2 movdqa 
%xmm11,%xmm8 movdqa %xmm14,%xmm6 pxor 16-128(%rax),%xmm0 paddd %xmm15,%xmm10 pslld $5,%xmm8 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm9 movdqa %xmm4,128-128(%rax) paddd %xmm4,%xmm10 pxor %xmm2,%xmm0 psrld $27,%xmm9 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm7 pslld $30,%xmm7 movdqa %xmm0,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm10 paddd %xmm0,%xmm0 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm5,%xmm0 por %xmm7,%xmm12 pxor %xmm3,%xmm1 movdqa 192-128(%rax),%xmm3 movdqa %xmm10,%xmm8 movdqa %xmm13,%xmm6 pxor 32-128(%rax),%xmm1 paddd %xmm15,%xmm14 pslld $5,%xmm8 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm9 movdqa %xmm0,144-128(%rax) paddd %xmm0,%xmm14 pxor %xmm3,%xmm1 psrld $27,%xmm9 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm7 pslld $30,%xmm7 movdqa %xmm1,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm14 paddd %xmm1,%xmm1 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor %xmm4,%xmm2 movdqa 208-128(%rax),%xmm4 movdqa %xmm14,%xmm8 movdqa %xmm12,%xmm6 pxor 48-128(%rax),%xmm2 paddd %xmm15,%xmm13 pslld $5,%xmm8 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm9 movdqa %xmm1,160-128(%rax) paddd %xmm1,%xmm13 pxor %xmm4,%xmm2 psrld $27,%xmm9 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm7 pslld $30,%xmm7 movdqa %xmm2,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm13 paddd %xmm2,%xmm2 psrld $2,%xmm10 paddd %xmm8,%xmm13 por %xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 224-128(%rax),%xmm0 movdqa %xmm13,%xmm8 movdqa %xmm11,%xmm6 pxor 64-128(%rax),%xmm3 paddd %xmm15,%xmm12 pslld $5,%xmm8 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm9 movdqa %xmm2,176-128(%rax) paddd %xmm2,%xmm12 pxor %xmm0,%xmm3 psrld $27,%xmm9 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm7 pslld $30,%xmm7 movdqa %xmm3,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm12 paddd %xmm3,%xmm3 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 240-128(%rax),%xmm1 movdqa %xmm12,%xmm8 movdqa %xmm10,%xmm6 pxor 80-128(%rax),%xmm4 paddd %xmm15,%xmm11 pslld $5,%xmm8 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm9 movdqa %xmm3,192-128(%rax) paddd %xmm3,%xmm11 pxor %xmm1,%xmm4 psrld $27,%xmm9 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm7 pslld $30,%xmm7 movdqa %xmm4,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm11 paddd %xmm4,%xmm4 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 pxor %xmm2,%xmm0 movdqa 0-128(%rax),%xmm2 movdqa %xmm11,%xmm8 movdqa %xmm14,%xmm6 pxor 96-128(%rax),%xmm0 paddd %xmm15,%xmm10 pslld $5,%xmm8 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm9 movdqa %xmm4,208-128(%rax) paddd %xmm4,%xmm10 pxor %xmm2,%xmm0 psrld $27,%xmm9 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm7 pslld $30,%xmm7 movdqa %xmm0,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm10 paddd %xmm0,%xmm0 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm5,%xmm0 por %xmm7,%xmm12 pxor %xmm3,%xmm1 movdqa 16-128(%rax),%xmm3 movdqa %xmm10,%xmm8 movdqa %xmm13,%xmm6 pxor 112-128(%rax),%xmm1 paddd %xmm15,%xmm14 pslld $5,%xmm8 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm9 movdqa %xmm0,224-128(%rax) paddd %xmm0,%xmm14 pxor %xmm3,%xmm1 psrld $27,%xmm9 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm7 pslld $30,%xmm7 movdqa %xmm1,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm14 paddd %xmm1,%xmm1 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor %xmm4,%xmm2 movdqa 32-128(%rax),%xmm4 movdqa %xmm14,%xmm8 movdqa %xmm12,%xmm6 pxor 128-128(%rax),%xmm2 paddd %xmm15,%xmm13 pslld $5,%xmm8 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm9 movdqa %xmm1,240-128(%rax) paddd %xmm1,%xmm13 pxor %xmm4,%xmm2 psrld $27,%xmm9 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm7 pslld $30,%xmm7 
movdqa %xmm2,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm13 paddd %xmm2,%xmm2 psrld $2,%xmm10 paddd %xmm8,%xmm13 por %xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 48-128(%rax),%xmm0 movdqa %xmm13,%xmm8 movdqa %xmm11,%xmm6 pxor 144-128(%rax),%xmm3 paddd %xmm15,%xmm12 pslld $5,%xmm8 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm9 movdqa %xmm2,0-128(%rax) paddd %xmm2,%xmm12 pxor %xmm0,%xmm3 psrld $27,%xmm9 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm7 pslld $30,%xmm7 movdqa %xmm3,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm12 paddd %xmm3,%xmm3 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 64-128(%rax),%xmm1 movdqa %xmm12,%xmm8 movdqa %xmm10,%xmm6 pxor 160-128(%rax),%xmm4 paddd %xmm15,%xmm11 pslld $5,%xmm8 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm9 movdqa %xmm3,16-128(%rax) paddd %xmm3,%xmm11 pxor %xmm1,%xmm4 psrld $27,%xmm9 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm7 pslld $30,%xmm7 movdqa %xmm4,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm11 paddd %xmm4,%xmm4 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 pxor %xmm2,%xmm0 movdqa 80-128(%rax),%xmm2 movdqa %xmm11,%xmm8 movdqa %xmm14,%xmm6 pxor 176-128(%rax),%xmm0 paddd %xmm15,%xmm10 pslld $5,%xmm8 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm9 movdqa %xmm4,32-128(%rax) paddd %xmm4,%xmm10 pxor %xmm2,%xmm0 psrld $27,%xmm9 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm7 pslld $30,%xmm7 movdqa %xmm0,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm10 paddd %xmm0,%xmm0 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm5,%xmm0 por %xmm7,%xmm12 pxor %xmm3,%xmm1 movdqa 96-128(%rax),%xmm3 movdqa %xmm10,%xmm8 movdqa %xmm13,%xmm6 pxor 192-128(%rax),%xmm1 paddd %xmm15,%xmm14 pslld $5,%xmm8 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm9 movdqa %xmm0,48-128(%rax) paddd %xmm0,%xmm14 pxor %xmm3,%xmm1 psrld $27,%xmm9 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm7 pslld $30,%xmm7 movdqa %xmm1,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm14 paddd %xmm1,%xmm1 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor %xmm4,%xmm2 movdqa 112-128(%rax),%xmm4 movdqa %xmm14,%xmm8 movdqa %xmm12,%xmm6 pxor 208-128(%rax),%xmm2 paddd %xmm15,%xmm13 pslld $5,%xmm8 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm9 movdqa %xmm1,64-128(%rax) paddd %xmm1,%xmm13 pxor %xmm4,%xmm2 psrld $27,%xmm9 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm7 pslld $30,%xmm7 movdqa %xmm2,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm13 paddd %xmm2,%xmm2 psrld $2,%xmm10 paddd %xmm8,%xmm13 por %xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 128-128(%rax),%xmm0 movdqa %xmm13,%xmm8 movdqa %xmm11,%xmm6 pxor 224-128(%rax),%xmm3 paddd %xmm15,%xmm12 pslld $5,%xmm8 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm9 movdqa %xmm2,80-128(%rax) paddd %xmm2,%xmm12 pxor %xmm0,%xmm3 psrld $27,%xmm9 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm7 pslld $30,%xmm7 movdqa %xmm3,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm12 paddd %xmm3,%xmm3 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 144-128(%rax),%xmm1 movdqa %xmm12,%xmm8 movdqa %xmm10,%xmm6 pxor 240-128(%rax),%xmm4 paddd %xmm15,%xmm11 pslld $5,%xmm8 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm9 movdqa %xmm3,96-128(%rax) paddd %xmm3,%xmm11 pxor %xmm1,%xmm4 psrld $27,%xmm9 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm7 pslld $30,%xmm7 movdqa %xmm4,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm11 paddd %xmm4,%xmm4 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 pxor %xmm2,%xmm0 movdqa 160-128(%rax),%xmm2 movdqa %xmm11,%xmm8 movdqa %xmm14,%xmm6 pxor 0-128(%rax),%xmm0 paddd 
%xmm15,%xmm10 pslld $5,%xmm8 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm9 movdqa %xmm4,112-128(%rax) paddd %xmm4,%xmm10 pxor %xmm2,%xmm0 psrld $27,%xmm9 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm7 pslld $30,%xmm7 movdqa %xmm0,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm10 paddd %xmm0,%xmm0 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm5,%xmm0 por %xmm7,%xmm12 movdqa 32(%rbp),%xmm15 pxor %xmm3,%xmm1 movdqa 176-128(%rax),%xmm3 movdqa %xmm10,%xmm8 movdqa %xmm13,%xmm7 pxor 16-128(%rax),%xmm1 pxor %xmm3,%xmm1 paddd %xmm15,%xmm14 pslld $5,%xmm8 movdqa %xmm10,%xmm9 pand %xmm12,%xmm7 movdqa %xmm13,%xmm6 movdqa %xmm1,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm14 pxor %xmm12,%xmm6 movdqa %xmm0,128-128(%rax) paddd %xmm0,%xmm14 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm11,%xmm6 movdqa %xmm11,%xmm7 pslld $30,%xmm7 paddd %xmm1,%xmm1 paddd %xmm6,%xmm14 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor %xmm4,%xmm2 movdqa 192-128(%rax),%xmm4 movdqa %xmm14,%xmm8 movdqa %xmm12,%xmm7 pxor 32-128(%rax),%xmm2 pxor %xmm4,%xmm2 paddd %xmm15,%xmm13 pslld $5,%xmm8 movdqa %xmm14,%xmm9 pand %xmm11,%xmm7 movdqa %xmm12,%xmm6 movdqa %xmm2,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm13 pxor %xmm11,%xmm6 movdqa %xmm1,144-128(%rax) paddd %xmm1,%xmm13 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm10,%xmm6 movdqa %xmm10,%xmm7 pslld $30,%xmm7 paddd %xmm2,%xmm2 paddd %xmm6,%xmm13 psrld $2,%xmm10 paddd %xmm8,%xmm13 por %xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 208-128(%rax),%xmm0 movdqa %xmm13,%xmm8 movdqa %xmm11,%xmm7 pxor 48-128(%rax),%xmm3 pxor %xmm0,%xmm3 paddd %xmm15,%xmm12 pslld $5,%xmm8 movdqa %xmm13,%xmm9 pand %xmm10,%xmm7 movdqa %xmm11,%xmm6 movdqa %xmm3,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm12 pxor %xmm10,%xmm6 movdqa %xmm2,160-128(%rax) paddd %xmm2,%xmm12 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm14,%xmm6 movdqa %xmm14,%xmm7 pslld $30,%xmm7 paddd %xmm3,%xmm3 paddd %xmm6,%xmm12 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 224-128(%rax),%xmm1 movdqa %xmm12,%xmm8 movdqa %xmm10,%xmm7 pxor 64-128(%rax),%xmm4 pxor %xmm1,%xmm4 paddd %xmm15,%xmm11 pslld $5,%xmm8 movdqa %xmm12,%xmm9 pand %xmm14,%xmm7 movdqa %xmm10,%xmm6 movdqa %xmm4,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm11 pxor %xmm14,%xmm6 movdqa %xmm3,176-128(%rax) paddd %xmm3,%xmm11 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm13,%xmm6 movdqa %xmm13,%xmm7 pslld $30,%xmm7 paddd %xmm4,%xmm4 paddd %xmm6,%xmm11 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 pxor %xmm2,%xmm0 movdqa 240-128(%rax),%xmm2 movdqa %xmm11,%xmm8 movdqa %xmm14,%xmm7 pxor 80-128(%rax),%xmm0 pxor %xmm2,%xmm0 paddd %xmm15,%xmm10 pslld $5,%xmm8 movdqa %xmm11,%xmm9 pand %xmm13,%xmm7 movdqa %xmm14,%xmm6 movdqa %xmm0,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm10 pxor %xmm13,%xmm6 movdqa %xmm4,192-128(%rax) paddd %xmm4,%xmm10 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm12,%xmm6 movdqa %xmm12,%xmm7 pslld $30,%xmm7 paddd %xmm0,%xmm0 paddd %xmm6,%xmm10 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm5,%xmm0 por %xmm7,%xmm12 pxor %xmm3,%xmm1 movdqa 0-128(%rax),%xmm3 movdqa %xmm10,%xmm8 movdqa %xmm13,%xmm7 pxor 96-128(%rax),%xmm1 pxor %xmm3,%xmm1 paddd %xmm15,%xmm14 pslld $5,%xmm8 movdqa %xmm10,%xmm9 pand %xmm12,%xmm7 movdqa %xmm13,%xmm6 movdqa %xmm1,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm14 pxor %xmm12,%xmm6 movdqa %xmm0,208-128(%rax) paddd %xmm0,%xmm14 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm11,%xmm6 movdqa %xmm11,%xmm7 pslld $30,%xmm7 paddd %xmm1,%xmm1 paddd %xmm6,%xmm14 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor 
%xmm4,%xmm2 movdqa 16-128(%rax),%xmm4 movdqa %xmm14,%xmm8 movdqa %xmm12,%xmm7 pxor 112-128(%rax),%xmm2 pxor %xmm4,%xmm2 paddd %xmm15,%xmm13 pslld $5,%xmm8 movdqa %xmm14,%xmm9 pand %xmm11,%xmm7 movdqa %xmm12,%xmm6 movdqa %xmm2,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm13 pxor %xmm11,%xmm6 movdqa %xmm1,224-128(%rax) paddd %xmm1,%xmm13 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm10,%xmm6 movdqa %xmm10,%xmm7 pslld $30,%xmm7 paddd %xmm2,%xmm2 paddd %xmm6,%xmm13 psrld $2,%xmm10 paddd %xmm8,%xmm13 por %xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 32-128(%rax),%xmm0 movdqa %xmm13,%xmm8 movdqa %xmm11,%xmm7 pxor 128-128(%rax),%xmm3 pxor %xmm0,%xmm3 paddd %xmm15,%xmm12 pslld $5,%xmm8 movdqa %xmm13,%xmm9 pand %xmm10,%xmm7 movdqa %xmm11,%xmm6 movdqa %xmm3,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm12 pxor %xmm10,%xmm6 movdqa %xmm2,240-128(%rax) paddd %xmm2,%xmm12 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm14,%xmm6 movdqa %xmm14,%xmm7 pslld $30,%xmm7 paddd %xmm3,%xmm3 paddd %xmm6,%xmm12 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 48-128(%rax),%xmm1 movdqa %xmm12,%xmm8 movdqa %xmm10,%xmm7 pxor 144-128(%rax),%xmm4 pxor %xmm1,%xmm4 paddd %xmm15,%xmm11 pslld $5,%xmm8 movdqa %xmm12,%xmm9 pand %xmm14,%xmm7 movdqa %xmm10,%xmm6 movdqa %xmm4,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm11 pxor %xmm14,%xmm6 movdqa %xmm3,0-128(%rax) paddd %xmm3,%xmm11 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm13,%xmm6 movdqa %xmm13,%xmm7 pslld $30,%xmm7 paddd %xmm4,%xmm4 paddd %xmm6,%xmm11 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 pxor %xmm2,%xmm0 movdqa 64-128(%rax),%xmm2 movdqa %xmm11,%xmm8 movdqa %xmm14,%xmm7 pxor 160-128(%rax),%xmm0 pxor %xmm2,%xmm0 paddd %xmm15,%xmm10 pslld $5,%xmm8 movdqa %xmm11,%xmm9 pand %xmm13,%xmm7 movdqa %xmm14,%xmm6 movdqa %xmm0,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm10 pxor %xmm13,%xmm6 movdqa %xmm4,16-128(%rax) paddd %xmm4,%xmm10 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm12,%xmm6 movdqa %xmm12,%xmm7 pslld $30,%xmm7 paddd %xmm0,%xmm0 paddd %xmm6,%xmm10 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm5,%xmm0 por %xmm7,%xmm12 pxor %xmm3,%xmm1 movdqa 80-128(%rax),%xmm3 movdqa %xmm10,%xmm8 movdqa %xmm13,%xmm7 pxor 176-128(%rax),%xmm1 pxor %xmm3,%xmm1 paddd %xmm15,%xmm14 pslld $5,%xmm8 movdqa %xmm10,%xmm9 pand %xmm12,%xmm7 movdqa %xmm13,%xmm6 movdqa %xmm1,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm14 pxor %xmm12,%xmm6 movdqa %xmm0,32-128(%rax) paddd %xmm0,%xmm14 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm11,%xmm6 movdqa %xmm11,%xmm7 pslld $30,%xmm7 paddd %xmm1,%xmm1 paddd %xmm6,%xmm14 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor %xmm4,%xmm2 movdqa 96-128(%rax),%xmm4 movdqa %xmm14,%xmm8 movdqa %xmm12,%xmm7 pxor 192-128(%rax),%xmm2 pxor %xmm4,%xmm2 paddd %xmm15,%xmm13 pslld $5,%xmm8 movdqa %xmm14,%xmm9 pand %xmm11,%xmm7 movdqa %xmm12,%xmm6 movdqa %xmm2,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm13 pxor %xmm11,%xmm6 movdqa %xmm1,48-128(%rax) paddd %xmm1,%xmm13 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm10,%xmm6 movdqa %xmm10,%xmm7 pslld $30,%xmm7 paddd %xmm2,%xmm2 paddd %xmm6,%xmm13 psrld $2,%xmm10 paddd %xmm8,%xmm13 por %xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 112-128(%rax),%xmm0 movdqa %xmm13,%xmm8 movdqa %xmm11,%xmm7 pxor 208-128(%rax),%xmm3 pxor %xmm0,%xmm3 paddd %xmm15,%xmm12 pslld $5,%xmm8 movdqa %xmm13,%xmm9 pand %xmm10,%xmm7 movdqa %xmm11,%xmm6 movdqa %xmm3,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm12 pxor %xmm10,%xmm6 movdqa %xmm2,64-128(%rax) paddd %xmm2,%xmm12 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm14,%xmm6 movdqa 
%xmm14,%xmm7 pslld $30,%xmm7 paddd %xmm3,%xmm3 paddd %xmm6,%xmm12 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 128-128(%rax),%xmm1 movdqa %xmm12,%xmm8 movdqa %xmm10,%xmm7 pxor 224-128(%rax),%xmm4 pxor %xmm1,%xmm4 paddd %xmm15,%xmm11 pslld $5,%xmm8 movdqa %xmm12,%xmm9 pand %xmm14,%xmm7 movdqa %xmm10,%xmm6 movdqa %xmm4,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm11 pxor %xmm14,%xmm6 movdqa %xmm3,80-128(%rax) paddd %xmm3,%xmm11 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm13,%xmm6 movdqa %xmm13,%xmm7 pslld $30,%xmm7 paddd %xmm4,%xmm4 paddd %xmm6,%xmm11 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 pxor %xmm2,%xmm0 movdqa 144-128(%rax),%xmm2 movdqa %xmm11,%xmm8 movdqa %xmm14,%xmm7 pxor 240-128(%rax),%xmm0 pxor %xmm2,%xmm0 paddd %xmm15,%xmm10 pslld $5,%xmm8 movdqa %xmm11,%xmm9 pand %xmm13,%xmm7 movdqa %xmm14,%xmm6 movdqa %xmm0,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm10 pxor %xmm13,%xmm6 movdqa %xmm4,96-128(%rax) paddd %xmm4,%xmm10 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm12,%xmm6 movdqa %xmm12,%xmm7 pslld $30,%xmm7 paddd %xmm0,%xmm0 paddd %xmm6,%xmm10 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm5,%xmm0 por %xmm7,%xmm12 pxor %xmm3,%xmm1 movdqa 160-128(%rax),%xmm3 movdqa %xmm10,%xmm8 movdqa %xmm13,%xmm7 pxor 0-128(%rax),%xmm1 pxor %xmm3,%xmm1 paddd %xmm15,%xmm14 pslld $5,%xmm8 movdqa %xmm10,%xmm9 pand %xmm12,%xmm7 movdqa %xmm13,%xmm6 movdqa %xmm1,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm14 pxor %xmm12,%xmm6 movdqa %xmm0,112-128(%rax) paddd %xmm0,%xmm14 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm11,%xmm6 movdqa %xmm11,%xmm7 pslld $30,%xmm7 paddd %xmm1,%xmm1 paddd %xmm6,%xmm14 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor %xmm4,%xmm2 movdqa 176-128(%rax),%xmm4 movdqa %xmm14,%xmm8 movdqa %xmm12,%xmm7 pxor 16-128(%rax),%xmm2 pxor %xmm4,%xmm2 paddd %xmm15,%xmm13 pslld $5,%xmm8 movdqa %xmm14,%xmm9 pand %xmm11,%xmm7 movdqa %xmm12,%xmm6 movdqa %xmm2,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm13 pxor %xmm11,%xmm6 movdqa %xmm1,128-128(%rax) paddd %xmm1,%xmm13 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm10,%xmm6 movdqa %xmm10,%xmm7 pslld $30,%xmm7 paddd %xmm2,%xmm2 paddd %xmm6,%xmm13 psrld $2,%xmm10 paddd %xmm8,%xmm13 por %xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 192-128(%rax),%xmm0 movdqa %xmm13,%xmm8 movdqa %xmm11,%xmm7 pxor 32-128(%rax),%xmm3 pxor %xmm0,%xmm3 paddd %xmm15,%xmm12 pslld $5,%xmm8 movdqa %xmm13,%xmm9 pand %xmm10,%xmm7 movdqa %xmm11,%xmm6 movdqa %xmm3,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm12 pxor %xmm10,%xmm6 movdqa %xmm2,144-128(%rax) paddd %xmm2,%xmm12 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm14,%xmm6 movdqa %xmm14,%xmm7 pslld $30,%xmm7 paddd %xmm3,%xmm3 paddd %xmm6,%xmm12 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 208-128(%rax),%xmm1 movdqa %xmm12,%xmm8 movdqa %xmm10,%xmm7 pxor 48-128(%rax),%xmm4 pxor %xmm1,%xmm4 paddd %xmm15,%xmm11 pslld $5,%xmm8 movdqa %xmm12,%xmm9 pand %xmm14,%xmm7 movdqa %xmm10,%xmm6 movdqa %xmm4,%xmm5 psrld $27,%xmm9 paddd %xmm7,%xmm11 pxor %xmm14,%xmm6 movdqa %xmm3,160-128(%rax) paddd %xmm3,%xmm11 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm13,%xmm6 movdqa %xmm13,%xmm7 pslld $30,%xmm7 paddd %xmm4,%xmm4 paddd %xmm6,%xmm11 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 pxor %xmm2,%xmm0 movdqa 224-128(%rax),%xmm2 movdqa %xmm11,%xmm8 movdqa %xmm14,%xmm7 pxor 64-128(%rax),%xmm0 pxor %xmm2,%xmm0 paddd %xmm15,%xmm10 pslld $5,%xmm8 movdqa %xmm11,%xmm9 pand %xmm13,%xmm7 movdqa %xmm14,%xmm6 movdqa %xmm0,%xmm5 psrld $27,%xmm9 paddd 
%xmm7,%xmm10 pxor %xmm13,%xmm6 movdqa %xmm4,176-128(%rax) paddd %xmm4,%xmm10 por %xmm9,%xmm8 psrld $31,%xmm5 pand %xmm12,%xmm6 movdqa %xmm12,%xmm7 pslld $30,%xmm7 paddd %xmm0,%xmm0 paddd %xmm6,%xmm10 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm5,%xmm0 por %xmm7,%xmm12 movdqa 64(%rbp),%xmm15 pxor %xmm3,%xmm1 movdqa 240-128(%rax),%xmm3 movdqa %xmm10,%xmm8 movdqa %xmm13,%xmm6 pxor 80-128(%rax),%xmm1 paddd %xmm15,%xmm14 pslld $5,%xmm8 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm9 movdqa %xmm0,192-128(%rax) paddd %xmm0,%xmm14 pxor %xmm3,%xmm1 psrld $27,%xmm9 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm7 pslld $30,%xmm7 movdqa %xmm1,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm14 paddd %xmm1,%xmm1 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor %xmm4,%xmm2 movdqa 0-128(%rax),%xmm4 movdqa %xmm14,%xmm8 movdqa %xmm12,%xmm6 pxor 96-128(%rax),%xmm2 paddd %xmm15,%xmm13 pslld $5,%xmm8 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm9 movdqa %xmm1,208-128(%rax) paddd %xmm1,%xmm13 pxor %xmm4,%xmm2 psrld $27,%xmm9 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm7 pslld $30,%xmm7 movdqa %xmm2,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm13 paddd %xmm2,%xmm2 psrld $2,%xmm10 paddd %xmm8,%xmm13 por %xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 16-128(%rax),%xmm0 movdqa %xmm13,%xmm8 movdqa %xmm11,%xmm6 pxor 112-128(%rax),%xmm3 paddd %xmm15,%xmm12 pslld $5,%xmm8 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm9 movdqa %xmm2,224-128(%rax) paddd %xmm2,%xmm12 pxor %xmm0,%xmm3 psrld $27,%xmm9 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm7 pslld $30,%xmm7 movdqa %xmm3,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm12 paddd %xmm3,%xmm3 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 32-128(%rax),%xmm1 movdqa %xmm12,%xmm8 movdqa %xmm10,%xmm6 pxor 128-128(%rax),%xmm4 paddd %xmm15,%xmm11 pslld $5,%xmm8 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm9 movdqa %xmm3,240-128(%rax) paddd %xmm3,%xmm11 pxor %xmm1,%xmm4 psrld $27,%xmm9 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm7 pslld $30,%xmm7 movdqa %xmm4,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm11 paddd %xmm4,%xmm4 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 pxor %xmm2,%xmm0 movdqa 48-128(%rax),%xmm2 movdqa %xmm11,%xmm8 movdqa %xmm14,%xmm6 pxor 144-128(%rax),%xmm0 paddd %xmm15,%xmm10 pslld $5,%xmm8 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm9 movdqa %xmm4,0-128(%rax) paddd %xmm4,%xmm10 pxor %xmm2,%xmm0 psrld $27,%xmm9 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm7 pslld $30,%xmm7 movdqa %xmm0,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm10 paddd %xmm0,%xmm0 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm5,%xmm0 por %xmm7,%xmm12 pxor %xmm3,%xmm1 movdqa 64-128(%rax),%xmm3 movdqa %xmm10,%xmm8 movdqa %xmm13,%xmm6 pxor 160-128(%rax),%xmm1 paddd %xmm15,%xmm14 pslld $5,%xmm8 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm9 movdqa %xmm0,16-128(%rax) paddd %xmm0,%xmm14 pxor %xmm3,%xmm1 psrld $27,%xmm9 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm7 pslld $30,%xmm7 movdqa %xmm1,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm14 paddd %xmm1,%xmm1 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor %xmm4,%xmm2 movdqa 80-128(%rax),%xmm4 movdqa %xmm14,%xmm8 movdqa %xmm12,%xmm6 pxor 176-128(%rax),%xmm2 paddd %xmm15,%xmm13 pslld $5,%xmm8 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm9 movdqa %xmm1,32-128(%rax) paddd %xmm1,%xmm13 pxor %xmm4,%xmm2 psrld $27,%xmm9 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm7 pslld $30,%xmm7 movdqa %xmm2,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm13 paddd %xmm2,%xmm2 psrld $2,%xmm10 paddd %xmm8,%xmm13 por 
%xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 96-128(%rax),%xmm0 movdqa %xmm13,%xmm8 movdqa %xmm11,%xmm6 pxor 192-128(%rax),%xmm3 paddd %xmm15,%xmm12 pslld $5,%xmm8 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm9 movdqa %xmm2,48-128(%rax) paddd %xmm2,%xmm12 pxor %xmm0,%xmm3 psrld $27,%xmm9 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm7 pslld $30,%xmm7 movdqa %xmm3,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm12 paddd %xmm3,%xmm3 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 112-128(%rax),%xmm1 movdqa %xmm12,%xmm8 movdqa %xmm10,%xmm6 pxor 208-128(%rax),%xmm4 paddd %xmm15,%xmm11 pslld $5,%xmm8 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm9 movdqa %xmm3,64-128(%rax) paddd %xmm3,%xmm11 pxor %xmm1,%xmm4 psrld $27,%xmm9 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm7 pslld $30,%xmm7 movdqa %xmm4,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm11 paddd %xmm4,%xmm4 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 pxor %xmm2,%xmm0 movdqa 128-128(%rax),%xmm2 movdqa %xmm11,%xmm8 movdqa %xmm14,%xmm6 pxor 224-128(%rax),%xmm0 paddd %xmm15,%xmm10 pslld $5,%xmm8 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm9 movdqa %xmm4,80-128(%rax) paddd %xmm4,%xmm10 pxor %xmm2,%xmm0 psrld $27,%xmm9 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm7 pslld $30,%xmm7 movdqa %xmm0,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm10 paddd %xmm0,%xmm0 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm5,%xmm0 por %xmm7,%xmm12 pxor %xmm3,%xmm1 movdqa 144-128(%rax),%xmm3 movdqa %xmm10,%xmm8 movdqa %xmm13,%xmm6 pxor 240-128(%rax),%xmm1 paddd %xmm15,%xmm14 pslld $5,%xmm8 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm9 movdqa %xmm0,96-128(%rax) paddd %xmm0,%xmm14 pxor %xmm3,%xmm1 psrld $27,%xmm9 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm7 pslld $30,%xmm7 movdqa %xmm1,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm14 paddd %xmm1,%xmm1 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor %xmm4,%xmm2 movdqa 160-128(%rax),%xmm4 movdqa %xmm14,%xmm8 movdqa %xmm12,%xmm6 pxor 0-128(%rax),%xmm2 paddd %xmm15,%xmm13 pslld $5,%xmm8 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm9 movdqa %xmm1,112-128(%rax) paddd %xmm1,%xmm13 pxor %xmm4,%xmm2 psrld $27,%xmm9 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm7 pslld $30,%xmm7 movdqa %xmm2,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm13 paddd %xmm2,%xmm2 psrld $2,%xmm10 paddd %xmm8,%xmm13 por %xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 176-128(%rax),%xmm0 movdqa %xmm13,%xmm8 movdqa %xmm11,%xmm6 pxor 16-128(%rax),%xmm3 paddd %xmm15,%xmm12 pslld $5,%xmm8 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm9 paddd %xmm2,%xmm12 pxor %xmm0,%xmm3 psrld $27,%xmm9 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm7 pslld $30,%xmm7 movdqa %xmm3,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm12 paddd %xmm3,%xmm3 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 192-128(%rax),%xmm1 movdqa %xmm12,%xmm8 movdqa %xmm10,%xmm6 pxor 32-128(%rax),%xmm4 paddd %xmm15,%xmm11 pslld $5,%xmm8 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm9 paddd %xmm3,%xmm11 pxor %xmm1,%xmm4 psrld $27,%xmm9 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm7 pslld $30,%xmm7 movdqa %xmm4,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm11 paddd %xmm4,%xmm4 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 pxor %xmm2,%xmm0 movdqa 208-128(%rax),%xmm2 movdqa %xmm11,%xmm8 movdqa %xmm14,%xmm6 pxor 48-128(%rax),%xmm0 paddd %xmm15,%xmm10 pslld $5,%xmm8 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm9 paddd %xmm4,%xmm10 pxor %xmm2,%xmm0 psrld $27,%xmm9 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm7 pslld $30,%xmm7 
movdqa %xmm0,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm10 paddd %xmm0,%xmm0 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm5,%xmm0 por %xmm7,%xmm12 pxor %xmm3,%xmm1 movdqa 224-128(%rax),%xmm3 movdqa %xmm10,%xmm8 movdqa %xmm13,%xmm6 pxor 64-128(%rax),%xmm1 paddd %xmm15,%xmm14 pslld $5,%xmm8 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm9 paddd %xmm0,%xmm14 pxor %xmm3,%xmm1 psrld $27,%xmm9 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm7 pslld $30,%xmm7 movdqa %xmm1,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm14 paddd %xmm1,%xmm1 psrld $2,%xmm11 paddd %xmm8,%xmm14 por %xmm5,%xmm1 por %xmm7,%xmm11 pxor %xmm4,%xmm2 movdqa 240-128(%rax),%xmm4 movdqa %xmm14,%xmm8 movdqa %xmm12,%xmm6 pxor 80-128(%rax),%xmm2 paddd %xmm15,%xmm13 pslld $5,%xmm8 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm9 paddd %xmm1,%xmm13 pxor %xmm4,%xmm2 psrld $27,%xmm9 pxor %xmm11,%xmm6 movdqa %xmm10,%xmm7 pslld $30,%xmm7 movdqa %xmm2,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm13 paddd %xmm2,%xmm2 psrld $2,%xmm10 paddd %xmm8,%xmm13 por %xmm5,%xmm2 por %xmm7,%xmm10 pxor %xmm0,%xmm3 movdqa 0-128(%rax),%xmm0 movdqa %xmm13,%xmm8 movdqa %xmm11,%xmm6 pxor 96-128(%rax),%xmm3 paddd %xmm15,%xmm12 pslld $5,%xmm8 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm9 paddd %xmm2,%xmm12 pxor %xmm0,%xmm3 psrld $27,%xmm9 pxor %xmm10,%xmm6 movdqa %xmm14,%xmm7 pslld $30,%xmm7 movdqa %xmm3,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm12 paddd %xmm3,%xmm3 psrld $2,%xmm14 paddd %xmm8,%xmm12 por %xmm5,%xmm3 por %xmm7,%xmm14 pxor %xmm1,%xmm4 movdqa 16-128(%rax),%xmm1 movdqa %xmm12,%xmm8 movdqa %xmm10,%xmm6 pxor 112-128(%rax),%xmm4 paddd %xmm15,%xmm11 pslld $5,%xmm8 pxor %xmm13,%xmm6 movdqa %xmm12,%xmm9 paddd %xmm3,%xmm11 pxor %xmm1,%xmm4 psrld $27,%xmm9 pxor %xmm14,%xmm6 movdqa %xmm13,%xmm7 pslld $30,%xmm7 movdqa %xmm4,%xmm5 por %xmm9,%xmm8 psrld $31,%xmm5 paddd %xmm6,%xmm11 paddd %xmm4,%xmm4 psrld $2,%xmm13 paddd %xmm8,%xmm11 por %xmm5,%xmm4 por %xmm7,%xmm13 movdqa %xmm11,%xmm8 paddd %xmm15,%xmm10 movdqa %xmm14,%xmm6 pslld $5,%xmm8 pxor %xmm12,%xmm6 movdqa %xmm11,%xmm9 paddd %xmm4,%xmm10 psrld $27,%xmm9 movdqa %xmm12,%xmm7 pxor %xmm13,%xmm6 pslld $30,%xmm7 por %xmm9,%xmm8 paddd %xmm6,%xmm10 psrld $2,%xmm12 paddd %xmm8,%xmm10 por %xmm7,%xmm12 movdqa (%rbx),%xmm0 movl $1,%ecx cmpl 0(%rbx),%ecx pxor %xmm8,%xmm8 cmovgeq %rbp,%r8 cmpl 4(%rbx),%ecx movdqa %xmm0,%xmm1 cmovgeq %rbp,%r9 cmpl 8(%rbx),%ecx pcmpgtd %xmm8,%xmm1 cmovgeq %rbp,%r10 cmpl 12(%rbx),%ecx paddd %xmm1,%xmm0 cmovgeq %rbp,%r11 movdqu 0(%rdi),%xmm6 pand %xmm1,%xmm10 movdqu 32(%rdi),%xmm7 pand %xmm1,%xmm11 paddd %xmm6,%xmm10 movdqu 64(%rdi),%xmm8 pand %xmm1,%xmm12 paddd %xmm7,%xmm11 movdqu 96(%rdi),%xmm9 pand %xmm1,%xmm13 paddd %xmm8,%xmm12 movdqu 128(%rdi),%xmm5 pand %xmm1,%xmm14 movdqu %xmm10,0(%rdi) paddd %xmm9,%xmm13 movdqu %xmm11,32(%rdi) paddd %xmm5,%xmm14 movdqu %xmm12,64(%rdi) movdqu %xmm13,96(%rdi) movdqu %xmm14,128(%rdi) movdqa %xmm0,(%rbx) movdqa 96(%rbp),%xmm5 movdqa -32(%rbp),%xmm15 decl %edx jnz .Loop movl 280(%rsp),%edx leaq 16(%rdi),%rdi leaq 64(%rsi),%rsi decl %edx jnz .Loop_grande .Ldone: movq 272(%rsp),%rax .cfi_def_cfa %rax,8 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 .cfi_endproc .size sha1_multi_block,.-sha1_multi_block .type sha1_multi_block_shaext,@function .align 32 sha1_multi_block_shaext: .cfi_startproc _shaext_shortcut: movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 subq $288,%rsp shll $1,%edx andq $-256,%rsp 
leaq 64(%rdi),%rdi movq %rax,272(%rsp) .Lbody_shaext: leaq 256(%rsp),%rbx movdqa K_XX_XX+128(%rip),%xmm3 .Loop_grande_shaext: movl %edx,280(%rsp) xorl %edx,%edx movq 0(%rsi),%r8 movl 8(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,0(%rbx) cmovleq %rsp,%r8 movq 16(%rsi),%r9 movl 24(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,4(%rbx) cmovleq %rsp,%r9 testl %edx,%edx jz .Ldone_shaext movq 0-64(%rdi),%xmm0 movq 32-64(%rdi),%xmm4 movq 64-64(%rdi),%xmm5 movq 96-64(%rdi),%xmm6 movq 128-64(%rdi),%xmm7 punpckldq %xmm4,%xmm0 punpckldq %xmm6,%xmm5 movdqa %xmm0,%xmm8 punpcklqdq %xmm5,%xmm0 punpckhqdq %xmm5,%xmm8 pshufd $63,%xmm7,%xmm1 pshufd $127,%xmm7,%xmm9 pshufd $27,%xmm0,%xmm0 pshufd $27,%xmm8,%xmm8 jmp .Loop_shaext .align 32 .Loop_shaext: movdqu 0(%r8),%xmm4 movdqu 0(%r9),%xmm11 movdqu 16(%r8),%xmm5 movdqu 16(%r9),%xmm12 movdqu 32(%r8),%xmm6 .byte 102,15,56,0,227 movdqu 32(%r9),%xmm13 .byte 102,68,15,56,0,219 movdqu 48(%r8),%xmm7 leaq 64(%r8),%r8 .byte 102,15,56,0,235 movdqu 48(%r9),%xmm14 leaq 64(%r9),%r9 .byte 102,68,15,56,0,227 movdqa %xmm1,80(%rsp) paddd %xmm4,%xmm1 movdqa %xmm9,112(%rsp) paddd %xmm11,%xmm9 movdqa %xmm0,64(%rsp) movdqa %xmm0,%xmm2 movdqa %xmm8,96(%rsp) movdqa %xmm8,%xmm10 .byte 15,58,204,193,0 .byte 15,56,200,213 .byte 69,15,58,204,193,0 .byte 69,15,56,200,212 .byte 102,15,56,0,243 prefetcht0 127(%r8) .byte 15,56,201,229 .byte 102,68,15,56,0,235 prefetcht0 127(%r9) .byte 69,15,56,201,220 .byte 102,15,56,0,251 movdqa %xmm0,%xmm1 .byte 102,68,15,56,0,243 movdqa %xmm8,%xmm9 .byte 15,58,204,194,0 .byte 15,56,200,206 .byte 69,15,58,204,194,0 .byte 69,15,56,200,205 pxor %xmm6,%xmm4 .byte 15,56,201,238 pxor %xmm13,%xmm11 .byte 69,15,56,201,229 movdqa %xmm0,%xmm2 movdqa %xmm8,%xmm10 .byte 15,58,204,193,0 .byte 15,56,200,215 .byte 69,15,58,204,193,0 .byte 69,15,56,200,214 .byte 15,56,202,231 .byte 69,15,56,202,222 pxor %xmm7,%xmm5 .byte 15,56,201,247 pxor %xmm14,%xmm12 .byte 69,15,56,201,238 movdqa %xmm0,%xmm1 movdqa %xmm8,%xmm9 .byte 15,58,204,194,0 .byte 15,56,200,204 .byte 69,15,58,204,194,0 .byte 69,15,56,200,203 .byte 15,56,202,236 .byte 69,15,56,202,227 pxor %xmm4,%xmm6 .byte 15,56,201,252 pxor %xmm11,%xmm13 .byte 69,15,56,201,243 movdqa %xmm0,%xmm2 movdqa %xmm8,%xmm10 .byte 15,58,204,193,0 .byte 15,56,200,213 .byte 69,15,58,204,193,0 .byte 69,15,56,200,212 .byte 15,56,202,245 .byte 69,15,56,202,236 pxor %xmm5,%xmm7 .byte 15,56,201,229 pxor %xmm12,%xmm14 .byte 69,15,56,201,220 movdqa %xmm0,%xmm1 movdqa %xmm8,%xmm9 .byte 15,58,204,194,1 .byte 15,56,200,206 .byte 69,15,58,204,194,1 .byte 69,15,56,200,205 .byte 15,56,202,254 .byte 69,15,56,202,245 pxor %xmm6,%xmm4 .byte 15,56,201,238 pxor %xmm13,%xmm11 .byte 69,15,56,201,229 movdqa %xmm0,%xmm2 movdqa %xmm8,%xmm10 .byte 15,58,204,193,1 .byte 15,56,200,215 .byte 69,15,58,204,193,1 .byte 69,15,56,200,214 .byte 15,56,202,231 .byte 69,15,56,202,222 pxor %xmm7,%xmm5 .byte 15,56,201,247 pxor %xmm14,%xmm12 .byte 69,15,56,201,238 movdqa %xmm0,%xmm1 movdqa %xmm8,%xmm9 .byte 15,58,204,194,1 .byte 15,56,200,204 .byte 69,15,58,204,194,1 .byte 69,15,56,200,203 .byte 15,56,202,236 .byte 69,15,56,202,227 pxor %xmm4,%xmm6 .byte 15,56,201,252 pxor %xmm11,%xmm13 .byte 69,15,56,201,243 movdqa %xmm0,%xmm2 movdqa %xmm8,%xmm10 .byte 15,58,204,193,1 .byte 15,56,200,213 .byte 69,15,58,204,193,1 .byte 69,15,56,200,212 .byte 15,56,202,245 .byte 69,15,56,202,236 pxor %xmm5,%xmm7 .byte 15,56,201,229 pxor %xmm12,%xmm14 .byte 69,15,56,201,220 movdqa %xmm0,%xmm1 movdqa %xmm8,%xmm9 .byte 15,58,204,194,1 .byte 
15,56,200,206 .byte 69,15,58,204,194,1 .byte 69,15,56,200,205 .byte 15,56,202,254 .byte 69,15,56,202,245 pxor %xmm6,%xmm4 .byte 15,56,201,238 pxor %xmm13,%xmm11 .byte 69,15,56,201,229 movdqa %xmm0,%xmm2 movdqa %xmm8,%xmm10 .byte 15,58,204,193,2 .byte 15,56,200,215 .byte 69,15,58,204,193,2 .byte 69,15,56,200,214 .byte 15,56,202,231 .byte 69,15,56,202,222 pxor %xmm7,%xmm5 .byte 15,56,201,247 pxor %xmm14,%xmm12 .byte 69,15,56,201,238 movdqa %xmm0,%xmm1 movdqa %xmm8,%xmm9 .byte 15,58,204,194,2 .byte 15,56,200,204 .byte 69,15,58,204,194,2 .byte 69,15,56,200,203 .byte 15,56,202,236 .byte 69,15,56,202,227 pxor %xmm4,%xmm6 .byte 15,56,201,252 pxor %xmm11,%xmm13 .byte 69,15,56,201,243 movdqa %xmm0,%xmm2 movdqa %xmm8,%xmm10 .byte 15,58,204,193,2 .byte 15,56,200,213 .byte 69,15,58,204,193,2 .byte 69,15,56,200,212 .byte 15,56,202,245 .byte 69,15,56,202,236 pxor %xmm5,%xmm7 .byte 15,56,201,229 pxor %xmm12,%xmm14 .byte 69,15,56,201,220 movdqa %xmm0,%xmm1 movdqa %xmm8,%xmm9 .byte 15,58,204,194,2 .byte 15,56,200,206 .byte 69,15,58,204,194,2 .byte 69,15,56,200,205 .byte 15,56,202,254 .byte 69,15,56,202,245 pxor %xmm6,%xmm4 .byte 15,56,201,238 pxor %xmm13,%xmm11 .byte 69,15,56,201,229 movdqa %xmm0,%xmm2 movdqa %xmm8,%xmm10 .byte 15,58,204,193,2 .byte 15,56,200,215 .byte 69,15,58,204,193,2 .byte 69,15,56,200,214 .byte 15,56,202,231 .byte 69,15,56,202,222 pxor %xmm7,%xmm5 .byte 15,56,201,247 pxor %xmm14,%xmm12 .byte 69,15,56,201,238 movdqa %xmm0,%xmm1 movdqa %xmm8,%xmm9 .byte 15,58,204,194,3 .byte 15,56,200,204 .byte 69,15,58,204,194,3 .byte 69,15,56,200,203 .byte 15,56,202,236 .byte 69,15,56,202,227 pxor %xmm4,%xmm6 .byte 15,56,201,252 pxor %xmm11,%xmm13 .byte 69,15,56,201,243 movdqa %xmm0,%xmm2 movdqa %xmm8,%xmm10 .byte 15,58,204,193,3 .byte 15,56,200,213 .byte 69,15,58,204,193,3 .byte 69,15,56,200,212 .byte 15,56,202,245 .byte 69,15,56,202,236 pxor %xmm5,%xmm7 pxor %xmm12,%xmm14 movl $1,%ecx pxor %xmm4,%xmm4 cmpl 0(%rbx),%ecx cmovgeq %rsp,%r8 movdqa %xmm0,%xmm1 movdqa %xmm8,%xmm9 .byte 15,58,204,194,3 .byte 15,56,200,206 .byte 69,15,58,204,194,3 .byte 69,15,56,200,205 .byte 15,56,202,254 .byte 69,15,56,202,245 cmpl 4(%rbx),%ecx cmovgeq %rsp,%r9 movq (%rbx),%xmm6 movdqa %xmm0,%xmm2 movdqa %xmm8,%xmm10 .byte 15,58,204,193,3 .byte 15,56,200,215 .byte 69,15,58,204,193,3 .byte 69,15,56,200,214 pshufd $0x00,%xmm6,%xmm11 pshufd $0x55,%xmm6,%xmm12 movdqa %xmm6,%xmm7 pcmpgtd %xmm4,%xmm11 pcmpgtd %xmm4,%xmm12 movdqa %xmm0,%xmm1 movdqa %xmm8,%xmm9 .byte 15,58,204,194,3 .byte 15,56,200,204 .byte 69,15,58,204,194,3 .byte 68,15,56,200,204 pcmpgtd %xmm4,%xmm7 pand %xmm11,%xmm0 pand %xmm11,%xmm1 pand %xmm12,%xmm8 pand %xmm12,%xmm9 paddd %xmm7,%xmm6 paddd 64(%rsp),%xmm0 paddd 80(%rsp),%xmm1 paddd 96(%rsp),%xmm8 paddd 112(%rsp),%xmm9 movq %xmm6,(%rbx) decl %edx jnz .Loop_shaext movl 280(%rsp),%edx pshufd $27,%xmm0,%xmm0 pshufd $27,%xmm8,%xmm8 movdqa %xmm0,%xmm6 punpckldq %xmm8,%xmm0 punpckhdq %xmm8,%xmm6 punpckhdq %xmm9,%xmm1 movq %xmm0,0-64(%rdi) psrldq $8,%xmm0 movq %xmm6,64-64(%rdi) psrldq $8,%xmm6 movq %xmm0,32-64(%rdi) psrldq $8,%xmm1 movq %xmm6,96-64(%rdi) movq %xmm1,128-64(%rdi) leaq 8(%rdi),%rdi leaq 32(%rsi),%rsi decl %edx jnz .Loop_grande_shaext .Ldone_shaext: movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_shaext: .byte 0xf3,0xc3 .cfi_endproc .size sha1_multi_block_shaext,.-sha1_multi_block_shaext +.type sha1_multi_block_avx,@function +.align 32 +sha1_multi_block_avx: +.cfi_startproc +_avx_shortcut: + shrq $32,%rcx + cmpl $2,%edx + 
jb .Lavx + testl $32,%ecx + jnz _avx2_shortcut + jmp .Lavx +.align 32 +.Lavx: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + subq $288,%rsp + andq $-256,%rsp + movq %rax,272(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 +.Lbody_avx: + leaq K_XX_XX(%rip),%rbp + leaq 256(%rsp),%rbx + + vzeroupper +.Loop_grande_avx: + movl %edx,280(%rsp) + xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rbp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rbp,%r9 + movq 32(%rsi),%r10 + movl 40(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,8(%rbx) + cmovleq %rbp,%r10 + movq 48(%rsi),%r11 + movl 56(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,12(%rbx) + cmovleq %rbp,%r11 + testl %edx,%edx + jz .Ldone_avx + + vmovdqu 0(%rdi),%xmm10 + leaq 128(%rsp),%rax + vmovdqu 32(%rdi),%xmm11 + vmovdqu 64(%rdi),%xmm12 + vmovdqu 96(%rdi),%xmm13 + vmovdqu 128(%rdi),%xmm14 + vmovdqu 96(%rbp),%xmm5 + jmp .Loop_avx + +.align 32 +.Loop_avx: + vmovdqa -32(%rbp),%xmm15 + vmovd (%r8),%xmm0 + leaq 64(%r8),%r8 + vmovd (%r9),%xmm2 + leaq 64(%r9),%r9 + vpinsrd $1,(%r10),%xmm0,%xmm0 + leaq 64(%r10),%r10 + vpinsrd $1,(%r11),%xmm2,%xmm2 + leaq 64(%r11),%r11 + vmovd -60(%r8),%xmm1 + vpunpckldq %xmm2,%xmm0,%xmm0 + vmovd -60(%r9),%xmm9 + vpshufb %xmm5,%xmm0,%xmm0 + vpinsrd $1,-60(%r10),%xmm1,%xmm1 + vpinsrd $1,-60(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpandn %xmm13,%xmm11,%xmm7 + vpand %xmm12,%xmm11,%xmm6 + + vmovdqa %xmm0,0-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpunpckldq %xmm9,%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -56(%r8),%xmm2 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -56(%r9),%xmm9 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpshufb %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpinsrd $1,-56(%r10),%xmm2,%xmm2 + vpinsrd $1,-56(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpandn %xmm12,%xmm10,%xmm7 + vpand %xmm11,%xmm10,%xmm6 + + vmovdqa %xmm1,16-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpunpckldq %xmm9,%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -52(%r8),%xmm3 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -52(%r9),%xmm9 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpshufb %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpinsrd $1,-52(%r10),%xmm3,%xmm3 + vpinsrd $1,-52(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpandn %xmm11,%xmm14,%xmm7 + vpand %xmm10,%xmm14,%xmm6 + + vmovdqa %xmm2,32-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpunpckldq %xmm9,%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -48(%r8),%xmm4 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -48(%r9),%xmm9 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpshufb %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpinsrd $1,-48(%r10),%xmm4,%xmm4 + vpinsrd $1,-48(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpandn %xmm10,%xmm13,%xmm7 + vpand %xmm14,%xmm13,%xmm6 + + vmovdqa %xmm3,48-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpunpckldq %xmm9,%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor 
%xmm7,%xmm6,%xmm6 + vmovd -44(%r8),%xmm0 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -44(%r9),%xmm9 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpshufb %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpinsrd $1,-44(%r10),%xmm0,%xmm0 + vpinsrd $1,-44(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpandn %xmm14,%xmm12,%xmm7 + vpand %xmm13,%xmm12,%xmm6 + + vmovdqa %xmm4,64-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpunpckldq %xmm9,%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -40(%r8),%xmm1 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -40(%r9),%xmm9 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpshufb %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpinsrd $1,-40(%r10),%xmm1,%xmm1 + vpinsrd $1,-40(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpandn %xmm13,%xmm11,%xmm7 + vpand %xmm12,%xmm11,%xmm6 + + vmovdqa %xmm0,80-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpunpckldq %xmm9,%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -36(%r8),%xmm2 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -36(%r9),%xmm9 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpshufb %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpinsrd $1,-36(%r10),%xmm2,%xmm2 + vpinsrd $1,-36(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpandn %xmm12,%xmm10,%xmm7 + vpand %xmm11,%xmm10,%xmm6 + + vmovdqa %xmm1,96-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpunpckldq %xmm9,%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -32(%r8),%xmm3 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -32(%r9),%xmm9 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpshufb %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpinsrd $1,-32(%r10),%xmm3,%xmm3 + vpinsrd $1,-32(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpandn %xmm11,%xmm14,%xmm7 + vpand %xmm10,%xmm14,%xmm6 + + vmovdqa %xmm2,112-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpunpckldq %xmm9,%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -28(%r8),%xmm4 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -28(%r9),%xmm9 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpshufb %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpinsrd $1,-28(%r10),%xmm4,%xmm4 + vpinsrd $1,-28(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpandn %xmm10,%xmm13,%xmm7 + vpand %xmm14,%xmm13,%xmm6 + + vmovdqa %xmm3,128-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpunpckldq %xmm9,%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -24(%r8),%xmm0 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -24(%r9),%xmm9 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpshufb %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpinsrd $1,-24(%r10),%xmm0,%xmm0 + vpinsrd $1,-24(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpandn %xmm14,%xmm12,%xmm7 + vpand %xmm13,%xmm12,%xmm6 + + vmovdqa %xmm4,144-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpunpckldq %xmm9,%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -20(%r8),%xmm1 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -20(%r9),%xmm9 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld 
$2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpshufb %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpinsrd $1,-20(%r10),%xmm1,%xmm1 + vpinsrd $1,-20(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpandn %xmm13,%xmm11,%xmm7 + vpand %xmm12,%xmm11,%xmm6 + + vmovdqa %xmm0,160-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpunpckldq %xmm9,%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -16(%r8),%xmm2 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -16(%r9),%xmm9 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpshufb %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpinsrd $1,-16(%r10),%xmm2,%xmm2 + vpinsrd $1,-16(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpandn %xmm12,%xmm10,%xmm7 + vpand %xmm11,%xmm10,%xmm6 + + vmovdqa %xmm1,176-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpunpckldq %xmm9,%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -12(%r8),%xmm3 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -12(%r9),%xmm9 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpshufb %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpinsrd $1,-12(%r10),%xmm3,%xmm3 + vpinsrd $1,-12(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpandn %xmm11,%xmm14,%xmm7 + vpand %xmm10,%xmm14,%xmm6 + + vmovdqa %xmm2,192-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpunpckldq %xmm9,%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -8(%r8),%xmm4 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -8(%r9),%xmm9 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpshufb %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpinsrd $1,-8(%r10),%xmm4,%xmm4 + vpinsrd $1,-8(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpandn %xmm10,%xmm13,%xmm7 + vpand %xmm14,%xmm13,%xmm6 + + vmovdqa %xmm3,208-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpunpckldq %xmm9,%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vmovd -4(%r8),%xmm0 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vmovd -4(%r9),%xmm9 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpshufb %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vmovdqa 0-128(%rax),%xmm1 + vpinsrd $1,-4(%r10),%xmm0,%xmm0 + vpinsrd $1,-4(%r11),%xmm9,%xmm9 + vpaddd %xmm15,%xmm10,%xmm10 + prefetcht0 63(%r8) + vpslld $5,%xmm11,%xmm8 + vpandn %xmm14,%xmm12,%xmm7 + vpand %xmm13,%xmm12,%xmm6 + + vmovdqa %xmm4,224-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpunpckldq %xmm9,%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + prefetcht0 63(%r9) + vpxor %xmm7,%xmm6,%xmm6 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + prefetcht0 63(%r10) + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + prefetcht0 63(%r11) + vpshufb %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vmovdqa 16-128(%rax),%xmm2 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 32-128(%rax),%xmm3 + + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpandn %xmm13,%xmm11,%xmm7 + + vpand %xmm12,%xmm11,%xmm6 + + vmovdqa %xmm0,240-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 128-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + + vpaddd 
%xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 48-128(%rax),%xmm4 + + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpandn %xmm12,%xmm10,%xmm7 + + vpand %xmm11,%xmm10,%xmm6 + + vmovdqa %xmm1,0-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 144-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 64-128(%rax),%xmm0 + + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpandn %xmm11,%xmm14,%xmm7 + + vpand %xmm10,%xmm14,%xmm6 + + vmovdqa %xmm2,16-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 160-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 80-128(%rax),%xmm1 + + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpandn %xmm10,%xmm13,%xmm7 + + vpand %xmm14,%xmm13,%xmm6 + + vmovdqa %xmm3,32-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 176-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 96-128(%rax),%xmm2 + + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpandn %xmm14,%xmm12,%xmm7 + + vpand %xmm13,%xmm12,%xmm6 + + vmovdqa %xmm4,48-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 192-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm7,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vmovdqa 0(%rbp),%xmm15 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 112-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,64-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 208-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 128-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,80-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 224-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor 
%xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 144-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vmovdqa %xmm2,96-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 240-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 160-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vmovdqa %xmm3,112-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 0-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 176-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vmovdqa %xmm4,128-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 16-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 192-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,144-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 32-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 208-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,160-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 48-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 224-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vmovdqa %xmm2,176-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 64-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 240-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vmovdqa %xmm3,192-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + 
vpxor 80-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 0-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vmovdqa %xmm4,208-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 96-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 16-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,224-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 112-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 32-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,240-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 128-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 48-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vmovdqa %xmm2,0-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 144-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 64-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vmovdqa %xmm3,16-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 160-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 80-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vmovdqa %xmm4,32-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 176-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd 
%xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 96-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,48-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 192-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 112-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,64-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 208-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 128-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vmovdqa %xmm2,80-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 224-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 144-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vmovdqa %xmm3,96-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 240-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 160-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vmovdqa %xmm4,112-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 0-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vmovdqa 32(%rbp),%xmm15 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 176-128(%rax),%xmm3 + + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpand %xmm12,%xmm13,%xmm7 + vpxor 16-128(%rax),%xmm1,%xmm1 + + vpaddd %xmm7,%xmm14,%xmm14 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm13,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vmovdqu %xmm0,128-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm1,%xmm5 + vpand %xmm11,%xmm6,%xmm6 + vpaddd %xmm1,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor 
%xmm4,%xmm2,%xmm2 + vmovdqa 192-128(%rax),%xmm4 + + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpand %xmm11,%xmm12,%xmm7 + vpxor 32-128(%rax),%xmm2,%xmm2 + + vpaddd %xmm7,%xmm13,%xmm13 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm12,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vmovdqu %xmm1,144-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm2,%xmm5 + vpand %xmm10,%xmm6,%xmm6 + vpaddd %xmm2,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 208-128(%rax),%xmm0 + + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpand %xmm10,%xmm11,%xmm7 + vpxor 48-128(%rax),%xmm3,%xmm3 + + vpaddd %xmm7,%xmm12,%xmm12 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm11,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vmovdqu %xmm2,160-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm3,%xmm5 + vpand %xmm14,%xmm6,%xmm6 + vpaddd %xmm3,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 224-128(%rax),%xmm1 + + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpand %xmm14,%xmm10,%xmm7 + vpxor 64-128(%rax),%xmm4,%xmm4 + + vpaddd %xmm7,%xmm11,%xmm11 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm10,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vmovdqu %xmm3,176-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm4,%xmm5 + vpand %xmm13,%xmm6,%xmm6 + vpaddd %xmm4,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 240-128(%rax),%xmm2 + + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpand %xmm13,%xmm14,%xmm7 + vpxor 80-128(%rax),%xmm0,%xmm0 + + vpaddd %xmm7,%xmm10,%xmm10 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm14,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vmovdqu %xmm4,192-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm0,%xmm5 + vpand %xmm12,%xmm6,%xmm6 + vpaddd %xmm0,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 0-128(%rax),%xmm3 + + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpand %xmm12,%xmm13,%xmm7 + vpxor 96-128(%rax),%xmm1,%xmm1 + + vpaddd %xmm7,%xmm14,%xmm14 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm13,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vmovdqu %xmm0,208-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm1,%xmm5 + vpand %xmm11,%xmm6,%xmm6 + vpaddd %xmm1,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 16-128(%rax),%xmm4 + + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpand %xmm11,%xmm12,%xmm7 + vpxor 112-128(%rax),%xmm2,%xmm2 + + vpaddd %xmm7,%xmm13,%xmm13 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm12,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vmovdqu %xmm1,224-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm2,%xmm5 + vpand %xmm10,%xmm6,%xmm6 + vpaddd %xmm2,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + 
vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 32-128(%rax),%xmm0 + + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpand %xmm10,%xmm11,%xmm7 + vpxor 128-128(%rax),%xmm3,%xmm3 + + vpaddd %xmm7,%xmm12,%xmm12 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm11,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vmovdqu %xmm2,240-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm3,%xmm5 + vpand %xmm14,%xmm6,%xmm6 + vpaddd %xmm3,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 48-128(%rax),%xmm1 + + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpand %xmm14,%xmm10,%xmm7 + vpxor 144-128(%rax),%xmm4,%xmm4 + + vpaddd %xmm7,%xmm11,%xmm11 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm10,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vmovdqu %xmm3,0-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm4,%xmm5 + vpand %xmm13,%xmm6,%xmm6 + vpaddd %xmm4,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 64-128(%rax),%xmm2 + + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpand %xmm13,%xmm14,%xmm7 + vpxor 160-128(%rax),%xmm0,%xmm0 + + vpaddd %xmm7,%xmm10,%xmm10 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm14,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vmovdqu %xmm4,16-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm0,%xmm5 + vpand %xmm12,%xmm6,%xmm6 + vpaddd %xmm0,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 80-128(%rax),%xmm3 + + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpand %xmm12,%xmm13,%xmm7 + vpxor 176-128(%rax),%xmm1,%xmm1 + + vpaddd %xmm7,%xmm14,%xmm14 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm13,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vmovdqu %xmm0,32-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm1,%xmm5 + vpand %xmm11,%xmm6,%xmm6 + vpaddd %xmm1,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 96-128(%rax),%xmm4 + + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpand %xmm11,%xmm12,%xmm7 + vpxor 192-128(%rax),%xmm2,%xmm2 + + vpaddd %xmm7,%xmm13,%xmm13 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm12,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vmovdqu %xmm1,48-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm2,%xmm5 + vpand %xmm10,%xmm6,%xmm6 + vpaddd %xmm2,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 112-128(%rax),%xmm0 + + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpand %xmm10,%xmm11,%xmm7 + vpxor 208-128(%rax),%xmm3,%xmm3 + + vpaddd %xmm7,%xmm12,%xmm12 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm11,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vmovdqu %xmm2,64-128(%rax) + vpaddd 
%xmm2,%xmm12,%xmm12 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm3,%xmm5 + vpand %xmm14,%xmm6,%xmm6 + vpaddd %xmm3,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 128-128(%rax),%xmm1 + + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpand %xmm14,%xmm10,%xmm7 + vpxor 224-128(%rax),%xmm4,%xmm4 + + vpaddd %xmm7,%xmm11,%xmm11 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm10,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vmovdqu %xmm3,80-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm4,%xmm5 + vpand %xmm13,%xmm6,%xmm6 + vpaddd %xmm4,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 144-128(%rax),%xmm2 + + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpand %xmm13,%xmm14,%xmm7 + vpxor 240-128(%rax),%xmm0,%xmm0 + + vpaddd %xmm7,%xmm10,%xmm10 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm14,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vmovdqu %xmm4,96-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm0,%xmm5 + vpand %xmm12,%xmm6,%xmm6 + vpaddd %xmm0,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 160-128(%rax),%xmm3 + + vpaddd %xmm15,%xmm14,%xmm14 + vpslld $5,%xmm10,%xmm8 + vpand %xmm12,%xmm13,%xmm7 + vpxor 0-128(%rax),%xmm1,%xmm1 + + vpaddd %xmm7,%xmm14,%xmm14 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm13,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vmovdqu %xmm0,112-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm1,%xmm5 + vpand %xmm11,%xmm6,%xmm6 + vpaddd %xmm1,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpaddd %xmm6,%xmm14,%xmm14 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 176-128(%rax),%xmm4 + + vpaddd %xmm15,%xmm13,%xmm13 + vpslld $5,%xmm14,%xmm8 + vpand %xmm11,%xmm12,%xmm7 + vpxor 16-128(%rax),%xmm2,%xmm2 + + vpaddd %xmm7,%xmm13,%xmm13 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm12,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vmovdqu %xmm1,128-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm2,%xmm5 + vpand %xmm10,%xmm6,%xmm6 + vpaddd %xmm2,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpaddd %xmm6,%xmm13,%xmm13 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 192-128(%rax),%xmm0 + + vpaddd %xmm15,%xmm12,%xmm12 + vpslld $5,%xmm13,%xmm8 + vpand %xmm10,%xmm11,%xmm7 + vpxor 32-128(%rax),%xmm3,%xmm3 + + vpaddd %xmm7,%xmm12,%xmm12 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm11,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vmovdqu %xmm2,144-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm3,%xmm5 + vpand %xmm14,%xmm6,%xmm6 + vpaddd %xmm3,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpaddd %xmm6,%xmm12,%xmm12 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 208-128(%rax),%xmm1 + + vpaddd %xmm15,%xmm11,%xmm11 + vpslld $5,%xmm12,%xmm8 + vpand %xmm14,%xmm10,%xmm7 + vpxor 48-128(%rax),%xmm4,%xmm4 + + vpaddd 
%xmm7,%xmm11,%xmm11 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm10,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vmovdqu %xmm3,160-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm4,%xmm5 + vpand %xmm13,%xmm6,%xmm6 + vpaddd %xmm4,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpaddd %xmm6,%xmm11,%xmm11 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 224-128(%rax),%xmm2 + + vpaddd %xmm15,%xmm10,%xmm10 + vpslld $5,%xmm11,%xmm8 + vpand %xmm13,%xmm14,%xmm7 + vpxor 64-128(%rax),%xmm0,%xmm0 + + vpaddd %xmm7,%xmm10,%xmm10 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm14,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vmovdqu %xmm4,176-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpor %xmm9,%xmm8,%xmm8 + vpsrld $31,%xmm0,%xmm5 + vpand %xmm12,%xmm6,%xmm6 + vpaddd %xmm0,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vmovdqa 64(%rbp),%xmm15 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 240-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,192-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 80-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 0-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,208-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 96-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 16-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vmovdqa %xmm2,224-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 112-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 32-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vmovdqa %xmm3,240-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 128-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 48-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vmovdqa %xmm4,0-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 144-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + 
vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 64-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,16-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 160-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 80-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,32-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 176-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 96-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vmovdqa %xmm2,48-128(%rax) + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 192-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 112-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vmovdqa %xmm3,64-128(%rax) + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 208-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 128-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vmovdqa %xmm4,80-128(%rax) + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 224-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 144-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vmovdqa %xmm0,96-128(%rax) + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 240-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd 
%xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 160-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vmovdqa %xmm1,112-128(%rax) + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 0-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 176-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 16-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld $30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 192-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 32-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa 208-128(%rax),%xmm2 + + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + vpaddd %xmm4,%xmm10,%xmm10 + vpxor 48-128(%rax),%xmm0,%xmm0 + vpsrld $27,%xmm11,%xmm9 + vpxor %xmm13,%xmm6,%xmm6 + vpxor %xmm2,%xmm0,%xmm0 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + vpsrld $31,%xmm0,%xmm5 + vpaddd %xmm0,%xmm0,%xmm0 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm5,%xmm0,%xmm0 + vpor %xmm7,%xmm12,%xmm12 + vpxor %xmm3,%xmm1,%xmm1 + vmovdqa 224-128(%rax),%xmm3 + + vpslld $5,%xmm10,%xmm8 + vpaddd %xmm15,%xmm14,%xmm14 + vpxor %xmm11,%xmm13,%xmm6 + vpaddd %xmm0,%xmm14,%xmm14 + vpxor 64-128(%rax),%xmm1,%xmm1 + vpsrld $27,%xmm10,%xmm9 + vpxor %xmm12,%xmm6,%xmm6 + vpxor %xmm3,%xmm1,%xmm1 + + vpslld $30,%xmm11,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm14,%xmm14 + vpsrld $31,%xmm1,%xmm5 + vpaddd %xmm1,%xmm1,%xmm1 + + vpsrld $2,%xmm11,%xmm11 + vpaddd %xmm8,%xmm14,%xmm14 + vpor %xmm5,%xmm1,%xmm1 + vpor %xmm7,%xmm11,%xmm11 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa 240-128(%rax),%xmm4 + + vpslld $5,%xmm14,%xmm8 + vpaddd %xmm15,%xmm13,%xmm13 + vpxor %xmm10,%xmm12,%xmm6 + vpaddd %xmm1,%xmm13,%xmm13 + vpxor 80-128(%rax),%xmm2,%xmm2 + vpsrld $27,%xmm14,%xmm9 + vpxor %xmm11,%xmm6,%xmm6 + vpxor %xmm4,%xmm2,%xmm2 + + vpslld $30,%xmm10,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm13,%xmm13 + vpsrld $31,%xmm2,%xmm5 + vpaddd %xmm2,%xmm2,%xmm2 + + vpsrld $2,%xmm10,%xmm10 + vpaddd %xmm8,%xmm13,%xmm13 + vpor %xmm5,%xmm2,%xmm2 + vpor %xmm7,%xmm10,%xmm10 + vpxor %xmm0,%xmm3,%xmm3 + vmovdqa 0-128(%rax),%xmm0 + + vpslld $5,%xmm13,%xmm8 + vpaddd %xmm15,%xmm12,%xmm12 + vpxor %xmm14,%xmm11,%xmm6 + vpaddd %xmm2,%xmm12,%xmm12 + vpxor 96-128(%rax),%xmm3,%xmm3 + vpsrld $27,%xmm13,%xmm9 + vpxor %xmm10,%xmm6,%xmm6 + vpxor %xmm0,%xmm3,%xmm3 + + vpslld 
$30,%xmm14,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + vpsrld $31,%xmm3,%xmm5 + vpaddd %xmm3,%xmm3,%xmm3 + + vpsrld $2,%xmm14,%xmm14 + vpaddd %xmm8,%xmm12,%xmm12 + vpor %xmm5,%xmm3,%xmm3 + vpor %xmm7,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqa 16-128(%rax),%xmm1 + + vpslld $5,%xmm12,%xmm8 + vpaddd %xmm15,%xmm11,%xmm11 + vpxor %xmm13,%xmm10,%xmm6 + vpaddd %xmm3,%xmm11,%xmm11 + vpxor 112-128(%rax),%xmm4,%xmm4 + vpsrld $27,%xmm12,%xmm9 + vpxor %xmm14,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm4 + + vpslld $30,%xmm13,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm11,%xmm11 + vpsrld $31,%xmm4,%xmm5 + vpaddd %xmm4,%xmm4,%xmm4 + + vpsrld $2,%xmm13,%xmm13 + vpaddd %xmm8,%xmm11,%xmm11 + vpor %xmm5,%xmm4,%xmm4 + vpor %xmm7,%xmm13,%xmm13 + vpslld $5,%xmm11,%xmm8 + vpaddd %xmm15,%xmm10,%xmm10 + vpxor %xmm12,%xmm14,%xmm6 + + vpsrld $27,%xmm11,%xmm9 + vpaddd %xmm4,%xmm10,%xmm10 + vpxor %xmm13,%xmm6,%xmm6 + + vpslld $30,%xmm12,%xmm7 + vpor %xmm9,%xmm8,%xmm8 + vpaddd %xmm6,%xmm10,%xmm10 + + vpsrld $2,%xmm12,%xmm12 + vpaddd %xmm8,%xmm10,%xmm10 + vpor %xmm7,%xmm12,%xmm12 + movl $1,%ecx + cmpl 0(%rbx),%ecx + cmovgeq %rbp,%r8 + cmpl 4(%rbx),%ecx + cmovgeq %rbp,%r9 + cmpl 8(%rbx),%ecx + cmovgeq %rbp,%r10 + cmpl 12(%rbx),%ecx + cmovgeq %rbp,%r11 + vmovdqu (%rbx),%xmm6 + vpxor %xmm8,%xmm8,%xmm8 + vmovdqa %xmm6,%xmm7 + vpcmpgtd %xmm8,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + + vpand %xmm7,%xmm10,%xmm10 + vpand %xmm7,%xmm11,%xmm11 + vpaddd 0(%rdi),%xmm10,%xmm10 + vpand %xmm7,%xmm12,%xmm12 + vpaddd 32(%rdi),%xmm11,%xmm11 + vpand %xmm7,%xmm13,%xmm13 + vpaddd 64(%rdi),%xmm12,%xmm12 + vpand %xmm7,%xmm14,%xmm14 + vpaddd 96(%rdi),%xmm13,%xmm13 + vpaddd 128(%rdi),%xmm14,%xmm14 + vmovdqu %xmm10,0(%rdi) + vmovdqu %xmm11,32(%rdi) + vmovdqu %xmm12,64(%rdi) + vmovdqu %xmm13,96(%rdi) + vmovdqu %xmm14,128(%rdi) + + vmovdqu %xmm6,(%rbx) + vmovdqu 96(%rbp),%xmm5 + decl %edx + jnz .Loop_avx + + movl 280(%rsp),%edx + leaq 16(%rdi),%rdi + leaq 64(%rsi),%rsi + decl %edx + jnz .Loop_grande_avx + +.Ldone_avx: + movq 272(%rsp),%rax +.cfi_def_cfa %rax,8 + vzeroupper + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha1_multi_block_avx,.-sha1_multi_block_avx +.type sha1_multi_block_avx2,@function +.align 32 +sha1_multi_block_avx2: +.cfi_startproc +_avx2_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $576,%rsp + andq $-256,%rsp + movq %rax,544(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08 +.Lbody_avx2: + leaq K_XX_XX(%rip),%rbp + shrl $1,%edx + + vzeroupper +.Loop_grande_avx2: + movl %edx,552(%rsp) + xorl %edx,%edx + leaq 512(%rsp),%rbx + movq 0(%rsi),%r12 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rbp,%r12 + movq 16(%rsi),%r13 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rbp,%r13 + movq 32(%rsi),%r14 + movl 40(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,8(%rbx) + cmovleq %rbp,%r14 + movq 48(%rsi),%r15 + movl 56(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,12(%rbx) + cmovleq %rbp,%r15 + movq 64(%rsi),%r8 + movl 72(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl 
%ecx,%ecx + movl %ecx,16(%rbx) + cmovleq %rbp,%r8 + movq 80(%rsi),%r9 + movl 88(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,20(%rbx) + cmovleq %rbp,%r9 + movq 96(%rsi),%r10 + movl 104(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,24(%rbx) + cmovleq %rbp,%r10 + movq 112(%rsi),%r11 + movl 120(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,28(%rbx) + cmovleq %rbp,%r11 + vmovdqu 0(%rdi),%ymm0 + leaq 128(%rsp),%rax + vmovdqu 32(%rdi),%ymm1 + leaq 256+128(%rsp),%rbx + vmovdqu 64(%rdi),%ymm2 + vmovdqu 96(%rdi),%ymm3 + vmovdqu 128(%rdi),%ymm4 + vmovdqu 96(%rbp),%ymm9 + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + vmovdqa -32(%rbp),%ymm15 + vmovd (%r12),%xmm10 + leaq 64(%r12),%r12 + vmovd (%r8),%xmm12 + leaq 64(%r8),%r8 + vmovd (%r13),%xmm7 + leaq 64(%r13),%r13 + vmovd (%r9),%xmm6 + leaq 64(%r9),%r9 + vpinsrd $1,(%r14),%xmm10,%xmm10 + leaq 64(%r14),%r14 + vpinsrd $1,(%r10),%xmm12,%xmm12 + leaq 64(%r10),%r10 + vpinsrd $1,(%r15),%xmm7,%xmm7 + leaq 64(%r15),%r15 + vpunpckldq %ymm7,%ymm10,%ymm10 + vpinsrd $1,(%r11),%xmm6,%xmm6 + leaq 64(%r11),%r11 + vpunpckldq %ymm6,%ymm12,%ymm12 + vmovd -60(%r12),%xmm11 + vinserti128 $1,%xmm12,%ymm10,%ymm10 + vmovd -60(%r8),%xmm8 + vpshufb %ymm9,%ymm10,%ymm10 + vmovd -60(%r13),%xmm7 + vmovd -60(%r9),%xmm6 + vpinsrd $1,-60(%r14),%xmm11,%xmm11 + vpinsrd $1,-60(%r10),%xmm8,%xmm8 + vpinsrd $1,-60(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm11,%ymm11 + vpinsrd $1,-60(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpandn %ymm3,%ymm1,%ymm6 + vpand %ymm2,%ymm1,%ymm5 + + vmovdqa %ymm10,0-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vinserti128 $1,%xmm8,%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -56(%r12),%xmm12 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -56(%r8),%xmm8 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpshufb %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vmovd -56(%r13),%xmm7 + vmovd -56(%r9),%xmm6 + vpinsrd $1,-56(%r14),%xmm12,%xmm12 + vpinsrd $1,-56(%r10),%xmm8,%xmm8 + vpinsrd $1,-56(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm12,%ymm12 + vpinsrd $1,-56(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpandn %ymm2,%ymm0,%ymm6 + vpand %ymm1,%ymm0,%ymm5 + + vmovdqa %ymm11,32-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vinserti128 $1,%xmm8,%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -52(%r12),%xmm13 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -52(%r8),%xmm8 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpshufb %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vmovd -52(%r13),%xmm7 + vmovd -52(%r9),%xmm6 + vpinsrd $1,-52(%r14),%xmm13,%xmm13 + vpinsrd $1,-52(%r10),%xmm8,%xmm8 + vpinsrd $1,-52(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm13,%ymm13 + vpinsrd $1,-52(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpandn %ymm1,%ymm4,%ymm6 + vpand %ymm0,%ymm4,%ymm5 + + vmovdqa %ymm12,64-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vinserti128 $1,%xmm8,%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -48(%r12),%xmm14 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -48(%r8),%xmm8 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpshufb %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vmovd -48(%r13),%xmm7 + vmovd 
-48(%r9),%xmm6 + vpinsrd $1,-48(%r14),%xmm14,%xmm14 + vpinsrd $1,-48(%r10),%xmm8,%xmm8 + vpinsrd $1,-48(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm14,%ymm14 + vpinsrd $1,-48(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpandn %ymm0,%ymm3,%ymm6 + vpand %ymm4,%ymm3,%ymm5 + + vmovdqa %ymm13,96-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vinserti128 $1,%xmm8,%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -44(%r12),%xmm10 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -44(%r8),%xmm8 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpshufb %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vmovd -44(%r13),%xmm7 + vmovd -44(%r9),%xmm6 + vpinsrd $1,-44(%r14),%xmm10,%xmm10 + vpinsrd $1,-44(%r10),%xmm8,%xmm8 + vpinsrd $1,-44(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm10,%ymm10 + vpinsrd $1,-44(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpandn %ymm4,%ymm2,%ymm6 + vpand %ymm3,%ymm2,%ymm5 + + vmovdqa %ymm14,128-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vinserti128 $1,%xmm8,%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -40(%r12),%xmm11 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -40(%r8),%xmm8 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpshufb %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vmovd -40(%r13),%xmm7 + vmovd -40(%r9),%xmm6 + vpinsrd $1,-40(%r14),%xmm11,%xmm11 + vpinsrd $1,-40(%r10),%xmm8,%xmm8 + vpinsrd $1,-40(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm11,%ymm11 + vpinsrd $1,-40(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpandn %ymm3,%ymm1,%ymm6 + vpand %ymm2,%ymm1,%ymm5 + + vmovdqa %ymm10,160-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vinserti128 $1,%xmm8,%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -36(%r12),%xmm12 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -36(%r8),%xmm8 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpshufb %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vmovd -36(%r13),%xmm7 + vmovd -36(%r9),%xmm6 + vpinsrd $1,-36(%r14),%xmm12,%xmm12 + vpinsrd $1,-36(%r10),%xmm8,%xmm8 + vpinsrd $1,-36(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm12,%ymm12 + vpinsrd $1,-36(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpandn %ymm2,%ymm0,%ymm6 + vpand %ymm1,%ymm0,%ymm5 + + vmovdqa %ymm11,192-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vinserti128 $1,%xmm8,%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -32(%r12),%xmm13 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -32(%r8),%xmm8 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpshufb %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vmovd -32(%r13),%xmm7 + vmovd -32(%r9),%xmm6 + vpinsrd $1,-32(%r14),%xmm13,%xmm13 + vpinsrd $1,-32(%r10),%xmm8,%xmm8 + vpinsrd $1,-32(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm13,%ymm13 + vpinsrd $1,-32(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpandn %ymm1,%ymm4,%ymm6 + vpand %ymm0,%ymm4,%ymm5 + + vmovdqa %ymm12,224-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vinserti128 $1,%xmm8,%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -28(%r12),%xmm14 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -28(%r8),%xmm8 + vpaddd 
%ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpshufb %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vmovd -28(%r13),%xmm7 + vmovd -28(%r9),%xmm6 + vpinsrd $1,-28(%r14),%xmm14,%xmm14 + vpinsrd $1,-28(%r10),%xmm8,%xmm8 + vpinsrd $1,-28(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm14,%ymm14 + vpinsrd $1,-28(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpandn %ymm0,%ymm3,%ymm6 + vpand %ymm4,%ymm3,%ymm5 + + vmovdqa %ymm13,256-256-128(%rbx) + vpaddd %ymm13,%ymm1,%ymm1 + vinserti128 $1,%xmm8,%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -24(%r12),%xmm10 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -24(%r8),%xmm8 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpshufb %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vmovd -24(%r13),%xmm7 + vmovd -24(%r9),%xmm6 + vpinsrd $1,-24(%r14),%xmm10,%xmm10 + vpinsrd $1,-24(%r10),%xmm8,%xmm8 + vpinsrd $1,-24(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm10,%ymm10 + vpinsrd $1,-24(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpandn %ymm4,%ymm2,%ymm6 + vpand %ymm3,%ymm2,%ymm5 + + vmovdqa %ymm14,288-256-128(%rbx) + vpaddd %ymm14,%ymm0,%ymm0 + vinserti128 $1,%xmm8,%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -20(%r12),%xmm11 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -20(%r8),%xmm8 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpshufb %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vmovd -20(%r13),%xmm7 + vmovd -20(%r9),%xmm6 + vpinsrd $1,-20(%r14),%xmm11,%xmm11 + vpinsrd $1,-20(%r10),%xmm8,%xmm8 + vpinsrd $1,-20(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm11,%ymm11 + vpinsrd $1,-20(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpandn %ymm3,%ymm1,%ymm6 + vpand %ymm2,%ymm1,%ymm5 + + vmovdqa %ymm10,320-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vinserti128 $1,%xmm8,%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -16(%r12),%xmm12 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -16(%r8),%xmm8 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpshufb %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vmovd -16(%r13),%xmm7 + vmovd -16(%r9),%xmm6 + vpinsrd $1,-16(%r14),%xmm12,%xmm12 + vpinsrd $1,-16(%r10),%xmm8,%xmm8 + vpinsrd $1,-16(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm12,%ymm12 + vpinsrd $1,-16(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpandn %ymm2,%ymm0,%ymm6 + vpand %ymm1,%ymm0,%ymm5 + + vmovdqa %ymm11,352-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vinserti128 $1,%xmm8,%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -12(%r12),%xmm13 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -12(%r8),%xmm8 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpshufb %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vmovd -12(%r13),%xmm7 + vmovd -12(%r9),%xmm6 + vpinsrd $1,-12(%r14),%xmm13,%xmm13 + vpinsrd $1,-12(%r10),%xmm8,%xmm8 + vpinsrd $1,-12(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm13,%ymm13 + vpinsrd $1,-12(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpandn %ymm1,%ymm4,%ymm6 + vpand %ymm0,%ymm4,%ymm5 + + vmovdqa %ymm12,384-256-128(%rbx) + vpaddd %ymm12,%ymm2,%ymm2 + vinserti128 
$1,%xmm8,%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -8(%r12),%xmm14 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -8(%r8),%xmm8 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpshufb %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vmovd -8(%r13),%xmm7 + vmovd -8(%r9),%xmm6 + vpinsrd $1,-8(%r14),%xmm14,%xmm14 + vpinsrd $1,-8(%r10),%xmm8,%xmm8 + vpinsrd $1,-8(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm14,%ymm14 + vpinsrd $1,-8(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpandn %ymm0,%ymm3,%ymm6 + vpand %ymm4,%ymm3,%ymm5 + + vmovdqa %ymm13,416-256-128(%rbx) + vpaddd %ymm13,%ymm1,%ymm1 + vinserti128 $1,%xmm8,%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vmovd -4(%r12),%xmm10 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vmovd -4(%r8),%xmm8 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpshufb %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vmovdqa 0-128(%rax),%ymm11 + vmovd -4(%r13),%xmm7 + vmovd -4(%r9),%xmm6 + vpinsrd $1,-4(%r14),%xmm10,%xmm10 + vpinsrd $1,-4(%r10),%xmm8,%xmm8 + vpinsrd $1,-4(%r15),%xmm7,%xmm7 + vpunpckldq %ymm7,%ymm10,%ymm10 + vpinsrd $1,-4(%r11),%xmm6,%xmm6 + vpunpckldq %ymm6,%ymm8,%ymm8 + vpaddd %ymm15,%ymm0,%ymm0 + prefetcht0 63(%r12) + vpslld $5,%ymm1,%ymm7 + vpandn %ymm4,%ymm2,%ymm6 + vpand %ymm3,%ymm2,%ymm5 + + vmovdqa %ymm14,448-256-128(%rbx) + vpaddd %ymm14,%ymm0,%ymm0 + vinserti128 $1,%xmm8,%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + prefetcht0 63(%r13) + vpxor %ymm6,%ymm5,%ymm5 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + prefetcht0 63(%r14) + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + prefetcht0 63(%r15) + vpshufb %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vmovdqa 32-128(%rax),%ymm12 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 64-128(%rax),%ymm13 + + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpandn %ymm3,%ymm1,%ymm6 + prefetcht0 63(%r8) + vpand %ymm2,%ymm1,%ymm5 + + vmovdqa %ymm10,480-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 256-256-128(%rbx),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + prefetcht0 63(%r9) + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + prefetcht0 63(%r10) + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + prefetcht0 63(%r11) + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 96-128(%rax),%ymm14 + + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpandn %ymm2,%ymm0,%ymm6 + + vpand %ymm1,%ymm0,%ymm5 + + vmovdqa %ymm11,0-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 288-256-128(%rbx),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 128-128(%rax),%ymm10 + + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpandn %ymm1,%ymm4,%ymm6 + + vpand %ymm0,%ymm4,%ymm5 + + vmovdqa %ymm12,32-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 320-256-128(%rbx),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + + vpslld $30,%ymm4,%ymm6 + vpor 
%ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 160-128(%rax),%ymm11 + + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpandn %ymm0,%ymm3,%ymm6 + + vpand %ymm4,%ymm3,%ymm5 + + vmovdqa %ymm13,64-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 352-256-128(%rbx),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 192-128(%rax),%ymm12 + + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpandn %ymm4,%ymm2,%ymm6 + + vpand %ymm3,%ymm2,%ymm5 + + vmovdqa %ymm14,96-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 384-256-128(%rbx),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm6,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vmovdqa 0(%rbp),%ymm15 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 224-128(%rax),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,128-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 416-256-128(%rbx),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 256-256-128(%rbx),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,160-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 448-256-128(%rbx),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 288-256-128(%rbx),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vmovdqa %ymm12,192-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 480-256-128(%rbx),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 320-256-128(%rbx),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vmovdqa %ymm13,224-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 0-128(%rax),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd 
%ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 352-256-128(%rbx),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vmovdqa %ymm14,256-256-128(%rbx) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 32-128(%rax),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 384-256-128(%rbx),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,288-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 64-128(%rax),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 416-256-128(%rbx),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,320-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 96-128(%rax),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 448-256-128(%rbx),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vmovdqa %ymm12,352-256-128(%rbx) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 128-128(%rax),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 480-256-128(%rbx),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vmovdqa %ymm13,384-256-128(%rbx) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 160-128(%rax),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 0-128(%rax),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vmovdqa %ymm14,416-256-128(%rbx) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 192-128(%rax),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 32-128(%rax),%ymm13 
+ + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,448-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 224-128(%rax),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 64-128(%rax),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,480-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 256-256-128(%rbx),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 96-128(%rax),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vmovdqa %ymm12,0-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 288-256-128(%rbx),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 128-128(%rax),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vmovdqa %ymm13,32-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 320-256-128(%rbx),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 160-128(%rax),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vmovdqa %ymm14,64-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 352-256-128(%rbx),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 192-128(%rax),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,96-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 384-256-128(%rbx),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 224-128(%rax),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,128-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 416-256-128(%rbx),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 
+ vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 256-256-128(%rbx),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vmovdqa %ymm12,160-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 448-256-128(%rbx),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 288-256-128(%rbx),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vmovdqa %ymm13,192-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 480-256-128(%rbx),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 320-256-128(%rbx),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vmovdqa %ymm14,224-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 0-128(%rax),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vmovdqa 32(%rbp),%ymm15 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 352-256-128(%rbx),%ymm13 + + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpand %ymm2,%ymm3,%ymm6 + vpxor 32-128(%rax),%ymm11,%ymm11 + + vpaddd %ymm6,%ymm4,%ymm4 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm3,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vmovdqu %ymm10,256-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm11,%ymm9 + vpand %ymm1,%ymm5,%ymm5 + vpaddd %ymm11,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 384-256-128(%rbx),%ymm14 + + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpand %ymm1,%ymm2,%ymm6 + vpxor 64-128(%rax),%ymm12,%ymm12 + + vpaddd %ymm6,%ymm3,%ymm3 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm2,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vmovdqu %ymm11,288-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm12,%ymm9 + vpand %ymm0,%ymm5,%ymm5 + vpaddd %ymm12,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 416-256-128(%rbx),%ymm10 + + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpand %ymm0,%ymm1,%ymm6 + vpxor 96-128(%rax),%ymm13,%ymm13 + + vpaddd %ymm6,%ymm2,%ymm2 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm1,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vmovdqu 
%ymm12,320-256-128(%rbx) + vpaddd %ymm12,%ymm2,%ymm2 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm13,%ymm9 + vpand %ymm4,%ymm5,%ymm5 + vpaddd %ymm13,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 448-256-128(%rbx),%ymm11 + + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpand %ymm4,%ymm0,%ymm6 + vpxor 128-128(%rax),%ymm14,%ymm14 + + vpaddd %ymm6,%ymm1,%ymm1 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm0,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vmovdqu %ymm13,352-256-128(%rbx) + vpaddd %ymm13,%ymm1,%ymm1 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm14,%ymm9 + vpand %ymm3,%ymm5,%ymm5 + vpaddd %ymm14,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 480-256-128(%rbx),%ymm12 + + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpand %ymm3,%ymm4,%ymm6 + vpxor 160-128(%rax),%ymm10,%ymm10 + + vpaddd %ymm6,%ymm0,%ymm0 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm4,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vmovdqu %ymm14,384-256-128(%rbx) + vpaddd %ymm14,%ymm0,%ymm0 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm10,%ymm9 + vpand %ymm2,%ymm5,%ymm5 + vpaddd %ymm10,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 0-128(%rax),%ymm13 + + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpand %ymm2,%ymm3,%ymm6 + vpxor 192-128(%rax),%ymm11,%ymm11 + + vpaddd %ymm6,%ymm4,%ymm4 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm3,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vmovdqu %ymm10,416-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm11,%ymm9 + vpand %ymm1,%ymm5,%ymm5 + vpaddd %ymm11,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 32-128(%rax),%ymm14 + + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpand %ymm1,%ymm2,%ymm6 + vpxor 224-128(%rax),%ymm12,%ymm12 + + vpaddd %ymm6,%ymm3,%ymm3 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm2,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vmovdqu %ymm11,448-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm12,%ymm9 + vpand %ymm0,%ymm5,%ymm5 + vpaddd %ymm12,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 64-128(%rax),%ymm10 + + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpand %ymm0,%ymm1,%ymm6 + vpxor 256-256-128(%rbx),%ymm13,%ymm13 + + vpaddd %ymm6,%ymm2,%ymm2 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm1,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vmovdqu %ymm12,480-256-128(%rbx) + vpaddd %ymm12,%ymm2,%ymm2 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm13,%ymm9 + vpand %ymm4,%ymm5,%ymm5 + vpaddd %ymm13,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 96-128(%rax),%ymm11 + + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpand %ymm4,%ymm0,%ymm6 + vpxor 
288-256-128(%rbx),%ymm14,%ymm14 + + vpaddd %ymm6,%ymm1,%ymm1 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm0,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vmovdqu %ymm13,0-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm14,%ymm9 + vpand %ymm3,%ymm5,%ymm5 + vpaddd %ymm14,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 128-128(%rax),%ymm12 + + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpand %ymm3,%ymm4,%ymm6 + vpxor 320-256-128(%rbx),%ymm10,%ymm10 + + vpaddd %ymm6,%ymm0,%ymm0 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm4,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vmovdqu %ymm14,32-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm10,%ymm9 + vpand %ymm2,%ymm5,%ymm5 + vpaddd %ymm10,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 160-128(%rax),%ymm13 + + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpand %ymm2,%ymm3,%ymm6 + vpxor 352-256-128(%rbx),%ymm11,%ymm11 + + vpaddd %ymm6,%ymm4,%ymm4 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm3,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vmovdqu %ymm10,64-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm11,%ymm9 + vpand %ymm1,%ymm5,%ymm5 + vpaddd %ymm11,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 192-128(%rax),%ymm14 + + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpand %ymm1,%ymm2,%ymm6 + vpxor 384-256-128(%rbx),%ymm12,%ymm12 + + vpaddd %ymm6,%ymm3,%ymm3 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm2,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vmovdqu %ymm11,96-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm12,%ymm9 + vpand %ymm0,%ymm5,%ymm5 + vpaddd %ymm12,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 224-128(%rax),%ymm10 + + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpand %ymm0,%ymm1,%ymm6 + vpxor 416-256-128(%rbx),%ymm13,%ymm13 + + vpaddd %ymm6,%ymm2,%ymm2 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm1,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vmovdqu %ymm12,128-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm13,%ymm9 + vpand %ymm4,%ymm5,%ymm5 + vpaddd %ymm13,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 256-256-128(%rbx),%ymm11 + + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpand %ymm4,%ymm0,%ymm6 + vpxor 448-256-128(%rbx),%ymm14,%ymm14 + + vpaddd %ymm6,%ymm1,%ymm1 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm0,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vmovdqu %ymm13,160-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm14,%ymm9 + vpand %ymm3,%ymm5,%ymm5 + vpaddd %ymm14,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor 
%ymm12,%ymm10,%ymm10 + vmovdqa 288-256-128(%rbx),%ymm12 + + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpand %ymm3,%ymm4,%ymm6 + vpxor 480-256-128(%rbx),%ymm10,%ymm10 + + vpaddd %ymm6,%ymm0,%ymm0 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm4,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vmovdqu %ymm14,192-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm10,%ymm9 + vpand %ymm2,%ymm5,%ymm5 + vpaddd %ymm10,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 320-256-128(%rbx),%ymm13 + + vpaddd %ymm15,%ymm4,%ymm4 + vpslld $5,%ymm0,%ymm7 + vpand %ymm2,%ymm3,%ymm6 + vpxor 0-128(%rax),%ymm11,%ymm11 + + vpaddd %ymm6,%ymm4,%ymm4 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm3,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vmovdqu %ymm10,224-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm11,%ymm9 + vpand %ymm1,%ymm5,%ymm5 + vpaddd %ymm11,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpaddd %ymm5,%ymm4,%ymm4 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 352-256-128(%rbx),%ymm14 + + vpaddd %ymm15,%ymm3,%ymm3 + vpslld $5,%ymm4,%ymm7 + vpand %ymm1,%ymm2,%ymm6 + vpxor 32-128(%rax),%ymm12,%ymm12 + + vpaddd %ymm6,%ymm3,%ymm3 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm2,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vmovdqu %ymm11,256-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm12,%ymm9 + vpand %ymm0,%ymm5,%ymm5 + vpaddd %ymm12,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpaddd %ymm5,%ymm3,%ymm3 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 384-256-128(%rbx),%ymm10 + + vpaddd %ymm15,%ymm2,%ymm2 + vpslld $5,%ymm3,%ymm7 + vpand %ymm0,%ymm1,%ymm6 + vpxor 64-128(%rax),%ymm13,%ymm13 + + vpaddd %ymm6,%ymm2,%ymm2 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm1,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vmovdqu %ymm12,288-256-128(%rbx) + vpaddd %ymm12,%ymm2,%ymm2 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm13,%ymm9 + vpand %ymm4,%ymm5,%ymm5 + vpaddd %ymm13,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpaddd %ymm5,%ymm2,%ymm2 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 416-256-128(%rbx),%ymm11 + + vpaddd %ymm15,%ymm1,%ymm1 + vpslld $5,%ymm2,%ymm7 + vpand %ymm4,%ymm0,%ymm6 + vpxor 96-128(%rax),%ymm14,%ymm14 + + vpaddd %ymm6,%ymm1,%ymm1 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm0,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vmovdqu %ymm13,320-256-128(%rbx) + vpaddd %ymm13,%ymm1,%ymm1 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm14,%ymm9 + vpand %ymm3,%ymm5,%ymm5 + vpaddd %ymm14,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 448-256-128(%rbx),%ymm12 + + vpaddd %ymm15,%ymm0,%ymm0 + vpslld $5,%ymm1,%ymm7 + vpand %ymm3,%ymm4,%ymm6 + vpxor 128-128(%rax),%ymm10,%ymm10 + + vpaddd %ymm6,%ymm0,%ymm0 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm4,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vmovdqu %ymm14,352-256-128(%rbx) + vpaddd %ymm14,%ymm0,%ymm0 + vpor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm10,%ymm9 + vpand %ymm2,%ymm5,%ymm5 + vpaddd %ymm10,%ymm10,%ymm10 + + vpslld 
$30,%ymm2,%ymm6 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vmovdqa 64(%rbp),%ymm15 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 480-256-128(%rbx),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,384-256-128(%rbx) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 160-128(%rax),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 0-128(%rax),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,416-256-128(%rbx) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 192-128(%rax),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 32-128(%rax),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vmovdqa %ymm12,448-256-128(%rbx) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 224-128(%rax),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 64-128(%rax),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vmovdqa %ymm13,480-256-128(%rbx) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 256-256-128(%rbx),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 96-128(%rax),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vmovdqa %ymm14,0-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 288-256-128(%rbx),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 128-128(%rax),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,32-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 320-256-128(%rbx),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor 
%ymm14,%ymm12,%ymm12 + vmovdqa 160-128(%rax),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,64-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 352-256-128(%rbx),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 192-128(%rax),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vmovdqa %ymm12,96-128(%rax) + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 384-256-128(%rbx),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 224-128(%rax),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vmovdqa %ymm13,128-128(%rax) + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 416-256-128(%rbx),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 256-256-128(%rbx),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vmovdqa %ymm14,160-128(%rax) + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 448-256-128(%rbx),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 288-256-128(%rbx),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vmovdqa %ymm10,192-128(%rax) + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 480-256-128(%rbx),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 320-256-128(%rbx),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vmovdqa %ymm11,224-128(%rax) + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 0-128(%rax),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 352-256-128(%rbx),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 
32-128(%rax),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 384-256-128(%rbx),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 64-128(%rax),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpxor %ymm12,%ymm10,%ymm10 + vmovdqa 416-256-128(%rbx),%ymm12 + + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 + vpxor %ymm2,%ymm4,%ymm5 + vpaddd %ymm14,%ymm0,%ymm0 + vpxor 96-128(%rax),%ymm10,%ymm10 + vpsrld $27,%ymm1,%ymm8 + vpxor %ymm3,%ymm5,%ymm5 + vpxor %ymm12,%ymm10,%ymm10 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + vpsrld $31,%ymm10,%ymm9 + vpaddd %ymm10,%ymm10,%ymm10 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm9,%ymm10,%ymm10 + vpor %ymm6,%ymm2,%ymm2 + vpxor %ymm13,%ymm11,%ymm11 + vmovdqa 448-256-128(%rbx),%ymm13 + + vpslld $5,%ymm0,%ymm7 + vpaddd %ymm15,%ymm4,%ymm4 + vpxor %ymm1,%ymm3,%ymm5 + vpaddd %ymm10,%ymm4,%ymm4 + vpxor 128-128(%rax),%ymm11,%ymm11 + vpsrld $27,%ymm0,%ymm8 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm13,%ymm11,%ymm11 + + vpslld $30,%ymm1,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm4,%ymm4 + vpsrld $31,%ymm11,%ymm9 + vpaddd %ymm11,%ymm11,%ymm11 + + vpsrld $2,%ymm1,%ymm1 + vpaddd %ymm7,%ymm4,%ymm4 + vpor %ymm9,%ymm11,%ymm11 + vpor %ymm6,%ymm1,%ymm1 + vpxor %ymm14,%ymm12,%ymm12 + vmovdqa 480-256-128(%rbx),%ymm14 + + vpslld $5,%ymm4,%ymm7 + vpaddd %ymm15,%ymm3,%ymm3 + vpxor %ymm0,%ymm2,%ymm5 + vpaddd %ymm11,%ymm3,%ymm3 + vpxor 160-128(%rax),%ymm12,%ymm12 + vpsrld $27,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm14,%ymm12,%ymm12 + + vpslld $30,%ymm0,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm3,%ymm3 + vpsrld $31,%ymm12,%ymm9 + vpaddd %ymm12,%ymm12,%ymm12 + + vpsrld $2,%ymm0,%ymm0 + vpaddd %ymm7,%ymm3,%ymm3 + vpor %ymm9,%ymm12,%ymm12 + vpor %ymm6,%ymm0,%ymm0 + vpxor %ymm10,%ymm13,%ymm13 + vmovdqa 0-128(%rax),%ymm10 + + vpslld $5,%ymm3,%ymm7 + vpaddd %ymm15,%ymm2,%ymm2 + vpxor %ymm4,%ymm1,%ymm5 + vpaddd %ymm12,%ymm2,%ymm2 + vpxor 192-128(%rax),%ymm13,%ymm13 + vpsrld $27,%ymm3,%ymm8 + vpxor %ymm0,%ymm5,%ymm5 + vpxor %ymm10,%ymm13,%ymm13 + + vpslld $30,%ymm4,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm2,%ymm2 + vpsrld $31,%ymm13,%ymm9 + vpaddd %ymm13,%ymm13,%ymm13 + + vpsrld $2,%ymm4,%ymm4 + vpaddd %ymm7,%ymm2,%ymm2 + vpor %ymm9,%ymm13,%ymm13 + vpor %ymm6,%ymm4,%ymm4 + vpxor %ymm11,%ymm14,%ymm14 + vmovdqa 32-128(%rax),%ymm11 + + vpslld $5,%ymm2,%ymm7 + vpaddd %ymm15,%ymm1,%ymm1 + vpxor %ymm3,%ymm0,%ymm5 + vpaddd %ymm13,%ymm1,%ymm1 + vpxor 224-128(%rax),%ymm14,%ymm14 + vpsrld $27,%ymm2,%ymm8 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm11,%ymm14,%ymm14 + + vpslld $30,%ymm3,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm1,%ymm1 + vpsrld $31,%ymm14,%ymm9 + vpaddd %ymm14,%ymm14,%ymm14 + + vpsrld $2,%ymm3,%ymm3 + vpaddd %ymm7,%ymm1,%ymm1 + vpor %ymm9,%ymm14,%ymm14 + vpor %ymm6,%ymm3,%ymm3 + vpslld $5,%ymm1,%ymm7 + vpaddd %ymm15,%ymm0,%ymm0 
+ vpxor %ymm2,%ymm4,%ymm5 + + vpsrld $27,%ymm1,%ymm8 + vpaddd %ymm14,%ymm0,%ymm0 + vpxor %ymm3,%ymm5,%ymm5 + + vpslld $30,%ymm2,%ymm6 + vpor %ymm8,%ymm7,%ymm7 + vpaddd %ymm5,%ymm0,%ymm0 + + vpsrld $2,%ymm2,%ymm2 + vpaddd %ymm7,%ymm0,%ymm0 + vpor %ymm6,%ymm2,%ymm2 + movl $1,%ecx + leaq 512(%rsp),%rbx + cmpl 0(%rbx),%ecx + cmovgeq %rbp,%r12 + cmpl 4(%rbx),%ecx + cmovgeq %rbp,%r13 + cmpl 8(%rbx),%ecx + cmovgeq %rbp,%r14 + cmpl 12(%rbx),%ecx + cmovgeq %rbp,%r15 + cmpl 16(%rbx),%ecx + cmovgeq %rbp,%r8 + cmpl 20(%rbx),%ecx + cmovgeq %rbp,%r9 + cmpl 24(%rbx),%ecx + cmovgeq %rbp,%r10 + cmpl 28(%rbx),%ecx + cmovgeq %rbp,%r11 + vmovdqu (%rbx),%ymm5 + vpxor %ymm7,%ymm7,%ymm7 + vmovdqa %ymm5,%ymm6 + vpcmpgtd %ymm7,%ymm6,%ymm6 + vpaddd %ymm6,%ymm5,%ymm5 + + vpand %ymm6,%ymm0,%ymm0 + vpand %ymm6,%ymm1,%ymm1 + vpaddd 0(%rdi),%ymm0,%ymm0 + vpand %ymm6,%ymm2,%ymm2 + vpaddd 32(%rdi),%ymm1,%ymm1 + vpand %ymm6,%ymm3,%ymm3 + vpaddd 64(%rdi),%ymm2,%ymm2 + vpand %ymm6,%ymm4,%ymm4 + vpaddd 96(%rdi),%ymm3,%ymm3 + vpaddd 128(%rdi),%ymm4,%ymm4 + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm1,32(%rdi) + vmovdqu %ymm2,64(%rdi) + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,128(%rdi) + + vmovdqu %ymm5,(%rbx) + leaq 256+128(%rsp),%rbx + vmovdqu 96(%rbp),%ymm9 + decl %edx + jnz .Loop_avx2 + + + + + + + +.Ldone_avx2: + movq 544(%rsp),%rax +.cfi_def_cfa %rax,8 + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha1_multi_block_avx2,.-sha1_multi_block_avx2 .align 256 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 K_XX_XX: .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 Index: stable/12/secure/lib/libcrypto/amd64/sha1-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/sha1-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/sha1-x86_64.S (revision 364963) @@ -1,2623 +1,5452 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from sha1-x86_64.pl. 
*/ .text .globl sha1_block_data_order .type sha1_block_data_order,@function .align 16 sha1_block_data_order: .cfi_startproc movl OPENSSL_ia32cap_P+0(%rip),%r9d movl OPENSSL_ia32cap_P+4(%rip),%r8d movl OPENSSL_ia32cap_P+8(%rip),%r10d testl $512,%r8d jz .Lialu testl $536870912,%r10d jnz _shaext_shortcut + andl $296,%r10d + cmpl $296,%r10d + je _avx2_shortcut + andl $268435456,%r8d + andl $1073741824,%r9d + orl %r9d,%r8d + cmpl $1342177280,%r8d + je _avx_shortcut jmp _ssse3_shortcut .align 16 .Lialu: movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 movq %rdi,%r8 subq $72,%rsp movq %rsi,%r9 andq $-64,%rsp movq %rdx,%r10 movq %rax,64(%rsp) .cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08 .Lprologue: movl 0(%r8),%esi movl 4(%r8),%edi movl 8(%r8),%r11d movl 12(%r8),%r12d movl 16(%r8),%r13d jmp .Lloop .align 16 .Lloop: movl 0(%r9),%edx bswapl %edx movl 4(%r9),%ebp movl %r12d,%eax movl %edx,0(%rsp) movl %esi,%ecx bswapl %ebp xorl %r11d,%eax roll $5,%ecx andl %edi,%eax leal 1518500249(%rdx,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d movl 8(%r9),%r14d movl %r11d,%eax movl %ebp,4(%rsp) movl %r13d,%ecx bswapl %r14d xorl %edi,%eax roll $5,%ecx andl %esi,%eax leal 1518500249(%rbp,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d movl 12(%r9),%edx movl %edi,%eax movl %r14d,8(%rsp) movl %r12d,%ecx bswapl %edx xorl %esi,%eax roll $5,%ecx andl %r13d,%eax leal 1518500249(%r14,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d movl 16(%r9),%ebp movl %esi,%eax movl %edx,12(%rsp) movl %r11d,%ecx bswapl %ebp xorl %r13d,%eax roll $5,%ecx andl %r12d,%eax leal 1518500249(%rdx,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi movl 20(%r9),%r14d movl %r13d,%eax movl %ebp,16(%rsp) movl %edi,%ecx bswapl %r14d xorl %r12d,%eax roll $5,%ecx andl %r11d,%eax leal 1518500249(%rbp,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi movl 24(%r9),%edx movl %r12d,%eax movl %r14d,20(%rsp) movl %esi,%ecx bswapl %edx xorl %r11d,%eax roll $5,%ecx andl %edi,%eax leal 1518500249(%r14,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d movl 28(%r9),%ebp movl %r11d,%eax movl %edx,24(%rsp) movl %r13d,%ecx bswapl %ebp xorl %edi,%eax roll $5,%ecx andl %esi,%eax leal 1518500249(%rdx,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d movl 32(%r9),%r14d movl %edi,%eax movl %ebp,28(%rsp) movl %r12d,%ecx bswapl %r14d xorl %esi,%eax roll $5,%ecx andl %r13d,%eax leal 1518500249(%rbp,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d movl 36(%r9),%edx movl %esi,%eax movl %r14d,32(%rsp) movl %r11d,%ecx bswapl %edx xorl %r13d,%eax roll $5,%ecx andl %r12d,%eax leal 1518500249(%r14,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi movl 40(%r9),%ebp movl %r13d,%eax movl %edx,36(%rsp) movl %edi,%ecx bswapl %ebp xorl %r12d,%eax roll $5,%ecx andl %r11d,%eax leal 1518500249(%rdx,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi movl 44(%r9),%r14d movl %r12d,%eax movl %ebp,40(%rsp) movl %esi,%ecx bswapl %r14d xorl %r11d,%eax roll $5,%ecx andl %edi,%eax leal 1518500249(%rbp,%r13,1),%r13d addl %ecx,%r13d xorl %r12d,%eax roll $30,%edi addl %eax,%r13d movl 48(%r9),%edx movl %r11d,%eax movl %r14d,44(%rsp) movl %r13d,%ecx bswapl %edx xorl %edi,%eax roll $5,%ecx andl 
%esi,%eax leal 1518500249(%r14,%r12,1),%r12d addl %ecx,%r12d xorl %r11d,%eax roll $30,%esi addl %eax,%r12d movl 52(%r9),%ebp movl %edi,%eax movl %edx,48(%rsp) movl %r12d,%ecx bswapl %ebp xorl %esi,%eax roll $5,%ecx andl %r13d,%eax leal 1518500249(%rdx,%r11,1),%r11d addl %ecx,%r11d xorl %edi,%eax roll $30,%r13d addl %eax,%r11d movl 56(%r9),%r14d movl %esi,%eax movl %ebp,52(%rsp) movl %r11d,%ecx bswapl %r14d xorl %r13d,%eax roll $5,%ecx andl %r12d,%eax leal 1518500249(%rbp,%rdi,1),%edi addl %ecx,%edi xorl %esi,%eax roll $30,%r12d addl %eax,%edi movl 60(%r9),%edx movl %r13d,%eax movl %r14d,56(%rsp) movl %edi,%ecx bswapl %edx xorl %r12d,%eax roll $5,%ecx andl %r11d,%eax leal 1518500249(%r14,%rsi,1),%esi addl %ecx,%esi xorl %r13d,%eax roll $30,%r11d addl %eax,%esi xorl 0(%rsp),%ebp movl %r12d,%eax movl %edx,60(%rsp) movl %esi,%ecx xorl 8(%rsp),%ebp xorl %r11d,%eax roll $5,%ecx xorl 32(%rsp),%ebp andl %edi,%eax leal 1518500249(%rdx,%r13,1),%r13d roll $30,%edi xorl %r12d,%eax addl %ecx,%r13d roll $1,%ebp addl %eax,%r13d xorl 4(%rsp),%r14d movl %r11d,%eax movl %ebp,0(%rsp) movl %r13d,%ecx xorl 12(%rsp),%r14d xorl %edi,%eax roll $5,%ecx xorl 36(%rsp),%r14d andl %esi,%eax leal 1518500249(%rbp,%r12,1),%r12d roll $30,%esi xorl %r11d,%eax addl %ecx,%r12d roll $1,%r14d addl %eax,%r12d xorl 8(%rsp),%edx movl %edi,%eax movl %r14d,4(%rsp) movl %r12d,%ecx xorl 16(%rsp),%edx xorl %esi,%eax roll $5,%ecx xorl 40(%rsp),%edx andl %r13d,%eax leal 1518500249(%r14,%r11,1),%r11d roll $30,%r13d xorl %edi,%eax addl %ecx,%r11d roll $1,%edx addl %eax,%r11d xorl 12(%rsp),%ebp movl %esi,%eax movl %edx,8(%rsp) movl %r11d,%ecx xorl 20(%rsp),%ebp xorl %r13d,%eax roll $5,%ecx xorl 44(%rsp),%ebp andl %r12d,%eax leal 1518500249(%rdx,%rdi,1),%edi roll $30,%r12d xorl %esi,%eax addl %ecx,%edi roll $1,%ebp addl %eax,%edi xorl 16(%rsp),%r14d movl %r13d,%eax movl %ebp,12(%rsp) movl %edi,%ecx xorl 24(%rsp),%r14d xorl %r12d,%eax roll $5,%ecx xorl 48(%rsp),%r14d andl %r11d,%eax leal 1518500249(%rbp,%rsi,1),%esi roll $30,%r11d xorl %r13d,%eax addl %ecx,%esi roll $1,%r14d addl %eax,%esi xorl 20(%rsp),%edx movl %edi,%eax movl %r14d,16(%rsp) movl %esi,%ecx xorl 28(%rsp),%edx xorl %r12d,%eax roll $5,%ecx xorl 52(%rsp),%edx leal 1859775393(%r14,%r13,1),%r13d xorl %r11d,%eax addl %ecx,%r13d roll $30,%edi addl %eax,%r13d roll $1,%edx xorl 24(%rsp),%ebp movl %esi,%eax movl %edx,20(%rsp) movl %r13d,%ecx xorl 32(%rsp),%ebp xorl %r11d,%eax roll $5,%ecx xorl 56(%rsp),%ebp leal 1859775393(%rdx,%r12,1),%r12d xorl %edi,%eax addl %ecx,%r12d roll $30,%esi addl %eax,%r12d roll $1,%ebp xorl 28(%rsp),%r14d movl %r13d,%eax movl %ebp,24(%rsp) movl %r12d,%ecx xorl 36(%rsp),%r14d xorl %edi,%eax roll $5,%ecx xorl 60(%rsp),%r14d leal 1859775393(%rbp,%r11,1),%r11d xorl %esi,%eax addl %ecx,%r11d roll $30,%r13d addl %eax,%r11d roll $1,%r14d xorl 32(%rsp),%edx movl %r12d,%eax movl %r14d,28(%rsp) movl %r11d,%ecx xorl 40(%rsp),%edx xorl %esi,%eax roll $5,%ecx xorl 0(%rsp),%edx leal 1859775393(%r14,%rdi,1),%edi xorl %r13d,%eax addl %ecx,%edi roll $30,%r12d addl %eax,%edi roll $1,%edx xorl 36(%rsp),%ebp movl %r11d,%eax movl %edx,32(%rsp) movl %edi,%ecx xorl 44(%rsp),%ebp xorl %r13d,%eax roll $5,%ecx xorl 4(%rsp),%ebp leal 1859775393(%rdx,%rsi,1),%esi xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi roll $1,%ebp xorl 40(%rsp),%r14d movl %edi,%eax movl %ebp,36(%rsp) movl %esi,%ecx xorl 48(%rsp),%r14d xorl %r12d,%eax roll $5,%ecx xorl 8(%rsp),%r14d leal 1859775393(%rbp,%r13,1),%r13d xorl %r11d,%eax addl %ecx,%r13d roll $30,%edi addl %eax,%r13d roll $1,%r14d xorl 
44(%rsp),%edx movl %esi,%eax movl %r14d,40(%rsp) movl %r13d,%ecx xorl 52(%rsp),%edx xorl %r11d,%eax roll $5,%ecx xorl 12(%rsp),%edx leal 1859775393(%r14,%r12,1),%r12d xorl %edi,%eax addl %ecx,%r12d roll $30,%esi addl %eax,%r12d roll $1,%edx xorl 48(%rsp),%ebp movl %r13d,%eax movl %edx,44(%rsp) movl %r12d,%ecx xorl 56(%rsp),%ebp xorl %edi,%eax roll $5,%ecx xorl 16(%rsp),%ebp leal 1859775393(%rdx,%r11,1),%r11d xorl %esi,%eax addl %ecx,%r11d roll $30,%r13d addl %eax,%r11d roll $1,%ebp xorl 52(%rsp),%r14d movl %r12d,%eax movl %ebp,48(%rsp) movl %r11d,%ecx xorl 60(%rsp),%r14d xorl %esi,%eax roll $5,%ecx xorl 20(%rsp),%r14d leal 1859775393(%rbp,%rdi,1),%edi xorl %r13d,%eax addl %ecx,%edi roll $30,%r12d addl %eax,%edi roll $1,%r14d xorl 56(%rsp),%edx movl %r11d,%eax movl %r14d,52(%rsp) movl %edi,%ecx xorl 0(%rsp),%edx xorl %r13d,%eax roll $5,%ecx xorl 24(%rsp),%edx leal 1859775393(%r14,%rsi,1),%esi xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi roll $1,%edx xorl 60(%rsp),%ebp movl %edi,%eax movl %edx,56(%rsp) movl %esi,%ecx xorl 4(%rsp),%ebp xorl %r12d,%eax roll $5,%ecx xorl 28(%rsp),%ebp leal 1859775393(%rdx,%r13,1),%r13d xorl %r11d,%eax addl %ecx,%r13d roll $30,%edi addl %eax,%r13d roll $1,%ebp xorl 0(%rsp),%r14d movl %esi,%eax movl %ebp,60(%rsp) movl %r13d,%ecx xorl 8(%rsp),%r14d xorl %r11d,%eax roll $5,%ecx xorl 32(%rsp),%r14d leal 1859775393(%rbp,%r12,1),%r12d xorl %edi,%eax addl %ecx,%r12d roll $30,%esi addl %eax,%r12d roll $1,%r14d xorl 4(%rsp),%edx movl %r13d,%eax movl %r14d,0(%rsp) movl %r12d,%ecx xorl 12(%rsp),%edx xorl %edi,%eax roll $5,%ecx xorl 36(%rsp),%edx leal 1859775393(%r14,%r11,1),%r11d xorl %esi,%eax addl %ecx,%r11d roll $30,%r13d addl %eax,%r11d roll $1,%edx xorl 8(%rsp),%ebp movl %r12d,%eax movl %edx,4(%rsp) movl %r11d,%ecx xorl 16(%rsp),%ebp xorl %esi,%eax roll $5,%ecx xorl 40(%rsp),%ebp leal 1859775393(%rdx,%rdi,1),%edi xorl %r13d,%eax addl %ecx,%edi roll $30,%r12d addl %eax,%edi roll $1,%ebp xorl 12(%rsp),%r14d movl %r11d,%eax movl %ebp,8(%rsp) movl %edi,%ecx xorl 20(%rsp),%r14d xorl %r13d,%eax roll $5,%ecx xorl 44(%rsp),%r14d leal 1859775393(%rbp,%rsi,1),%esi xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi roll $1,%r14d xorl 16(%rsp),%edx movl %edi,%eax movl %r14d,12(%rsp) movl %esi,%ecx xorl 24(%rsp),%edx xorl %r12d,%eax roll $5,%ecx xorl 48(%rsp),%edx leal 1859775393(%r14,%r13,1),%r13d xorl %r11d,%eax addl %ecx,%r13d roll $30,%edi addl %eax,%r13d roll $1,%edx xorl 20(%rsp),%ebp movl %esi,%eax movl %edx,16(%rsp) movl %r13d,%ecx xorl 28(%rsp),%ebp xorl %r11d,%eax roll $5,%ecx xorl 52(%rsp),%ebp leal 1859775393(%rdx,%r12,1),%r12d xorl %edi,%eax addl %ecx,%r12d roll $30,%esi addl %eax,%r12d roll $1,%ebp xorl 24(%rsp),%r14d movl %r13d,%eax movl %ebp,20(%rsp) movl %r12d,%ecx xorl 32(%rsp),%r14d xorl %edi,%eax roll $5,%ecx xorl 56(%rsp),%r14d leal 1859775393(%rbp,%r11,1),%r11d xorl %esi,%eax addl %ecx,%r11d roll $30,%r13d addl %eax,%r11d roll $1,%r14d xorl 28(%rsp),%edx movl %r12d,%eax movl %r14d,24(%rsp) movl %r11d,%ecx xorl 36(%rsp),%edx xorl %esi,%eax roll $5,%ecx xorl 60(%rsp),%edx leal 1859775393(%r14,%rdi,1),%edi xorl %r13d,%eax addl %ecx,%edi roll $30,%r12d addl %eax,%edi roll $1,%edx xorl 32(%rsp),%ebp movl %r11d,%eax movl %edx,28(%rsp) movl %edi,%ecx xorl 40(%rsp),%ebp xorl %r13d,%eax roll $5,%ecx xorl 0(%rsp),%ebp leal 1859775393(%rdx,%rsi,1),%esi xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi roll $1,%ebp xorl 36(%rsp),%r14d movl %r12d,%eax movl %ebp,32(%rsp) movl %r12d,%ebx xorl 44(%rsp),%r14d andl %r11d,%eax movl %esi,%ecx 
xorl 4(%rsp),%r14d leal -1894007588(%rbp,%r13,1),%r13d xorl %r11d,%ebx roll $5,%ecx addl %eax,%r13d roll $1,%r14d andl %edi,%ebx addl %ecx,%r13d roll $30,%edi addl %ebx,%r13d xorl 40(%rsp),%edx movl %r11d,%eax movl %r14d,36(%rsp) movl %r11d,%ebx xorl 48(%rsp),%edx andl %edi,%eax movl %r13d,%ecx xorl 8(%rsp),%edx leal -1894007588(%r14,%r12,1),%r12d xorl %edi,%ebx roll $5,%ecx addl %eax,%r12d roll $1,%edx andl %esi,%ebx addl %ecx,%r12d roll $30,%esi addl %ebx,%r12d xorl 44(%rsp),%ebp movl %edi,%eax movl %edx,40(%rsp) movl %edi,%ebx xorl 52(%rsp),%ebp andl %esi,%eax movl %r12d,%ecx xorl 12(%rsp),%ebp leal -1894007588(%rdx,%r11,1),%r11d xorl %esi,%ebx roll $5,%ecx addl %eax,%r11d roll $1,%ebp andl %r13d,%ebx addl %ecx,%r11d roll $30,%r13d addl %ebx,%r11d xorl 48(%rsp),%r14d movl %esi,%eax movl %ebp,44(%rsp) movl %esi,%ebx xorl 56(%rsp),%r14d andl %r13d,%eax movl %r11d,%ecx xorl 16(%rsp),%r14d leal -1894007588(%rbp,%rdi,1),%edi xorl %r13d,%ebx roll $5,%ecx addl %eax,%edi roll $1,%r14d andl %r12d,%ebx addl %ecx,%edi roll $30,%r12d addl %ebx,%edi xorl 52(%rsp),%edx movl %r13d,%eax movl %r14d,48(%rsp) movl %r13d,%ebx xorl 60(%rsp),%edx andl %r12d,%eax movl %edi,%ecx xorl 20(%rsp),%edx leal -1894007588(%r14,%rsi,1),%esi xorl %r12d,%ebx roll $5,%ecx addl %eax,%esi roll $1,%edx andl %r11d,%ebx addl %ecx,%esi roll $30,%r11d addl %ebx,%esi xorl 56(%rsp),%ebp movl %r12d,%eax movl %edx,52(%rsp) movl %r12d,%ebx xorl 0(%rsp),%ebp andl %r11d,%eax movl %esi,%ecx xorl 24(%rsp),%ebp leal -1894007588(%rdx,%r13,1),%r13d xorl %r11d,%ebx roll $5,%ecx addl %eax,%r13d roll $1,%ebp andl %edi,%ebx addl %ecx,%r13d roll $30,%edi addl %ebx,%r13d xorl 60(%rsp),%r14d movl %r11d,%eax movl %ebp,56(%rsp) movl %r11d,%ebx xorl 4(%rsp),%r14d andl %edi,%eax movl %r13d,%ecx xorl 28(%rsp),%r14d leal -1894007588(%rbp,%r12,1),%r12d xorl %edi,%ebx roll $5,%ecx addl %eax,%r12d roll $1,%r14d andl %esi,%ebx addl %ecx,%r12d roll $30,%esi addl %ebx,%r12d xorl 0(%rsp),%edx movl %edi,%eax movl %r14d,60(%rsp) movl %edi,%ebx xorl 8(%rsp),%edx andl %esi,%eax movl %r12d,%ecx xorl 32(%rsp),%edx leal -1894007588(%r14,%r11,1),%r11d xorl %esi,%ebx roll $5,%ecx addl %eax,%r11d roll $1,%edx andl %r13d,%ebx addl %ecx,%r11d roll $30,%r13d addl %ebx,%r11d xorl 4(%rsp),%ebp movl %esi,%eax movl %edx,0(%rsp) movl %esi,%ebx xorl 12(%rsp),%ebp andl %r13d,%eax movl %r11d,%ecx xorl 36(%rsp),%ebp leal -1894007588(%rdx,%rdi,1),%edi xorl %r13d,%ebx roll $5,%ecx addl %eax,%edi roll $1,%ebp andl %r12d,%ebx addl %ecx,%edi roll $30,%r12d addl %ebx,%edi xorl 8(%rsp),%r14d movl %r13d,%eax movl %ebp,4(%rsp) movl %r13d,%ebx xorl 16(%rsp),%r14d andl %r12d,%eax movl %edi,%ecx xorl 40(%rsp),%r14d leal -1894007588(%rbp,%rsi,1),%esi xorl %r12d,%ebx roll $5,%ecx addl %eax,%esi roll $1,%r14d andl %r11d,%ebx addl %ecx,%esi roll $30,%r11d addl %ebx,%esi xorl 12(%rsp),%edx movl %r12d,%eax movl %r14d,8(%rsp) movl %r12d,%ebx xorl 20(%rsp),%edx andl %r11d,%eax movl %esi,%ecx xorl 44(%rsp),%edx leal -1894007588(%r14,%r13,1),%r13d xorl %r11d,%ebx roll $5,%ecx addl %eax,%r13d roll $1,%edx andl %edi,%ebx addl %ecx,%r13d roll $30,%edi addl %ebx,%r13d xorl 16(%rsp),%ebp movl %r11d,%eax movl %edx,12(%rsp) movl %r11d,%ebx xorl 24(%rsp),%ebp andl %edi,%eax movl %r13d,%ecx xorl 48(%rsp),%ebp leal -1894007588(%rdx,%r12,1),%r12d xorl %edi,%ebx roll $5,%ecx addl %eax,%r12d roll $1,%ebp andl %esi,%ebx addl %ecx,%r12d roll $30,%esi addl %ebx,%r12d xorl 20(%rsp),%r14d movl %edi,%eax movl %ebp,16(%rsp) movl %edi,%ebx xorl 28(%rsp),%r14d andl %esi,%eax movl %r12d,%ecx xorl 52(%rsp),%r14d leal 
-1894007588(%rbp,%r11,1),%r11d xorl %esi,%ebx roll $5,%ecx addl %eax,%r11d roll $1,%r14d andl %r13d,%ebx addl %ecx,%r11d roll $30,%r13d addl %ebx,%r11d xorl 24(%rsp),%edx movl %esi,%eax movl %r14d,20(%rsp) movl %esi,%ebx xorl 32(%rsp),%edx andl %r13d,%eax movl %r11d,%ecx xorl 56(%rsp),%edx leal -1894007588(%r14,%rdi,1),%edi xorl %r13d,%ebx roll $5,%ecx addl %eax,%edi roll $1,%edx andl %r12d,%ebx addl %ecx,%edi roll $30,%r12d addl %ebx,%edi xorl 28(%rsp),%ebp movl %r13d,%eax movl %edx,24(%rsp) movl %r13d,%ebx xorl 36(%rsp),%ebp andl %r12d,%eax movl %edi,%ecx xorl 60(%rsp),%ebp leal -1894007588(%rdx,%rsi,1),%esi xorl %r12d,%ebx roll $5,%ecx addl %eax,%esi roll $1,%ebp andl %r11d,%ebx addl %ecx,%esi roll $30,%r11d addl %ebx,%esi xorl 32(%rsp),%r14d movl %r12d,%eax movl %ebp,28(%rsp) movl %r12d,%ebx xorl 40(%rsp),%r14d andl %r11d,%eax movl %esi,%ecx xorl 0(%rsp),%r14d leal -1894007588(%rbp,%r13,1),%r13d xorl %r11d,%ebx roll $5,%ecx addl %eax,%r13d roll $1,%r14d andl %edi,%ebx addl %ecx,%r13d roll $30,%edi addl %ebx,%r13d xorl 36(%rsp),%edx movl %r11d,%eax movl %r14d,32(%rsp) movl %r11d,%ebx xorl 44(%rsp),%edx andl %edi,%eax movl %r13d,%ecx xorl 4(%rsp),%edx leal -1894007588(%r14,%r12,1),%r12d xorl %edi,%ebx roll $5,%ecx addl %eax,%r12d roll $1,%edx andl %esi,%ebx addl %ecx,%r12d roll $30,%esi addl %ebx,%r12d xorl 40(%rsp),%ebp movl %edi,%eax movl %edx,36(%rsp) movl %edi,%ebx xorl 48(%rsp),%ebp andl %esi,%eax movl %r12d,%ecx xorl 8(%rsp),%ebp leal -1894007588(%rdx,%r11,1),%r11d xorl %esi,%ebx roll $5,%ecx addl %eax,%r11d roll $1,%ebp andl %r13d,%ebx addl %ecx,%r11d roll $30,%r13d addl %ebx,%r11d xorl 44(%rsp),%r14d movl %esi,%eax movl %ebp,40(%rsp) movl %esi,%ebx xorl 52(%rsp),%r14d andl %r13d,%eax movl %r11d,%ecx xorl 12(%rsp),%r14d leal -1894007588(%rbp,%rdi,1),%edi xorl %r13d,%ebx roll $5,%ecx addl %eax,%edi roll $1,%r14d andl %r12d,%ebx addl %ecx,%edi roll $30,%r12d addl %ebx,%edi xorl 48(%rsp),%edx movl %r13d,%eax movl %r14d,44(%rsp) movl %r13d,%ebx xorl 56(%rsp),%edx andl %r12d,%eax movl %edi,%ecx xorl 16(%rsp),%edx leal -1894007588(%r14,%rsi,1),%esi xorl %r12d,%ebx roll $5,%ecx addl %eax,%esi roll $1,%edx andl %r11d,%ebx addl %ecx,%esi roll $30,%r11d addl %ebx,%esi xorl 52(%rsp),%ebp movl %edi,%eax movl %edx,48(%rsp) movl %esi,%ecx xorl 60(%rsp),%ebp xorl %r12d,%eax roll $5,%ecx xorl 20(%rsp),%ebp leal -899497514(%rdx,%r13,1),%r13d xorl %r11d,%eax addl %ecx,%r13d roll $30,%edi addl %eax,%r13d roll $1,%ebp xorl 56(%rsp),%r14d movl %esi,%eax movl %ebp,52(%rsp) movl %r13d,%ecx xorl 0(%rsp),%r14d xorl %r11d,%eax roll $5,%ecx xorl 24(%rsp),%r14d leal -899497514(%rbp,%r12,1),%r12d xorl %edi,%eax addl %ecx,%r12d roll $30,%esi addl %eax,%r12d roll $1,%r14d xorl 60(%rsp),%edx movl %r13d,%eax movl %r14d,56(%rsp) movl %r12d,%ecx xorl 4(%rsp),%edx xorl %edi,%eax roll $5,%ecx xorl 28(%rsp),%edx leal -899497514(%r14,%r11,1),%r11d xorl %esi,%eax addl %ecx,%r11d roll $30,%r13d addl %eax,%r11d roll $1,%edx xorl 0(%rsp),%ebp movl %r12d,%eax movl %edx,60(%rsp) movl %r11d,%ecx xorl 8(%rsp),%ebp xorl %esi,%eax roll $5,%ecx xorl 32(%rsp),%ebp leal -899497514(%rdx,%rdi,1),%edi xorl %r13d,%eax addl %ecx,%edi roll $30,%r12d addl %eax,%edi roll $1,%ebp xorl 4(%rsp),%r14d movl %r11d,%eax movl %ebp,0(%rsp) movl %edi,%ecx xorl 12(%rsp),%r14d xorl %r13d,%eax roll $5,%ecx xorl 36(%rsp),%r14d leal -899497514(%rbp,%rsi,1),%esi xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi roll $1,%r14d xorl 8(%rsp),%edx movl %edi,%eax movl %r14d,4(%rsp) movl %esi,%ecx xorl 16(%rsp),%edx xorl %r12d,%eax roll $5,%ecx xorl 
40(%rsp),%edx leal -899497514(%r14,%r13,1),%r13d xorl %r11d,%eax addl %ecx,%r13d roll $30,%edi addl %eax,%r13d roll $1,%edx xorl 12(%rsp),%ebp movl %esi,%eax movl %edx,8(%rsp) movl %r13d,%ecx xorl 20(%rsp),%ebp xorl %r11d,%eax roll $5,%ecx xorl 44(%rsp),%ebp leal -899497514(%rdx,%r12,1),%r12d xorl %edi,%eax addl %ecx,%r12d roll $30,%esi addl %eax,%r12d roll $1,%ebp xorl 16(%rsp),%r14d movl %r13d,%eax movl %ebp,12(%rsp) movl %r12d,%ecx xorl 24(%rsp),%r14d xorl %edi,%eax roll $5,%ecx xorl 48(%rsp),%r14d leal -899497514(%rbp,%r11,1),%r11d xorl %esi,%eax addl %ecx,%r11d roll $30,%r13d addl %eax,%r11d roll $1,%r14d xorl 20(%rsp),%edx movl %r12d,%eax movl %r14d,16(%rsp) movl %r11d,%ecx xorl 28(%rsp),%edx xorl %esi,%eax roll $5,%ecx xorl 52(%rsp),%edx leal -899497514(%r14,%rdi,1),%edi xorl %r13d,%eax addl %ecx,%edi roll $30,%r12d addl %eax,%edi roll $1,%edx xorl 24(%rsp),%ebp movl %r11d,%eax movl %edx,20(%rsp) movl %edi,%ecx xorl 32(%rsp),%ebp xorl %r13d,%eax roll $5,%ecx xorl 56(%rsp),%ebp leal -899497514(%rdx,%rsi,1),%esi xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi roll $1,%ebp xorl 28(%rsp),%r14d movl %edi,%eax movl %ebp,24(%rsp) movl %esi,%ecx xorl 36(%rsp),%r14d xorl %r12d,%eax roll $5,%ecx xorl 60(%rsp),%r14d leal -899497514(%rbp,%r13,1),%r13d xorl %r11d,%eax addl %ecx,%r13d roll $30,%edi addl %eax,%r13d roll $1,%r14d xorl 32(%rsp),%edx movl %esi,%eax movl %r14d,28(%rsp) movl %r13d,%ecx xorl 40(%rsp),%edx xorl %r11d,%eax roll $5,%ecx xorl 0(%rsp),%edx leal -899497514(%r14,%r12,1),%r12d xorl %edi,%eax addl %ecx,%r12d roll $30,%esi addl %eax,%r12d roll $1,%edx xorl 36(%rsp),%ebp movl %r13d,%eax movl %r12d,%ecx xorl 44(%rsp),%ebp xorl %edi,%eax roll $5,%ecx xorl 4(%rsp),%ebp leal -899497514(%rdx,%r11,1),%r11d xorl %esi,%eax addl %ecx,%r11d roll $30,%r13d addl %eax,%r11d roll $1,%ebp xorl 40(%rsp),%r14d movl %r12d,%eax movl %r11d,%ecx xorl 48(%rsp),%r14d xorl %esi,%eax roll $5,%ecx xorl 8(%rsp),%r14d leal -899497514(%rbp,%rdi,1),%edi xorl %r13d,%eax addl %ecx,%edi roll $30,%r12d addl %eax,%edi roll $1,%r14d xorl 44(%rsp),%edx movl %r11d,%eax movl %edi,%ecx xorl 52(%rsp),%edx xorl %r13d,%eax roll $5,%ecx xorl 12(%rsp),%edx leal -899497514(%r14,%rsi,1),%esi xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi roll $1,%edx xorl 48(%rsp),%ebp movl %edi,%eax movl %esi,%ecx xorl 56(%rsp),%ebp xorl %r12d,%eax roll $5,%ecx xorl 16(%rsp),%ebp leal -899497514(%rdx,%r13,1),%r13d xorl %r11d,%eax addl %ecx,%r13d roll $30,%edi addl %eax,%r13d roll $1,%ebp xorl 52(%rsp),%r14d movl %esi,%eax movl %r13d,%ecx xorl 60(%rsp),%r14d xorl %r11d,%eax roll $5,%ecx xorl 20(%rsp),%r14d leal -899497514(%rbp,%r12,1),%r12d xorl %edi,%eax addl %ecx,%r12d roll $30,%esi addl %eax,%r12d roll $1,%r14d xorl 56(%rsp),%edx movl %r13d,%eax movl %r12d,%ecx xorl 0(%rsp),%edx xorl %edi,%eax roll $5,%ecx xorl 24(%rsp),%edx leal -899497514(%r14,%r11,1),%r11d xorl %esi,%eax addl %ecx,%r11d roll $30,%r13d addl %eax,%r11d roll $1,%edx xorl 60(%rsp),%ebp movl %r12d,%eax movl %r11d,%ecx xorl 4(%rsp),%ebp xorl %esi,%eax roll $5,%ecx xorl 28(%rsp),%ebp leal -899497514(%rdx,%rdi,1),%edi xorl %r13d,%eax addl %ecx,%edi roll $30,%r12d addl %eax,%edi roll $1,%ebp movl %r11d,%eax movl %edi,%ecx xorl %r13d,%eax leal -899497514(%rbp,%rsi,1),%esi roll $5,%ecx xorl %r12d,%eax addl %ecx,%esi roll $30,%r11d addl %eax,%esi addl 0(%r8),%esi addl 4(%r8),%edi addl 8(%r8),%r11d addl 12(%r8),%r12d addl 16(%r8),%r13d movl %esi,0(%r8) movl %edi,4(%r8) movl %r11d,8(%r8) movl %r12d,12(%r8) movl %r13d,16(%r8) subq $1,%r10 leaq 64(%r9),%r9 jnz 
.Lloop movq 64(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 .cfi_endproc .size sha1_block_data_order,.-sha1_block_data_order .type sha1_block_data_order_shaext,@function .align 32 sha1_block_data_order_shaext: _shaext_shortcut: .cfi_startproc movdqu (%rdi),%xmm0 movd 16(%rdi),%xmm1 movdqa K_XX_XX+160(%rip),%xmm3 movdqu (%rsi),%xmm4 pshufd $27,%xmm0,%xmm0 movdqu 16(%rsi),%xmm5 pshufd $27,%xmm1,%xmm1 movdqu 32(%rsi),%xmm6 .byte 102,15,56,0,227 movdqu 48(%rsi),%xmm7 .byte 102,15,56,0,235 .byte 102,15,56,0,243 movdqa %xmm1,%xmm9 .byte 102,15,56,0,251 jmp .Loop_shaext .align 16 .Loop_shaext: decq %rdx leaq 64(%rsi),%r8 paddd %xmm4,%xmm1 cmovneq %r8,%rsi movdqa %xmm0,%xmm8 .byte 15,56,201,229 movdqa %xmm0,%xmm2 .byte 15,58,204,193,0 .byte 15,56,200,213 pxor %xmm6,%xmm4 .byte 15,56,201,238 .byte 15,56,202,231 movdqa %xmm0,%xmm1 .byte 15,58,204,194,0 .byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 movdqa %xmm0,%xmm2 .byte 15,58,204,193,0 .byte 15,56,200,215 pxor %xmm4,%xmm6 .byte 15,56,201,252 .byte 15,56,202,245 movdqa %xmm0,%xmm1 .byte 15,58,204,194,0 .byte 15,56,200,204 pxor %xmm5,%xmm7 .byte 15,56,202,254 .byte 15,56,201,229 movdqa %xmm0,%xmm2 .byte 15,58,204,193,0 .byte 15,56,200,213 pxor %xmm6,%xmm4 .byte 15,56,201,238 .byte 15,56,202,231 movdqa %xmm0,%xmm1 .byte 15,58,204,194,1 .byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 movdqa %xmm0,%xmm2 .byte 15,58,204,193,1 .byte 15,56,200,215 pxor %xmm4,%xmm6 .byte 15,56,201,252 .byte 15,56,202,245 movdqa %xmm0,%xmm1 .byte 15,58,204,194,1 .byte 15,56,200,204 pxor %xmm5,%xmm7 .byte 15,56,202,254 .byte 15,56,201,229 movdqa %xmm0,%xmm2 .byte 15,58,204,193,1 .byte 15,56,200,213 pxor %xmm6,%xmm4 .byte 15,56,201,238 .byte 15,56,202,231 movdqa %xmm0,%xmm1 .byte 15,58,204,194,1 .byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 movdqa %xmm0,%xmm2 .byte 15,58,204,193,2 .byte 15,56,200,215 pxor %xmm4,%xmm6 .byte 15,56,201,252 .byte 15,56,202,245 movdqa %xmm0,%xmm1 .byte 15,58,204,194,2 .byte 15,56,200,204 pxor %xmm5,%xmm7 .byte 15,56,202,254 .byte 15,56,201,229 movdqa %xmm0,%xmm2 .byte 15,58,204,193,2 .byte 15,56,200,213 pxor %xmm6,%xmm4 .byte 15,56,201,238 .byte 15,56,202,231 movdqa %xmm0,%xmm1 .byte 15,58,204,194,2 .byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 movdqa %xmm0,%xmm2 .byte 15,58,204,193,2 .byte 15,56,200,215 pxor %xmm4,%xmm6 .byte 15,56,201,252 .byte 15,56,202,245 movdqa %xmm0,%xmm1 .byte 15,58,204,194,3 .byte 15,56,200,204 pxor %xmm5,%xmm7 .byte 15,56,202,254 movdqu (%rsi),%xmm4 movdqa %xmm0,%xmm2 .byte 15,58,204,193,3 .byte 15,56,200,213 movdqu 16(%rsi),%xmm5 .byte 102,15,56,0,227 movdqa %xmm0,%xmm1 .byte 15,58,204,194,3 .byte 15,56,200,206 movdqu 32(%rsi),%xmm6 .byte 102,15,56,0,235 movdqa %xmm0,%xmm2 .byte 15,58,204,193,3 .byte 15,56,200,215 movdqu 48(%rsi),%xmm7 .byte 102,15,56,0,243 movdqa %xmm0,%xmm1 .byte 15,58,204,194,3 .byte 65,15,56,200,201 .byte 102,15,56,0,251 paddd %xmm8,%xmm0 movdqa %xmm1,%xmm9 jnz .Loop_shaext pshufd $27,%xmm0,%xmm0 pshufd $27,%xmm1,%xmm1 movdqu %xmm0,(%rdi) movd %xmm1,16(%rdi) .byte 0xf3,0xc3 .cfi_endproc .size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext .type sha1_block_data_order_ssse3,@function .align 16 sha1_block_data_order_ssse3: 
_ssse3_shortcut: .cfi_startproc movq %rsp,%r11 .cfi_def_cfa_register %r11 pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 leaq -64(%rsp),%rsp andq $-64,%rsp movq %rdi,%r8 movq %rsi,%r9 movq %rdx,%r10 shlq $6,%r10 addq %r9,%r10 leaq K_XX_XX+64(%rip),%r14 movl 0(%r8),%eax movl 4(%r8),%ebx movl 8(%r8),%ecx movl 12(%r8),%edx movl %ebx,%esi movl 16(%r8),%ebp movl %ecx,%edi xorl %edx,%edi andl %edi,%esi movdqa 64(%r14),%xmm6 movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 movdqu 48(%r9),%xmm3 .byte 102,15,56,0,198 .byte 102,15,56,0,206 .byte 102,15,56,0,214 addq $64,%r9 paddd %xmm9,%xmm0 .byte 102,15,56,0,222 paddd %xmm9,%xmm1 paddd %xmm9,%xmm2 movdqa %xmm0,0(%rsp) psubd %xmm9,%xmm0 movdqa %xmm1,16(%rsp) psubd %xmm9,%xmm1 movdqa %xmm2,32(%rsp) psubd %xmm9,%xmm2 jmp .Loop_ssse3 .align 16 .Loop_ssse3: rorl $2,%ebx pshufd $238,%xmm0,%xmm4 xorl %edx,%esi movdqa %xmm3,%xmm8 paddd %xmm3,%xmm9 movl %eax,%edi addl 0(%rsp),%ebp punpcklqdq %xmm1,%xmm4 xorl %ecx,%ebx roll $5,%eax addl %esi,%ebp psrldq $4,%xmm8 andl %ebx,%edi xorl %ecx,%ebx pxor %xmm0,%xmm4 addl %eax,%ebp rorl $7,%eax pxor %xmm2,%xmm8 xorl %ecx,%edi movl %ebp,%esi addl 4(%rsp),%edx pxor %xmm8,%xmm4 xorl %ebx,%eax roll $5,%ebp movdqa %xmm9,48(%rsp) addl %edi,%edx andl %eax,%esi movdqa %xmm4,%xmm10 xorl %ebx,%eax addl %ebp,%edx rorl $7,%ebp movdqa %xmm4,%xmm8 xorl %ebx,%esi pslldq $12,%xmm10 paddd %xmm4,%xmm4 movl %edx,%edi addl 8(%rsp),%ecx psrld $31,%xmm8 xorl %eax,%ebp roll $5,%edx addl %esi,%ecx movdqa %xmm10,%xmm9 andl %ebp,%edi xorl %eax,%ebp psrld $30,%xmm10 addl %edx,%ecx rorl $7,%edx por %xmm8,%xmm4 xorl %eax,%edi movl %ecx,%esi addl 12(%rsp),%ebx pslld $2,%xmm9 pxor %xmm10,%xmm4 xorl %ebp,%edx movdqa -64(%r14),%xmm10 roll $5,%ecx addl %edi,%ebx andl %edx,%esi pxor %xmm9,%xmm4 xorl %ebp,%edx addl %ecx,%ebx rorl $7,%ecx pshufd $238,%xmm1,%xmm5 xorl %ebp,%esi movdqa %xmm4,%xmm9 paddd %xmm4,%xmm10 movl %ebx,%edi addl 16(%rsp),%eax punpcklqdq %xmm2,%xmm5 xorl %edx,%ecx roll $5,%ebx addl %esi,%eax psrldq $4,%xmm9 andl %ecx,%edi xorl %edx,%ecx pxor %xmm1,%xmm5 addl %ebx,%eax rorl $7,%ebx pxor %xmm3,%xmm9 xorl %edx,%edi movl %eax,%esi addl 20(%rsp),%ebp pxor %xmm9,%xmm5 xorl %ecx,%ebx roll $5,%eax movdqa %xmm10,0(%rsp) addl %edi,%ebp andl %ebx,%esi movdqa %xmm5,%xmm8 xorl %ecx,%ebx addl %eax,%ebp rorl $7,%eax movdqa %xmm5,%xmm9 xorl %ecx,%esi pslldq $12,%xmm8 paddd %xmm5,%xmm5 movl %ebp,%edi addl 24(%rsp),%edx psrld $31,%xmm9 xorl %ebx,%eax roll $5,%ebp addl %esi,%edx movdqa %xmm8,%xmm10 andl %eax,%edi xorl %ebx,%eax psrld $30,%xmm8 addl %ebp,%edx rorl $7,%ebp por %xmm9,%xmm5 xorl %ebx,%edi movl %edx,%esi addl 28(%rsp),%ecx pslld $2,%xmm10 pxor %xmm8,%xmm5 xorl %eax,%ebp movdqa -32(%r14),%xmm8 roll $5,%edx addl %edi,%ecx andl %ebp,%esi pxor %xmm10,%xmm5 xorl %eax,%ebp addl %edx,%ecx rorl $7,%edx pshufd $238,%xmm2,%xmm6 xorl %eax,%esi movdqa %xmm5,%xmm10 paddd %xmm5,%xmm8 movl %ecx,%edi addl 32(%rsp),%ebx punpcklqdq %xmm3,%xmm6 xorl %ebp,%edx roll $5,%ecx addl %esi,%ebx psrldq $4,%xmm10 andl %edx,%edi xorl %ebp,%edx pxor %xmm2,%xmm6 addl %ecx,%ebx rorl $7,%ecx pxor %xmm4,%xmm10 xorl %ebp,%edi movl %ebx,%esi addl 36(%rsp),%eax pxor %xmm10,%xmm6 xorl %edx,%ecx roll $5,%ebx movdqa %xmm8,16(%rsp) addl %edi,%eax andl %ecx,%esi movdqa %xmm6,%xmm9 xorl %edx,%ecx addl %ebx,%eax rorl $7,%ebx movdqa %xmm6,%xmm10 xorl %edx,%esi pslldq $12,%xmm9 paddd %xmm6,%xmm6 movl %eax,%edi addl 40(%rsp),%ebp psrld 
$31,%xmm10 xorl %ecx,%ebx roll $5,%eax addl %esi,%ebp movdqa %xmm9,%xmm8 andl %ebx,%edi xorl %ecx,%ebx psrld $30,%xmm9 addl %eax,%ebp rorl $7,%eax por %xmm10,%xmm6 xorl %ecx,%edi movl %ebp,%esi addl 44(%rsp),%edx pslld $2,%xmm8 pxor %xmm9,%xmm6 xorl %ebx,%eax movdqa -32(%r14),%xmm9 roll $5,%ebp addl %edi,%edx andl %eax,%esi pxor %xmm8,%xmm6 xorl %ebx,%eax addl %ebp,%edx rorl $7,%ebp pshufd $238,%xmm3,%xmm7 xorl %ebx,%esi movdqa %xmm6,%xmm8 paddd %xmm6,%xmm9 movl %edx,%edi addl 48(%rsp),%ecx punpcklqdq %xmm4,%xmm7 xorl %eax,%ebp roll $5,%edx addl %esi,%ecx psrldq $4,%xmm8 andl %ebp,%edi xorl %eax,%ebp pxor %xmm3,%xmm7 addl %edx,%ecx rorl $7,%edx pxor %xmm5,%xmm8 xorl %eax,%edi movl %ecx,%esi addl 52(%rsp),%ebx pxor %xmm8,%xmm7 xorl %ebp,%edx roll $5,%ecx movdqa %xmm9,32(%rsp) addl %edi,%ebx andl %edx,%esi movdqa %xmm7,%xmm10 xorl %ebp,%edx addl %ecx,%ebx rorl $7,%ecx movdqa %xmm7,%xmm8 xorl %ebp,%esi pslldq $12,%xmm10 paddd %xmm7,%xmm7 movl %ebx,%edi addl 56(%rsp),%eax psrld $31,%xmm8 xorl %edx,%ecx roll $5,%ebx addl %esi,%eax movdqa %xmm10,%xmm9 andl %ecx,%edi xorl %edx,%ecx psrld $30,%xmm10 addl %ebx,%eax rorl $7,%ebx por %xmm8,%xmm7 xorl %edx,%edi movl %eax,%esi addl 60(%rsp),%ebp pslld $2,%xmm9 pxor %xmm10,%xmm7 xorl %ecx,%ebx movdqa -32(%r14),%xmm10 roll $5,%eax addl %edi,%ebp andl %ebx,%esi pxor %xmm9,%xmm7 pshufd $238,%xmm6,%xmm9 xorl %ecx,%ebx addl %eax,%ebp rorl $7,%eax pxor %xmm4,%xmm0 xorl %ecx,%esi movl %ebp,%edi addl 0(%rsp),%edx punpcklqdq %xmm7,%xmm9 xorl %ebx,%eax roll $5,%ebp pxor %xmm1,%xmm0 addl %esi,%edx andl %eax,%edi movdqa %xmm10,%xmm8 xorl %ebx,%eax paddd %xmm7,%xmm10 addl %ebp,%edx pxor %xmm9,%xmm0 rorl $7,%ebp xorl %ebx,%edi movl %edx,%esi addl 4(%rsp),%ecx movdqa %xmm0,%xmm9 xorl %eax,%ebp roll $5,%edx movdqa %xmm10,48(%rsp) addl %edi,%ecx andl %ebp,%esi xorl %eax,%ebp pslld $2,%xmm0 addl %edx,%ecx rorl $7,%edx psrld $30,%xmm9 xorl %eax,%esi movl %ecx,%edi addl 8(%rsp),%ebx por %xmm9,%xmm0 xorl %ebp,%edx roll $5,%ecx pshufd $238,%xmm7,%xmm10 addl %esi,%ebx andl %edx,%edi xorl %ebp,%edx addl %ecx,%ebx addl 12(%rsp),%eax xorl %ebp,%edi movl %ebx,%esi roll $5,%ebx addl %edi,%eax xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax pxor %xmm5,%xmm1 addl 16(%rsp),%ebp xorl %ecx,%esi punpcklqdq %xmm0,%xmm10 movl %eax,%edi roll $5,%eax pxor %xmm2,%xmm1 addl %esi,%ebp xorl %ecx,%edi movdqa %xmm8,%xmm9 rorl $7,%ebx paddd %xmm0,%xmm8 addl %eax,%ebp pxor %xmm10,%xmm1 addl 20(%rsp),%edx xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp movdqa %xmm1,%xmm10 addl %edi,%edx xorl %ebx,%esi movdqa %xmm8,0(%rsp) rorl $7,%eax addl %ebp,%edx addl 24(%rsp),%ecx pslld $2,%xmm1 xorl %eax,%esi movl %edx,%edi psrld $30,%xmm10 roll $5,%edx addl %esi,%ecx xorl %eax,%edi rorl $7,%ebp por %xmm10,%xmm1 addl %edx,%ecx addl 28(%rsp),%ebx pshufd $238,%xmm0,%xmm8 xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx addl %ecx,%ebx pxor %xmm6,%xmm2 addl 32(%rsp),%eax xorl %edx,%esi punpcklqdq %xmm1,%xmm8 movl %ebx,%edi roll $5,%ebx pxor %xmm3,%xmm2 addl %esi,%eax xorl %edx,%edi movdqa 0(%r14),%xmm10 rorl $7,%ecx paddd %xmm1,%xmm9 addl %ebx,%eax pxor %xmm8,%xmm2 addl 36(%rsp),%ebp xorl %ecx,%edi movl %eax,%esi roll $5,%eax movdqa %xmm2,%xmm8 addl %edi,%ebp xorl %ecx,%esi movdqa %xmm9,16(%rsp) rorl $7,%ebx addl %eax,%ebp addl 40(%rsp),%edx pslld $2,%xmm2 xorl %ebx,%esi movl %ebp,%edi psrld $30,%xmm8 roll $5,%ebp addl %esi,%edx xorl %ebx,%edi rorl $7,%eax por %xmm8,%xmm2 addl %ebp,%edx addl 44(%rsp),%ecx pshufd $238,%xmm1,%xmm9 xorl %eax,%edi movl %edx,%esi roll $5,%edx addl %edi,%ecx 
xorl %eax,%esi rorl $7,%ebp addl %edx,%ecx pxor %xmm7,%xmm3 addl 48(%rsp),%ebx xorl %ebp,%esi punpcklqdq %xmm2,%xmm9 movl %ecx,%edi roll $5,%ecx pxor %xmm4,%xmm3 addl %esi,%ebx xorl %ebp,%edi movdqa %xmm10,%xmm8 rorl $7,%edx paddd %xmm2,%xmm10 addl %ecx,%ebx pxor %xmm9,%xmm3 addl 52(%rsp),%eax xorl %edx,%edi movl %ebx,%esi roll $5,%ebx movdqa %xmm3,%xmm9 addl %edi,%eax xorl %edx,%esi movdqa %xmm10,32(%rsp) rorl $7,%ecx addl %ebx,%eax addl 56(%rsp),%ebp pslld $2,%xmm3 xorl %ecx,%esi movl %eax,%edi psrld $30,%xmm9 roll $5,%eax addl %esi,%ebp xorl %ecx,%edi rorl $7,%ebx por %xmm9,%xmm3 addl %eax,%ebp addl 60(%rsp),%edx pshufd $238,%xmm2,%xmm10 xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp addl %edi,%edx xorl %ebx,%esi rorl $7,%eax addl %ebp,%edx pxor %xmm0,%xmm4 addl 0(%rsp),%ecx xorl %eax,%esi punpcklqdq %xmm3,%xmm10 movl %edx,%edi roll $5,%edx pxor %xmm5,%xmm4 addl %esi,%ecx xorl %eax,%edi movdqa %xmm8,%xmm9 rorl $7,%ebp paddd %xmm3,%xmm8 addl %edx,%ecx pxor %xmm10,%xmm4 addl 4(%rsp),%ebx xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx movdqa %xmm4,%xmm10 addl %edi,%ebx xorl %ebp,%esi movdqa %xmm8,48(%rsp) rorl $7,%edx addl %ecx,%ebx addl 8(%rsp),%eax pslld $2,%xmm4 xorl %edx,%esi movl %ebx,%edi psrld $30,%xmm10 roll $5,%ebx addl %esi,%eax xorl %edx,%edi rorl $7,%ecx por %xmm10,%xmm4 addl %ebx,%eax addl 12(%rsp),%ebp pshufd $238,%xmm3,%xmm8 xorl %ecx,%edi movl %eax,%esi roll $5,%eax addl %edi,%ebp xorl %ecx,%esi rorl $7,%ebx addl %eax,%ebp pxor %xmm1,%xmm5 addl 16(%rsp),%edx xorl %ebx,%esi punpcklqdq %xmm4,%xmm8 movl %ebp,%edi roll $5,%ebp pxor %xmm6,%xmm5 addl %esi,%edx xorl %ebx,%edi movdqa %xmm9,%xmm10 rorl $7,%eax paddd %xmm4,%xmm9 addl %ebp,%edx pxor %xmm8,%xmm5 addl 20(%rsp),%ecx xorl %eax,%edi movl %edx,%esi roll $5,%edx movdqa %xmm5,%xmm8 addl %edi,%ecx xorl %eax,%esi movdqa %xmm9,0(%rsp) rorl $7,%ebp addl %edx,%ecx addl 24(%rsp),%ebx pslld $2,%xmm5 xorl %ebp,%esi movl %ecx,%edi psrld $30,%xmm8 roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx por %xmm8,%xmm5 addl %ecx,%ebx addl 28(%rsp),%eax pshufd $238,%xmm4,%xmm9 rorl $7,%ecx movl %ebx,%esi xorl %edx,%edi roll $5,%ebx addl %edi,%eax xorl %ecx,%esi xorl %edx,%ecx addl %ebx,%eax pxor %xmm2,%xmm6 addl 32(%rsp),%ebp andl %ecx,%esi xorl %edx,%ecx rorl $7,%ebx punpcklqdq %xmm5,%xmm9 movl %eax,%edi xorl %ecx,%esi pxor %xmm7,%xmm6 roll $5,%eax addl %esi,%ebp movdqa %xmm10,%xmm8 xorl %ebx,%edi paddd %xmm5,%xmm10 xorl %ecx,%ebx pxor %xmm9,%xmm6 addl %eax,%ebp addl 36(%rsp),%edx andl %ebx,%edi xorl %ecx,%ebx rorl $7,%eax movdqa %xmm6,%xmm9 movl %ebp,%esi xorl %ebx,%edi movdqa %xmm10,16(%rsp) roll $5,%ebp addl %edi,%edx xorl %eax,%esi pslld $2,%xmm6 xorl %ebx,%eax addl %ebp,%edx psrld $30,%xmm9 addl 40(%rsp),%ecx andl %eax,%esi xorl %ebx,%eax por %xmm9,%xmm6 rorl $7,%ebp movl %edx,%edi xorl %eax,%esi roll $5,%edx pshufd $238,%xmm5,%xmm10 addl %esi,%ecx xorl %ebp,%edi xorl %eax,%ebp addl %edx,%ecx addl 44(%rsp),%ebx andl %ebp,%edi xorl %eax,%ebp rorl $7,%edx movl %ecx,%esi xorl %ebp,%edi roll $5,%ecx addl %edi,%ebx xorl %edx,%esi xorl %ebp,%edx addl %ecx,%ebx pxor %xmm3,%xmm7 addl 48(%rsp),%eax andl %edx,%esi xorl %ebp,%edx rorl $7,%ecx punpcklqdq %xmm6,%xmm10 movl %ebx,%edi xorl %edx,%esi pxor %xmm0,%xmm7 roll $5,%ebx addl %esi,%eax movdqa 32(%r14),%xmm9 xorl %ecx,%edi paddd %xmm6,%xmm8 xorl %edx,%ecx pxor %xmm10,%xmm7 addl %ebx,%eax addl 52(%rsp),%ebp andl %ecx,%edi xorl %edx,%ecx rorl $7,%ebx movdqa %xmm7,%xmm10 movl %eax,%esi xorl %ecx,%edi movdqa %xmm8,32(%rsp) roll $5,%eax addl %edi,%ebp xorl %ebx,%esi pslld $2,%xmm7 xorl %ecx,%ebx addl 
%eax,%ebp psrld $30,%xmm10 addl 56(%rsp),%edx andl %ebx,%esi xorl %ecx,%ebx por %xmm10,%xmm7 rorl $7,%eax movl %ebp,%edi xorl %ebx,%esi roll $5,%ebp pshufd $238,%xmm6,%xmm8 addl %esi,%edx xorl %eax,%edi xorl %ebx,%eax addl %ebp,%edx addl 60(%rsp),%ecx andl %eax,%edi xorl %ebx,%eax rorl $7,%ebp movl %edx,%esi xorl %eax,%edi roll $5,%edx addl %edi,%ecx xorl %ebp,%esi xorl %eax,%ebp addl %edx,%ecx pxor %xmm4,%xmm0 addl 0(%rsp),%ebx andl %ebp,%esi xorl %eax,%ebp rorl $7,%edx punpcklqdq %xmm7,%xmm8 movl %ecx,%edi xorl %ebp,%esi pxor %xmm1,%xmm0 roll $5,%ecx addl %esi,%ebx movdqa %xmm9,%xmm10 xorl %edx,%edi paddd %xmm7,%xmm9 xorl %ebp,%edx pxor %xmm8,%xmm0 addl %ecx,%ebx addl 4(%rsp),%eax andl %edx,%edi xorl %ebp,%edx rorl $7,%ecx movdqa %xmm0,%xmm8 movl %ebx,%esi xorl %edx,%edi movdqa %xmm9,48(%rsp) roll $5,%ebx addl %edi,%eax xorl %ecx,%esi pslld $2,%xmm0 xorl %edx,%ecx addl %ebx,%eax psrld $30,%xmm8 addl 8(%rsp),%ebp andl %ecx,%esi xorl %edx,%ecx por %xmm8,%xmm0 rorl $7,%ebx movl %eax,%edi xorl %ecx,%esi roll $5,%eax pshufd $238,%xmm7,%xmm9 addl %esi,%ebp xorl %ebx,%edi xorl %ecx,%ebx addl %eax,%ebp addl 12(%rsp),%edx andl %ebx,%edi xorl %ecx,%ebx rorl $7,%eax movl %ebp,%esi xorl %ebx,%edi roll $5,%ebp addl %edi,%edx xorl %eax,%esi xorl %ebx,%eax addl %ebp,%edx pxor %xmm5,%xmm1 addl 16(%rsp),%ecx andl %eax,%esi xorl %ebx,%eax rorl $7,%ebp punpcklqdq %xmm0,%xmm9 movl %edx,%edi xorl %eax,%esi pxor %xmm2,%xmm1 roll $5,%edx addl %esi,%ecx movdqa %xmm10,%xmm8 xorl %ebp,%edi paddd %xmm0,%xmm10 xorl %eax,%ebp pxor %xmm9,%xmm1 addl %edx,%ecx addl 20(%rsp),%ebx andl %ebp,%edi xorl %eax,%ebp rorl $7,%edx movdqa %xmm1,%xmm9 movl %ecx,%esi xorl %ebp,%edi movdqa %xmm10,0(%rsp) roll $5,%ecx addl %edi,%ebx xorl %edx,%esi pslld $2,%xmm1 xorl %ebp,%edx addl %ecx,%ebx psrld $30,%xmm9 addl 24(%rsp),%eax andl %edx,%esi xorl %ebp,%edx por %xmm9,%xmm1 rorl $7,%ecx movl %ebx,%edi xorl %edx,%esi roll $5,%ebx pshufd $238,%xmm0,%xmm10 addl %esi,%eax xorl %ecx,%edi xorl %edx,%ecx addl %ebx,%eax addl 28(%rsp),%ebp andl %ecx,%edi xorl %edx,%ecx rorl $7,%ebx movl %eax,%esi xorl %ecx,%edi roll $5,%eax addl %edi,%ebp xorl %ebx,%esi xorl %ecx,%ebx addl %eax,%ebp pxor %xmm6,%xmm2 addl 32(%rsp),%edx andl %ebx,%esi xorl %ecx,%ebx rorl $7,%eax punpcklqdq %xmm1,%xmm10 movl %ebp,%edi xorl %ebx,%esi pxor %xmm3,%xmm2 roll $5,%ebp addl %esi,%edx movdqa %xmm8,%xmm9 xorl %eax,%edi paddd %xmm1,%xmm8 xorl %ebx,%eax pxor %xmm10,%xmm2 addl %ebp,%edx addl 36(%rsp),%ecx andl %eax,%edi xorl %ebx,%eax rorl $7,%ebp movdqa %xmm2,%xmm10 movl %edx,%esi xorl %eax,%edi movdqa %xmm8,16(%rsp) roll $5,%edx addl %edi,%ecx xorl %ebp,%esi pslld $2,%xmm2 xorl %eax,%ebp addl %edx,%ecx psrld $30,%xmm10 addl 40(%rsp),%ebx andl %ebp,%esi xorl %eax,%ebp por %xmm10,%xmm2 rorl $7,%edx movl %ecx,%edi xorl %ebp,%esi roll $5,%ecx pshufd $238,%xmm1,%xmm8 addl %esi,%ebx xorl %edx,%edi xorl %ebp,%edx addl %ecx,%ebx addl 44(%rsp),%eax andl %edx,%edi xorl %ebp,%edx rorl $7,%ecx movl %ebx,%esi xorl %edx,%edi roll $5,%ebx addl %edi,%eax xorl %edx,%esi addl %ebx,%eax pxor %xmm7,%xmm3 addl 48(%rsp),%ebp xorl %ecx,%esi punpcklqdq %xmm2,%xmm8 movl %eax,%edi roll $5,%eax pxor %xmm4,%xmm3 addl %esi,%ebp xorl %ecx,%edi movdqa %xmm9,%xmm10 rorl $7,%ebx paddd %xmm2,%xmm9 addl %eax,%ebp pxor %xmm8,%xmm3 addl 52(%rsp),%edx xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp movdqa %xmm3,%xmm8 addl %edi,%edx xorl %ebx,%esi movdqa %xmm9,32(%rsp) rorl $7,%eax addl %ebp,%edx addl 56(%rsp),%ecx pslld $2,%xmm3 xorl %eax,%esi movl %edx,%edi psrld $30,%xmm8 roll $5,%edx addl %esi,%ecx xorl %eax,%edi 
rorl $7,%ebp por %xmm8,%xmm3 addl %edx,%ecx addl 60(%rsp),%ebx xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx addl %ecx,%ebx addl 0(%rsp),%eax xorl %edx,%esi movl %ebx,%edi roll $5,%ebx paddd %xmm3,%xmm10 addl %esi,%eax xorl %edx,%edi movdqa %xmm10,48(%rsp) rorl $7,%ecx addl %ebx,%eax addl 4(%rsp),%ebp xorl %ecx,%edi movl %eax,%esi roll $5,%eax addl %edi,%ebp xorl %ecx,%esi rorl $7,%ebx addl %eax,%ebp addl 8(%rsp),%edx xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp addl %esi,%edx xorl %ebx,%edi rorl $7,%eax addl %ebp,%edx addl 12(%rsp),%ecx xorl %eax,%edi movl %edx,%esi roll $5,%edx addl %edi,%ecx xorl %eax,%esi rorl $7,%ebp addl %edx,%ecx cmpq %r10,%r9 je .Ldone_ssse3 movdqa 64(%r14),%xmm6 movdqa -64(%r14),%xmm9 movdqu 0(%r9),%xmm0 movdqu 16(%r9),%xmm1 movdqu 32(%r9),%xmm2 movdqu 48(%r9),%xmm3 .byte 102,15,56,0,198 addq $64,%r9 addl 16(%rsp),%ebx xorl %ebp,%esi movl %ecx,%edi .byte 102,15,56,0,206 roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx paddd %xmm9,%xmm0 addl %ecx,%ebx addl 20(%rsp),%eax xorl %edx,%edi movl %ebx,%esi movdqa %xmm0,0(%rsp) roll $5,%ebx addl %edi,%eax xorl %edx,%esi rorl $7,%ecx psubd %xmm9,%xmm0 addl %ebx,%eax addl 24(%rsp),%ebp xorl %ecx,%esi movl %eax,%edi roll $5,%eax addl %esi,%ebp xorl %ecx,%edi rorl $7,%ebx addl %eax,%ebp addl 28(%rsp),%edx xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp addl %edi,%edx xorl %ebx,%esi rorl $7,%eax addl %ebp,%edx addl 32(%rsp),%ecx xorl %eax,%esi movl %edx,%edi .byte 102,15,56,0,214 roll $5,%edx addl %esi,%ecx xorl %eax,%edi rorl $7,%ebp paddd %xmm9,%xmm1 addl %edx,%ecx addl 36(%rsp),%ebx xorl %ebp,%edi movl %ecx,%esi movdqa %xmm1,16(%rsp) roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx psubd %xmm9,%xmm1 addl %ecx,%ebx addl 40(%rsp),%eax xorl %edx,%esi movl %ebx,%edi roll $5,%ebx addl %esi,%eax xorl %edx,%edi rorl $7,%ecx addl %ebx,%eax addl 44(%rsp),%ebp xorl %ecx,%edi movl %eax,%esi roll $5,%eax addl %edi,%ebp xorl %ecx,%esi rorl $7,%ebx addl %eax,%ebp addl 48(%rsp),%edx xorl %ebx,%esi movl %ebp,%edi .byte 102,15,56,0,222 roll $5,%ebp addl %esi,%edx xorl %ebx,%edi rorl $7,%eax paddd %xmm9,%xmm2 addl %ebp,%edx addl 52(%rsp),%ecx xorl %eax,%edi movl %edx,%esi movdqa %xmm2,32(%rsp) roll $5,%edx addl %edi,%ecx xorl %eax,%esi rorl $7,%ebp psubd %xmm9,%xmm2 addl %edx,%ecx addl 56(%rsp),%ebx xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx addl %ecx,%ebx addl 60(%rsp),%eax xorl %edx,%edi movl %ebx,%esi roll $5,%ebx addl %edi,%eax rorl $7,%ecx addl %ebx,%eax addl 0(%r8),%eax addl 4(%r8),%esi addl 8(%r8),%ecx addl 12(%r8),%edx movl %eax,0(%r8) addl 16(%r8),%ebp movl %esi,4(%r8) movl %esi,%ebx movl %ecx,8(%r8) movl %ecx,%edi movl %edx,12(%r8) xorl %edx,%edi movl %ebp,16(%r8) andl %edi,%esi jmp .Loop_ssse3 .align 16 .Ldone_ssse3: addl 16(%rsp),%ebx xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx addl %ecx,%ebx addl 20(%rsp),%eax xorl %edx,%edi movl %ebx,%esi roll $5,%ebx addl %edi,%eax xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax addl 24(%rsp),%ebp xorl %ecx,%esi movl %eax,%edi roll $5,%eax addl %esi,%ebp xorl %ecx,%edi rorl $7,%ebx addl %eax,%ebp addl 28(%rsp),%edx xorl %ebx,%edi movl %ebp,%esi roll $5,%ebp addl %edi,%edx xorl %ebx,%esi rorl $7,%eax addl %ebp,%edx addl 32(%rsp),%ecx xorl %eax,%esi movl %edx,%edi roll $5,%edx addl %esi,%ecx xorl %eax,%edi rorl $7,%ebp addl %edx,%ecx addl 36(%rsp),%ebx xorl %ebp,%edi movl %ecx,%esi roll $5,%ecx addl %edi,%ebx xorl %ebp,%esi rorl $7,%edx addl %ecx,%ebx addl 40(%rsp),%eax 
xorl %edx,%esi movl %ebx,%edi roll $5,%ebx addl %esi,%eax xorl %edx,%edi rorl $7,%ecx addl %ebx,%eax addl 44(%rsp),%ebp xorl %ecx,%edi movl %eax,%esi roll $5,%eax addl %edi,%ebp xorl %ecx,%esi rorl $7,%ebx addl %eax,%ebp addl 48(%rsp),%edx xorl %ebx,%esi movl %ebp,%edi roll $5,%ebp addl %esi,%edx xorl %ebx,%edi rorl $7,%eax addl %ebp,%edx addl 52(%rsp),%ecx xorl %eax,%edi movl %edx,%esi roll $5,%edx addl %edi,%ecx xorl %eax,%esi rorl $7,%ebp addl %edx,%ecx addl 56(%rsp),%ebx xorl %ebp,%esi movl %ecx,%edi roll $5,%ecx addl %esi,%ebx xorl %ebp,%edi rorl $7,%edx addl %ecx,%ebx addl 60(%rsp),%eax xorl %edx,%edi movl %ebx,%esi roll $5,%ebx addl %edi,%eax rorl $7,%ecx addl %ebx,%eax addl 0(%r8),%eax addl 4(%r8),%esi addl 8(%r8),%ecx movl %eax,0(%r8) addl 12(%r8),%edx movl %esi,4(%r8) addl 16(%r8),%ebp movl %ecx,8(%r8) movl %edx,12(%r8) movl %ebp,16(%r8) movq -40(%r11),%r14 .cfi_restore %r14 movq -32(%r11),%r13 .cfi_restore %r13 movq -24(%r11),%r12 .cfi_restore %r12 movq -16(%r11),%rbp .cfi_restore %rbp movq -8(%r11),%rbx .cfi_restore %rbx leaq (%r11),%rsp .cfi_def_cfa_register %rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 .cfi_endproc .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 +.type sha1_block_data_order_avx,@function +.align 16 +sha1_block_data_order_avx: +_avx_shortcut: +.cfi_startproc + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + leaq -64(%rsp),%rsp + vzeroupper + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm11,%xmm0,%xmm4 + vpaddd %xmm11,%xmm1,%xmm5 + vpaddd %xmm11,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + jmp .Loop_avx +.align 16 +.Loop_avx: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm11,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm10 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm4,%xmm4 + addl %esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm4,%xmm4 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vpxor %xmm10,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + 
addl 16(%rsp),%eax + vpaddd %xmm4,%xmm11,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm10 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm10,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa -32(%r14),%xmm11 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%edi + addl 32(%rsp),%ebx + vpaddd %xmm5,%xmm11,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl %edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm10 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm6,%xmm6 + addl %esi,%ebp + andl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm10,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm11,%xmm9 + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm10 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm10,%xmm7,%xmm7 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm11,%xmm9 + addl %esi,%edx 
+ andl %eax,%edi + vpxor %xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm11,%xmm9 + vmovdqa 0(%r14),%xmm11 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + vpaddd %xmm3,%xmm11,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl 
$5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm11,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm11,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm11,%xmm9 + vmovdqa 32(%r14),%xmm11 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + 
vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm11,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor %xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm11,%xmm9 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm11,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl 
%eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm11,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je .Ldone_avx + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm11,%xmm0,%xmm4 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm11,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm5,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm11,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm6,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp .Loop_avx + +.align 16 +.Ldone_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 
20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroupper + + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha1_block_data_order_avx,.-sha1_block_data_order_avx +.type sha1_block_data_order_avx2,@function +.align 16 +sha1_block_data_order_avx2: +_avx2_shortcut: +.cfi_startproc + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + vzeroupper + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + leaq -640(%rsp),%rsp + shlq $6,%r10 + leaq 64(%r9),%r13 + andq $-128,%rsp + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + cmpq %r10,%r13 + cmovaeq %r9,%r13 + movl 4(%r8),%ebp + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl 16(%r8),%esi + vmovdqu 64(%r14),%ymm6 + + vmovdqu (%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + leaq 64(%r9),%r9 + vinserti128 $1,(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vpshufb %ymm6,%ymm0,%ymm0 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vpshufb %ymm6,%ymm1,%ymm1 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + vpshufb %ymm6,%ymm2,%ymm2 + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm3,%ymm3 + + vpaddd %ymm11,%ymm0,%ymm4 + vpaddd %ymm11,%ymm1,%ymm5 + vmovdqu %ymm4,0(%rsp) + vpaddd %ymm11,%ymm2,%ymm6 + vmovdqu %ymm5,32(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + vmovdqu %ymm6,64(%rsp) + vmovdqu %ymm7,96(%rsp) + vpalignr $8,%ymm0,%ymm1,%ymm4 + vpsrldq $4,%ymm3,%ymm8 + vpxor 
%ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $31,%ymm4,%ymm8 + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + vpxor %ymm10,%ymm4,%ymm4 + vpaddd %ymm11,%ymm4,%ymm9 + vmovdqu %ymm9,128(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm5 + vpsrldq $4,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm10,%ymm5,%ymm5 + vpaddd %ymm11,%ymm5,%ymm9 + vmovdqu %ymm9,160(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm6 + vpsrldq $4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $31,%ymm6,%ymm8 + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + vpxor %ymm10,%ymm6,%ymm6 + vpaddd %ymm11,%ymm6,%ymm9 + vmovdqu %ymm9,192(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm7 + vpsrldq $4,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm7,%ymm8 + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + vpxor %ymm10,%ymm7,%ymm7 + vpaddd %ymm11,%ymm7,%ymm9 + vmovdqu %ymm9,224(%rsp) + leaq 128(%rsp),%r13 + jmp .Loop_avx2 +.align 32 +.Loop_avx2: + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + jmp .Lalign32_1 +.align 32 +.Lalign32_1: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + vpxor %ymm1,%ymm0,%ymm0 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpxor %ymm8,%ymm0,%ymm0 + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vpor %ymm8,%ymm0,%ymm0 + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + vpaddd %ymm11,%ymm0,%ymm9 + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + vmovdqu %ymm9,256(%rsp) + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + vpxor %ymm2,%ymm1,%ymm1 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpxor %ymm8,%ymm1,%ymm1 + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vpor %ymm8,%ymm1,%ymm1 + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + vpaddd %ymm11,%ymm1,%ymm9 + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vmovdqu %ymm9,288(%rsp) + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl 
$27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + vpxor %ymm3,%ymm2,%ymm2 + vmovdqu 0(%r14),%ymm11 + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpxor %ymm8,%ymm2,%ymm2 + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vpor %ymm8,%ymm2,%ymm2 + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + vpaddd %ymm11,%ymm2,%ymm9 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vmovdqu %ymm9,320(%rsp) + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + vpxor %ymm4,%ymm3,%ymm3 + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpxor %ymm8,%ymm3,%ymm3 + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + vpor %ymm8,%ymm3,%ymm3 + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + vpaddd %ymm11,%ymm3,%ymm9 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vmovdqu %ymm9,352(%rsp) + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpalignr $8,%ymm2,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpxor %ymm5,%ymm4,%ymm4 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpxor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + vpsrld $30,%ymm4,%ymm8 + vpslld $2,%ymm4,%ymm4 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpor %ymm8,%ymm4,%ymm4 + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpaddd %ymm11,%ymm4,%ymm9 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 44(%r13),%eax + vmovdqu %ymm9,384(%rsp) + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpalignr $8,%ymm3,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm6,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpxor %ymm8,%ymm5,%ymm5 + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + vpsrld 
$30,%ymm5,%ymm8 + vpslld $2,%ymm5,%ymm5 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vpor %ymm8,%ymm5,%ymm5 + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + vmovdqu %ymm9,416(%rsp) + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm7,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + vpxor %ymm8,%ymm6,%ymm6 + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + vpsrld $30,%ymm6,%ymm8 + vpslld $2,%ymm6,%ymm6 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpor %ymm8,%ymm6,%ymm6 + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + vmovdqu %ymm9,448(%rsp) + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm5,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm0,%ymm7,%ymm7 + vmovdqu 32(%r14),%ymm11 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpxor %ymm8,%ymm7,%ymm7 + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + vpsrld $30,%ymm7,%ymm8 + vpslld $2,%ymm7,%ymm7 + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpor %ymm8,%ymm7,%ymm7 + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + vmovdqu %ymm9,480(%rsp) + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + jmp .Lalign32_2 +.align 32 +.Lalign32_2: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -64(%r13),%ebp + xorl %esi,%ecx + vpxor %ymm1,%ymm0,%ymm0 + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + vpxor %ymm8,%ymm0,%ymm0 + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + vpor %ymm8,%ymm0,%ymm0 + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpaddd %ymm11,%ymm0,%ymm9 + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + vmovdqu %ymm9,512(%rsp) + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 
-52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -28(%r13),%ebx + xorl %eax,%edx + vpxor %ymm2,%ymm1,%ymm1 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpxor %ymm8,%ymm1,%ymm1 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + vpor %ymm8,%ymm1,%ymm1 + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpaddd %ymm11,%ymm1,%ymm9 + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + vmovdqu %ymm9,544(%rsp) + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl 8(%r13),%ecx + xorl %ebp,%esi + vpxor %ymm3,%ymm2,%ymm2 + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + vpxor %ymm8,%ymm2,%ymm2 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + vpor %ymm8,%ymm2,%ymm2 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpaddd %ymm11,%ymm2,%ymm9 + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + vmovdqu %ymm9,576(%rsp) + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl 44(%r13),%edx + xorl %ebx,%eax + vpxor %ymm4,%ymm3,%ymm3 + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm3,%ymm3 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl %r12d,%edx + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + vpor %ymm8,%ymm3,%ymm3 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpaddd %ymm11,%ymm3,%ymm9 + addl %r12d,%ecx + andl %edi,%edx + addl 68(%r13),%ebx + xorl %eax,%edx + vmovdqu %ymm9,608(%rsp) + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl 
$2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -96(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -60(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%r9),%r13 + leaq 128(%r9),%rdi + cmpq %r10,%r13 + cmovaeq %r9,%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl 
%ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + je .Ldone_avx2 + vmovdqu 64(%r14),%ymm6 + cmpq %r10,%rdi + ja .Last_avx2 + + vmovdqu -64(%rdi),%xmm0 + vmovdqu -48(%rdi),%xmm1 + vmovdqu -32(%rdi),%xmm2 + vmovdqu -16(%rdi),%xmm3 + vinserti128 $1,0(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + jmp .Last_avx2 + +.align 32 +.Last_avx2: + leaq 128+16(%rsp),%r13 + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + subq $-128,%r9 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + 
rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 44(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm0,%ymm0 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpshufb %ymm6,%ymm1,%ymm1 + vpaddd %ymm11,%ymm0,%ymm8 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vmovdqu %ymm8,0(%rsp) + vpshufb %ymm6,%ymm2,%ymm2 + vpaddd %ymm11,%ymm1,%ymm9 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + vmovdqu %ymm9,32(%rsp) + vpshufb %ymm6,%ymm3,%ymm3 + vpaddd %ymm11,%ymm2,%ymm6 + addl -64(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl 
$2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + jmp .Lalign32_3 +.align 32 +.Lalign32_3: + vmovdqu %ymm6,64(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + addl -28(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vmovdqu %ymm7,96(%rsp) + addl 8(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm0,%ymm1,%ymm4 + addl 44(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + vpsrldq $4,%ymm3,%ymm8 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + xorl %ebp,%esi + addl %r12d,%edx + vpxor %ymm8,%ymm4,%ymm4 + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + vpsrld $31,%ymm4,%ymm8 + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + rorxl $2,%edx,%esi + xorl %eax,%edx + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + andl 
%edi,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + addl 68(%r13),%ebx + xorl %eax,%edx + vpxor %ymm10,%ymm4,%ymm4 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpaddd %ymm11,%ymm4,%ymm9 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vmovdqu %ymm9,128(%rsp) + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm1,%ymm2,%ymm5 + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrldq $4,%ymm4,%ymm8 + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + xorl %eax,%edx + addl %r12d,%ecx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + vpxor %ymm10,%ymm5,%ymm5 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vmovdqu %ymm9,160(%rsp) + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm2,%ymm3,%ymm6 + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpsrldq $4,%ymm5,%ymm8 + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm8,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + vpsrld $31,%ymm6,%ymm8 + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + xorl %ebp,%esi + addl %r12d,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + xorl %ebx,%esi + addl -96(%r13),%ecx + vpxor %ymm10,%ymm6,%ymm6 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vmovdqu %ymm9,192(%rsp) + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpalignr $8,%ymm3,%ymm4,%ymm7 + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpsrldq $4,%ymm6,%ymm8 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm8,%ymm7,%ymm7 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + vpsrld $31,%ymm7,%ymm8 + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + rorxl $27,%eax,%r12d + 
rorxl $2,%eax,%ebp + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + xorl %ebx,%eax + addl %r12d,%esi + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + xorl %ecx,%eax + addl -60(%r13),%edx + vpxor %ymm10,%ymm7,%ymm7 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vmovdqu %ymm9,224(%rsp) + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%rsp),%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + jbe .Loop_avx2 + +.Ldone_avx2: + vzeroupper + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 .align 64 K_XX_XX: .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 64 Index: stable/12/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/sha256-mb-x86_64.S (revision 364963) @@ -1,3278 +1,7950 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from sha256-mb-x86_64.pl. 
*/ .text .globl sha256_multi_block .type sha256_multi_block,@function .align 32 sha256_multi_block: .cfi_startproc movq OPENSSL_ia32cap_P+4(%rip),%rcx btq $61,%rcx jc _shaext_shortcut + testl $268435456,%ecx + jnz _avx_shortcut movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 subq $288,%rsp andq $-256,%rsp movq %rax,272(%rsp) .cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 .Lbody: leaq K256+128(%rip),%rbp leaq 256(%rsp),%rbx leaq 128(%rdi),%rdi .Loop_grande: movl %edx,280(%rsp) xorl %edx,%edx movq 0(%rsi),%r8 movl 8(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,0(%rbx) cmovleq %rbp,%r8 movq 16(%rsi),%r9 movl 24(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,4(%rbx) cmovleq %rbp,%r9 movq 32(%rsi),%r10 movl 40(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,8(%rbx) cmovleq %rbp,%r10 movq 48(%rsi),%r11 movl 56(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,12(%rbx) cmovleq %rbp,%r11 testl %edx,%edx jz .Ldone movdqu 0-128(%rdi),%xmm8 leaq 128(%rsp),%rax movdqu 32-128(%rdi),%xmm9 movdqu 64-128(%rdi),%xmm10 movdqu 96-128(%rdi),%xmm11 movdqu 128-128(%rdi),%xmm12 movdqu 160-128(%rdi),%xmm13 movdqu 192-128(%rdi),%xmm14 movdqu 224-128(%rdi),%xmm15 movdqu .Lpbswap(%rip),%xmm6 jmp .Loop .align 32 .Loop: movdqa %xmm10,%xmm4 pxor %xmm9,%xmm4 movd 0(%r8),%xmm5 movd 0(%r9),%xmm0 movd 0(%r10),%xmm1 movd 0(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm12,%xmm7 .byte 102,15,56,0,238 movdqa %xmm12,%xmm2 psrld $6,%xmm7 movdqa %xmm12,%xmm1 pslld $7,%xmm2 movdqa %xmm5,0-128(%rax) paddd %xmm15,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -128(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm12,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm12,%xmm3 pslld $26-21,%xmm2 pandn %xmm14,%xmm0 pand %xmm13,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm8,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm8,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm9,%xmm3 movdqa %xmm8,%xmm7 pslld $10,%xmm2 pxor %xmm8,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm9,%xmm15 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm15 paddd %xmm5,%xmm11 pxor %xmm2,%xmm7 paddd %xmm5,%xmm15 paddd %xmm7,%xmm15 movd 4(%r8),%xmm5 movd 4(%r9),%xmm0 movd 4(%r10),%xmm1 movd 4(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm11,%xmm7 movdqa %xmm11,%xmm2 .byte 102,15,56,0,238 psrld $6,%xmm7 movdqa %xmm11,%xmm1 pslld $7,%xmm2 movdqa %xmm5,16-128(%rax) paddd %xmm14,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -96(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm11,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm11,%xmm4 pslld $26-21,%xmm2 pandn %xmm13,%xmm0 pand %xmm12,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm15,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm15,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm4,%xmm0 movdqa %xmm8,%xmm4 movdqa %xmm15,%xmm7 pslld $10,%xmm2 pxor %xmm15,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm8,%xmm14 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm14 paddd %xmm5,%xmm10 pxor %xmm2,%xmm7 paddd %xmm5,%xmm14 paddd %xmm7,%xmm14 movd 8(%r8),%xmm5 movd 8(%r9),%xmm0 movd 8(%r10),%xmm1 movd 8(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa 
%xmm10,%xmm7 .byte 102,15,56,0,238 movdqa %xmm10,%xmm2 psrld $6,%xmm7 movdqa %xmm10,%xmm1 pslld $7,%xmm2 movdqa %xmm5,32-128(%rax) paddd %xmm13,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -64(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm10,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm10,%xmm3 pslld $26-21,%xmm2 pandn %xmm12,%xmm0 pand %xmm11,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm14,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm14,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm15,%xmm3 movdqa %xmm14,%xmm7 pslld $10,%xmm2 pxor %xmm14,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm15,%xmm13 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm13 paddd %xmm5,%xmm9 pxor %xmm2,%xmm7 paddd %xmm5,%xmm13 paddd %xmm7,%xmm13 movd 12(%r8),%xmm5 movd 12(%r9),%xmm0 movd 12(%r10),%xmm1 movd 12(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm9,%xmm7 movdqa %xmm9,%xmm2 .byte 102,15,56,0,238 psrld $6,%xmm7 movdqa %xmm9,%xmm1 pslld $7,%xmm2 movdqa %xmm5,48-128(%rax) paddd %xmm12,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -32(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm9,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm9,%xmm4 pslld $26-21,%xmm2 pandn %xmm11,%xmm0 pand %xmm10,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm13,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm13,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm4,%xmm0 movdqa %xmm14,%xmm4 movdqa %xmm13,%xmm7 pslld $10,%xmm2 pxor %xmm13,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm14,%xmm12 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm12 paddd %xmm5,%xmm8 pxor %xmm2,%xmm7 paddd %xmm5,%xmm12 paddd %xmm7,%xmm12 movd 16(%r8),%xmm5 movd 16(%r9),%xmm0 movd 16(%r10),%xmm1 movd 16(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm8,%xmm7 .byte 102,15,56,0,238 movdqa %xmm8,%xmm2 psrld $6,%xmm7 movdqa %xmm8,%xmm1 pslld $7,%xmm2 movdqa %xmm5,64-128(%rax) paddd %xmm11,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 0(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm8,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm8,%xmm3 pslld $26-21,%xmm2 pandn %xmm10,%xmm0 pand %xmm9,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm12,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm12,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm13,%xmm3 movdqa %xmm12,%xmm7 pslld $10,%xmm2 pxor %xmm12,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm13,%xmm11 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm11 paddd %xmm5,%xmm15 pxor %xmm2,%xmm7 paddd %xmm5,%xmm11 paddd %xmm7,%xmm11 movd 20(%r8),%xmm5 movd 20(%r9),%xmm0 movd 20(%r10),%xmm1 movd 20(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm15,%xmm7 movdqa %xmm15,%xmm2 .byte 102,15,56,0,238 psrld $6,%xmm7 movdqa %xmm15,%xmm1 pslld $7,%xmm2 movdqa %xmm5,80-128(%rax) paddd %xmm10,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 32(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm15,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm15,%xmm4 pslld $26-21,%xmm2 pandn %xmm9,%xmm0 pand %xmm8,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm11,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm11,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm4,%xmm0 movdqa %xmm12,%xmm4 movdqa %xmm11,%xmm7 pslld 
$10,%xmm2 pxor %xmm11,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm12,%xmm10 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm10 paddd %xmm5,%xmm14 pxor %xmm2,%xmm7 paddd %xmm5,%xmm10 paddd %xmm7,%xmm10 movd 24(%r8),%xmm5 movd 24(%r9),%xmm0 movd 24(%r10),%xmm1 movd 24(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm14,%xmm7 .byte 102,15,56,0,238 movdqa %xmm14,%xmm2 psrld $6,%xmm7 movdqa %xmm14,%xmm1 pslld $7,%xmm2 movdqa %xmm5,96-128(%rax) paddd %xmm9,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 64(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm14,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm14,%xmm3 pslld $26-21,%xmm2 pandn %xmm8,%xmm0 pand %xmm15,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm10,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm10,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm11,%xmm3 movdqa %xmm10,%xmm7 pslld $10,%xmm2 pxor %xmm10,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm11,%xmm9 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm9 paddd %xmm5,%xmm13 pxor %xmm2,%xmm7 paddd %xmm5,%xmm9 paddd %xmm7,%xmm9 movd 28(%r8),%xmm5 movd 28(%r9),%xmm0 movd 28(%r10),%xmm1 movd 28(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm13,%xmm7 movdqa %xmm13,%xmm2 .byte 102,15,56,0,238 psrld $6,%xmm7 movdqa %xmm13,%xmm1 pslld $7,%xmm2 movdqa %xmm5,112-128(%rax) paddd %xmm8,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 96(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm13,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm13,%xmm4 pslld $26-21,%xmm2 pandn %xmm15,%xmm0 pand %xmm14,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm9,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm9,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm4,%xmm0 movdqa %xmm10,%xmm4 movdqa %xmm9,%xmm7 pslld $10,%xmm2 pxor %xmm9,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm10,%xmm8 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm8 paddd %xmm5,%xmm12 pxor %xmm2,%xmm7 paddd %xmm5,%xmm8 paddd %xmm7,%xmm8 leaq 256(%rbp),%rbp movd 32(%r8),%xmm5 movd 32(%r9),%xmm0 movd 32(%r10),%xmm1 movd 32(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm12,%xmm7 .byte 102,15,56,0,238 movdqa %xmm12,%xmm2 psrld $6,%xmm7 movdqa %xmm12,%xmm1 pslld $7,%xmm2 movdqa %xmm5,128-128(%rax) paddd %xmm15,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -128(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm12,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm12,%xmm3 pslld $26-21,%xmm2 pandn %xmm14,%xmm0 pand %xmm13,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm8,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm8,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm9,%xmm3 movdqa %xmm8,%xmm7 pslld $10,%xmm2 pxor %xmm8,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm9,%xmm15 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm15 paddd %xmm5,%xmm11 pxor %xmm2,%xmm7 paddd %xmm5,%xmm15 paddd %xmm7,%xmm15 movd 36(%r8),%xmm5 movd 36(%r9),%xmm0 movd 36(%r10),%xmm1 movd 36(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm11,%xmm7 movdqa %xmm11,%xmm2 .byte 102,15,56,0,238 
psrld $6,%xmm7 movdqa %xmm11,%xmm1 pslld $7,%xmm2 movdqa %xmm5,144-128(%rax) paddd %xmm14,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -96(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm11,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm11,%xmm4 pslld $26-21,%xmm2 pandn %xmm13,%xmm0 pand %xmm12,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm15,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm15,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm4,%xmm0 movdqa %xmm8,%xmm4 movdqa %xmm15,%xmm7 pslld $10,%xmm2 pxor %xmm15,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm8,%xmm14 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm14 paddd %xmm5,%xmm10 pxor %xmm2,%xmm7 paddd %xmm5,%xmm14 paddd %xmm7,%xmm14 movd 40(%r8),%xmm5 movd 40(%r9),%xmm0 movd 40(%r10),%xmm1 movd 40(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm10,%xmm7 .byte 102,15,56,0,238 movdqa %xmm10,%xmm2 psrld $6,%xmm7 movdqa %xmm10,%xmm1 pslld $7,%xmm2 movdqa %xmm5,160-128(%rax) paddd %xmm13,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -64(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm10,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm10,%xmm3 pslld $26-21,%xmm2 pandn %xmm12,%xmm0 pand %xmm11,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm14,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm14,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm15,%xmm3 movdqa %xmm14,%xmm7 pslld $10,%xmm2 pxor %xmm14,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm15,%xmm13 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm13 paddd %xmm5,%xmm9 pxor %xmm2,%xmm7 paddd %xmm5,%xmm13 paddd %xmm7,%xmm13 movd 44(%r8),%xmm5 movd 44(%r9),%xmm0 movd 44(%r10),%xmm1 movd 44(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm9,%xmm7 movdqa %xmm9,%xmm2 .byte 102,15,56,0,238 psrld $6,%xmm7 movdqa %xmm9,%xmm1 pslld $7,%xmm2 movdqa %xmm5,176-128(%rax) paddd %xmm12,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -32(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm9,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm9,%xmm4 pslld $26-21,%xmm2 pandn %xmm11,%xmm0 pand %xmm10,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm13,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm13,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm4,%xmm0 movdqa %xmm14,%xmm4 movdqa %xmm13,%xmm7 pslld $10,%xmm2 pxor %xmm13,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm14,%xmm12 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm12 paddd %xmm5,%xmm8 pxor %xmm2,%xmm7 paddd %xmm5,%xmm12 paddd %xmm7,%xmm12 movd 48(%r8),%xmm5 movd 48(%r9),%xmm0 movd 48(%r10),%xmm1 movd 48(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm8,%xmm7 .byte 102,15,56,0,238 movdqa %xmm8,%xmm2 psrld $6,%xmm7 movdqa %xmm8,%xmm1 pslld $7,%xmm2 movdqa %xmm5,192-128(%rax) paddd %xmm11,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 0(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm8,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm8,%xmm3 pslld $26-21,%xmm2 pandn %xmm10,%xmm0 pand %xmm9,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm12,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm12,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm13,%xmm3 movdqa %xmm12,%xmm7 pslld $10,%xmm2 pxor %xmm12,%xmm3 psrld $13,%xmm7 pxor 
%xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm13,%xmm11 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm11 paddd %xmm5,%xmm15 pxor %xmm2,%xmm7 paddd %xmm5,%xmm11 paddd %xmm7,%xmm11 movd 52(%r8),%xmm5 movd 52(%r9),%xmm0 movd 52(%r10),%xmm1 movd 52(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm15,%xmm7 movdqa %xmm15,%xmm2 .byte 102,15,56,0,238 psrld $6,%xmm7 movdqa %xmm15,%xmm1 pslld $7,%xmm2 movdqa %xmm5,208-128(%rax) paddd %xmm10,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 32(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm15,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm15,%xmm4 pslld $26-21,%xmm2 pandn %xmm9,%xmm0 pand %xmm8,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm11,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm11,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm4,%xmm0 movdqa %xmm12,%xmm4 movdqa %xmm11,%xmm7 pslld $10,%xmm2 pxor %xmm11,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm12,%xmm10 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm10 paddd %xmm5,%xmm14 pxor %xmm2,%xmm7 paddd %xmm5,%xmm10 paddd %xmm7,%xmm10 movd 56(%r8),%xmm5 movd 56(%r9),%xmm0 movd 56(%r10),%xmm1 movd 56(%r11),%xmm2 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm14,%xmm7 .byte 102,15,56,0,238 movdqa %xmm14,%xmm2 psrld $6,%xmm7 movdqa %xmm14,%xmm1 pslld $7,%xmm2 movdqa %xmm5,224-128(%rax) paddd %xmm9,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 64(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm14,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm14,%xmm3 pslld $26-21,%xmm2 pandn %xmm8,%xmm0 pand %xmm15,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm10,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm10,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm11,%xmm3 movdqa %xmm10,%xmm7 pslld $10,%xmm2 pxor %xmm10,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm11,%xmm9 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm9 paddd %xmm5,%xmm13 pxor %xmm2,%xmm7 paddd %xmm5,%xmm9 paddd %xmm7,%xmm9 movd 60(%r8),%xmm5 leaq 64(%r8),%r8 movd 60(%r9),%xmm0 leaq 64(%r9),%r9 movd 60(%r10),%xmm1 leaq 64(%r10),%r10 movd 60(%r11),%xmm2 leaq 64(%r11),%r11 punpckldq %xmm1,%xmm5 punpckldq %xmm2,%xmm0 punpckldq %xmm0,%xmm5 movdqa %xmm13,%xmm7 movdqa %xmm13,%xmm2 .byte 102,15,56,0,238 psrld $6,%xmm7 movdqa %xmm13,%xmm1 pslld $7,%xmm2 movdqa %xmm5,240-128(%rax) paddd %xmm8,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 96(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm13,%xmm0 prefetcht0 63(%r8) pxor %xmm2,%xmm7 movdqa %xmm13,%xmm4 pslld $26-21,%xmm2 pandn %xmm15,%xmm0 pand %xmm14,%xmm4 pxor %xmm1,%xmm7 prefetcht0 63(%r9) movdqa %xmm9,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm9,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm4,%xmm0 movdqa %xmm10,%xmm4 movdqa %xmm9,%xmm7 pslld $10,%xmm2 pxor %xmm9,%xmm4 prefetcht0 63(%r10) psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 prefetcht0 63(%r11) psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm10,%xmm8 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm8 paddd %xmm5,%xmm12 pxor %xmm2,%xmm7 paddd %xmm5,%xmm8 paddd %xmm7,%xmm8 leaq 256(%rbp),%rbp movdqu 0-128(%rax),%xmm5 movl $3,%ecx jmp .Loop_16_xx .align 32 .Loop_16_xx: movdqa 16-128(%rax),%xmm6 
paddd 144-128(%rax),%xmm5 movdqa %xmm6,%xmm7 movdqa %xmm6,%xmm1 psrld $3,%xmm7 movdqa %xmm6,%xmm2 psrld $7,%xmm1 movdqa 224-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm3 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm3,%xmm1 psrld $17,%xmm3 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 psrld $19-17,%xmm3 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm3,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm5 movdqa %xmm12,%xmm7 movdqa %xmm12,%xmm2 psrld $6,%xmm7 movdqa %xmm12,%xmm1 pslld $7,%xmm2 movdqa %xmm5,0-128(%rax) paddd %xmm15,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -128(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm12,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm12,%xmm3 pslld $26-21,%xmm2 pandn %xmm14,%xmm0 pand %xmm13,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm8,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm8,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm9,%xmm3 movdqa %xmm8,%xmm7 pslld $10,%xmm2 pxor %xmm8,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm9,%xmm15 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm15 paddd %xmm5,%xmm11 pxor %xmm2,%xmm7 paddd %xmm5,%xmm15 paddd %xmm7,%xmm15 movdqa 32-128(%rax),%xmm5 paddd 160-128(%rax),%xmm6 movdqa %xmm5,%xmm7 movdqa %xmm5,%xmm1 psrld $3,%xmm7 movdqa %xmm5,%xmm2 psrld $7,%xmm1 movdqa 240-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm4 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm4,%xmm1 psrld $17,%xmm4 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 psrld $19-17,%xmm4 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm4,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm6 movdqa %xmm11,%xmm7 movdqa %xmm11,%xmm2 psrld $6,%xmm7 movdqa %xmm11,%xmm1 pslld $7,%xmm2 movdqa %xmm6,16-128(%rax) paddd %xmm14,%xmm6 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -96(%rbp),%xmm6 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm11,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm11,%xmm4 pslld $26-21,%xmm2 pandn %xmm13,%xmm0 pand %xmm12,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm15,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm15,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 movdqa %xmm8,%xmm4 movdqa %xmm15,%xmm7 pslld $10,%xmm2 pxor %xmm15,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm6 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm8,%xmm14 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm14 paddd %xmm6,%xmm10 pxor %xmm2,%xmm7 paddd %xmm6,%xmm14 paddd %xmm7,%xmm14 movdqa 48-128(%rax),%xmm6 paddd 176-128(%rax),%xmm5 movdqa %xmm6,%xmm7 movdqa %xmm6,%xmm1 psrld $3,%xmm7 movdqa %xmm6,%xmm2 psrld $7,%xmm1 movdqa 0-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm3 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm3,%xmm1 psrld $17,%xmm3 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 psrld $19-17,%xmm3 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm3,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm5 movdqa %xmm10,%xmm7 movdqa %xmm10,%xmm2 psrld $6,%xmm7 movdqa %xmm10,%xmm1 pslld $7,%xmm2 movdqa %xmm5,32-128(%rax) paddd %xmm13,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -64(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm10,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm10,%xmm3 pslld $26-21,%xmm2 pandn %xmm12,%xmm0 pand %xmm11,%xmm3 
pxor %xmm1,%xmm7 movdqa %xmm14,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm14,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm15,%xmm3 movdqa %xmm14,%xmm7 pslld $10,%xmm2 pxor %xmm14,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm15,%xmm13 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm13 paddd %xmm5,%xmm9 pxor %xmm2,%xmm7 paddd %xmm5,%xmm13 paddd %xmm7,%xmm13 movdqa 64-128(%rax),%xmm5 paddd 192-128(%rax),%xmm6 movdqa %xmm5,%xmm7 movdqa %xmm5,%xmm1 psrld $3,%xmm7 movdqa %xmm5,%xmm2 psrld $7,%xmm1 movdqa 16-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm4 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm4,%xmm1 psrld $17,%xmm4 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 psrld $19-17,%xmm4 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm4,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm6 movdqa %xmm9,%xmm7 movdqa %xmm9,%xmm2 psrld $6,%xmm7 movdqa %xmm9,%xmm1 pslld $7,%xmm2 movdqa %xmm6,48-128(%rax) paddd %xmm12,%xmm6 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -32(%rbp),%xmm6 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm9,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm9,%xmm4 pslld $26-21,%xmm2 pandn %xmm11,%xmm0 pand %xmm10,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm13,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm13,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 movdqa %xmm14,%xmm4 movdqa %xmm13,%xmm7 pslld $10,%xmm2 pxor %xmm13,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm6 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm14,%xmm12 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm12 paddd %xmm6,%xmm8 pxor %xmm2,%xmm7 paddd %xmm6,%xmm12 paddd %xmm7,%xmm12 movdqa 80-128(%rax),%xmm6 paddd 208-128(%rax),%xmm5 movdqa %xmm6,%xmm7 movdqa %xmm6,%xmm1 psrld $3,%xmm7 movdqa %xmm6,%xmm2 psrld $7,%xmm1 movdqa 32-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm3 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm3,%xmm1 psrld $17,%xmm3 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 psrld $19-17,%xmm3 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm3,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm5 movdqa %xmm8,%xmm7 movdqa %xmm8,%xmm2 psrld $6,%xmm7 movdqa %xmm8,%xmm1 pslld $7,%xmm2 movdqa %xmm5,64-128(%rax) paddd %xmm11,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 0(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm8,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm8,%xmm3 pslld $26-21,%xmm2 pandn %xmm10,%xmm0 pand %xmm9,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm12,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm12,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm13,%xmm3 movdqa %xmm12,%xmm7 pslld $10,%xmm2 pxor %xmm12,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm13,%xmm11 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm11 paddd %xmm5,%xmm15 pxor %xmm2,%xmm7 paddd %xmm5,%xmm11 paddd %xmm7,%xmm11 movdqa 96-128(%rax),%xmm5 paddd 224-128(%rax),%xmm6 movdqa %xmm5,%xmm7 movdqa %xmm5,%xmm1 psrld $3,%xmm7 movdqa %xmm5,%xmm2 psrld $7,%xmm1 movdqa 48-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm4 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm4,%xmm1 psrld $17,%xmm4 pxor %xmm2,%xmm7 pslld 
$13,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 psrld $19-17,%xmm4 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm4,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm6 movdqa %xmm15,%xmm7 movdqa %xmm15,%xmm2 psrld $6,%xmm7 movdqa %xmm15,%xmm1 pslld $7,%xmm2 movdqa %xmm6,80-128(%rax) paddd %xmm10,%xmm6 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 32(%rbp),%xmm6 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm15,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm15,%xmm4 pslld $26-21,%xmm2 pandn %xmm9,%xmm0 pand %xmm8,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm11,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm11,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 movdqa %xmm12,%xmm4 movdqa %xmm11,%xmm7 pslld $10,%xmm2 pxor %xmm11,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm6 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm12,%xmm10 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm10 paddd %xmm6,%xmm14 pxor %xmm2,%xmm7 paddd %xmm6,%xmm10 paddd %xmm7,%xmm10 movdqa 112-128(%rax),%xmm6 paddd 240-128(%rax),%xmm5 movdqa %xmm6,%xmm7 movdqa %xmm6,%xmm1 psrld $3,%xmm7 movdqa %xmm6,%xmm2 psrld $7,%xmm1 movdqa 64-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm3 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm3,%xmm1 psrld $17,%xmm3 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 psrld $19-17,%xmm3 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm3,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm5 movdqa %xmm14,%xmm7 movdqa %xmm14,%xmm2 psrld $6,%xmm7 movdqa %xmm14,%xmm1 pslld $7,%xmm2 movdqa %xmm5,96-128(%rax) paddd %xmm9,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 64(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm14,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm14,%xmm3 pslld $26-21,%xmm2 pandn %xmm8,%xmm0 pand %xmm15,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm10,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm10,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm11,%xmm3 movdqa %xmm10,%xmm7 pslld $10,%xmm2 pxor %xmm10,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm11,%xmm9 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm9 paddd %xmm5,%xmm13 pxor %xmm2,%xmm7 paddd %xmm5,%xmm9 paddd %xmm7,%xmm9 movdqa 128-128(%rax),%xmm5 paddd 0-128(%rax),%xmm6 movdqa %xmm5,%xmm7 movdqa %xmm5,%xmm1 psrld $3,%xmm7 movdqa %xmm5,%xmm2 psrld $7,%xmm1 movdqa 80-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm4 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm4,%xmm1 psrld $17,%xmm4 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 psrld $19-17,%xmm4 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm4,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm6 movdqa %xmm13,%xmm7 movdqa %xmm13,%xmm2 psrld $6,%xmm7 movdqa %xmm13,%xmm1 pslld $7,%xmm2 movdqa %xmm6,112-128(%rax) paddd %xmm8,%xmm6 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 96(%rbp),%xmm6 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm13,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm13,%xmm4 pslld $26-21,%xmm2 pandn %xmm15,%xmm0 pand %xmm14,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm9,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm9,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 movdqa %xmm10,%xmm4 movdqa %xmm9,%xmm7 pslld $10,%xmm2 pxor %xmm9,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm6 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa 
%xmm10,%xmm8 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm8 paddd %xmm6,%xmm12 pxor %xmm2,%xmm7 paddd %xmm6,%xmm8 paddd %xmm7,%xmm8 leaq 256(%rbp),%rbp movdqa 144-128(%rax),%xmm6 paddd 16-128(%rax),%xmm5 movdqa %xmm6,%xmm7 movdqa %xmm6,%xmm1 psrld $3,%xmm7 movdqa %xmm6,%xmm2 psrld $7,%xmm1 movdqa 96-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm3 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm3,%xmm1 psrld $17,%xmm3 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 psrld $19-17,%xmm3 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm3,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm5 movdqa %xmm12,%xmm7 movdqa %xmm12,%xmm2 psrld $6,%xmm7 movdqa %xmm12,%xmm1 pslld $7,%xmm2 movdqa %xmm5,128-128(%rax) paddd %xmm15,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -128(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm12,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm12,%xmm3 pslld $26-21,%xmm2 pandn %xmm14,%xmm0 pand %xmm13,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm8,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm8,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm9,%xmm3 movdqa %xmm8,%xmm7 pslld $10,%xmm2 pxor %xmm8,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm9,%xmm15 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm15 paddd %xmm5,%xmm11 pxor %xmm2,%xmm7 paddd %xmm5,%xmm15 paddd %xmm7,%xmm15 movdqa 160-128(%rax),%xmm5 paddd 32-128(%rax),%xmm6 movdqa %xmm5,%xmm7 movdqa %xmm5,%xmm1 psrld $3,%xmm7 movdqa %xmm5,%xmm2 psrld $7,%xmm1 movdqa 112-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm4 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm4,%xmm1 psrld $17,%xmm4 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 psrld $19-17,%xmm4 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm4,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm6 movdqa %xmm11,%xmm7 movdqa %xmm11,%xmm2 psrld $6,%xmm7 movdqa %xmm11,%xmm1 pslld $7,%xmm2 movdqa %xmm6,144-128(%rax) paddd %xmm14,%xmm6 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -96(%rbp),%xmm6 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm11,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm11,%xmm4 pslld $26-21,%xmm2 pandn %xmm13,%xmm0 pand %xmm12,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm15,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm15,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 movdqa %xmm8,%xmm4 movdqa %xmm15,%xmm7 pslld $10,%xmm2 pxor %xmm15,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm6 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm8,%xmm14 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm14 paddd %xmm6,%xmm10 pxor %xmm2,%xmm7 paddd %xmm6,%xmm14 paddd %xmm7,%xmm14 movdqa 176-128(%rax),%xmm6 paddd 48-128(%rax),%xmm5 movdqa %xmm6,%xmm7 movdqa %xmm6,%xmm1 psrld $3,%xmm7 movdqa %xmm6,%xmm2 psrld $7,%xmm1 movdqa 128-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm3 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm3,%xmm1 psrld $17,%xmm3 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 psrld $19-17,%xmm3 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm3,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm5 movdqa %xmm10,%xmm7 movdqa %xmm10,%xmm2 psrld $6,%xmm7 movdqa %xmm10,%xmm1 pslld $7,%xmm2 movdqa %xmm5,160-128(%rax) paddd %xmm13,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 
pslld $21-7,%xmm2 paddd -64(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm10,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm10,%xmm3 pslld $26-21,%xmm2 pandn %xmm12,%xmm0 pand %xmm11,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm14,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm14,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm15,%xmm3 movdqa %xmm14,%xmm7 pslld $10,%xmm2 pxor %xmm14,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm15,%xmm13 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm13 paddd %xmm5,%xmm9 pxor %xmm2,%xmm7 paddd %xmm5,%xmm13 paddd %xmm7,%xmm13 movdqa 192-128(%rax),%xmm5 paddd 64-128(%rax),%xmm6 movdqa %xmm5,%xmm7 movdqa %xmm5,%xmm1 psrld $3,%xmm7 movdqa %xmm5,%xmm2 psrld $7,%xmm1 movdqa 144-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm4 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm4,%xmm1 psrld $17,%xmm4 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 psrld $19-17,%xmm4 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm4,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm6 movdqa %xmm9,%xmm7 movdqa %xmm9,%xmm2 psrld $6,%xmm7 movdqa %xmm9,%xmm1 pslld $7,%xmm2 movdqa %xmm6,176-128(%rax) paddd %xmm12,%xmm6 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd -32(%rbp),%xmm6 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm9,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm9,%xmm4 pslld $26-21,%xmm2 pandn %xmm11,%xmm0 pand %xmm10,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm13,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm13,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 movdqa %xmm14,%xmm4 movdqa %xmm13,%xmm7 pslld $10,%xmm2 pxor %xmm13,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm6 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm14,%xmm12 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm12 paddd %xmm6,%xmm8 pxor %xmm2,%xmm7 paddd %xmm6,%xmm12 paddd %xmm7,%xmm12 movdqa 208-128(%rax),%xmm6 paddd 80-128(%rax),%xmm5 movdqa %xmm6,%xmm7 movdqa %xmm6,%xmm1 psrld $3,%xmm7 movdqa %xmm6,%xmm2 psrld $7,%xmm1 movdqa 160-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm3 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm3,%xmm1 psrld $17,%xmm3 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 psrld $19-17,%xmm3 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm3,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm5 movdqa %xmm8,%xmm7 movdqa %xmm8,%xmm2 psrld $6,%xmm7 movdqa %xmm8,%xmm1 pslld $7,%xmm2 movdqa %xmm5,192-128(%rax) paddd %xmm11,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 0(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm8,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm8,%xmm3 pslld $26-21,%xmm2 pandn %xmm10,%xmm0 pand %xmm9,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm12,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm12,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm13,%xmm3 movdqa %xmm12,%xmm7 pslld $10,%xmm2 pxor %xmm12,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm13,%xmm11 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm11 paddd %xmm5,%xmm15 pxor %xmm2,%xmm7 paddd %xmm5,%xmm11 paddd %xmm7,%xmm11 movdqa 224-128(%rax),%xmm5 paddd 96-128(%rax),%xmm6 movdqa %xmm5,%xmm7 movdqa %xmm5,%xmm1 psrld $3,%xmm7 movdqa %xmm5,%xmm2 psrld $7,%xmm1 movdqa 176-128(%rax),%xmm0 pslld 
$14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm4 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm4,%xmm1 psrld $17,%xmm4 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 psrld $19-17,%xmm4 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm4,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm6 movdqa %xmm15,%xmm7 movdqa %xmm15,%xmm2 psrld $6,%xmm7 movdqa %xmm15,%xmm1 pslld $7,%xmm2 movdqa %xmm6,208-128(%rax) paddd %xmm10,%xmm6 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 32(%rbp),%xmm6 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm15,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm15,%xmm4 pslld $26-21,%xmm2 pandn %xmm9,%xmm0 pand %xmm8,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm11,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm11,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 movdqa %xmm12,%xmm4 movdqa %xmm11,%xmm7 pslld $10,%xmm2 pxor %xmm11,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm6 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm12,%xmm10 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm10 paddd %xmm6,%xmm14 pxor %xmm2,%xmm7 paddd %xmm6,%xmm10 paddd %xmm7,%xmm10 movdqa 240-128(%rax),%xmm6 paddd 112-128(%rax),%xmm5 movdqa %xmm6,%xmm7 movdqa %xmm6,%xmm1 psrld $3,%xmm7 movdqa %xmm6,%xmm2 psrld $7,%xmm1 movdqa 192-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm3 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm3,%xmm1 psrld $17,%xmm3 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 psrld $19-17,%xmm3 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm3,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm5 movdqa %xmm14,%xmm7 movdqa %xmm14,%xmm2 psrld $6,%xmm7 movdqa %xmm14,%xmm1 pslld $7,%xmm2 movdqa %xmm5,224-128(%rax) paddd %xmm9,%xmm5 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 64(%rbp),%xmm5 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm14,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm14,%xmm3 pslld $26-21,%xmm2 pandn %xmm8,%xmm0 pand %xmm15,%xmm3 pxor %xmm1,%xmm7 movdqa %xmm10,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm10,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm5 pxor %xmm3,%xmm0 movdqa %xmm11,%xmm3 movdqa %xmm10,%xmm7 pslld $10,%xmm2 pxor %xmm10,%xmm3 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm5 pslld $19-10,%xmm2 pand %xmm3,%xmm4 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm11,%xmm9 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm4,%xmm9 paddd %xmm5,%xmm13 pxor %xmm2,%xmm7 paddd %xmm5,%xmm9 paddd %xmm7,%xmm9 movdqa 0-128(%rax),%xmm5 paddd 128-128(%rax),%xmm6 movdqa %xmm5,%xmm7 movdqa %xmm5,%xmm1 psrld $3,%xmm7 movdqa %xmm5,%xmm2 psrld $7,%xmm1 movdqa 208-128(%rax),%xmm0 pslld $14,%xmm2 pxor %xmm1,%xmm7 psrld $18-7,%xmm1 movdqa %xmm0,%xmm4 pxor %xmm2,%xmm7 pslld $25-14,%xmm2 pxor %xmm1,%xmm7 psrld $10,%xmm0 movdqa %xmm4,%xmm1 psrld $17,%xmm4 pxor %xmm2,%xmm7 pslld $13,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 psrld $19-17,%xmm4 pxor %xmm1,%xmm0 pslld $15-13,%xmm1 pxor %xmm4,%xmm0 pxor %xmm1,%xmm0 paddd %xmm0,%xmm6 movdqa %xmm13,%xmm7 movdqa %xmm13,%xmm2 psrld $6,%xmm7 movdqa %xmm13,%xmm1 pslld $7,%xmm2 movdqa %xmm6,240-128(%rax) paddd %xmm8,%xmm6 psrld $11,%xmm1 pxor %xmm2,%xmm7 pslld $21-7,%xmm2 paddd 96(%rbp),%xmm6 pxor %xmm1,%xmm7 psrld $25-11,%xmm1 movdqa %xmm13,%xmm0 pxor %xmm2,%xmm7 movdqa %xmm13,%xmm4 pslld $26-21,%xmm2 pandn %xmm15,%xmm0 pand %xmm14,%xmm4 pxor %xmm1,%xmm7 movdqa %xmm9,%xmm1 pxor %xmm2,%xmm7 movdqa %xmm9,%xmm2 psrld $2,%xmm1 paddd %xmm7,%xmm6 pxor %xmm4,%xmm0 movdqa %xmm10,%xmm4 
movdqa %xmm9,%xmm7 pslld $10,%xmm2 pxor %xmm9,%xmm4 psrld $13,%xmm7 pxor %xmm2,%xmm1 paddd %xmm0,%xmm6 pslld $19-10,%xmm2 pand %xmm4,%xmm3 pxor %xmm7,%xmm1 psrld $22-13,%xmm7 pxor %xmm2,%xmm1 movdqa %xmm10,%xmm8 pslld $30-19,%xmm2 pxor %xmm1,%xmm7 pxor %xmm3,%xmm8 paddd %xmm6,%xmm12 pxor %xmm2,%xmm7 paddd %xmm6,%xmm8 paddd %xmm7,%xmm8 leaq 256(%rbp),%rbp decl %ecx jnz .Loop_16_xx movl $1,%ecx leaq K256+128(%rip),%rbp movdqa (%rbx),%xmm7 cmpl 0(%rbx),%ecx pxor %xmm0,%xmm0 cmovgeq %rbp,%r8 cmpl 4(%rbx),%ecx movdqa %xmm7,%xmm6 cmovgeq %rbp,%r9 cmpl 8(%rbx),%ecx pcmpgtd %xmm0,%xmm6 cmovgeq %rbp,%r10 cmpl 12(%rbx),%ecx paddd %xmm6,%xmm7 cmovgeq %rbp,%r11 movdqu 0-128(%rdi),%xmm0 pand %xmm6,%xmm8 movdqu 32-128(%rdi),%xmm1 pand %xmm6,%xmm9 movdqu 64-128(%rdi),%xmm2 pand %xmm6,%xmm10 movdqu 96-128(%rdi),%xmm5 pand %xmm6,%xmm11 paddd %xmm0,%xmm8 movdqu 128-128(%rdi),%xmm0 pand %xmm6,%xmm12 paddd %xmm1,%xmm9 movdqu 160-128(%rdi),%xmm1 pand %xmm6,%xmm13 paddd %xmm2,%xmm10 movdqu 192-128(%rdi),%xmm2 pand %xmm6,%xmm14 paddd %xmm5,%xmm11 movdqu 224-128(%rdi),%xmm5 pand %xmm6,%xmm15 paddd %xmm0,%xmm12 paddd %xmm1,%xmm13 movdqu %xmm8,0-128(%rdi) paddd %xmm2,%xmm14 movdqu %xmm9,32-128(%rdi) paddd %xmm5,%xmm15 movdqu %xmm10,64-128(%rdi) movdqu %xmm11,96-128(%rdi) movdqu %xmm12,128-128(%rdi) movdqu %xmm13,160-128(%rdi) movdqu %xmm14,192-128(%rdi) movdqu %xmm15,224-128(%rdi) movdqa %xmm7,(%rbx) movdqa .Lpbswap(%rip),%xmm6 decl %edx jnz .Loop movl 280(%rsp),%edx leaq 16(%rdi),%rdi leaq 64(%rsi),%rsi decl %edx jnz .Loop_grande .Ldone: movq 272(%rsp),%rax .cfi_def_cfa %rax,8 movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 .cfi_endproc .size sha256_multi_block,.-sha256_multi_block .type sha256_multi_block_shaext,@function .align 32 sha256_multi_block_shaext: .cfi_startproc _shaext_shortcut: movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 subq $288,%rsp shll $1,%edx andq $-256,%rsp leaq 128(%rdi),%rdi movq %rax,272(%rsp) .Lbody_shaext: leaq 256(%rsp),%rbx leaq K256_shaext+128(%rip),%rbp .Loop_grande_shaext: movl %edx,280(%rsp) xorl %edx,%edx movq 0(%rsi),%r8 movl 8(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,0(%rbx) cmovleq %rsp,%r8 movq 16(%rsi),%r9 movl 24(%rsi),%ecx cmpl %edx,%ecx cmovgl %ecx,%edx testl %ecx,%ecx movl %ecx,4(%rbx) cmovleq %rsp,%r9 testl %edx,%edx jz .Ldone_shaext movq 0-128(%rdi),%xmm12 movq 32-128(%rdi),%xmm4 movq 64-128(%rdi),%xmm13 movq 96-128(%rdi),%xmm5 movq 128-128(%rdi),%xmm8 movq 160-128(%rdi),%xmm9 movq 192-128(%rdi),%xmm10 movq 224-128(%rdi),%xmm11 punpckldq %xmm4,%xmm12 punpckldq %xmm5,%xmm13 punpckldq %xmm9,%xmm8 punpckldq %xmm11,%xmm10 movdqa K256_shaext-16(%rip),%xmm3 movdqa %xmm12,%xmm14 movdqa %xmm13,%xmm15 punpcklqdq %xmm8,%xmm12 punpcklqdq %xmm10,%xmm13 punpckhqdq %xmm8,%xmm14 punpckhqdq %xmm10,%xmm15 pshufd $27,%xmm12,%xmm12 pshufd $27,%xmm13,%xmm13 pshufd $27,%xmm14,%xmm14 pshufd $27,%xmm15,%xmm15 jmp .Loop_shaext .align 32 .Loop_shaext: movdqu 0(%r8),%xmm4 movdqu 0(%r9),%xmm8 movdqu 16(%r8),%xmm5 movdqu 16(%r9),%xmm9 movdqu 32(%r8),%xmm6 .byte 102,15,56,0,227 movdqu 32(%r9),%xmm10 .byte 102,68,15,56,0,195 movdqu 48(%r8),%xmm7 leaq 64(%r8),%r8 movdqu 48(%r9),%xmm11 leaq 64(%r9),%r9 movdqa 0-128(%rbp),%xmm0 .byte 102,15,56,0,235 paddd %xmm4,%xmm0 pxor %xmm12,%xmm4 movdqa %xmm0,%xmm1 movdqa 0-128(%rbp),%xmm2 .byte 102,68,15,56,0,203 paddd %xmm8,%xmm2 movdqa %xmm13,80(%rsp) .byte 69,15,56,203,236 
pxor %xmm14,%xmm8 movdqa %xmm2,%xmm0 movdqa %xmm15,112(%rsp) .byte 69,15,56,203,254 pshufd $0x0e,%xmm1,%xmm0 pxor %xmm12,%xmm4 movdqa %xmm12,64(%rsp) .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 pxor %xmm14,%xmm8 movdqa %xmm14,96(%rsp) movdqa 16-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 102,15,56,0,243 .byte 69,15,56,203,247 movdqa %xmm1,%xmm0 movdqa 16-128(%rbp),%xmm2 paddd %xmm9,%xmm2 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 prefetcht0 127(%r8) .byte 102,15,56,0,251 .byte 102,68,15,56,0,211 prefetcht0 127(%r9) .byte 69,15,56,203,254 pshufd $0x0e,%xmm1,%xmm0 .byte 102,68,15,56,0,219 .byte 15,56,204,229 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 32-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 movdqa %xmm1,%xmm0 movdqa 32-128(%rbp),%xmm2 paddd %xmm10,%xmm2 .byte 69,15,56,203,236 .byte 69,15,56,204,193 movdqa %xmm2,%xmm0 movdqa %xmm7,%xmm3 .byte 69,15,56,203,254 pshufd $0x0e,%xmm1,%xmm0 .byte 102,15,58,15,222,4 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 48-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 .byte 69,15,56,204,202 movdqa %xmm1,%xmm0 movdqa 48-128(%rbp),%xmm2 paddd %xmm3,%xmm8 paddd %xmm11,%xmm2 .byte 15,56,205,231 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 movdqa %xmm4,%xmm3 .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 64-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 .byte 69,15,56,204,211 movdqa %xmm1,%xmm0 movdqa 64-128(%rbp),%xmm2 paddd %xmm3,%xmm9 paddd %xmm8,%xmm2 .byte 15,56,205,236 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 movdqa %xmm5,%xmm3 .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 80-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 .byte 69,15,56,204,216 movdqa %xmm1,%xmm0 movdqa 80-128(%rbp),%xmm2 paddd %xmm3,%xmm10 paddd %xmm9,%xmm2 .byte 15,56,205,245 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 movdqa %xmm6,%xmm3 .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 .byte 15,56,204,229 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 96-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 .byte 69,15,56,204,193 movdqa %xmm1,%xmm0 movdqa 96-128(%rbp),%xmm2 paddd %xmm3,%xmm11 paddd %xmm10,%xmm2 .byte 15,56,205,254 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 movdqa %xmm7,%xmm3 .byte 102,15,58,15,222,4 .byte 69,15,56,203,254 .byte 69,15,56,205,218 pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 112-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 .byte 69,15,56,204,202 movdqa %xmm1,%xmm0 movdqa 112-128(%rbp),%xmm2 paddd %xmm3,%xmm8 paddd %xmm11,%xmm2 .byte 15,56,205,231 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 movdqa %xmm4,%xmm3 .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 128-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 
69,15,56,203,247 .byte 69,15,56,204,211 movdqa %xmm1,%xmm0 movdqa 128-128(%rbp),%xmm2 paddd %xmm3,%xmm9 paddd %xmm8,%xmm2 .byte 15,56,205,236 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 movdqa %xmm5,%xmm3 .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 144-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 .byte 69,15,56,204,216 movdqa %xmm1,%xmm0 movdqa 144-128(%rbp),%xmm2 paddd %xmm3,%xmm10 paddd %xmm9,%xmm2 .byte 15,56,205,245 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 movdqa %xmm6,%xmm3 .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 .byte 15,56,204,229 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 160-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 .byte 69,15,56,204,193 movdqa %xmm1,%xmm0 movdqa 160-128(%rbp),%xmm2 paddd %xmm3,%xmm11 paddd %xmm10,%xmm2 .byte 15,56,205,254 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 movdqa %xmm7,%xmm3 .byte 102,15,58,15,222,4 .byte 69,15,56,203,254 .byte 69,15,56,205,218 pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm4 movdqa %xmm11,%xmm3 .byte 102,65,15,58,15,218,4 .byte 15,56,204,238 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 176-128(%rbp),%xmm1 paddd %xmm7,%xmm1 .byte 69,15,56,203,247 .byte 69,15,56,204,202 movdqa %xmm1,%xmm0 movdqa 176-128(%rbp),%xmm2 paddd %xmm3,%xmm8 paddd %xmm11,%xmm2 .byte 15,56,205,231 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 movdqa %xmm4,%xmm3 .byte 102,15,58,15,223,4 .byte 69,15,56,203,254 .byte 69,15,56,205,195 pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm5 movdqa %xmm8,%xmm3 .byte 102,65,15,58,15,219,4 .byte 15,56,204,247 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 192-128(%rbp),%xmm1 paddd %xmm4,%xmm1 .byte 69,15,56,203,247 .byte 69,15,56,204,211 movdqa %xmm1,%xmm0 movdqa 192-128(%rbp),%xmm2 paddd %xmm3,%xmm9 paddd %xmm8,%xmm2 .byte 15,56,205,236 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 movdqa %xmm5,%xmm3 .byte 102,15,58,15,220,4 .byte 69,15,56,203,254 .byte 69,15,56,205,200 pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm6 movdqa %xmm9,%xmm3 .byte 102,65,15,58,15,216,4 .byte 15,56,204,252 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 208-128(%rbp),%xmm1 paddd %xmm5,%xmm1 .byte 69,15,56,203,247 .byte 69,15,56,204,216 movdqa %xmm1,%xmm0 movdqa 208-128(%rbp),%xmm2 paddd %xmm3,%xmm10 paddd %xmm9,%xmm2 .byte 15,56,205,245 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 movdqa %xmm6,%xmm3 .byte 102,15,58,15,221,4 .byte 69,15,56,203,254 .byte 69,15,56,205,209 pshufd $0x0e,%xmm1,%xmm0 paddd %xmm3,%xmm7 movdqa %xmm10,%xmm3 .byte 102,65,15,58,15,217,4 nop .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 224-128(%rbp),%xmm1 paddd %xmm6,%xmm1 .byte 69,15,56,203,247 movdqa %xmm1,%xmm0 movdqa 224-128(%rbp),%xmm2 paddd %xmm3,%xmm11 paddd %xmm10,%xmm2 .byte 15,56,205,254 nop .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 movl $1,%ecx pxor %xmm6,%xmm6 .byte 69,15,56,203,254 .byte 69,15,56,205,218 pshufd $0x0e,%xmm1,%xmm0 movdqa 240-128(%rbp),%xmm1 paddd %xmm7,%xmm1 movq (%rbx),%xmm7 nop .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 movdqa 240-128(%rbp),%xmm2 paddd %xmm11,%xmm2 .byte 69,15,56,203,247 movdqa %xmm1,%xmm0 cmpl 0(%rbx),%ecx cmovgeq %rsp,%r8 cmpl 4(%rbx),%ecx cmovgeq %rsp,%r9 pshufd $0x00,%xmm7,%xmm9 .byte 69,15,56,203,236 movdqa %xmm2,%xmm0 pshufd $0x55,%xmm7,%xmm10 movdqa %xmm7,%xmm11 .byte 
69,15,56,203,254 pshufd $0x0e,%xmm1,%xmm0 pcmpgtd %xmm6,%xmm9 pcmpgtd %xmm6,%xmm10 .byte 69,15,56,203,229 pshufd $0x0e,%xmm2,%xmm0 pcmpgtd %xmm6,%xmm11 movdqa K256_shaext-16(%rip),%xmm3 .byte 69,15,56,203,247 pand %xmm9,%xmm13 pand %xmm10,%xmm15 pand %xmm9,%xmm12 pand %xmm10,%xmm14 paddd %xmm7,%xmm11 paddd 80(%rsp),%xmm13 paddd 112(%rsp),%xmm15 paddd 64(%rsp),%xmm12 paddd 96(%rsp),%xmm14 movq %xmm11,(%rbx) decl %edx jnz .Loop_shaext movl 280(%rsp),%edx pshufd $27,%xmm12,%xmm12 pshufd $27,%xmm13,%xmm13 pshufd $27,%xmm14,%xmm14 pshufd $27,%xmm15,%xmm15 movdqa %xmm12,%xmm5 movdqa %xmm13,%xmm6 punpckldq %xmm14,%xmm12 punpckhdq %xmm14,%xmm5 punpckldq %xmm15,%xmm13 punpckhdq %xmm15,%xmm6 movq %xmm12,0-128(%rdi) psrldq $8,%xmm12 movq %xmm5,128-128(%rdi) psrldq $8,%xmm5 movq %xmm12,32-128(%rdi) movq %xmm5,160-128(%rdi) movq %xmm13,64-128(%rdi) psrldq $8,%xmm13 movq %xmm6,192-128(%rdi) psrldq $8,%xmm6 movq %xmm13,96-128(%rdi) movq %xmm6,224-128(%rdi) leaq 8(%rdi),%rdi leaq 32(%rsi),%rsi decl %edx jnz .Loop_grande_shaext .Ldone_shaext: movq -16(%rax),%rbp .cfi_restore %rbp movq -8(%rax),%rbx .cfi_restore %rbx leaq (%rax),%rsp .cfi_def_cfa_register %rsp .Lepilogue_shaext: .byte 0xf3,0xc3 .cfi_endproc .size sha256_multi_block_shaext,.-sha256_multi_block_shaext +.type sha256_multi_block_avx,@function +.align 32 +sha256_multi_block_avx: +.cfi_startproc +_avx_shortcut: + shrq $32,%rcx + cmpl $2,%edx + jb .Lavx + testl $32,%ecx + jnz _avx2_shortcut + jmp .Lavx +.align 32 +.Lavx: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + subq $288,%rsp + andq $-256,%rsp + movq %rax,272(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x90,0x02,0x06,0x23,0x08 +.Lbody_avx: + leaq K256+128(%rip),%rbp + leaq 256(%rsp),%rbx + leaq 128(%rdi),%rdi + +.Loop_grande_avx: + movl %edx,280(%rsp) + xorl %edx,%edx + movq 0(%rsi),%r8 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rbp,%r8 + movq 16(%rsi),%r9 + movl 24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rbp,%r9 + movq 32(%rsi),%r10 + movl 40(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,8(%rbx) + cmovleq %rbp,%r10 + movq 48(%rsi),%r11 + movl 56(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,12(%rbx) + cmovleq %rbp,%r11 + testl %edx,%edx + jz .Ldone_avx + + vmovdqu 0-128(%rdi),%xmm8 + leaq 128(%rsp),%rax + vmovdqu 32-128(%rdi),%xmm9 + vmovdqu 64-128(%rdi),%xmm10 + vmovdqu 96-128(%rdi),%xmm11 + vmovdqu 128-128(%rdi),%xmm12 + vmovdqu 160-128(%rdi),%xmm13 + vmovdqu 192-128(%rdi),%xmm14 + vmovdqu 224-128(%rdi),%xmm15 + vmovdqu .Lpbswap(%rip),%xmm6 + jmp .Loop_avx + +.align 32 +.Loop_avx: + vpxor %xmm9,%xmm10,%xmm4 + vmovd 0(%r8),%xmm5 + vmovd 0(%r9),%xmm0 + vpinsrd $1,0(%r10),%xmm5,%xmm5 + vpinsrd $1,0(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm12,%xmm7 + vpslld $26,%xmm12,%xmm2 + vmovdqu %xmm5,0-128(%rax) + vpaddd %xmm15,%xmm5,%xmm5 + + vpsrld $11,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm12,%xmm2 + vpaddd -128(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm12,%xmm2 + vpandn %xmm14,%xmm12,%xmm0 + vpand %xmm13,%xmm12,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm8,%xmm15 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm8,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm8,%xmm9,%xmm3 + + vpxor %xmm1,%xmm15,%xmm15 + vpaddd 
%xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm8,%xmm1 + + vpslld $19,%xmm8,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm15,%xmm7 + + vpsrld $22,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm8,%xmm2 + vpxor %xmm4,%xmm9,%xmm15 + vpaddd %xmm5,%xmm11,%xmm11 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm15,%xmm15 + vpaddd %xmm7,%xmm15,%xmm15 + vmovd 4(%r8),%xmm5 + vmovd 4(%r9),%xmm0 + vpinsrd $1,4(%r10),%xmm5,%xmm5 + vpinsrd $1,4(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm11,%xmm7 + vpslld $26,%xmm11,%xmm2 + vmovdqu %xmm5,16-128(%rax) + vpaddd %xmm14,%xmm5,%xmm5 + + vpsrld $11,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm11,%xmm2 + vpaddd -96(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm11,%xmm2 + vpandn %xmm13,%xmm11,%xmm0 + vpand %xmm12,%xmm11,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm15,%xmm14 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm15,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm15,%xmm8,%xmm4 + + vpxor %xmm1,%xmm14,%xmm14 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm15,%xmm1 + + vpslld $19,%xmm15,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm14,%xmm7 + + vpsrld $22,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm15,%xmm2 + vpxor %xmm3,%xmm8,%xmm14 + vpaddd %xmm5,%xmm10,%xmm10 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm14,%xmm14 + vpaddd %xmm7,%xmm14,%xmm14 + vmovd 8(%r8),%xmm5 + vmovd 8(%r9),%xmm0 + vpinsrd $1,8(%r10),%xmm5,%xmm5 + vpinsrd $1,8(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm10,%xmm7 + vpslld $26,%xmm10,%xmm2 + vmovdqu %xmm5,32-128(%rax) + vpaddd %xmm13,%xmm5,%xmm5 + + vpsrld $11,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm10,%xmm2 + vpaddd -64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm10,%xmm2 + vpandn %xmm12,%xmm10,%xmm0 + vpand %xmm11,%xmm10,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm14,%xmm13 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm14,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm14,%xmm15,%xmm3 + + vpxor %xmm1,%xmm13,%xmm13 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm14,%xmm1 + + vpslld $19,%xmm14,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm13,%xmm7 + + vpsrld $22,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm14,%xmm2 + vpxor %xmm4,%xmm15,%xmm13 + vpaddd %xmm5,%xmm9,%xmm9 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm13,%xmm13 + vpaddd %xmm7,%xmm13,%xmm13 + vmovd 12(%r8),%xmm5 + vmovd 12(%r9),%xmm0 + vpinsrd $1,12(%r10),%xmm5,%xmm5 + vpinsrd $1,12(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm9,%xmm7 + vpslld $26,%xmm9,%xmm2 + vmovdqu %xmm5,48-128(%rax) + vpaddd %xmm12,%xmm5,%xmm5 + + vpsrld $11,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm9,%xmm2 + vpaddd -32(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm9,%xmm2 + vpandn %xmm11,%xmm9,%xmm0 + vpand %xmm10,%xmm9,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm13,%xmm12 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm13,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm13,%xmm14,%xmm4 + + vpxor %xmm1,%xmm12,%xmm12 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm13,%xmm1 + + vpslld $19,%xmm13,%xmm2 + 
vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm12,%xmm7 + + vpsrld $22,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm13,%xmm2 + vpxor %xmm3,%xmm14,%xmm12 + vpaddd %xmm5,%xmm8,%xmm8 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm12,%xmm12 + vpaddd %xmm7,%xmm12,%xmm12 + vmovd 16(%r8),%xmm5 + vmovd 16(%r9),%xmm0 + vpinsrd $1,16(%r10),%xmm5,%xmm5 + vpinsrd $1,16(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm8,%xmm7 + vpslld $26,%xmm8,%xmm2 + vmovdqu %xmm5,64-128(%rax) + vpaddd %xmm11,%xmm5,%xmm5 + + vpsrld $11,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm8,%xmm2 + vpaddd 0(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm8,%xmm2 + vpandn %xmm10,%xmm8,%xmm0 + vpand %xmm9,%xmm8,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm12,%xmm11 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm12,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm12,%xmm13,%xmm3 + + vpxor %xmm1,%xmm11,%xmm11 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm12,%xmm1 + + vpslld $19,%xmm12,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm11,%xmm7 + + vpsrld $22,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm12,%xmm2 + vpxor %xmm4,%xmm13,%xmm11 + vpaddd %xmm5,%xmm15,%xmm15 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm11,%xmm11 + vpaddd %xmm7,%xmm11,%xmm11 + vmovd 20(%r8),%xmm5 + vmovd 20(%r9),%xmm0 + vpinsrd $1,20(%r10),%xmm5,%xmm5 + vpinsrd $1,20(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm15,%xmm7 + vpslld $26,%xmm15,%xmm2 + vmovdqu %xmm5,80-128(%rax) + vpaddd %xmm10,%xmm5,%xmm5 + + vpsrld $11,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm15,%xmm2 + vpaddd 32(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm15,%xmm2 + vpandn %xmm9,%xmm15,%xmm0 + vpand %xmm8,%xmm15,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm11,%xmm10 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm11,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm11,%xmm12,%xmm4 + + vpxor %xmm1,%xmm10,%xmm10 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm11,%xmm1 + + vpslld $19,%xmm11,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm10,%xmm7 + + vpsrld $22,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm11,%xmm2 + vpxor %xmm3,%xmm12,%xmm10 + vpaddd %xmm5,%xmm14,%xmm14 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm10,%xmm10 + vpaddd %xmm7,%xmm10,%xmm10 + vmovd 24(%r8),%xmm5 + vmovd 24(%r9),%xmm0 + vpinsrd $1,24(%r10),%xmm5,%xmm5 + vpinsrd $1,24(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm14,%xmm7 + vpslld $26,%xmm14,%xmm2 + vmovdqu %xmm5,96-128(%rax) + vpaddd %xmm9,%xmm5,%xmm5 + + vpsrld $11,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm14,%xmm2 + vpaddd 64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm14,%xmm2 + vpandn %xmm8,%xmm14,%xmm0 + vpand %xmm15,%xmm14,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm10,%xmm9 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm10,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm10,%xmm11,%xmm3 + + vpxor %xmm1,%xmm9,%xmm9 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm10,%xmm1 + + vpslld $19,%xmm10,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm9,%xmm7 
+ + vpsrld $22,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm10,%xmm2 + vpxor %xmm4,%xmm11,%xmm9 + vpaddd %xmm5,%xmm13,%xmm13 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm9,%xmm9 + vpaddd %xmm7,%xmm9,%xmm9 + vmovd 28(%r8),%xmm5 + vmovd 28(%r9),%xmm0 + vpinsrd $1,28(%r10),%xmm5,%xmm5 + vpinsrd $1,28(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm13,%xmm7 + vpslld $26,%xmm13,%xmm2 + vmovdqu %xmm5,112-128(%rax) + vpaddd %xmm8,%xmm5,%xmm5 + + vpsrld $11,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm13,%xmm2 + vpaddd 96(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm13,%xmm2 + vpandn %xmm15,%xmm13,%xmm0 + vpand %xmm14,%xmm13,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm9,%xmm8 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm9,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm9,%xmm10,%xmm4 + + vpxor %xmm1,%xmm8,%xmm8 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm9,%xmm1 + + vpslld $19,%xmm9,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm8,%xmm7 + + vpsrld $22,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm9,%xmm2 + vpxor %xmm3,%xmm10,%xmm8 + vpaddd %xmm5,%xmm12,%xmm12 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm8,%xmm8 + vpaddd %xmm7,%xmm8,%xmm8 + addq $256,%rbp + vmovd 32(%r8),%xmm5 + vmovd 32(%r9),%xmm0 + vpinsrd $1,32(%r10),%xmm5,%xmm5 + vpinsrd $1,32(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm12,%xmm7 + vpslld $26,%xmm12,%xmm2 + vmovdqu %xmm5,128-128(%rax) + vpaddd %xmm15,%xmm5,%xmm5 + + vpsrld $11,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm12,%xmm2 + vpaddd -128(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm12,%xmm2 + vpandn %xmm14,%xmm12,%xmm0 + vpand %xmm13,%xmm12,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm8,%xmm15 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm8,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm8,%xmm9,%xmm3 + + vpxor %xmm1,%xmm15,%xmm15 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm8,%xmm1 + + vpslld $19,%xmm8,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm15,%xmm7 + + vpsrld $22,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm8,%xmm2 + vpxor %xmm4,%xmm9,%xmm15 + vpaddd %xmm5,%xmm11,%xmm11 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm15,%xmm15 + vpaddd %xmm7,%xmm15,%xmm15 + vmovd 36(%r8),%xmm5 + vmovd 36(%r9),%xmm0 + vpinsrd $1,36(%r10),%xmm5,%xmm5 + vpinsrd $1,36(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm11,%xmm7 + vpslld $26,%xmm11,%xmm2 + vmovdqu %xmm5,144-128(%rax) + vpaddd %xmm14,%xmm5,%xmm5 + + vpsrld $11,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm11,%xmm2 + vpaddd -96(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm11,%xmm2 + vpandn %xmm13,%xmm11,%xmm0 + vpand %xmm12,%xmm11,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm15,%xmm14 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm15,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm15,%xmm8,%xmm4 + + vpxor %xmm1,%xmm14,%xmm14 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm15,%xmm1 + + vpslld $19,%xmm15,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm14,%xmm7 + + vpsrld $22,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld 
$10,%xmm15,%xmm2 + vpxor %xmm3,%xmm8,%xmm14 + vpaddd %xmm5,%xmm10,%xmm10 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm14,%xmm14 + vpaddd %xmm7,%xmm14,%xmm14 + vmovd 40(%r8),%xmm5 + vmovd 40(%r9),%xmm0 + vpinsrd $1,40(%r10),%xmm5,%xmm5 + vpinsrd $1,40(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm10,%xmm7 + vpslld $26,%xmm10,%xmm2 + vmovdqu %xmm5,160-128(%rax) + vpaddd %xmm13,%xmm5,%xmm5 + + vpsrld $11,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm10,%xmm2 + vpaddd -64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm10,%xmm2 + vpandn %xmm12,%xmm10,%xmm0 + vpand %xmm11,%xmm10,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm14,%xmm13 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm14,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm14,%xmm15,%xmm3 + + vpxor %xmm1,%xmm13,%xmm13 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm14,%xmm1 + + vpslld $19,%xmm14,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm13,%xmm7 + + vpsrld $22,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm14,%xmm2 + vpxor %xmm4,%xmm15,%xmm13 + vpaddd %xmm5,%xmm9,%xmm9 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm13,%xmm13 + vpaddd %xmm7,%xmm13,%xmm13 + vmovd 44(%r8),%xmm5 + vmovd 44(%r9),%xmm0 + vpinsrd $1,44(%r10),%xmm5,%xmm5 + vpinsrd $1,44(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm9,%xmm7 + vpslld $26,%xmm9,%xmm2 + vmovdqu %xmm5,176-128(%rax) + vpaddd %xmm12,%xmm5,%xmm5 + + vpsrld $11,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm9,%xmm2 + vpaddd -32(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm9,%xmm2 + vpandn %xmm11,%xmm9,%xmm0 + vpand %xmm10,%xmm9,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm13,%xmm12 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm13,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm13,%xmm14,%xmm4 + + vpxor %xmm1,%xmm12,%xmm12 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm13,%xmm1 + + vpslld $19,%xmm13,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm12,%xmm7 + + vpsrld $22,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm13,%xmm2 + vpxor %xmm3,%xmm14,%xmm12 + vpaddd %xmm5,%xmm8,%xmm8 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm12,%xmm12 + vpaddd %xmm7,%xmm12,%xmm12 + vmovd 48(%r8),%xmm5 + vmovd 48(%r9),%xmm0 + vpinsrd $1,48(%r10),%xmm5,%xmm5 + vpinsrd $1,48(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm8,%xmm7 + vpslld $26,%xmm8,%xmm2 + vmovdqu %xmm5,192-128(%rax) + vpaddd %xmm11,%xmm5,%xmm5 + + vpsrld $11,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm8,%xmm2 + vpaddd 0(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm8,%xmm2 + vpandn %xmm10,%xmm8,%xmm0 + vpand %xmm9,%xmm8,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm12,%xmm11 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm12,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm12,%xmm13,%xmm3 + + vpxor %xmm1,%xmm11,%xmm11 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm12,%xmm1 + + vpslld $19,%xmm12,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm11,%xmm7 + + vpsrld $22,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm12,%xmm2 + vpxor %xmm4,%xmm13,%xmm11 + vpaddd %xmm5,%xmm15,%xmm15 + + 
vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm11,%xmm11 + vpaddd %xmm7,%xmm11,%xmm11 + vmovd 52(%r8),%xmm5 + vmovd 52(%r9),%xmm0 + vpinsrd $1,52(%r10),%xmm5,%xmm5 + vpinsrd $1,52(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm15,%xmm7 + vpslld $26,%xmm15,%xmm2 + vmovdqu %xmm5,208-128(%rax) + vpaddd %xmm10,%xmm5,%xmm5 + + vpsrld $11,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm15,%xmm2 + vpaddd 32(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm15,%xmm2 + vpandn %xmm9,%xmm15,%xmm0 + vpand %xmm8,%xmm15,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm11,%xmm10 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm11,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm11,%xmm12,%xmm4 + + vpxor %xmm1,%xmm10,%xmm10 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm11,%xmm1 + + vpslld $19,%xmm11,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm10,%xmm7 + + vpsrld $22,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm11,%xmm2 + vpxor %xmm3,%xmm12,%xmm10 + vpaddd %xmm5,%xmm14,%xmm14 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm10,%xmm10 + vpaddd %xmm7,%xmm10,%xmm10 + vmovd 56(%r8),%xmm5 + vmovd 56(%r9),%xmm0 + vpinsrd $1,56(%r10),%xmm5,%xmm5 + vpinsrd $1,56(%r11),%xmm0,%xmm0 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm14,%xmm7 + vpslld $26,%xmm14,%xmm2 + vmovdqu %xmm5,224-128(%rax) + vpaddd %xmm9,%xmm5,%xmm5 + + vpsrld $11,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm14,%xmm2 + vpaddd 64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm14,%xmm2 + vpandn %xmm8,%xmm14,%xmm0 + vpand %xmm15,%xmm14,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm10,%xmm9 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm10,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm10,%xmm11,%xmm3 + + vpxor %xmm1,%xmm9,%xmm9 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm10,%xmm1 + + vpslld $19,%xmm10,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm9,%xmm7 + + vpsrld $22,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm10,%xmm2 + vpxor %xmm4,%xmm11,%xmm9 + vpaddd %xmm5,%xmm13,%xmm13 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm9,%xmm9 + vpaddd %xmm7,%xmm9,%xmm9 + vmovd 60(%r8),%xmm5 + leaq 64(%r8),%r8 + vmovd 60(%r9),%xmm0 + leaq 64(%r9),%r9 + vpinsrd $1,60(%r10),%xmm5,%xmm5 + leaq 64(%r10),%r10 + vpinsrd $1,60(%r11),%xmm0,%xmm0 + leaq 64(%r11),%r11 + vpunpckldq %xmm0,%xmm5,%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + vpsrld $6,%xmm13,%xmm7 + vpslld $26,%xmm13,%xmm2 + vmovdqu %xmm5,240-128(%rax) + vpaddd %xmm8,%xmm5,%xmm5 + + vpsrld $11,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm13,%xmm2 + vpaddd 96(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + prefetcht0 63(%r8) + vpslld $7,%xmm13,%xmm2 + vpandn %xmm15,%xmm13,%xmm0 + vpand %xmm14,%xmm13,%xmm4 + prefetcht0 63(%r9) + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm9,%xmm8 + vpxor %xmm2,%xmm7,%xmm7 + prefetcht0 63(%r10) + vpslld $30,%xmm9,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm9,%xmm10,%xmm4 + prefetcht0 63(%r11) + vpxor %xmm1,%xmm8,%xmm8 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm9,%xmm1 + + vpslld $19,%xmm9,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm8,%xmm7 + + vpsrld $22,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld 
$10,%xmm9,%xmm2 + vpxor %xmm3,%xmm10,%xmm8 + vpaddd %xmm5,%xmm12,%xmm12 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm8,%xmm8 + vpaddd %xmm7,%xmm8,%xmm8 + addq $256,%rbp + vmovdqu 0-128(%rax),%xmm5 + movl $3,%ecx + jmp .Loop_16_xx_avx +.align 32 +.Loop_16_xx_avx: + vmovdqu 16-128(%rax),%xmm6 + vpaddd 144-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 224-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm12,%xmm7 + vpslld $26,%xmm12,%xmm2 + vmovdqu %xmm5,0-128(%rax) + vpaddd %xmm15,%xmm5,%xmm5 + + vpsrld $11,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm12,%xmm2 + vpaddd -128(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm12,%xmm2 + vpandn %xmm14,%xmm12,%xmm0 + vpand %xmm13,%xmm12,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm8,%xmm15 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm8,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm8,%xmm9,%xmm3 + + vpxor %xmm1,%xmm15,%xmm15 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm8,%xmm1 + + vpslld $19,%xmm8,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm15,%xmm7 + + vpsrld $22,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm8,%xmm2 + vpxor %xmm4,%xmm9,%xmm15 + vpaddd %xmm5,%xmm11,%xmm11 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm15,%xmm15 + vpaddd %xmm7,%xmm15,%xmm15 + vmovdqu 32-128(%rax),%xmm5 + vpaddd 160-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 240-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm11,%xmm7 + vpslld $26,%xmm11,%xmm2 + vmovdqu %xmm6,16-128(%rax) + vpaddd %xmm14,%xmm6,%xmm6 + + vpsrld $11,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm11,%xmm2 + vpaddd -96(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm11,%xmm2 + vpandn %xmm13,%xmm11,%xmm0 + vpand %xmm12,%xmm11,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm15,%xmm14 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm15,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm15,%xmm8,%xmm4 + + vpxor %xmm1,%xmm14,%xmm14 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm15,%xmm1 + + vpslld $19,%xmm15,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm14,%xmm7 + + vpsrld $22,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm15,%xmm2 + vpxor %xmm3,%xmm8,%xmm14 + vpaddd %xmm6,%xmm10,%xmm10 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm14,%xmm14 + vpaddd %xmm7,%xmm14,%xmm14 + vmovdqu 48-128(%rax),%xmm6 + vpaddd 176-128(%rax),%xmm5,%xmm5 + 
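+# The vpsrld/vpslld pairs below implement the SHA-256 message expansion in SIMD:
+# sigma0(W[t-15]) = ror7 ^ ror18 ^ shr3 and sigma1(W[t-2]) = ror17 ^ ror19 ^ shr10,
+# computed for four independent message streams packed one per 32-bit lane of
+# each %xmm register, followed by one round of the compression function per lane.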
+ vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 0-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm10,%xmm7 + vpslld $26,%xmm10,%xmm2 + vmovdqu %xmm5,32-128(%rax) + vpaddd %xmm13,%xmm5,%xmm5 + + vpsrld $11,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm10,%xmm2 + vpaddd -64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm10,%xmm2 + vpandn %xmm12,%xmm10,%xmm0 + vpand %xmm11,%xmm10,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm14,%xmm13 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm14,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm14,%xmm15,%xmm3 + + vpxor %xmm1,%xmm13,%xmm13 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm14,%xmm1 + + vpslld $19,%xmm14,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm13,%xmm7 + + vpsrld $22,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm14,%xmm2 + vpxor %xmm4,%xmm15,%xmm13 + vpaddd %xmm5,%xmm9,%xmm9 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm13,%xmm13 + vpaddd %xmm7,%xmm13,%xmm13 + vmovdqu 64-128(%rax),%xmm5 + vpaddd 192-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 16-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm9,%xmm7 + vpslld $26,%xmm9,%xmm2 + vmovdqu %xmm6,48-128(%rax) + vpaddd %xmm12,%xmm6,%xmm6 + + vpsrld $11,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm9,%xmm2 + vpaddd -32(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm9,%xmm2 + vpandn %xmm11,%xmm9,%xmm0 + vpand %xmm10,%xmm9,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm13,%xmm12 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm13,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm13,%xmm14,%xmm4 + + vpxor %xmm1,%xmm12,%xmm12 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm13,%xmm1 + + vpslld $19,%xmm13,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm12,%xmm7 + + vpsrld $22,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm13,%xmm2 + vpxor %xmm3,%xmm14,%xmm12 + vpaddd %xmm6,%xmm8,%xmm8 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm12,%xmm12 + vpaddd %xmm7,%xmm12,%xmm12 + vmovdqu 80-128(%rax),%xmm6 + vpaddd 208-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 32-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + 
vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm8,%xmm7 + vpslld $26,%xmm8,%xmm2 + vmovdqu %xmm5,64-128(%rax) + vpaddd %xmm11,%xmm5,%xmm5 + + vpsrld $11,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm8,%xmm2 + vpaddd 0(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm8,%xmm2 + vpandn %xmm10,%xmm8,%xmm0 + vpand %xmm9,%xmm8,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm12,%xmm11 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm12,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm12,%xmm13,%xmm3 + + vpxor %xmm1,%xmm11,%xmm11 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm12,%xmm1 + + vpslld $19,%xmm12,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm11,%xmm7 + + vpsrld $22,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm12,%xmm2 + vpxor %xmm4,%xmm13,%xmm11 + vpaddd %xmm5,%xmm15,%xmm15 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm11,%xmm11 + vpaddd %xmm7,%xmm11,%xmm11 + vmovdqu 96-128(%rax),%xmm5 + vpaddd 224-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 48-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm15,%xmm7 + vpslld $26,%xmm15,%xmm2 + vmovdqu %xmm6,80-128(%rax) + vpaddd %xmm10,%xmm6,%xmm6 + + vpsrld $11,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm15,%xmm2 + vpaddd 32(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm15,%xmm2 + vpandn %xmm9,%xmm15,%xmm0 + vpand %xmm8,%xmm15,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm11,%xmm10 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm11,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm11,%xmm12,%xmm4 + + vpxor %xmm1,%xmm10,%xmm10 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm11,%xmm1 + + vpslld $19,%xmm11,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm10,%xmm7 + + vpsrld $22,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm11,%xmm2 + vpxor %xmm3,%xmm12,%xmm10 + vpaddd %xmm6,%xmm14,%xmm14 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm10,%xmm10 + vpaddd %xmm7,%xmm10,%xmm10 + vmovdqu 112-128(%rax),%xmm6 + vpaddd 240-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 64-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm14,%xmm7 + vpslld $26,%xmm14,%xmm2 + vmovdqu %xmm5,96-128(%rax) + vpaddd %xmm9,%xmm5,%xmm5 + + vpsrld $11,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld 
$21,%xmm14,%xmm2 + vpaddd 64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm14,%xmm2 + vpandn %xmm8,%xmm14,%xmm0 + vpand %xmm15,%xmm14,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm10,%xmm9 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm10,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm10,%xmm11,%xmm3 + + vpxor %xmm1,%xmm9,%xmm9 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm10,%xmm1 + + vpslld $19,%xmm10,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm9,%xmm7 + + vpsrld $22,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm10,%xmm2 + vpxor %xmm4,%xmm11,%xmm9 + vpaddd %xmm5,%xmm13,%xmm13 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm9,%xmm9 + vpaddd %xmm7,%xmm9,%xmm9 + vmovdqu 128-128(%rax),%xmm5 + vpaddd 0-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 80-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm13,%xmm7 + vpslld $26,%xmm13,%xmm2 + vmovdqu %xmm6,112-128(%rax) + vpaddd %xmm8,%xmm6,%xmm6 + + vpsrld $11,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm13,%xmm2 + vpaddd 96(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm13,%xmm2 + vpandn %xmm15,%xmm13,%xmm0 + vpand %xmm14,%xmm13,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm9,%xmm8 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm9,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm9,%xmm10,%xmm4 + + vpxor %xmm1,%xmm8,%xmm8 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm9,%xmm1 + + vpslld $19,%xmm9,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm8,%xmm7 + + vpsrld $22,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm9,%xmm2 + vpxor %xmm3,%xmm10,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm8,%xmm8 + vpaddd %xmm7,%xmm8,%xmm8 + addq $256,%rbp + vmovdqu 144-128(%rax),%xmm6 + vpaddd 16-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 96-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm12,%xmm7 + vpslld $26,%xmm12,%xmm2 + vmovdqu %xmm5,128-128(%rax) + vpaddd %xmm15,%xmm5,%xmm5 + + vpsrld $11,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm12,%xmm2 + vpaddd -128(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm12,%xmm2 + vpandn %xmm14,%xmm12,%xmm0 + vpand %xmm13,%xmm12,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm8,%xmm15 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm8,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor 
%xmm8,%xmm9,%xmm3 + + vpxor %xmm1,%xmm15,%xmm15 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm8,%xmm1 + + vpslld $19,%xmm8,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm15,%xmm7 + + vpsrld $22,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm8,%xmm2 + vpxor %xmm4,%xmm9,%xmm15 + vpaddd %xmm5,%xmm11,%xmm11 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm15,%xmm15 + vpaddd %xmm7,%xmm15,%xmm15 + vmovdqu 160-128(%rax),%xmm5 + vpaddd 32-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 112-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm11,%xmm7 + vpslld $26,%xmm11,%xmm2 + vmovdqu %xmm6,144-128(%rax) + vpaddd %xmm14,%xmm6,%xmm6 + + vpsrld $11,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm11,%xmm2 + vpaddd -96(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm11,%xmm2 + vpandn %xmm13,%xmm11,%xmm0 + vpand %xmm12,%xmm11,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm15,%xmm14 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm15,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm15,%xmm8,%xmm4 + + vpxor %xmm1,%xmm14,%xmm14 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm15,%xmm1 + + vpslld $19,%xmm15,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm14,%xmm7 + + vpsrld $22,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm15,%xmm2 + vpxor %xmm3,%xmm8,%xmm14 + vpaddd %xmm6,%xmm10,%xmm10 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm14,%xmm14 + vpaddd %xmm7,%xmm14,%xmm14 + vmovdqu 176-128(%rax),%xmm6 + vpaddd 48-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 128-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm10,%xmm7 + vpslld $26,%xmm10,%xmm2 + vmovdqu %xmm5,160-128(%rax) + vpaddd %xmm13,%xmm5,%xmm5 + + vpsrld $11,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm10,%xmm2 + vpaddd -64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm10,%xmm2 + vpandn %xmm12,%xmm10,%xmm0 + vpand %xmm11,%xmm10,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm14,%xmm13 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm14,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm14,%xmm15,%xmm3 + + vpxor %xmm1,%xmm13,%xmm13 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm14,%xmm1 + + vpslld $19,%xmm14,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm13,%xmm7 + + vpsrld $22,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm14,%xmm2 + vpxor %xmm4,%xmm15,%xmm13 + vpaddd %xmm5,%xmm9,%xmm9 + + 
vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm13,%xmm13 + vpaddd %xmm7,%xmm13,%xmm13 + vmovdqu 192-128(%rax),%xmm5 + vpaddd 64-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 144-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm9,%xmm7 + vpslld $26,%xmm9,%xmm2 + vmovdqu %xmm6,176-128(%rax) + vpaddd %xmm12,%xmm6,%xmm6 + + vpsrld $11,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm9,%xmm2 + vpaddd -32(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm9,%xmm2 + vpandn %xmm11,%xmm9,%xmm0 + vpand %xmm10,%xmm9,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm13,%xmm12 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm13,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm13,%xmm14,%xmm4 + + vpxor %xmm1,%xmm12,%xmm12 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm13,%xmm1 + + vpslld $19,%xmm13,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm12,%xmm7 + + vpsrld $22,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm13,%xmm2 + vpxor %xmm3,%xmm14,%xmm12 + vpaddd %xmm6,%xmm8,%xmm8 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm12,%xmm12 + vpaddd %xmm7,%xmm12,%xmm12 + vmovdqu 208-128(%rax),%xmm6 + vpaddd 80-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 160-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm8,%xmm7 + vpslld $26,%xmm8,%xmm2 + vmovdqu %xmm5,192-128(%rax) + vpaddd %xmm11,%xmm5,%xmm5 + + vpsrld $11,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm8,%xmm2 + vpaddd 0(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm8,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm8,%xmm2 + vpandn %xmm10,%xmm8,%xmm0 + vpand %xmm9,%xmm8,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm12,%xmm11 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm12,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm12,%xmm13,%xmm3 + + vpxor %xmm1,%xmm11,%xmm11 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm12,%xmm1 + + vpslld $19,%xmm12,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm11,%xmm7 + + vpsrld $22,%xmm12,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm12,%xmm2 + vpxor %xmm4,%xmm13,%xmm11 + vpaddd %xmm5,%xmm15,%xmm15 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm11,%xmm11 + vpaddd %xmm7,%xmm11,%xmm11 + vmovdqu 224-128(%rax),%xmm5 + vpaddd 96-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 
176-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + vpsrld $6,%xmm15,%xmm7 + vpslld $26,%xmm15,%xmm2 + vmovdqu %xmm6,208-128(%rax) + vpaddd %xmm10,%xmm6,%xmm6 + + vpsrld $11,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm15,%xmm2 + vpaddd 32(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm15,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm15,%xmm2 + vpandn %xmm9,%xmm15,%xmm0 + vpand %xmm8,%xmm15,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm11,%xmm10 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm11,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm11,%xmm12,%xmm4 + + vpxor %xmm1,%xmm10,%xmm10 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm11,%xmm1 + + vpslld $19,%xmm11,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm10,%xmm7 + + vpsrld $22,%xmm11,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm11,%xmm2 + vpxor %xmm3,%xmm12,%xmm10 + vpaddd %xmm6,%xmm14,%xmm14 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm10,%xmm10 + vpaddd %xmm7,%xmm10,%xmm10 + vmovdqu 240-128(%rax),%xmm6 + vpaddd 112-128(%rax),%xmm5,%xmm5 + + vpsrld $3,%xmm6,%xmm7 + vpsrld $7,%xmm6,%xmm1 + vpslld $25,%xmm6,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm6,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm6,%xmm2 + vmovdqu 192-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm1,%xmm3,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm5,%xmm5 + vpsrld $6,%xmm14,%xmm7 + vpslld $26,%xmm14,%xmm2 + vmovdqu %xmm5,224-128(%rax) + vpaddd %xmm9,%xmm5,%xmm5 + + vpsrld $11,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm14,%xmm2 + vpaddd 64(%rbp),%xmm5,%xmm5 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm14,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm14,%xmm2 + vpandn %xmm8,%xmm14,%xmm0 + vpand %xmm15,%xmm14,%xmm3 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm10,%xmm9 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm10,%xmm1 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm10,%xmm11,%xmm3 + + vpxor %xmm1,%xmm9,%xmm9 + vpaddd %xmm7,%xmm5,%xmm5 + + vpsrld $13,%xmm10,%xmm1 + + vpslld $19,%xmm10,%xmm2 + vpaddd %xmm0,%xmm5,%xmm5 + vpand %xmm3,%xmm4,%xmm4 + + vpxor %xmm1,%xmm9,%xmm7 + + vpsrld $22,%xmm10,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm10,%xmm2 + vpxor %xmm4,%xmm11,%xmm9 + vpaddd %xmm5,%xmm13,%xmm13 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm5,%xmm9,%xmm9 + vpaddd %xmm7,%xmm9,%xmm9 + vmovdqu 0-128(%rax),%xmm5 + vpaddd 128-128(%rax),%xmm6,%xmm6 + + vpsrld $3,%xmm5,%xmm7 + vpsrld $7,%xmm5,%xmm1 + vpslld $25,%xmm5,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $18,%xmm5,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $14,%xmm5,%xmm2 + vmovdqu 208-128(%rax),%xmm0 + vpsrld $10,%xmm0,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + vpsrld $17,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $15,%xmm0,%xmm2 + vpaddd %xmm7,%xmm6,%xmm6 + vpxor %xmm1,%xmm4,%xmm7 + vpsrld $19,%xmm0,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $13,%xmm0,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + vpaddd %xmm7,%xmm6,%xmm6 + 
vpsrld $6,%xmm13,%xmm7 + vpslld $26,%xmm13,%xmm2 + vmovdqu %xmm6,240-128(%rax) + vpaddd %xmm8,%xmm6,%xmm6 + + vpsrld $11,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpslld $21,%xmm13,%xmm2 + vpaddd 96(%rbp),%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $25,%xmm13,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $7,%xmm13,%xmm2 + vpandn %xmm15,%xmm13,%xmm0 + vpand %xmm14,%xmm13,%xmm4 + + vpxor %xmm1,%xmm7,%xmm7 + + vpsrld $2,%xmm9,%xmm8 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $30,%xmm9,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm9,%xmm10,%xmm4 + + vpxor %xmm1,%xmm8,%xmm8 + vpaddd %xmm7,%xmm6,%xmm6 + + vpsrld $13,%xmm9,%xmm1 + + vpslld $19,%xmm9,%xmm2 + vpaddd %xmm0,%xmm6,%xmm6 + vpand %xmm4,%xmm3,%xmm3 + + vpxor %xmm1,%xmm8,%xmm7 + + vpsrld $22,%xmm9,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + + vpslld $10,%xmm9,%xmm2 + vpxor %xmm3,%xmm10,%xmm8 + vpaddd %xmm6,%xmm12,%xmm12 + + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm2,%xmm7,%xmm7 + + vpaddd %xmm6,%xmm8,%xmm8 + vpaddd %xmm7,%xmm8,%xmm8 + addq $256,%rbp + decl %ecx + jnz .Loop_16_xx_avx + + movl $1,%ecx + leaq K256+128(%rip),%rbp + cmpl 0(%rbx),%ecx + cmovgeq %rbp,%r8 + cmpl 4(%rbx),%ecx + cmovgeq %rbp,%r9 + cmpl 8(%rbx),%ecx + cmovgeq %rbp,%r10 + cmpl 12(%rbx),%ecx + cmovgeq %rbp,%r11 + vmovdqa (%rbx),%xmm7 + vpxor %xmm0,%xmm0,%xmm0 + vmovdqa %xmm7,%xmm6 + vpcmpgtd %xmm0,%xmm6,%xmm6 + vpaddd %xmm6,%xmm7,%xmm7 + + vmovdqu 0-128(%rdi),%xmm0 + vpand %xmm6,%xmm8,%xmm8 + vmovdqu 32-128(%rdi),%xmm1 + vpand %xmm6,%xmm9,%xmm9 + vmovdqu 64-128(%rdi),%xmm2 + vpand %xmm6,%xmm10,%xmm10 + vmovdqu 96-128(%rdi),%xmm5 + vpand %xmm6,%xmm11,%xmm11 + vpaddd %xmm0,%xmm8,%xmm8 + vmovdqu 128-128(%rdi),%xmm0 + vpand %xmm6,%xmm12,%xmm12 + vpaddd %xmm1,%xmm9,%xmm9 + vmovdqu 160-128(%rdi),%xmm1 + vpand %xmm6,%xmm13,%xmm13 + vpaddd %xmm2,%xmm10,%xmm10 + vmovdqu 192-128(%rdi),%xmm2 + vpand %xmm6,%xmm14,%xmm14 + vpaddd %xmm5,%xmm11,%xmm11 + vmovdqu 224-128(%rdi),%xmm5 + vpand %xmm6,%xmm15,%xmm15 + vpaddd %xmm0,%xmm12,%xmm12 + vpaddd %xmm1,%xmm13,%xmm13 + vmovdqu %xmm8,0-128(%rdi) + vpaddd %xmm2,%xmm14,%xmm14 + vmovdqu %xmm9,32-128(%rdi) + vpaddd %xmm5,%xmm15,%xmm15 + vmovdqu %xmm10,64-128(%rdi) + vmovdqu %xmm11,96-128(%rdi) + vmovdqu %xmm12,128-128(%rdi) + vmovdqu %xmm13,160-128(%rdi) + vmovdqu %xmm14,192-128(%rdi) + vmovdqu %xmm15,224-128(%rdi) + + vmovdqu %xmm7,(%rbx) + vmovdqu .Lpbswap(%rip),%xmm6 + decl %edx + jnz .Loop_avx + + movl 280(%rsp),%edx + leaq 16(%rdi),%rdi + leaq 64(%rsi),%rsi + decl %edx + jnz .Loop_grande_avx + +.Ldone_avx: + movq 272(%rsp),%rax +.cfi_def_cfa %rax,8 + vzeroupper + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_multi_block_avx,.-sha256_multi_block_avx +.type sha256_multi_block_avx2,@function +.align 32 +sha256_multi_block_avx2: +.cfi_startproc +_avx2_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $576,%rsp + andq $-256,%rsp + movq %rax,544(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xa0,0x04,0x06,0x23,0x08 +.Lbody_avx2: + leaq K256+128(%rip),%rbp + leaq 128(%rdi),%rdi + +.Loop_grande_avx2: + movl %edx,552(%rsp) + xorl %edx,%edx + leaq 512(%rsp),%rbx + movq 0(%rsi),%r12 + movl 8(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,0(%rbx) + cmovleq %rbp,%r12 + movq 16(%rsi),%r13 + movl 
24(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,4(%rbx) + cmovleq %rbp,%r13 + movq 32(%rsi),%r14 + movl 40(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,8(%rbx) + cmovleq %rbp,%r14 + movq 48(%rsi),%r15 + movl 56(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,12(%rbx) + cmovleq %rbp,%r15 + movq 64(%rsi),%r8 + movl 72(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,16(%rbx) + cmovleq %rbp,%r8 + movq 80(%rsi),%r9 + movl 88(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,20(%rbx) + cmovleq %rbp,%r9 + movq 96(%rsi),%r10 + movl 104(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,24(%rbx) + cmovleq %rbp,%r10 + movq 112(%rsi),%r11 + movl 120(%rsi),%ecx + cmpl %edx,%ecx + cmovgl %ecx,%edx + testl %ecx,%ecx + movl %ecx,28(%rbx) + cmovleq %rbp,%r11 + vmovdqu 0-128(%rdi),%ymm8 + leaq 128(%rsp),%rax + vmovdqu 32-128(%rdi),%ymm9 + leaq 256+128(%rsp),%rbx + vmovdqu 64-128(%rdi),%ymm10 + vmovdqu 96-128(%rdi),%ymm11 + vmovdqu 128-128(%rdi),%ymm12 + vmovdqu 160-128(%rdi),%ymm13 + vmovdqu 192-128(%rdi),%ymm14 + vmovdqu 224-128(%rdi),%ymm15 + vmovdqu .Lpbswap(%rip),%ymm6 + jmp .Loop_avx2 + +.align 32 +.Loop_avx2: + vpxor %ymm9,%ymm10,%ymm4 + vmovd 0(%r12),%xmm5 + vmovd 0(%r8),%xmm0 + vmovd 0(%r13),%xmm1 + vmovd 0(%r9),%xmm2 + vpinsrd $1,0(%r14),%xmm5,%xmm5 + vpinsrd $1,0(%r10),%xmm0,%xmm0 + vpinsrd $1,0(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,0(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm12,%ymm7 + vpslld $26,%ymm12,%ymm2 + vmovdqu %ymm5,0-128(%rax) + vpaddd %ymm15,%ymm5,%ymm5 + + vpsrld $11,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm12,%ymm2 + vpaddd -128(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm12,%ymm2 + vpandn %ymm14,%ymm12,%ymm0 + vpand %ymm13,%ymm12,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm8,%ymm15 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm8,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm8,%ymm9,%ymm3 + + vpxor %ymm1,%ymm15,%ymm15 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm8,%ymm1 + + vpslld $19,%ymm8,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm15,%ymm7 + + vpsrld $22,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm8,%ymm2 + vpxor %ymm4,%ymm9,%ymm15 + vpaddd %ymm5,%ymm11,%ymm11 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm15,%ymm15 + vpaddd %ymm7,%ymm15,%ymm15 + vmovd 4(%r12),%xmm5 + vmovd 4(%r8),%xmm0 + vmovd 4(%r13),%xmm1 + vmovd 4(%r9),%xmm2 + vpinsrd $1,4(%r14),%xmm5,%xmm5 + vpinsrd $1,4(%r10),%xmm0,%xmm0 + vpinsrd $1,4(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,4(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm11,%ymm7 + vpslld $26,%ymm11,%ymm2 + vmovdqu %ymm5,32-128(%rax) + vpaddd %ymm14,%ymm5,%ymm5 + + vpsrld $11,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm11,%ymm2 + vpaddd -96(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm11,%ymm2 + vpandn %ymm13,%ymm11,%ymm0 + vpand %ymm12,%ymm11,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm15,%ymm14 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm15,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm15,%ymm8,%ymm4 + + vpxor 
%ymm1,%ymm14,%ymm14 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm15,%ymm1 + + vpslld $19,%ymm15,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm14,%ymm7 + + vpsrld $22,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm15,%ymm2 + vpxor %ymm3,%ymm8,%ymm14 + vpaddd %ymm5,%ymm10,%ymm10 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm14,%ymm14 + vpaddd %ymm7,%ymm14,%ymm14 + vmovd 8(%r12),%xmm5 + vmovd 8(%r8),%xmm0 + vmovd 8(%r13),%xmm1 + vmovd 8(%r9),%xmm2 + vpinsrd $1,8(%r14),%xmm5,%xmm5 + vpinsrd $1,8(%r10),%xmm0,%xmm0 + vpinsrd $1,8(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,8(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm10,%ymm7 + vpslld $26,%ymm10,%ymm2 + vmovdqu %ymm5,64-128(%rax) + vpaddd %ymm13,%ymm5,%ymm5 + + vpsrld $11,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm10,%ymm2 + vpaddd -64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm10,%ymm2 + vpandn %ymm12,%ymm10,%ymm0 + vpand %ymm11,%ymm10,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm14,%ymm13 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm14,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm14,%ymm15,%ymm3 + + vpxor %ymm1,%ymm13,%ymm13 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm14,%ymm1 + + vpslld $19,%ymm14,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm13,%ymm7 + + vpsrld $22,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm14,%ymm2 + vpxor %ymm4,%ymm15,%ymm13 + vpaddd %ymm5,%ymm9,%ymm9 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm13,%ymm13 + vpaddd %ymm7,%ymm13,%ymm13 + vmovd 12(%r12),%xmm5 + vmovd 12(%r8),%xmm0 + vmovd 12(%r13),%xmm1 + vmovd 12(%r9),%xmm2 + vpinsrd $1,12(%r14),%xmm5,%xmm5 + vpinsrd $1,12(%r10),%xmm0,%xmm0 + vpinsrd $1,12(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,12(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm9,%ymm7 + vpslld $26,%ymm9,%ymm2 + vmovdqu %ymm5,96-128(%rax) + vpaddd %ymm12,%ymm5,%ymm5 + + vpsrld $11,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm9,%ymm2 + vpaddd -32(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm9,%ymm2 + vpandn %ymm11,%ymm9,%ymm0 + vpand %ymm10,%ymm9,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm13,%ymm12 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm13,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm13,%ymm14,%ymm4 + + vpxor %ymm1,%ymm12,%ymm12 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm13,%ymm1 + + vpslld $19,%ymm13,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm12,%ymm7 + + vpsrld $22,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm13,%ymm2 + vpxor %ymm3,%ymm14,%ymm12 + vpaddd %ymm5,%ymm8,%ymm8 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm12,%ymm12 + vpaddd %ymm7,%ymm12,%ymm12 + vmovd 16(%r12),%xmm5 + vmovd 16(%r8),%xmm0 + vmovd 16(%r13),%xmm1 + vmovd 16(%r9),%xmm2 + vpinsrd $1,16(%r14),%xmm5,%xmm5 + vpinsrd $1,16(%r10),%xmm0,%xmm0 + vpinsrd $1,16(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,16(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm8,%ymm7 + vpslld $26,%ymm8,%ymm2 + vmovdqu %ymm5,128-128(%rax) + vpaddd 
%ymm11,%ymm5,%ymm5 + + vpsrld $11,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm8,%ymm2 + vpaddd 0(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm8,%ymm2 + vpandn %ymm10,%ymm8,%ymm0 + vpand %ymm9,%ymm8,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm12,%ymm11 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm12,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm12,%ymm13,%ymm3 + + vpxor %ymm1,%ymm11,%ymm11 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm12,%ymm1 + + vpslld $19,%ymm12,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm11,%ymm7 + + vpsrld $22,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm12,%ymm2 + vpxor %ymm4,%ymm13,%ymm11 + vpaddd %ymm5,%ymm15,%ymm15 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm11,%ymm11 + vpaddd %ymm7,%ymm11,%ymm11 + vmovd 20(%r12),%xmm5 + vmovd 20(%r8),%xmm0 + vmovd 20(%r13),%xmm1 + vmovd 20(%r9),%xmm2 + vpinsrd $1,20(%r14),%xmm5,%xmm5 + vpinsrd $1,20(%r10),%xmm0,%xmm0 + vpinsrd $1,20(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,20(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm15,%ymm7 + vpslld $26,%ymm15,%ymm2 + vmovdqu %ymm5,160-128(%rax) + vpaddd %ymm10,%ymm5,%ymm5 + + vpsrld $11,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm15,%ymm2 + vpaddd 32(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm15,%ymm2 + vpandn %ymm9,%ymm15,%ymm0 + vpand %ymm8,%ymm15,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm11,%ymm10 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm11,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm11,%ymm12,%ymm4 + + vpxor %ymm1,%ymm10,%ymm10 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm11,%ymm1 + + vpslld $19,%ymm11,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm10,%ymm7 + + vpsrld $22,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm11,%ymm2 + vpxor %ymm3,%ymm12,%ymm10 + vpaddd %ymm5,%ymm14,%ymm14 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm10,%ymm10 + vpaddd %ymm7,%ymm10,%ymm10 + vmovd 24(%r12),%xmm5 + vmovd 24(%r8),%xmm0 + vmovd 24(%r13),%xmm1 + vmovd 24(%r9),%xmm2 + vpinsrd $1,24(%r14),%xmm5,%xmm5 + vpinsrd $1,24(%r10),%xmm0,%xmm0 + vpinsrd $1,24(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,24(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm14,%ymm7 + vpslld $26,%ymm14,%ymm2 + vmovdqu %ymm5,192-128(%rax) + vpaddd %ymm9,%ymm5,%ymm5 + + vpsrld $11,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm14,%ymm2 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm14,%ymm2 + vpandn %ymm8,%ymm14,%ymm0 + vpand %ymm15,%ymm14,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm10,%ymm9 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm10,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm10,%ymm11,%ymm3 + + vpxor %ymm1,%ymm9,%ymm9 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm10,%ymm1 + + vpslld $19,%ymm10,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm9,%ymm7 + + vpsrld $22,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm10,%ymm2 + vpxor %ymm4,%ymm11,%ymm9 + vpaddd %ymm5,%ymm13,%ymm13 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm9,%ymm9 + 
vpaddd %ymm7,%ymm9,%ymm9 + vmovd 28(%r12),%xmm5 + vmovd 28(%r8),%xmm0 + vmovd 28(%r13),%xmm1 + vmovd 28(%r9),%xmm2 + vpinsrd $1,28(%r14),%xmm5,%xmm5 + vpinsrd $1,28(%r10),%xmm0,%xmm0 + vpinsrd $1,28(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,28(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm13,%ymm7 + vpslld $26,%ymm13,%ymm2 + vmovdqu %ymm5,224-128(%rax) + vpaddd %ymm8,%ymm5,%ymm5 + + vpsrld $11,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm13,%ymm2 + vpaddd 96(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm13,%ymm2 + vpandn %ymm15,%ymm13,%ymm0 + vpand %ymm14,%ymm13,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm9,%ymm8 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm9,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm9,%ymm10,%ymm4 + + vpxor %ymm1,%ymm8,%ymm8 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm9,%ymm1 + + vpslld $19,%ymm9,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm8,%ymm7 + + vpsrld $22,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm9,%ymm2 + vpxor %ymm3,%ymm10,%ymm8 + vpaddd %ymm5,%ymm12,%ymm12 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm8,%ymm8 + vpaddd %ymm7,%ymm8,%ymm8 + addq $256,%rbp + vmovd 32(%r12),%xmm5 + vmovd 32(%r8),%xmm0 + vmovd 32(%r13),%xmm1 + vmovd 32(%r9),%xmm2 + vpinsrd $1,32(%r14),%xmm5,%xmm5 + vpinsrd $1,32(%r10),%xmm0,%xmm0 + vpinsrd $1,32(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,32(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm12,%ymm7 + vpslld $26,%ymm12,%ymm2 + vmovdqu %ymm5,256-256-128(%rbx) + vpaddd %ymm15,%ymm5,%ymm5 + + vpsrld $11,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm12,%ymm2 + vpaddd -128(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm12,%ymm2 + vpandn %ymm14,%ymm12,%ymm0 + vpand %ymm13,%ymm12,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm8,%ymm15 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm8,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm8,%ymm9,%ymm3 + + vpxor %ymm1,%ymm15,%ymm15 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm8,%ymm1 + + vpslld $19,%ymm8,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm15,%ymm7 + + vpsrld $22,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm8,%ymm2 + vpxor %ymm4,%ymm9,%ymm15 + vpaddd %ymm5,%ymm11,%ymm11 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm15,%ymm15 + vpaddd %ymm7,%ymm15,%ymm15 + vmovd 36(%r12),%xmm5 + vmovd 36(%r8),%xmm0 + vmovd 36(%r13),%xmm1 + vmovd 36(%r9),%xmm2 + vpinsrd $1,36(%r14),%xmm5,%xmm5 + vpinsrd $1,36(%r10),%xmm0,%xmm0 + vpinsrd $1,36(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,36(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm11,%ymm7 + vpslld $26,%ymm11,%ymm2 + vmovdqu %ymm5,288-256-128(%rbx) + vpaddd %ymm14,%ymm5,%ymm5 + + vpsrld $11,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm11,%ymm2 + vpaddd -96(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm11,%ymm2 + vpandn %ymm13,%ymm11,%ymm0 + vpand %ymm12,%ymm11,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm15,%ymm14 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld 
$30,%ymm15,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm15,%ymm8,%ymm4 + + vpxor %ymm1,%ymm14,%ymm14 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm15,%ymm1 + + vpslld $19,%ymm15,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm14,%ymm7 + + vpsrld $22,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm15,%ymm2 + vpxor %ymm3,%ymm8,%ymm14 + vpaddd %ymm5,%ymm10,%ymm10 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm14,%ymm14 + vpaddd %ymm7,%ymm14,%ymm14 + vmovd 40(%r12),%xmm5 + vmovd 40(%r8),%xmm0 + vmovd 40(%r13),%xmm1 + vmovd 40(%r9),%xmm2 + vpinsrd $1,40(%r14),%xmm5,%xmm5 + vpinsrd $1,40(%r10),%xmm0,%xmm0 + vpinsrd $1,40(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,40(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm10,%ymm7 + vpslld $26,%ymm10,%ymm2 + vmovdqu %ymm5,320-256-128(%rbx) + vpaddd %ymm13,%ymm5,%ymm5 + + vpsrld $11,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm10,%ymm2 + vpaddd -64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm10,%ymm2 + vpandn %ymm12,%ymm10,%ymm0 + vpand %ymm11,%ymm10,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm14,%ymm13 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm14,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm14,%ymm15,%ymm3 + + vpxor %ymm1,%ymm13,%ymm13 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm14,%ymm1 + + vpslld $19,%ymm14,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm13,%ymm7 + + vpsrld $22,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm14,%ymm2 + vpxor %ymm4,%ymm15,%ymm13 + vpaddd %ymm5,%ymm9,%ymm9 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm13,%ymm13 + vpaddd %ymm7,%ymm13,%ymm13 + vmovd 44(%r12),%xmm5 + vmovd 44(%r8),%xmm0 + vmovd 44(%r13),%xmm1 + vmovd 44(%r9),%xmm2 + vpinsrd $1,44(%r14),%xmm5,%xmm5 + vpinsrd $1,44(%r10),%xmm0,%xmm0 + vpinsrd $1,44(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,44(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm9,%ymm7 + vpslld $26,%ymm9,%ymm2 + vmovdqu %ymm5,352-256-128(%rbx) + vpaddd %ymm12,%ymm5,%ymm5 + + vpsrld $11,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm9,%ymm2 + vpaddd -32(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm9,%ymm2 + vpandn %ymm11,%ymm9,%ymm0 + vpand %ymm10,%ymm9,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm13,%ymm12 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm13,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm13,%ymm14,%ymm4 + + vpxor %ymm1,%ymm12,%ymm12 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm13,%ymm1 + + vpslld $19,%ymm13,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm12,%ymm7 + + vpsrld $22,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm13,%ymm2 + vpxor %ymm3,%ymm14,%ymm12 + vpaddd %ymm5,%ymm8,%ymm8 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm12,%ymm12 + vpaddd %ymm7,%ymm12,%ymm12 + vmovd 48(%r12),%xmm5 + vmovd 48(%r8),%xmm0 + vmovd 48(%r13),%xmm1 + vmovd 48(%r9),%xmm2 + vpinsrd $1,48(%r14),%xmm5,%xmm5 + vpinsrd $1,48(%r10),%xmm0,%xmm0 + vpinsrd $1,48(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,48(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb 
%ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm8,%ymm7 + vpslld $26,%ymm8,%ymm2 + vmovdqu %ymm5,384-256-128(%rbx) + vpaddd %ymm11,%ymm5,%ymm5 + + vpsrld $11,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm8,%ymm2 + vpaddd 0(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm8,%ymm2 + vpandn %ymm10,%ymm8,%ymm0 + vpand %ymm9,%ymm8,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm12,%ymm11 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm12,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm12,%ymm13,%ymm3 + + vpxor %ymm1,%ymm11,%ymm11 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm12,%ymm1 + + vpslld $19,%ymm12,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm11,%ymm7 + + vpsrld $22,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm12,%ymm2 + vpxor %ymm4,%ymm13,%ymm11 + vpaddd %ymm5,%ymm15,%ymm15 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm11,%ymm11 + vpaddd %ymm7,%ymm11,%ymm11 + vmovd 52(%r12),%xmm5 + vmovd 52(%r8),%xmm0 + vmovd 52(%r13),%xmm1 + vmovd 52(%r9),%xmm2 + vpinsrd $1,52(%r14),%xmm5,%xmm5 + vpinsrd $1,52(%r10),%xmm0,%xmm0 + vpinsrd $1,52(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,52(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm15,%ymm7 + vpslld $26,%ymm15,%ymm2 + vmovdqu %ymm5,416-256-128(%rbx) + vpaddd %ymm10,%ymm5,%ymm5 + + vpsrld $11,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm15,%ymm2 + vpaddd 32(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm15,%ymm2 + vpandn %ymm9,%ymm15,%ymm0 + vpand %ymm8,%ymm15,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm11,%ymm10 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm11,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm11,%ymm12,%ymm4 + + vpxor %ymm1,%ymm10,%ymm10 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm11,%ymm1 + + vpslld $19,%ymm11,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm10,%ymm7 + + vpsrld $22,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm11,%ymm2 + vpxor %ymm3,%ymm12,%ymm10 + vpaddd %ymm5,%ymm14,%ymm14 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm10,%ymm10 + vpaddd %ymm7,%ymm10,%ymm10 + vmovd 56(%r12),%xmm5 + vmovd 56(%r8),%xmm0 + vmovd 56(%r13),%xmm1 + vmovd 56(%r9),%xmm2 + vpinsrd $1,56(%r14),%xmm5,%xmm5 + vpinsrd $1,56(%r10),%xmm0,%xmm0 + vpinsrd $1,56(%r15),%xmm1,%xmm1 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,56(%r11),%xmm2,%xmm2 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm14,%ymm7 + vpslld $26,%ymm14,%ymm2 + vmovdqu %ymm5,448-256-128(%rbx) + vpaddd %ymm9,%ymm5,%ymm5 + + vpsrld $11,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm14,%ymm2 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm14,%ymm2 + vpandn %ymm8,%ymm14,%ymm0 + vpand %ymm15,%ymm14,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm10,%ymm9 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm10,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm10,%ymm11,%ymm3 + + vpxor %ymm1,%ymm9,%ymm9 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm10,%ymm1 + + vpslld $19,%ymm10,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm9,%ymm7 + + vpsrld $22,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm10,%ymm2 + vpxor %ymm4,%ymm11,%ymm9 
+ vpaddd %ymm5,%ymm13,%ymm13 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm9,%ymm9 + vpaddd %ymm7,%ymm9,%ymm9 + vmovd 60(%r12),%xmm5 + leaq 64(%r12),%r12 + vmovd 60(%r8),%xmm0 + leaq 64(%r8),%r8 + vmovd 60(%r13),%xmm1 + leaq 64(%r13),%r13 + vmovd 60(%r9),%xmm2 + leaq 64(%r9),%r9 + vpinsrd $1,60(%r14),%xmm5,%xmm5 + leaq 64(%r14),%r14 + vpinsrd $1,60(%r10),%xmm0,%xmm0 + leaq 64(%r10),%r10 + vpinsrd $1,60(%r15),%xmm1,%xmm1 + leaq 64(%r15),%r15 + vpunpckldq %ymm1,%ymm5,%ymm5 + vpinsrd $1,60(%r11),%xmm2,%xmm2 + leaq 64(%r11),%r11 + vpunpckldq %ymm2,%ymm0,%ymm0 + vinserti128 $1,%xmm0,%ymm5,%ymm5 + vpshufb %ymm6,%ymm5,%ymm5 + vpsrld $6,%ymm13,%ymm7 + vpslld $26,%ymm13,%ymm2 + vmovdqu %ymm5,480-256-128(%rbx) + vpaddd %ymm8,%ymm5,%ymm5 + + vpsrld $11,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm13,%ymm2 + vpaddd 96(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + prefetcht0 63(%r12) + vpslld $7,%ymm13,%ymm2 + vpandn %ymm15,%ymm13,%ymm0 + vpand %ymm14,%ymm13,%ymm4 + prefetcht0 63(%r13) + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm9,%ymm8 + vpxor %ymm2,%ymm7,%ymm7 + prefetcht0 63(%r14) + vpslld $30,%ymm9,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm9,%ymm10,%ymm4 + prefetcht0 63(%r15) + vpxor %ymm1,%ymm8,%ymm8 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm9,%ymm1 + prefetcht0 63(%r8) + vpslld $19,%ymm9,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm4,%ymm3,%ymm3 + prefetcht0 63(%r9) + vpxor %ymm1,%ymm8,%ymm7 + + vpsrld $22,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + prefetcht0 63(%r10) + vpslld $10,%ymm9,%ymm2 + vpxor %ymm3,%ymm10,%ymm8 + vpaddd %ymm5,%ymm12,%ymm12 + prefetcht0 63(%r11) + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm8,%ymm8 + vpaddd %ymm7,%ymm8,%ymm8 + addq $256,%rbp + vmovdqu 0-128(%rax),%ymm5 + movl $3,%ecx + jmp .Loop_16_xx_avx2 +.align 32 +.Loop_16_xx_avx2: + vmovdqu 32-128(%rax),%ymm6 + vpaddd 288-256-128(%rbx),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 448-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm12,%ymm7 + vpslld $26,%ymm12,%ymm2 + vmovdqu %ymm5,0-128(%rax) + vpaddd %ymm15,%ymm5,%ymm5 + + vpsrld $11,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm12,%ymm2 + vpaddd -128(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm12,%ymm2 + vpandn %ymm14,%ymm12,%ymm0 + vpand %ymm13,%ymm12,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm8,%ymm15 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm8,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm8,%ymm9,%ymm3 + + vpxor %ymm1,%ymm15,%ymm15 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm8,%ymm1 + + vpslld $19,%ymm8,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm15,%ymm7 + + vpsrld $22,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm8,%ymm2 + vpxor %ymm4,%ymm9,%ymm15 + vpaddd %ymm5,%ymm11,%ymm11 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm15,%ymm15 + vpaddd %ymm7,%ymm15,%ymm15 + vmovdqu 64-128(%rax),%ymm5 + vpaddd 
320-256-128(%rbx),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 480-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm11,%ymm7 + vpslld $26,%ymm11,%ymm2 + vmovdqu %ymm6,32-128(%rax) + vpaddd %ymm14,%ymm6,%ymm6 + + vpsrld $11,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm11,%ymm2 + vpaddd -96(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm11,%ymm2 + vpandn %ymm13,%ymm11,%ymm0 + vpand %ymm12,%ymm11,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm15,%ymm14 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm15,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm15,%ymm8,%ymm4 + + vpxor %ymm1,%ymm14,%ymm14 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm15,%ymm1 + + vpslld $19,%ymm15,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm14,%ymm7 + + vpsrld $22,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm15,%ymm2 + vpxor %ymm3,%ymm8,%ymm14 + vpaddd %ymm6,%ymm10,%ymm10 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm14,%ymm14 + vpaddd %ymm7,%ymm14,%ymm14 + vmovdqu 96-128(%rax),%ymm6 + vpaddd 352-256-128(%rbx),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 0-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm10,%ymm7 + vpslld $26,%ymm10,%ymm2 + vmovdqu %ymm5,64-128(%rax) + vpaddd %ymm13,%ymm5,%ymm5 + + vpsrld $11,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm10,%ymm2 + vpaddd -64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm10,%ymm2 + vpandn %ymm12,%ymm10,%ymm0 + vpand %ymm11,%ymm10,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm14,%ymm13 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm14,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm14,%ymm15,%ymm3 + + vpxor %ymm1,%ymm13,%ymm13 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm14,%ymm1 + + vpslld $19,%ymm14,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm13,%ymm7 + + vpsrld $22,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm14,%ymm2 + vpxor %ymm4,%ymm15,%ymm13 + vpaddd %ymm5,%ymm9,%ymm9 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm13,%ymm13 + vpaddd %ymm7,%ymm13,%ymm13 + vmovdqu 128-128(%rax),%ymm5 + vpaddd 384-256-128(%rbx),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 32-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + 
vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm9,%ymm7 + vpslld $26,%ymm9,%ymm2 + vmovdqu %ymm6,96-128(%rax) + vpaddd %ymm12,%ymm6,%ymm6 + + vpsrld $11,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm9,%ymm2 + vpaddd -32(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm9,%ymm2 + vpandn %ymm11,%ymm9,%ymm0 + vpand %ymm10,%ymm9,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm13,%ymm12 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm13,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm13,%ymm14,%ymm4 + + vpxor %ymm1,%ymm12,%ymm12 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm13,%ymm1 + + vpslld $19,%ymm13,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm12,%ymm7 + + vpsrld $22,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm13,%ymm2 + vpxor %ymm3,%ymm14,%ymm12 + vpaddd %ymm6,%ymm8,%ymm8 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm12,%ymm12 + vpaddd %ymm7,%ymm12,%ymm12 + vmovdqu 160-128(%rax),%ymm6 + vpaddd 416-256-128(%rbx),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 64-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm8,%ymm7 + vpslld $26,%ymm8,%ymm2 + vmovdqu %ymm5,128-128(%rax) + vpaddd %ymm11,%ymm5,%ymm5 + + vpsrld $11,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm8,%ymm2 + vpaddd 0(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm8,%ymm2 + vpandn %ymm10,%ymm8,%ymm0 + vpand %ymm9,%ymm8,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm12,%ymm11 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm12,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm12,%ymm13,%ymm3 + + vpxor %ymm1,%ymm11,%ymm11 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm12,%ymm1 + + vpslld $19,%ymm12,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm11,%ymm7 + + vpsrld $22,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm12,%ymm2 + vpxor %ymm4,%ymm13,%ymm11 + vpaddd %ymm5,%ymm15,%ymm15 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm11,%ymm11 + vpaddd %ymm7,%ymm11,%ymm11 + vmovdqu 192-128(%rax),%ymm5 + vpaddd 448-256-128(%rbx),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 96-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm15,%ymm7 + vpslld $26,%ymm15,%ymm2 + vmovdqu %ymm6,160-128(%rax) + vpaddd %ymm10,%ymm6,%ymm6 + + vpsrld 
$11,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm15,%ymm2 + vpaddd 32(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm15,%ymm2 + vpandn %ymm9,%ymm15,%ymm0 + vpand %ymm8,%ymm15,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm11,%ymm10 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm11,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm11,%ymm12,%ymm4 + + vpxor %ymm1,%ymm10,%ymm10 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm11,%ymm1 + + vpslld $19,%ymm11,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm10,%ymm7 + + vpsrld $22,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm11,%ymm2 + vpxor %ymm3,%ymm12,%ymm10 + vpaddd %ymm6,%ymm14,%ymm14 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm10,%ymm10 + vpaddd %ymm7,%ymm10,%ymm10 + vmovdqu 224-128(%rax),%ymm6 + vpaddd 480-256-128(%rbx),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 128-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm14,%ymm7 + vpslld $26,%ymm14,%ymm2 + vmovdqu %ymm5,192-128(%rax) + vpaddd %ymm9,%ymm5,%ymm5 + + vpsrld $11,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm14,%ymm2 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm14,%ymm2 + vpandn %ymm8,%ymm14,%ymm0 + vpand %ymm15,%ymm14,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm10,%ymm9 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm10,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm10,%ymm11,%ymm3 + + vpxor %ymm1,%ymm9,%ymm9 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm10,%ymm1 + + vpslld $19,%ymm10,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm9,%ymm7 + + vpsrld $22,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm10,%ymm2 + vpxor %ymm4,%ymm11,%ymm9 + vpaddd %ymm5,%ymm13,%ymm13 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm9,%ymm9 + vpaddd %ymm7,%ymm9,%ymm9 + vmovdqu 256-256-128(%rbx),%ymm5 + vpaddd 0-128(%rax),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 160-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm13,%ymm7 + vpslld $26,%ymm13,%ymm2 + vmovdqu %ymm6,224-128(%rax) + vpaddd %ymm8,%ymm6,%ymm6 + + vpsrld $11,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm13,%ymm2 + vpaddd 96(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm13,%ymm2 + vpandn %ymm15,%ymm13,%ymm0 + vpand %ymm14,%ymm13,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm9,%ymm8 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld 
$30,%ymm9,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm9,%ymm10,%ymm4 + + vpxor %ymm1,%ymm8,%ymm8 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm9,%ymm1 + + vpslld $19,%ymm9,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm8,%ymm7 + + vpsrld $22,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm9,%ymm2 + vpxor %ymm3,%ymm10,%ymm8 + vpaddd %ymm6,%ymm12,%ymm12 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm8,%ymm8 + vpaddd %ymm7,%ymm8,%ymm8 + addq $256,%rbp + vmovdqu 288-256-128(%rbx),%ymm6 + vpaddd 32-128(%rax),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 192-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm12,%ymm7 + vpslld $26,%ymm12,%ymm2 + vmovdqu %ymm5,256-256-128(%rbx) + vpaddd %ymm15,%ymm5,%ymm5 + + vpsrld $11,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm12,%ymm2 + vpaddd -128(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm12,%ymm2 + vpandn %ymm14,%ymm12,%ymm0 + vpand %ymm13,%ymm12,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm8,%ymm15 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm8,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm8,%ymm9,%ymm3 + + vpxor %ymm1,%ymm15,%ymm15 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm8,%ymm1 + + vpslld $19,%ymm8,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm15,%ymm7 + + vpsrld $22,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm8,%ymm2 + vpxor %ymm4,%ymm9,%ymm15 + vpaddd %ymm5,%ymm11,%ymm11 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm15,%ymm15 + vpaddd %ymm7,%ymm15,%ymm15 + vmovdqu 320-256-128(%rbx),%ymm5 + vpaddd 64-128(%rax),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 224-128(%rax),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm11,%ymm7 + vpslld $26,%ymm11,%ymm2 + vmovdqu %ymm6,288-256-128(%rbx) + vpaddd %ymm14,%ymm6,%ymm6 + + vpsrld $11,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm11,%ymm2 + vpaddd -96(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm11,%ymm2 + vpandn %ymm13,%ymm11,%ymm0 + vpand %ymm12,%ymm11,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm15,%ymm14 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm15,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm15,%ymm8,%ymm4 + + vpxor %ymm1,%ymm14,%ymm14 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm15,%ymm1 + + vpslld $19,%ymm15,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm14,%ymm7 + + vpsrld $22,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld 
$10,%ymm15,%ymm2 + vpxor %ymm3,%ymm8,%ymm14 + vpaddd %ymm6,%ymm10,%ymm10 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm14,%ymm14 + vpaddd %ymm7,%ymm14,%ymm14 + vmovdqu 352-256-128(%rbx),%ymm6 + vpaddd 96-128(%rax),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 256-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm10,%ymm7 + vpslld $26,%ymm10,%ymm2 + vmovdqu %ymm5,320-256-128(%rbx) + vpaddd %ymm13,%ymm5,%ymm5 + + vpsrld $11,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm10,%ymm2 + vpaddd -64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm10,%ymm2 + vpandn %ymm12,%ymm10,%ymm0 + vpand %ymm11,%ymm10,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm14,%ymm13 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm14,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm14,%ymm15,%ymm3 + + vpxor %ymm1,%ymm13,%ymm13 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm14,%ymm1 + + vpslld $19,%ymm14,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm13,%ymm7 + + vpsrld $22,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm14,%ymm2 + vpxor %ymm4,%ymm15,%ymm13 + vpaddd %ymm5,%ymm9,%ymm9 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm13,%ymm13 + vpaddd %ymm7,%ymm13,%ymm13 + vmovdqu 384-256-128(%rbx),%ymm5 + vpaddd 128-128(%rax),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 288-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm9,%ymm7 + vpslld $26,%ymm9,%ymm2 + vmovdqu %ymm6,352-256-128(%rbx) + vpaddd %ymm12,%ymm6,%ymm6 + + vpsrld $11,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm9,%ymm2 + vpaddd -32(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm9,%ymm2 + vpandn %ymm11,%ymm9,%ymm0 + vpand %ymm10,%ymm9,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm13,%ymm12 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm13,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm13,%ymm14,%ymm4 + + vpxor %ymm1,%ymm12,%ymm12 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm13,%ymm1 + + vpslld $19,%ymm13,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm12,%ymm7 + + vpsrld $22,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm13,%ymm2 + vpxor %ymm3,%ymm14,%ymm12 + vpaddd %ymm6,%ymm8,%ymm8 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm12,%ymm12 + vpaddd %ymm7,%ymm12,%ymm12 + vmovdqu 416-256-128(%rbx),%ymm6 + vpaddd 160-128(%rax),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + 
vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 320-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm8,%ymm7 + vpslld $26,%ymm8,%ymm2 + vmovdqu %ymm5,384-256-128(%rbx) + vpaddd %ymm11,%ymm5,%ymm5 + + vpsrld $11,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm8,%ymm2 + vpaddd 0(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm8,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm8,%ymm2 + vpandn %ymm10,%ymm8,%ymm0 + vpand %ymm9,%ymm8,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm12,%ymm11 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm12,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm12,%ymm13,%ymm3 + + vpxor %ymm1,%ymm11,%ymm11 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm12,%ymm1 + + vpslld $19,%ymm12,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm11,%ymm7 + + vpsrld $22,%ymm12,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm12,%ymm2 + vpxor %ymm4,%ymm13,%ymm11 + vpaddd %ymm5,%ymm15,%ymm15 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm11,%ymm11 + vpaddd %ymm7,%ymm11,%ymm11 + vmovdqu 448-256-128(%rbx),%ymm5 + vpaddd 192-128(%rax),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 352-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm15,%ymm7 + vpslld $26,%ymm15,%ymm2 + vmovdqu %ymm6,416-256-128(%rbx) + vpaddd %ymm10,%ymm6,%ymm6 + + vpsrld $11,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm15,%ymm2 + vpaddd 32(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm15,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm15,%ymm2 + vpandn %ymm9,%ymm15,%ymm0 + vpand %ymm8,%ymm15,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm11,%ymm10 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm11,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm11,%ymm12,%ymm4 + + vpxor %ymm1,%ymm10,%ymm10 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm11,%ymm1 + + vpslld $19,%ymm11,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm10,%ymm7 + + vpsrld $22,%ymm11,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm11,%ymm2 + vpxor %ymm3,%ymm12,%ymm10 + vpaddd %ymm6,%ymm14,%ymm14 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm10,%ymm10 + vpaddd %ymm7,%ymm10,%ymm10 + vmovdqu 480-256-128(%rbx),%ymm6 + vpaddd 224-128(%rax),%ymm5,%ymm5 + + vpsrld $3,%ymm6,%ymm7 + vpsrld $7,%ymm6,%ymm1 + vpslld $25,%ymm6,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm6,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm6,%ymm2 + vmovdqu 384-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm5,%ymm5 + vpxor %ymm1,%ymm3,%ymm7 + vpsrld 
$19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm5,%ymm5 + vpsrld $6,%ymm14,%ymm7 + vpslld $26,%ymm14,%ymm2 + vmovdqu %ymm5,448-256-128(%rbx) + vpaddd %ymm9,%ymm5,%ymm5 + + vpsrld $11,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm14,%ymm2 + vpaddd 64(%rbp),%ymm5,%ymm5 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm14,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm14,%ymm2 + vpandn %ymm8,%ymm14,%ymm0 + vpand %ymm15,%ymm14,%ymm3 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm10,%ymm9 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm10,%ymm1 + vpxor %ymm3,%ymm0,%ymm0 + vpxor %ymm10,%ymm11,%ymm3 + + vpxor %ymm1,%ymm9,%ymm9 + vpaddd %ymm7,%ymm5,%ymm5 + + vpsrld $13,%ymm10,%ymm1 + + vpslld $19,%ymm10,%ymm2 + vpaddd %ymm0,%ymm5,%ymm5 + vpand %ymm3,%ymm4,%ymm4 + + vpxor %ymm1,%ymm9,%ymm7 + + vpsrld $22,%ymm10,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm10,%ymm2 + vpxor %ymm4,%ymm11,%ymm9 + vpaddd %ymm5,%ymm13,%ymm13 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm5,%ymm9,%ymm9 + vpaddd %ymm7,%ymm9,%ymm9 + vmovdqu 0-128(%rax),%ymm5 + vpaddd 256-256-128(%rbx),%ymm6,%ymm6 + + vpsrld $3,%ymm5,%ymm7 + vpsrld $7,%ymm5,%ymm1 + vpslld $25,%ymm5,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $18,%ymm5,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $14,%ymm5,%ymm2 + vmovdqu 416-256-128(%rbx),%ymm0 + vpsrld $10,%ymm0,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + vpsrld $17,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $15,%ymm0,%ymm2 + vpaddd %ymm7,%ymm6,%ymm6 + vpxor %ymm1,%ymm4,%ymm7 + vpsrld $19,%ymm0,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $13,%ymm0,%ymm2 + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + vpaddd %ymm7,%ymm6,%ymm6 + vpsrld $6,%ymm13,%ymm7 + vpslld $26,%ymm13,%ymm2 + vmovdqu %ymm6,480-256-128(%rbx) + vpaddd %ymm8,%ymm6,%ymm6 + + vpsrld $11,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + vpslld $21,%ymm13,%ymm2 + vpaddd 96(%rbp),%ymm6,%ymm6 + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $25,%ymm13,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $7,%ymm13,%ymm2 + vpandn %ymm15,%ymm13,%ymm0 + vpand %ymm14,%ymm13,%ymm4 + + vpxor %ymm1,%ymm7,%ymm7 + + vpsrld $2,%ymm9,%ymm8 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $30,%ymm9,%ymm1 + vpxor %ymm4,%ymm0,%ymm0 + vpxor %ymm9,%ymm10,%ymm4 + + vpxor %ymm1,%ymm8,%ymm8 + vpaddd %ymm7,%ymm6,%ymm6 + + vpsrld $13,%ymm9,%ymm1 + + vpslld $19,%ymm9,%ymm2 + vpaddd %ymm0,%ymm6,%ymm6 + vpand %ymm4,%ymm3,%ymm3 + + vpxor %ymm1,%ymm8,%ymm7 + + vpsrld $22,%ymm9,%ymm1 + vpxor %ymm2,%ymm7,%ymm7 + + vpslld $10,%ymm9,%ymm2 + vpxor %ymm3,%ymm10,%ymm8 + vpaddd %ymm6,%ymm12,%ymm12 + + vpxor %ymm1,%ymm7,%ymm7 + vpxor %ymm2,%ymm7,%ymm7 + + vpaddd %ymm6,%ymm8,%ymm8 + vpaddd %ymm7,%ymm8,%ymm8 + addq $256,%rbp + decl %ecx + jnz .Loop_16_xx_avx2 + + movl $1,%ecx + leaq 512(%rsp),%rbx + leaq K256+128(%rip),%rbp + cmpl 0(%rbx),%ecx + cmovgeq %rbp,%r12 + cmpl 4(%rbx),%ecx + cmovgeq %rbp,%r13 + cmpl 8(%rbx),%ecx + cmovgeq %rbp,%r14 + cmpl 12(%rbx),%ecx + cmovgeq %rbp,%r15 + cmpl 16(%rbx),%ecx + cmovgeq %rbp,%r8 + cmpl 20(%rbx),%ecx + cmovgeq %rbp,%r9 + cmpl 24(%rbx),%ecx + cmovgeq %rbp,%r10 + cmpl 28(%rbx),%ecx + cmovgeq %rbp,%r11 + vmovdqa (%rbx),%ymm7 + vpxor %ymm0,%ymm0,%ymm0 + vmovdqa %ymm7,%ymm6 + vpcmpgtd %ymm0,%ymm6,%ymm6 + vpaddd %ymm6,%ymm7,%ymm7 + + vmovdqu 0-128(%rdi),%ymm0 + vpand %ymm6,%ymm8,%ymm8 + vmovdqu 32-128(%rdi),%ymm1 + vpand %ymm6,%ymm9,%ymm9 + vmovdqu 64-128(%rdi),%ymm2 + vpand %ymm6,%ymm10,%ymm10 + vmovdqu 96-128(%rdi),%ymm5 + vpand %ymm6,%ymm11,%ymm11 + vpaddd %ymm0,%ymm8,%ymm8 + 
vmovdqu 128-128(%rdi),%ymm0 + vpand %ymm6,%ymm12,%ymm12 + vpaddd %ymm1,%ymm9,%ymm9 + vmovdqu 160-128(%rdi),%ymm1 + vpand %ymm6,%ymm13,%ymm13 + vpaddd %ymm2,%ymm10,%ymm10 + vmovdqu 192-128(%rdi),%ymm2 + vpand %ymm6,%ymm14,%ymm14 + vpaddd %ymm5,%ymm11,%ymm11 + vmovdqu 224-128(%rdi),%ymm5 + vpand %ymm6,%ymm15,%ymm15 + vpaddd %ymm0,%ymm12,%ymm12 + vpaddd %ymm1,%ymm13,%ymm13 + vmovdqu %ymm8,0-128(%rdi) + vpaddd %ymm2,%ymm14,%ymm14 + vmovdqu %ymm9,32-128(%rdi) + vpaddd %ymm5,%ymm15,%ymm15 + vmovdqu %ymm10,64-128(%rdi) + vmovdqu %ymm11,96-128(%rdi) + vmovdqu %ymm12,128-128(%rdi) + vmovdqu %ymm13,160-128(%rdi) + vmovdqu %ymm14,192-128(%rdi) + vmovdqu %ymm15,224-128(%rdi) + + vmovdqu %ymm7,(%rbx) + leaq 256+128(%rsp),%rbx + vmovdqu .Lpbswap(%rip),%ymm6 + decl %edx + jnz .Loop_avx2 + + + + + + + +.Ldone_avx2: + movq 544(%rsp),%rax +.cfi_def_cfa %rax,8 + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_multi_block_avx2,.-sha256_multi_block_avx2 .align 256 K256: .long 1116352408,1116352408,1116352408,1116352408 .long 1116352408,1116352408,1116352408,1116352408 .long 1899447441,1899447441,1899447441,1899447441 .long 1899447441,1899447441,1899447441,1899447441 .long 3049323471,3049323471,3049323471,3049323471 .long 3049323471,3049323471,3049323471,3049323471 .long 3921009573,3921009573,3921009573,3921009573 .long 3921009573,3921009573,3921009573,3921009573 .long 961987163,961987163,961987163,961987163 .long 961987163,961987163,961987163,961987163 .long 1508970993,1508970993,1508970993,1508970993 .long 1508970993,1508970993,1508970993,1508970993 .long 2453635748,2453635748,2453635748,2453635748 .long 2453635748,2453635748,2453635748,2453635748 .long 2870763221,2870763221,2870763221,2870763221 .long 2870763221,2870763221,2870763221,2870763221 .long 3624381080,3624381080,3624381080,3624381080 .long 3624381080,3624381080,3624381080,3624381080 .long 310598401,310598401,310598401,310598401 .long 310598401,310598401,310598401,310598401 .long 607225278,607225278,607225278,607225278 .long 607225278,607225278,607225278,607225278 .long 1426881987,1426881987,1426881987,1426881987 .long 1426881987,1426881987,1426881987,1426881987 .long 1925078388,1925078388,1925078388,1925078388 .long 1925078388,1925078388,1925078388,1925078388 .long 2162078206,2162078206,2162078206,2162078206 .long 2162078206,2162078206,2162078206,2162078206 .long 2614888103,2614888103,2614888103,2614888103 .long 2614888103,2614888103,2614888103,2614888103 .long 3248222580,3248222580,3248222580,3248222580 .long 3248222580,3248222580,3248222580,3248222580 .long 3835390401,3835390401,3835390401,3835390401 .long 3835390401,3835390401,3835390401,3835390401 .long 4022224774,4022224774,4022224774,4022224774 .long 4022224774,4022224774,4022224774,4022224774 .long 264347078,264347078,264347078,264347078 .long 264347078,264347078,264347078,264347078 .long 604807628,604807628,604807628,604807628 .long 604807628,604807628,604807628,604807628 .long 770255983,770255983,770255983,770255983 .long 770255983,770255983,770255983,770255983 .long 1249150122,1249150122,1249150122,1249150122 .long 1249150122,1249150122,1249150122,1249150122 .long 1555081692,1555081692,1555081692,1555081692 .long 1555081692,1555081692,1555081692,1555081692 .long 
1996064986,1996064986,1996064986,1996064986 .long 1996064986,1996064986,1996064986,1996064986 .long 2554220882,2554220882,2554220882,2554220882 .long 2554220882,2554220882,2554220882,2554220882 .long 2821834349,2821834349,2821834349,2821834349 .long 2821834349,2821834349,2821834349,2821834349 .long 2952996808,2952996808,2952996808,2952996808 .long 2952996808,2952996808,2952996808,2952996808 .long 3210313671,3210313671,3210313671,3210313671 .long 3210313671,3210313671,3210313671,3210313671 .long 3336571891,3336571891,3336571891,3336571891 .long 3336571891,3336571891,3336571891,3336571891 .long 3584528711,3584528711,3584528711,3584528711 .long 3584528711,3584528711,3584528711,3584528711 .long 113926993,113926993,113926993,113926993 .long 113926993,113926993,113926993,113926993 .long 338241895,338241895,338241895,338241895 .long 338241895,338241895,338241895,338241895 .long 666307205,666307205,666307205,666307205 .long 666307205,666307205,666307205,666307205 .long 773529912,773529912,773529912,773529912 .long 773529912,773529912,773529912,773529912 .long 1294757372,1294757372,1294757372,1294757372 .long 1294757372,1294757372,1294757372,1294757372 .long 1396182291,1396182291,1396182291,1396182291 .long 1396182291,1396182291,1396182291,1396182291 .long 1695183700,1695183700,1695183700,1695183700 .long 1695183700,1695183700,1695183700,1695183700 .long 1986661051,1986661051,1986661051,1986661051 .long 1986661051,1986661051,1986661051,1986661051 .long 2177026350,2177026350,2177026350,2177026350 .long 2177026350,2177026350,2177026350,2177026350 .long 2456956037,2456956037,2456956037,2456956037 .long 2456956037,2456956037,2456956037,2456956037 .long 2730485921,2730485921,2730485921,2730485921 .long 2730485921,2730485921,2730485921,2730485921 .long 2820302411,2820302411,2820302411,2820302411 .long 2820302411,2820302411,2820302411,2820302411 .long 3259730800,3259730800,3259730800,3259730800 .long 3259730800,3259730800,3259730800,3259730800 .long 3345764771,3345764771,3345764771,3345764771 .long 3345764771,3345764771,3345764771,3345764771 .long 3516065817,3516065817,3516065817,3516065817 .long 3516065817,3516065817,3516065817,3516065817 .long 3600352804,3600352804,3600352804,3600352804 .long 3600352804,3600352804,3600352804,3600352804 .long 4094571909,4094571909,4094571909,4094571909 .long 4094571909,4094571909,4094571909,4094571909 .long 275423344,275423344,275423344,275423344 .long 275423344,275423344,275423344,275423344 .long 430227734,430227734,430227734,430227734 .long 430227734,430227734,430227734,430227734 .long 506948616,506948616,506948616,506948616 .long 506948616,506948616,506948616,506948616 .long 659060556,659060556,659060556,659060556 .long 659060556,659060556,659060556,659060556 .long 883997877,883997877,883997877,883997877 .long 883997877,883997877,883997877,883997877 .long 958139571,958139571,958139571,958139571 .long 958139571,958139571,958139571,958139571 .long 1322822218,1322822218,1322822218,1322822218 .long 1322822218,1322822218,1322822218,1322822218 .long 1537002063,1537002063,1537002063,1537002063 .long 1537002063,1537002063,1537002063,1537002063 .long 1747873779,1747873779,1747873779,1747873779 .long 1747873779,1747873779,1747873779,1747873779 .long 1955562222,1955562222,1955562222,1955562222 .long 1955562222,1955562222,1955562222,1955562222 .long 2024104815,2024104815,2024104815,2024104815 .long 2024104815,2024104815,2024104815,2024104815 .long 2227730452,2227730452,2227730452,2227730452 .long 2227730452,2227730452,2227730452,2227730452 .long 
2361852424,2361852424,2361852424,2361852424 .long 2361852424,2361852424,2361852424,2361852424 .long 2428436474,2428436474,2428436474,2428436474 .long 2428436474,2428436474,2428436474,2428436474 .long 2756734187,2756734187,2756734187,2756734187 .long 2756734187,2756734187,2756734187,2756734187 .long 3204031479,3204031479,3204031479,3204031479 .long 3204031479,3204031479,3204031479,3204031479 .long 3329325298,3329325298,3329325298,3329325298 .long 3329325298,3329325298,3329325298,3329325298 .Lpbswap: .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f K256_shaext: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .byte 83,72,65,50,53,54,32,109,117,108,116,105,45,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 Index: stable/12/secure/lib/libcrypto/amd64/sha256-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/sha256-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/sha256-x86_64.S (revision 364963) @@ -1,3089 +1,5458 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from sha512-x86_64.pl. 
*/ .text .globl sha256_block_data_order .type sha256_block_data_order,@function .align 16 sha256_block_data_order: .cfi_startproc leaq OPENSSL_ia32cap_P(%rip),%r11 movl 0(%r11),%r9d movl 4(%r11),%r10d movl 8(%r11),%r11d testl $536870912,%r11d jnz _shaext_shortcut + andl $296,%r11d + cmpl $296,%r11d + je .Lavx2_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je .Lavx_shortcut testl $512,%r10d jnz .Lssse3_shortcut movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 shlq $4,%rdx subq $64+32,%rsp leaq (%rsi,%rdx,4),%rdx andq $-64,%rsp movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) movq %rax,88(%rsp) .cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue: movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d jmp .Lloop .align 16 .Lloop: movl %ebx,%edi leaq K256(%rip),%rbp xorl %ecx,%edi movl 0(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,0(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl (%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d leaq 4(%rbp),%rbp addl %r14d,%r11d movl 4(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,4(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl (%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d leaq 4(%rbp),%rbp addl %r14d,%r10d movl 8(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,8(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl (%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d leaq 4(%rbp),%rbp addl %r14d,%r9d movl 12(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,12(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl (%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d leaq 20(%rbp),%rbp addl %r14d,%r8d movl 16(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,16(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl (%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d 
rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx leaq 4(%rbp),%rbp addl %r14d,%edx movl 20(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,20(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl (%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx leaq 4(%rbp),%rbp addl %r14d,%ecx movl 24(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,24(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl (%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx leaq 4(%rbp),%rbp addl %r14d,%ebx movl 28(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,28(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl (%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 20(%rbp),%rbp addl %r14d,%eax movl 32(%rsi),%r12d movl %r8d,%r13d movl %eax,%r14d bswapl %r12d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,32(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl (%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d leaq 4(%rbp),%rbp addl %r14d,%r11d movl 36(%rsi),%r12d movl %edx,%r13d movl %r11d,%r14d bswapl %r12d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,36(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl (%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d leaq 4(%rbp),%rbp addl %r14d,%r10d movl 40(%rsi),%r12d movl %ecx,%r13d movl %r10d,%r14d bswapl %r12d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,40(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl (%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d leaq 4(%rbp),%rbp addl %r14d,%r9d movl 44(%rsi),%r12d movl %ebx,%r13d movl %r9d,%r14d bswapl %r12d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,44(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl 
(%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d leaq 20(%rbp),%rbp addl %r14d,%r8d movl 48(%rsi),%r12d movl %eax,%r13d movl %r8d,%r14d bswapl %r12d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,48(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl (%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx leaq 4(%rbp),%rbp addl %r14d,%edx movl 52(%rsi),%r12d movl %r11d,%r13d movl %edx,%r14d bswapl %r12d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,52(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl (%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx leaq 4(%rbp),%rbp addl %r14d,%ecx movl 56(%rsi),%r12d movl %r10d,%r13d movl %ecx,%r14d bswapl %r12d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,56(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl (%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx leaq 4(%rbp),%rbp addl %r14d,%ebx movl 60(%rsi),%r12d movl %r9d,%r13d movl %ebx,%r14d bswapl %r12d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,60(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl (%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 20(%rbp),%rbp jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: movl 4(%rsp),%r13d movl 56(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%eax movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 36(%rsp),%r12d addl 0(%rsp),%r12d movl %r8d,%r13d addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,0(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl (%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d leaq 4(%rbp),%rbp movl 8(%rsp),%r13d movl 60(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r11d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 40(%rsp),%r12d addl 4(%rsp),%r12d movl %edx,%r13d addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,4(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl 
$11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl (%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d leaq 4(%rbp),%rbp movl 12(%rsp),%r13d movl 0(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r10d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 44(%rsp),%r12d addl 8(%rsp),%r12d movl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,8(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl (%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d leaq 4(%rbp),%rbp movl 16(%rsp),%r13d movl 4(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r9d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 48(%rsp),%r12d addl 12(%rsp),%r12d movl %ebx,%r13d addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,12(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl (%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d leaq 20(%rbp),%rbp movl 20(%rsp),%r13d movl 8(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r8d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 52(%rsp),%r12d addl 16(%rsp),%r12d movl %eax,%r13d addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,16(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl (%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx leaq 4(%rbp),%rbp movl 24(%rsp),%r13d movl 12(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%edx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 56(%rsp),%r12d addl 20(%rsp),%r12d movl %r11d,%r13d addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,20(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl (%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx leaq 4(%rbp),%rbp movl 28(%rsp),%r13d movl 16(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ecx movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 
60(%rsp),%r12d addl 24(%rsp),%r12d movl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,24(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl (%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx leaq 4(%rbp),%rbp movl 32(%rsp),%r13d movl 20(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ebx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 0(%rsp),%r12d addl 28(%rsp),%r12d movl %r9d,%r13d addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,28(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl %r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl (%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 20(%rbp),%rbp movl 36(%rsp),%r13d movl 24(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%eax movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 4(%rsp),%r12d addl 32(%rsp),%r12d movl %r8d,%r13d addl %r15d,%r12d movl %eax,%r14d rorl $14,%r13d movl %r9d,%r15d xorl %r8d,%r13d rorl $9,%r14d xorl %r10d,%r15d movl %r12d,32(%rsp) xorl %eax,%r14d andl %r8d,%r15d rorl $5,%r13d addl %r11d,%r12d xorl %r10d,%r15d rorl $11,%r14d xorl %r8d,%r13d addl %r15d,%r12d movl %eax,%r15d addl (%rbp),%r12d xorl %eax,%r14d xorl %ebx,%r15d rorl $6,%r13d movl %ebx,%r11d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r11d addl %r12d,%edx addl %r12d,%r11d leaq 4(%rbp),%rbp movl 40(%rsp),%r13d movl 28(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r11d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 8(%rsp),%r12d addl 36(%rsp),%r12d movl %edx,%r13d addl %edi,%r12d movl %r11d,%r14d rorl $14,%r13d movl %r8d,%edi xorl %edx,%r13d rorl $9,%r14d xorl %r9d,%edi movl %r12d,36(%rsp) xorl %r11d,%r14d andl %edx,%edi rorl $5,%r13d addl %r10d,%r12d xorl %r9d,%edi rorl $11,%r14d xorl %edx,%r13d addl %edi,%r12d movl %r11d,%edi addl (%rbp),%r12d xorl %r11d,%r14d xorl %eax,%edi rorl $6,%r13d movl %eax,%r10d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r10d addl %r12d,%ecx addl %r12d,%r10d leaq 4(%rbp),%rbp movl 44(%rsp),%r13d movl 32(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r10d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 12(%rsp),%r12d addl 40(%rsp),%r12d movl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r14d rorl $14,%r13d movl %edx,%r15d xorl %ecx,%r13d rorl $9,%r14d xorl %r8d,%r15d movl %r12d,40(%rsp) xorl %r10d,%r14d andl %ecx,%r15d rorl $5,%r13d addl %r9d,%r12d xorl %r8d,%r15d rorl $11,%r14d xorl %ecx,%r13d addl %r15d,%r12d movl %r10d,%r15d addl (%rbp),%r12d xorl %r10d,%r14d xorl %r11d,%r15d rorl $6,%r13d movl %r11d,%r9d andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%r9d addl %r12d,%ebx addl %r12d,%r9d 
leaq 4(%rbp),%rbp movl 48(%rsp),%r13d movl 36(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r9d movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 16(%rsp),%r12d addl 44(%rsp),%r12d movl %ebx,%r13d addl %edi,%r12d movl %r9d,%r14d rorl $14,%r13d movl %ecx,%edi xorl %ebx,%r13d rorl $9,%r14d xorl %edx,%edi movl %r12d,44(%rsp) xorl %r9d,%r14d andl %ebx,%edi rorl $5,%r13d addl %r8d,%r12d xorl %edx,%edi rorl $11,%r14d xorl %ebx,%r13d addl %edi,%r12d movl %r9d,%edi addl (%rbp),%r12d xorl %r9d,%r14d xorl %r10d,%edi rorl $6,%r13d movl %r10d,%r8d andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%r8d addl %r12d,%eax addl %r12d,%r8d leaq 20(%rbp),%rbp movl 52(%rsp),%r13d movl 40(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%r8d movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 20(%rsp),%r12d addl 48(%rsp),%r12d movl %eax,%r13d addl %r15d,%r12d movl %r8d,%r14d rorl $14,%r13d movl %ebx,%r15d xorl %eax,%r13d rorl $9,%r14d xorl %ecx,%r15d movl %r12d,48(%rsp) xorl %r8d,%r14d andl %eax,%r15d rorl $5,%r13d addl %edx,%r12d xorl %ecx,%r15d rorl $11,%r14d xorl %eax,%r13d addl %r15d,%r12d movl %r8d,%r15d addl (%rbp),%r12d xorl %r8d,%r14d xorl %r9d,%r15d rorl $6,%r13d movl %r9d,%edx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%edx addl %r12d,%r11d addl %r12d,%edx leaq 4(%rbp),%rbp movl 56(%rsp),%r13d movl 44(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%edx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 24(%rsp),%r12d addl 52(%rsp),%r12d movl %r11d,%r13d addl %edi,%r12d movl %edx,%r14d rorl $14,%r13d movl %eax,%edi xorl %r11d,%r13d rorl $9,%r14d xorl %ebx,%edi movl %r12d,52(%rsp) xorl %edx,%r14d andl %r11d,%edi rorl $5,%r13d addl %ecx,%r12d xorl %ebx,%edi rorl $11,%r14d xorl %r11d,%r13d addl %edi,%r12d movl %edx,%edi addl (%rbp),%r12d xorl %edx,%r14d xorl %r8d,%edi rorl $6,%r13d movl %r8d,%ecx andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%ecx addl %r12d,%r10d addl %r12d,%ecx leaq 4(%rbp),%rbp movl 60(%rsp),%r13d movl 48(%rsp),%r15d movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ecx movl %r15d,%r14d rorl $2,%r15d xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%r15d shrl $10,%r14d rorl $17,%r15d xorl %r13d,%r12d xorl %r14d,%r15d addl 28(%rsp),%r12d addl 56(%rsp),%r12d movl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r14d rorl $14,%r13d movl %r11d,%r15d xorl %r10d,%r13d rorl $9,%r14d xorl %eax,%r15d movl %r12d,56(%rsp) xorl %ecx,%r14d andl %r10d,%r15d rorl $5,%r13d addl %ebx,%r12d xorl %eax,%r15d rorl $11,%r14d xorl %r10d,%r13d addl %r15d,%r12d movl %ecx,%r15d addl (%rbp),%r12d xorl %ecx,%r14d xorl %edx,%r15d rorl $6,%r13d movl %edx,%ebx andl %r15d,%edi rorl $2,%r14d addl %r13d,%r12d xorl %edi,%ebx addl %r12d,%r9d addl %r12d,%ebx leaq 4(%rbp),%rbp movl 0(%rsp),%r13d movl 52(%rsp),%edi movl %r13d,%r12d rorl $11,%r13d addl %r14d,%ebx movl %edi,%r14d rorl $2,%edi xorl %r12d,%r13d shrl $3,%r12d rorl $7,%r13d xorl %r14d,%edi shrl $10,%r14d rorl $17,%edi xorl %r13d,%r12d xorl %r14d,%edi addl 32(%rsp),%r12d addl 60(%rsp),%r12d movl %r9d,%r13d addl %edi,%r12d movl %ebx,%r14d rorl $14,%r13d movl %r10d,%edi xorl %r9d,%r13d rorl $9,%r14d xorl %r11d,%edi movl %r12d,60(%rsp) xorl %ebx,%r14d andl %r9d,%edi rorl $5,%r13d addl %eax,%r12d xorl 
%r11d,%edi rorl $11,%r14d xorl %r9d,%r13d addl %edi,%r12d movl %ebx,%edi addl (%rbp),%r12d xorl %ebx,%r14d xorl %ecx,%edi rorl $6,%r13d movl %ecx,%eax andl %edi,%r15d rorl $2,%r14d addl %r13d,%r12d xorl %r15d,%eax addl %r12d,%r8d addl %r12d,%eax leaq 20(%rbp),%rbp cmpb $0,3(%rbp) jnz .Lrounds_16_xx movq 64+0(%rsp),%rdi addl %r14d,%eax leaq 64(%rsi),%rsi addl 0(%rdi),%eax addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d cmpq 64+16(%rsp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb .Lloop movq 88(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 .cfi_endproc .size sha256_block_data_order,.-sha256_block_data_order .align 64 .type K256,@object K256: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .type sha256_block_data_order_shaext,@function .align 64 sha256_block_data_order_shaext: _shaext_shortcut: .cfi_startproc leaq K256+128(%rip),%rcx movdqu (%rdi),%xmm1 movdqu 16(%rdi),%xmm2 movdqa 512-128(%rcx),%xmm7 pshufd $0x1b,%xmm1,%xmm0 pshufd $0xb1,%xmm1,%xmm1 pshufd 
$0x1b,%xmm2,%xmm2 movdqa %xmm7,%xmm8 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 jmp .Loop_shaext .align 16 .Loop_shaext: movdqu (%rsi),%xmm3 movdqu 16(%rsi),%xmm4 movdqu 32(%rsi),%xmm5 .byte 102,15,56,0,223 movdqu 48(%rsi),%xmm6 movdqa 0-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 102,15,56,0,231 movdqa %xmm2,%xmm10 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 nop movdqa %xmm1,%xmm9 .byte 15,56,203,202 movdqa 32-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 leaq 64(%rsi),%rsi .byte 15,56,204,220 .byte 15,56,203,202 movdqa 64-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 96-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 128-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 160-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 192-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 224-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 256-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 288-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 320-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 352-128(%rcx),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 384-128(%rcx),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 416-128(%rcx),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 paddd %xmm7,%xmm6 movdqa 448-128(%rcx),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 .byte 15,56,205,245 movdqa %xmm8,%xmm7 .byte 15,56,203,202 movdqa 480-128(%rcx),%xmm0 paddd %xmm6,%xmm0 nop .byte 15,56,203,209 pshufd $0x0e,%xmm0,%xmm0 decq %rdx nop .byte 15,56,203,202 paddd %xmm10,%xmm2 paddd %xmm9,%xmm1 jnz .Loop_shaext pshufd $0xb1,%xmm2,%xmm2 pshufd $0x1b,%xmm1,%xmm7 pshufd $0xb1,%xmm1,%xmm1 
punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 movdqu %xmm1,(%rdi) movdqu %xmm2,16(%rdi) .byte 0xf3,0xc3 .cfi_endproc .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext .type sha256_block_data_order_ssse3,@function .align 64 sha256_block_data_order_ssse3: .cfi_startproc .Lssse3_shortcut: movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 shlq $4,%rdx subq $96,%rsp leaq (%rsi,%rdx,4),%rdx andq $-64,%rsp movq %rdi,64+0(%rsp) movq %rsi,64+8(%rsp) movq %rdx,64+16(%rsp) movq %rax,88(%rsp) .cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 .Lprologue_ssse3: movl 0(%rdi),%eax movl 4(%rdi),%ebx movl 8(%rdi),%ecx movl 12(%rdi),%edx movl 16(%rdi),%r8d movl 20(%rdi),%r9d movl 24(%rdi),%r10d movl 28(%rdi),%r11d jmp .Lloop_ssse3 .align 16 .Lloop_ssse3: movdqa K256+512(%rip),%xmm7 movdqu 0(%rsi),%xmm0 movdqu 16(%rsi),%xmm1 movdqu 32(%rsi),%xmm2 .byte 102,15,56,0,199 movdqu 48(%rsi),%xmm3 leaq K256(%rip),%rbp .byte 102,15,56,0,207 movdqa 0(%rbp),%xmm4 movdqa 32(%rbp),%xmm5 .byte 102,15,56,0,215 paddd %xmm0,%xmm4 movdqa 64(%rbp),%xmm6 .byte 102,15,56,0,223 movdqa 96(%rbp),%xmm7 paddd %xmm1,%xmm5 paddd %xmm2,%xmm6 paddd %xmm3,%xmm7 movdqa %xmm4,0(%rsp) movl %eax,%r14d movdqa %xmm5,16(%rsp) movl %ebx,%edi movdqa %xmm6,32(%rsp) xorl %ecx,%edi movdqa %xmm7,48(%rsp) movl %r8d,%r13d jmp .Lssse3_00_47 .align 16 .Lssse3_00_47: subq $-128,%rbp rorl $14,%r13d movdqa %xmm1,%xmm4 movl %r14d,%eax movl %r9d,%r12d movdqa %xmm3,%xmm7 rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d .byte 102,15,58,15,224,4 andl %r8d,%r12d xorl %r8d,%r13d .byte 102,15,58,15,250,4 addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %ebx,%r15d addl %r12d,%r11d movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi paddd %xmm7,%xmm0 rorl $2,%r14d addl %r11d,%edx psrld $7,%xmm6 addl %edi,%r11d movl %edx,%r13d pshufd $250,%xmm3,%xmm7 addl %r11d,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%r11d movl %r8d,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %r11d,%r14d pxor %xmm5,%xmm4 andl %edx,%r12d xorl %edx,%r13d pslld $11,%xmm5 addl 4(%rsp),%r10d movl %r11d,%edi pxor %xmm6,%xmm4 xorl %r9d,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %eax,%edi addl %r12d,%r10d pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d psrld $10,%xmm7 addl %r13d,%r10d xorl %eax,%r15d paddd %xmm4,%xmm0 rorl $2,%r14d addl %r10d,%ecx psrlq $17,%xmm6 addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %ecx,%r13d xorl %r8d,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d pshufd $128,%xmm7,%xmm7 xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d psrldq $8,%xmm7 xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d paddd %xmm7,%xmm0 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d pshufd $80,%xmm0,%xmm7 xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx movdqa %xmm7,%xmm6 addl %edi,%r9d movl %ebx,%r13d psrld $10,%xmm7 addl %r9d,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%r9d movl %ecx,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d psrlq $2,%xmm6 andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d 
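# Editorial annotation (descriptive comment only, not part of the generated output):
# in the SSSE3 path the scalar SHA-256 rounds are interleaved with the message-schedule
# (sigma0/sigma1) updates for the next 16 words; the .byte 102,15,58,15,... and
# .byte 102,15,56,0,... sequences are hand-encoded palignr and pshufb instructions.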
pxor %xmm6,%xmm7 movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %r10d,%edi addl %r12d,%r8d movdqa 0(%rbp),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d paddd %xmm7,%xmm0 rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d paddd %xmm0,%xmm6 movl %eax,%r13d addl %r8d,%r14d movdqa %xmm6,0(%rsp) rorl $14,%r13d movdqa %xmm2,%xmm4 movl %r14d,%r8d movl %ebx,%r12d movdqa %xmm0,%xmm7 rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d .byte 102,15,58,15,225,4 andl %eax,%r12d xorl %eax,%r13d .byte 102,15,58,15,251,4 addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %r9d,%r15d addl %r12d,%edx movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi paddd %xmm7,%xmm1 rorl $2,%r14d addl %edx,%r11d psrld $7,%xmm6 addl %edi,%edx movl %r11d,%r13d pshufd $250,%xmm0,%xmm7 addl %edx,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%edx movl %eax,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %edx,%r14d pxor %xmm5,%xmm4 andl %r11d,%r12d xorl %r11d,%r13d pslld $11,%xmm5 addl 20(%rsp),%ecx movl %edx,%edi pxor %xmm6,%xmm4 xorl %ebx,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %r8d,%edi addl %r12d,%ecx pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d psrld $10,%xmm7 addl %r13d,%ecx xorl %r8d,%r15d paddd %xmm4,%xmm1 rorl $2,%r14d addl %ecx,%r10d psrlq $17,%xmm6 addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %r10d,%r13d xorl %eax,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d pshufd $128,%xmm7,%xmm7 xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d psrldq $8,%xmm7 xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d paddd %xmm7,%xmm1 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx pshufd $80,%xmm1,%xmm7 xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d movdqa %xmm7,%xmm6 addl %edi,%ebx movl %r9d,%r13d psrld $10,%xmm7 addl %ebx,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%ebx movl %r10d,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d psrlq $2,%xmm6 andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax pxor %xmm6,%xmm7 movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %ecx,%edi addl %r12d,%eax movdqa 32(%rbp),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d paddd %xmm7,%xmm1 rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax paddd %xmm1,%xmm6 movl %r8d,%r13d addl %eax,%r14d movdqa %xmm6,16(%rsp) rorl $14,%r13d movdqa %xmm3,%xmm4 movl %r14d,%eax movl %r9d,%r12d movdqa %xmm1,%xmm7 rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d .byte 102,15,58,15,226,4 andl %r8d,%r12d xorl %r8d,%r13d .byte 102,15,58,15,248,4 addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %ebx,%r15d addl %r12d,%r11d movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi paddd %xmm7,%xmm2 rorl $2,%r14d addl %r11d,%edx psrld $7,%xmm6 addl %edi,%r11d movl %edx,%r13d pshufd $250,%xmm1,%xmm7 addl %r11d,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%r11d movl %r8d,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %r11d,%r14d pxor %xmm5,%xmm4 andl %edx,%r12d 
xorl %edx,%r13d pslld $11,%xmm5 addl 36(%rsp),%r10d movl %r11d,%edi pxor %xmm6,%xmm4 xorl %r9d,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %eax,%edi addl %r12d,%r10d pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d psrld $10,%xmm7 addl %r13d,%r10d xorl %eax,%r15d paddd %xmm4,%xmm2 rorl $2,%r14d addl %r10d,%ecx psrlq $17,%xmm6 addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %ecx,%r13d xorl %r8d,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d pshufd $128,%xmm7,%xmm7 xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d psrldq $8,%xmm7 xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d paddd %xmm7,%xmm2 andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d pshufd $80,%xmm2,%xmm7 xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx movdqa %xmm7,%xmm6 addl %edi,%r9d movl %ebx,%r13d psrld $10,%xmm7 addl %r9d,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%r9d movl %ecx,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d psrlq $2,%xmm6 andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d pxor %xmm6,%xmm7 movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %r10d,%edi addl %r12d,%r8d movdqa 64(%rbp),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d paddd %xmm7,%xmm2 rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d paddd %xmm2,%xmm6 movl %eax,%r13d addl %r8d,%r14d movdqa %xmm6,32(%rsp) rorl $14,%r13d movdqa %xmm0,%xmm4 movl %r14d,%r8d movl %ebx,%r12d movdqa %xmm2,%xmm7 rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d .byte 102,15,58,15,227,4 andl %eax,%r12d xorl %eax,%r13d .byte 102,15,58,15,249,4 addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d movdqa %xmm4,%xmm5 xorl %r9d,%r15d addl %r12d,%edx movdqa %xmm4,%xmm6 rorl $6,%r13d andl %r15d,%edi psrld $3,%xmm4 xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi paddd %xmm7,%xmm3 rorl $2,%r14d addl %edx,%r11d psrld $7,%xmm6 addl %edi,%edx movl %r11d,%r13d pshufd $250,%xmm2,%xmm7 addl %edx,%r14d rorl $14,%r13d pslld $14,%xmm5 movl %r14d,%edx movl %eax,%r12d pxor %xmm6,%xmm4 rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d psrld $11,%xmm6 xorl %edx,%r14d pxor %xmm5,%xmm4 andl %r11d,%r12d xorl %r11d,%r13d pslld $11,%xmm5 addl 52(%rsp),%ecx movl %edx,%edi pxor %xmm6,%xmm4 xorl %ebx,%r12d rorl $11,%r14d movdqa %xmm7,%xmm6 xorl %r8d,%edi addl %r12d,%ecx pxor %xmm5,%xmm4 rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d psrld $10,%xmm7 addl %r13d,%ecx xorl %r8d,%r15d paddd %xmm4,%xmm3 rorl $2,%r14d addl %ecx,%r10d psrlq $17,%xmm6 addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d pxor %xmm6,%xmm7 rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d psrlq $2,%xmm6 xorl %r10d,%r13d xorl %eax,%r12d pxor %xmm6,%xmm7 rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d pshufd $128,%xmm7,%xmm7 xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d psrldq $8,%xmm7 xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d paddd %xmm7,%xmm3 andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx pshufd $80,%xmm3,%xmm7 xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d movdqa %xmm7,%xmm6 addl %edi,%ebx movl %r9d,%r13d psrld $10,%xmm7 addl %ebx,%r14d rorl $14,%r13d psrlq $17,%xmm6 movl %r14d,%ebx movl %r10d,%r12d pxor %xmm6,%xmm7 rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d psrlq $2,%xmm6 andl %r9d,%r12d xorl %r9d,%r13d addl 
60(%rsp),%eax pxor %xmm6,%xmm7 movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d pshufd $8,%xmm7,%xmm7 xorl %ecx,%edi addl %r12d,%eax movdqa 96(%rbp),%xmm6 rorl $6,%r13d andl %edi,%r15d pslldq $8,%xmm7 xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d paddd %xmm7,%xmm3 rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax paddd %xmm3,%xmm6 movl %r8d,%r13d addl %eax,%r14d movdqa %xmm6,48(%rsp) cmpb $0,131(%rbp) jne .Lssse3_00_47 rorl $14,%r13d movl %r14d,%eax movl %r9d,%r12d rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 0(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d xorl %ebx,%r15d addl %r12d,%r11d rorl $6,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi rorl $2,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d rorl $14,%r13d movl %r14d,%r11d movl %r8d,%r12d rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 4(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d rorl $11,%r14d xorl %eax,%edi addl %r12d,%r10d rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d rorl $2,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d xorl %ecx,%r13d xorl %r8d,%r12d rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 8(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d rorl $14,%r13d movl %r14d,%r9d movl %ecx,%r12d rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 12(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d xorl %r10d,%edi addl %r12d,%r8d rorl $6,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d rorl $14,%r13d movl %r14d,%r8d movl %ebx,%r12d rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 16(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d xorl %r9d,%r15d addl %r12d,%edx rorl $6,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi rorl $2,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d rorl $14,%r13d movl %r14d,%edx movl %eax,%r12d rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 20(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d rorl $11,%r14d xorl %r8d,%edi addl %r12d,%ecx rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d rorl $2,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d xorl %r10d,%r13d xorl %eax,%r12d rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 24(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d rorl $14,%r13d movl %r14d,%ebx movl %r10d,%r12d rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 28(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d xorl %ecx,%edi addl %r12d,%eax rorl 
$6,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d rorl $14,%r13d movl %r14d,%eax movl %r9d,%r12d rorl $9,%r14d xorl %r8d,%r13d xorl %r10d,%r12d rorl $5,%r13d xorl %eax,%r14d andl %r8d,%r12d xorl %r8d,%r13d addl 32(%rsp),%r11d movl %eax,%r15d xorl %r10d,%r12d rorl $11,%r14d xorl %ebx,%r15d addl %r12d,%r11d rorl $6,%r13d andl %r15d,%edi xorl %eax,%r14d addl %r13d,%r11d xorl %ebx,%edi rorl $2,%r14d addl %r11d,%edx addl %edi,%r11d movl %edx,%r13d addl %r11d,%r14d rorl $14,%r13d movl %r14d,%r11d movl %r8d,%r12d rorl $9,%r14d xorl %edx,%r13d xorl %r9d,%r12d rorl $5,%r13d xorl %r11d,%r14d andl %edx,%r12d xorl %edx,%r13d addl 36(%rsp),%r10d movl %r11d,%edi xorl %r9d,%r12d rorl $11,%r14d xorl %eax,%edi addl %r12d,%r10d rorl $6,%r13d andl %edi,%r15d xorl %r11d,%r14d addl %r13d,%r10d xorl %eax,%r15d rorl $2,%r14d addl %r10d,%ecx addl %r15d,%r10d movl %ecx,%r13d addl %r10d,%r14d rorl $14,%r13d movl %r14d,%r10d movl %edx,%r12d rorl $9,%r14d xorl %ecx,%r13d xorl %r8d,%r12d rorl $5,%r13d xorl %r10d,%r14d andl %ecx,%r12d xorl %ecx,%r13d addl 40(%rsp),%r9d movl %r10d,%r15d xorl %r8d,%r12d rorl $11,%r14d xorl %r11d,%r15d addl %r12d,%r9d rorl $6,%r13d andl %r15d,%edi xorl %r10d,%r14d addl %r13d,%r9d xorl %r11d,%edi rorl $2,%r14d addl %r9d,%ebx addl %edi,%r9d movl %ebx,%r13d addl %r9d,%r14d rorl $14,%r13d movl %r14d,%r9d movl %ecx,%r12d rorl $9,%r14d xorl %ebx,%r13d xorl %edx,%r12d rorl $5,%r13d xorl %r9d,%r14d andl %ebx,%r12d xorl %ebx,%r13d addl 44(%rsp),%r8d movl %r9d,%edi xorl %edx,%r12d rorl $11,%r14d xorl %r10d,%edi addl %r12d,%r8d rorl $6,%r13d andl %edi,%r15d xorl %r9d,%r14d addl %r13d,%r8d xorl %r10d,%r15d rorl $2,%r14d addl %r8d,%eax addl %r15d,%r8d movl %eax,%r13d addl %r8d,%r14d rorl $14,%r13d movl %r14d,%r8d movl %ebx,%r12d rorl $9,%r14d xorl %eax,%r13d xorl %ecx,%r12d rorl $5,%r13d xorl %r8d,%r14d andl %eax,%r12d xorl %eax,%r13d addl 48(%rsp),%edx movl %r8d,%r15d xorl %ecx,%r12d rorl $11,%r14d xorl %r9d,%r15d addl %r12d,%edx rorl $6,%r13d andl %r15d,%edi xorl %r8d,%r14d addl %r13d,%edx xorl %r9d,%edi rorl $2,%r14d addl %edx,%r11d addl %edi,%edx movl %r11d,%r13d addl %edx,%r14d rorl $14,%r13d movl %r14d,%edx movl %eax,%r12d rorl $9,%r14d xorl %r11d,%r13d xorl %ebx,%r12d rorl $5,%r13d xorl %edx,%r14d andl %r11d,%r12d xorl %r11d,%r13d addl 52(%rsp),%ecx movl %edx,%edi xorl %ebx,%r12d rorl $11,%r14d xorl %r8d,%edi addl %r12d,%ecx rorl $6,%r13d andl %edi,%r15d xorl %edx,%r14d addl %r13d,%ecx xorl %r8d,%r15d rorl $2,%r14d addl %ecx,%r10d addl %r15d,%ecx movl %r10d,%r13d addl %ecx,%r14d rorl $14,%r13d movl %r14d,%ecx movl %r11d,%r12d rorl $9,%r14d xorl %r10d,%r13d xorl %eax,%r12d rorl $5,%r13d xorl %ecx,%r14d andl %r10d,%r12d xorl %r10d,%r13d addl 56(%rsp),%ebx movl %ecx,%r15d xorl %eax,%r12d rorl $11,%r14d xorl %edx,%r15d addl %r12d,%ebx rorl $6,%r13d andl %r15d,%edi xorl %ecx,%r14d addl %r13d,%ebx xorl %edx,%edi rorl $2,%r14d addl %ebx,%r9d addl %edi,%ebx movl %r9d,%r13d addl %ebx,%r14d rorl $14,%r13d movl %r14d,%ebx movl %r10d,%r12d rorl $9,%r14d xorl %r9d,%r13d xorl %r11d,%r12d rorl $5,%r13d xorl %ebx,%r14d andl %r9d,%r12d xorl %r9d,%r13d addl 60(%rsp),%eax movl %ebx,%edi xorl %r11d,%r12d rorl $11,%r14d xorl %ecx,%edi addl %r12d,%eax rorl $6,%r13d andl %edi,%r15d xorl %ebx,%r14d addl %r13d,%eax xorl %ecx,%r15d rorl $2,%r14d addl %eax,%r8d addl %r15d,%eax movl %r8d,%r13d addl %eax,%r14d movq 64+0(%rsp),%rdi movl %r14d,%eax addl 0(%rdi),%eax leaq 64(%rsi),%rsi addl 4(%rdi),%ebx addl 8(%rdi),%ecx addl 
12(%rdi),%edx addl 16(%rdi),%r8d addl 20(%rdi),%r9d addl 24(%rdi),%r10d addl 28(%rdi),%r11d cmpq 64+16(%rsp),%rsi movl %eax,0(%rdi) movl %ebx,4(%rdi) movl %ecx,8(%rdi) movl %edx,12(%rdi) movl %r8d,16(%rdi) movl %r9d,20(%rdi) movl %r10d,24(%rdi) movl %r11d,28(%rdi) jb .Lloop_ssse3 movq 88(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue_ssse3: .byte 0xf3,0xc3 .cfi_endproc .size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 +.type sha256_block_data_order_avx,@function +.align 64 +sha256_block_data_order_avx: +.cfi_startproc +.Lavx_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl 
%eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor 
%xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + 
addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lavx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + 
xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl 
$5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + 
xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_avx + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_block_data_order_avx,.-sha256_block_data_order_avx +.type sha256_block_data_order_avx2,@function +.align 64 +sha256_block_data_order_avx2: +.cfi_startproc +.Lavx2_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $544,%rsp + shlq $4,%rdx + andq $-1024,%rsp + leaq (%rsi,%rdx,4),%rdx + addq $448,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx2: + + vzeroupper + subq $-64,%rsi + movl 0(%rdi),%eax + movq %rsi,%r12 + movl 4(%rdi),%ebx + cmpq %rdx,%rsi + movl 8(%rdi),%ecx + cmoveq %rsp,%r12 + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%ymm8 + vmovdqa K256+512+64(%rip),%ymm9 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqa K256+512(%rip),%ymm7 + vmovdqu -64+0(%rsi),%xmm0 + vmovdqu -64+16(%rsi),%xmm1 + vmovdqu -64+32(%rsi),%xmm2 + vmovdqu -64+48(%rsi),%xmm3 + + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm7,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm7,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + + leaq K256(%rip),%rbp + vpshufb %ymm7,%ymm2,%ymm2 + vpaddd 0(%rbp),%ymm0,%ymm4 + vpshufb %ymm7,%ymm3,%ymm3 + vpaddd 32(%rbp),%ymm1,%ymm5 + vpaddd 64(%rbp),%ymm2,%ymm6 + vpaddd 96(%rbp),%ymm3,%ymm7 + vmovdqa %ymm4,0(%rsp) + xorl %r14d,%r14d + vmovdqa %ymm5,32(%rsp) + + movq 88(%rsp),%rdi +.cfi_def_cfa %rdi,8 + leaq -64(%rsp),%rsp + + + + movq %rdi,-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + movl %ebx,%edi + vmovdqa %ymm6,0(%rsp) + xorl %ecx,%edi + vmovdqa %ymm7,32(%rsp) + movl %r9d,%r12d + subq $-32*4,%rbp + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: + leaq -64(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 + + pushq 64-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $4,%ymm0,%ymm1,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm2,%ymm3,%ymm7 + rorxl $11,%r8d,%r15d + leal 
(%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm0,%ymm0 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm3,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm0,%ymm0 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm0,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm0,%ymm0 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 0(%rbp),%ymm0,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm1,%ymm2,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm3,%ymm0,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm1,%ymm1 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm0,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl 
%r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm1,%ymm1 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm1,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm1,%ymm1 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 32(%rbp),%ymm1,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq -64(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x38,0x06,0x23,0x08 + + pushq 64-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $4,%ymm2,%ymm3,%ymm4 + addl 0+128(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + vpalignr $4,%ymm0,%ymm1,%ymm7 + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + vpsrld $7,%ymm4,%ymm6 + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + vpaddd %ymm7,%ymm2,%ymm2 + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + vpshufd $250,%ymm1,%ymm7 + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 4+128(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + vpslld $11,%ymm5,%ymm5 + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + vpxor 
%ymm5,%ymm4,%ymm4 + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + vpaddd %ymm4,%ymm2,%ymm2 + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 8+128(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + vpxor %ymm7,%ymm6,%ymm6 + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + vpshufd $80,%ymm2,%ymm7 + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 12+128(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + vpxor %ymm7,%ymm6,%ymm6 + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + vpaddd %ymm6,%ymm2,%ymm2 + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + vpaddd 64(%rbp),%ymm2,%ymm6 + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + vmovdqa %ymm6,0(%rsp) + vpalignr $4,%ymm3,%ymm0,%ymm4 + addl 32+128(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + vpalignr $4,%ymm1,%ymm2,%ymm7 + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + vpsrld $7,%ymm4,%ymm6 + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + vpaddd %ymm7,%ymm3,%ymm3 + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + vpsrld $3,%ymm4,%ymm7 + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + vpslld $14,%ymm4,%ymm5 + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + vpxor %ymm6,%ymm7,%ymm4 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + vpshufd $250,%ymm2,%ymm7 + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + vpsrld $11,%ymm6,%ymm6 + addl 36+128(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + vpxor %ymm5,%ymm4,%ymm4 + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + vpslld $11,%ymm5,%ymm5 + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + vpxor %ymm6,%ymm4,%ymm4 + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + vpsrld $10,%ymm7,%ymm6 + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + vpxor %ymm5,%ymm4,%ymm4 + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + vpsrlq $17,%ymm7,%ymm7 + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + vpaddd %ymm4,%ymm3,%ymm3 + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 40+128(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + vpxor %ymm7,%ymm6,%ymm6 + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + vpshufb %ymm8,%ymm6,%ymm6 + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + 
vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + vpshufd $80,%ymm3,%ymm7 + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + vpsrld $10,%ymm7,%ymm6 + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + vpsrlq $17,%ymm7,%ymm7 + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + vpxor %ymm7,%ymm6,%ymm6 + addl 44+128(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + vpsrlq $2,%ymm7,%ymm7 + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + vpxor %ymm7,%ymm6,%ymm6 + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + vpshufb %ymm9,%ymm6,%ymm6 + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + vpaddd %ymm6,%ymm3,%ymm3 + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + vpaddd 96(%rbp),%ymm3,%ymm6 + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + vmovdqa %ymm6,32(%rsp) + leaq 128(%rbp),%rbp + cmpb $0,3(%rbp) + jne .Lavx2_00_47 + addl 0+64(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+64(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+64(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+64(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+64(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + 
rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+64(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+64(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+64(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + addl 0(%rsp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4(%rsp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl $11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8(%rsp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12(%rsp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal 
(%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32(%rsp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36(%rsp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40(%rsp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44(%rsp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + leal (%rax,%r15,1),%eax + movl %r9d,%r12d + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rbp + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + cmpq 80(%rbp),%rsi + je .Ldone_avx2 + + xorl %r14d,%r14d + movl %ebx,%edi + xorl %ecx,%edi + movl %r9d,%r12d + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + addl 0+16(%rbp),%r11d + andl %r8d,%r12d + rorxl $25,%r8d,%r13d + rorxl $11,%r8d,%r15d + leal (%rax,%r14,1),%eax + leal (%r11,%r12,1),%r11d + andnl %r10d,%r8d,%r12d + xorl %r15d,%r13d + rorxl $6,%r8d,%r14d + leal (%r11,%r12,1),%r11d + xorl %r14d,%r13d + movl %eax,%r15d + rorxl $22,%eax,%r12d + leal (%r11,%r13,1),%r11d + xorl %ebx,%r15d + rorxl $13,%eax,%r14d + rorxl $2,%eax,%r13d + leal (%rdx,%r11,1),%edx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %ebx,%edi + xorl %r13d,%r14d + leal (%r11,%rdi,1),%r11d + movl %r8d,%r12d + addl 4+16(%rbp),%r10d + andl %edx,%r12d + rorxl $25,%edx,%r13d + rorxl 
$11,%edx,%edi + leal (%r11,%r14,1),%r11d + leal (%r10,%r12,1),%r10d + andnl %r9d,%edx,%r12d + xorl %edi,%r13d + rorxl $6,%edx,%r14d + leal (%r10,%r12,1),%r10d + xorl %r14d,%r13d + movl %r11d,%edi + rorxl $22,%r11d,%r12d + leal (%r10,%r13,1),%r10d + xorl %eax,%edi + rorxl $13,%r11d,%r14d + rorxl $2,%r11d,%r13d + leal (%rcx,%r10,1),%ecx + andl %edi,%r15d + xorl %r12d,%r14d + xorl %eax,%r15d + xorl %r13d,%r14d + leal (%r10,%r15,1),%r10d + movl %edx,%r12d + addl 8+16(%rbp),%r9d + andl %ecx,%r12d + rorxl $25,%ecx,%r13d + rorxl $11,%ecx,%r15d + leal (%r10,%r14,1),%r10d + leal (%r9,%r12,1),%r9d + andnl %r8d,%ecx,%r12d + xorl %r15d,%r13d + rorxl $6,%ecx,%r14d + leal (%r9,%r12,1),%r9d + xorl %r14d,%r13d + movl %r10d,%r15d + rorxl $22,%r10d,%r12d + leal (%r9,%r13,1),%r9d + xorl %r11d,%r15d + rorxl $13,%r10d,%r14d + rorxl $2,%r10d,%r13d + leal (%rbx,%r9,1),%ebx + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r11d,%edi + xorl %r13d,%r14d + leal (%r9,%rdi,1),%r9d + movl %ecx,%r12d + addl 12+16(%rbp),%r8d + andl %ebx,%r12d + rorxl $25,%ebx,%r13d + rorxl $11,%ebx,%edi + leal (%r9,%r14,1),%r9d + leal (%r8,%r12,1),%r8d + andnl %edx,%ebx,%r12d + xorl %edi,%r13d + rorxl $6,%ebx,%r14d + leal (%r8,%r12,1),%r8d + xorl %r14d,%r13d + movl %r9d,%edi + rorxl $22,%r9d,%r12d + leal (%r8,%r13,1),%r8d + xorl %r10d,%edi + rorxl $13,%r9d,%r14d + rorxl $2,%r9d,%r13d + leal (%rax,%r8,1),%eax + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r10d,%r15d + xorl %r13d,%r14d + leal (%r8,%r15,1),%r8d + movl %ebx,%r12d + addl 32+16(%rbp),%edx + andl %eax,%r12d + rorxl $25,%eax,%r13d + rorxl $11,%eax,%r15d + leal (%r8,%r14,1),%r8d + leal (%rdx,%r12,1),%edx + andnl %ecx,%eax,%r12d + xorl %r15d,%r13d + rorxl $6,%eax,%r14d + leal (%rdx,%r12,1),%edx + xorl %r14d,%r13d + movl %r8d,%r15d + rorxl $22,%r8d,%r12d + leal (%rdx,%r13,1),%edx + xorl %r9d,%r15d + rorxl $13,%r8d,%r14d + rorxl $2,%r8d,%r13d + leal (%r11,%rdx,1),%r11d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %r9d,%edi + xorl %r13d,%r14d + leal (%rdx,%rdi,1),%edx + movl %eax,%r12d + addl 36+16(%rbp),%ecx + andl %r11d,%r12d + rorxl $25,%r11d,%r13d + rorxl $11,%r11d,%edi + leal (%rdx,%r14,1),%edx + leal (%rcx,%r12,1),%ecx + andnl %ebx,%r11d,%r12d + xorl %edi,%r13d + rorxl $6,%r11d,%r14d + leal (%rcx,%r12,1),%ecx + xorl %r14d,%r13d + movl %edx,%edi + rorxl $22,%edx,%r12d + leal (%rcx,%r13,1),%ecx + xorl %r8d,%edi + rorxl $13,%edx,%r14d + rorxl $2,%edx,%r13d + leal (%r10,%rcx,1),%r10d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %r8d,%r15d + xorl %r13d,%r14d + leal (%rcx,%r15,1),%ecx + movl %r11d,%r12d + addl 40+16(%rbp),%ebx + andl %r10d,%r12d + rorxl $25,%r10d,%r13d + rorxl $11,%r10d,%r15d + leal (%rcx,%r14,1),%ecx + leal (%rbx,%r12,1),%ebx + andnl %eax,%r10d,%r12d + xorl %r15d,%r13d + rorxl $6,%r10d,%r14d + leal (%rbx,%r12,1),%ebx + xorl %r14d,%r13d + movl %ecx,%r15d + rorxl $22,%ecx,%r12d + leal (%rbx,%r13,1),%ebx + xorl %edx,%r15d + rorxl $13,%ecx,%r14d + rorxl $2,%ecx,%r13d + leal (%r9,%rbx,1),%r9d + andl %r15d,%edi + xorl %r12d,%r14d + xorl %edx,%edi + xorl %r13d,%r14d + leal (%rbx,%rdi,1),%ebx + movl %r10d,%r12d + addl 44+16(%rbp),%eax + andl %r9d,%r12d + rorxl $25,%r9d,%r13d + rorxl $11,%r9d,%edi + leal (%rbx,%r14,1),%ebx + leal (%rax,%r12,1),%eax + andnl %r11d,%r9d,%r12d + xorl %edi,%r13d + rorxl $6,%r9d,%r14d + leal (%rax,%r12,1),%eax + xorl %r14d,%r13d + movl %ebx,%edi + rorxl $22,%ebx,%r12d + leal (%rax,%r13,1),%eax + xorl %ecx,%edi + rorxl $13,%ebx,%r14d + rorxl $2,%ebx,%r13d + leal (%r8,%rax,1),%r8d + andl %edi,%r15d + xorl %r12d,%r14d + xorl %ecx,%r15d + xorl %r13d,%r14d + 
leal (%rax,%r15,1),%eax + movl %r9d,%r12d + leaq -64(%rbp),%rbp + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 512(%rsp),%rdi + addl %r14d,%eax + + leaq 448(%rsp),%rsp + +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + leaq 128(%rsi),%rsi + addl 24(%rdi),%r10d + movq %rsi,%r12 + addl 28(%rdi),%r11d + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + cmoveq %rsp,%r12 + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + + +.cfi_escape 0x0f,0x06,0x76,0xd8,0x00,0x06,0x23,0x08 + +.Ldone_avx2: + movq 88(%rbp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha256_block_data_order_avx2,.-sha256_block_data_order_avx2 Index: stable/12/secure/lib/libcrypto/amd64/sha512-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/sha512-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/sha512-x86_64.S (revision 364963) @@ -1,1803 +1,5463 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from sha512-x86_64.pl. */ .text .globl sha512_block_data_order .type sha512_block_data_order,@function .align 16 sha512_block_data_order: .cfi_startproc + leaq OPENSSL_ia32cap_P(%rip),%r11 + movl 0(%r11),%r9d + movl 4(%r11),%r10d + movl 8(%r11),%r11d + testl $2048,%r10d + jnz .Lxop_shortcut + andl $296,%r11d + cmpl $296,%r11d + je .Lavx2_shortcut + andl $1073741824,%r9d + andl $268435968,%r10d + orl %r9d,%r10d + cmpl $1342177792,%r10d + je .Lavx_shortcut movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 shlq $4,%rdx subq $128+32,%rsp leaq (%rsi,%rdx,8),%rdx andq $-64,%rsp movq %rdi,128+0(%rsp) movq %rsi,128+8(%rsp) movq %rdx,128+16(%rsp) movq %rax,152(%rsp) .cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 .Lprologue: movq 0(%rdi),%rax movq 8(%rdi),%rbx movq 16(%rdi),%rcx movq 24(%rdi),%rdx movq 32(%rdi),%r8 movq 40(%rdi),%r9 movq 48(%rdi),%r10 movq 56(%rdi),%r11 jmp .Lloop .align 16 .Lloop: movq %rbx,%rdi leaq K512(%rip),%rbp xorq %rcx,%rdi movq 0(%rsi),%r12 movq %r8,%r13 movq %rax,%r14 bswapq %r12 rorq $23,%r13 movq %r9,%r15 xorq %r8,%r13 rorq $5,%r14 xorq %r10,%r15 movq %r12,0(%rsp) xorq %rax,%r14 andq %r8,%r15 rorq $4,%r13 addq %r11,%r12 xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 addq %r15,%r12 movq %rax,%r15 addq (%rbp),%r12 xorq %rax,%r14 xorq %rbx,%r15 rorq $14,%r13 movq %rbx,%r11 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 leaq 8(%rbp),%rbp addq %r14,%r11 movq 8(%rsi),%r12 movq %rdx,%r13 movq %r11,%r14 bswapq %r12 rorq $23,%r13 movq %r8,%rdi xorq %rdx,%r13 rorq $5,%r14 xorq %r9,%rdi movq %r12,8(%rsp) xorq %r11,%r14 andq %rdx,%rdi rorq $4,%r13 addq %r10,%r12 xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 addq %rdi,%r12 movq %r11,%rdi addq (%rbp),%r12 xorq %r11,%r14 xorq %rax,%rdi rorq $14,%r13 movq %rax,%r10 andq 
%rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 leaq 24(%rbp),%rbp addq %r14,%r10 movq 16(%rsi),%r12 movq %rcx,%r13 movq %r10,%r14 bswapq %r12 rorq $23,%r13 movq %rdx,%r15 xorq %rcx,%r13 rorq $5,%r14 xorq %r8,%r15 movq %r12,16(%rsp) xorq %r10,%r14 andq %rcx,%r15 rorq $4,%r13 addq %r9,%r12 xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 addq %r15,%r12 movq %r10,%r15 addq (%rbp),%r12 xorq %r10,%r14 xorq %r11,%r15 rorq $14,%r13 movq %r11,%r9 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 leaq 8(%rbp),%rbp addq %r14,%r9 movq 24(%rsi),%r12 movq %rbx,%r13 movq %r9,%r14 bswapq %r12 rorq $23,%r13 movq %rcx,%rdi xorq %rbx,%r13 rorq $5,%r14 xorq %rdx,%rdi movq %r12,24(%rsp) xorq %r9,%r14 andq %rbx,%rdi rorq $4,%r13 addq %r8,%r12 xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 addq %rdi,%r12 movq %r9,%rdi addq (%rbp),%r12 xorq %r9,%r14 xorq %r10,%rdi rorq $14,%r13 movq %r10,%r8 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 leaq 24(%rbp),%rbp addq %r14,%r8 movq 32(%rsi),%r12 movq %rax,%r13 movq %r8,%r14 bswapq %r12 rorq $23,%r13 movq %rbx,%r15 xorq %rax,%r13 rorq $5,%r14 xorq %rcx,%r15 movq %r12,32(%rsp) xorq %r8,%r14 andq %rax,%r15 rorq $4,%r13 addq %rdx,%r12 xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 addq %r15,%r12 movq %r8,%r15 addq (%rbp),%r12 xorq %r8,%r14 xorq %r9,%r15 rorq $14,%r13 movq %r9,%rdx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx leaq 8(%rbp),%rbp addq %r14,%rdx movq 40(%rsi),%r12 movq %r11,%r13 movq %rdx,%r14 bswapq %r12 rorq $23,%r13 movq %rax,%rdi xorq %r11,%r13 rorq $5,%r14 xorq %rbx,%rdi movq %r12,40(%rsp) xorq %rdx,%r14 andq %r11,%rdi rorq $4,%r13 addq %rcx,%r12 xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 addq %rdi,%r12 movq %rdx,%rdi addq (%rbp),%r12 xorq %rdx,%r14 xorq %r8,%rdi rorq $14,%r13 movq %r8,%rcx andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx leaq 24(%rbp),%rbp addq %r14,%rcx movq 48(%rsi),%r12 movq %r10,%r13 movq %rcx,%r14 bswapq %r12 rorq $23,%r13 movq %r11,%r15 xorq %r10,%r13 rorq $5,%r14 xorq %rax,%r15 movq %r12,48(%rsp) xorq %rcx,%r14 andq %r10,%r15 rorq $4,%r13 addq %rbx,%r12 xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 addq %r15,%r12 movq %rcx,%r15 addq (%rbp),%r12 xorq %rcx,%r14 xorq %rdx,%r15 rorq $14,%r13 movq %rdx,%rbx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx leaq 8(%rbp),%rbp addq %r14,%rbx movq 56(%rsi),%r12 movq %r9,%r13 movq %rbx,%r14 bswapq %r12 rorq $23,%r13 movq %r10,%rdi xorq %r9,%r13 rorq $5,%r14 xorq %r11,%rdi movq %r12,56(%rsp) xorq %rbx,%r14 andq %r9,%rdi rorq $4,%r13 addq %rax,%r12 xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 addq %rdi,%r12 movq %rbx,%rdi addq (%rbp),%r12 xorq %rbx,%r14 xorq %rcx,%rdi rorq $14,%r13 movq %rcx,%rax andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rax addq %r12,%r8 addq %r12,%rax leaq 24(%rbp),%rbp addq %r14,%rax movq 64(%rsi),%r12 movq %r8,%r13 movq %rax,%r14 bswapq %r12 rorq $23,%r13 movq %r9,%r15 xorq %r8,%r13 rorq $5,%r14 xorq %r10,%r15 movq %r12,64(%rsp) xorq %rax,%r14 andq %r8,%r15 rorq $4,%r13 addq %r11,%r12 xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 addq %r15,%r12 movq %rax,%r15 addq (%rbp),%r12 xorq %rax,%r14 xorq %rbx,%r15 rorq $14,%r13 movq %rbx,%r11 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 leaq 8(%rbp),%rbp addq %r14,%r11 movq 72(%rsi),%r12 movq %rdx,%r13 movq %r11,%r14 bswapq %r12 rorq $23,%r13 movq %r8,%rdi xorq 
%rdx,%r13 rorq $5,%r14 xorq %r9,%rdi movq %r12,72(%rsp) xorq %r11,%r14 andq %rdx,%rdi rorq $4,%r13 addq %r10,%r12 xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 addq %rdi,%r12 movq %r11,%rdi addq (%rbp),%r12 xorq %r11,%r14 xorq %rax,%rdi rorq $14,%r13 movq %rax,%r10 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 leaq 24(%rbp),%rbp addq %r14,%r10 movq 80(%rsi),%r12 movq %rcx,%r13 movq %r10,%r14 bswapq %r12 rorq $23,%r13 movq %rdx,%r15 xorq %rcx,%r13 rorq $5,%r14 xorq %r8,%r15 movq %r12,80(%rsp) xorq %r10,%r14 andq %rcx,%r15 rorq $4,%r13 addq %r9,%r12 xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 addq %r15,%r12 movq %r10,%r15 addq (%rbp),%r12 xorq %r10,%r14 xorq %r11,%r15 rorq $14,%r13 movq %r11,%r9 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 leaq 8(%rbp),%rbp addq %r14,%r9 movq 88(%rsi),%r12 movq %rbx,%r13 movq %r9,%r14 bswapq %r12 rorq $23,%r13 movq %rcx,%rdi xorq %rbx,%r13 rorq $5,%r14 xorq %rdx,%rdi movq %r12,88(%rsp) xorq %r9,%r14 andq %rbx,%rdi rorq $4,%r13 addq %r8,%r12 xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 addq %rdi,%r12 movq %r9,%rdi addq (%rbp),%r12 xorq %r9,%r14 xorq %r10,%rdi rorq $14,%r13 movq %r10,%r8 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 leaq 24(%rbp),%rbp addq %r14,%r8 movq 96(%rsi),%r12 movq %rax,%r13 movq %r8,%r14 bswapq %r12 rorq $23,%r13 movq %rbx,%r15 xorq %rax,%r13 rorq $5,%r14 xorq %rcx,%r15 movq %r12,96(%rsp) xorq %r8,%r14 andq %rax,%r15 rorq $4,%r13 addq %rdx,%r12 xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 addq %r15,%r12 movq %r8,%r15 addq (%rbp),%r12 xorq %r8,%r14 xorq %r9,%r15 rorq $14,%r13 movq %r9,%rdx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx leaq 8(%rbp),%rbp addq %r14,%rdx movq 104(%rsi),%r12 movq %r11,%r13 movq %rdx,%r14 bswapq %r12 rorq $23,%r13 movq %rax,%rdi xorq %r11,%r13 rorq $5,%r14 xorq %rbx,%rdi movq %r12,104(%rsp) xorq %rdx,%r14 andq %r11,%rdi rorq $4,%r13 addq %rcx,%r12 xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 addq %rdi,%r12 movq %rdx,%rdi addq (%rbp),%r12 xorq %rdx,%r14 xorq %r8,%rdi rorq $14,%r13 movq %r8,%rcx andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx leaq 24(%rbp),%rbp addq %r14,%rcx movq 112(%rsi),%r12 movq %r10,%r13 movq %rcx,%r14 bswapq %r12 rorq $23,%r13 movq %r11,%r15 xorq %r10,%r13 rorq $5,%r14 xorq %rax,%r15 movq %r12,112(%rsp) xorq %rcx,%r14 andq %r10,%r15 rorq $4,%r13 addq %rbx,%r12 xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 addq %r15,%r12 movq %rcx,%r15 addq (%rbp),%r12 xorq %rcx,%r14 xorq %rdx,%r15 rorq $14,%r13 movq %rdx,%rbx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx leaq 8(%rbp),%rbp addq %r14,%rbx movq 120(%rsi),%r12 movq %r9,%r13 movq %rbx,%r14 bswapq %r12 rorq $23,%r13 movq %r10,%rdi xorq %r9,%r13 rorq $5,%r14 xorq %r11,%rdi movq %r12,120(%rsp) xorq %rbx,%r14 andq %r9,%rdi rorq $4,%r13 addq %rax,%r12 xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 addq %rdi,%r12 movq %rbx,%rdi addq (%rbp),%r12 xorq %rbx,%r14 xorq %rcx,%rdi rorq $14,%r13 movq %rcx,%rax andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rax addq %r12,%r8 addq %r12,%rax leaq 24(%rbp),%rbp jmp .Lrounds_16_xx .align 16 .Lrounds_16_xx: movq 8(%rsp),%r13 movq 112(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%rax movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 72(%rsp),%r12 addq 0(%rsp),%r12 movq %r8,%r13 
addq %r15,%r12 movq %rax,%r14 rorq $23,%r13 movq %r9,%r15 xorq %r8,%r13 rorq $5,%r14 xorq %r10,%r15 movq %r12,0(%rsp) xorq %rax,%r14 andq %r8,%r15 rorq $4,%r13 addq %r11,%r12 xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 addq %r15,%r12 movq %rax,%r15 addq (%rbp),%r12 xorq %rax,%r14 xorq %rbx,%r15 rorq $14,%r13 movq %rbx,%r11 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 leaq 8(%rbp),%rbp movq 16(%rsp),%r13 movq 120(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%r11 movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 80(%rsp),%r12 addq 8(%rsp),%r12 movq %rdx,%r13 addq %rdi,%r12 movq %r11,%r14 rorq $23,%r13 movq %r8,%rdi xorq %rdx,%r13 rorq $5,%r14 xorq %r9,%rdi movq %r12,8(%rsp) xorq %r11,%r14 andq %rdx,%rdi rorq $4,%r13 addq %r10,%r12 xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 addq %rdi,%r12 movq %r11,%rdi addq (%rbp),%r12 xorq %r11,%r14 xorq %rax,%rdi rorq $14,%r13 movq %rax,%r10 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 leaq 24(%rbp),%rbp movq 24(%rsp),%r13 movq 0(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%r10 movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 88(%rsp),%r12 addq 16(%rsp),%r12 movq %rcx,%r13 addq %r15,%r12 movq %r10,%r14 rorq $23,%r13 movq %rdx,%r15 xorq %rcx,%r13 rorq $5,%r14 xorq %r8,%r15 movq %r12,16(%rsp) xorq %r10,%r14 andq %rcx,%r15 rorq $4,%r13 addq %r9,%r12 xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 addq %r15,%r12 movq %r10,%r15 addq (%rbp),%r12 xorq %r10,%r14 xorq %r11,%r15 rorq $14,%r13 movq %r11,%r9 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 leaq 8(%rbp),%rbp movq 32(%rsp),%r13 movq 8(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%r9 movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 96(%rsp),%r12 addq 24(%rsp),%r12 movq %rbx,%r13 addq %rdi,%r12 movq %r9,%r14 rorq $23,%r13 movq %rcx,%rdi xorq %rbx,%r13 rorq $5,%r14 xorq %rdx,%rdi movq %r12,24(%rsp) xorq %r9,%r14 andq %rbx,%rdi rorq $4,%r13 addq %r8,%r12 xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 addq %rdi,%r12 movq %r9,%rdi addq (%rbp),%r12 xorq %r9,%r14 xorq %r10,%rdi rorq $14,%r13 movq %r10,%r8 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 leaq 24(%rbp),%rbp movq 40(%rsp),%r13 movq 16(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%r8 movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 104(%rsp),%r12 addq 32(%rsp),%r12 movq %rax,%r13 addq %r15,%r12 movq %r8,%r14 rorq $23,%r13 movq %rbx,%r15 xorq %rax,%r13 rorq $5,%r14 xorq %rcx,%r15 movq %r12,32(%rsp) xorq %r8,%r14 andq %rax,%r15 rorq $4,%r13 addq %rdx,%r12 xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 addq %r15,%r12 movq %r8,%r15 addq (%rbp),%r12 xorq %r8,%r14 xorq %r9,%r15 rorq $14,%r13 movq %r9,%rdx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx leaq 8(%rbp),%rbp movq 48(%rsp),%r13 movq 24(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%rdx movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 112(%rsp),%r12 addq 40(%rsp),%r12 movq %r11,%r13 addq %rdi,%r12 movq 
%rdx,%r14 rorq $23,%r13 movq %rax,%rdi xorq %r11,%r13 rorq $5,%r14 xorq %rbx,%rdi movq %r12,40(%rsp) xorq %rdx,%r14 andq %r11,%rdi rorq $4,%r13 addq %rcx,%r12 xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 addq %rdi,%r12 movq %rdx,%rdi addq (%rbp),%r12 xorq %rdx,%r14 xorq %r8,%rdi rorq $14,%r13 movq %r8,%rcx andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx leaq 24(%rbp),%rbp movq 56(%rsp),%r13 movq 32(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%rcx movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 120(%rsp),%r12 addq 48(%rsp),%r12 movq %r10,%r13 addq %r15,%r12 movq %rcx,%r14 rorq $23,%r13 movq %r11,%r15 xorq %r10,%r13 rorq $5,%r14 xorq %rax,%r15 movq %r12,48(%rsp) xorq %rcx,%r14 andq %r10,%r15 rorq $4,%r13 addq %rbx,%r12 xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 addq %r15,%r12 movq %rcx,%r15 addq (%rbp),%r12 xorq %rcx,%r14 xorq %rdx,%r15 rorq $14,%r13 movq %rdx,%rbx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx leaq 8(%rbp),%rbp movq 64(%rsp),%r13 movq 40(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%rbx movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 0(%rsp),%r12 addq 56(%rsp),%r12 movq %r9,%r13 addq %rdi,%r12 movq %rbx,%r14 rorq $23,%r13 movq %r10,%rdi xorq %r9,%r13 rorq $5,%r14 xorq %r11,%rdi movq %r12,56(%rsp) xorq %rbx,%r14 andq %r9,%rdi rorq $4,%r13 addq %rax,%r12 xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 addq %rdi,%r12 movq %rbx,%rdi addq (%rbp),%r12 xorq %rbx,%r14 xorq %rcx,%rdi rorq $14,%r13 movq %rcx,%rax andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rax addq %r12,%r8 addq %r12,%rax leaq 24(%rbp),%rbp movq 72(%rsp),%r13 movq 48(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%rax movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 8(%rsp),%r12 addq 64(%rsp),%r12 movq %r8,%r13 addq %r15,%r12 movq %rax,%r14 rorq $23,%r13 movq %r9,%r15 xorq %r8,%r13 rorq $5,%r14 xorq %r10,%r15 movq %r12,64(%rsp) xorq %rax,%r14 andq %r8,%r15 rorq $4,%r13 addq %r11,%r12 xorq %r10,%r15 rorq $6,%r14 xorq %r8,%r13 addq %r15,%r12 movq %rax,%r15 addq (%rbp),%r12 xorq %rax,%r14 xorq %rbx,%r15 rorq $14,%r13 movq %rbx,%r11 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r11 addq %r12,%rdx addq %r12,%r11 leaq 8(%rbp),%rbp movq 80(%rsp),%r13 movq 56(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%r11 movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 16(%rsp),%r12 addq 72(%rsp),%r12 movq %rdx,%r13 addq %rdi,%r12 movq %r11,%r14 rorq $23,%r13 movq %r8,%rdi xorq %rdx,%r13 rorq $5,%r14 xorq %r9,%rdi movq %r12,72(%rsp) xorq %r11,%r14 andq %rdx,%rdi rorq $4,%r13 addq %r10,%r12 xorq %r9,%rdi rorq $6,%r14 xorq %rdx,%r13 addq %rdi,%r12 movq %r11,%rdi addq (%rbp),%r12 xorq %r11,%r14 xorq %rax,%rdi rorq $14,%r13 movq %rax,%r10 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r10 addq %r12,%rcx addq %r12,%r10 leaq 24(%rbp),%rbp movq 88(%rsp),%r13 movq 64(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%r10 movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 24(%rsp),%r12 addq 80(%rsp),%r12 movq %rcx,%r13 addq %r15,%r12 movq %r10,%r14 
rorq $23,%r13 movq %rdx,%r15 xorq %rcx,%r13 rorq $5,%r14 xorq %r8,%r15 movq %r12,80(%rsp) xorq %r10,%r14 andq %rcx,%r15 rorq $4,%r13 addq %r9,%r12 xorq %r8,%r15 rorq $6,%r14 xorq %rcx,%r13 addq %r15,%r12 movq %r10,%r15 addq (%rbp),%r12 xorq %r10,%r14 xorq %r11,%r15 rorq $14,%r13 movq %r11,%r9 andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%r9 addq %r12,%rbx addq %r12,%r9 leaq 8(%rbp),%rbp movq 96(%rsp),%r13 movq 72(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%r9 movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 32(%rsp),%r12 addq 88(%rsp),%r12 movq %rbx,%r13 addq %rdi,%r12 movq %r9,%r14 rorq $23,%r13 movq %rcx,%rdi xorq %rbx,%r13 rorq $5,%r14 xorq %rdx,%rdi movq %r12,88(%rsp) xorq %r9,%r14 andq %rbx,%rdi rorq $4,%r13 addq %r8,%r12 xorq %rdx,%rdi rorq $6,%r14 xorq %rbx,%r13 addq %rdi,%r12 movq %r9,%rdi addq (%rbp),%r12 xorq %r9,%r14 xorq %r10,%rdi rorq $14,%r13 movq %r10,%r8 andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%r8 addq %r12,%rax addq %r12,%r8 leaq 24(%rbp),%rbp movq 104(%rsp),%r13 movq 80(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%r8 movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 40(%rsp),%r12 addq 96(%rsp),%r12 movq %rax,%r13 addq %r15,%r12 movq %r8,%r14 rorq $23,%r13 movq %rbx,%r15 xorq %rax,%r13 rorq $5,%r14 xorq %rcx,%r15 movq %r12,96(%rsp) xorq %r8,%r14 andq %rax,%r15 rorq $4,%r13 addq %rdx,%r12 xorq %rcx,%r15 rorq $6,%r14 xorq %rax,%r13 addq %r15,%r12 movq %r8,%r15 addq (%rbp),%r12 xorq %r8,%r14 xorq %r9,%r15 rorq $14,%r13 movq %r9,%rdx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rdx addq %r12,%r11 addq %r12,%rdx leaq 8(%rbp),%rbp movq 112(%rsp),%r13 movq 88(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%rdx movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 48(%rsp),%r12 addq 104(%rsp),%r12 movq %r11,%r13 addq %rdi,%r12 movq %rdx,%r14 rorq $23,%r13 movq %rax,%rdi xorq %r11,%r13 rorq $5,%r14 xorq %rbx,%rdi movq %r12,104(%rsp) xorq %rdx,%r14 andq %r11,%rdi rorq $4,%r13 addq %rcx,%r12 xorq %rbx,%rdi rorq $6,%r14 xorq %r11,%r13 addq %rdi,%r12 movq %rdx,%rdi addq (%rbp),%r12 xorq %rdx,%r14 xorq %r8,%rdi rorq $14,%r13 movq %r8,%rcx andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rcx addq %r12,%r10 addq %r12,%rcx leaq 24(%rbp),%rbp movq 120(%rsp),%r13 movq 96(%rsp),%r15 movq %r13,%r12 rorq $7,%r13 addq %r14,%rcx movq %r15,%r14 rorq $42,%r15 xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%r15 shrq $6,%r14 rorq $19,%r15 xorq %r13,%r12 xorq %r14,%r15 addq 56(%rsp),%r12 addq 112(%rsp),%r12 movq %r10,%r13 addq %r15,%r12 movq %rcx,%r14 rorq $23,%r13 movq %r11,%r15 xorq %r10,%r13 rorq $5,%r14 xorq %rax,%r15 movq %r12,112(%rsp) xorq %rcx,%r14 andq %r10,%r15 rorq $4,%r13 addq %rbx,%r12 xorq %rax,%r15 rorq $6,%r14 xorq %r10,%r13 addq %r15,%r12 movq %rcx,%r15 addq (%rbp),%r12 xorq %rcx,%r14 xorq %rdx,%r15 rorq $14,%r13 movq %rdx,%rbx andq %r15,%rdi rorq $28,%r14 addq %r13,%r12 xorq %rdi,%rbx addq %r12,%r9 addq %r12,%rbx leaq 8(%rbp),%rbp movq 0(%rsp),%r13 movq 104(%rsp),%rdi movq %r13,%r12 rorq $7,%r13 addq %r14,%rbx movq %rdi,%r14 rorq $42,%rdi xorq %r12,%r13 shrq $7,%r12 rorq $1,%r13 xorq %r14,%rdi shrq $6,%r14 rorq $19,%rdi xorq %r13,%r12 xorq %r14,%rdi addq 64(%rsp),%r12 addq 120(%rsp),%r12 movq %r9,%r13 addq %rdi,%r12 movq %rbx,%r14 rorq $23,%r13 
movq %r10,%rdi xorq %r9,%r13 rorq $5,%r14 xorq %r11,%rdi movq %r12,120(%rsp) xorq %rbx,%r14 andq %r9,%rdi rorq $4,%r13 addq %rax,%r12 xorq %r11,%rdi rorq $6,%r14 xorq %r9,%r13 addq %rdi,%r12 movq %rbx,%rdi addq (%rbp),%r12 xorq %rbx,%r14 xorq %rcx,%rdi rorq $14,%r13 movq %rcx,%rax andq %rdi,%r15 rorq $28,%r14 addq %r13,%r12 xorq %r15,%rax addq %r12,%r8 addq %r12,%rax leaq 24(%rbp),%rbp cmpb $0,7(%rbp) jnz .Lrounds_16_xx movq 128+0(%rsp),%rdi addq %r14,%rax leaq 128(%rsi),%rsi addq 0(%rdi),%rax addq 8(%rdi),%rbx addq 16(%rdi),%rcx addq 24(%rdi),%rdx addq 32(%rdi),%r8 addq 40(%rdi),%r9 addq 48(%rdi),%r10 addq 56(%rdi),%r11 cmpq 128+16(%rsp),%rsi movq %rax,0(%rdi) movq %rbx,8(%rdi) movq %rcx,16(%rdi) movq %rdx,24(%rdi) movq %r8,32(%rdi) movq %r9,40(%rdi) movq %r10,48(%rdi) movq %r11,56(%rdi) jb .Lloop movq 152(%rsp),%rsi .cfi_def_cfa %rsi,8 movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lepilogue: .byte 0xf3,0xc3 .cfi_endproc .size sha512_block_data_order,.-sha512_block_data_order .align 64 .type K512,@object K512: .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 
0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x28db77f523047d84,0x32caab7b40c72493 .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x0001020304050607,0x08090a0b0c0d0e0f .quad 0x0001020304050607,0x08090a0b0c0d0e0f .byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.type sha512_block_data_order_xop,@function +.align 64 +sha512_block_data_order_xop: +.cfi_startproc +.Lxop_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_xop: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_xop +.align 16 +.Lloop_xop: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lxop_00_47 + +.align 16 +.Lxop_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr 
$8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm0,%xmm0 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,223,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm7,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm0,%xmm0 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm1,%xmm1 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,216,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm0,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm1,%xmm1 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm2,%xmm2 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,217,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm1,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq 
%rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm2,%xmm2 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm3,%xmm3 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,218,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm2,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm3,%xmm3 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + rorq $23,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r8,%r13 + xorq %r10,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rax,%r14 + vpaddq %xmm11,%xmm4,%xmm4 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 +.byte 143,72,120,195,209,7 + xorq %r10,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,219,3 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm3,%xmm10 + addq %r11,%rdx + addq %rdi,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %rdx,%r13 + addq %r11,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r11 + vpxor %xmm10,%xmm11,%xmm11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + vpaddq %xmm11,%xmm4,%xmm4 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + rorq $23,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rcx,%r13 + xorq %r8,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r10,%r14 + vpaddq %xmm11,%xmm5,%xmm5 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 +.byte 143,72,120,195,209,7 + xorq %r8,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r11,%r15 + addq 
%r12,%r9 + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,220,3 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm4,%xmm10 + addq %r9,%rbx + addq %rdi,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rbx,%r13 + addq %r9,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%r9 + vpxor %xmm10,%xmm11,%xmm11 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + vpaddq %xmm11,%xmm5,%xmm5 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + rorq $23,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %rax,%r13 + xorq %rcx,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %r8,%r14 + vpaddq %xmm11,%xmm6,%xmm6 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 +.byte 143,72,120,195,209,7 + xorq %rcx,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,221,3 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm5,%xmm10 + addq %rdx,%r11 + addq %rdi,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %r11,%r13 + addq %rdx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rdx + vpxor %xmm10,%xmm11,%xmm11 + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + vpaddq %xmm11,%xmm6,%xmm6 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + rorq $23,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + rorq $5,%r14 +.byte 143,72,120,195,200,56 + xorq %r10,%r13 + xorq %rax,%r12 + vpsrlq $7,%xmm8,%xmm8 + rorq $4,%r13 + xorq %rcx,%r14 + vpaddq %xmm11,%xmm7,%xmm7 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 +.byte 143,72,120,195,209,7 + xorq %rax,%r12 + rorq $6,%r14 + vpxor %xmm9,%xmm8,%xmm8 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi +.byte 143,104,120,195,222,3 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + rorq $28,%r14 + vpsrlq $6,%xmm6,%xmm10 + addq %rbx,%r9 + addq %rdi,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r9,%r13 + addq %rbx,%r14 +.byte 143,72,120,195,203,42 + rorq $23,%r13 + movq %r14,%rbx + vpxor %xmm10,%xmm11,%xmm11 + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm9,%xmm11,%xmm11 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + vpaddq %xmm11,%xmm7,%xmm7 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + 
vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lxop_00_47 + rorq $23,%r13 + movq %r14,%rax + movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + rorq $23,%r13 + movq %r14,%rax + 
movq %r9,%r12 + rorq $5,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + rorq $4,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + rorq $6,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + rorq $14,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + rorq $28,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + rorq $23,%r13 + movq %r14,%r11 + movq %r8,%r12 + rorq $5,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + rorq $4,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + rorq $6,%r14 + xorq %rax,%rdi + addq %r12,%r10 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + rorq $28,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + rorq $23,%r13 + movq %r14,%r10 + movq %rdx,%r12 + rorq $5,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + rorq $4,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + rorq $6,%r14 + xorq %r11,%r15 + addq %r12,%r9 + rorq $14,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + rorq $28,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + rorq $23,%r13 + movq %r14,%r9 + movq %rcx,%r12 + rorq $5,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + rorq $4,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + rorq $6,%r14 + xorq %r10,%rdi + addq %r12,%r8 + rorq $14,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + rorq $28,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + rorq $23,%r13 + movq %r14,%r8 + movq %rbx,%r12 + rorq $5,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + rorq $4,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + rorq $6,%r14 + xorq %r9,%r15 + addq %r12,%rdx + rorq $14,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + rorq $28,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + rorq $23,%r13 + movq %r14,%rdx + movq %rax,%r12 + rorq $5,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + rorq $4,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + rorq $6,%r14 + xorq %r8,%rdi + addq %r12,%rcx + rorq $14,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + rorq $28,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + rorq $23,%r13 + movq %r14,%rcx + movq %r11,%r12 + rorq $5,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + rorq $4,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + rorq $6,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + rorq $14,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + rorq $28,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + rorq $23,%r13 + movq %r14,%rbx + movq %r10,%r12 + rorq $5,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + rorq $4,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + rorq $6,%r14 + xorq %rcx,%rdi + addq %r12,%rax + rorq $14,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + rorq $28,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + 
addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_xop + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_xop: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha512_block_data_order_xop,.-sha512_block_data_order_xop +.type sha512_block_data_order_avx,@function +.align 64 +sha512_block_data_order_avx: +.cfi_startproc +.Lavx_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq 
%r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + 
vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq 
$28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq 
%xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lavx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + 
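The scalar tail that starts after the cmpb/jne .Lavx_00_47 test runs the remaining rounds without vector help: shrdq of a register against itself is a 64-bit rotate, and the chained shrdq amounts (23, 4, 14 in one chain; 5, 6, 28 in the other) telescope into the Sigma1 rotations 41/18/14 and the Sigma0 rotations 39/34/28, while the and/xor pairs build Ch and Maj. A hedged C rendering of one round, with the conventional a..h names rather than the register assignments used here:

#include <stdint.h>

static inline uint64_t rotr64(uint64_t x, unsigned n) {
    return (x >> n) | (x << (64 - n));
}

/* One SHA-512 compression round (FIPS 180-4).  The assembly keeps all eight
 * state words in registers and rotates their roles each round instead of
 * shuffling values, but the per-round arithmetic is the same. */
static void sha512_round(uint64_t s[8], uint64_t w_plus_k) {
    uint64_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint64_t e = s[4], f = s[5], g = s[6], h = s[7];

    uint64_t S1  = rotr64(e, 14) ^ rotr64(e, 18) ^ rotr64(e, 41);
    uint64_t ch  = (e & f) ^ (~e & g);
    uint64_t S0  = rotr64(a, 28) ^ rotr64(a, 34) ^ rotr64(a, 39);
    uint64_t maj = (a & b) ^ (a & c) ^ (b & c);

    uint64_t t1 = h + S1 + ch + w_plus_k;   /* w_plus_k: the W[t]+K[t] stack slot */
    uint64_t t2 = S0 + maj;

    s[7] = g; s[6] = f; s[5] = e; s[4] = d + t1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = t1 + t2;
}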
shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq 
%rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_avx + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha512_block_data_order_avx,.-sha512_block_data_order_avx +.type sha512_block_data_order_avx2,@function +.align 64 +sha512_block_data_order_avx2: +.cfi_startproc +.Lavx2_shortcut: + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + subq $1312,%rsp + shlq $4,%rdx + andq $-2048,%rsp + leaq (%rsi,%rdx,8),%rdx + addq $1152,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx2: + + vzeroupper + subq $-128,%rsi + movq 0(%rdi),%rax + movq %rsi,%r12 + movq 8(%rdi),%rbx + cmpq %rdx,%rsi + movq 16(%rdi),%rcx + cmoveq %rsp,%r12 + movq 24(%rdi),%rdx + movq 
32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqu -128(%rsi),%xmm0 + vmovdqu -128+16(%rsi),%xmm1 + vmovdqu -128+32(%rsi),%xmm2 + leaq K512+128(%rip),%rbp + vmovdqu -128+48(%rsi),%xmm3 + vmovdqu -128+64(%rsi),%xmm4 + vmovdqu -128+80(%rsi),%xmm5 + vmovdqu -128+96(%rsi),%xmm6 + vmovdqu -128+112(%rsi),%xmm7 + + vmovdqa 1152(%rbp),%ymm10 + vinserti128 $1,(%r12),%ymm0,%ymm0 + vinserti128 $1,16(%r12),%ymm1,%ymm1 + vpshufb %ymm10,%ymm0,%ymm0 + vinserti128 $1,32(%r12),%ymm2,%ymm2 + vpshufb %ymm10,%ymm1,%ymm1 + vinserti128 $1,48(%r12),%ymm3,%ymm3 + vpshufb %ymm10,%ymm2,%ymm2 + vinserti128 $1,64(%r12),%ymm4,%ymm4 + vpshufb %ymm10,%ymm3,%ymm3 + vinserti128 $1,80(%r12),%ymm5,%ymm5 + vpshufb %ymm10,%ymm4,%ymm4 + vinserti128 $1,96(%r12),%ymm6,%ymm6 + vpshufb %ymm10,%ymm5,%ymm5 + vinserti128 $1,112(%r12),%ymm7,%ymm7 + + vpaddq -128(%rbp),%ymm0,%ymm8 + vpshufb %ymm10,%ymm6,%ymm6 + vpaddq -96(%rbp),%ymm1,%ymm9 + vpshufb %ymm10,%ymm7,%ymm7 + vpaddq -64(%rbp),%ymm2,%ymm10 + vpaddq -32(%rbp),%ymm3,%ymm11 + vmovdqa %ymm8,0(%rsp) + vpaddq 0(%rbp),%ymm4,%ymm8 + vmovdqa %ymm9,32(%rsp) + vpaddq 32(%rbp),%ymm5,%ymm9 + vmovdqa %ymm10,64(%rsp) + vpaddq 64(%rbp),%ymm6,%ymm10 + vmovdqa %ymm11,96(%rsp) + + movq 152(%rsp),%rdi +.cfi_def_cfa %rdi,8 + leaq -128(%rsp),%rsp + + + + movq %rdi,-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpaddq 96(%rbp),%ymm7,%ymm11 + vmovdqa %ymm8,0(%rsp) + xorq %r14,%r14 + vmovdqa %ymm9,32(%rsp) + movq %rbx,%rdi + vmovdqa %ymm10,64(%rsp) + xorq %rcx,%rdi + vmovdqa %ymm11,96(%rsp) + movq %r9,%r12 + addq $32*8,%rbp + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: + leaq -128(%rsp),%rsp +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 + + pushq 128-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $8,%ymm0,%ymm1,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm4,%ymm5,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm0,%ymm0 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm7,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm7,%ymm10 + vpaddq %ymm8,%ymm0,%ymm0 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm7,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm0,%ymm0 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq -128(%rbp),%ymm0,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa 
%ymm10,0(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm5,%ymm6,%ymm11 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm1,%ymm1 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm0,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm0,%ymm10 + vpaddq %ymm8,%ymm1,%ymm1 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm0,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm1,%ymm1 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq -96(%rbp),%ymm1,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm6,%ymm7,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm2,%ymm2 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm1,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm1,%ymm10 + vpaddq %ymm8,%ymm2,%ymm2 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm1,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm2,%ymm2 + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq -64(%rbp),%ymm2,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr $8,%ymm7,%ymm0,%ymm11 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq 
(%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm3,%ymm3 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm2,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm2,%ymm10 + vpaddq %ymm8,%ymm3,%ymm3 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm2,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm3,%ymm3 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq -32(%rbp),%ymm3,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq -128(%rsp),%rsp +.cfi_escape 0x0f,0x06,0x77,0xf8,0x00,0x06,0x23,0x08 + + pushq 128-8(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x00,0x06,0x23,0x08 + leaq 8(%rsp),%rsp +.cfi_escape 0x0f,0x05,0x77,0x78,0x06,0x23,0x08 + vpalignr $8,%ymm4,%ymm5,%ymm8 + addq 0+256(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + vpalignr $8,%ymm0,%ymm1,%ymm11 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + vpsrlq $1,%ymm8,%ymm10 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + vpaddq %ymm11,%ymm4,%ymm4 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + vpsrlq $6,%ymm3,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + vpsllq $3,%ymm3,%ymm10 + vpaddq %ymm8,%ymm4,%ymm4 + addq 8+256(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + vpsrlq $19,%ymm3,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + vpaddq %ymm11,%ymm4,%ymm4 + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + vpaddq 0(%rbp),%ymm4,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + vmovdqa %ymm10,0(%rsp) + vpalignr $8,%ymm5,%ymm6,%ymm8 + addq 32+256(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + vpalignr $8,%ymm1,%ymm2,%ymm11 + rorxq $18,%rcx,%r15 + leaq 
(%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + vpsrlq $1,%ymm8,%ymm10 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + vpaddq %ymm11,%ymm5,%ymm5 + vpsrlq $7,%ymm8,%ymm11 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + vpsrlq $6,%ymm4,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + vpsllq $3,%ymm4,%ymm10 + vpaddq %ymm8,%ymm5,%ymm5 + addq 40+256(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + vpsrlq $19,%ymm4,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + vpaddq %ymm11,%ymm5,%ymm5 + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + vpaddq 32(%rbp),%ymm5,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + vmovdqa %ymm10,32(%rsp) + vpalignr $8,%ymm6,%ymm7,%ymm8 + addq 64+256(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + vpalignr $8,%ymm2,%ymm3,%ymm11 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + vpsrlq $1,%ymm8,%ymm10 + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + vpaddq %ymm11,%ymm6,%ymm6 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + vpsrlq $6,%ymm5,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + vpsllq $3,%ymm5,%ymm10 + vpaddq %ymm8,%ymm6,%ymm6 + addq 72+256(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + vpsrlq $19,%ymm5,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + vpaddq %ymm11,%ymm6,%ymm6 + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + vpaddq 64(%rbp),%ymm6,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + vmovdqa %ymm10,64(%rsp) + vpalignr $8,%ymm7,%ymm0,%ymm8 + addq 96+256(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + vpalignr $8,%ymm3,%ymm4,%ymm11 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + vpsrlq $1,%ymm8,%ymm10 + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + vpaddq %ymm11,%ymm7,%ymm7 + vpsrlq $7,%ymm8,%ymm11 + leaq (%rbx,%r12,1),%rbx 
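In the BMI2/AVX2 rounds around this point, rorxq provides a non-destructive rotate (no flags, no shrd chains) and Ch(e,f,g) is accumulated as two additions: andq gives e&f, andnq gives (~e)&g, and each term is added into the working sum with leaq. That is equivalent to the XOR in the standard definition because the two masks can never share a set bit. A small self-checking C illustration of the identity; the constants are simply the SHA-512 initial e/f/g values, used here as arbitrary test data:

#include <assert.h>
#include <stdint.h>

/* Ch(e,f,g) = (e & f) ^ (~e & g).  The two terms are bit-disjoint, so
 * adding them (andq + andnq + leaq in the assembly) gives the same result. */
static uint64_t ch_xor(uint64_t e, uint64_t f, uint64_t g) {
    return (e & f) ^ (~e & g);
}
static uint64_t ch_add(uint64_t e, uint64_t f, uint64_t g) {
    return (e & f) + (~e & g);
}

int main(void) {
    uint64_t e = 0x510e527fade682d1ULL;
    uint64_t f = 0x9b05688c2b3e6c1fULL;
    uint64_t g = 0x1f83d9abfb41bd6bULL;
    assert(ch_xor(e, f, g) == ch_add(e, f, g));
    return 0;
}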
+ xorq %r14,%r13 + movq %rcx,%r15 + vpsllq $56,%ymm8,%ymm9 + vpxor %ymm10,%ymm11,%ymm8 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + vpsrlq $7,%ymm10,%ymm10 + vpxor %ymm9,%ymm8,%ymm8 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + vpsllq $7,%ymm9,%ymm9 + vpxor %ymm10,%ymm8,%ymm8 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + vpsrlq $6,%ymm6,%ymm11 + vpxor %ymm9,%ymm8,%ymm8 + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + vpsllq $3,%ymm6,%ymm10 + vpaddq %ymm8,%ymm7,%ymm7 + addq 104+256(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + vpsrlq $19,%ymm6,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + vpsllq $42,%ymm10,%ymm10 + vpxor %ymm9,%ymm11,%ymm11 + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + vpsrlq $42,%ymm9,%ymm9 + vpxor %ymm10,%ymm11,%ymm11 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + vpxor %ymm9,%ymm11,%ymm11 + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + vpaddq %ymm11,%ymm7,%ymm7 + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + vpaddq 96(%rbp),%ymm7,%ymm10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + vmovdqa %ymm10,96(%rsp) + leaq 256(%rbp),%rbp + cmpb $0,-121(%rbp) + jne .Lavx2_00_47 + addq 0+128(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+128(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+128(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+128(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+128(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq 
$14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+128(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+128(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+128(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + addq 0(%rsp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8(%rsp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 + leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32(%rsp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40(%rsp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi 
+ rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64(%rsp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72(%rsp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96(%rsp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104(%rsp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 1152(%rsp),%rbp + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + + cmpq 144(%rbp),%rsi + je .Ldone_avx2 + + xorq %r14,%r14 + movq %rbx,%rdi + xorq %rcx,%rdi + movq %r9,%r12 + jmp .Lower_avx2 +.align 16 +.Lower_avx2: + addq 0+16(%rbp),%r11 + andq %r8,%r12 + rorxq $41,%r8,%r13 + rorxq $18,%r8,%r15 + leaq (%rax,%r14,1),%rax + leaq (%r11,%r12,1),%r11 + andnq %r10,%r8,%r12 + xorq %r15,%r13 + rorxq $14,%r8,%r14 + leaq (%r11,%r12,1),%r11 + xorq %r14,%r13 + movq %rax,%r15 + rorxq $39,%rax,%r12 + leaq (%r11,%r13,1),%r11 + xorq %rbx,%r15 + rorxq $34,%rax,%r14 + rorxq $28,%rax,%r13 + leaq (%rdx,%r11,1),%rdx + andq %r15,%rdi + xorq %r12,%r14 + xorq %rbx,%rdi + xorq %r13,%r14 + leaq (%r11,%rdi,1),%r11 + movq %r8,%r12 + addq 8+16(%rbp),%r10 + andq %rdx,%r12 + rorxq $41,%rdx,%r13 + rorxq $18,%rdx,%rdi + leaq (%r11,%r14,1),%r11 + leaq (%r10,%r12,1),%r10 + andnq %r9,%rdx,%r12 + xorq %rdi,%r13 + rorxq $14,%rdx,%r14 + leaq (%r10,%r12,1),%r10 + xorq %r14,%r13 + movq %r11,%rdi + rorxq $39,%r11,%r12 
+ leaq (%r10,%r13,1),%r10 + xorq %rax,%rdi + rorxq $34,%r11,%r14 + rorxq $28,%r11,%r13 + leaq (%rcx,%r10,1),%rcx + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rax,%r15 + xorq %r13,%r14 + leaq (%r10,%r15,1),%r10 + movq %rdx,%r12 + addq 32+16(%rbp),%r9 + andq %rcx,%r12 + rorxq $41,%rcx,%r13 + rorxq $18,%rcx,%r15 + leaq (%r10,%r14,1),%r10 + leaq (%r9,%r12,1),%r9 + andnq %r8,%rcx,%r12 + xorq %r15,%r13 + rorxq $14,%rcx,%r14 + leaq (%r9,%r12,1),%r9 + xorq %r14,%r13 + movq %r10,%r15 + rorxq $39,%r10,%r12 + leaq (%r9,%r13,1),%r9 + xorq %r11,%r15 + rorxq $34,%r10,%r14 + rorxq $28,%r10,%r13 + leaq (%rbx,%r9,1),%rbx + andq %r15,%rdi + xorq %r12,%r14 + xorq %r11,%rdi + xorq %r13,%r14 + leaq (%r9,%rdi,1),%r9 + movq %rcx,%r12 + addq 40+16(%rbp),%r8 + andq %rbx,%r12 + rorxq $41,%rbx,%r13 + rorxq $18,%rbx,%rdi + leaq (%r9,%r14,1),%r9 + leaq (%r8,%r12,1),%r8 + andnq %rdx,%rbx,%r12 + xorq %rdi,%r13 + rorxq $14,%rbx,%r14 + leaq (%r8,%r12,1),%r8 + xorq %r14,%r13 + movq %r9,%rdi + rorxq $39,%r9,%r12 + leaq (%r8,%r13,1),%r8 + xorq %r10,%rdi + rorxq $34,%r9,%r14 + rorxq $28,%r9,%r13 + leaq (%rax,%r8,1),%rax + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r10,%r15 + xorq %r13,%r14 + leaq (%r8,%r15,1),%r8 + movq %rbx,%r12 + addq 64+16(%rbp),%rdx + andq %rax,%r12 + rorxq $41,%rax,%r13 + rorxq $18,%rax,%r15 + leaq (%r8,%r14,1),%r8 + leaq (%rdx,%r12,1),%rdx + andnq %rcx,%rax,%r12 + xorq %r15,%r13 + rorxq $14,%rax,%r14 + leaq (%rdx,%r12,1),%rdx + xorq %r14,%r13 + movq %r8,%r15 + rorxq $39,%r8,%r12 + leaq (%rdx,%r13,1),%rdx + xorq %r9,%r15 + rorxq $34,%r8,%r14 + rorxq $28,%r8,%r13 + leaq (%r11,%rdx,1),%r11 + andq %r15,%rdi + xorq %r12,%r14 + xorq %r9,%rdi + xorq %r13,%r14 + leaq (%rdx,%rdi,1),%rdx + movq %rax,%r12 + addq 72+16(%rbp),%rcx + andq %r11,%r12 + rorxq $41,%r11,%r13 + rorxq $18,%r11,%rdi + leaq (%rdx,%r14,1),%rdx + leaq (%rcx,%r12,1),%rcx + andnq %rbx,%r11,%r12 + xorq %rdi,%r13 + rorxq $14,%r11,%r14 + leaq (%rcx,%r12,1),%rcx + xorq %r14,%r13 + movq %rdx,%rdi + rorxq $39,%rdx,%r12 + leaq (%rcx,%r13,1),%rcx + xorq %r8,%rdi + rorxq $34,%rdx,%r14 + rorxq $28,%rdx,%r13 + leaq (%r10,%rcx,1),%r10 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %r8,%r15 + xorq %r13,%r14 + leaq (%rcx,%r15,1),%rcx + movq %r11,%r12 + addq 96+16(%rbp),%rbx + andq %r10,%r12 + rorxq $41,%r10,%r13 + rorxq $18,%r10,%r15 + leaq (%rcx,%r14,1),%rcx + leaq (%rbx,%r12,1),%rbx + andnq %rax,%r10,%r12 + xorq %r15,%r13 + rorxq $14,%r10,%r14 + leaq (%rbx,%r12,1),%rbx + xorq %r14,%r13 + movq %rcx,%r15 + rorxq $39,%rcx,%r12 + leaq (%rbx,%r13,1),%rbx + xorq %rdx,%r15 + rorxq $34,%rcx,%r14 + rorxq $28,%rcx,%r13 + leaq (%r9,%rbx,1),%r9 + andq %r15,%rdi + xorq %r12,%r14 + xorq %rdx,%rdi + xorq %r13,%r14 + leaq (%rbx,%rdi,1),%rbx + movq %r10,%r12 + addq 104+16(%rbp),%rax + andq %r9,%r12 + rorxq $41,%r9,%r13 + rorxq $18,%r9,%rdi + leaq (%rbx,%r14,1),%rbx + leaq (%rax,%r12,1),%rax + andnq %r11,%r9,%r12 + xorq %rdi,%r13 + rorxq $14,%r9,%r14 + leaq (%rax,%r12,1),%rax + xorq %r14,%r13 + movq %rbx,%rdi + rorxq $39,%rbx,%r12 + leaq (%rax,%r13,1),%rax + xorq %rcx,%rdi + rorxq $34,%rbx,%r14 + rorxq $28,%rbx,%r13 + leaq (%r8,%rax,1),%r8 + andq %rdi,%r15 + xorq %r12,%r14 + xorq %rcx,%r15 + xorq %r13,%r14 + leaq (%rax,%r15,1),%rax + movq %r9,%r12 + leaq -128(%rbp),%rbp + cmpq %rsp,%rbp + jae .Lower_avx2 + + movq 1280(%rsp),%rdi + addq %r14,%rax + + leaq 1152(%rsp),%rsp + +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + leaq 256(%rsi),%rsi + addq 48(%rdi),%r10 
+ movq %rsi,%r12 + addq 56(%rdi),%r11 + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + cmoveq %rsp,%r12 + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + + jbe .Loop_avx2 + leaq (%rsp),%rbp + + +.cfi_escape 0x0f,0x06,0x76,0x98,0x01,0x06,0x23,0x08 + +.Ldone_avx2: + movq 152(%rbp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + .byte 0xf3,0xc3 +.cfi_endproc +.size sha512_block_data_order_avx2,.-sha512_block_data_order_avx2 Index: stable/12/secure/lib/libcrypto/amd64/x25519-x86_64.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/x25519-x86_64.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/x25519-x86_64.S (revision 364963) @@ -1,428 +1,804 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from x25519-x86_64.pl. */ .text .globl x25519_fe51_mul .type x25519_fe51_mul,@function .align 32 x25519_fe51_mul: .cfi_startproc pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 leaq -40(%rsp),%rsp .cfi_adjust_cfa_offset 40 .Lfe51_mul_body: movq 0(%rsi),%rax movq 0(%rdx),%r11 movq 8(%rdx),%r12 movq 16(%rdx),%r13 movq 24(%rdx),%rbp movq 32(%rdx),%r14 movq %rdi,32(%rsp) movq %rax,%rdi mulq %r11 movq %r11,0(%rsp) movq %rax,%rbx movq %rdi,%rax movq %rdx,%rcx mulq %r12 movq %r12,8(%rsp) movq %rax,%r8 movq %rdi,%rax leaq (%r14,%r14,8),%r15 movq %rdx,%r9 mulq %r13 movq %r13,16(%rsp) movq %rax,%r10 movq %rdi,%rax leaq (%r14,%r15,2),%rdi movq %rdx,%r11 mulq %rbp movq %rax,%r12 movq 0(%rsi),%rax movq %rdx,%r13 mulq %r14 movq %rax,%r14 movq 8(%rsi),%rax movq %rdx,%r15 mulq %rdi addq %rax,%rbx movq 16(%rsi),%rax adcq %rdx,%rcx mulq %rdi addq %rax,%r8 movq 24(%rsi),%rax adcq %rdx,%r9 mulq %rdi addq %rax,%r10 movq 32(%rsi),%rax adcq %rdx,%r11 mulq %rdi imulq $19,%rbp,%rdi addq %rax,%r12 movq 8(%rsi),%rax adcq %rdx,%r13 mulq %rbp movq 16(%rsp),%rbp addq %rax,%r14 movq 16(%rsi),%rax adcq %rdx,%r15 mulq %rdi addq %rax,%rbx movq 24(%rsi),%rax adcq %rdx,%rcx mulq %rdi addq %rax,%r8 movq 32(%rsi),%rax adcq %rdx,%r9 mulq %rdi imulq $19,%rbp,%rdi addq %rax,%r10 movq 8(%rsi),%rax adcq %rdx,%r11 mulq %rbp addq %rax,%r12 movq 16(%rsi),%rax adcq %rdx,%r13 mulq %rbp movq 8(%rsp),%rbp addq %rax,%r14 movq 24(%rsi),%rax adcq %rdx,%r15 mulq %rdi addq %rax,%rbx movq 32(%rsi),%rax adcq %rdx,%rcx mulq %rdi addq %rax,%r8 movq 8(%rsi),%rax adcq %rdx,%r9 mulq %rbp imulq $19,%rbp,%rdi addq %rax,%r10 movq 16(%rsi),%rax adcq %rdx,%r11 mulq %rbp addq %rax,%r12 movq 24(%rsi),%rax adcq %rdx,%r13 mulq %rbp movq 0(%rsp),%rbp addq %rax,%r14 movq 32(%rsi),%rax adcq %rdx,%r15 mulq %rdi addq %rax,%rbx movq 8(%rsi),%rax adcq %rdx,%rcx mulq %rbp addq %rax,%r8 movq 16(%rsi),%rax adcq %rdx,%r9 mulq %rbp addq %rax,%r10 movq 24(%rsi),%rax adcq %rdx,%r11 mulq %rbp addq %rax,%r12 movq 32(%rsi),%rax adcq %rdx,%r13 mulq %rbp addq %rax,%r14 adcq %rdx,%r15 movq 32(%rsp),%rdi jmp .Lreduce51 
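x25519_fe51_mul, x25519_fe51_sqr and x25519_fe51_mul121666 above all represent a field element mod 2^255-19 as five 51-bit limbs and share the .Lreduce51 carry tail. Limb products that land at weight 2^255 or higher are folded back in scaled by 19 (the leaq (%r14,%r14,8)/leaq (%r14,%r15,2) and imulq $19 sequences build the times-19 operand), since 2^255 is congruent to 19 mod 2^255-19. A compact C sketch of the same schoolbook multiply using unsigned __int128; the type and names are illustrative, not taken from OpenSSL:

#include <stdint.h>

typedef unsigned __int128 u128;

/* f = f[0] + f[1]*2^51 + f[2]*2^102 + f[3]*2^153 + f[4]*2^204 (mod 2^255-19) */
typedef struct { uint64_t v[5]; } fe51;

static void fe51_mul_ref(fe51 *r, const fe51 *a, const fe51 *b) {
    const uint64_t mask = (1ULL << 51) - 1;
    u128 t[5] = {0, 0, 0, 0, 0};

    for (int i = 0; i < 5; i++)
        for (int j = 0; j < 5; j++) {
            if (i + j < 5)
                t[i + j] += (u128)a->v[i] * b->v[j];
            else
                /* weight 2^(51*(i+j)) equals 19 * 2^(51*(i+j-5)) mod p */
                t[i + j - 5] += (u128)a->v[i] * (19 * b->v[j]);
        }

    /* Carry propagation: the job of the shared .Lreduce51 tail. */
    uint64_t carry = 0;
    for (int i = 0; i < 5; i++) {
        t[i] += carry;
        r->v[i] = (uint64_t)t[i] & mask;
        carry = (uint64_t)(t[i] >> 51);
    }
    r->v[0] += carry * 19;       /* carry out of the top limb wraps times 19 */
    carry = r->v[0] >> 51;
    r->v[0] &= mask;
    r->v[1] += carry;
}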
.Lfe51_mul_epilogue: .cfi_endproc .size x25519_fe51_mul,.-x25519_fe51_mul .globl x25519_fe51_sqr .type x25519_fe51_sqr,@function .align 32 x25519_fe51_sqr: .cfi_startproc pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 leaq -40(%rsp),%rsp .cfi_adjust_cfa_offset 40 .Lfe51_sqr_body: movq 0(%rsi),%rax movq 16(%rsi),%r15 movq 32(%rsi),%rbp movq %rdi,32(%rsp) leaq (%rax,%rax,1),%r14 mulq %rax movq %rax,%rbx movq 8(%rsi),%rax movq %rdx,%rcx mulq %r14 movq %rax,%r8 movq %r15,%rax movq %r15,0(%rsp) movq %rdx,%r9 mulq %r14 movq %rax,%r10 movq 24(%rsi),%rax movq %rdx,%r11 imulq $19,%rbp,%rdi mulq %r14 movq %rax,%r12 movq %rbp,%rax movq %rdx,%r13 mulq %r14 movq %rax,%r14 movq %rbp,%rax movq %rdx,%r15 mulq %rdi addq %rax,%r12 movq 8(%rsi),%rax adcq %rdx,%r13 movq 24(%rsi),%rsi leaq (%rax,%rax,1),%rbp mulq %rax addq %rax,%r10 movq 0(%rsp),%rax adcq %rdx,%r11 mulq %rbp addq %rax,%r12 movq %rbp,%rax adcq %rdx,%r13 mulq %rsi addq %rax,%r14 movq %rbp,%rax adcq %rdx,%r15 imulq $19,%rsi,%rbp mulq %rdi addq %rax,%rbx leaq (%rsi,%rsi,1),%rax adcq %rdx,%rcx mulq %rdi addq %rax,%r10 movq %rsi,%rax adcq %rdx,%r11 mulq %rbp addq %rax,%r8 movq 0(%rsp),%rax adcq %rdx,%r9 leaq (%rax,%rax,1),%rsi mulq %rax addq %rax,%r14 movq %rbp,%rax adcq %rdx,%r15 mulq %rsi addq %rax,%rbx movq %rsi,%rax adcq %rdx,%rcx mulq %rdi addq %rax,%r8 adcq %rdx,%r9 movq 32(%rsp),%rdi jmp .Lreduce51 .align 32 .Lreduce51: movq $0x7ffffffffffff,%rbp movq %r10,%rdx shrq $51,%r10 shlq $13,%r11 andq %rbp,%rdx orq %r10,%r11 addq %r11,%r12 adcq $0,%r13 movq %rbx,%rax shrq $51,%rbx shlq $13,%rcx andq %rbp,%rax orq %rbx,%rcx addq %rcx,%r8 adcq $0,%r9 movq %r12,%rbx shrq $51,%r12 shlq $13,%r13 andq %rbp,%rbx orq %r12,%r13 addq %r13,%r14 adcq $0,%r15 movq %r8,%rcx shrq $51,%r8 shlq $13,%r9 andq %rbp,%rcx orq %r8,%r9 addq %r9,%rdx movq %r14,%r10 shrq $51,%r14 shlq $13,%r15 andq %rbp,%r10 orq %r14,%r15 leaq (%r15,%r15,8),%r14 leaq (%r15,%r14,2),%r15 addq %r15,%rax movq %rdx,%r8 andq %rbp,%rdx shrq $51,%r8 addq %r8,%rbx movq %rax,%r9 andq %rbp,%rax shrq $51,%r9 addq %r9,%rcx movq %rax,0(%rdi) movq %rcx,8(%rdi) movq %rdx,16(%rdi) movq %rbx,24(%rdi) movq %r10,32(%rdi) movq 40(%rsp),%r15 .cfi_restore %r15 movq 48(%rsp),%r14 .cfi_restore %r14 movq 56(%rsp),%r13 .cfi_restore %r13 movq 64(%rsp),%r12 .cfi_restore %r12 movq 72(%rsp),%rbx .cfi_restore %rbx movq 80(%rsp),%rbp .cfi_restore %rbp leaq 88(%rsp),%rsp .cfi_adjust_cfa_offset 88 .Lfe51_sqr_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size x25519_fe51_sqr,.-x25519_fe51_sqr .globl x25519_fe51_mul121666 .type x25519_fe51_mul121666,@function .align 32 x25519_fe51_mul121666: .cfi_startproc pushq %rbp .cfi_adjust_cfa_offset 8 .cfi_offset %rbp,-16 pushq %rbx .cfi_adjust_cfa_offset 8 .cfi_offset %rbx,-24 pushq %r12 .cfi_adjust_cfa_offset 8 .cfi_offset %r12,-32 pushq %r13 .cfi_adjust_cfa_offset 8 .cfi_offset %r13,-40 pushq %r14 .cfi_adjust_cfa_offset 8 .cfi_offset %r14,-48 pushq %r15 .cfi_adjust_cfa_offset 8 .cfi_offset %r15,-56 leaq -40(%rsp),%rsp .cfi_adjust_cfa_offset 40 .Lfe51_mul121666_body: movl $121666,%eax mulq 0(%rsi) movq %rax,%rbx movl $121666,%eax movq %rdx,%rcx mulq 8(%rsi) movq %rax,%r8 movl $121666,%eax movq %rdx,%r9 mulq 16(%rsi) movq %rax,%r10 movl $121666,%eax movq %rdx,%r11 mulq 24(%rsi) movq %rax,%r12 movl 
$121666,%eax movq %rdx,%r13 mulq 32(%rsi) movq %rax,%r14 movq %rdx,%r15 jmp .Lreduce51 .Lfe51_mul121666_epilogue: .cfi_endproc .size x25519_fe51_mul121666,.-x25519_fe51_mul121666 + .globl x25519_fe64_eligible .type x25519_fe64_eligible,@function .align 32 x25519_fe64_eligible: .cfi_startproc + movl OPENSSL_ia32cap_P+8(%rip),%ecx xorl %eax,%eax + andl $0x80100,%ecx + cmpl $0x80100,%ecx + cmovel %ecx,%eax .byte 0xf3,0xc3 .cfi_endproc .size x25519_fe64_eligible,.-x25519_fe64_eligible .globl x25519_fe64_mul .type x25519_fe64_mul,@function -.globl x25519_fe64_sqr -.globl x25519_fe64_mul121666 -.globl x25519_fe64_add -.globl x25519_fe64_sub -.globl x25519_fe64_tobytes +.align 32 x25519_fe64_mul: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 +.cfi_offset %rdi,-64 + leaq -16(%rsp),%rsp +.cfi_adjust_cfa_offset 16 +.Lfe64_mul_body: + + movq %rdx,%rax + movq 0(%rdx),%rbp + movq 0(%rsi),%rdx + movq 8(%rax),%rcx + movq 16(%rax),%r14 + movq 24(%rax),%r15 + + mulxq %rbp,%r8,%rax + xorl %edi,%edi + mulxq %rcx,%r9,%rbx + adcxq %rax,%r9 + mulxq %r14,%r10,%rax + adcxq %rbx,%r10 + mulxq %r15,%r11,%r12 + movq 8(%rsi),%rdx + adcxq %rax,%r11 + movq %r14,(%rsp) + adcxq %rdi,%r12 + + mulxq %rbp,%rax,%rbx + adoxq %rax,%r9 + adcxq %rbx,%r10 + mulxq %rcx,%rax,%rbx + adoxq %rax,%r10 + adcxq %rbx,%r11 + mulxq %r14,%rax,%rbx + adoxq %rax,%r11 + adcxq %rbx,%r12 + mulxq %r15,%rax,%r13 + movq 16(%rsi),%rdx + adoxq %rax,%r12 + adcxq %rdi,%r13 + adoxq %rdi,%r13 + + mulxq %rbp,%rax,%rbx + adcxq %rax,%r10 + adoxq %rbx,%r11 + mulxq %rcx,%rax,%rbx + adcxq %rax,%r11 + adoxq %rbx,%r12 + mulxq %r14,%rax,%rbx + adcxq %rax,%r12 + adoxq %rbx,%r13 + mulxq %r15,%rax,%r14 + movq 24(%rsi),%rdx + adcxq %rax,%r13 + adoxq %rdi,%r14 + adcxq %rdi,%r14 + + mulxq %rbp,%rax,%rbx + adoxq %rax,%r11 + adcxq %rbx,%r12 + mulxq %rcx,%rax,%rbx + adoxq %rax,%r12 + adcxq %rbx,%r13 + mulxq (%rsp),%rax,%rbx + adoxq %rax,%r13 + adcxq %rbx,%r14 + mulxq %r15,%rax,%r15 + movl $38,%edx + adoxq %rax,%r14 + adcxq %rdi,%r15 + adoxq %rdi,%r15 + + jmp .Lreduce64 +.Lfe64_mul_epilogue: +.cfi_endproc +.size x25519_fe64_mul,.-x25519_fe64_mul + +.globl x25519_fe64_sqr +.type x25519_fe64_sqr,@function +.align 32 x25519_fe64_sqr: +.cfi_startproc + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + pushq %rdi +.cfi_adjust_cfa_offset 8 +.cfi_offset %rdi,-64 + leaq -16(%rsp),%rsp +.cfi_adjust_cfa_offset 16 +.Lfe64_sqr_body: + + movq 0(%rsi),%rdx + movq 8(%rsi),%rcx + movq 16(%rsi),%rbp + movq 24(%rsi),%rsi + + + mulxq %rdx,%r8,%r15 + mulxq %rcx,%r9,%rax + xorl %edi,%edi + mulxq %rbp,%r10,%rbx + adcxq %rax,%r10 + mulxq %rsi,%r11,%r12 + movq %rcx,%rdx + adcxq %rbx,%r11 + adcxq %rdi,%r12 + + + mulxq %rbp,%rax,%rbx + adoxq %rax,%r11 + adcxq %rbx,%r12 + mulxq %rsi,%rax,%r13 + movq %rbp,%rdx + adoxq %rax,%r12 + adcxq %rdi,%r13 + + + mulxq %rsi,%rax,%r14 + movq %rcx,%rdx + adoxq %rax,%r13 + adcxq %rdi,%r14 
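x25519_fe64_eligible above is the gate for the radix-2^64 code path that follows: it reads word 2 of OPENSSL_ia32cap_P (CPUID leaf 7, EBX) and requires mask 0x80100, i.e. bit 8 (BMI2, for mulxq) together with bit 19 (ADX, for adcxq/adoxq), because fe64_mul and fe64_sqr drive two independent carry chains in parallel. A hedged C equivalent of the check; the extern mirrors the symbol the assembly reads, and the array layout is stated as an assumption rather than re-derived:

/* Populated by OPENSSL_cpuid_setup(); word 2 is assumed to hold
 * CPUID.(EAX=7,ECX=0):EBX, as the assembly's +8 offset implies. */
extern unsigned int OPENSSL_ia32cap_P[4];

#define CAP_BMI2 (1u << 8)     /* mulx available */
#define CAP_ADX  (1u << 19)    /* adcx/adox available */

/* Non-zero when the fe64 (4x64-bit limb) X25519 primitives may be used. */
static int x25519_fe64_eligible_ref(void) {
    unsigned int w = OPENSSL_ia32cap_P[2];
    return (w & (CAP_BMI2 | CAP_ADX)) == (CAP_BMI2 | CAP_ADX);
}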
+ adoxq %rdi,%r14 + + adcxq %r9,%r9 + adoxq %r15,%r9 + adcxq %r10,%r10 + mulxq %rdx,%rax,%rbx + movq %rbp,%rdx + adcxq %r11,%r11 + adoxq %rax,%r10 + adcxq %r12,%r12 + adoxq %rbx,%r11 + mulxq %rdx,%rax,%rbx + movq %rsi,%rdx + adcxq %r13,%r13 + adoxq %rax,%r12 + adcxq %r14,%r14 + adoxq %rbx,%r13 + mulxq %rdx,%rax,%r15 + movl $38,%edx + adoxq %rax,%r14 + adcxq %rdi,%r15 + adoxq %rdi,%r15 + jmp .Lreduce64 + +.align 32 +.Lreduce64: + mulxq %r12,%rax,%rbx + adcxq %rax,%r8 + adoxq %rbx,%r9 + mulxq %r13,%rax,%rbx + adcxq %rax,%r9 + adoxq %rbx,%r10 + mulxq %r14,%rax,%rbx + adcxq %rax,%r10 + adoxq %rbx,%r11 + mulxq %r15,%rax,%r12 + adcxq %rax,%r11 + adoxq %rdi,%r12 + adcxq %rdi,%r12 + + movq 16(%rsp),%rdi + imulq %rdx,%r12 + + addq %r12,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + sbbq %rax,%rax + andq $38,%rax + + addq %rax,%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r8,0(%rdi) + + movq 24(%rsp),%r15 +.cfi_restore %r15 + movq 32(%rsp),%r14 +.cfi_restore %r14 + movq 40(%rsp),%r13 +.cfi_restore %r13 + movq 48(%rsp),%r12 +.cfi_restore %r12 + movq 56(%rsp),%rbx +.cfi_restore %rbx + movq 64(%rsp),%rbp +.cfi_restore %rbp + leaq 72(%rsp),%rsp +.cfi_adjust_cfa_offset 88 +.Lfe64_sqr_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size x25519_fe64_sqr,.-x25519_fe64_sqr + +.globl x25519_fe64_mul121666 +.type x25519_fe64_mul121666,@function +.align 32 x25519_fe64_mul121666: +.Lfe64_mul121666_body: +.cfi_startproc + movl $121666,%edx + mulxq 0(%rsi),%r8,%rcx + mulxq 8(%rsi),%r9,%rax + addq %rcx,%r9 + mulxq 16(%rsi),%r10,%rcx + adcq %rax,%r10 + mulxq 24(%rsi),%r11,%rax + adcq %rcx,%r11 + adcq $0,%rax + + imulq $38,%rax,%rax + + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + + sbbq %rax,%rax + andq $38,%rax + + addq %rax,%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r8,0(%rdi) + +.Lfe64_mul121666_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size x25519_fe64_mul121666,.-x25519_fe64_mul121666 + +.globl x25519_fe64_add +.type x25519_fe64_add,@function +.align 32 x25519_fe64_add: +.Lfe64_add_body: +.cfi_startproc + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + + sbbq %rax,%rax + andq $38,%rax + + addq %rax,%r8 + adcq $0,%r9 + adcq $0,%r10 + movq %r9,8(%rdi) + adcq $0,%r11 + movq %r10,16(%rdi) + sbbq %rax,%rax + movq %r11,24(%rdi) + andq $38,%rax + + addq %rax,%r8 + movq %r8,0(%rdi) + +.Lfe64_add_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size x25519_fe64_add,.-x25519_fe64_add + +.globl x25519_fe64_sub +.type x25519_fe64_sub,@function +.align 32 x25519_fe64_sub: +.Lfe64_sub_body: +.cfi_startproc + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + subq 0(%rdx),%r8 + sbbq 8(%rdx),%r9 + sbbq 16(%rdx),%r10 + sbbq 24(%rdx),%r11 + + sbbq %rax,%rax + andq $38,%rax + + subq %rax,%r8 + sbbq $0,%r9 + sbbq $0,%r10 + movq %r9,8(%rdi) + sbbq $0,%r11 + movq %r10,16(%rdi) + sbbq %rax,%rax + movq %r11,24(%rdi) + andq $38,%rax + + subq %rax,%r8 + movq %r8,0(%rdi) + +.Lfe64_sub_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size x25519_fe64_sub,.-x25519_fe64_sub + +.globl x25519_fe64_tobytes +.type x25519_fe64_tobytes,@function +.align 32 x25519_fe64_tobytes: +.Lfe64_to_body: .cfi_startproc -.byte 0x0f,0x0b + movq 0(%rsi),%r8 + movq 8(%rsi),%r9 + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + + + leaq (%r11,%r11,1),%rax + sarq $63,%r11 + shrq $1,%rax + andq $19,%r11 + addq $19,%r11 + + addq %r11,%r8 + adcq $0,%r9 + adcq $0,%r10 
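The fe64 helpers above keep a field element as four 64-bit limbs and rely on 2^256 being congruent to 38 mod 2^255-19: a carry or borrow out of the top limb is folded back multiplied by 38, .Lreduce64 applies the same fold to the upper half of an eight-limb product, and x25519_fe64_tobytes performs the final canonical reduction before storing. A small C sketch of the addition with its double times-38 fold, mirroring the two sbbq/andq $38 passes of x25519_fe64_add; the struct and names are illustrative:

#include <stdint.h>

typedef unsigned __int128 u128;
typedef struct { uint64_t v[4]; } fe64;  /* a = sum v[i]*2^(64*i), mod 2^255-19 */

static void fe64_add_ref(fe64 *r, const fe64 *a, const fe64 *b) {
    u128 acc = 0;
    uint64_t t[4];

    /* 256-bit add, remembering the carry out of the top limb. */
    for (int i = 0; i < 4; i++) {
        acc += (u128)a->v[i] + b->v[i];
        t[i] = (uint64_t)acc;
        acc >>= 64;
    }

    /* Fold the carry back in as carry*38 (2^256 = 38 mod p) and propagate. */
    acc = (u128)t[0] + (uint64_t)acc * 38;
    r->v[0] = (uint64_t)acc;
    acc >>= 64;
    for (int i = 1; i < 4; i++) {
        acc += t[i];
        r->v[i] = (uint64_t)acc;
        acc >>= 64;
    }

    /* Second, final fold: if the first fold itself carried out, limb 0 is
     * now tiny, so adding 38 once more cannot overflow. */
    r->v[0] += (uint64_t)acc * 38;
}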
+ adcq $0,%rax + + leaq (%rax,%rax,1),%r11 + sarq $63,%rax + shrq $1,%r11 + notq %rax + andq $19,%rax + + subq %rax,%r8 + sbbq $0,%r9 + sbbq $0,%r10 + sbbq $0,%r11 + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + +.Lfe64_to_epilogue: .byte 0xf3,0xc3 .cfi_endproc -.size x25519_fe64_mul,.-x25519_fe64_mul +.size x25519_fe64_tobytes,.-x25519_fe64_tobytes .byte 88,50,53,53,49,57,32,112,114,105,109,105,116,105,118,101,115,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 Index: stable/12/secure/lib/libcrypto/amd64/x86_64-mont.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/x86_64-mont.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/x86_64-mont.S (revision 364963) @@ -1,861 +1,1241 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from x86_64-mont.pl. */ .text .globl bn_mul_mont .type bn_mul_mont,@function .align 16 bn_mul_mont: .cfi_startproc movl %r9d,%r9d movq %rsp,%rax .cfi_def_cfa_register %rax testl $3,%r9d jnz .Lmul_enter cmpl $8,%r9d jb .Lmul_enter + movl OPENSSL_ia32cap_P+8(%rip),%r11d cmpq %rsi,%rdx jne .Lmul4x_enter testl $7,%r9d jz .Lsqr8x_enter jmp .Lmul4x_enter .align 16 .Lmul_enter: pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 negq %r9 movq %rsp,%r11 leaq -16(%rsp,%r9,8),%r10 negq %r9 andq $-1024,%r10 subq %r10,%r11 andq $-4096,%r11 leaq (%r10,%r11,1),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja .Lmul_page_walk jmp .Lmul_page_walk_done .align 16 .Lmul_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja .Lmul_page_walk .Lmul_page_walk_done: movq %rax,8(%rsp,%r9,8) .cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul_body: movq %rdx,%r12 movq (%r8),%r8 movq (%r12),%rbx movq (%rsi),%rax xorq %r14,%r14 xorq %r15,%r15 movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq %rdx,%r13 leaq 1(%r15),%r15 jmp .L1st_enter .align 16 .L1st: addq %rax,%r13 movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%r13 movq %r10,%r11 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 .L1st_enter: mulq %rbx addq %rax,%r11 movq (%rcx,%r15,8),%rax adcq $0,%rdx leaq 1(%r15),%r15 movq %rdx,%r10 mulq %rbp cmpq %r9,%r15 jne .L1st addq %rax,%r13 movq (%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 movq %r10,%r11 xorq %rdx,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r9,8) movq %rdx,(%rsp,%r9,8) leaq 1(%r14),%r14 jmp .Louter .align 16 .Louter: movq (%r12,%r14,8),%rbx xorq %r15,%r15 movq %r8,%rbp movq (%rsp),%r10 mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq 8(%rsp),%r10 movq %rdx,%r13 leaq 1(%r15),%r15 jmp .Linner_enter .align 16 .Linner: addq %rax,%r13 movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 movq (%rsp,%r15,8),%r10 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 .Linner_enter: mulq %rbx addq %rax,%r11 movq (%rcx,%r15,8),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 adcq $0,%r11 leaq 1(%r15),%r15 mulq %rbp cmpq %r9,%r15 jne .Linner addq %rax,%r13 movq (%rsi),%rax adcq $0,%rdx addq %r10,%r13 movq (%rsp,%r15,8),%r10 adcq $0,%rdx movq 
%r13,-16(%rsp,%r15,8) movq %rdx,%r13 xorq %rdx,%rdx addq %r11,%r13 adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r9,8) movq %rdx,(%rsp,%r9,8) leaq 1(%r14),%r14 cmpq %r9,%r14 jb .Louter xorq %r14,%r14 movq (%rsp),%rax movq %r9,%r15 .align 16 .Lsub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) movq 8(%rsp,%r14,8),%rax leaq 1(%r14),%r14 decq %r15 jnz .Lsub sbbq $0,%rax movq $-1,%rbx xorq %rax,%rbx xorq %r14,%r14 movq %r9,%r15 .Lcopy: movq (%rdi,%r14,8),%rcx movq (%rsp,%r14,8),%rdx andq %rbx,%rcx andq %rax,%rdx movq %r9,(%rsp,%r14,8) orq %rcx,%rdx movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy movq 8(%rsp,%r9,8),%rsi .cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size bn_mul_mont,.-bn_mul_mont .type bn_mul4x_mont,@function .align 16 bn_mul4x_mont: .cfi_startproc movl %r9d,%r9d movq %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: + andl $0x80100,%r11d + cmpl $0x80100,%r11d + je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 negq %r9 movq %rsp,%r11 leaq -32(%rsp,%r9,8),%r10 negq %r9 andq $-1024,%r10 subq %r10,%r11 andq $-4096,%r11 leaq (%r10,%r11,1),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja .Lmul4x_page_walk jmp .Lmul4x_page_walk_done .Lmul4x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja .Lmul4x_page_walk .Lmul4x_page_walk_done: movq %rax,8(%rsp,%r9,8) .cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul4x_body: movq %rdi,16(%rsp,%r9,8) movq %rdx,%r12 movq (%r8),%r8 movq (%r12),%rbx movq (%rsi),%rax xorq %r14,%r14 xorq %r15,%r15 movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi),%rax adcq $0,%rdx addq %r11,%rdi leaq 4(%r15),%r15 adcq $0,%rdx movq %rdi,(%rsp) movq %rdx,%r13 jmp .L1st4x .align 16 .L1st4x: mulq %rbx addq %rax,%r10 movq -16(%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%rsp,%r15,8) movq %rdx,%r13 mulq %rbx addq %rax,%r10 movq (%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq 8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx,%r15,8),%rax adcq $0,%rdx leaq 4(%r15),%r15 movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq -16(%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-32(%rsp,%r15,8) movq %rdx,%r13 cmpq %r9,%r15 jb .L1st4x mulq %rbx addq %rax,%r10 movq -16(%rcx,%r15,8),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx,%r15,8),%rax adcq 
$0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%rsp,%r15,8) movq %rdx,%r13 xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi movq %r13,-8(%rsp,%r15,8) movq %rdi,(%rsp,%r15,8) leaq 1(%r14),%r14 .align 4 .Louter4x: movq (%r12,%r14,8),%rbx xorq %r15,%r15 movq (%rsp),%r10 movq %r8,%rbp mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx addq 8(%rsp),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi),%rax adcq $0,%rdx addq %r11,%rdi leaq 4(%r15),%r15 adcq $0,%rdx movq %rdi,(%rsp) movq %rdx,%r13 jmp .Linner4x .align 16 .Linner4x: mulq %rbx addq %rax,%r10 movq -16(%rcx,%r15,8),%rax adcq $0,%rdx addq -16(%rsp,%r15,8),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx,%r15,8),%rax adcq $0,%rdx addq -8(%rsp,%r15,8),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%rsp,%r15,8) movq %rdx,%r13 mulq %rbx addq %rax,%r10 movq (%rcx,%r15,8),%rax adcq $0,%rdx addq (%rsp,%r15,8),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq 8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx,%r15,8),%rax adcq $0,%rdx addq 8(%rsp,%r15,8),%r11 adcq $0,%rdx leaq 4(%r15),%r15 movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq -16(%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-32(%rsp,%r15,8) movq %rdx,%r13 cmpq %r9,%r15 jb .Linner4x mulq %rbx addq %rax,%r10 movq -16(%rcx,%r15,8),%rax adcq $0,%rdx addq -16(%rsp,%r15,8),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%rsp,%r15,8) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx,%r15,8),%rax adcq $0,%rdx addq -8(%rsp,%r15,8),%r11 adcq $0,%rdx leaq 1(%r14),%r14 movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%rsp,%r15,8) movq %rdx,%r13 xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi addq (%rsp,%r9,8),%r13 adcq $0,%rdi movq %r13,-8(%rsp,%r15,8) movq %rdi,(%rsp,%r15,8) cmpq %r9,%r14 jb .Louter4x movq 16(%rsp,%r9,8),%rdi leaq -4(%r9),%r15 movq 0(%rsp),%rax movq 8(%rsp),%rdx shrq $2,%r15 leaq (%rsp),%rsi xorq %r14,%r14 subq 0(%rcx),%rax movq 16(%rsi),%rbx movq 24(%rsi),%rbp sbbq 8(%rcx),%rdx .Lsub4x: movq %rax,0(%rdi,%r14,8) movq %rdx,8(%rdi,%r14,8) sbbq 16(%rcx,%r14,8),%rbx movq 32(%rsi,%r14,8),%rax movq 40(%rsi,%r14,8),%rdx sbbq 24(%rcx,%r14,8),%rbp movq %rbx,16(%rdi,%r14,8) movq %rbp,24(%rdi,%r14,8) sbbq 32(%rcx,%r14,8),%rax movq 48(%rsi,%r14,8),%rbx movq 56(%rsi,%r14,8),%rbp sbbq 40(%rcx,%r14,8),%rdx leaq 4(%r14),%r14 decq %r15 jnz .Lsub4x movq %rax,0(%rdi,%r14,8) movq 32(%rsi,%r14,8),%rax sbbq 16(%rcx,%r14,8),%rbx movq %rdx,8(%rdi,%r14,8) sbbq 24(%rcx,%r14,8),%rbp movq %rbx,16(%rdi,%r14,8) sbbq $0,%rax movq %rbp,24(%rdi,%r14,8) pxor %xmm0,%xmm0 .byte 102,72,15,110,224 pcmpeqd %xmm5,%xmm5 pshufd $0,%xmm4,%xmm4 movq %r9,%r15 pxor %xmm4,%xmm5 shrq $2,%r15 xorl %eax,%eax jmp .Lcopy4x .align 16 .Lcopy4x: movdqa (%rsp,%rax,1),%xmm1 movdqu (%rdi,%rax,1),%xmm2 pand %xmm4,%xmm1 pand %xmm5,%xmm2 movdqa 16(%rsp,%rax,1),%xmm3 movdqa 
%xmm0,(%rsp,%rax,1) por %xmm2,%xmm1 movdqu 16(%rdi,%rax,1),%xmm2 movdqu %xmm1,(%rdi,%rax,1) pand %xmm4,%xmm3 pand %xmm5,%xmm2 movdqa %xmm0,16(%rsp,%rax,1) por %xmm2,%xmm3 movdqu %xmm3,16(%rdi,%rax,1) leaq 32(%rax),%rax decq %r15 jnz .Lcopy4x movq 8(%rsp,%r9,8),%rsi .cfi_def_cfa %rsi, 8 movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul4x_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size bn_mul4x_mont,.-bn_mul4x_mont + .type bn_sqr8x_mont,@function .align 32 bn_sqr8x_mont: .cfi_startproc movq %rsp,%rax .cfi_def_cfa_register %rax .Lsqr8x_enter: pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 .Lsqr8x_prologue: movl %r9d,%r10d shll $3,%r9d shlq $3+2,%r10 negq %r9 leaq -64(%rsp,%r9,2),%r11 movq %rsp,%rbp movq (%r8),%r8 subq %rsi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lsqr8x_sp_alt subq %r11,%rbp leaq -64(%rbp,%r9,2),%rbp jmp .Lsqr8x_sp_done .align 32 .Lsqr8x_sp_alt: leaq 4096-64(,%r9,2),%r10 leaq -64(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp .Lsqr8x_sp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lsqr8x_page_walk jmp .Lsqr8x_page_walk_done .align 16 .Lsqr8x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lsqr8x_page_walk .Lsqr8x_page_walk_done: movq %r9,%r10 negq %r9 movq %r8,32(%rsp) movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lsqr8x_body: .byte 102,72,15,110,209 pxor %xmm0,%xmm0 .byte 102,72,15,110,207 .byte 102,73,15,110,218 + movl OPENSSL_ia32cap_P+8(%rip),%eax + andl $0x80100,%eax + cmpl $0x80100,%eax + jne .Lsqr8x_nox + + call bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx +.byte 102,72,15,126,207 + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_nox: call bn_sqr8x_internal leaq (%rdi,%r9,1),%rbx movq %r9,%rcx movq %r9,%rdx .byte 102,72,15,126,207 sarq $3+2,%rcx jmp .Lsqr8x_sub .align 32 .Lsqr8x_sub: movq 0(%rbx),%r12 movq 8(%rbx),%r13 movq 16(%rbx),%r14 movq 24(%rbx),%r15 leaq 32(%rbx),%rbx sbbq 0(%rbp),%r12 sbbq 8(%rbp),%r13 sbbq 16(%rbp),%r14 sbbq 24(%rbp),%r15 leaq 32(%rbp),%rbp movq %r12,0(%rdi) movq %r13,8(%rdi) movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz .Lsqr8x_sub sbbq $0,%rax leaq (%rbx,%r9,1),%rbx leaq (%rdi,%r9,1),%rdi .byte 102,72,15,110,200 pxor %xmm0,%xmm0 pshufd $0,%xmm1,%xmm1 movq 40(%rsp),%rsi .cfi_def_cfa %rsi,8 jmp .Lsqr8x_cond_copy .align 32 .Lsqr8x_cond_copy: movdqa 0(%rbx),%xmm2 movdqa 16(%rbx),%xmm3 leaq 32(%rbx),%rbx movdqu 0(%rdi),%xmm4 movdqu 16(%rdi),%xmm5 leaq 32(%rdi),%rdi movdqa %xmm0,-32(%rbx) movdqa %xmm0,-16(%rbx) movdqa %xmm0,-32(%rbx,%rdx,1) movdqa %xmm0,-16(%rbx,%rdx,1) pcmpeqd %xmm1,%xmm0 pand %xmm1,%xmm2 pand %xmm1,%xmm3 pand %xmm0,%xmm4 pand %xmm0,%xmm5 pxor %xmm0,%xmm0 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqu %xmm4,-32(%rdi) movdqu %xmm5,-16(%rdi) addq $32,%r9 jnz .Lsqr8x_cond_copy movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp 
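Both .Lcopy4x above and the .Lsqr8x_cond_copy loop just before this point choose between the reduced and the unreduced result with pand/por masks rather than a branch, so neither the branch pattern nor the store pattern depends on the secret-valued borrow. A minimal C sketch of that select, offered only as an illustration (the names are mine, not OpenSSL's):

    #include <stddef.h>
    #include <stdint.h>

    /* Branchless conditional copy: take a[i] when mask is all-ones,
     * keep r[i] when mask is zero.  The callers derive the mask from a
     * borrow, e.g. mask = 0 - (uint64_t)borrow.                         */
    static void cond_copy_sketch(uint64_t *r, const uint64_t *a,
                                 size_t n, uint64_t mask)
    {
        for (size_t i = 0; i < n; i++)
            r[i] = (a[i] & mask) | (r[i] & ~mask);
    }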
.cfi_def_cfa_register %rsp .Lsqr8x_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size bn_sqr8x_mont,.-bn_sqr8x_mont +.type bn_mulx4x_mont,@function +.align 32 +bn_mulx4x_mont: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 + movq %r9,48(%rsp) + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq %rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 
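The bn_mulx4x_mont path that begins here is the BMI2+ADX variant selected by the 0x80100 capability test above: mulx produces a full 128-bit product without touching the flags, and adcx/adox maintain two independent carry chains in CF and OF, so the additions of the low halves and of the previous high halves can be interleaved instead of serializing on a single carry flag. A plain-C sketch of the per-row shape, as an illustration only:

    #include <stdint.h>

    /* One row of the 4-limb multiply: w[0..4] = w[0..3] + a[0..3]*b.
     * The assembly computes the same shape per iteration, but keeps the
     * "add the low halves" chain on CF (adcx) and the "add the previous
     * high half" chain on OF (adox); mulx writes its product without
     * clobbering either flag, which lets the two chains interleave.     */
    static void mulx_row_sketch(uint64_t w[5], const uint64_t a[4],
                                uint64_t b)
    {
        unsigned __int128 acc = 0;
        uint64_t hi_prev = 0;
        for (int i = 0; i < 4; i++) {
            unsigned __int128 p = (unsigned __int128)a[i] * b;
            acc += (unsigned __int128)w[i] + (uint64_t)p + hi_prev;
            w[i] = (uint64_t)acc;
            acc >>= 64;
            hi_prev = (uint64_t)(p >> 64);
        }
        w[4] = (uint64_t)(acc + hi_prev);   /* top limb, cannot overflow */
    }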
+ adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne .Lmulx4x_outer + + leaq 64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz .Lmulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + +.byte 102,73,15,110,207 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + jmp .Lmulx4x_cond_copy + +.align 32 +.Lmulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz .Lmulx4x_cond_copy + + movq %rdx,(%rbx) + + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_mulx4x_mont,.-bn_mulx4x_mont .byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .align 16 Index: stable/12/secure/lib/libcrypto/amd64/x86_64-mont5.S =================================================================== --- stable/12/secure/lib/libcrypto/amd64/x86_64-mont5.S (revision 364962) +++ stable/12/secure/lib/libcrypto/amd64/x86_64-mont5.S (revision 364963) @@ -1,2419 +1,3784 @@ /* $FreeBSD$ */ /* Do not modify. 
This file is auto-generated from x86_64-mont5.pl. */ .text .globl bn_mul_mont_gather5 .type bn_mul_mont_gather5,@function .align 64 bn_mul_mont_gather5: .cfi_startproc movl %r9d,%r9d movq %rsp,%rax .cfi_def_cfa_register %rax testl $7,%r9d jnz .Lmul_enter + movl OPENSSL_ia32cap_P+8(%rip),%r11d jmp .Lmul4x_enter .align 16 .Lmul_enter: movd 8(%rsp),%xmm5 pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 negq %r9 movq %rsp,%r11 leaq -280(%rsp,%r9,8),%r10 negq %r9 andq $-1024,%r10 subq %r10,%r11 andq $-4096,%r11 leaq (%r10,%r11,1),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja .Lmul_page_walk jmp .Lmul_page_walk_done .Lmul_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r11 cmpq %r10,%rsp ja .Lmul_page_walk .Lmul_page_walk_done: leaq .Linc(%rip),%r10 movq %rax,8(%rsp,%r9,8) .cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 .Lmul_body: leaq 128(%rdx),%r12 movdqa 0(%r10),%xmm0 movdqa 16(%r10),%xmm1 leaq 24-112(%rsp,%r9,8),%r10 andq $-16,%r10 pshufd $0,%xmm5,%xmm5 movdqa %xmm1,%xmm4 movdqa %xmm1,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 .byte 0x67 movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,112(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,128(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,144(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,160(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,176(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,192(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,208(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,224(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,240(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,256(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,272(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,288(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,304(%r10) paddd %xmm2,%xmm3 .byte 0x67 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,320(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,336(%r10) pand 64(%r12),%xmm0 pand 80(%r12),%xmm1 pand 96(%r12),%xmm2 movdqa %xmm3,352(%r10) pand 112(%r12),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa -128(%r12),%xmm4 movdqa -112(%r12),%xmm5 movdqa -96(%r12),%xmm2 pand 112(%r10),%xmm4 movdqa -80(%r12),%xmm3 pand 128(%r10),%xmm5 por %xmm4,%xmm0 pand 144(%r10),%xmm2 por %xmm5,%xmm1 pand 160(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa -64(%r12),%xmm4 movdqa -48(%r12),%xmm5 movdqa -32(%r12),%xmm2 pand 176(%r10),%xmm4 movdqa -16(%r12),%xmm3 pand 192(%r10),%xmm5 por %xmm4,%xmm0 pand 208(%r10),%xmm2 por %xmm5,%xmm1 pand 224(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa 0(%r12),%xmm4 movdqa 16(%r12),%xmm5 movdqa 32(%r12),%xmm2 pand 240(%r10),%xmm4 movdqa 48(%r12),%xmm3 pand 256(%r10),%xmm5 por %xmm4,%xmm0 pand 272(%r10),%xmm2 por %xmm5,%xmm1 pand 288(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 por %xmm1,%xmm0 pshufd $0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 leaq 256(%r12),%r12 .byte 102,72,15,126,195 movq (%r8),%r8 movq (%rsi),%rax xorq %r14,%r14 xorq %r15,%r15 movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq %rdx,%r13 leaq 
1(%r15),%r15 jmp .L1st_enter .align 16 .L1st: addq %rax,%r13 movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r11,%r13 movq %r10,%r11 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 .L1st_enter: mulq %rbx addq %rax,%r11 movq (%rcx,%r15,8),%rax adcq $0,%rdx leaq 1(%r15),%r15 movq %rdx,%r10 mulq %rbp cmpq %r9,%r15 jne .L1st addq %rax,%r13 adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 movq %r10,%r11 xorq %rdx,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r9,8) movq %rdx,(%rsp,%r9,8) leaq 1(%r14),%r14 jmp .Louter .align 16 .Louter: leaq 24+128(%rsp,%r9,8),%rdx andq $-16,%rdx pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 movdqa -128(%r12),%xmm0 movdqa -112(%r12),%xmm1 movdqa -96(%r12),%xmm2 movdqa -80(%r12),%xmm3 pand -128(%rdx),%xmm0 pand -112(%rdx),%xmm1 por %xmm0,%xmm4 pand -96(%rdx),%xmm2 por %xmm1,%xmm5 pand -80(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa -64(%r12),%xmm0 movdqa -48(%r12),%xmm1 movdqa -32(%r12),%xmm2 movdqa -16(%r12),%xmm3 pand -64(%rdx),%xmm0 pand -48(%rdx),%xmm1 por %xmm0,%xmm4 pand -32(%rdx),%xmm2 por %xmm1,%xmm5 pand -16(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 0(%r12),%xmm0 movdqa 16(%r12),%xmm1 movdqa 32(%r12),%xmm2 movdqa 48(%r12),%xmm3 pand 0(%rdx),%xmm0 pand 16(%rdx),%xmm1 por %xmm0,%xmm4 pand 32(%rdx),%xmm2 por %xmm1,%xmm5 pand 48(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 64(%r12),%xmm0 movdqa 80(%r12),%xmm1 movdqa 96(%r12),%xmm2 movdqa 112(%r12),%xmm3 pand 64(%rdx),%xmm0 pand 80(%rdx),%xmm1 por %xmm0,%xmm4 pand 96(%rdx),%xmm2 por %xmm1,%xmm5 pand 112(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 por %xmm5,%xmm4 pshufd $0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 leaq 256(%r12),%r12 movq (%rsi),%rax .byte 102,72,15,126,195 xorq %r15,%r15 movq %r8,%rbp movq (%rsp),%r10 mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx imulq %r10,%rbp movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi),%rax adcq $0,%rdx movq 8(%rsp),%r10 movq %rdx,%r13 leaq 1(%r15),%r15 jmp .Linner_enter .align 16 .Linner: addq %rax,%r13 movq (%rsi,%r15,8),%rax adcq $0,%rdx addq %r10,%r13 movq (%rsp,%r15,8),%r10 adcq $0,%rdx movq %r13,-16(%rsp,%r15,8) movq %rdx,%r13 .Linner_enter: mulq %rbx addq %rax,%r11 movq (%rcx,%r15,8),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 adcq $0,%r11 leaq 1(%r15),%r15 mulq %rbp cmpq %r9,%r15 jne .Linner addq %rax,%r13 adcq $0,%rdx addq %r10,%r13 movq (%rsp,%r9,8),%r10 adcq $0,%rdx movq %r13,-16(%rsp,%r9,8) movq %rdx,%r13 xorq %rdx,%rdx addq %r11,%r13 adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%rsp,%r9,8) movq %rdx,(%rsp,%r9,8) leaq 1(%r14),%r14 cmpq %r9,%r14 jb .Louter xorq %r14,%r14 movq (%rsp),%rax leaq (%rsp),%rsi movq %r9,%r15 jmp .Lsub .align 16 .Lsub: sbbq (%rcx,%r14,8),%rax movq %rax,(%rdi,%r14,8) movq 8(%rsi,%r14,8),%rax leaq 1(%r14),%r14 decq %r15 jnz .Lsub sbbq $0,%rax movq $-1,%rbx xorq %rax,%rbx xorq %r14,%r14 movq %r9,%r15 .Lcopy: movq (%rdi,%r14,8),%rcx movq (%rsp,%r14,8),%rdx andq %rbx,%rcx andq %rax,%rdx movq %r14,(%rsp,%r14,8) orq %rcx,%rdx movq %rdx,(%rdi,%r14,8) leaq 1(%r14),%r14 subq $1,%r15 jnz .Lcopy movq 8(%rsp,%r9,8),%rsi .cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size bn_mul_mont_gather5,.-bn_mul_mont_gather5 .type bn_mul4x_mont_gather5,@function .align 32 bn_mul4x_mont_gather5: 
.cfi_startproc .byte 0x67 movq %rsp,%rax .cfi_def_cfa_register %rax .Lmul4x_enter: + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je .Lmulx4x_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 .Lmul4x_prologue: .byte 0x67 shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lmul4xsp_alt subq %r11,%rbp leaq -320(%rbp,%r9,2),%rbp jmp .Lmul4xsp_done .align 32 .Lmul4xsp_alt: leaq 4096-320(,%r9,2),%r10 leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp .Lmul4xsp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lmul4x_page_walk jmp .Lmul4x_page_walk_done .Lmul4x_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lmul4x_page_walk .Lmul4x_page_walk_done: negq %r9 movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lmul4x_body: call mul4x_internal movq 40(%rsp),%rsi .cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lmul4x_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 .type mul4x_internal,@function .align 32 mul4x_internal: .cfi_startproc shlq $5,%r9 movd 8(%rax),%xmm5 leaq .Linc(%rip),%rax leaq 128(%rdx,%r9,1),%r13 shrq $5,%r9 movdqa 0(%rax),%xmm0 movdqa 16(%rax),%xmm1 leaq 88-112(%rsp,%r9,1),%r10 leaq 128(%rdx),%r12 pshufd $0,%xmm5,%xmm5 movdqa %xmm1,%xmm4 .byte 0x67,0x67 movdqa %xmm1,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 .byte 0x67 movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,112(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,128(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,144(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,160(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,176(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,192(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,208(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,224(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,240(%r10) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,256(%r10) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,272(%r10) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,288(%r10) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,304(%r10) paddd %xmm2,%xmm3 .byte 0x67 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,320(%r10) pcmpeqd %xmm5,%xmm3 movdqa %xmm2,336(%r10) pand 64(%r12),%xmm0 pand 80(%r12),%xmm1 pand 96(%r12),%xmm2 movdqa %xmm3,352(%r10) pand 112(%r12),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa -128(%r12),%xmm4 movdqa -112(%r12),%xmm5 movdqa -96(%r12),%xmm2 pand 112(%r10),%xmm4 movdqa -80(%r12),%xmm3 pand 128(%r10),%xmm5 por %xmm4,%xmm0 pand 144(%r10),%xmm2 por %xmm5,%xmm1 pand 160(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa -64(%r12),%xmm4 movdqa -48(%r12),%xmm5 movdqa -32(%r12),%xmm2 pand 176(%r10),%xmm4 
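The long pcmpeqd/pand/por sequence above (and its twin in bn_mul_mont_gather5) is a constant-time gather: comparison masks are built from the .Linc counters against the secret window index, every entry of the interleaved table is read, and only the matching one survives the AND/OR accumulation, so cache and branch behaviour do not depend on the index. A minimal C model of the same idea, for illustration only (names are hypothetical):

    #include <stddef.h>
    #include <stdint.h>

    /* Constant-time table lookup in the spirit of the gather above:
     * read all 'nent' entries of 'width' words each and accumulate only
     * the one at secret index 'idx' through an all-ones/all-zeros mask. */
    static void gather_sketch(uint64_t *out, const uint64_t *table,
                              size_t nent, size_t width, size_t idx)
    {
        for (size_t j = 0; j < width; j++)
            out[j] = 0;
        for (size_t i = 0; i < nent; i++) {
            uint64_t mask = 0 - (uint64_t)(i == idx);
            for (size_t j = 0; j < width; j++)
                out[j] |= table[i * width + j] & mask;
        }
    }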
movdqa -16(%r12),%xmm3 pand 192(%r10),%xmm5 por %xmm4,%xmm0 pand 208(%r10),%xmm2 por %xmm5,%xmm1 pand 224(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 movdqa 0(%r12),%xmm4 movdqa 16(%r12),%xmm5 movdqa 32(%r12),%xmm2 pand 240(%r10),%xmm4 movdqa 48(%r12),%xmm3 pand 256(%r10),%xmm5 por %xmm4,%xmm0 pand 272(%r10),%xmm2 por %xmm5,%xmm1 pand 288(%r10),%xmm3 por %xmm2,%xmm0 por %xmm3,%xmm1 por %xmm1,%xmm0 pshufd $0x4e,%xmm0,%xmm1 por %xmm1,%xmm0 leaq 256(%r12),%r12 .byte 102,72,15,126,195 movq %r13,16+8(%rsp) movq %rdi,56+8(%rsp) movq (%r8),%r8 movq (%rsi),%rax leaq (%rsi,%r9,1),%rsi negq %r9 movq %r8,%rbp mulq %rbx movq %rax,%r10 movq (%rcx),%rax imulq %r10,%rbp leaq 64+8(%rsp),%r14 movq %rdx,%r11 mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 jmp .L1st4x .align 32 .L1st4x: mulq %rbx addq %rax,%r10 movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%r14) movq %rdx,%r13 mulq %rbx addq %rax,%r10 movq 0(%rcx),%rax adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq 8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdi,(%r14) movq %rdx,%r13 addq $32,%r15 jnz .L1st4x mulq %rbx addq %rax,%r10 movq -16(%rcx),%rax leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %r13,-24(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx),%rax adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %rdi,-16(%r14) movq %rdx,%r13 leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi movq %r13,-8(%r14) jmp .Louter4x .align 32 .Louter4x: leaq 16+128(%r14),%rdx pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 movdqa -128(%r12),%xmm0 movdqa -112(%r12),%xmm1 movdqa -96(%r12),%xmm2 movdqa -80(%r12),%xmm3 pand -128(%rdx),%xmm0 pand -112(%rdx),%xmm1 por %xmm0,%xmm4 pand -96(%rdx),%xmm2 por %xmm1,%xmm5 pand -80(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa -64(%r12),%xmm0 movdqa -48(%r12),%xmm1 movdqa -32(%r12),%xmm2 movdqa -16(%r12),%xmm3 pand -64(%rdx),%xmm0 pand -48(%rdx),%xmm1 por %xmm0,%xmm4 pand -32(%rdx),%xmm2 por %xmm1,%xmm5 pand -16(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 0(%r12),%xmm0 movdqa 16(%r12),%xmm1 movdqa 32(%r12),%xmm2 movdqa 48(%r12),%xmm3 pand 0(%rdx),%xmm0 pand 16(%rdx),%xmm1 por %xmm0,%xmm4 pand 32(%rdx),%xmm2 por %xmm1,%xmm5 pand 48(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 64(%r12),%xmm0 movdqa 80(%r12),%xmm1 movdqa 96(%r12),%xmm2 movdqa 112(%r12),%xmm3 pand 64(%rdx),%xmm0 pand 80(%rdx),%xmm1 por %xmm0,%xmm4 pand 96(%rdx),%xmm2 por %xmm1,%xmm5 pand 112(%rdx),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 por %xmm5,%xmm4 pshufd $0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 leaq 256(%r12),%r12 .byte 102,72,15,126,195 movq (%r14,%r9,1),%r10 movq 
%r8,%rbp mulq %rbx addq %rax,%r10 movq (%rcx),%rax adcq $0,%rdx imulq %r10,%rbp movq %rdx,%r11 movq %rdi,(%r14) leaq (%r14,%r9,1),%r14 mulq %rbp addq %rax,%r10 movq 8(%rsi,%r9,1),%rax adcq $0,%rdx movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi leaq 32(%r9),%r15 leaq 32(%rcx),%rcx adcq $0,%rdx movq %rdx,%r13 jmp .Linner4x .align 32 .Linner4x: mulq %rbx addq %rax,%r10 movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdi,-32(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq -8(%rcx),%rax adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %r13,-24(%r14) movq %rdx,%r13 mulq %rbx addq %rax,%r10 movq 0(%rcx),%rax adcq $0,%rdx addq (%r14),%r10 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq 8(%rsi,%r15,1),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdi,-16(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq 8(%rcx),%rax adcq $0,%rdx addq 8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq 16(%rsi,%r15,1),%rax adcq $0,%rdx addq %r11,%rdi leaq 32(%rcx),%rcx adcq $0,%rdx movq %r13,-8(%r14) movq %rdx,%r13 addq $32,%r15 jnz .Linner4x mulq %rbx addq %rax,%r10 movq -16(%rcx),%rax adcq $0,%rdx addq 16(%r14),%r10 leaq 32(%r14),%r14 adcq $0,%rdx movq %rdx,%r11 mulq %rbp addq %rax,%r13 movq -8(%rsi),%rax adcq $0,%rdx addq %r10,%r13 adcq $0,%rdx movq %rdi,-32(%r14) movq %rdx,%rdi mulq %rbx addq %rax,%r11 movq %rbp,%rax movq -8(%rcx),%rbp adcq $0,%rdx addq -8(%r14),%r11 adcq $0,%rdx movq %rdx,%r10 mulq %rbp addq %rax,%rdi movq (%rsi,%r9,1),%rax adcq $0,%rdx addq %r11,%rdi adcq $0,%rdx movq %r13,-24(%r14) movq %rdx,%r13 movq %rdi,-16(%r14) leaq (%rcx,%r9,1),%rcx xorq %rdi,%rdi addq %r10,%r13 adcq $0,%rdi addq (%r14),%r13 adcq $0,%rdi movq %r13,-8(%r14) cmpq 16+8(%rsp),%r12 jb .Louter4x xorq %rax,%rax subq %r13,%rbp adcq %r15,%r15 orq %r15,%rdi subq %rdi,%rax leaq (%r14,%r9,1),%rbx movq (%rcx),%r12 leaq (%rcx),%rbp movq %r9,%rcx sarq $3+2,%rcx movq 56+8(%rsp),%rdi decq %r12 xorq %r10,%r10 movq 8(%rbp),%r13 movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp .Lsqr4x_sub_entry .cfi_endproc .size mul4x_internal,.-mul4x_internal .globl bn_power5 .type bn_power5,@function .align 32 bn_power5: .cfi_startproc movq %rsp,%rax .cfi_def_cfa_register %rax + movl OPENSSL_ia32cap_P+8(%rip),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + je .Lpowerx5_enter pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 .Lpower5_prologue: shll $3,%r9d leal (%r9,%r9,2),%r10d negq %r9 movq (%r8),%r8 leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lpwr_sp_alt subq %r11,%rbp leaq -320(%rbp,%r9,2),%rbp jmp .Lpwr_sp_done .align 32 .Lpwr_sp_alt: leaq 4096-320(,%r9,2),%r10 leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp .Lpwr_sp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lpwr_page_walk jmp .Lpwr_page_walk_done .Lpwr_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lpwr_page_walk .Lpwr_page_walk_done: 
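bn_power5 (and bn_powerx5 further down) performs one step of 5-bit fixed-window Montgomery exponentiation: the five __bn_sqr8x_internal/__bn_post4x_internal pairs that follow square the accumulator, and the final mul4x_internal multiplies it by the table entry gathered for the next five exponent bits. A schematic C sketch of that step; the helper names and the num_t width are purely illustrative, not OpenSSL APIs:

    #include <stdint.h>

    typedef struct { uint64_t limb[8]; } num_t;   /* example width only  */

    /* One power5-style step: square five times, then multiply by the
     * precomputed power picked (constant-time) for the next 5 bits.     */
    static void power5_step_sketch(num_t *acc, const num_t table[32],
                                   unsigned window,
                                   void (*mont_sqr)(num_t *),
                                   void (*mont_mul)(num_t *, const num_t *),
                                   void (*gather)(num_t *, const num_t *,
                                                  unsigned))
    {
        num_t picked;
        for (int i = 0; i < 5; i++)
            mont_sqr(acc);                        /* acc = acc^(2^5)     */
        gather(&picked, table, window & 31);      /* constant-time pick  */
        mont_mul(acc, &picked);                   /* acc *= g^window     */
    }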
movq %r9,%r10 negq %r9 movq %r8,32(%rsp) movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lpower5_body: .byte 102,72,15,110,207 .byte 102,72,15,110,209 .byte 102,73,15,110,218 .byte 102,72,15,110,226 call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal call __bn_sqr8x_internal call __bn_post4x_internal .byte 102,72,15,126,209 .byte 102,72,15,126,226 movq %rsi,%rdi movq 40(%rsp),%rax leaq 32(%rsp),%r8 call mul4x_internal movq 40(%rsp),%rsi .cfi_def_cfa %rsi,8 movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lpower5_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size bn_power5,.-bn_power5 .globl bn_sqr8x_internal .hidden bn_sqr8x_internal .type bn_sqr8x_internal,@function .align 32 bn_sqr8x_internal: __bn_sqr8x_internal: .cfi_startproc leaq 32(%r10),%rbp leaq (%rsi,%r9,1),%rsi movq %r9,%rcx movq -32(%rsi,%rbp,1),%r14 leaq 48+8(%rsp,%r9,2),%rdi movq -24(%rsi,%rbp,1),%rax leaq -32(%rdi,%rbp,1),%rdi movq -16(%rsi,%rbp,1),%rbx movq %rax,%r15 mulq %r14 movq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 movq %r10,-24(%rdi,%rbp,1) mulq %r14 addq %rax,%r11 movq %rbx,%rax adcq $0,%rdx movq %r11,-16(%rdi,%rbp,1) movq %rdx,%r10 movq -8(%rsi,%rbp,1),%rbx mulq %r15 movq %rax,%r12 movq %rbx,%rax movq %rdx,%r13 leaq (%rbp),%rcx mulq %r14 addq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 adcq $0,%r11 addq %r12,%r10 adcq $0,%r11 movq %r10,-8(%rdi,%rcx,1) jmp .Lsqr4x_1st .align 32 .Lsqr4x_1st: movq (%rsi,%rcx,1),%rbx mulq %r15 addq %rax,%r13 movq %rbx,%rax movq %rdx,%r12 adcq $0,%r12 mulq %r14 addq %rax,%r11 movq %rbx,%rax movq 8(%rsi,%rcx,1),%rbx movq %rdx,%r10 adcq $0,%r10 addq %r13,%r11 adcq $0,%r10 mulq %r15 addq %rax,%r12 movq %rbx,%rax movq %r11,(%rdi,%rcx,1) movq %rdx,%r13 adcq $0,%r13 mulq %r14 addq %rax,%r10 movq %rbx,%rax movq 16(%rsi,%rcx,1),%rbx movq %rdx,%r11 adcq $0,%r11 addq %r12,%r10 adcq $0,%r11 mulq %r15 addq %rax,%r13 movq %rbx,%rax movq %r10,8(%rdi,%rcx,1) movq %rdx,%r12 adcq $0,%r12 mulq %r14 addq %rax,%r11 movq %rbx,%rax movq 24(%rsi,%rcx,1),%rbx movq %rdx,%r10 adcq $0,%r10 addq %r13,%r11 adcq $0,%r10 mulq %r15 addq %rax,%r12 movq %rbx,%rax movq %r11,16(%rdi,%rcx,1) movq %rdx,%r13 adcq $0,%r13 leaq 32(%rcx),%rcx mulq %r14 addq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 adcq $0,%r11 addq %r12,%r10 adcq $0,%r11 movq %r10,-8(%rdi,%rcx,1) cmpq $0,%rcx jne .Lsqr4x_1st mulq %r15 addq %rax,%r13 leaq 16(%rbp),%rbp adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,(%rdi) movq %rdx,%r12 movq %rdx,8(%rdi) jmp .Lsqr4x_outer .align 32 .Lsqr4x_outer: movq -32(%rsi,%rbp,1),%r14 leaq 48+8(%rsp,%r9,2),%rdi movq -24(%rsi,%rbp,1),%rax leaq -32(%rdi,%rbp,1),%rdi movq -16(%rsi,%rbp,1),%rbx movq %rax,%r15 mulq %r14 movq -24(%rdi,%rbp,1),%r10 addq %rax,%r10 movq %rbx,%rax adcq $0,%rdx movq %r10,-24(%rdi,%rbp,1) movq %rdx,%r11 mulq %r14 addq %rax,%r11 movq %rbx,%rax adcq $0,%rdx addq -16(%rdi,%rbp,1),%r11 movq %rdx,%r10 adcq $0,%r10 movq %r11,-16(%rdi,%rbp,1) xorq %r12,%r12 movq -8(%rsi,%rbp,1),%rbx mulq %r15 addq %rax,%r12 movq %rbx,%rax adcq $0,%rdx addq -8(%rdi,%rbp,1),%r12 movq %rdx,%r13 adcq $0,%r13 mulq %r14 addq %rax,%r10 movq %rbx,%rax adcq $0,%rdx addq %r12,%r10 movq %rdx,%r11 adcq $0,%r11 movq %r10,-8(%rdi,%rbp,1) leaq 
(%rbp),%rcx jmp .Lsqr4x_inner .align 32 .Lsqr4x_inner: movq (%rsi,%rcx,1),%rbx mulq %r15 addq %rax,%r13 movq %rbx,%rax movq %rdx,%r12 adcq $0,%r12 addq (%rdi,%rcx,1),%r13 adcq $0,%r12 .byte 0x67 mulq %r14 addq %rax,%r11 movq %rbx,%rax movq 8(%rsi,%rcx,1),%rbx movq %rdx,%r10 adcq $0,%r10 addq %r13,%r11 adcq $0,%r10 mulq %r15 addq %rax,%r12 movq %r11,(%rdi,%rcx,1) movq %rbx,%rax movq %rdx,%r13 adcq $0,%r13 addq 8(%rdi,%rcx,1),%r12 leaq 16(%rcx),%rcx adcq $0,%r13 mulq %r14 addq %rax,%r10 movq %rbx,%rax adcq $0,%rdx addq %r12,%r10 movq %rdx,%r11 adcq $0,%r11 movq %r10,-8(%rdi,%rcx,1) cmpq $0,%rcx jne .Lsqr4x_inner .byte 0x67 mulq %r15 addq %rax,%r13 adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,(%rdi) movq %rdx,%r12 movq %rdx,8(%rdi) addq $16,%rbp jnz .Lsqr4x_outer movq -32(%rsi),%r14 leaq 48+8(%rsp,%r9,2),%rdi movq -24(%rsi),%rax leaq -32(%rdi,%rbp,1),%rdi movq -16(%rsi),%rbx movq %rax,%r15 mulq %r14 addq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 adcq $0,%r11 mulq %r14 addq %rax,%r11 movq %rbx,%rax movq %r10,-24(%rdi) movq %rdx,%r10 adcq $0,%r10 addq %r13,%r11 movq -8(%rsi),%rbx adcq $0,%r10 mulq %r15 addq %rax,%r12 movq %rbx,%rax movq %r11,-16(%rdi) movq %rdx,%r13 adcq $0,%r13 mulq %r14 addq %rax,%r10 movq %rbx,%rax movq %rdx,%r11 adcq $0,%r11 addq %r12,%r10 adcq $0,%r11 movq %r10,-8(%rdi) mulq %r15 addq %rax,%r13 movq -16(%rsi),%rax adcq $0,%rdx addq %r11,%r13 adcq $0,%rdx movq %r13,(%rdi) movq %rdx,%r12 movq %rdx,8(%rdi) mulq %rbx addq $16,%rbp xorq %r14,%r14 subq %r9,%rbp xorq %r15,%r15 addq %r12,%rax adcq $0,%rdx movq %rax,8(%rdi) movq %rdx,16(%rdi) movq %r15,24(%rdi) movq -16(%rsi,%rbp,1),%rax leaq 48+8(%rsp),%rdi xorq %r10,%r10 movq 8(%rdi),%r11 leaq (%r14,%r10,2),%r12 shrq $63,%r10 leaq (%rcx,%r11,2),%r13 shrq $63,%r11 orq %r10,%r13 movq 16(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 24(%rdi),%r11 adcq %rax,%r12 movq -8(%rsi,%rbp,1),%rax movq %r12,(%rdi) adcq %rdx,%r13 leaq (%r14,%r10,2),%rbx movq %r13,8(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r8 shrq $63,%r11 orq %r10,%r8 movq 32(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 40(%rdi),%r11 adcq %rax,%rbx movq 0(%rsi,%rbp,1),%rax movq %rbx,16(%rdi) adcq %rdx,%r8 leaq 16(%rbp),%rbp movq %r8,24(%rdi) sbbq %r15,%r15 leaq 64(%rdi),%rdi jmp .Lsqr4x_shift_n_add .align 32 .Lsqr4x_shift_n_add: leaq (%r14,%r10,2),%r12 shrq $63,%r10 leaq (%rcx,%r11,2),%r13 shrq $63,%r11 orq %r10,%r13 movq -16(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq -8(%rdi),%r11 adcq %rax,%r12 movq -8(%rsi,%rbp,1),%rax movq %r12,-32(%rdi) adcq %rdx,%r13 leaq (%r14,%r10,2),%rbx movq %r13,-24(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r8 shrq $63,%r11 orq %r10,%r8 movq 0(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 8(%rdi),%r11 adcq %rax,%rbx movq 0(%rsi,%rbp,1),%rax movq %rbx,-16(%rdi) adcq %rdx,%r8 leaq (%r14,%r10,2),%r12 movq %r8,-8(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r13 shrq $63,%r11 orq %r10,%r13 movq 16(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 24(%rdi),%r11 adcq %rax,%r12 movq 8(%rsi,%rbp,1),%rax movq %r12,0(%rdi) adcq %rdx,%r13 leaq (%r14,%r10,2),%rbx movq %r13,8(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r8 shrq $63,%r11 orq %r10,%r8 movq 32(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq 40(%rdi),%r11 adcq %rax,%rbx movq 16(%rsi,%rbp,1),%rax movq %rbx,16(%rdi) adcq %rdx,%r8 movq %r8,24(%rdi) sbbq %r15,%r15 leaq 64(%rdi),%rdi addq $32,%rbp jnz .Lsqr4x_shift_n_add leaq (%r14,%r10,2),%r12 .byte 0x67 shrq $63,%r10 leaq (%rcx,%r11,2),%r13 shrq $63,%r11 orq %r10,%r13 movq 
-16(%rdi),%r10 movq %r11,%r14 mulq %rax negq %r15 movq -8(%rdi),%r11 adcq %rax,%r12 movq -8(%rsi),%rax movq %r12,-32(%rdi) adcq %rdx,%r13 leaq (%r14,%r10,2),%rbx movq %r13,-24(%rdi) sbbq %r15,%r15 shrq $63,%r10 leaq (%rcx,%r11,2),%r8 shrq $63,%r11 orq %r10,%r8 mulq %rax negq %r15 adcq %rax,%rbx adcq %rdx,%r8 movq %rbx,-16(%rdi) movq %r8,-8(%rdi) .byte 102,72,15,126,213 __bn_sqr8x_reduction: xorq %rax,%rax leaq (%r9,%rbp,1),%rcx leaq 48+8(%rsp,%r9,2),%rdx movq %rcx,0+8(%rsp) leaq 48+8(%rsp,%r9,1),%rdi movq %rdx,8+8(%rsp) negq %r9 jmp .L8x_reduction_loop .align 32 .L8x_reduction_loop: leaq (%rdi,%r9,1),%rdi .byte 0x66 movq 0(%rdi),%rbx movq 8(%rdi),%r9 movq 16(%rdi),%r10 movq 24(%rdi),%r11 movq 32(%rdi),%r12 movq 40(%rdi),%r13 movq 48(%rdi),%r14 movq 56(%rdi),%r15 movq %rax,(%rdx) leaq 64(%rdi),%rdi .byte 0x67 movq %rbx,%r8 imulq 32+8(%rsp),%rbx movq 0(%rbp),%rax movl $8,%ecx jmp .L8x_reduce .align 32 .L8x_reduce: mulq %rbx movq 8(%rbp),%rax negq %r8 movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 movq %rbx,48-8+8(%rsp,%rcx,8) movq %rdx,%r9 adcq $0,%r9 mulq %rbx addq %rax,%r10 movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq 32+8(%rsp),%rsi movq %rdx,%r10 adcq $0,%r10 mulq %rbx addq %rax,%r11 movq 32(%rbp),%rax adcq $0,%rdx imulq %r8,%rsi addq %r11,%r10 movq %rdx,%r11 adcq $0,%r11 mulq %rbx addq %rax,%r12 movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 adcq $0,%r12 mulq %rbx addq %rax,%r13 movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 adcq $0,%r13 mulq %rbx addq %rax,%r14 movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 adcq $0,%r14 mulq %rbx movq %rsi,%rbx addq %rax,%r15 movq 0(%rbp),%rax adcq $0,%rdx addq %r15,%r14 movq %rdx,%r15 adcq $0,%r15 decl %ecx jnz .L8x_reduce leaq 64(%rbp),%rbp xorq %rax,%rax movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae .L8x_no_tail .byte 0x66 addq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 sbbq %rsi,%rsi movq 48+56+8(%rsp),%rbx movl $8,%ecx movq 0(%rbp),%rax jmp .L8x_tail .align 32 .L8x_tail: mulq %rbx addq %rax,%r8 movq 8(%rbp),%rax movq %r8,(%rdi) movq %rdx,%r8 adcq $0,%r8 mulq %rbx addq %rax,%r9 movq 16(%rbp),%rax adcq $0,%rdx addq %r9,%r8 leaq 8(%rdi),%rdi movq %rdx,%r9 adcq $0,%r9 mulq %rbx addq %rax,%r10 movq 24(%rbp),%rax adcq $0,%rdx addq %r10,%r9 movq %rdx,%r10 adcq $0,%r10 mulq %rbx addq %rax,%r11 movq 32(%rbp),%rax adcq $0,%rdx addq %r11,%r10 movq %rdx,%r11 adcq $0,%r11 mulq %rbx addq %rax,%r12 movq 40(%rbp),%rax adcq $0,%rdx addq %r12,%r11 movq %rdx,%r12 adcq $0,%r12 mulq %rbx addq %rax,%r13 movq 48(%rbp),%rax adcq $0,%rdx addq %r13,%r12 movq %rdx,%r13 adcq $0,%r13 mulq %rbx addq %rax,%r14 movq 56(%rbp),%rax adcq $0,%rdx addq %r14,%r13 movq %rdx,%r14 adcq $0,%r14 mulq %rbx movq 48-16+8(%rsp,%rcx,8),%rbx addq %rax,%r15 adcq $0,%rdx addq %r15,%r14 movq 0(%rbp),%rax movq %rdx,%r15 adcq $0,%r15 decl %ecx jnz .L8x_tail leaq 64(%rbp),%rbp movq 8+8(%rsp),%rdx cmpq 0+8(%rsp),%rbp jae .L8x_tail_done movq 48+56+8(%rsp),%rbx negq %rsi movq 0(%rbp),%rax adcq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 sbbq %rsi,%rsi movl $8,%ecx jmp .L8x_tail .align 32 .L8x_tail_done: xorq %rax,%rax addq (%rdx),%r8 adcq $0,%r9 adcq $0,%r10 adcq $0,%r11 adcq $0,%r12 adcq $0,%r13 adcq $0,%r14 adcq $0,%r15 adcq $0,%rax negq %rsi .L8x_no_tail: adcq 0(%rdi),%r8 adcq 8(%rdi),%r9 adcq 16(%rdi),%r10 
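The .L8x_reduce/.L8x_tail loops above are an unrolled word-serial Montgomery reduction: for each limb of the double-width value a multiplier m = t[i]*n0 mod 2^64 is formed (n0, the negated modular inverse of N mod 2^64, is the value read from 32+8(%rsp)) and m*N is added so that the low limb cancels; the surviving high half, still possibly >= N, is then handed to the subtraction and conditional-copy code. A reference C sketch of that recurrence, for illustration only:

    #include <stddef.h>
    #include <stdint.h>

    /* Word-serial Montgomery reduction: t[0..2n-1] holds a*b with
     * a,b < N and n0 = -N^{-1} mod 2^64.  On return, r[0..n-1] plus the
     * returned carry times 2^(64n) equals a*b*R^{-1} (R = 2^(64n)) and
     * is < 2*N; the caller subtracts N once, exactly as the masked
     * subtraction/copy code above does.                                 */
    static uint64_t mont_reduce_sketch(uint64_t *r, uint64_t *t,
                                       const uint64_t *N, size_t n,
                                       uint64_t n0)
    {
        uint64_t top = 0;                      /* carry past limb 2n-1   */
        for (size_t i = 0; i < n; i++) {
            uint64_t m = t[i] * n0;            /* makes limb i of t+m*N 0 */
            unsigned __int128 acc = 0;
            for (size_t j = 0; j < n; j++) {   /* t += m*N*2^(64*i)      */
                acc += (unsigned __int128)m * N[j] + t[i + j];
                t[i + j] = (uint64_t)acc;
                acc >>= 64;
            }
            for (size_t k = i + n; k < 2 * n; k++) {   /* ripple carry   */
                acc += t[k];
                t[k] = (uint64_t)acc;
                acc >>= 64;
            }
            top += (uint64_t)acc;
        }
        for (size_t i = 0; i < n; i++)         /* high half is the result */
            r[i] = t[n + i];
        return top;                            /* 0 or 1                  */
    }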
adcq 24(%rdi),%r11 adcq 32(%rdi),%r12 adcq 40(%rdi),%r13 adcq 48(%rdi),%r14 adcq 56(%rdi),%r15 adcq $0,%rax movq -8(%rbp),%rcx xorq %rsi,%rsi .byte 102,72,15,126,213 movq %r8,0(%rdi) movq %r9,8(%rdi) .byte 102,73,15,126,217 movq %r10,16(%rdi) movq %r11,24(%rdi) movq %r12,32(%rdi) movq %r13,40(%rdi) movq %r14,48(%rdi) movq %r15,56(%rdi) leaq 64(%rdi),%rdi cmpq %rdx,%rdi jb .L8x_reduction_loop .byte 0xf3,0xc3 .cfi_endproc .size bn_sqr8x_internal,.-bn_sqr8x_internal .type __bn_post4x_internal,@function .align 32 __bn_post4x_internal: .cfi_startproc movq 0(%rbp),%r12 leaq (%rdi,%r9,1),%rbx movq %r9,%rcx .byte 102,72,15,126,207 negq %rax .byte 102,72,15,126,206 sarq $3+2,%rcx decq %r12 xorq %r10,%r10 movq 8(%rbp),%r13 movq 16(%rbp),%r14 movq 24(%rbp),%r15 jmp .Lsqr4x_sub_entry .align 16 .Lsqr4x_sub: movq 0(%rbp),%r12 movq 8(%rbp),%r13 movq 16(%rbp),%r14 movq 24(%rbp),%r15 .Lsqr4x_sub_entry: leaq 32(%rbp),%rbp notq %r12 notq %r13 notq %r14 notq %r15 andq %rax,%r12 andq %rax,%r13 andq %rax,%r14 andq %rax,%r15 negq %r10 adcq 0(%rbx),%r12 adcq 8(%rbx),%r13 adcq 16(%rbx),%r14 adcq 24(%rbx),%r15 movq %r12,0(%rdi) leaq 32(%rbx),%rbx movq %r13,8(%rdi) sbbq %r10,%r10 movq %r14,16(%rdi) movq %r15,24(%rdi) leaq 32(%rdi),%rdi incq %rcx jnz .Lsqr4x_sub movq %r9,%r10 negq %r9 .byte 0xf3,0xc3 .cfi_endproc .size __bn_post4x_internal,.-__bn_post4x_internal .globl bn_from_montgomery .type bn_from_montgomery,@function .align 32 bn_from_montgomery: .cfi_startproc testl $7,%r9d jz bn_from_mont8x xorl %eax,%eax .byte 0xf3,0xc3 .cfi_endproc .size bn_from_montgomery,.-bn_from_montgomery .type bn_from_mont8x,@function .align 32 bn_from_mont8x: .cfi_startproc .byte 0x67 movq %rsp,%rax .cfi_def_cfa_register %rax pushq %rbx .cfi_offset %rbx,-16 pushq %rbp .cfi_offset %rbp,-24 pushq %r12 .cfi_offset %r12,-32 pushq %r13 .cfi_offset %r13,-40 pushq %r14 .cfi_offset %r14,-48 pushq %r15 .cfi_offset %r15,-56 .Lfrom_prologue: shll $3,%r9d leaq (%r9,%r9,2),%r10 negq %r9 movq (%r8),%r8 leaq -320(%rsp,%r9,2),%r11 movq %rsp,%rbp subq %rdi,%r11 andq $4095,%r11 cmpq %r11,%r10 jb .Lfrom_sp_alt subq %r11,%rbp leaq -320(%rbp,%r9,2),%rbp jmp .Lfrom_sp_done .align 32 .Lfrom_sp_alt: leaq 4096-320(,%r9,2),%r10 leaq -320(%rbp,%r9,2),%rbp subq %r10,%r11 movq $0,%r10 cmovcq %r10,%r11 subq %r11,%rbp .Lfrom_sp_done: andq $-64,%rbp movq %rsp,%r11 subq %rbp,%r11 andq $-4096,%r11 leaq (%r11,%rbp,1),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lfrom_page_walk jmp .Lfrom_page_walk_done .Lfrom_page_walk: leaq -4096(%rsp),%rsp movq (%rsp),%r10 cmpq %rbp,%rsp ja .Lfrom_page_walk .Lfrom_page_walk_done: movq %r9,%r10 negq %r9 movq %r8,32(%rsp) movq %rax,40(%rsp) .cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 .Lfrom_body: movq %r9,%r11 leaq 48(%rsp),%rax pxor %xmm0,%xmm0 jmp .Lmul_by_1 .align 32 .Lmul_by_1: movdqu (%rsi),%xmm1 movdqu 16(%rsi),%xmm2 movdqu 32(%rsi),%xmm3 movdqa %xmm0,(%rax,%r9,1) movdqu 48(%rsi),%xmm4 movdqa %xmm0,16(%rax,%r9,1) .byte 0x48,0x8d,0xb6,0x40,0x00,0x00,0x00 movdqa %xmm1,(%rax) movdqa %xmm0,32(%rax,%r9,1) movdqa %xmm2,16(%rax) movdqa %xmm0,48(%rax,%r9,1) movdqa %xmm3,32(%rax) movdqa %xmm4,48(%rax) leaq 64(%rax),%rax subq $64,%r11 jnz .Lmul_by_1 .byte 102,72,15,110,207 .byte 102,72,15,110,209 .byte 0x67 movq %rcx,%rbp .byte 102,73,15,110,218 + movl OPENSSL_ia32cap_P+8(%rip),%r11d + andl $0x80108,%r11d + cmpl $0x80108,%r11d + jne .Lfrom_mont_nox + + leaq (%rax,%r9,1),%rdi + call __bn_sqrx8x_reduction + call __bn_postx4x_internal + + pxor %xmm0,%xmm0 + leaq 48(%rsp),%rax + jmp .Lfrom_mont_zero + +.align 32 +.Lfrom_mont_nox: call 
__bn_sqr8x_reduction call __bn_post4x_internal pxor %xmm0,%xmm0 leaq 48(%rsp),%rax jmp .Lfrom_mont_zero .align 32 .Lfrom_mont_zero: movq 40(%rsp),%rsi .cfi_def_cfa %rsi,8 movdqa %xmm0,0(%rax) movdqa %xmm0,16(%rax) movdqa %xmm0,32(%rax) movdqa %xmm0,48(%rax) leaq 64(%rax),%rax subq $32,%r9 jnz .Lfrom_mont_zero movq $1,%rax movq -48(%rsi),%r15 .cfi_restore %r15 movq -40(%rsi),%r14 .cfi_restore %r14 movq -32(%rsi),%r13 .cfi_restore %r13 movq -24(%rsi),%r12 .cfi_restore %r12 movq -16(%rsi),%rbp .cfi_restore %rbp movq -8(%rsi),%rbx .cfi_restore %rbx leaq (%rsi),%rsp .cfi_def_cfa_register %rsp .Lfrom_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size bn_from_mont8x,.-bn_from_mont8x +.type bn_mulx4x_mont_gather5,@function +.align 32 +bn_mulx4x_mont_gather5: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lmulx4x_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lmulx4xsp_done + +.Lmulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lmulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lmulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 + +.type mulx4x_internal,@function +.align 32 +mulx4x_internal: +.cfi_startproc + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq .Linc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + 
paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq %rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 + pxor %xmm5,%xmm5 + 
movdqa -128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi +.byte 102,72,15,126,194 + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq %r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq %r10,%rdi + jb .Lmulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry +.cfi_endproc +.size 
mulx4x_internal,.-mulx4x_internal +.type bn_powerx5,@function +.align 32 +bn_powerx5: +.cfi_startproc + movq %rsp,%rax +.cfi_def_cfa_register %rax +.Lpowerx5_enter: + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lpowerx5_prologue: + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lpwrx_sp_done + +.align 32 +.Lpwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lpwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + +.Lpwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 +.byte 102,72,15,110,207 +.byte 102,72,15,110,209 +.byte 102,73,15,110,218 +.byte 102,72,15,110,226 + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lpowerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi +.byte 102,72,15,126,209 +.byte 102,72,15,126,226 + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpowerx5_epilogue: + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_powerx5,.-bn_powerx5 + +.globl bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.type bn_sqrx8x_internal,@function +.align 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: +.cfi_startproc + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp .Lsqr8x_zero_start + +.align 32 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +.Lsqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +.Lsqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa %xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz .Lsqrx8x_zero + + movq 0(%rsi),%rdx + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 
40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq %rbp,%r14 + + movq %r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je .Lsqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl $0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz .Lsqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je .Lsqrx8x_break + + subq 16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 
48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + je .Lsqrx8x_outer_loop + + movq %r9,8(%rdi) + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_break: + movq %r9,72(%rdi) +.byte 102,72,15,126,217 + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.align 32 +.Lsqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax +.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq %rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz .Lsqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp .Lsqrx4x_shift_n_add + +.align 32 +.Lsqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi +.byte 102,72,15,126,213 +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp .Lsqrx8x_reduction_loop + +.align 32 +.Lsqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp .Lsqrx8x_reduce + +.align 32 +.Lsqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + 
adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz .Lsqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz .Lsqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +.Lsqrx8x_no_tail: + adcq 0(%rdi),%r8 +.byte 102,72,15,126,217 + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi +.byte 102,72,15,126,213 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb .Lsqrx8x_reduction_loop + .byte 0xf3,0xc3 +.cfi_endproc +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +.align 32 +__bn_postx4x_internal: +.cfi_startproc + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + negq %rax + sarq $3+2,%rcx + +.byte 102,72,15,126,202 +.byte 102,72,15,126,206 + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry + +.align 16 +.Lsqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz .Lsqrx4x_sub + + negq %r9 + + .byte 0xf3,0xc3 +.cfi_endproc +.size __bn_postx4x_internal,.-__bn_postx4x_internal .globl bn_get_bits5 .type bn_get_bits5,@function .align 16 bn_get_bits5: .cfi_startproc leaq 0(%rdi),%r10 leaq 1(%rdi),%r11 movl %esi,%ecx shrl $4,%esi andl $15,%ecx leal 
-8(%rcx),%eax cmpl $11,%ecx cmovaq %r11,%r10 cmoval %eax,%ecx movzwl (%r10,%rsi,2),%eax shrl %cl,%eax andl $31,%eax .byte 0xf3,0xc3 .cfi_endproc .size bn_get_bits5,.-bn_get_bits5 .globl bn_scatter5 .type bn_scatter5,@function .align 16 bn_scatter5: .cfi_startproc cmpl $0,%esi jz .Lscatter_epilogue leaq (%rdx,%rcx,8),%rdx .Lscatter: movq (%rdi),%rax leaq 8(%rdi),%rdi movq %rax,(%rdx) leaq 256(%rdx),%rdx subl $1,%esi jnz .Lscatter .Lscatter_epilogue: .byte 0xf3,0xc3 .cfi_endproc .size bn_scatter5,.-bn_scatter5 .globl bn_gather5 .type bn_gather5,@function .align 32 bn_gather5: .LSEH_begin_bn_gather5: .cfi_startproc .byte 0x4c,0x8d,0x14,0x24 .byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 leaq .Linc(%rip),%rax andq $-16,%rsp movd %ecx,%xmm5 movdqa 0(%rax),%xmm0 movdqa 16(%rax),%xmm1 leaq 128(%rdx),%r11 leaq 128(%rsp),%rax pshufd $0,%xmm5,%xmm5 movdqa %xmm1,%xmm4 movdqa %xmm1,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,-128(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,-112(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,-96(%rax) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,-80(%rax) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,-64(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,-48(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,-32(%rax) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,-16(%rax) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,0(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,16(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,32(%rax) movdqa %xmm4,%xmm2 paddd %xmm0,%xmm1 pcmpeqd %xmm5,%xmm0 movdqa %xmm3,48(%rax) movdqa %xmm4,%xmm3 paddd %xmm1,%xmm2 pcmpeqd %xmm5,%xmm1 movdqa %xmm0,64(%rax) movdqa %xmm4,%xmm0 paddd %xmm2,%xmm3 pcmpeqd %xmm5,%xmm2 movdqa %xmm1,80(%rax) movdqa %xmm4,%xmm1 paddd %xmm3,%xmm0 pcmpeqd %xmm5,%xmm3 movdqa %xmm2,96(%rax) movdqa %xmm4,%xmm2 movdqa %xmm3,112(%rax) jmp .Lgather .align 32 .Lgather: pxor %xmm4,%xmm4 pxor %xmm5,%xmm5 movdqa -128(%r11),%xmm0 movdqa -112(%r11),%xmm1 movdqa -96(%r11),%xmm2 pand -128(%rax),%xmm0 movdqa -80(%r11),%xmm3 pand -112(%rax),%xmm1 por %xmm0,%xmm4 pand -96(%rax),%xmm2 por %xmm1,%xmm5 pand -80(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa -64(%r11),%xmm0 movdqa -48(%r11),%xmm1 movdqa -32(%r11),%xmm2 pand -64(%rax),%xmm0 movdqa -16(%r11),%xmm3 pand -48(%rax),%xmm1 por %xmm0,%xmm4 pand -32(%rax),%xmm2 por %xmm1,%xmm5 pand -16(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 0(%r11),%xmm0 movdqa 16(%r11),%xmm1 movdqa 32(%r11),%xmm2 pand 0(%rax),%xmm0 movdqa 48(%r11),%xmm3 pand 16(%rax),%xmm1 por %xmm0,%xmm4 pand 32(%rax),%xmm2 por %xmm1,%xmm5 pand 48(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 movdqa 64(%r11),%xmm0 movdqa 80(%r11),%xmm1 movdqa 96(%r11),%xmm2 pand 64(%rax),%xmm0 movdqa 112(%r11),%xmm3 pand 80(%rax),%xmm1 por %xmm0,%xmm4 pand 96(%rax),%xmm2 por %xmm1,%xmm5 pand 112(%rax),%xmm3 por %xmm2,%xmm4 por %xmm3,%xmm5 por %xmm5,%xmm4 leaq 256(%r11),%r11 pshufd $0x4e,%xmm4,%xmm0 por %xmm4,%xmm0 movq %xmm0,(%rdi) leaq 8(%rdi),%rdi subl $1,%esi jnz .Lgather leaq (%r10),%rsp .byte 0xf3,0xc3 .LSEH_end_bn_gather5: .cfi_endproc .size bn_gather5,.-bn_gather5 .align 64 .Linc: .long 0,0, 1,1 .long 2,2, 2,2 .byte 
77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 Index: stable/12/secure/lib/libcrypto/i386/chacha-x86.S =================================================================== --- stable/12/secure/lib/libcrypto/i386/chacha-x86.S (revision 364962) +++ stable/12/secure/lib/libcrypto/i386/chacha-x86.S (revision 364963) @@ -1,1061 +1,2021 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from chacha-x86.pl. */ #ifdef PIC .text .globl ChaCha20_ctr32 .type ChaCha20_ctr32,@function .align 16 ChaCha20_ctr32: .L_ChaCha20_ctr32_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi xorl %eax,%eax cmpl 28(%esp),%eax je .L000no_data call .Lpic_point .Lpic_point: popl %eax leal OPENSSL_ia32cap_P-.Lpic_point(%eax),%ebp testl $16777216,(%ebp) jz .L001x86 testl $512,4(%ebp) jz .L001x86 jmp .Lssse3_shortcut .L001x86: movl 32(%esp),%esi movl 36(%esp),%edi subl $132,%esp movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edx movl %eax,80(%esp) movl %ebx,84(%esp) movl %ecx,88(%esp) movl %edx,92(%esp) movl 16(%esi),%eax movl 20(%esi),%ebx movl 24(%esi),%ecx movl 28(%esi),%edx movl %eax,96(%esp) movl %ebx,100(%esp) movl %ecx,104(%esp) movl %edx,108(%esp) movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx movl 12(%edi),%edx subl $1,%eax movl %eax,112(%esp) movl %ebx,116(%esp) movl %ecx,120(%esp) movl %edx,124(%esp) jmp .L002entry .align 16 .L003outer_loop: movl %ebx,156(%esp) movl %eax,152(%esp) movl %ecx,160(%esp) .L002entry: movl $1634760805,%eax movl $857760878,4(%esp) movl $2036477234,8(%esp) movl $1797285236,12(%esp) movl 84(%esp),%ebx movl 88(%esp),%ebp movl 104(%esp),%ecx movl 108(%esp),%esi movl 116(%esp),%edx movl 120(%esp),%edi movl %ebx,20(%esp) movl %ebp,24(%esp) movl %ecx,40(%esp) movl %esi,44(%esp) movl %edx,52(%esp) movl %edi,56(%esp) movl 92(%esp),%ebx movl 124(%esp),%edi movl 112(%esp),%edx movl 80(%esp),%ebp movl 96(%esp),%ecx movl 100(%esp),%esi addl $1,%edx movl %ebx,28(%esp) movl %edi,60(%esp) movl %edx,112(%esp) movl $10,%ebx jmp .L004loop .align 16 .L004loop: addl %ebp,%eax movl %ebx,128(%esp) movl %ebp,%ebx xorl %eax,%edx roll $16,%edx addl %edx,%ecx xorl %ecx,%ebx movl 52(%esp),%edi roll $12,%ebx movl 20(%esp),%ebp addl %ebx,%eax xorl %eax,%edx movl %eax,(%esp) roll $8,%edx movl 4(%esp),%eax addl %edx,%ecx movl %edx,48(%esp) xorl %ecx,%ebx addl %ebp,%eax roll $7,%ebx xorl %eax,%edi movl %ecx,32(%esp) roll $16,%edi movl %ebx,16(%esp) addl %edi,%esi movl 40(%esp),%ecx xorl %esi,%ebp movl 56(%esp),%edx roll $12,%ebp movl 24(%esp),%ebx addl %ebp,%eax xorl %eax,%edi movl %eax,4(%esp) roll $8,%edi movl 8(%esp),%eax addl %edi,%esi movl %edi,52(%esp) xorl %esi,%ebp addl %ebx,%eax roll $7,%ebp xorl %eax,%edx movl %esi,36(%esp) roll $16,%edx movl %ebp,20(%esp) addl %edx,%ecx movl 44(%esp),%esi xorl %ecx,%ebx movl 60(%esp),%edi roll $12,%ebx movl 28(%esp),%ebp addl %ebx,%eax xorl %eax,%edx movl %eax,8(%esp) roll $8,%edx movl 12(%esp),%eax addl %edx,%ecx movl %edx,56(%esp) xorl %ecx,%ebx addl %ebp,%eax roll $7,%ebx xorl %eax,%edi roll $16,%edi movl %ebx,24(%esp) addl %edi,%esi xorl %esi,%ebp roll $12,%ebp movl 20(%esp),%ebx addl %ebp,%eax xorl %eax,%edi movl %eax,12(%esp) roll $8,%edi movl (%esp),%eax addl %edi,%esi movl %edi,%edx xorl %esi,%ebp addl %ebx,%eax roll $7,%ebp xorl %eax,%edx roll 
$16,%edx movl %ebp,28(%esp) addl %edx,%ecx xorl %ecx,%ebx movl 48(%esp),%edi roll $12,%ebx movl 24(%esp),%ebp addl %ebx,%eax xorl %eax,%edx movl %eax,(%esp) roll $8,%edx movl 4(%esp),%eax addl %edx,%ecx movl %edx,60(%esp) xorl %ecx,%ebx addl %ebp,%eax roll $7,%ebx xorl %eax,%edi movl %ecx,40(%esp) roll $16,%edi movl %ebx,20(%esp) addl %edi,%esi movl 32(%esp),%ecx xorl %esi,%ebp movl 52(%esp),%edx roll $12,%ebp movl 28(%esp),%ebx addl %ebp,%eax xorl %eax,%edi movl %eax,4(%esp) roll $8,%edi movl 8(%esp),%eax addl %edi,%esi movl %edi,48(%esp) xorl %esi,%ebp addl %ebx,%eax roll $7,%ebp xorl %eax,%edx movl %esi,44(%esp) roll $16,%edx movl %ebp,24(%esp) addl %edx,%ecx movl 36(%esp),%esi xorl %ecx,%ebx movl 56(%esp),%edi roll $12,%ebx movl 16(%esp),%ebp addl %ebx,%eax xorl %eax,%edx movl %eax,8(%esp) roll $8,%edx movl 12(%esp),%eax addl %edx,%ecx movl %edx,52(%esp) xorl %ecx,%ebx addl %ebp,%eax roll $7,%ebx xorl %eax,%edi roll $16,%edi movl %ebx,28(%esp) addl %edi,%esi xorl %esi,%ebp movl 48(%esp),%edx roll $12,%ebp movl 128(%esp),%ebx addl %ebp,%eax xorl %eax,%edi movl %eax,12(%esp) roll $8,%edi movl (%esp),%eax addl %edi,%esi movl %edi,56(%esp) xorl %esi,%ebp roll $7,%ebp decl %ebx jnz .L004loop movl 160(%esp),%ebx addl $1634760805,%eax addl 80(%esp),%ebp addl 96(%esp),%ecx addl 100(%esp),%esi cmpl $64,%ebx jb .L005tail movl 156(%esp),%ebx addl 112(%esp),%edx addl 120(%esp),%edi xorl (%ebx),%eax xorl 16(%ebx),%ebp movl %eax,(%esp) movl 152(%esp),%eax xorl 32(%ebx),%ecx xorl 36(%ebx),%esi xorl 48(%ebx),%edx xorl 56(%ebx),%edi movl %ebp,16(%eax) movl %ecx,32(%eax) movl %esi,36(%eax) movl %edx,48(%eax) movl %edi,56(%eax) movl 4(%esp),%ebp movl 8(%esp),%ecx movl 12(%esp),%esi movl 20(%esp),%edx movl 24(%esp),%edi addl $857760878,%ebp addl $2036477234,%ecx addl $1797285236,%esi addl 84(%esp),%edx addl 88(%esp),%edi xorl 4(%ebx),%ebp xorl 8(%ebx),%ecx xorl 12(%ebx),%esi xorl 20(%ebx),%edx xorl 24(%ebx),%edi movl %ebp,4(%eax) movl %ecx,8(%eax) movl %esi,12(%eax) movl %edx,20(%eax) movl %edi,24(%eax) movl 28(%esp),%ebp movl 40(%esp),%ecx movl 44(%esp),%esi movl 52(%esp),%edx movl 60(%esp),%edi addl 92(%esp),%ebp addl 104(%esp),%ecx addl 108(%esp),%esi addl 116(%esp),%edx addl 124(%esp),%edi xorl 28(%ebx),%ebp xorl 40(%ebx),%ecx xorl 44(%ebx),%esi xorl 52(%ebx),%edx xorl 60(%ebx),%edi leal 64(%ebx),%ebx movl %ebp,28(%eax) movl (%esp),%ebp movl %ecx,40(%eax) movl 160(%esp),%ecx movl %esi,44(%eax) movl %edx,52(%eax) movl %edi,60(%eax) movl %ebp,(%eax) leal 64(%eax),%eax subl $64,%ecx jnz .L003outer_loop jmp .L006done .L005tail: addl 112(%esp),%edx addl 120(%esp),%edi movl %eax,(%esp) movl %ebp,16(%esp) movl %ecx,32(%esp) movl %esi,36(%esp) movl %edx,48(%esp) movl %edi,56(%esp) movl 4(%esp),%ebp movl 8(%esp),%ecx movl 12(%esp),%esi movl 20(%esp),%edx movl 24(%esp),%edi addl $857760878,%ebp addl $2036477234,%ecx addl $1797285236,%esi addl 84(%esp),%edx addl 88(%esp),%edi movl %ebp,4(%esp) movl %ecx,8(%esp) movl %esi,12(%esp) movl %edx,20(%esp) movl %edi,24(%esp) movl 28(%esp),%ebp movl 40(%esp),%ecx movl 44(%esp),%esi movl 52(%esp),%edx movl 60(%esp),%edi addl 92(%esp),%ebp addl 104(%esp),%ecx addl 108(%esp),%esi addl 116(%esp),%edx addl 124(%esp),%edi movl %ebp,28(%esp) movl 156(%esp),%ebp movl %ecx,40(%esp) movl 152(%esp),%ecx movl %esi,44(%esp) xorl %esi,%esi movl %edx,52(%esp) movl %edi,60(%esp) xorl %eax,%eax xorl %edx,%edx .L007tail_loop: movb (%esi,%ebp,1),%al movb (%esp,%esi,1),%dl leal 1(%esi),%esi xorb %dl,%al movb %al,-1(%ecx,%esi,1) decl %ebx jnz .L007tail_loop .L006done: addl $132,%esp 
.L000no_data: popl %edi popl %esi popl %ebx popl %ebp ret .size ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin .globl ChaCha20_ssse3 .type ChaCha20_ssse3,@function .align 16 ChaCha20_ssse3: .L_ChaCha20_ssse3_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi .Lssse3_shortcut: + testl $2048,4(%ebp) + jnz .Lxop_shortcut movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ecx movl 32(%esp),%edx movl 36(%esp),%ebx movl %esp,%ebp subl $524,%esp andl $-64,%esp movl %ebp,512(%esp) leal .Lssse3_data-.Lpic_point(%eax),%eax movdqu (%ebx),%xmm3 .L0081x: movdqa 32(%eax),%xmm0 movdqu (%edx),%xmm1 movdqu 16(%edx),%xmm2 movdqa (%eax),%xmm6 movdqa 16(%eax),%xmm7 movl %ebp,48(%esp) movdqa %xmm0,(%esp) movdqa %xmm1,16(%esp) movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) movl $10,%edx jmp .L009loop1x .align 16 .L010outer1x: movdqa 80(%eax),%xmm3 movdqa (%esp),%xmm0 movdqa 16(%esp),%xmm1 movdqa 32(%esp),%xmm2 paddd 48(%esp),%xmm3 movl $10,%edx movdqa %xmm3,48(%esp) jmp .L009loop1x .align 16 .L009loop1x: paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,222 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $20,%xmm1 pslld $12,%xmm4 por %xmm4,%xmm1 paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,223 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $25,%xmm1 pslld $7,%xmm4 por %xmm4,%xmm1 pshufd $78,%xmm2,%xmm2 pshufd $57,%xmm1,%xmm1 pshufd $147,%xmm3,%xmm3 nop paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,222 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $20,%xmm1 pslld $12,%xmm4 por %xmm4,%xmm1 paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,223 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $25,%xmm1 pslld $7,%xmm4 por %xmm4,%xmm1 pshufd $78,%xmm2,%xmm2 pshufd $147,%xmm1,%xmm1 pshufd $57,%xmm3,%xmm3 decl %edx jnz .L009loop1x paddd (%esp),%xmm0 paddd 16(%esp),%xmm1 paddd 32(%esp),%xmm2 paddd 48(%esp),%xmm3 cmpl $64,%ecx jb .L011tail movdqu (%esi),%xmm4 movdqu 16(%esi),%xmm5 pxor %xmm4,%xmm0 movdqu 32(%esi),%xmm4 pxor %xmm5,%xmm1 movdqu 48(%esi),%xmm5 pxor %xmm4,%xmm2 pxor %xmm5,%xmm3 leal 64(%esi),%esi movdqu %xmm0,(%edi) movdqu %xmm1,16(%edi) movdqu %xmm2,32(%edi) movdqu %xmm3,48(%edi) leal 64(%edi),%edi subl $64,%ecx jnz .L010outer1x jmp .L012done .L011tail: movdqa %xmm0,(%esp) movdqa %xmm1,16(%esp) movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) xorl %eax,%eax xorl %edx,%edx xorl %ebp,%ebp .L013tail_loop: movb (%esp,%ebp,1),%al movb (%esi,%ebp,1),%dl leal 1(%ebp),%ebp xorb %dl,%al movb %al,-1(%edi,%ebp,1) decl %ecx jnz .L013tail_loop .L012done: movl 512(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret .size ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin .align 64 .Lssse3_data: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 .long 1634760805,857760878,2036477234,1797285236 .long 0,1,2,3 .long 4,4,4,4 .long 1,0,0,0 .long 4,0,0,0 .long 0,-1,-1,-1 .align 64 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 +.globl ChaCha20_xop +.type ChaCha20_xop,@function +.align 16 +ChaCha20_xop: +.L_ChaCha20_xop_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +.Lxop_shortcut: + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + vzeroupper + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal .Lssse3_data-.Lpic_point(%eax),%eax + vmovdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb .L0141x + movl 
%edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + vmovdqu (%edx),%xmm7 + vpshufd $0,%xmm3,%xmm0 + vpshufd $85,%xmm3,%xmm1 + vpshufd $170,%xmm3,%xmm2 + vpshufd $255,%xmm3,%xmm3 + vpaddd 48(%eax),%xmm0,%xmm0 + vpshufd $0,%xmm7,%xmm4 + vpshufd $85,%xmm7,%xmm5 + vpsubd 64(%eax),%xmm0,%xmm0 + vpshufd $170,%xmm7,%xmm6 + vpshufd $255,%xmm7,%xmm7 + vmovdqa %xmm0,64(%ebp) + vmovdqa %xmm1,80(%ebp) + vmovdqa %xmm2,96(%ebp) + vmovdqa %xmm3,112(%ebp) + vmovdqu 16(%edx),%xmm3 + vmovdqa %xmm4,-64(%ebp) + vmovdqa %xmm5,-48(%ebp) + vmovdqa %xmm6,-32(%ebp) + vmovdqa %xmm7,-16(%ebp) + vmovdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + vpshufd $0,%xmm3,%xmm0 + vpshufd $85,%xmm3,%xmm1 + vpshufd $170,%xmm3,%xmm2 + vpshufd $255,%xmm3,%xmm3 + vpshufd $0,%xmm7,%xmm4 + vpshufd $85,%xmm7,%xmm5 + vpshufd $170,%xmm7,%xmm6 + vpshufd $255,%xmm7,%xmm7 + vmovdqa %xmm0,(%ebp) + vmovdqa %xmm1,16(%ebp) + vmovdqa %xmm2,32(%ebp) + vmovdqa %xmm3,48(%ebp) + vmovdqa %xmm4,-128(%ebp) + vmovdqa %xmm5,-112(%ebp) + vmovdqa %xmm6,-96(%ebp) + vmovdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp .L015outer_loop +.align 32 +.L015outer_loop: + vmovdqa -112(%ebp),%xmm1 + vmovdqa -96(%ebp),%xmm2 + vmovdqa -80(%ebp),%xmm3 + vmovdqa -48(%ebp),%xmm5 + vmovdqa -32(%ebp),%xmm6 + vmovdqa -16(%ebp),%xmm7 + vmovdqa %xmm1,-112(%ebx) + vmovdqa %xmm2,-96(%ebx) + vmovdqa %xmm3,-80(%ebx) + vmovdqa %xmm5,-48(%ebx) + vmovdqa %xmm6,-32(%ebx) + vmovdqa %xmm7,-16(%ebx) + vmovdqa 32(%ebp),%xmm2 + vmovdqa 48(%ebp),%xmm3 + vmovdqa 64(%ebp),%xmm4 + vmovdqa 80(%ebp),%xmm5 + vmovdqa 96(%ebp),%xmm6 + vmovdqa 112(%ebp),%xmm7 + vpaddd 64(%eax),%xmm4,%xmm4 + vmovdqa %xmm2,32(%ebx) + vmovdqa %xmm3,48(%ebx) + vmovdqa %xmm4,64(%ebx) + vmovdqa %xmm5,80(%ebx) + vmovdqa %xmm6,96(%ebx) + vmovdqa %xmm7,112(%ebx) + vmovdqa %xmm4,64(%ebp) + vmovdqa -128(%ebp),%xmm0 + vmovdqa %xmm4,%xmm6 + vmovdqa -64(%ebp),%xmm3 + vmovdqa (%ebp),%xmm4 + vmovdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 32 +.L016loop: + vpaddd %xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,246,16 + vpaddd %xmm6,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + vmovdqa -112(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -48(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 80(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-128(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,64(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 + vmovdqa %xmm4,(%ebx) +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-64(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa 32(%ebx),%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -96(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -32(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 96(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-112(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,80(%ebx) + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,219,7 + vmovdqa %xmm5,16(%ebx) +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-48(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa 48(%ebx),%xmm5 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -80(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -16(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 112(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-96(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,96(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 
143,232,120,194,210,7 +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-32(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -128(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -48(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-80(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm7,%xmm6 +.byte 143,232,120,194,219,7 +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-16(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -112(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -32(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 64(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-128(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,112(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 + vmovdqa %xmm4,32(%ebx) +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-48(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa (%ebx),%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -96(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -16(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 80(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-112(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,64(%ebx) + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,219,7 + vmovdqa %xmm5,48(%ebx) +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-32(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa 16(%ebx),%xmm5 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -80(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -64(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 96(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-96(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,80(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-16(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -128(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 64(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-80(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,96(%ebx) + vpxor %xmm5,%xmm3,%xmm3 +.byte 143,232,120,194,219,7 + decl %edx + jnz .L016loop + vmovdqa %xmm3,-64(%ebx) + vmovdqa %xmm4,(%ebx) + vmovdqa %xmm5,16(%ebx) + vmovdqa %xmm6,64(%ebx) + vmovdqa %xmm7,96(%ebx) + vmovdqa -112(%ebx),%xmm1 + vmovdqa -96(%ebx),%xmm2 + vmovdqa -80(%ebx),%xmm3 + vpaddd -128(%ebp),%xmm0,%xmm0 + vpaddd -112(%ebp),%xmm1,%xmm1 + vpaddd -96(%ebp),%xmm2,%xmm2 + vpaddd -80(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa -64(%ebx),%xmm0 + vmovdqa -48(%ebx),%xmm1 + vmovdqa -32(%ebx),%xmm2 + vmovdqa -16(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd -64(%ebp),%xmm0,%xmm0 + vpaddd -48(%ebp),%xmm1,%xmm1 + vpaddd -32(%ebp),%xmm2,%xmm2 + vpaddd -16(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + 
vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa (%ebx),%xmm0 + vmovdqa 16(%ebx),%xmm1 + vmovdqa 32(%ebx),%xmm2 + vmovdqa 48(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd (%ebp),%xmm0,%xmm0 + vpaddd 16(%ebp),%xmm1,%xmm1 + vpaddd 32(%ebp),%xmm2,%xmm2 + vpaddd 48(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa 64(%ebx),%xmm0 + vmovdqa 80(%ebx),%xmm1 + vmovdqa 96(%ebx),%xmm2 + vmovdqa 112(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd 64(%ebp),%xmm0,%xmm0 + vpaddd 80(%ebp),%xmm1,%xmm1 + vpaddd 96(%ebp),%xmm2,%xmm2 + vpaddd 112(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 208(%esi),%esi + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc .L015outer_loop + addl $256,%ecx + jz .L017done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + vmovd 64(%ebp),%xmm2 + vmovdqu (%ebx),%xmm3 + vpaddd 96(%eax),%xmm2,%xmm2 + vpand 112(%eax),%xmm3,%xmm3 + vpor %xmm2,%xmm3,%xmm3 +.L0141x: + vmovdqa 32(%eax),%xmm0 + vmovdqu (%edx),%xmm1 + vmovdqu 16(%edx),%xmm2 + vmovdqa (%eax),%xmm6 + vmovdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + movl $10,%edx + jmp .L018loop1x +.align 16 +.L019outer1x: + vmovdqa 80(%eax),%xmm3 + vmovdqa (%esp),%xmm0 + vmovdqa 16(%esp),%xmm1 + vmovdqa 32(%esp),%xmm2 + vpaddd 48(%esp),%xmm3,%xmm3 + movl $10,%edx + vmovdqa %xmm3,48(%esp) + jmp .L018loop1x +.align 16 +.L018loop1x: + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,16 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,12 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,8 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,7 + vpshufd $78,%xmm2,%xmm2 + vpshufd $57,%xmm1,%xmm1 + vpshufd $147,%xmm3,%xmm3 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,16 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,12 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,8 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,7 + vpshufd $78,%xmm2,%xmm2 + vpshufd $147,%xmm1,%xmm1 + 
vpshufd $57,%xmm3,%xmm3 + decl %edx + jnz .L018loop1x + vpaddd (%esp),%xmm0,%xmm0 + vpaddd 16(%esp),%xmm1,%xmm1 + vpaddd 32(%esp),%xmm2,%xmm2 + vpaddd 48(%esp),%xmm3,%xmm3 + cmpl $64,%ecx + jb .L020tail + vpxor (%esi),%xmm0,%xmm0 + vpxor 16(%esi),%xmm1,%xmm1 + vpxor 32(%esi),%xmm2,%xmm2 + vpxor 48(%esi),%xmm3,%xmm3 + leal 64(%esi),%esi + vmovdqu %xmm0,(%edi) + vmovdqu %xmm1,16(%edi) + vmovdqu %xmm2,32(%edi) + vmovdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz .L019outer1x + jmp .L017done +.L020tail: + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +.L021tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz .L021tail_loop +.L017done: + vzeroupper + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size ChaCha20_xop,.-.L_ChaCha20_xop_begin .comm OPENSSL_ia32cap_P,16,4 #else .text .globl ChaCha20_ctr32 .type ChaCha20_ctr32,@function .align 16 ChaCha20_ctr32: .L_ChaCha20_ctr32_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi xorl %eax,%eax cmpl 28(%esp),%eax je .L000no_data call .Lpic_point .Lpic_point: popl %eax leal OPENSSL_ia32cap_P,%ebp testl $16777216,(%ebp) jz .L001x86 testl $512,4(%ebp) jz .L001x86 jmp .Lssse3_shortcut .L001x86: movl 32(%esp),%esi movl 36(%esp),%edi subl $132,%esp movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edx movl %eax,80(%esp) movl %ebx,84(%esp) movl %ecx,88(%esp) movl %edx,92(%esp) movl 16(%esi),%eax movl 20(%esi),%ebx movl 24(%esi),%ecx movl 28(%esi),%edx movl %eax,96(%esp) movl %ebx,100(%esp) movl %ecx,104(%esp) movl %edx,108(%esp) movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx movl 12(%edi),%edx subl $1,%eax movl %eax,112(%esp) movl %ebx,116(%esp) movl %ecx,120(%esp) movl %edx,124(%esp) jmp .L002entry .align 16 .L003outer_loop: movl %ebx,156(%esp) movl %eax,152(%esp) movl %ecx,160(%esp) .L002entry: movl $1634760805,%eax movl $857760878,4(%esp) movl $2036477234,8(%esp) movl $1797285236,12(%esp) movl 84(%esp),%ebx movl 88(%esp),%ebp movl 104(%esp),%ecx movl 108(%esp),%esi movl 116(%esp),%edx movl 120(%esp),%edi movl %ebx,20(%esp) movl %ebp,24(%esp) movl %ecx,40(%esp) movl %esi,44(%esp) movl %edx,52(%esp) movl %edi,56(%esp) movl 92(%esp),%ebx movl 124(%esp),%edi movl 112(%esp),%edx movl 80(%esp),%ebp movl 96(%esp),%ecx movl 100(%esp),%esi addl $1,%edx movl %ebx,28(%esp) movl %edi,60(%esp) movl %edx,112(%esp) movl $10,%ebx jmp .L004loop .align 16 .L004loop: addl %ebp,%eax movl %ebx,128(%esp) movl %ebp,%ebx xorl %eax,%edx roll $16,%edx addl %edx,%ecx xorl %ecx,%ebx movl 52(%esp),%edi roll $12,%ebx movl 20(%esp),%ebp addl %ebx,%eax xorl %eax,%edx movl %eax,(%esp) roll $8,%edx movl 4(%esp),%eax addl %edx,%ecx movl %edx,48(%esp) xorl %ecx,%ebx addl %ebp,%eax roll $7,%ebx xorl %eax,%edi movl %ecx,32(%esp) roll $16,%edi movl %ebx,16(%esp) addl %edi,%esi movl 40(%esp),%ecx xorl %esi,%ebp movl 56(%esp),%edx roll $12,%ebp movl 24(%esp),%ebx addl %ebp,%eax xorl %eax,%edi movl %eax,4(%esp) roll $8,%edi movl 8(%esp),%eax addl %edi,%esi movl %edi,52(%esp) xorl %esi,%ebp addl %ebx,%eax roll $7,%ebp xorl %eax,%edx movl %esi,36(%esp) roll $16,%edx movl %ebp,20(%esp) addl %edx,%ecx movl 44(%esp),%esi xorl %ecx,%ebx movl 60(%esp),%edi roll $12,%ebx movl 28(%esp),%ebp addl %ebx,%eax xorl %eax,%edx movl %eax,8(%esp) roll $8,%edx movl 12(%esp),%eax addl %edx,%ecx movl %edx,56(%esp) xorl %ecx,%ebx addl %ebp,%eax roll $7,%ebx xorl 
%eax,%edi roll $16,%edi movl %ebx,24(%esp) addl %edi,%esi xorl %esi,%ebp roll $12,%ebp movl 20(%esp),%ebx addl %ebp,%eax xorl %eax,%edi movl %eax,12(%esp) roll $8,%edi movl (%esp),%eax addl %edi,%esi movl %edi,%edx xorl %esi,%ebp addl %ebx,%eax roll $7,%ebp xorl %eax,%edx roll $16,%edx movl %ebp,28(%esp) addl %edx,%ecx xorl %ecx,%ebx movl 48(%esp),%edi roll $12,%ebx movl 24(%esp),%ebp addl %ebx,%eax xorl %eax,%edx movl %eax,(%esp) roll $8,%edx movl 4(%esp),%eax addl %edx,%ecx movl %edx,60(%esp) xorl %ecx,%ebx addl %ebp,%eax roll $7,%ebx xorl %eax,%edi movl %ecx,40(%esp) roll $16,%edi movl %ebx,20(%esp) addl %edi,%esi movl 32(%esp),%ecx xorl %esi,%ebp movl 52(%esp),%edx roll $12,%ebp movl 28(%esp),%ebx addl %ebp,%eax xorl %eax,%edi movl %eax,4(%esp) roll $8,%edi movl 8(%esp),%eax addl %edi,%esi movl %edi,48(%esp) xorl %esi,%ebp addl %ebx,%eax roll $7,%ebp xorl %eax,%edx movl %esi,44(%esp) roll $16,%edx movl %ebp,24(%esp) addl %edx,%ecx movl 36(%esp),%esi xorl %ecx,%ebx movl 56(%esp),%edi roll $12,%ebx movl 16(%esp),%ebp addl %ebx,%eax xorl %eax,%edx movl %eax,8(%esp) roll $8,%edx movl 12(%esp),%eax addl %edx,%ecx movl %edx,52(%esp) xorl %ecx,%ebx addl %ebp,%eax roll $7,%ebx xorl %eax,%edi roll $16,%edi movl %ebx,28(%esp) addl %edi,%esi xorl %esi,%ebp movl 48(%esp),%edx roll $12,%ebp movl 128(%esp),%ebx addl %ebp,%eax xorl %eax,%edi movl %eax,12(%esp) roll $8,%edi movl (%esp),%eax addl %edi,%esi movl %edi,56(%esp) xorl %esi,%ebp roll $7,%ebp decl %ebx jnz .L004loop movl 160(%esp),%ebx addl $1634760805,%eax addl 80(%esp),%ebp addl 96(%esp),%ecx addl 100(%esp),%esi cmpl $64,%ebx jb .L005tail movl 156(%esp),%ebx addl 112(%esp),%edx addl 120(%esp),%edi xorl (%ebx),%eax xorl 16(%ebx),%ebp movl %eax,(%esp) movl 152(%esp),%eax xorl 32(%ebx),%ecx xorl 36(%ebx),%esi xorl 48(%ebx),%edx xorl 56(%ebx),%edi movl %ebp,16(%eax) movl %ecx,32(%eax) movl %esi,36(%eax) movl %edx,48(%eax) movl %edi,56(%eax) movl 4(%esp),%ebp movl 8(%esp),%ecx movl 12(%esp),%esi movl 20(%esp),%edx movl 24(%esp),%edi addl $857760878,%ebp addl $2036477234,%ecx addl $1797285236,%esi addl 84(%esp),%edx addl 88(%esp),%edi xorl 4(%ebx),%ebp xorl 8(%ebx),%ecx xorl 12(%ebx),%esi xorl 20(%ebx),%edx xorl 24(%ebx),%edi movl %ebp,4(%eax) movl %ecx,8(%eax) movl %esi,12(%eax) movl %edx,20(%eax) movl %edi,24(%eax) movl 28(%esp),%ebp movl 40(%esp),%ecx movl 44(%esp),%esi movl 52(%esp),%edx movl 60(%esp),%edi addl 92(%esp),%ebp addl 104(%esp),%ecx addl 108(%esp),%esi addl 116(%esp),%edx addl 124(%esp),%edi xorl 28(%ebx),%ebp xorl 40(%ebx),%ecx xorl 44(%ebx),%esi xorl 52(%ebx),%edx xorl 60(%ebx),%edi leal 64(%ebx),%ebx movl %ebp,28(%eax) movl (%esp),%ebp movl %ecx,40(%eax) movl 160(%esp),%ecx movl %esi,44(%eax) movl %edx,52(%eax) movl %edi,60(%eax) movl %ebp,(%eax) leal 64(%eax),%eax subl $64,%ecx jnz .L003outer_loop jmp .L006done .L005tail: addl 112(%esp),%edx addl 120(%esp),%edi movl %eax,(%esp) movl %ebp,16(%esp) movl %ecx,32(%esp) movl %esi,36(%esp) movl %edx,48(%esp) movl %edi,56(%esp) movl 4(%esp),%ebp movl 8(%esp),%ecx movl 12(%esp),%esi movl 20(%esp),%edx movl 24(%esp),%edi addl $857760878,%ebp addl $2036477234,%ecx addl $1797285236,%esi addl 84(%esp),%edx addl 88(%esp),%edi movl %ebp,4(%esp) movl %ecx,8(%esp) movl %esi,12(%esp) movl %edx,20(%esp) movl %edi,24(%esp) movl 28(%esp),%ebp movl 40(%esp),%ecx movl 44(%esp),%esi movl 52(%esp),%edx movl 60(%esp),%edi addl 92(%esp),%ebp addl 104(%esp),%ecx addl 108(%esp),%esi addl 116(%esp),%edx addl 124(%esp),%edi movl %ebp,28(%esp) movl 156(%esp),%ebp movl %ecx,40(%esp) movl 152(%esp),%ecx movl 
%esi,44(%esp) xorl %esi,%esi movl %edx,52(%esp) movl %edi,60(%esp) xorl %eax,%eax xorl %edx,%edx .L007tail_loop: movb (%esi,%ebp,1),%al movb (%esp,%esi,1),%dl leal 1(%esi),%esi xorb %dl,%al movb %al,-1(%ecx,%esi,1) decl %ebx jnz .L007tail_loop .L006done: addl $132,%esp .L000no_data: popl %edi popl %esi popl %ebx popl %ebp ret .size ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin .globl ChaCha20_ssse3 .type ChaCha20_ssse3,@function .align 16 ChaCha20_ssse3: .L_ChaCha20_ssse3_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi .Lssse3_shortcut: + testl $2048,4(%ebp) + jnz .Lxop_shortcut movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ecx movl 32(%esp),%edx movl 36(%esp),%ebx movl %esp,%ebp subl $524,%esp andl $-64,%esp movl %ebp,512(%esp) leal .Lssse3_data-.Lpic_point(%eax),%eax movdqu (%ebx),%xmm3 .L0081x: movdqa 32(%eax),%xmm0 movdqu (%edx),%xmm1 movdqu 16(%edx),%xmm2 movdqa (%eax),%xmm6 movdqa 16(%eax),%xmm7 movl %ebp,48(%esp) movdqa %xmm0,(%esp) movdqa %xmm1,16(%esp) movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) movl $10,%edx jmp .L009loop1x .align 16 .L010outer1x: movdqa 80(%eax),%xmm3 movdqa (%esp),%xmm0 movdqa 16(%esp),%xmm1 movdqa 32(%esp),%xmm2 paddd 48(%esp),%xmm3 movl $10,%edx movdqa %xmm3,48(%esp) jmp .L009loop1x .align 16 .L009loop1x: paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,222 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $20,%xmm1 pslld $12,%xmm4 por %xmm4,%xmm1 paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,223 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $25,%xmm1 pslld $7,%xmm4 por %xmm4,%xmm1 pshufd $78,%xmm2,%xmm2 pshufd $57,%xmm1,%xmm1 pshufd $147,%xmm3,%xmm3 nop paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,222 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $20,%xmm1 pslld $12,%xmm4 por %xmm4,%xmm1 paddd %xmm1,%xmm0 pxor %xmm0,%xmm3 .byte 102,15,56,0,223 paddd %xmm3,%xmm2 pxor %xmm2,%xmm1 movdqa %xmm1,%xmm4 psrld $25,%xmm1 pslld $7,%xmm4 por %xmm4,%xmm1 pshufd $78,%xmm2,%xmm2 pshufd $147,%xmm1,%xmm1 pshufd $57,%xmm3,%xmm3 decl %edx jnz .L009loop1x paddd (%esp),%xmm0 paddd 16(%esp),%xmm1 paddd 32(%esp),%xmm2 paddd 48(%esp),%xmm3 cmpl $64,%ecx jb .L011tail movdqu (%esi),%xmm4 movdqu 16(%esi),%xmm5 pxor %xmm4,%xmm0 movdqu 32(%esi),%xmm4 pxor %xmm5,%xmm1 movdqu 48(%esi),%xmm5 pxor %xmm4,%xmm2 pxor %xmm5,%xmm3 leal 64(%esi),%esi movdqu %xmm0,(%edi) movdqu %xmm1,16(%edi) movdqu %xmm2,32(%edi) movdqu %xmm3,48(%edi) leal 64(%edi),%edi subl $64,%ecx jnz .L010outer1x jmp .L012done .L011tail: movdqa %xmm0,(%esp) movdqa %xmm1,16(%esp) movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) xorl %eax,%eax xorl %edx,%edx xorl %ebp,%ebp .L013tail_loop: movb (%esp,%ebp,1),%al movb (%esi,%ebp,1),%dl leal 1(%ebp),%ebp xorb %dl,%al movb %al,-1(%edi,%ebp,1) decl %ecx jnz .L013tail_loop .L012done: movl 512(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret .size ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin .align 64 .Lssse3_data: .byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 .byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 .long 1634760805,857760878,2036477234,1797285236 .long 0,1,2,3 .long 4,4,4,4 .long 1,0,0,0 .long 4,0,0,0 .long 0,-1,-1,-1 .align 64 .byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 +.globl ChaCha20_xop +.type ChaCha20_xop,@function +.align 16 +ChaCha20_xop: +.L_ChaCha20_xop_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +.Lxop_shortcut: + movl 20(%esp),%edi + movl 
24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + vzeroupper + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal .Lssse3_data-.Lpic_point(%eax),%eax + vmovdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb .L0141x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + vmovdqu (%edx),%xmm7 + vpshufd $0,%xmm3,%xmm0 + vpshufd $85,%xmm3,%xmm1 + vpshufd $170,%xmm3,%xmm2 + vpshufd $255,%xmm3,%xmm3 + vpaddd 48(%eax),%xmm0,%xmm0 + vpshufd $0,%xmm7,%xmm4 + vpshufd $85,%xmm7,%xmm5 + vpsubd 64(%eax),%xmm0,%xmm0 + vpshufd $170,%xmm7,%xmm6 + vpshufd $255,%xmm7,%xmm7 + vmovdqa %xmm0,64(%ebp) + vmovdqa %xmm1,80(%ebp) + vmovdqa %xmm2,96(%ebp) + vmovdqa %xmm3,112(%ebp) + vmovdqu 16(%edx),%xmm3 + vmovdqa %xmm4,-64(%ebp) + vmovdqa %xmm5,-48(%ebp) + vmovdqa %xmm6,-32(%ebp) + vmovdqa %xmm7,-16(%ebp) + vmovdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + vpshufd $0,%xmm3,%xmm0 + vpshufd $85,%xmm3,%xmm1 + vpshufd $170,%xmm3,%xmm2 + vpshufd $255,%xmm3,%xmm3 + vpshufd $0,%xmm7,%xmm4 + vpshufd $85,%xmm7,%xmm5 + vpshufd $170,%xmm7,%xmm6 + vpshufd $255,%xmm7,%xmm7 + vmovdqa %xmm0,(%ebp) + vmovdqa %xmm1,16(%ebp) + vmovdqa %xmm2,32(%ebp) + vmovdqa %xmm3,48(%ebp) + vmovdqa %xmm4,-128(%ebp) + vmovdqa %xmm5,-112(%ebp) + vmovdqa %xmm6,-96(%ebp) + vmovdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp .L015outer_loop +.align 32 +.L015outer_loop: + vmovdqa -112(%ebp),%xmm1 + vmovdqa -96(%ebp),%xmm2 + vmovdqa -80(%ebp),%xmm3 + vmovdqa -48(%ebp),%xmm5 + vmovdqa -32(%ebp),%xmm6 + vmovdqa -16(%ebp),%xmm7 + vmovdqa %xmm1,-112(%ebx) + vmovdqa %xmm2,-96(%ebx) + vmovdqa %xmm3,-80(%ebx) + vmovdqa %xmm5,-48(%ebx) + vmovdqa %xmm6,-32(%ebx) + vmovdqa %xmm7,-16(%ebx) + vmovdqa 32(%ebp),%xmm2 + vmovdqa 48(%ebp),%xmm3 + vmovdqa 64(%ebp),%xmm4 + vmovdqa 80(%ebp),%xmm5 + vmovdqa 96(%ebp),%xmm6 + vmovdqa 112(%ebp),%xmm7 + vpaddd 64(%eax),%xmm4,%xmm4 + vmovdqa %xmm2,32(%ebx) + vmovdqa %xmm3,48(%ebx) + vmovdqa %xmm4,64(%ebx) + vmovdqa %xmm5,80(%ebx) + vmovdqa %xmm6,96(%ebx) + vmovdqa %xmm7,112(%ebx) + vmovdqa %xmm4,64(%ebp) + vmovdqa -128(%ebp),%xmm0 + vmovdqa %xmm4,%xmm6 + vmovdqa -64(%ebp),%xmm3 + vmovdqa (%ebp),%xmm4 + vmovdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 32 +.L016loop: + vpaddd %xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,246,16 + vpaddd %xmm6,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + vmovdqa -112(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -48(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 80(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-128(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,64(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 + vmovdqa %xmm4,(%ebx) +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-64(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa 32(%ebx),%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -96(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -32(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 96(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-112(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,80(%ebx) + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,219,7 + vmovdqa %xmm5,16(%ebx) +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-48(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa 48(%ebx),%xmm5 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -80(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -16(%ebx),%xmm3 + 
vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 112(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-96(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,96(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-32(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -128(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -48(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-80(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm7,%xmm6 +.byte 143,232,120,194,219,7 +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-16(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -112(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -32(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 64(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-128(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,112(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 + vmovdqa %xmm4,32(%ebx) +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-48(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa (%ebx),%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -96(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vmovdqa -16(%ebx),%xmm2 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 80(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpaddd %xmm2,%xmm0,%xmm0 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-112(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,64(%ebx) + vpxor %xmm5,%xmm3,%xmm3 + vpxor %xmm0,%xmm6,%xmm6 +.byte 143,232,120,194,219,7 + vmovdqa %xmm5,48(%ebx) +.byte 143,232,120,194,246,16 + vmovdqa %xmm3,-32(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa 16(%ebx),%xmm5 + vpxor %xmm4,%xmm2,%xmm2 + vmovdqa -80(%ebx),%xmm1 +.byte 143,232,120,194,210,12 + vmovdqa -64(%ebx),%xmm3 + vpaddd %xmm2,%xmm0,%xmm0 + vmovdqa 96(%ebx),%xmm7 + vpxor %xmm0,%xmm6,%xmm6 + vpaddd %xmm3,%xmm1,%xmm1 +.byte 143,232,120,194,246,8 + vmovdqa %xmm0,-96(%ebx) + vpaddd %xmm6,%xmm4,%xmm4 + vmovdqa %xmm6,80(%ebx) + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,210,7 +.byte 143,232,120,194,255,16 + vmovdqa %xmm2,-16(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqa -128(%ebx),%xmm0 +.byte 143,232,120,194,219,12 + vpaddd %xmm3,%xmm1,%xmm1 + vmovdqa 64(%ebx),%xmm6 + vpxor %xmm1,%xmm7,%xmm7 +.byte 143,232,120,194,255,8 + vmovdqa %xmm1,-80(%ebx) + vpaddd %xmm7,%xmm5,%xmm5 + vmovdqa %xmm7,96(%ebx) + vpxor %xmm5,%xmm3,%xmm3 +.byte 143,232,120,194,219,7 + decl %edx + jnz .L016loop + vmovdqa %xmm3,-64(%ebx) + vmovdqa %xmm4,(%ebx) + vmovdqa %xmm5,16(%ebx) + vmovdqa %xmm6,64(%ebx) + vmovdqa %xmm7,96(%ebx) + vmovdqa -112(%ebx),%xmm1 + vmovdqa -96(%ebx),%xmm2 + vmovdqa -80(%ebx),%xmm3 + vpaddd -128(%ebp),%xmm0,%xmm0 + vpaddd -112(%ebp),%xmm1,%xmm1 + vpaddd -96(%ebp),%xmm2,%xmm2 + vpaddd -80(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa -64(%ebx),%xmm0 + vmovdqa -48(%ebx),%xmm1 + vmovdqa -32(%ebx),%xmm2 + vmovdqa -16(%ebx),%xmm3 + vmovdqu 
%xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd -64(%ebp),%xmm0,%xmm0 + vpaddd -48(%ebp),%xmm1,%xmm1 + vpaddd -32(%ebp),%xmm2,%xmm2 + vpaddd -16(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa (%ebx),%xmm0 + vmovdqa 16(%ebx),%xmm1 + vmovdqa 32(%ebx),%xmm2 + vmovdqa 48(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd (%ebp),%xmm0,%xmm0 + vpaddd 16(%ebp),%xmm1,%xmm1 + vpaddd 32(%ebp),%xmm2,%xmm2 + vpaddd 48(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 16(%esi),%esi + vmovdqa 64(%ebx),%xmm0 + vmovdqa 80(%ebx),%xmm1 + vmovdqa 96(%ebx),%xmm2 + vmovdqa 112(%ebx),%xmm3 + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 16(%edi),%edi + vpaddd 64(%ebp),%xmm0,%xmm0 + vpaddd 80(%ebp),%xmm1,%xmm1 + vpaddd 96(%ebp),%xmm2,%xmm2 + vpaddd 112(%ebp),%xmm3,%xmm3 + vpunpckldq %xmm1,%xmm0,%xmm6 + vpunpckldq %xmm3,%xmm2,%xmm7 + vpunpckhdq %xmm1,%xmm0,%xmm0 + vpunpckhdq %xmm3,%xmm2,%xmm2 + vpunpcklqdq %xmm7,%xmm6,%xmm1 + vpunpckhqdq %xmm7,%xmm6,%xmm6 + vpunpcklqdq %xmm2,%xmm0,%xmm7 + vpunpckhqdq %xmm2,%xmm0,%xmm3 + vpxor -128(%esi),%xmm1,%xmm4 + vpxor -64(%esi),%xmm6,%xmm5 + vpxor (%esi),%xmm7,%xmm6 + vpxor 64(%esi),%xmm3,%xmm7 + leal 208(%esi),%esi + vmovdqu %xmm4,-128(%edi) + vmovdqu %xmm5,-64(%edi) + vmovdqu %xmm6,(%edi) + vmovdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc .L015outer_loop + addl $256,%ecx + jz .L017done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + vmovd 64(%ebp),%xmm2 + vmovdqu (%ebx),%xmm3 + vpaddd 96(%eax),%xmm2,%xmm2 + vpand 112(%eax),%xmm3,%xmm3 + vpor %xmm2,%xmm3,%xmm3 +.L0141x: + vmovdqa 32(%eax),%xmm0 + vmovdqu (%edx),%xmm1 + vmovdqu 16(%edx),%xmm2 + vmovdqa (%eax),%xmm6 + vmovdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + movl $10,%edx + jmp .L018loop1x +.align 16 +.L019outer1x: + vmovdqa 80(%eax),%xmm3 + vmovdqa (%esp),%xmm0 + vmovdqa 16(%esp),%xmm1 + vmovdqa 32(%esp),%xmm2 + vpaddd 48(%esp),%xmm3,%xmm3 + movl $10,%edx + vmovdqa %xmm3,48(%esp) + jmp .L018loop1x +.align 16 +.L018loop1x: + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,16 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,12 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,8 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,7 + vpshufd $78,%xmm2,%xmm2 + vpshufd $57,%xmm1,%xmm1 + vpshufd $147,%xmm3,%xmm3 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,16 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor 
%xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,12 + vpaddd %xmm1,%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 +.byte 143,232,120,194,219,8 + vpaddd %xmm3,%xmm2,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 +.byte 143,232,120,194,201,7 + vpshufd $78,%xmm2,%xmm2 + vpshufd $147,%xmm1,%xmm1 + vpshufd $57,%xmm3,%xmm3 + decl %edx + jnz .L018loop1x + vpaddd (%esp),%xmm0,%xmm0 + vpaddd 16(%esp),%xmm1,%xmm1 + vpaddd 32(%esp),%xmm2,%xmm2 + vpaddd 48(%esp),%xmm3,%xmm3 + cmpl $64,%ecx + jb .L020tail + vpxor (%esi),%xmm0,%xmm0 + vpxor 16(%esi),%xmm1,%xmm1 + vpxor 32(%esi),%xmm2,%xmm2 + vpxor 48(%esi),%xmm3,%xmm3 + leal 64(%esi),%esi + vmovdqu %xmm0,(%edi) + vmovdqu %xmm1,16(%edi) + vmovdqu %xmm2,32(%edi) + vmovdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz .L019outer1x + jmp .L017done +.L020tail: + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +.L021tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz .L021tail_loop +.L017done: + vzeroupper + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size ChaCha20_xop,.-.L_ChaCha20_xop_begin .comm OPENSSL_ia32cap_P,16,4 #endif Index: stable/12/secure/lib/libcrypto/i386/poly1305-x86.S =================================================================== --- stable/12/secure/lib/libcrypto/i386/poly1305-x86.S (revision 364962) +++ stable/12/secure/lib/libcrypto/i386/poly1305-x86.S (revision 364963) @@ -1,2715 +1,3825 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from poly1305-x86.pl. */ #ifdef PIC .text .align 64 .globl poly1305_init .type poly1305_init,@function .align 16 poly1305_init: .L_poly1305_init_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ebp xorl %eax,%eax movl %eax,(%edi) movl %eax,4(%edi) movl %eax,8(%edi) movl %eax,12(%edi) movl %eax,16(%edi) movl %eax,20(%edi) cmpl $0,%esi je .L000nokey call .L001pic_point .L001pic_point: popl %ebx leal poly1305_blocks-.L001pic_point(%ebx),%eax leal poly1305_emit-.L001pic_point(%ebx),%edx leal OPENSSL_ia32cap_P-.L001pic_point(%ebx),%edi movl (%edi),%ecx andl $83886080,%ecx cmpl $83886080,%ecx jne .L002no_sse2 leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx + movl 8(%edi),%ecx + testl $32,%ecx + jz .L002no_sse2 + leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax .L002no_sse2: movl 20(%esp),%edi movl %eax,(%ebp) movl %edx,4(%ebp) movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edx andl $268435455,%eax andl $268435452,%ebx andl $268435452,%ecx andl $268435452,%edx movl %eax,24(%edi) movl %ebx,28(%edi) movl %ecx,32(%edi) movl %edx,36(%edi) movl $1,%eax .L000nokey: popl %edi popl %esi popl %ebx popl %ebp ret .size poly1305_init,.-.L_poly1305_init_begin .globl poly1305_blocks .type poly1305_blocks,@function .align 16 poly1305_blocks: .L_poly1305_blocks_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ecx .Lenter_blocks: andl $-15,%ecx jz .L003nodata subl $64,%esp movl 24(%edi),%eax movl 28(%edi),%ebx leal (%esi,%ecx,1),%ebp movl 32(%edi),%ecx movl 36(%edi),%edx movl %ebp,92(%esp) movl %esi,%ebp movl %eax,36(%esp) movl %ebx,%eax shrl $2,%eax movl %ebx,40(%esp) addl %ebx,%eax movl %ecx,%ebx shrl $2,%ebx movl %ecx,44(%esp) addl %ecx,%ebx movl %edx,%ecx shrl $2,%ecx movl %edx,48(%esp) addl %edx,%ecx movl 
%eax,52(%esp) movl %ebx,56(%esp) movl %ecx,60(%esp) movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx movl 12(%edi),%esi movl 16(%edi),%edi jmp .L004loop .align 32 .L004loop: addl (%ebp),%eax adcl 4(%ebp),%ebx adcl 8(%ebp),%ecx adcl 12(%ebp),%esi leal 16(%ebp),%ebp adcl 96(%esp),%edi movl %eax,(%esp) movl %esi,12(%esp) mull 36(%esp) movl %edi,16(%esp) movl %eax,%edi movl %ebx,%eax movl %edx,%esi mull 60(%esp) addl %eax,%edi movl %ecx,%eax adcl %edx,%esi mull 56(%esp) addl %eax,%edi movl 12(%esp),%eax adcl %edx,%esi mull 52(%esp) addl %eax,%edi movl (%esp),%eax adcl %edx,%esi mull 40(%esp) movl %edi,20(%esp) xorl %edi,%edi addl %eax,%esi movl %ebx,%eax adcl %edx,%edi mull 36(%esp) addl %eax,%esi movl %ecx,%eax adcl %edx,%edi mull 60(%esp) addl %eax,%esi movl 12(%esp),%eax adcl %edx,%edi mull 56(%esp) addl %eax,%esi movl 16(%esp),%eax adcl %edx,%edi imull 52(%esp),%eax addl %eax,%esi movl (%esp),%eax adcl $0,%edi mull 44(%esp) movl %esi,24(%esp) xorl %esi,%esi addl %eax,%edi movl %ebx,%eax adcl %edx,%esi mull 40(%esp) addl %eax,%edi movl %ecx,%eax adcl %edx,%esi mull 36(%esp) addl %eax,%edi movl 12(%esp),%eax adcl %edx,%esi mull 60(%esp) addl %eax,%edi movl 16(%esp),%eax adcl %edx,%esi imull 56(%esp),%eax addl %eax,%edi movl (%esp),%eax adcl $0,%esi mull 48(%esp) movl %edi,28(%esp) xorl %edi,%edi addl %eax,%esi movl %ebx,%eax adcl %edx,%edi mull 44(%esp) addl %eax,%esi movl %ecx,%eax adcl %edx,%edi mull 40(%esp) addl %eax,%esi movl 12(%esp),%eax adcl %edx,%edi mull 36(%esp) addl %eax,%esi movl 16(%esp),%ecx adcl %edx,%edi movl %ecx,%edx imull 60(%esp),%ecx addl %ecx,%esi movl 20(%esp),%eax adcl $0,%edi imull 36(%esp),%edx addl %edi,%edx movl 24(%esp),%ebx movl 28(%esp),%ecx movl %edx,%edi shrl $2,%edx andl $3,%edi leal (%edx,%edx,4),%edx addl %edx,%eax adcl $0,%ebx adcl $0,%ecx adcl $0,%esi adcl $0,%edi cmpl 92(%esp),%ebp jne .L004loop movl 84(%esp),%edx addl $64,%esp movl %eax,(%edx) movl %ebx,4(%edx) movl %ecx,8(%edx) movl %esi,12(%edx) movl %edi,16(%edx) .L003nodata: popl %edi popl %esi popl %ebx popl %ebp ret .size poly1305_blocks,.-.L_poly1305_blocks_begin .globl poly1305_emit .type poly1305_emit,@function .align 16 poly1305_emit: .L_poly1305_emit_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi movl 20(%esp),%ebp .Lenter_emit: movl 24(%esp),%edi movl (%ebp),%eax movl 4(%ebp),%ebx movl 8(%ebp),%ecx movl 12(%ebp),%edx movl 16(%ebp),%esi addl $5,%eax adcl $0,%ebx adcl $0,%ecx adcl $0,%edx adcl $0,%esi shrl $2,%esi negl %esi andl %esi,%eax andl %esi,%ebx andl %esi,%ecx andl %esi,%edx movl %eax,(%edi) movl %ebx,4(%edi) movl %ecx,8(%edi) movl %edx,12(%edi) notl %esi movl (%ebp),%eax movl 4(%ebp),%ebx movl 8(%ebp),%ecx movl 12(%ebp),%edx movl 28(%esp),%ebp andl %esi,%eax andl %esi,%ebx andl %esi,%ecx andl %esi,%edx orl (%edi),%eax orl 4(%edi),%ebx orl 8(%edi),%ecx orl 12(%edi),%edx addl (%ebp),%eax adcl 4(%ebp),%ebx adcl 8(%ebp),%ecx adcl 12(%ebp),%edx movl %eax,(%edi) movl %ebx,4(%edi) movl %ecx,8(%edi) movl %edx,12(%edi) popl %edi popl %esi popl %ebx popl %ebp ret .size poly1305_emit,.-.L_poly1305_emit_begin .align 32 .type _poly1305_init_sse2,@function .align 16 _poly1305_init_sse2: movdqu 24(%edi),%xmm4 leal 48(%edi),%edi movl %esp,%ebp subl $224,%esp andl $-16,%esp movq 64(%ebx),%xmm7 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 movdqa %xmm4,%xmm2 pand %xmm7,%xmm0 psrlq $26,%xmm1 psrldq $6,%xmm2 pand %xmm7,%xmm1 movdqa %xmm2,%xmm3 psrlq $4,%xmm2 psrlq $30,%xmm3 pand %xmm7,%xmm2 pand %xmm7,%xmm3 psrldq $13,%xmm4 leal 144(%esp),%edx movl $2,%ecx .L005square: movdqa %xmm0,(%esp) movdqa 
%xmm1,16(%esp) movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) movdqa %xmm4,64(%esp) movdqa %xmm1,%xmm6 movdqa %xmm2,%xmm5 pslld $2,%xmm6 pslld $2,%xmm5 paddd %xmm1,%xmm6 paddd %xmm2,%xmm5 movdqa %xmm6,80(%esp) movdqa %xmm5,96(%esp) movdqa %xmm3,%xmm6 movdqa %xmm4,%xmm5 pslld $2,%xmm6 pslld $2,%xmm5 paddd %xmm3,%xmm6 paddd %xmm4,%xmm5 movdqa %xmm6,112(%esp) movdqa %xmm5,128(%esp) pshufd $68,%xmm0,%xmm6 movdqa %xmm1,%xmm5 pshufd $68,%xmm1,%xmm1 pshufd $68,%xmm2,%xmm2 pshufd $68,%xmm3,%xmm3 pshufd $68,%xmm4,%xmm4 movdqa %xmm6,(%edx) movdqa %xmm1,16(%edx) movdqa %xmm2,32(%edx) movdqa %xmm3,48(%edx) movdqa %xmm4,64(%edx) pmuludq %xmm0,%xmm4 pmuludq %xmm0,%xmm3 pmuludq %xmm0,%xmm2 pmuludq %xmm0,%xmm1 pmuludq %xmm6,%xmm0 movdqa %xmm5,%xmm6 pmuludq 48(%edx),%xmm5 movdqa %xmm6,%xmm7 pmuludq 32(%edx),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%edx),%xmm7 paddq %xmm6,%xmm3 movdqa 80(%esp),%xmm6 pmuludq (%edx),%xmm5 paddq %xmm7,%xmm2 pmuludq 64(%edx),%xmm6 movdqa 32(%esp),%xmm7 paddq %xmm5,%xmm1 movdqa %xmm7,%xmm5 pmuludq 32(%edx),%xmm7 paddq %xmm6,%xmm0 movdqa %xmm5,%xmm6 pmuludq 16(%edx),%xmm5 paddq %xmm7,%xmm4 movdqa 96(%esp),%xmm7 pmuludq (%edx),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq 64(%edx),%xmm7 paddq %xmm6,%xmm2 pmuludq 48(%edx),%xmm5 movdqa 48(%esp),%xmm6 paddq %xmm7,%xmm1 movdqa %xmm6,%xmm7 pmuludq 16(%edx),%xmm6 paddq %xmm5,%xmm0 movdqa 112(%esp),%xmm5 pmuludq (%edx),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 64(%edx),%xmm5 paddq %xmm7,%xmm3 movdqa %xmm6,%xmm7 pmuludq 48(%edx),%xmm6 paddq %xmm5,%xmm2 pmuludq 32(%edx),%xmm7 movdqa 64(%esp),%xmm5 paddq %xmm6,%xmm1 movdqa 128(%esp),%xmm6 pmuludq (%edx),%xmm5 paddq %xmm7,%xmm0 movdqa %xmm6,%xmm7 pmuludq 64(%edx),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%edx),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq 32(%edx),%xmm5 paddq %xmm7,%xmm0 pmuludq 48(%edx),%xmm6 movdqa 64(%ebx),%xmm7 paddq %xmm5,%xmm1 paddq %xmm6,%xmm2 movdqa %xmm3,%xmm5 pand %xmm7,%xmm3 psrlq $26,%xmm5 paddq %xmm4,%xmm5 movdqa %xmm0,%xmm6 pand %xmm7,%xmm0 psrlq $26,%xmm6 movdqa %xmm5,%xmm4 paddq %xmm1,%xmm6 psrlq $26,%xmm5 pand %xmm7,%xmm4 movdqa %xmm6,%xmm1 psrlq $26,%xmm6 paddd %xmm5,%xmm0 psllq $2,%xmm5 paddq %xmm2,%xmm6 paddq %xmm0,%xmm5 pand %xmm7,%xmm1 movdqa %xmm6,%xmm2 psrlq $26,%xmm6 pand %xmm7,%xmm2 paddd %xmm3,%xmm6 movdqa %xmm5,%xmm0 psrlq $26,%xmm5 movdqa %xmm6,%xmm3 psrlq $26,%xmm6 pand %xmm7,%xmm0 paddd %xmm5,%xmm1 pand %xmm7,%xmm3 paddd %xmm6,%xmm4 decl %ecx jz .L006square_break punpcklqdq (%esp),%xmm0 punpcklqdq 16(%esp),%xmm1 punpcklqdq 32(%esp),%xmm2 punpcklqdq 48(%esp),%xmm3 punpcklqdq 64(%esp),%xmm4 jmp .L005square .L006square_break: psllq $32,%xmm0 psllq $32,%xmm1 psllq $32,%xmm2 psllq $32,%xmm3 psllq $32,%xmm4 por (%esp),%xmm0 por 16(%esp),%xmm1 por 32(%esp),%xmm2 por 48(%esp),%xmm3 por 64(%esp),%xmm4 pshufd $141,%xmm0,%xmm0 pshufd $141,%xmm1,%xmm1 pshufd $141,%xmm2,%xmm2 pshufd $141,%xmm3,%xmm3 pshufd $141,%xmm4,%xmm4 movdqu %xmm0,(%edi) movdqu %xmm1,16(%edi) movdqu %xmm2,32(%edi) movdqu %xmm3,48(%edi) movdqu %xmm4,64(%edi) movdqa %xmm1,%xmm6 movdqa %xmm2,%xmm5 pslld $2,%xmm6 pslld $2,%xmm5 paddd %xmm1,%xmm6 paddd %xmm2,%xmm5 movdqu %xmm6,80(%edi) movdqu %xmm5,96(%edi) movdqa %xmm3,%xmm6 movdqa %xmm4,%xmm5 pslld $2,%xmm6 pslld $2,%xmm5 paddd %xmm3,%xmm6 paddd %xmm4,%xmm5 movdqu %xmm6,112(%edi) movdqu %xmm5,128(%edi) movl %ebp,%esp leal -48(%edi),%edi ret .size _poly1305_init_sse2,.-_poly1305_init_sse2 .align 32 .type _poly1305_blocks_sse2,@function .align 16 _poly1305_blocks_sse2: pushl %ebp pushl %ebx pushl 
%esi pushl %edi movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ecx movl 20(%edi),%eax andl $-16,%ecx jz .L007nodata cmpl $64,%ecx jae .L008enter_sse2 testl %eax,%eax jz .Lenter_blocks .align 16 .L008enter_sse2: call .L009pic_point .L009pic_point: popl %ebx leal .Lconst_sse2-.L009pic_point(%ebx),%ebx testl %eax,%eax jnz .L010base2_26 call _poly1305_init_sse2 movl (%edi),%eax movl 3(%edi),%ecx movl 6(%edi),%edx movl 9(%edi),%esi movl 13(%edi),%ebp movl $1,20(%edi) shrl $2,%ecx andl $67108863,%eax shrl $4,%edx andl $67108863,%ecx shrl $6,%esi andl $67108863,%edx movd %eax,%xmm0 movd %ecx,%xmm1 movd %edx,%xmm2 movd %esi,%xmm3 movd %ebp,%xmm4 movl 24(%esp),%esi movl 28(%esp),%ecx jmp .L011base2_32 .align 16 .L010base2_26: movd (%edi),%xmm0 movd 4(%edi),%xmm1 movd 8(%edi),%xmm2 movd 12(%edi),%xmm3 movd 16(%edi),%xmm4 movdqa 64(%ebx),%xmm7 .L011base2_32: movl 32(%esp),%eax movl %esp,%ebp subl $528,%esp andl $-16,%esp leal 48(%edi),%edi shll $24,%eax testl $31,%ecx jz .L012even movdqu (%esi),%xmm6 leal 16(%esi),%esi movdqa %xmm6,%xmm5 pand %xmm7,%xmm6 paddd %xmm6,%xmm0 movdqa %xmm5,%xmm6 psrlq $26,%xmm5 psrldq $6,%xmm6 pand %xmm7,%xmm5 paddd %xmm5,%xmm1 movdqa %xmm6,%xmm5 psrlq $4,%xmm6 pand %xmm7,%xmm6 paddd %xmm6,%xmm2 movdqa %xmm5,%xmm6 psrlq $30,%xmm5 pand %xmm7,%xmm5 psrldq $7,%xmm6 paddd %xmm5,%xmm3 movd %eax,%xmm5 paddd %xmm6,%xmm4 movd 12(%edi),%xmm6 paddd %xmm5,%xmm4 movdqa %xmm0,(%esp) movdqa %xmm1,16(%esp) movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) movdqa %xmm4,64(%esp) pmuludq %xmm6,%xmm0 pmuludq %xmm6,%xmm1 pmuludq %xmm6,%xmm2 movd 28(%edi),%xmm5 pmuludq %xmm6,%xmm3 pmuludq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 48(%esp),%xmm5 movdqa %xmm6,%xmm7 pmuludq 32(%esp),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%esp),%xmm7 paddq %xmm6,%xmm3 movd 92(%edi),%xmm6 pmuludq (%esp),%xmm5 paddq %xmm7,%xmm2 pmuludq 64(%esp),%xmm6 movd 44(%edi),%xmm7 paddq %xmm5,%xmm1 movdqa %xmm7,%xmm5 pmuludq 32(%esp),%xmm7 paddq %xmm6,%xmm0 movdqa %xmm5,%xmm6 pmuludq 16(%esp),%xmm5 paddq %xmm7,%xmm4 movd 108(%edi),%xmm7 pmuludq (%esp),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq 64(%esp),%xmm7 paddq %xmm6,%xmm2 pmuludq 48(%esp),%xmm5 movd 60(%edi),%xmm6 paddq %xmm7,%xmm1 movdqa %xmm6,%xmm7 pmuludq 16(%esp),%xmm6 paddq %xmm5,%xmm0 movd 124(%edi),%xmm5 pmuludq (%esp),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 64(%esp),%xmm5 paddq %xmm7,%xmm3 movdqa %xmm6,%xmm7 pmuludq 48(%esp),%xmm6 paddq %xmm5,%xmm2 pmuludq 32(%esp),%xmm7 movd 76(%edi),%xmm5 paddq %xmm6,%xmm1 movd 140(%edi),%xmm6 pmuludq (%esp),%xmm5 paddq %xmm7,%xmm0 movdqa %xmm6,%xmm7 pmuludq 64(%esp),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%esp),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq 32(%esp),%xmm5 paddq %xmm7,%xmm0 pmuludq 48(%esp),%xmm6 movdqa 64(%ebx),%xmm7 paddq %xmm5,%xmm1 paddq %xmm6,%xmm2 movdqa %xmm3,%xmm5 pand %xmm7,%xmm3 psrlq $26,%xmm5 paddq %xmm4,%xmm5 movdqa %xmm0,%xmm6 pand %xmm7,%xmm0 psrlq $26,%xmm6 movdqa %xmm5,%xmm4 paddq %xmm1,%xmm6 psrlq $26,%xmm5 pand %xmm7,%xmm4 movdqa %xmm6,%xmm1 psrlq $26,%xmm6 paddd %xmm5,%xmm0 psllq $2,%xmm5 paddq %xmm2,%xmm6 paddq %xmm0,%xmm5 pand %xmm7,%xmm1 movdqa %xmm6,%xmm2 psrlq $26,%xmm6 pand %xmm7,%xmm2 paddd %xmm3,%xmm6 movdqa %xmm5,%xmm0 psrlq $26,%xmm5 movdqa %xmm6,%xmm3 psrlq $26,%xmm6 pand %xmm7,%xmm0 paddd %xmm5,%xmm1 pand %xmm7,%xmm3 paddd %xmm6,%xmm4 subl $16,%ecx jz .L013done .L012even: leal 384(%esp),%edx leal -32(%esi),%eax subl $64,%ecx movdqu (%edi),%xmm5 pshufd $68,%xmm5,%xmm6 cmovbl %eax,%esi pshufd $238,%xmm5,%xmm5 movdqa %xmm6,(%edx) leal 
160(%esp),%eax movdqu 16(%edi),%xmm6 movdqa %xmm5,-144(%edx) pshufd $68,%xmm6,%xmm5 pshufd $238,%xmm6,%xmm6 movdqa %xmm5,16(%edx) movdqu 32(%edi),%xmm5 movdqa %xmm6,-128(%edx) pshufd $68,%xmm5,%xmm6 pshufd $238,%xmm5,%xmm5 movdqa %xmm6,32(%edx) movdqu 48(%edi),%xmm6 movdqa %xmm5,-112(%edx) pshufd $68,%xmm6,%xmm5 pshufd $238,%xmm6,%xmm6 movdqa %xmm5,48(%edx) movdqu 64(%edi),%xmm5 movdqa %xmm6,-96(%edx) pshufd $68,%xmm5,%xmm6 pshufd $238,%xmm5,%xmm5 movdqa %xmm6,64(%edx) movdqu 80(%edi),%xmm6 movdqa %xmm5,-80(%edx) pshufd $68,%xmm6,%xmm5 pshufd $238,%xmm6,%xmm6 movdqa %xmm5,80(%edx) movdqu 96(%edi),%xmm5 movdqa %xmm6,-64(%edx) pshufd $68,%xmm5,%xmm6 pshufd $238,%xmm5,%xmm5 movdqa %xmm6,96(%edx) movdqu 112(%edi),%xmm6 movdqa %xmm5,-48(%edx) pshufd $68,%xmm6,%xmm5 pshufd $238,%xmm6,%xmm6 movdqa %xmm5,112(%edx) movdqu 128(%edi),%xmm5 movdqa %xmm6,-32(%edx) pshufd $68,%xmm5,%xmm6 pshufd $238,%xmm5,%xmm5 movdqa %xmm6,128(%edx) movdqa %xmm5,-16(%edx) movdqu 32(%esi),%xmm5 movdqu 48(%esi),%xmm6 leal 32(%esi),%esi movdqa %xmm2,112(%esp) movdqa %xmm3,128(%esp) movdqa %xmm4,144(%esp) movdqa %xmm5,%xmm2 movdqa %xmm6,%xmm3 psrldq $6,%xmm2 psrldq $6,%xmm3 movdqa %xmm5,%xmm4 punpcklqdq %xmm3,%xmm2 punpckhqdq %xmm6,%xmm4 punpcklqdq %xmm6,%xmm5 movdqa %xmm2,%xmm3 psrlq $4,%xmm2 psrlq $30,%xmm3 movdqa %xmm5,%xmm6 psrlq $40,%xmm4 psrlq $26,%xmm6 pand %xmm7,%xmm5 pand %xmm7,%xmm6 pand %xmm7,%xmm2 pand %xmm7,%xmm3 por (%ebx),%xmm4 movdqa %xmm0,80(%esp) movdqa %xmm1,96(%esp) jbe .L014skip_loop jmp .L015loop .align 32 .L015loop: movdqa -144(%edx),%xmm7 movdqa %xmm6,16(%eax) movdqa %xmm2,32(%eax) movdqa %xmm3,48(%eax) movdqa %xmm4,64(%eax) movdqa %xmm5,%xmm1 pmuludq %xmm7,%xmm5 movdqa %xmm6,%xmm0 pmuludq %xmm7,%xmm6 pmuludq %xmm7,%xmm2 pmuludq %xmm7,%xmm3 pmuludq %xmm7,%xmm4 pmuludq -16(%edx),%xmm0 movdqa %xmm1,%xmm7 pmuludq -128(%edx),%xmm1 paddq %xmm5,%xmm0 movdqa %xmm7,%xmm5 pmuludq -112(%edx),%xmm7 paddq %xmm6,%xmm1 movdqa %xmm5,%xmm6 pmuludq -96(%edx),%xmm5 paddq %xmm7,%xmm2 movdqa 16(%eax),%xmm7 pmuludq -80(%edx),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq -128(%edx),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq -112(%edx),%xmm5 paddq %xmm7,%xmm2 movdqa 32(%eax),%xmm7 pmuludq -96(%edx),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq -32(%edx),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq -16(%edx),%xmm5 paddq %xmm7,%xmm0 movdqa %xmm6,%xmm7 pmuludq -128(%edx),%xmm6 paddq %xmm5,%xmm1 movdqa 48(%eax),%xmm5 pmuludq -112(%edx),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq -48(%edx),%xmm5 paddq %xmm7,%xmm4 movdqa %xmm6,%xmm7 pmuludq -32(%edx),%xmm6 paddq %xmm5,%xmm0 movdqa %xmm7,%xmm5 pmuludq -16(%edx),%xmm7 paddq %xmm6,%xmm1 movdqa 64(%eax),%xmm6 pmuludq -128(%edx),%xmm5 paddq %xmm7,%xmm2 movdqa %xmm6,%xmm7 pmuludq -16(%edx),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq -64(%edx),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq -48(%edx),%xmm5 paddq %xmm7,%xmm0 movdqa 64(%ebx),%xmm7 pmuludq -32(%edx),%xmm6 paddq %xmm5,%xmm1 paddq %xmm6,%xmm2 movdqu -32(%esi),%xmm5 movdqu -16(%esi),%xmm6 leal 32(%esi),%esi movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) movdqa %xmm4,64(%esp) movdqa %xmm5,%xmm2 movdqa %xmm6,%xmm3 psrldq $6,%xmm2 psrldq $6,%xmm3 movdqa %xmm5,%xmm4 punpcklqdq %xmm3,%xmm2 punpckhqdq %xmm6,%xmm4 punpcklqdq %xmm6,%xmm5 movdqa %xmm2,%xmm3 psrlq $4,%xmm2 psrlq $30,%xmm3 movdqa %xmm5,%xmm6 psrlq $40,%xmm4 psrlq $26,%xmm6 pand %xmm7,%xmm5 pand %xmm7,%xmm6 pand %xmm7,%xmm2 pand %xmm7,%xmm3 por (%ebx),%xmm4 leal -32(%esi),%eax subl $64,%ecx paddd 80(%esp),%xmm5 paddd 96(%esp),%xmm6 
paddd 112(%esp),%xmm2 paddd 128(%esp),%xmm3 paddd 144(%esp),%xmm4 cmovbl %eax,%esi leal 160(%esp),%eax movdqa (%edx),%xmm7 movdqa %xmm1,16(%esp) movdqa %xmm6,16(%eax) movdqa %xmm2,32(%eax) movdqa %xmm3,48(%eax) movdqa %xmm4,64(%eax) movdqa %xmm5,%xmm1 pmuludq %xmm7,%xmm5 paddq %xmm0,%xmm5 movdqa %xmm6,%xmm0 pmuludq %xmm7,%xmm6 pmuludq %xmm7,%xmm2 pmuludq %xmm7,%xmm3 pmuludq %xmm7,%xmm4 paddq 16(%esp),%xmm6 paddq 32(%esp),%xmm2 paddq 48(%esp),%xmm3 paddq 64(%esp),%xmm4 pmuludq 128(%edx),%xmm0 movdqa %xmm1,%xmm7 pmuludq 16(%edx),%xmm1 paddq %xmm5,%xmm0 movdqa %xmm7,%xmm5 pmuludq 32(%edx),%xmm7 paddq %xmm6,%xmm1 movdqa %xmm5,%xmm6 pmuludq 48(%edx),%xmm5 paddq %xmm7,%xmm2 movdqa 16(%eax),%xmm7 pmuludq 64(%edx),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq 16(%edx),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 32(%edx),%xmm5 paddq %xmm7,%xmm2 movdqa 32(%eax),%xmm7 pmuludq 48(%edx),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq 112(%edx),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 128(%edx),%xmm5 paddq %xmm7,%xmm0 movdqa %xmm6,%xmm7 pmuludq 16(%edx),%xmm6 paddq %xmm5,%xmm1 movdqa 48(%eax),%xmm5 pmuludq 32(%edx),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq 96(%edx),%xmm5 paddq %xmm7,%xmm4 movdqa %xmm6,%xmm7 pmuludq 112(%edx),%xmm6 paddq %xmm5,%xmm0 movdqa %xmm7,%xmm5 pmuludq 128(%edx),%xmm7 paddq %xmm6,%xmm1 movdqa 64(%eax),%xmm6 pmuludq 16(%edx),%xmm5 paddq %xmm7,%xmm2 movdqa %xmm6,%xmm7 pmuludq 128(%edx),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 80(%edx),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq 96(%edx),%xmm5 paddq %xmm7,%xmm0 movdqa 64(%ebx),%xmm7 pmuludq 112(%edx),%xmm6 paddq %xmm5,%xmm1 paddq %xmm6,%xmm2 movdqa %xmm3,%xmm5 pand %xmm7,%xmm3 psrlq $26,%xmm5 paddq %xmm4,%xmm5 movdqa %xmm0,%xmm6 pand %xmm7,%xmm0 psrlq $26,%xmm6 movdqa %xmm5,%xmm4 paddq %xmm1,%xmm6 psrlq $26,%xmm5 pand %xmm7,%xmm4 movdqa %xmm6,%xmm1 psrlq $26,%xmm6 paddd %xmm5,%xmm0 psllq $2,%xmm5 paddq %xmm2,%xmm6 paddq %xmm0,%xmm5 pand %xmm7,%xmm1 movdqa %xmm6,%xmm2 psrlq $26,%xmm6 pand %xmm7,%xmm2 paddd %xmm3,%xmm6 movdqa %xmm5,%xmm0 psrlq $26,%xmm5 movdqa %xmm6,%xmm3 psrlq $26,%xmm6 pand %xmm7,%xmm0 paddd %xmm5,%xmm1 pand %xmm7,%xmm3 paddd %xmm6,%xmm4 movdqu 32(%esi),%xmm5 movdqu 48(%esi),%xmm6 leal 32(%esi),%esi movdqa %xmm2,112(%esp) movdqa %xmm3,128(%esp) movdqa %xmm4,144(%esp) movdqa %xmm5,%xmm2 movdqa %xmm6,%xmm3 psrldq $6,%xmm2 psrldq $6,%xmm3 movdqa %xmm5,%xmm4 punpcklqdq %xmm3,%xmm2 punpckhqdq %xmm6,%xmm4 punpcklqdq %xmm6,%xmm5 movdqa %xmm2,%xmm3 psrlq $4,%xmm2 psrlq $30,%xmm3 movdqa %xmm5,%xmm6 psrlq $40,%xmm4 psrlq $26,%xmm6 pand %xmm7,%xmm5 pand %xmm7,%xmm6 pand %xmm7,%xmm2 pand %xmm7,%xmm3 por (%ebx),%xmm4 movdqa %xmm0,80(%esp) movdqa %xmm1,96(%esp) ja .L015loop .L014skip_loop: pshufd $16,-144(%edx),%xmm7 addl $32,%ecx jnz .L016long_tail paddd %xmm0,%xmm5 paddd %xmm1,%xmm6 paddd 112(%esp),%xmm2 paddd 128(%esp),%xmm3 paddd 144(%esp),%xmm4 .L016long_tail: movdqa %xmm5,(%eax) movdqa %xmm6,16(%eax) movdqa %xmm2,32(%eax) movdqa %xmm3,48(%eax) movdqa %xmm4,64(%eax) pmuludq %xmm7,%xmm5 pmuludq %xmm7,%xmm6 pmuludq %xmm7,%xmm2 movdqa %xmm5,%xmm0 pshufd $16,-128(%edx),%xmm5 pmuludq %xmm7,%xmm3 movdqa %xmm6,%xmm1 pmuludq %xmm7,%xmm4 movdqa %xmm5,%xmm6 pmuludq 48(%eax),%xmm5 movdqa %xmm6,%xmm7 pmuludq 32(%eax),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%eax),%xmm7 paddq %xmm6,%xmm3 pshufd $16,-64(%edx),%xmm6 pmuludq (%eax),%xmm5 paddq %xmm7,%xmm2 pmuludq 64(%eax),%xmm6 pshufd $16,-112(%edx),%xmm7 paddq %xmm5,%xmm1 movdqa %xmm7,%xmm5 pmuludq 32(%eax),%xmm7 paddq 
%xmm6,%xmm0 movdqa %xmm5,%xmm6 pmuludq 16(%eax),%xmm5 paddq %xmm7,%xmm4 pshufd $16,-48(%edx),%xmm7 pmuludq (%eax),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq 64(%eax),%xmm7 paddq %xmm6,%xmm2 pmuludq 48(%eax),%xmm5 pshufd $16,-96(%edx),%xmm6 paddq %xmm7,%xmm1 movdqa %xmm6,%xmm7 pmuludq 16(%eax),%xmm6 paddq %xmm5,%xmm0 pshufd $16,-32(%edx),%xmm5 pmuludq (%eax),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 64(%eax),%xmm5 paddq %xmm7,%xmm3 movdqa %xmm6,%xmm7 pmuludq 48(%eax),%xmm6 paddq %xmm5,%xmm2 pmuludq 32(%eax),%xmm7 pshufd $16,-80(%edx),%xmm5 paddq %xmm6,%xmm1 pshufd $16,-16(%edx),%xmm6 pmuludq (%eax),%xmm5 paddq %xmm7,%xmm0 movdqa %xmm6,%xmm7 pmuludq 64(%eax),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%eax),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq 32(%eax),%xmm5 paddq %xmm7,%xmm0 pmuludq 48(%eax),%xmm6 movdqa 64(%ebx),%xmm7 paddq %xmm5,%xmm1 paddq %xmm6,%xmm2 jz .L017short_tail movdqu -32(%esi),%xmm5 movdqu -16(%esi),%xmm6 leal 32(%esi),%esi movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) movdqa %xmm4,64(%esp) movdqa %xmm5,%xmm2 movdqa %xmm6,%xmm3 psrldq $6,%xmm2 psrldq $6,%xmm3 movdqa %xmm5,%xmm4 punpcklqdq %xmm3,%xmm2 punpckhqdq %xmm6,%xmm4 punpcklqdq %xmm6,%xmm5 movdqa %xmm2,%xmm3 psrlq $4,%xmm2 psrlq $30,%xmm3 movdqa %xmm5,%xmm6 psrlq $40,%xmm4 psrlq $26,%xmm6 pand %xmm7,%xmm5 pand %xmm7,%xmm6 pand %xmm7,%xmm2 pand %xmm7,%xmm3 por (%ebx),%xmm4 pshufd $16,(%edx),%xmm7 paddd 80(%esp),%xmm5 paddd 96(%esp),%xmm6 paddd 112(%esp),%xmm2 paddd 128(%esp),%xmm3 paddd 144(%esp),%xmm4 movdqa %xmm5,(%esp) pmuludq %xmm7,%xmm5 movdqa %xmm6,16(%esp) pmuludq %xmm7,%xmm6 paddq %xmm5,%xmm0 movdqa %xmm2,%xmm5 pmuludq %xmm7,%xmm2 paddq %xmm6,%xmm1 movdqa %xmm3,%xmm6 pmuludq %xmm7,%xmm3 paddq 32(%esp),%xmm2 movdqa %xmm5,32(%esp) pshufd $16,16(%edx),%xmm5 paddq 48(%esp),%xmm3 movdqa %xmm6,48(%esp) movdqa %xmm4,%xmm6 pmuludq %xmm7,%xmm4 paddq 64(%esp),%xmm4 movdqa %xmm6,64(%esp) movdqa %xmm5,%xmm6 pmuludq 48(%esp),%xmm5 movdqa %xmm6,%xmm7 pmuludq 32(%esp),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%esp),%xmm7 paddq %xmm6,%xmm3 pshufd $16,80(%edx),%xmm6 pmuludq (%esp),%xmm5 paddq %xmm7,%xmm2 pmuludq 64(%esp),%xmm6 pshufd $16,32(%edx),%xmm7 paddq %xmm5,%xmm1 movdqa %xmm7,%xmm5 pmuludq 32(%esp),%xmm7 paddq %xmm6,%xmm0 movdqa %xmm5,%xmm6 pmuludq 16(%esp),%xmm5 paddq %xmm7,%xmm4 pshufd $16,96(%edx),%xmm7 pmuludq (%esp),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq 64(%esp),%xmm7 paddq %xmm6,%xmm2 pmuludq 48(%esp),%xmm5 pshufd $16,48(%edx),%xmm6 paddq %xmm7,%xmm1 movdqa %xmm6,%xmm7 pmuludq 16(%esp),%xmm6 paddq %xmm5,%xmm0 pshufd $16,112(%edx),%xmm5 pmuludq (%esp),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 64(%esp),%xmm5 paddq %xmm7,%xmm3 movdqa %xmm6,%xmm7 pmuludq 48(%esp),%xmm6 paddq %xmm5,%xmm2 pmuludq 32(%esp),%xmm7 pshufd $16,64(%edx),%xmm5 paddq %xmm6,%xmm1 pshufd $16,128(%edx),%xmm6 pmuludq (%esp),%xmm5 paddq %xmm7,%xmm0 movdqa %xmm6,%xmm7 pmuludq 64(%esp),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%esp),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq 32(%esp),%xmm5 paddq %xmm7,%xmm0 pmuludq 48(%esp),%xmm6 movdqa 64(%ebx),%xmm7 paddq %xmm5,%xmm1 paddq %xmm6,%xmm2 .L017short_tail: pshufd $78,%xmm4,%xmm6 pshufd $78,%xmm3,%xmm5 paddq %xmm6,%xmm4 paddq %xmm5,%xmm3 pshufd $78,%xmm0,%xmm6 pshufd $78,%xmm1,%xmm5 paddq %xmm6,%xmm0 paddq %xmm5,%xmm1 pshufd $78,%xmm2,%xmm6 movdqa %xmm3,%xmm5 pand %xmm7,%xmm3 psrlq $26,%xmm5 paddq %xmm6,%xmm2 paddq %xmm4,%xmm5 movdqa %xmm0,%xmm6 pand %xmm7,%xmm0 psrlq $26,%xmm6 movdqa %xmm5,%xmm4 paddq %xmm1,%xmm6 
psrlq $26,%xmm5 pand %xmm7,%xmm4 movdqa %xmm6,%xmm1 psrlq $26,%xmm6 paddd %xmm5,%xmm0 psllq $2,%xmm5 paddq %xmm2,%xmm6 paddq %xmm0,%xmm5 pand %xmm7,%xmm1 movdqa %xmm6,%xmm2 psrlq $26,%xmm6 pand %xmm7,%xmm2 paddd %xmm3,%xmm6 movdqa %xmm5,%xmm0 psrlq $26,%xmm5 movdqa %xmm6,%xmm3 psrlq $26,%xmm6 pand %xmm7,%xmm0 paddd %xmm5,%xmm1 pand %xmm7,%xmm3 paddd %xmm6,%xmm4 .L013done: movd %xmm0,-48(%edi) movd %xmm1,-44(%edi) movd %xmm2,-40(%edi) movd %xmm3,-36(%edi) movd %xmm4,-32(%edi) movl %ebp,%esp .L007nodata: popl %edi popl %esi popl %ebx popl %ebp ret .size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2 .align 32 .type _poly1305_emit_sse2,@function .align 16 _poly1305_emit_sse2: pushl %ebp pushl %ebx pushl %esi pushl %edi movl 20(%esp),%ebp cmpl $0,20(%ebp) je .Lenter_emit movl (%ebp),%eax movl 4(%ebp),%edi movl 8(%ebp),%ecx movl 12(%ebp),%edx movl 16(%ebp),%esi movl %edi,%ebx shll $26,%edi shrl $6,%ebx addl %edi,%eax movl %ecx,%edi adcl $0,%ebx shll $20,%edi shrl $12,%ecx addl %edi,%ebx movl %edx,%edi adcl $0,%ecx shll $14,%edi shrl $18,%edx addl %edi,%ecx movl %esi,%edi adcl $0,%edx shll $8,%edi shrl $24,%esi addl %edi,%edx adcl $0,%esi movl %esi,%edi andl $3,%esi shrl $2,%edi leal (%edi,%edi,4),%ebp movl 24(%esp),%edi addl %ebp,%eax movl 28(%esp),%ebp adcl $0,%ebx adcl $0,%ecx adcl $0,%edx adcl $0,%esi movd %eax,%xmm0 addl $5,%eax movd %ebx,%xmm1 adcl $0,%ebx movd %ecx,%xmm2 adcl $0,%ecx movd %edx,%xmm3 adcl $0,%edx adcl $0,%esi shrl $2,%esi negl %esi andl %esi,%eax andl %esi,%ebx andl %esi,%ecx andl %esi,%edx movl %eax,(%edi) movd %xmm0,%eax movl %ebx,4(%edi) movd %xmm1,%ebx movl %ecx,8(%edi) movd %xmm2,%ecx movl %edx,12(%edi) movd %xmm3,%edx notl %esi andl %esi,%eax andl %esi,%ebx orl (%edi),%eax andl %esi,%ecx orl 4(%edi),%ebx andl %esi,%edx orl 8(%edi),%ecx orl 12(%edi),%edx addl (%ebp),%eax adcl 4(%ebp),%ebx movl %eax,(%edi) adcl 8(%ebp),%ecx movl %ebx,4(%edi) adcl 12(%ebp),%edx movl %ecx,8(%edi) movl %edx,12(%edi) popl %edi popl %esi popl %ebx popl %ebp ret .size _poly1305_emit_sse2,.-_poly1305_emit_sse2 +.align 32 +.type _poly1305_init_avx2,@function +.align 16 +_poly1305_init_avx2: + vmovdqu 24(%edi),%xmm4 + leal 48(%edi),%edi + movl %esp,%ebp + subl $224,%esp + andl $-16,%esp + vmovdqa 64(%ebx),%xmm7 + vpand %xmm7,%xmm4,%xmm0 + vpsrlq $26,%xmm4,%xmm1 + vpsrldq $6,%xmm4,%xmm3 + vpand %xmm7,%xmm1,%xmm1 + vpsrlq $4,%xmm3,%xmm2 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm7,%xmm2,%xmm2 + vpand %xmm7,%xmm3,%xmm3 + vpsrldq $13,%xmm4,%xmm4 + leal 144(%esp),%edx + movl $2,%ecx +.L018square: + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + vmovdqa %xmm4,64(%esp) + vpslld $2,%xmm1,%xmm6 + vpslld $2,%xmm2,%xmm5 + vpaddd %xmm1,%xmm6,%xmm6 + vpaddd %xmm2,%xmm5,%xmm5 + vmovdqa %xmm6,80(%esp) + vmovdqa %xmm5,96(%esp) + vpslld $2,%xmm3,%xmm6 + vpslld $2,%xmm4,%xmm5 + vpaddd %xmm3,%xmm6,%xmm6 + vpaddd %xmm4,%xmm5,%xmm5 + vmovdqa %xmm6,112(%esp) + vmovdqa %xmm5,128(%esp) + vpshufd $68,%xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vpshufd $68,%xmm1,%xmm1 + vpshufd $68,%xmm2,%xmm2 + vpshufd $68,%xmm3,%xmm3 + vpshufd $68,%xmm4,%xmm4 + vmovdqa %xmm5,(%edx) + vmovdqa %xmm1,16(%edx) + vmovdqa %xmm2,32(%edx) + vmovdqa %xmm3,48(%edx) + vmovdqa %xmm4,64(%edx) + vpmuludq %xmm0,%xmm4,%xmm4 + vpmuludq %xmm0,%xmm3,%xmm3 + vpmuludq %xmm0,%xmm2,%xmm2 + vpmuludq %xmm0,%xmm1,%xmm1 + vpmuludq %xmm0,%xmm5,%xmm0 + vpmuludq 48(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm4,%xmm4 + vpmuludq 32(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vpmuludq 16(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + 
vmovdqa 80(%esp),%xmm7 + vpmuludq (%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm1,%xmm1 + vmovdqa 32(%esp),%xmm5 + vpmuludq 64(%edx),%xmm7,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 32(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm4,%xmm4 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vmovdqa 96(%esp),%xmm6 + vpmuludq (%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vpmuludq 64(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm1,%xmm1 + vmovdqa 48(%esp),%xmm5 + vpmuludq 48(%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm0,%xmm0 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm4,%xmm4 + vmovdqa 112(%esp),%xmm6 + vpmuludq (%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm3,%xmm3 + vpmuludq 64(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm2,%xmm2 + vpmuludq 48(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm1,%xmm1 + vmovdqa 64(%esp),%xmm7 + vpmuludq 32(%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm0,%xmm0 + vmovdqa 128(%esp),%xmm5 + vpmuludq (%edx),%xmm7,%xmm7 + vpaddq %xmm7,%xmm4,%xmm4 + vpmuludq 64(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm3,%xmm3 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 32(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm1,%xmm1 + vmovdqa 64(%ebx),%xmm7 + vpmuludq 48(%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vpsrlq $26,%xmm3,%xmm5 + vpand %xmm7,%xmm3,%xmm3 + vpsrlq $26,%xmm0,%xmm6 + vpand %xmm7,%xmm0,%xmm0 + vpaddq %xmm5,%xmm4,%xmm4 + vpaddq %xmm6,%xmm1,%xmm1 + vpsrlq $26,%xmm4,%xmm5 + vpand %xmm7,%xmm4,%xmm4 + vpsrlq $26,%xmm1,%xmm6 + vpand %xmm7,%xmm1,%xmm1 + vpaddq %xmm6,%xmm2,%xmm2 + vpaddd %xmm5,%xmm0,%xmm0 + vpsllq $2,%xmm5,%xmm5 + vpsrlq $26,%xmm2,%xmm6 + vpand %xmm7,%xmm2,%xmm2 + vpaddd %xmm5,%xmm0,%xmm0 + vpaddd %xmm6,%xmm3,%xmm3 + vpsrlq $26,%xmm3,%xmm6 + vpsrlq $26,%xmm0,%xmm5 + vpand %xmm7,%xmm0,%xmm0 + vpand %xmm7,%xmm3,%xmm3 + vpaddd %xmm5,%xmm1,%xmm1 + vpaddd %xmm6,%xmm4,%xmm4 + decl %ecx + jz .L019square_break + vpunpcklqdq (%esp),%xmm0,%xmm0 + vpunpcklqdq 16(%esp),%xmm1,%xmm1 + vpunpcklqdq 32(%esp),%xmm2,%xmm2 + vpunpcklqdq 48(%esp),%xmm3,%xmm3 + vpunpcklqdq 64(%esp),%xmm4,%xmm4 + jmp .L018square +.L019square_break: + vpsllq $32,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm1 + vpsllq $32,%xmm2,%xmm2 + vpsllq $32,%xmm3,%xmm3 + vpsllq $32,%xmm4,%xmm4 + vpor (%esp),%xmm0,%xmm0 + vpor 16(%esp),%xmm1,%xmm1 + vpor 32(%esp),%xmm2,%xmm2 + vpor 48(%esp),%xmm3,%xmm3 + vpor 64(%esp),%xmm4,%xmm4 + vpshufd $141,%xmm0,%xmm0 + vpshufd $141,%xmm1,%xmm1 + vpshufd $141,%xmm2,%xmm2 + vpshufd $141,%xmm3,%xmm3 + vpshufd $141,%xmm4,%xmm4 + vmovdqu %xmm0,(%edi) + vmovdqu %xmm1,16(%edi) + vmovdqu %xmm2,32(%edi) + vmovdqu %xmm3,48(%edi) + vmovdqu %xmm4,64(%edi) + vpslld $2,%xmm1,%xmm6 + vpslld $2,%xmm2,%xmm5 + vpaddd %xmm1,%xmm6,%xmm6 + vpaddd %xmm2,%xmm5,%xmm5 + vmovdqu %xmm6,80(%edi) + vmovdqu %xmm5,96(%edi) + vpslld $2,%xmm3,%xmm6 + vpslld $2,%xmm4,%xmm5 + vpaddd %xmm3,%xmm6,%xmm6 + vpaddd %xmm4,%xmm5,%xmm5 + vmovdqu %xmm6,112(%edi) + vmovdqu %xmm5,128(%edi) + movl %ebp,%esp + leal -48(%edi),%edi + ret +.size _poly1305_init_avx2,.-_poly1305_init_avx2 +.align 32 +.type _poly1305_blocks_avx2,@function +.align 16 +_poly1305_blocks_avx2: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 20(%edi),%eax + andl $-16,%ecx + jz .L020nodata + cmpl $64,%ecx + jae .L021enter_avx2 + testl %eax,%eax + jz .Lenter_blocks +.L021enter_avx2: + vzeroupper + call .L022pic_point +.L022pic_point: + popl %ebx + leal .Lconst_sse2-.L022pic_point(%ebx),%ebx + testl %eax,%eax + jnz .L023base2_26 + call _poly1305_init_avx2 + movl (%edi),%eax + movl 3(%edi),%ecx + movl 6(%edi),%edx + movl 
9(%edi),%esi + movl 13(%edi),%ebp + shrl $2,%ecx + andl $67108863,%eax + shrl $4,%edx + andl $67108863,%ecx + shrl $6,%esi + andl $67108863,%edx + movl %eax,(%edi) + movl %ecx,4(%edi) + movl %edx,8(%edi) + movl %esi,12(%edi) + movl %ebp,16(%edi) + movl $1,20(%edi) + movl 24(%esp),%esi + movl 28(%esp),%ecx +.L023base2_26: + movl 32(%esp),%eax + movl %esp,%ebp + subl $448,%esp + andl $-512,%esp + vmovdqu 48(%edi),%xmm0 + leal 288(%esp),%edx + vmovdqu 64(%edi),%xmm1 + vmovdqu 80(%edi),%xmm2 + vmovdqu 96(%edi),%xmm3 + vmovdqu 112(%edi),%xmm4 + leal 48(%edi),%edi + vpermq $64,%ymm0,%ymm0 + vpermq $64,%ymm1,%ymm1 + vpermq $64,%ymm2,%ymm2 + vpermq $64,%ymm3,%ymm3 + vpermq $64,%ymm4,%ymm4 + vpshufd $200,%ymm0,%ymm0 + vpshufd $200,%ymm1,%ymm1 + vpshufd $200,%ymm2,%ymm2 + vpshufd $200,%ymm3,%ymm3 + vpshufd $200,%ymm4,%ymm4 + vmovdqa %ymm0,-128(%edx) + vmovdqu 80(%edi),%xmm0 + vmovdqa %ymm1,-96(%edx) + vmovdqu 96(%edi),%xmm1 + vmovdqa %ymm2,-64(%edx) + vmovdqu 112(%edi),%xmm2 + vmovdqa %ymm3,-32(%edx) + vmovdqu 128(%edi),%xmm3 + vmovdqa %ymm4,(%edx) + vpermq $64,%ymm0,%ymm0 + vpermq $64,%ymm1,%ymm1 + vpermq $64,%ymm2,%ymm2 + vpermq $64,%ymm3,%ymm3 + vpshufd $200,%ymm0,%ymm0 + vpshufd $200,%ymm1,%ymm1 + vpshufd $200,%ymm2,%ymm2 + vpshufd $200,%ymm3,%ymm3 + vmovdqa %ymm0,32(%edx) + vmovd -48(%edi),%xmm0 + vmovdqa %ymm1,64(%edx) + vmovd -44(%edi),%xmm1 + vmovdqa %ymm2,96(%edx) + vmovd -40(%edi),%xmm2 + vmovdqa %ymm3,128(%edx) + vmovd -36(%edi),%xmm3 + vmovd -32(%edi),%xmm4 + vmovdqa 64(%ebx),%ymm7 + negl %eax + testl $63,%ecx + jz .L024even + movl %ecx,%edx + andl $-64,%ecx + andl $63,%edx + vmovdqu (%esi),%xmm5 + cmpl $32,%edx + jb .L025one + vmovdqu 16(%esi),%xmm6 + je .L026two + vinserti128 $1,32(%esi),%ymm5,%ymm5 + leal 48(%esi),%esi + leal 8(%ebx),%ebx + leal 296(%esp),%edx + jmp .L027tail +.L026two: + leal 32(%esi),%esi + leal 16(%ebx),%ebx + leal 304(%esp),%edx + jmp .L027tail +.L025one: + leal 16(%esi),%esi + vpxor %ymm6,%ymm6,%ymm6 + leal 32(%ebx,%eax,8),%ebx + leal 312(%esp),%edx + jmp .L027tail +.align 32 +.L024even: + vmovdqu (%esi),%xmm5 + vmovdqu 16(%esi),%xmm6 + vinserti128 $1,32(%esi),%ymm5,%ymm5 + vinserti128 $1,48(%esi),%ymm6,%ymm6 + leal 64(%esi),%esi + subl $64,%ecx + jz .L027tail +.L028loop: + vmovdqa %ymm2,64(%esp) + vpsrldq $6,%ymm5,%ymm2 + vmovdqa %ymm0,(%esp) + vpsrldq $6,%ymm6,%ymm0 + vmovdqa %ymm1,32(%esp) + vpunpckhqdq %ymm6,%ymm5,%ymm1 + vpunpcklqdq %ymm6,%ymm5,%ymm5 + vpunpcklqdq %ymm0,%ymm2,%ymm2 + vpsrlq $30,%ymm2,%ymm0 + vpsrlq $4,%ymm2,%ymm2 + vpsrlq $26,%ymm5,%ymm6 + vpsrlq $40,%ymm1,%ymm1 + vpand %ymm7,%ymm2,%ymm2 + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpor (%ebx),%ymm1,%ymm1 + vpaddq 64(%esp),%ymm2,%ymm2 + vpaddq (%esp),%ymm5,%ymm5 + vpaddq 32(%esp),%ymm6,%ymm6 + vpaddq %ymm3,%ymm0,%ymm0 + vpaddq %ymm4,%ymm1,%ymm1 + vpmuludq -96(%edx),%ymm2,%ymm3 + vmovdqa %ymm6,32(%esp) + vpmuludq -64(%edx),%ymm2,%ymm4 + vmovdqa %ymm0,96(%esp) + vpmuludq 96(%edx),%ymm2,%ymm0 + vmovdqa %ymm1,128(%esp) + vpmuludq 128(%edx),%ymm2,%ymm1 + vpmuludq -128(%edx),%ymm2,%ymm2 + vpmuludq -32(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq (%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm4,%ymm4 + vpmuludq -128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm0,%ymm0 + vmovdqa 32(%esp),%ymm7 + vpmuludq -96(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq -64(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpmuludq -64(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm3,%ymm3 + vpmuludq -32(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm4,%ymm4 + vpmuludq 128(%edx),%ymm7,%ymm6 + vpaddq 
%ymm6,%ymm0,%ymm0 + vmovdqa 96(%esp),%ymm6 + vpmuludq -128(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm1,%ymm1 + vpmuludq -96(%edx),%ymm7,%ymm7 + vpaddq %ymm7,%ymm2,%ymm2 + vpmuludq -128(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm3,%ymm3 + vpmuludq -96(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vpmuludq 64(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm0,%ymm0 + vmovdqa 128(%esp),%ymm5 + vpmuludq 96(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm1,%ymm1 + vpmuludq 128(%edx),%ymm6,%ymm6 + vpaddq %ymm6,%ymm2,%ymm2 + vpmuludq 128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 32(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vpmuludq -128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vmovdqa 64(%ebx),%ymm7 + vpmuludq 64(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq 96(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpsrlq $26,%ymm3,%ymm5 + vpand %ymm7,%ymm3,%ymm3 + vpsrlq $26,%ymm0,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpaddq %ymm5,%ymm4,%ymm4 + vpaddq %ymm6,%ymm1,%ymm1 + vpsrlq $26,%ymm4,%ymm5 + vpand %ymm7,%ymm4,%ymm4 + vpsrlq $26,%ymm1,%ymm6 + vpand %ymm7,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpsllq $2,%ymm5,%ymm5 + vpsrlq $26,%ymm2,%ymm6 + vpand %ymm7,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrlq $26,%ymm3,%ymm6 + vpsrlq $26,%ymm0,%ymm5 + vpand %ymm7,%ymm0,%ymm0 + vpand %ymm7,%ymm3,%ymm3 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm4,%ymm4 + vmovdqu (%esi),%xmm5 + vmovdqu 16(%esi),%xmm6 + vinserti128 $1,32(%esi),%ymm5,%ymm5 + vinserti128 $1,48(%esi),%ymm6,%ymm6 + leal 64(%esi),%esi + subl $64,%ecx + jnz .L028loop +.L027tail: + vmovdqa %ymm2,64(%esp) + vpsrldq $6,%ymm5,%ymm2 + vmovdqa %ymm0,(%esp) + vpsrldq $6,%ymm6,%ymm0 + vmovdqa %ymm1,32(%esp) + vpunpckhqdq %ymm6,%ymm5,%ymm1 + vpunpcklqdq %ymm6,%ymm5,%ymm5 + vpunpcklqdq %ymm0,%ymm2,%ymm2 + vpsrlq $30,%ymm2,%ymm0 + vpsrlq $4,%ymm2,%ymm2 + vpsrlq $26,%ymm5,%ymm6 + vpsrlq $40,%ymm1,%ymm1 + vpand %ymm7,%ymm2,%ymm2 + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpor (%ebx),%ymm1,%ymm1 + andl $-64,%ebx + vpaddq 64(%esp),%ymm2,%ymm2 + vpaddq (%esp),%ymm5,%ymm5 + vpaddq 32(%esp),%ymm6,%ymm6 + vpaddq %ymm3,%ymm0,%ymm0 + vpaddq %ymm4,%ymm1,%ymm1 + vpmuludq -92(%edx),%ymm2,%ymm3 + vmovdqa %ymm6,32(%esp) + vpmuludq -60(%edx),%ymm2,%ymm4 + vmovdqa %ymm0,96(%esp) + vpmuludq 100(%edx),%ymm2,%ymm0 + vmovdqa %ymm1,128(%esp) + vpmuludq 132(%edx),%ymm2,%ymm1 + vpmuludq -124(%edx),%ymm2,%ymm2 + vpmuludq -28(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 4(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm4,%ymm4 + vpmuludq -124(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm0,%ymm0 + vmovdqa 32(%esp),%ymm7 + vpmuludq -92(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq -60(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpmuludq -60(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm3,%ymm3 + vpmuludq -28(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm4,%ymm4 + vpmuludq 132(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vmovdqa 96(%esp),%ymm6 + vpmuludq -124(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm1,%ymm1 + vpmuludq -92(%edx),%ymm7,%ymm7 + vpaddq %ymm7,%ymm2,%ymm2 + vpmuludq -124(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm3,%ymm3 + vpmuludq -92(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vpmuludq 68(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm0,%ymm0 + vmovdqa 128(%esp),%ymm5 + vpmuludq 100(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm1,%ymm1 + vpmuludq 132(%edx),%ymm6,%ymm6 + vpaddq %ymm6,%ymm2,%ymm2 + vpmuludq 132(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 36(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vpmuludq 
-124(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vmovdqa 64(%ebx),%ymm7 + vpmuludq 68(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq 100(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpsrldq $8,%ymm4,%ymm5 + vpsrldq $8,%ymm3,%ymm6 + vpaddq %ymm5,%ymm4,%ymm4 + vpsrldq $8,%ymm0,%ymm5 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrldq $8,%ymm1,%ymm6 + vpaddq %ymm5,%ymm0,%ymm0 + vpsrldq $8,%ymm2,%ymm5 + vpaddq %ymm6,%ymm1,%ymm1 + vpermq $2,%ymm4,%ymm6 + vpaddq %ymm5,%ymm2,%ymm2 + vpermq $2,%ymm3,%ymm5 + vpaddq %ymm6,%ymm4,%ymm4 + vpermq $2,%ymm0,%ymm6 + vpaddq %ymm5,%ymm3,%ymm3 + vpermq $2,%ymm1,%ymm5 + vpaddq %ymm6,%ymm0,%ymm0 + vpermq $2,%ymm2,%ymm6 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpsrlq $26,%ymm3,%ymm5 + vpand %ymm7,%ymm3,%ymm3 + vpsrlq $26,%ymm0,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpaddq %ymm5,%ymm4,%ymm4 + vpaddq %ymm6,%ymm1,%ymm1 + vpsrlq $26,%ymm4,%ymm5 + vpand %ymm7,%ymm4,%ymm4 + vpsrlq $26,%ymm1,%ymm6 + vpand %ymm7,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpsllq $2,%ymm5,%ymm5 + vpsrlq $26,%ymm2,%ymm6 + vpand %ymm7,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrlq $26,%ymm3,%ymm6 + vpsrlq $26,%ymm0,%ymm5 + vpand %ymm7,%ymm0,%ymm0 + vpand %ymm7,%ymm3,%ymm3 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm4,%ymm4 + cmpl $0,%ecx + je .L029done + vpshufd $252,%xmm0,%xmm0 + leal 288(%esp),%edx + vpshufd $252,%xmm1,%xmm1 + vpshufd $252,%xmm2,%xmm2 + vpshufd $252,%xmm3,%xmm3 + vpshufd $252,%xmm4,%xmm4 + jmp .L024even +.align 16 +.L029done: + vmovd %xmm0,-48(%edi) + vmovd %xmm1,-44(%edi) + vmovd %xmm2,-40(%edi) + vmovd %xmm3,-36(%edi) + vmovd %xmm4,-32(%edi) + vzeroupper + movl %ebp,%esp +.L020nodata: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2 .align 64 .Lconst_sse2: .long 16777216,0,16777216,0,16777216,0,16777216,0 .long 0,0,0,0,0,0,0,0 .long 67108863,0,67108863,0,67108863,0,67108863,0 .long 268435455,268435452,268435452,268435452 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 .align 4 .comm OPENSSL_ia32cap_P,16,4 #else .text .align 64 .globl poly1305_init .type poly1305_init,@function .align 16 poly1305_init: .L_poly1305_init_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ebp xorl %eax,%eax movl %eax,(%edi) movl %eax,4(%edi) movl %eax,8(%edi) movl %eax,12(%edi) movl %eax,16(%edi) movl %eax,20(%edi) cmpl $0,%esi je .L000nokey call .L001pic_point .L001pic_point: popl %ebx leal poly1305_blocks-.L001pic_point(%ebx),%eax leal poly1305_emit-.L001pic_point(%ebx),%edx leal OPENSSL_ia32cap_P,%edi movl (%edi),%ecx andl $83886080,%ecx cmpl $83886080,%ecx jne .L002no_sse2 leal _poly1305_blocks_sse2-.L001pic_point(%ebx),%eax leal _poly1305_emit_sse2-.L001pic_point(%ebx),%edx + movl 8(%edi),%ecx + testl $32,%ecx + jz .L002no_sse2 + leal _poly1305_blocks_avx2-.L001pic_point(%ebx),%eax .L002no_sse2: movl 20(%esp),%edi movl %eax,(%ebp) movl %edx,4(%ebp) movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edx andl $268435455,%eax andl $268435452,%ebx andl $268435452,%ecx andl $268435452,%edx movl %eax,24(%edi) movl %ebx,28(%edi) movl %ecx,32(%edi) movl %edx,36(%edi) movl $1,%eax .L000nokey: popl %edi popl %esi popl %ebx popl %ebp ret .size poly1305_init,.-.L_poly1305_init_begin .globl poly1305_blocks .type poly1305_blocks,@function .align 16 
poly1305_blocks: .L_poly1305_blocks_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ecx .Lenter_blocks: andl $-15,%ecx jz .L003nodata subl $64,%esp movl 24(%edi),%eax movl 28(%edi),%ebx leal (%esi,%ecx,1),%ebp movl 32(%edi),%ecx movl 36(%edi),%edx movl %ebp,92(%esp) movl %esi,%ebp movl %eax,36(%esp) movl %ebx,%eax shrl $2,%eax movl %ebx,40(%esp) addl %ebx,%eax movl %ecx,%ebx shrl $2,%ebx movl %ecx,44(%esp) addl %ecx,%ebx movl %edx,%ecx shrl $2,%ecx movl %edx,48(%esp) addl %edx,%ecx movl %eax,52(%esp) movl %ebx,56(%esp) movl %ecx,60(%esp) movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx movl 12(%edi),%esi movl 16(%edi),%edi jmp .L004loop .align 32 .L004loop: addl (%ebp),%eax adcl 4(%ebp),%ebx adcl 8(%ebp),%ecx adcl 12(%ebp),%esi leal 16(%ebp),%ebp adcl 96(%esp),%edi movl %eax,(%esp) movl %esi,12(%esp) mull 36(%esp) movl %edi,16(%esp) movl %eax,%edi movl %ebx,%eax movl %edx,%esi mull 60(%esp) addl %eax,%edi movl %ecx,%eax adcl %edx,%esi mull 56(%esp) addl %eax,%edi movl 12(%esp),%eax adcl %edx,%esi mull 52(%esp) addl %eax,%edi movl (%esp),%eax adcl %edx,%esi mull 40(%esp) movl %edi,20(%esp) xorl %edi,%edi addl %eax,%esi movl %ebx,%eax adcl %edx,%edi mull 36(%esp) addl %eax,%esi movl %ecx,%eax adcl %edx,%edi mull 60(%esp) addl %eax,%esi movl 12(%esp),%eax adcl %edx,%edi mull 56(%esp) addl %eax,%esi movl 16(%esp),%eax adcl %edx,%edi imull 52(%esp),%eax addl %eax,%esi movl (%esp),%eax adcl $0,%edi mull 44(%esp) movl %esi,24(%esp) xorl %esi,%esi addl %eax,%edi movl %ebx,%eax adcl %edx,%esi mull 40(%esp) addl %eax,%edi movl %ecx,%eax adcl %edx,%esi mull 36(%esp) addl %eax,%edi movl 12(%esp),%eax adcl %edx,%esi mull 60(%esp) addl %eax,%edi movl 16(%esp),%eax adcl %edx,%esi imull 56(%esp),%eax addl %eax,%edi movl (%esp),%eax adcl $0,%esi mull 48(%esp) movl %edi,28(%esp) xorl %edi,%edi addl %eax,%esi movl %ebx,%eax adcl %edx,%edi mull 44(%esp) addl %eax,%esi movl %ecx,%eax adcl %edx,%edi mull 40(%esp) addl %eax,%esi movl 12(%esp),%eax adcl %edx,%edi mull 36(%esp) addl %eax,%esi movl 16(%esp),%ecx adcl %edx,%edi movl %ecx,%edx imull 60(%esp),%ecx addl %ecx,%esi movl 20(%esp),%eax adcl $0,%edi imull 36(%esp),%edx addl %edi,%edx movl 24(%esp),%ebx movl 28(%esp),%ecx movl %edx,%edi shrl $2,%edx andl $3,%edi leal (%edx,%edx,4),%edx addl %edx,%eax adcl $0,%ebx adcl $0,%ecx adcl $0,%esi adcl $0,%edi cmpl 92(%esp),%ebp jne .L004loop movl 84(%esp),%edx addl $64,%esp movl %eax,(%edx) movl %ebx,4(%edx) movl %ecx,8(%edx) movl %esi,12(%edx) movl %edi,16(%edx) .L003nodata: popl %edi popl %esi popl %ebx popl %ebp ret .size poly1305_blocks,.-.L_poly1305_blocks_begin .globl poly1305_emit .type poly1305_emit,@function .align 16 poly1305_emit: .L_poly1305_emit_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi movl 20(%esp),%ebp .Lenter_emit: movl 24(%esp),%edi movl (%ebp),%eax movl 4(%ebp),%ebx movl 8(%ebp),%ecx movl 12(%ebp),%edx movl 16(%ebp),%esi addl $5,%eax adcl $0,%ebx adcl $0,%ecx adcl $0,%edx adcl $0,%esi shrl $2,%esi negl %esi andl %esi,%eax andl %esi,%ebx andl %esi,%ecx andl %esi,%edx movl %eax,(%edi) movl %ebx,4(%edi) movl %ecx,8(%edi) movl %edx,12(%edi) notl %esi movl (%ebp),%eax movl 4(%ebp),%ebx movl 8(%ebp),%ecx movl 12(%ebp),%edx movl 28(%esp),%ebp andl %esi,%eax andl %esi,%ebx andl %esi,%ecx andl %esi,%edx orl (%edi),%eax orl 4(%edi),%ebx orl 8(%edi),%ecx orl 12(%edi),%edx addl (%ebp),%eax adcl 4(%ebp),%ebx adcl 8(%ebp),%ecx adcl 12(%ebp),%edx movl %eax,(%edi) movl %ebx,4(%edi) movl %ecx,8(%edi) movl %edx,12(%edi) popl %edi popl %esi popl 
%ebx popl %ebp ret .size poly1305_emit,.-.L_poly1305_emit_begin .align 32 .type _poly1305_init_sse2,@function .align 16 _poly1305_init_sse2: movdqu 24(%edi),%xmm4 leal 48(%edi),%edi movl %esp,%ebp subl $224,%esp andl $-16,%esp movq 64(%ebx),%xmm7 movdqa %xmm4,%xmm0 movdqa %xmm4,%xmm1 movdqa %xmm4,%xmm2 pand %xmm7,%xmm0 psrlq $26,%xmm1 psrldq $6,%xmm2 pand %xmm7,%xmm1 movdqa %xmm2,%xmm3 psrlq $4,%xmm2 psrlq $30,%xmm3 pand %xmm7,%xmm2 pand %xmm7,%xmm3 psrldq $13,%xmm4 leal 144(%esp),%edx movl $2,%ecx .L005square: movdqa %xmm0,(%esp) movdqa %xmm1,16(%esp) movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) movdqa %xmm4,64(%esp) movdqa %xmm1,%xmm6 movdqa %xmm2,%xmm5 pslld $2,%xmm6 pslld $2,%xmm5 paddd %xmm1,%xmm6 paddd %xmm2,%xmm5 movdqa %xmm6,80(%esp) movdqa %xmm5,96(%esp) movdqa %xmm3,%xmm6 movdqa %xmm4,%xmm5 pslld $2,%xmm6 pslld $2,%xmm5 paddd %xmm3,%xmm6 paddd %xmm4,%xmm5 movdqa %xmm6,112(%esp) movdqa %xmm5,128(%esp) pshufd $68,%xmm0,%xmm6 movdqa %xmm1,%xmm5 pshufd $68,%xmm1,%xmm1 pshufd $68,%xmm2,%xmm2 pshufd $68,%xmm3,%xmm3 pshufd $68,%xmm4,%xmm4 movdqa %xmm6,(%edx) movdqa %xmm1,16(%edx) movdqa %xmm2,32(%edx) movdqa %xmm3,48(%edx) movdqa %xmm4,64(%edx) pmuludq %xmm0,%xmm4 pmuludq %xmm0,%xmm3 pmuludq %xmm0,%xmm2 pmuludq %xmm0,%xmm1 pmuludq %xmm6,%xmm0 movdqa %xmm5,%xmm6 pmuludq 48(%edx),%xmm5 movdqa %xmm6,%xmm7 pmuludq 32(%edx),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%edx),%xmm7 paddq %xmm6,%xmm3 movdqa 80(%esp),%xmm6 pmuludq (%edx),%xmm5 paddq %xmm7,%xmm2 pmuludq 64(%edx),%xmm6 movdqa 32(%esp),%xmm7 paddq %xmm5,%xmm1 movdqa %xmm7,%xmm5 pmuludq 32(%edx),%xmm7 paddq %xmm6,%xmm0 movdqa %xmm5,%xmm6 pmuludq 16(%edx),%xmm5 paddq %xmm7,%xmm4 movdqa 96(%esp),%xmm7 pmuludq (%edx),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq 64(%edx),%xmm7 paddq %xmm6,%xmm2 pmuludq 48(%edx),%xmm5 movdqa 48(%esp),%xmm6 paddq %xmm7,%xmm1 movdqa %xmm6,%xmm7 pmuludq 16(%edx),%xmm6 paddq %xmm5,%xmm0 movdqa 112(%esp),%xmm5 pmuludq (%edx),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 64(%edx),%xmm5 paddq %xmm7,%xmm3 movdqa %xmm6,%xmm7 pmuludq 48(%edx),%xmm6 paddq %xmm5,%xmm2 pmuludq 32(%edx),%xmm7 movdqa 64(%esp),%xmm5 paddq %xmm6,%xmm1 movdqa 128(%esp),%xmm6 pmuludq (%edx),%xmm5 paddq %xmm7,%xmm0 movdqa %xmm6,%xmm7 pmuludq 64(%edx),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%edx),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq 32(%edx),%xmm5 paddq %xmm7,%xmm0 pmuludq 48(%edx),%xmm6 movdqa 64(%ebx),%xmm7 paddq %xmm5,%xmm1 paddq %xmm6,%xmm2 movdqa %xmm3,%xmm5 pand %xmm7,%xmm3 psrlq $26,%xmm5 paddq %xmm4,%xmm5 movdqa %xmm0,%xmm6 pand %xmm7,%xmm0 psrlq $26,%xmm6 movdqa %xmm5,%xmm4 paddq %xmm1,%xmm6 psrlq $26,%xmm5 pand %xmm7,%xmm4 movdqa %xmm6,%xmm1 psrlq $26,%xmm6 paddd %xmm5,%xmm0 psllq $2,%xmm5 paddq %xmm2,%xmm6 paddq %xmm0,%xmm5 pand %xmm7,%xmm1 movdqa %xmm6,%xmm2 psrlq $26,%xmm6 pand %xmm7,%xmm2 paddd %xmm3,%xmm6 movdqa %xmm5,%xmm0 psrlq $26,%xmm5 movdqa %xmm6,%xmm3 psrlq $26,%xmm6 pand %xmm7,%xmm0 paddd %xmm5,%xmm1 pand %xmm7,%xmm3 paddd %xmm6,%xmm4 decl %ecx jz .L006square_break punpcklqdq (%esp),%xmm0 punpcklqdq 16(%esp),%xmm1 punpcklqdq 32(%esp),%xmm2 punpcklqdq 48(%esp),%xmm3 punpcklqdq 64(%esp),%xmm4 jmp .L005square .L006square_break: psllq $32,%xmm0 psllq $32,%xmm1 psllq $32,%xmm2 psllq $32,%xmm3 psllq $32,%xmm4 por (%esp),%xmm0 por 16(%esp),%xmm1 por 32(%esp),%xmm2 por 48(%esp),%xmm3 por 64(%esp),%xmm4 pshufd $141,%xmm0,%xmm0 pshufd $141,%xmm1,%xmm1 pshufd $141,%xmm2,%xmm2 pshufd $141,%xmm3,%xmm3 pshufd $141,%xmm4,%xmm4 movdqu %xmm0,(%edi) movdqu %xmm1,16(%edi) movdqu 
%xmm2,32(%edi) movdqu %xmm3,48(%edi) movdqu %xmm4,64(%edi) movdqa %xmm1,%xmm6 movdqa %xmm2,%xmm5 pslld $2,%xmm6 pslld $2,%xmm5 paddd %xmm1,%xmm6 paddd %xmm2,%xmm5 movdqu %xmm6,80(%edi) movdqu %xmm5,96(%edi) movdqa %xmm3,%xmm6 movdqa %xmm4,%xmm5 pslld $2,%xmm6 pslld $2,%xmm5 paddd %xmm3,%xmm6 paddd %xmm4,%xmm5 movdqu %xmm6,112(%edi) movdqu %xmm5,128(%edi) movl %ebp,%esp leal -48(%edi),%edi ret .size _poly1305_init_sse2,.-_poly1305_init_sse2 .align 32 .type _poly1305_blocks_sse2,@function .align 16 _poly1305_blocks_sse2: pushl %ebp pushl %ebx pushl %esi pushl %edi movl 20(%esp),%edi movl 24(%esp),%esi movl 28(%esp),%ecx movl 20(%edi),%eax andl $-16,%ecx jz .L007nodata cmpl $64,%ecx jae .L008enter_sse2 testl %eax,%eax jz .Lenter_blocks .align 16 .L008enter_sse2: call .L009pic_point .L009pic_point: popl %ebx leal .Lconst_sse2-.L009pic_point(%ebx),%ebx testl %eax,%eax jnz .L010base2_26 call _poly1305_init_sse2 movl (%edi),%eax movl 3(%edi),%ecx movl 6(%edi),%edx movl 9(%edi),%esi movl 13(%edi),%ebp movl $1,20(%edi) shrl $2,%ecx andl $67108863,%eax shrl $4,%edx andl $67108863,%ecx shrl $6,%esi andl $67108863,%edx movd %eax,%xmm0 movd %ecx,%xmm1 movd %edx,%xmm2 movd %esi,%xmm3 movd %ebp,%xmm4 movl 24(%esp),%esi movl 28(%esp),%ecx jmp .L011base2_32 .align 16 .L010base2_26: movd (%edi),%xmm0 movd 4(%edi),%xmm1 movd 8(%edi),%xmm2 movd 12(%edi),%xmm3 movd 16(%edi),%xmm4 movdqa 64(%ebx),%xmm7 .L011base2_32: movl 32(%esp),%eax movl %esp,%ebp subl $528,%esp andl $-16,%esp leal 48(%edi),%edi shll $24,%eax testl $31,%ecx jz .L012even movdqu (%esi),%xmm6 leal 16(%esi),%esi movdqa %xmm6,%xmm5 pand %xmm7,%xmm6 paddd %xmm6,%xmm0 movdqa %xmm5,%xmm6 psrlq $26,%xmm5 psrldq $6,%xmm6 pand %xmm7,%xmm5 paddd %xmm5,%xmm1 movdqa %xmm6,%xmm5 psrlq $4,%xmm6 pand %xmm7,%xmm6 paddd %xmm6,%xmm2 movdqa %xmm5,%xmm6 psrlq $30,%xmm5 pand %xmm7,%xmm5 psrldq $7,%xmm6 paddd %xmm5,%xmm3 movd %eax,%xmm5 paddd %xmm6,%xmm4 movd 12(%edi),%xmm6 paddd %xmm5,%xmm4 movdqa %xmm0,(%esp) movdqa %xmm1,16(%esp) movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) movdqa %xmm4,64(%esp) pmuludq %xmm6,%xmm0 pmuludq %xmm6,%xmm1 pmuludq %xmm6,%xmm2 movd 28(%edi),%xmm5 pmuludq %xmm6,%xmm3 pmuludq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 48(%esp),%xmm5 movdqa %xmm6,%xmm7 pmuludq 32(%esp),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%esp),%xmm7 paddq %xmm6,%xmm3 movd 92(%edi),%xmm6 pmuludq (%esp),%xmm5 paddq %xmm7,%xmm2 pmuludq 64(%esp),%xmm6 movd 44(%edi),%xmm7 paddq %xmm5,%xmm1 movdqa %xmm7,%xmm5 pmuludq 32(%esp),%xmm7 paddq %xmm6,%xmm0 movdqa %xmm5,%xmm6 pmuludq 16(%esp),%xmm5 paddq %xmm7,%xmm4 movd 108(%edi),%xmm7 pmuludq (%esp),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq 64(%esp),%xmm7 paddq %xmm6,%xmm2 pmuludq 48(%esp),%xmm5 movd 60(%edi),%xmm6 paddq %xmm7,%xmm1 movdqa %xmm6,%xmm7 pmuludq 16(%esp),%xmm6 paddq %xmm5,%xmm0 movd 124(%edi),%xmm5 pmuludq (%esp),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 64(%esp),%xmm5 paddq %xmm7,%xmm3 movdqa %xmm6,%xmm7 pmuludq 48(%esp),%xmm6 paddq %xmm5,%xmm2 pmuludq 32(%esp),%xmm7 movd 76(%edi),%xmm5 paddq %xmm6,%xmm1 movd 140(%edi),%xmm6 pmuludq (%esp),%xmm5 paddq %xmm7,%xmm0 movdqa %xmm6,%xmm7 pmuludq 64(%esp),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%esp),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq 32(%esp),%xmm5 paddq %xmm7,%xmm0 pmuludq 48(%esp),%xmm6 movdqa 64(%ebx),%xmm7 paddq %xmm5,%xmm1 paddq %xmm6,%xmm2 movdqa %xmm3,%xmm5 pand %xmm7,%xmm3 psrlq $26,%xmm5 paddq %xmm4,%xmm5 movdqa %xmm0,%xmm6 pand %xmm7,%xmm0 psrlq $26,%xmm6 movdqa %xmm5,%xmm4 paddq %xmm1,%xmm6 psrlq 
$26,%xmm5 pand %xmm7,%xmm4 movdqa %xmm6,%xmm1 psrlq $26,%xmm6 paddd %xmm5,%xmm0 psllq $2,%xmm5 paddq %xmm2,%xmm6 paddq %xmm0,%xmm5 pand %xmm7,%xmm1 movdqa %xmm6,%xmm2 psrlq $26,%xmm6 pand %xmm7,%xmm2 paddd %xmm3,%xmm6 movdqa %xmm5,%xmm0 psrlq $26,%xmm5 movdqa %xmm6,%xmm3 psrlq $26,%xmm6 pand %xmm7,%xmm0 paddd %xmm5,%xmm1 pand %xmm7,%xmm3 paddd %xmm6,%xmm4 subl $16,%ecx jz .L013done .L012even: leal 384(%esp),%edx leal -32(%esi),%eax subl $64,%ecx movdqu (%edi),%xmm5 pshufd $68,%xmm5,%xmm6 cmovbl %eax,%esi pshufd $238,%xmm5,%xmm5 movdqa %xmm6,(%edx) leal 160(%esp),%eax movdqu 16(%edi),%xmm6 movdqa %xmm5,-144(%edx) pshufd $68,%xmm6,%xmm5 pshufd $238,%xmm6,%xmm6 movdqa %xmm5,16(%edx) movdqu 32(%edi),%xmm5 movdqa %xmm6,-128(%edx) pshufd $68,%xmm5,%xmm6 pshufd $238,%xmm5,%xmm5 movdqa %xmm6,32(%edx) movdqu 48(%edi),%xmm6 movdqa %xmm5,-112(%edx) pshufd $68,%xmm6,%xmm5 pshufd $238,%xmm6,%xmm6 movdqa %xmm5,48(%edx) movdqu 64(%edi),%xmm5 movdqa %xmm6,-96(%edx) pshufd $68,%xmm5,%xmm6 pshufd $238,%xmm5,%xmm5 movdqa %xmm6,64(%edx) movdqu 80(%edi),%xmm6 movdqa %xmm5,-80(%edx) pshufd $68,%xmm6,%xmm5 pshufd $238,%xmm6,%xmm6 movdqa %xmm5,80(%edx) movdqu 96(%edi),%xmm5 movdqa %xmm6,-64(%edx) pshufd $68,%xmm5,%xmm6 pshufd $238,%xmm5,%xmm5 movdqa %xmm6,96(%edx) movdqu 112(%edi),%xmm6 movdqa %xmm5,-48(%edx) pshufd $68,%xmm6,%xmm5 pshufd $238,%xmm6,%xmm6 movdqa %xmm5,112(%edx) movdqu 128(%edi),%xmm5 movdqa %xmm6,-32(%edx) pshufd $68,%xmm5,%xmm6 pshufd $238,%xmm5,%xmm5 movdqa %xmm6,128(%edx) movdqa %xmm5,-16(%edx) movdqu 32(%esi),%xmm5 movdqu 48(%esi),%xmm6 leal 32(%esi),%esi movdqa %xmm2,112(%esp) movdqa %xmm3,128(%esp) movdqa %xmm4,144(%esp) movdqa %xmm5,%xmm2 movdqa %xmm6,%xmm3 psrldq $6,%xmm2 psrldq $6,%xmm3 movdqa %xmm5,%xmm4 punpcklqdq %xmm3,%xmm2 punpckhqdq %xmm6,%xmm4 punpcklqdq %xmm6,%xmm5 movdqa %xmm2,%xmm3 psrlq $4,%xmm2 psrlq $30,%xmm3 movdqa %xmm5,%xmm6 psrlq $40,%xmm4 psrlq $26,%xmm6 pand %xmm7,%xmm5 pand %xmm7,%xmm6 pand %xmm7,%xmm2 pand %xmm7,%xmm3 por (%ebx),%xmm4 movdqa %xmm0,80(%esp) movdqa %xmm1,96(%esp) jbe .L014skip_loop jmp .L015loop .align 32 .L015loop: movdqa -144(%edx),%xmm7 movdqa %xmm6,16(%eax) movdqa %xmm2,32(%eax) movdqa %xmm3,48(%eax) movdqa %xmm4,64(%eax) movdqa %xmm5,%xmm1 pmuludq %xmm7,%xmm5 movdqa %xmm6,%xmm0 pmuludq %xmm7,%xmm6 pmuludq %xmm7,%xmm2 pmuludq %xmm7,%xmm3 pmuludq %xmm7,%xmm4 pmuludq -16(%edx),%xmm0 movdqa %xmm1,%xmm7 pmuludq -128(%edx),%xmm1 paddq %xmm5,%xmm0 movdqa %xmm7,%xmm5 pmuludq -112(%edx),%xmm7 paddq %xmm6,%xmm1 movdqa %xmm5,%xmm6 pmuludq -96(%edx),%xmm5 paddq %xmm7,%xmm2 movdqa 16(%eax),%xmm7 pmuludq -80(%edx),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq -128(%edx),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq -112(%edx),%xmm5 paddq %xmm7,%xmm2 movdqa 32(%eax),%xmm7 pmuludq -96(%edx),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq -32(%edx),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq -16(%edx),%xmm5 paddq %xmm7,%xmm0 movdqa %xmm6,%xmm7 pmuludq -128(%edx),%xmm6 paddq %xmm5,%xmm1 movdqa 48(%eax),%xmm5 pmuludq -112(%edx),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq -48(%edx),%xmm5 paddq %xmm7,%xmm4 movdqa %xmm6,%xmm7 pmuludq -32(%edx),%xmm6 paddq %xmm5,%xmm0 movdqa %xmm7,%xmm5 pmuludq -16(%edx),%xmm7 paddq %xmm6,%xmm1 movdqa 64(%eax),%xmm6 pmuludq -128(%edx),%xmm5 paddq %xmm7,%xmm2 movdqa %xmm6,%xmm7 pmuludq -16(%edx),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq -64(%edx),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq -48(%edx),%xmm5 paddq %xmm7,%xmm0 movdqa 64(%ebx),%xmm7 pmuludq -32(%edx),%xmm6 paddq %xmm5,%xmm1 paddq 
%xmm6,%xmm2 movdqu -32(%esi),%xmm5 movdqu -16(%esi),%xmm6 leal 32(%esi),%esi movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) movdqa %xmm4,64(%esp) movdqa %xmm5,%xmm2 movdqa %xmm6,%xmm3 psrldq $6,%xmm2 psrldq $6,%xmm3 movdqa %xmm5,%xmm4 punpcklqdq %xmm3,%xmm2 punpckhqdq %xmm6,%xmm4 punpcklqdq %xmm6,%xmm5 movdqa %xmm2,%xmm3 psrlq $4,%xmm2 psrlq $30,%xmm3 movdqa %xmm5,%xmm6 psrlq $40,%xmm4 psrlq $26,%xmm6 pand %xmm7,%xmm5 pand %xmm7,%xmm6 pand %xmm7,%xmm2 pand %xmm7,%xmm3 por (%ebx),%xmm4 leal -32(%esi),%eax subl $64,%ecx paddd 80(%esp),%xmm5 paddd 96(%esp),%xmm6 paddd 112(%esp),%xmm2 paddd 128(%esp),%xmm3 paddd 144(%esp),%xmm4 cmovbl %eax,%esi leal 160(%esp),%eax movdqa (%edx),%xmm7 movdqa %xmm1,16(%esp) movdqa %xmm6,16(%eax) movdqa %xmm2,32(%eax) movdqa %xmm3,48(%eax) movdqa %xmm4,64(%eax) movdqa %xmm5,%xmm1 pmuludq %xmm7,%xmm5 paddq %xmm0,%xmm5 movdqa %xmm6,%xmm0 pmuludq %xmm7,%xmm6 pmuludq %xmm7,%xmm2 pmuludq %xmm7,%xmm3 pmuludq %xmm7,%xmm4 paddq 16(%esp),%xmm6 paddq 32(%esp),%xmm2 paddq 48(%esp),%xmm3 paddq 64(%esp),%xmm4 pmuludq 128(%edx),%xmm0 movdqa %xmm1,%xmm7 pmuludq 16(%edx),%xmm1 paddq %xmm5,%xmm0 movdqa %xmm7,%xmm5 pmuludq 32(%edx),%xmm7 paddq %xmm6,%xmm1 movdqa %xmm5,%xmm6 pmuludq 48(%edx),%xmm5 paddq %xmm7,%xmm2 movdqa 16(%eax),%xmm7 pmuludq 64(%edx),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq 16(%edx),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 32(%edx),%xmm5 paddq %xmm7,%xmm2 movdqa 32(%eax),%xmm7 pmuludq 48(%edx),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq 112(%edx),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 128(%edx),%xmm5 paddq %xmm7,%xmm0 movdqa %xmm6,%xmm7 pmuludq 16(%edx),%xmm6 paddq %xmm5,%xmm1 movdqa 48(%eax),%xmm5 pmuludq 32(%edx),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq 96(%edx),%xmm5 paddq %xmm7,%xmm4 movdqa %xmm6,%xmm7 pmuludq 112(%edx),%xmm6 paddq %xmm5,%xmm0 movdqa %xmm7,%xmm5 pmuludq 128(%edx),%xmm7 paddq %xmm6,%xmm1 movdqa 64(%eax),%xmm6 pmuludq 16(%edx),%xmm5 paddq %xmm7,%xmm2 movdqa %xmm6,%xmm7 pmuludq 128(%edx),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 80(%edx),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq 96(%edx),%xmm5 paddq %xmm7,%xmm0 movdqa 64(%ebx),%xmm7 pmuludq 112(%edx),%xmm6 paddq %xmm5,%xmm1 paddq %xmm6,%xmm2 movdqa %xmm3,%xmm5 pand %xmm7,%xmm3 psrlq $26,%xmm5 paddq %xmm4,%xmm5 movdqa %xmm0,%xmm6 pand %xmm7,%xmm0 psrlq $26,%xmm6 movdqa %xmm5,%xmm4 paddq %xmm1,%xmm6 psrlq $26,%xmm5 pand %xmm7,%xmm4 movdqa %xmm6,%xmm1 psrlq $26,%xmm6 paddd %xmm5,%xmm0 psllq $2,%xmm5 paddq %xmm2,%xmm6 paddq %xmm0,%xmm5 pand %xmm7,%xmm1 movdqa %xmm6,%xmm2 psrlq $26,%xmm6 pand %xmm7,%xmm2 paddd %xmm3,%xmm6 movdqa %xmm5,%xmm0 psrlq $26,%xmm5 movdqa %xmm6,%xmm3 psrlq $26,%xmm6 pand %xmm7,%xmm0 paddd %xmm5,%xmm1 pand %xmm7,%xmm3 paddd %xmm6,%xmm4 movdqu 32(%esi),%xmm5 movdqu 48(%esi),%xmm6 leal 32(%esi),%esi movdqa %xmm2,112(%esp) movdqa %xmm3,128(%esp) movdqa %xmm4,144(%esp) movdqa %xmm5,%xmm2 movdqa %xmm6,%xmm3 psrldq $6,%xmm2 psrldq $6,%xmm3 movdqa %xmm5,%xmm4 punpcklqdq %xmm3,%xmm2 punpckhqdq %xmm6,%xmm4 punpcklqdq %xmm6,%xmm5 movdqa %xmm2,%xmm3 psrlq $4,%xmm2 psrlq $30,%xmm3 movdqa %xmm5,%xmm6 psrlq $40,%xmm4 psrlq $26,%xmm6 pand %xmm7,%xmm5 pand %xmm7,%xmm6 pand %xmm7,%xmm2 pand %xmm7,%xmm3 por (%ebx),%xmm4 movdqa %xmm0,80(%esp) movdqa %xmm1,96(%esp) ja .L015loop .L014skip_loop: pshufd $16,-144(%edx),%xmm7 addl $32,%ecx jnz .L016long_tail paddd %xmm0,%xmm5 paddd %xmm1,%xmm6 paddd 112(%esp),%xmm2 paddd 128(%esp),%xmm3 paddd 144(%esp),%xmm4 .L016long_tail: movdqa %xmm5,(%eax) movdqa %xmm6,16(%eax) movdqa %xmm2,32(%eax) 
movdqa %xmm3,48(%eax) movdqa %xmm4,64(%eax) pmuludq %xmm7,%xmm5 pmuludq %xmm7,%xmm6 pmuludq %xmm7,%xmm2 movdqa %xmm5,%xmm0 pshufd $16,-128(%edx),%xmm5 pmuludq %xmm7,%xmm3 movdqa %xmm6,%xmm1 pmuludq %xmm7,%xmm4 movdqa %xmm5,%xmm6 pmuludq 48(%eax),%xmm5 movdqa %xmm6,%xmm7 pmuludq 32(%eax),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%eax),%xmm7 paddq %xmm6,%xmm3 pshufd $16,-64(%edx),%xmm6 pmuludq (%eax),%xmm5 paddq %xmm7,%xmm2 pmuludq 64(%eax),%xmm6 pshufd $16,-112(%edx),%xmm7 paddq %xmm5,%xmm1 movdqa %xmm7,%xmm5 pmuludq 32(%eax),%xmm7 paddq %xmm6,%xmm0 movdqa %xmm5,%xmm6 pmuludq 16(%eax),%xmm5 paddq %xmm7,%xmm4 pshufd $16,-48(%edx),%xmm7 pmuludq (%eax),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq 64(%eax),%xmm7 paddq %xmm6,%xmm2 pmuludq 48(%eax),%xmm5 pshufd $16,-96(%edx),%xmm6 paddq %xmm7,%xmm1 movdqa %xmm6,%xmm7 pmuludq 16(%eax),%xmm6 paddq %xmm5,%xmm0 pshufd $16,-32(%edx),%xmm5 pmuludq (%eax),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 64(%eax),%xmm5 paddq %xmm7,%xmm3 movdqa %xmm6,%xmm7 pmuludq 48(%eax),%xmm6 paddq %xmm5,%xmm2 pmuludq 32(%eax),%xmm7 pshufd $16,-80(%edx),%xmm5 paddq %xmm6,%xmm1 pshufd $16,-16(%edx),%xmm6 pmuludq (%eax),%xmm5 paddq %xmm7,%xmm0 movdqa %xmm6,%xmm7 pmuludq 64(%eax),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%eax),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq 32(%eax),%xmm5 paddq %xmm7,%xmm0 pmuludq 48(%eax),%xmm6 movdqa 64(%ebx),%xmm7 paddq %xmm5,%xmm1 paddq %xmm6,%xmm2 jz .L017short_tail movdqu -32(%esi),%xmm5 movdqu -16(%esi),%xmm6 leal 32(%esi),%esi movdqa %xmm2,32(%esp) movdqa %xmm3,48(%esp) movdqa %xmm4,64(%esp) movdqa %xmm5,%xmm2 movdqa %xmm6,%xmm3 psrldq $6,%xmm2 psrldq $6,%xmm3 movdqa %xmm5,%xmm4 punpcklqdq %xmm3,%xmm2 punpckhqdq %xmm6,%xmm4 punpcklqdq %xmm6,%xmm5 movdqa %xmm2,%xmm3 psrlq $4,%xmm2 psrlq $30,%xmm3 movdqa %xmm5,%xmm6 psrlq $40,%xmm4 psrlq $26,%xmm6 pand %xmm7,%xmm5 pand %xmm7,%xmm6 pand %xmm7,%xmm2 pand %xmm7,%xmm3 por (%ebx),%xmm4 pshufd $16,(%edx),%xmm7 paddd 80(%esp),%xmm5 paddd 96(%esp),%xmm6 paddd 112(%esp),%xmm2 paddd 128(%esp),%xmm3 paddd 144(%esp),%xmm4 movdqa %xmm5,(%esp) pmuludq %xmm7,%xmm5 movdqa %xmm6,16(%esp) pmuludq %xmm7,%xmm6 paddq %xmm5,%xmm0 movdqa %xmm2,%xmm5 pmuludq %xmm7,%xmm2 paddq %xmm6,%xmm1 movdqa %xmm3,%xmm6 pmuludq %xmm7,%xmm3 paddq 32(%esp),%xmm2 movdqa %xmm5,32(%esp) pshufd $16,16(%edx),%xmm5 paddq 48(%esp),%xmm3 movdqa %xmm6,48(%esp) movdqa %xmm4,%xmm6 pmuludq %xmm7,%xmm4 paddq 64(%esp),%xmm4 movdqa %xmm6,64(%esp) movdqa %xmm5,%xmm6 pmuludq 48(%esp),%xmm5 movdqa %xmm6,%xmm7 pmuludq 32(%esp),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 16(%esp),%xmm7 paddq %xmm6,%xmm3 pshufd $16,80(%edx),%xmm6 pmuludq (%esp),%xmm5 paddq %xmm7,%xmm2 pmuludq 64(%esp),%xmm6 pshufd $16,32(%edx),%xmm7 paddq %xmm5,%xmm1 movdqa %xmm7,%xmm5 pmuludq 32(%esp),%xmm7 paddq %xmm6,%xmm0 movdqa %xmm5,%xmm6 pmuludq 16(%esp),%xmm5 paddq %xmm7,%xmm4 pshufd $16,96(%edx),%xmm7 pmuludq (%esp),%xmm6 paddq %xmm5,%xmm3 movdqa %xmm7,%xmm5 pmuludq 64(%esp),%xmm7 paddq %xmm6,%xmm2 pmuludq 48(%esp),%xmm5 pshufd $16,48(%edx),%xmm6 paddq %xmm7,%xmm1 movdqa %xmm6,%xmm7 pmuludq 16(%esp),%xmm6 paddq %xmm5,%xmm0 pshufd $16,112(%edx),%xmm5 pmuludq (%esp),%xmm7 paddq %xmm6,%xmm4 movdqa %xmm5,%xmm6 pmuludq 64(%esp),%xmm5 paddq %xmm7,%xmm3 movdqa %xmm6,%xmm7 pmuludq 48(%esp),%xmm6 paddq %xmm5,%xmm2 pmuludq 32(%esp),%xmm7 pshufd $16,64(%edx),%xmm5 paddq %xmm6,%xmm1 pshufd $16,128(%edx),%xmm6 pmuludq (%esp),%xmm5 paddq %xmm7,%xmm0 movdqa %xmm6,%xmm7 pmuludq 64(%esp),%xmm6 paddq %xmm5,%xmm4 movdqa %xmm7,%xmm5 pmuludq 
16(%esp),%xmm7 paddq %xmm6,%xmm3 movdqa %xmm5,%xmm6 pmuludq 32(%esp),%xmm5 paddq %xmm7,%xmm0 pmuludq 48(%esp),%xmm6 movdqa 64(%ebx),%xmm7 paddq %xmm5,%xmm1 paddq %xmm6,%xmm2 .L017short_tail: pshufd $78,%xmm4,%xmm6 pshufd $78,%xmm3,%xmm5 paddq %xmm6,%xmm4 paddq %xmm5,%xmm3 pshufd $78,%xmm0,%xmm6 pshufd $78,%xmm1,%xmm5 paddq %xmm6,%xmm0 paddq %xmm5,%xmm1 pshufd $78,%xmm2,%xmm6 movdqa %xmm3,%xmm5 pand %xmm7,%xmm3 psrlq $26,%xmm5 paddq %xmm6,%xmm2 paddq %xmm4,%xmm5 movdqa %xmm0,%xmm6 pand %xmm7,%xmm0 psrlq $26,%xmm6 movdqa %xmm5,%xmm4 paddq %xmm1,%xmm6 psrlq $26,%xmm5 pand %xmm7,%xmm4 movdqa %xmm6,%xmm1 psrlq $26,%xmm6 paddd %xmm5,%xmm0 psllq $2,%xmm5 paddq %xmm2,%xmm6 paddq %xmm0,%xmm5 pand %xmm7,%xmm1 movdqa %xmm6,%xmm2 psrlq $26,%xmm6 pand %xmm7,%xmm2 paddd %xmm3,%xmm6 movdqa %xmm5,%xmm0 psrlq $26,%xmm5 movdqa %xmm6,%xmm3 psrlq $26,%xmm6 pand %xmm7,%xmm0 paddd %xmm5,%xmm1 pand %xmm7,%xmm3 paddd %xmm6,%xmm4 .L013done: movd %xmm0,-48(%edi) movd %xmm1,-44(%edi) movd %xmm2,-40(%edi) movd %xmm3,-36(%edi) movd %xmm4,-32(%edi) movl %ebp,%esp .L007nodata: popl %edi popl %esi popl %ebx popl %ebp ret .size _poly1305_blocks_sse2,.-_poly1305_blocks_sse2 .align 32 .type _poly1305_emit_sse2,@function .align 16 _poly1305_emit_sse2: pushl %ebp pushl %ebx pushl %esi pushl %edi movl 20(%esp),%ebp cmpl $0,20(%ebp) je .Lenter_emit movl (%ebp),%eax movl 4(%ebp),%edi movl 8(%ebp),%ecx movl 12(%ebp),%edx movl 16(%ebp),%esi movl %edi,%ebx shll $26,%edi shrl $6,%ebx addl %edi,%eax movl %ecx,%edi adcl $0,%ebx shll $20,%edi shrl $12,%ecx addl %edi,%ebx movl %edx,%edi adcl $0,%ecx shll $14,%edi shrl $18,%edx addl %edi,%ecx movl %esi,%edi adcl $0,%edx shll $8,%edi shrl $24,%esi addl %edi,%edx adcl $0,%esi movl %esi,%edi andl $3,%esi shrl $2,%edi leal (%edi,%edi,4),%ebp movl 24(%esp),%edi addl %ebp,%eax movl 28(%esp),%ebp adcl $0,%ebx adcl $0,%ecx adcl $0,%edx adcl $0,%esi movd %eax,%xmm0 addl $5,%eax movd %ebx,%xmm1 adcl $0,%ebx movd %ecx,%xmm2 adcl $0,%ecx movd %edx,%xmm3 adcl $0,%edx adcl $0,%esi shrl $2,%esi negl %esi andl %esi,%eax andl %esi,%ebx andl %esi,%ecx andl %esi,%edx movl %eax,(%edi) movd %xmm0,%eax movl %ebx,4(%edi) movd %xmm1,%ebx movl %ecx,8(%edi) movd %xmm2,%ecx movl %edx,12(%edi) movd %xmm3,%edx notl %esi andl %esi,%eax andl %esi,%ebx orl (%edi),%eax andl %esi,%ecx orl 4(%edi),%ebx andl %esi,%edx orl 8(%edi),%ecx orl 12(%edi),%edx addl (%ebp),%eax adcl 4(%ebp),%ebx movl %eax,(%edi) adcl 8(%ebp),%ecx movl %ebx,4(%edi) adcl 12(%ebp),%edx movl %ecx,8(%edi) movl %edx,12(%edi) popl %edi popl %esi popl %ebx popl %ebp ret .size _poly1305_emit_sse2,.-_poly1305_emit_sse2 +.align 32 +.type _poly1305_init_avx2,@function +.align 16 +_poly1305_init_avx2: + vmovdqu 24(%edi),%xmm4 + leal 48(%edi),%edi + movl %esp,%ebp + subl $224,%esp + andl $-16,%esp + vmovdqa 64(%ebx),%xmm7 + vpand %xmm7,%xmm4,%xmm0 + vpsrlq $26,%xmm4,%xmm1 + vpsrldq $6,%xmm4,%xmm3 + vpand %xmm7,%xmm1,%xmm1 + vpsrlq $4,%xmm3,%xmm2 + vpsrlq $30,%xmm3,%xmm3 + vpand %xmm7,%xmm2,%xmm2 + vpand %xmm7,%xmm3,%xmm3 + vpsrldq $13,%xmm4,%xmm4 + leal 144(%esp),%edx + movl $2,%ecx +.L018square: + vmovdqa %xmm0,(%esp) + vmovdqa %xmm1,16(%esp) + vmovdqa %xmm2,32(%esp) + vmovdqa %xmm3,48(%esp) + vmovdqa %xmm4,64(%esp) + vpslld $2,%xmm1,%xmm6 + vpslld $2,%xmm2,%xmm5 + vpaddd %xmm1,%xmm6,%xmm6 + vpaddd %xmm2,%xmm5,%xmm5 + vmovdqa %xmm6,80(%esp) + vmovdqa %xmm5,96(%esp) + vpslld $2,%xmm3,%xmm6 + vpslld $2,%xmm4,%xmm5 + vpaddd %xmm3,%xmm6,%xmm6 + vpaddd %xmm4,%xmm5,%xmm5 + vmovdqa %xmm6,112(%esp) + vmovdqa %xmm5,128(%esp) + vpshufd $68,%xmm0,%xmm5 + vmovdqa 
%xmm1,%xmm6 + vpshufd $68,%xmm1,%xmm1 + vpshufd $68,%xmm2,%xmm2 + vpshufd $68,%xmm3,%xmm3 + vpshufd $68,%xmm4,%xmm4 + vmovdqa %xmm5,(%edx) + vmovdqa %xmm1,16(%edx) + vmovdqa %xmm2,32(%edx) + vmovdqa %xmm3,48(%edx) + vmovdqa %xmm4,64(%edx) + vpmuludq %xmm0,%xmm4,%xmm4 + vpmuludq %xmm0,%xmm3,%xmm3 + vpmuludq %xmm0,%xmm2,%xmm2 + vpmuludq %xmm0,%xmm1,%xmm1 + vpmuludq %xmm0,%xmm5,%xmm0 + vpmuludq 48(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm4,%xmm4 + vpmuludq 32(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vpmuludq 16(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vmovdqa 80(%esp),%xmm7 + vpmuludq (%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm1,%xmm1 + vmovdqa 32(%esp),%xmm5 + vpmuludq 64(%edx),%xmm7,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 32(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm4,%xmm4 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm3,%xmm3 + vmovdqa 96(%esp),%xmm6 + vpmuludq (%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vpmuludq 64(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm1,%xmm1 + vmovdqa 48(%esp),%xmm5 + vpmuludq 48(%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm0,%xmm0 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm4,%xmm4 + vmovdqa 112(%esp),%xmm6 + vpmuludq (%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm3,%xmm3 + vpmuludq 64(%edx),%xmm6,%xmm7 + vpaddq %xmm7,%xmm2,%xmm2 + vpmuludq 48(%edx),%xmm6,%xmm5 + vpaddq %xmm5,%xmm1,%xmm1 + vmovdqa 64(%esp),%xmm7 + vpmuludq 32(%edx),%xmm6,%xmm6 + vpaddq %xmm6,%xmm0,%xmm0 + vmovdqa 128(%esp),%xmm5 + vpmuludq (%edx),%xmm7,%xmm7 + vpaddq %xmm7,%xmm4,%xmm4 + vpmuludq 64(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm3,%xmm3 + vpmuludq 16(%edx),%xmm5,%xmm7 + vpaddq %xmm7,%xmm0,%xmm0 + vpmuludq 32(%edx),%xmm5,%xmm6 + vpaddq %xmm6,%xmm1,%xmm1 + vmovdqa 64(%ebx),%xmm7 + vpmuludq 48(%edx),%xmm5,%xmm5 + vpaddq %xmm5,%xmm2,%xmm2 + vpsrlq $26,%xmm3,%xmm5 + vpand %xmm7,%xmm3,%xmm3 + vpsrlq $26,%xmm0,%xmm6 + vpand %xmm7,%xmm0,%xmm0 + vpaddq %xmm5,%xmm4,%xmm4 + vpaddq %xmm6,%xmm1,%xmm1 + vpsrlq $26,%xmm4,%xmm5 + vpand %xmm7,%xmm4,%xmm4 + vpsrlq $26,%xmm1,%xmm6 + vpand %xmm7,%xmm1,%xmm1 + vpaddq %xmm6,%xmm2,%xmm2 + vpaddd %xmm5,%xmm0,%xmm0 + vpsllq $2,%xmm5,%xmm5 + vpsrlq $26,%xmm2,%xmm6 + vpand %xmm7,%xmm2,%xmm2 + vpaddd %xmm5,%xmm0,%xmm0 + vpaddd %xmm6,%xmm3,%xmm3 + vpsrlq $26,%xmm3,%xmm6 + vpsrlq $26,%xmm0,%xmm5 + vpand %xmm7,%xmm0,%xmm0 + vpand %xmm7,%xmm3,%xmm3 + vpaddd %xmm5,%xmm1,%xmm1 + vpaddd %xmm6,%xmm4,%xmm4 + decl %ecx + jz .L019square_break + vpunpcklqdq (%esp),%xmm0,%xmm0 + vpunpcklqdq 16(%esp),%xmm1,%xmm1 + vpunpcklqdq 32(%esp),%xmm2,%xmm2 + vpunpcklqdq 48(%esp),%xmm3,%xmm3 + vpunpcklqdq 64(%esp),%xmm4,%xmm4 + jmp .L018square +.L019square_break: + vpsllq $32,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm1 + vpsllq $32,%xmm2,%xmm2 + vpsllq $32,%xmm3,%xmm3 + vpsllq $32,%xmm4,%xmm4 + vpor (%esp),%xmm0,%xmm0 + vpor 16(%esp),%xmm1,%xmm1 + vpor 32(%esp),%xmm2,%xmm2 + vpor 48(%esp),%xmm3,%xmm3 + vpor 64(%esp),%xmm4,%xmm4 + vpshufd $141,%xmm0,%xmm0 + vpshufd $141,%xmm1,%xmm1 + vpshufd $141,%xmm2,%xmm2 + vpshufd $141,%xmm3,%xmm3 + vpshufd $141,%xmm4,%xmm4 + vmovdqu %xmm0,(%edi) + vmovdqu %xmm1,16(%edi) + vmovdqu %xmm2,32(%edi) + vmovdqu %xmm3,48(%edi) + vmovdqu %xmm4,64(%edi) + vpslld $2,%xmm1,%xmm6 + vpslld $2,%xmm2,%xmm5 + vpaddd %xmm1,%xmm6,%xmm6 + vpaddd %xmm2,%xmm5,%xmm5 + vmovdqu %xmm6,80(%edi) + vmovdqu %xmm5,96(%edi) + vpslld $2,%xmm3,%xmm6 + vpslld $2,%xmm4,%xmm5 + vpaddd %xmm3,%xmm6,%xmm6 + vpaddd %xmm4,%xmm5,%xmm5 + vmovdqu %xmm6,112(%edi) + vmovdqu %xmm5,128(%edi) + movl %ebp,%esp + leal -48(%edi),%edi + ret +.size _poly1305_init_avx2,.-_poly1305_init_avx2 +.align 32 +.type 
_poly1305_blocks_avx2,@function +.align 16 +_poly1305_blocks_avx2: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 20(%edi),%eax + andl $-16,%ecx + jz .L020nodata + cmpl $64,%ecx + jae .L021enter_avx2 + testl %eax,%eax + jz .Lenter_blocks +.L021enter_avx2: + vzeroupper + call .L022pic_point +.L022pic_point: + popl %ebx + leal .Lconst_sse2-.L022pic_point(%ebx),%ebx + testl %eax,%eax + jnz .L023base2_26 + call _poly1305_init_avx2 + movl (%edi),%eax + movl 3(%edi),%ecx + movl 6(%edi),%edx + movl 9(%edi),%esi + movl 13(%edi),%ebp + shrl $2,%ecx + andl $67108863,%eax + shrl $4,%edx + andl $67108863,%ecx + shrl $6,%esi + andl $67108863,%edx + movl %eax,(%edi) + movl %ecx,4(%edi) + movl %edx,8(%edi) + movl %esi,12(%edi) + movl %ebp,16(%edi) + movl $1,20(%edi) + movl 24(%esp),%esi + movl 28(%esp),%ecx +.L023base2_26: + movl 32(%esp),%eax + movl %esp,%ebp + subl $448,%esp + andl $-512,%esp + vmovdqu 48(%edi),%xmm0 + leal 288(%esp),%edx + vmovdqu 64(%edi),%xmm1 + vmovdqu 80(%edi),%xmm2 + vmovdqu 96(%edi),%xmm3 + vmovdqu 112(%edi),%xmm4 + leal 48(%edi),%edi + vpermq $64,%ymm0,%ymm0 + vpermq $64,%ymm1,%ymm1 + vpermq $64,%ymm2,%ymm2 + vpermq $64,%ymm3,%ymm3 + vpermq $64,%ymm4,%ymm4 + vpshufd $200,%ymm0,%ymm0 + vpshufd $200,%ymm1,%ymm1 + vpshufd $200,%ymm2,%ymm2 + vpshufd $200,%ymm3,%ymm3 + vpshufd $200,%ymm4,%ymm4 + vmovdqa %ymm0,-128(%edx) + vmovdqu 80(%edi),%xmm0 + vmovdqa %ymm1,-96(%edx) + vmovdqu 96(%edi),%xmm1 + vmovdqa %ymm2,-64(%edx) + vmovdqu 112(%edi),%xmm2 + vmovdqa %ymm3,-32(%edx) + vmovdqu 128(%edi),%xmm3 + vmovdqa %ymm4,(%edx) + vpermq $64,%ymm0,%ymm0 + vpermq $64,%ymm1,%ymm1 + vpermq $64,%ymm2,%ymm2 + vpermq $64,%ymm3,%ymm3 + vpshufd $200,%ymm0,%ymm0 + vpshufd $200,%ymm1,%ymm1 + vpshufd $200,%ymm2,%ymm2 + vpshufd $200,%ymm3,%ymm3 + vmovdqa %ymm0,32(%edx) + vmovd -48(%edi),%xmm0 + vmovdqa %ymm1,64(%edx) + vmovd -44(%edi),%xmm1 + vmovdqa %ymm2,96(%edx) + vmovd -40(%edi),%xmm2 + vmovdqa %ymm3,128(%edx) + vmovd -36(%edi),%xmm3 + vmovd -32(%edi),%xmm4 + vmovdqa 64(%ebx),%ymm7 + negl %eax + testl $63,%ecx + jz .L024even + movl %ecx,%edx + andl $-64,%ecx + andl $63,%edx + vmovdqu (%esi),%xmm5 + cmpl $32,%edx + jb .L025one + vmovdqu 16(%esi),%xmm6 + je .L026two + vinserti128 $1,32(%esi),%ymm5,%ymm5 + leal 48(%esi),%esi + leal 8(%ebx),%ebx + leal 296(%esp),%edx + jmp .L027tail +.L026two: + leal 32(%esi),%esi + leal 16(%ebx),%ebx + leal 304(%esp),%edx + jmp .L027tail +.L025one: + leal 16(%esi),%esi + vpxor %ymm6,%ymm6,%ymm6 + leal 32(%ebx,%eax,8),%ebx + leal 312(%esp),%edx + jmp .L027tail +.align 32 +.L024even: + vmovdqu (%esi),%xmm5 + vmovdqu 16(%esi),%xmm6 + vinserti128 $1,32(%esi),%ymm5,%ymm5 + vinserti128 $1,48(%esi),%ymm6,%ymm6 + leal 64(%esi),%esi + subl $64,%ecx + jz .L027tail +.L028loop: + vmovdqa %ymm2,64(%esp) + vpsrldq $6,%ymm5,%ymm2 + vmovdqa %ymm0,(%esp) + vpsrldq $6,%ymm6,%ymm0 + vmovdqa %ymm1,32(%esp) + vpunpckhqdq %ymm6,%ymm5,%ymm1 + vpunpcklqdq %ymm6,%ymm5,%ymm5 + vpunpcklqdq %ymm0,%ymm2,%ymm2 + vpsrlq $30,%ymm2,%ymm0 + vpsrlq $4,%ymm2,%ymm2 + vpsrlq $26,%ymm5,%ymm6 + vpsrlq $40,%ymm1,%ymm1 + vpand %ymm7,%ymm2,%ymm2 + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpor (%ebx),%ymm1,%ymm1 + vpaddq 64(%esp),%ymm2,%ymm2 + vpaddq (%esp),%ymm5,%ymm5 + vpaddq 32(%esp),%ymm6,%ymm6 + vpaddq %ymm3,%ymm0,%ymm0 + vpaddq %ymm4,%ymm1,%ymm1 + vpmuludq -96(%edx),%ymm2,%ymm3 + vmovdqa %ymm6,32(%esp) + vpmuludq -64(%edx),%ymm2,%ymm4 + vmovdqa %ymm0,96(%esp) + vpmuludq 96(%edx),%ymm2,%ymm0 + 
vmovdqa %ymm1,128(%esp) + vpmuludq 128(%edx),%ymm2,%ymm1 + vpmuludq -128(%edx),%ymm2,%ymm2 + vpmuludq -32(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq (%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm4,%ymm4 + vpmuludq -128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm0,%ymm0 + vmovdqa 32(%esp),%ymm7 + vpmuludq -96(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq -64(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpmuludq -64(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm3,%ymm3 + vpmuludq -32(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm4,%ymm4 + vpmuludq 128(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vmovdqa 96(%esp),%ymm6 + vpmuludq -128(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm1,%ymm1 + vpmuludq -96(%edx),%ymm7,%ymm7 + vpaddq %ymm7,%ymm2,%ymm2 + vpmuludq -128(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm3,%ymm3 + vpmuludq -96(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vpmuludq 64(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm0,%ymm0 + vmovdqa 128(%esp),%ymm5 + vpmuludq 96(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm1,%ymm1 + vpmuludq 128(%edx),%ymm6,%ymm6 + vpaddq %ymm6,%ymm2,%ymm2 + vpmuludq 128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 32(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vpmuludq -128(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vmovdqa 64(%ebx),%ymm7 + vpmuludq 64(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq 96(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpsrlq $26,%ymm3,%ymm5 + vpand %ymm7,%ymm3,%ymm3 + vpsrlq $26,%ymm0,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpaddq %ymm5,%ymm4,%ymm4 + vpaddq %ymm6,%ymm1,%ymm1 + vpsrlq $26,%ymm4,%ymm5 + vpand %ymm7,%ymm4,%ymm4 + vpsrlq $26,%ymm1,%ymm6 + vpand %ymm7,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpsllq $2,%ymm5,%ymm5 + vpsrlq $26,%ymm2,%ymm6 + vpand %ymm7,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrlq $26,%ymm3,%ymm6 + vpsrlq $26,%ymm0,%ymm5 + vpand %ymm7,%ymm0,%ymm0 + vpand %ymm7,%ymm3,%ymm3 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm4,%ymm4 + vmovdqu (%esi),%xmm5 + vmovdqu 16(%esi),%xmm6 + vinserti128 $1,32(%esi),%ymm5,%ymm5 + vinserti128 $1,48(%esi),%ymm6,%ymm6 + leal 64(%esi),%esi + subl $64,%ecx + jnz .L028loop +.L027tail: + vmovdqa %ymm2,64(%esp) + vpsrldq $6,%ymm5,%ymm2 + vmovdqa %ymm0,(%esp) + vpsrldq $6,%ymm6,%ymm0 + vmovdqa %ymm1,32(%esp) + vpunpckhqdq %ymm6,%ymm5,%ymm1 + vpunpcklqdq %ymm6,%ymm5,%ymm5 + vpunpcklqdq %ymm0,%ymm2,%ymm2 + vpsrlq $30,%ymm2,%ymm0 + vpsrlq $4,%ymm2,%ymm2 + vpsrlq $26,%ymm5,%ymm6 + vpsrlq $40,%ymm1,%ymm1 + vpand %ymm7,%ymm2,%ymm2 + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpor (%ebx),%ymm1,%ymm1 + andl $-64,%ebx + vpaddq 64(%esp),%ymm2,%ymm2 + vpaddq (%esp),%ymm5,%ymm5 + vpaddq 32(%esp),%ymm6,%ymm6 + vpaddq %ymm3,%ymm0,%ymm0 + vpaddq %ymm4,%ymm1,%ymm1 + vpmuludq -92(%edx),%ymm2,%ymm3 + vmovdqa %ymm6,32(%esp) + vpmuludq -60(%edx),%ymm2,%ymm4 + vmovdqa %ymm0,96(%esp) + vpmuludq 100(%edx),%ymm2,%ymm0 + vmovdqa %ymm1,128(%esp) + vpmuludq 132(%edx),%ymm2,%ymm1 + vpmuludq -124(%edx),%ymm2,%ymm2 + vpmuludq -28(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 4(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm4,%ymm4 + vpmuludq -124(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm0,%ymm0 + vmovdqa 32(%esp),%ymm7 + vpmuludq -92(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq -60(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpmuludq -60(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm3,%ymm3 + vpmuludq -28(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm4,%ymm4 + vpmuludq 132(%edx),%ymm7,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vmovdqa 96(%esp),%ymm6 + 
vpmuludq -124(%edx),%ymm7,%ymm5 + vpaddq %ymm5,%ymm1,%ymm1 + vpmuludq -92(%edx),%ymm7,%ymm7 + vpaddq %ymm7,%ymm2,%ymm2 + vpmuludq -124(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm3,%ymm3 + vpmuludq -92(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vpmuludq 68(%edx),%ymm6,%ymm5 + vpaddq %ymm5,%ymm0,%ymm0 + vmovdqa 128(%esp),%ymm5 + vpmuludq 100(%edx),%ymm6,%ymm7 + vpaddq %ymm7,%ymm1,%ymm1 + vpmuludq 132(%edx),%ymm6,%ymm6 + vpaddq %ymm6,%ymm2,%ymm2 + vpmuludq 132(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm3,%ymm3 + vpmuludq 36(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm0,%ymm0 + vpmuludq -124(%edx),%ymm5,%ymm7 + vpaddq %ymm7,%ymm4,%ymm4 + vmovdqa 64(%ebx),%ymm7 + vpmuludq 68(%edx),%ymm5,%ymm6 + vpaddq %ymm6,%ymm1,%ymm1 + vpmuludq 100(%edx),%ymm5,%ymm5 + vpaddq %ymm5,%ymm2,%ymm2 + vpsrldq $8,%ymm4,%ymm5 + vpsrldq $8,%ymm3,%ymm6 + vpaddq %ymm5,%ymm4,%ymm4 + vpsrldq $8,%ymm0,%ymm5 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrldq $8,%ymm1,%ymm6 + vpaddq %ymm5,%ymm0,%ymm0 + vpsrldq $8,%ymm2,%ymm5 + vpaddq %ymm6,%ymm1,%ymm1 + vpermq $2,%ymm4,%ymm6 + vpaddq %ymm5,%ymm2,%ymm2 + vpermq $2,%ymm3,%ymm5 + vpaddq %ymm6,%ymm4,%ymm4 + vpermq $2,%ymm0,%ymm6 + vpaddq %ymm5,%ymm3,%ymm3 + vpermq $2,%ymm1,%ymm5 + vpaddq %ymm6,%ymm0,%ymm0 + vpermq $2,%ymm2,%ymm6 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpsrlq $26,%ymm3,%ymm5 + vpand %ymm7,%ymm3,%ymm3 + vpsrlq $26,%ymm0,%ymm6 + vpand %ymm7,%ymm0,%ymm0 + vpaddq %ymm5,%ymm4,%ymm4 + vpaddq %ymm6,%ymm1,%ymm1 + vpsrlq $26,%ymm4,%ymm5 + vpand %ymm7,%ymm4,%ymm4 + vpsrlq $26,%ymm1,%ymm6 + vpand %ymm7,%ymm1,%ymm1 + vpaddq %ymm6,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpsllq $2,%ymm5,%ymm5 + vpsrlq $26,%ymm2,%ymm6 + vpand %ymm7,%ymm2,%ymm2 + vpaddq %ymm5,%ymm0,%ymm0 + vpaddq %ymm6,%ymm3,%ymm3 + vpsrlq $26,%ymm3,%ymm6 + vpsrlq $26,%ymm0,%ymm5 + vpand %ymm7,%ymm0,%ymm0 + vpand %ymm7,%ymm3,%ymm3 + vpaddq %ymm5,%ymm1,%ymm1 + vpaddq %ymm6,%ymm4,%ymm4 + cmpl $0,%ecx + je .L029done + vpshufd $252,%xmm0,%xmm0 + leal 288(%esp),%edx + vpshufd $252,%xmm1,%xmm1 + vpshufd $252,%xmm2,%xmm2 + vpshufd $252,%xmm3,%xmm3 + vpshufd $252,%xmm4,%xmm4 + jmp .L024even +.align 16 +.L029done: + vmovd %xmm0,-48(%edi) + vmovd %xmm1,-44(%edi) + vmovd %xmm2,-40(%edi) + vmovd %xmm3,-36(%edi) + vmovd %xmm4,-32(%edi) + vzeroupper + movl %ebp,%esp +.L020nodata: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _poly1305_blocks_avx2,.-_poly1305_blocks_avx2 .align 64 .Lconst_sse2: .long 16777216,0,16777216,0,16777216,0,16777216,0 .long 0,0,0,0,0,0,0,0 .long 67108863,0,67108863,0,67108863,0,67108863,0 .long 268435455,268435452,268435452,268435452 .byte 80,111,108,121,49,51,48,53,32,102,111,114,32,120,56,54 .byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 .byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 .byte 114,103,62,0 .align 4 .comm OPENSSL_ia32cap_P,16,4 #endif Index: stable/12/secure/lib/libcrypto/i386/sha1-586.S =================================================================== --- stable/12/secure/lib/libcrypto/i386/sha1-586.S (revision 364962) +++ stable/12/secure/lib/libcrypto/i386/sha1-586.S (revision 364963) @@ -1,5593 +1,7943 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from sha1-586.pl. 
*/ #ifdef PIC .text .globl sha1_block_data_order .type sha1_block_data_order,@function .align 16 sha1_block_data_order: .L_sha1_block_data_order_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi call .L000pic_point .L000pic_point: popl %ebp leal OPENSSL_ia32cap_P-.L000pic_point(%ebp),%esi leal .LK_XX_XX-.L000pic_point(%ebp),%ebp movl (%esi),%eax movl 4(%esi),%edx testl $512,%edx jz .L001x86 movl 8(%esi),%ecx testl $16777216,%eax jz .L001x86 testl $536870912,%ecx jnz .Lshaext_shortcut + andl $268435456,%edx + andl $1073741824,%eax + orl %edx,%eax + cmpl $1342177280,%eax + je .Lavx_shortcut jmp .Lssse3_shortcut .align 16 .L001x86: movl 20(%esp),%ebp movl 24(%esp),%esi movl 28(%esp),%eax subl $76,%esp shll $6,%eax addl %esi,%eax movl %eax,104(%esp) movl 16(%ebp),%edi jmp .L002loop .align 16 .L002loop: movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edx bswap %eax bswap %ebx bswap %ecx bswap %edx movl %eax,(%esp) movl %ebx,4(%esp) movl %ecx,8(%esp) movl %edx,12(%esp) movl 16(%esi),%eax movl 20(%esi),%ebx movl 24(%esi),%ecx movl 28(%esi),%edx bswap %eax bswap %ebx bswap %ecx bswap %edx movl %eax,16(%esp) movl %ebx,20(%esp) movl %ecx,24(%esp) movl %edx,28(%esp) movl 32(%esi),%eax movl 36(%esi),%ebx movl 40(%esi),%ecx movl 44(%esi),%edx bswap %eax bswap %ebx bswap %ecx bswap %edx movl %eax,32(%esp) movl %ebx,36(%esp) movl %ecx,40(%esp) movl %edx,44(%esp) movl 48(%esi),%eax movl 52(%esi),%ebx movl 56(%esi),%ecx movl 60(%esi),%edx bswap %eax bswap %ebx bswap %ecx bswap %edx movl %eax,48(%esp) movl %ebx,52(%esp) movl %ecx,56(%esp) movl %edx,60(%esp) movl %esi,100(%esp) movl (%ebp),%eax movl 4(%ebp),%ebx movl 8(%ebp),%ecx movl 12(%ebp),%edx movl %ecx,%esi movl %eax,%ebp roll $5,%ebp xorl %edx,%esi addl %edi,%ebp movl (%esp),%edi andl %ebx,%esi rorl $2,%ebx xorl %edx,%esi leal 1518500249(%ebp,%edi,1),%ebp addl %esi,%ebp movl %ebx,%edi movl %ebp,%esi roll $5,%ebp xorl %ecx,%edi addl %edx,%ebp movl 4(%esp),%edx andl %eax,%edi rorl $2,%eax xorl %ecx,%edi leal 1518500249(%ebp,%edx,1),%ebp addl %edi,%ebp movl %eax,%edx movl %ebp,%edi roll $5,%ebp xorl %ebx,%edx addl %ecx,%ebp movl 8(%esp),%ecx andl %esi,%edx rorl $2,%esi xorl %ebx,%edx leal 1518500249(%ebp,%ecx,1),%ebp addl %edx,%ebp movl %esi,%ecx movl %ebp,%edx roll $5,%ebp xorl %eax,%ecx addl %ebx,%ebp movl 12(%esp),%ebx andl %edi,%ecx rorl $2,%edi xorl %eax,%ecx leal 1518500249(%ebp,%ebx,1),%ebp addl %ecx,%ebp movl %edi,%ebx movl %ebp,%ecx roll $5,%ebp xorl %esi,%ebx addl %eax,%ebp movl 16(%esp),%eax andl %edx,%ebx rorl $2,%edx xorl %esi,%ebx leal 1518500249(%ebp,%eax,1),%ebp addl %ebx,%ebp movl %edx,%eax movl %ebp,%ebx roll $5,%ebp xorl %edi,%eax addl %esi,%ebp movl 20(%esp),%esi andl %ecx,%eax rorl $2,%ecx xorl %edi,%eax leal 1518500249(%ebp,%esi,1),%ebp addl %eax,%ebp movl %ecx,%esi movl %ebp,%eax roll $5,%ebp xorl %edx,%esi addl %edi,%ebp movl 24(%esp),%edi andl %ebx,%esi rorl $2,%ebx xorl %edx,%esi leal 1518500249(%ebp,%edi,1),%ebp addl %esi,%ebp movl %ebx,%edi movl %ebp,%esi roll $5,%ebp xorl %ecx,%edi addl %edx,%ebp movl 28(%esp),%edx andl %eax,%edi rorl $2,%eax xorl %ecx,%edi leal 1518500249(%ebp,%edx,1),%ebp addl %edi,%ebp movl %eax,%edx movl %ebp,%edi roll $5,%ebp xorl %ebx,%edx addl %ecx,%ebp movl 32(%esp),%ecx andl %esi,%edx rorl $2,%esi xorl %ebx,%edx leal 1518500249(%ebp,%ecx,1),%ebp addl %edx,%ebp movl %esi,%ecx movl %ebp,%edx roll $5,%ebp xorl %eax,%ecx addl %ebx,%ebp movl 36(%esp),%ebx andl %edi,%ecx rorl $2,%edi xorl %eax,%ecx leal 1518500249(%ebp,%ebx,1),%ebp addl %ecx,%ebp movl %edi,%ebx movl %ebp,%ecx roll 
$5,%ebp xorl %esi,%ebx addl %eax,%ebp movl 40(%esp),%eax andl %edx,%ebx rorl $2,%edx xorl %esi,%ebx leal 1518500249(%ebp,%eax,1),%ebp addl %ebx,%ebp movl %edx,%eax movl %ebp,%ebx roll $5,%ebp xorl %edi,%eax addl %esi,%ebp movl 44(%esp),%esi andl %ecx,%eax rorl $2,%ecx xorl %edi,%eax leal 1518500249(%ebp,%esi,1),%ebp addl %eax,%ebp movl %ecx,%esi movl %ebp,%eax roll $5,%ebp xorl %edx,%esi addl %edi,%ebp movl 48(%esp),%edi andl %ebx,%esi rorl $2,%ebx xorl %edx,%esi leal 1518500249(%ebp,%edi,1),%ebp addl %esi,%ebp movl %ebx,%edi movl %ebp,%esi roll $5,%ebp xorl %ecx,%edi addl %edx,%ebp movl 52(%esp),%edx andl %eax,%edi rorl $2,%eax xorl %ecx,%edi leal 1518500249(%ebp,%edx,1),%ebp addl %edi,%ebp movl %eax,%edx movl %ebp,%edi roll $5,%ebp xorl %ebx,%edx addl %ecx,%ebp movl 56(%esp),%ecx andl %esi,%edx rorl $2,%esi xorl %ebx,%edx leal 1518500249(%ebp,%ecx,1),%ebp addl %edx,%ebp movl %esi,%ecx movl %ebp,%edx roll $5,%ebp xorl %eax,%ecx addl %ebx,%ebp movl 60(%esp),%ebx andl %edi,%ecx rorl $2,%edi xorl %eax,%ecx leal 1518500249(%ebp,%ebx,1),%ebp movl (%esp),%ebx addl %ebp,%ecx movl %edi,%ebp xorl 8(%esp),%ebx xorl %esi,%ebp xorl 32(%esp),%ebx andl %edx,%ebp xorl 52(%esp),%ebx roll $1,%ebx xorl %esi,%ebp addl %ebp,%eax movl %ecx,%ebp rorl $2,%edx movl %ebx,(%esp) roll $5,%ebp leal 1518500249(%ebx,%eax,1),%ebx movl 4(%esp),%eax addl %ebp,%ebx movl %edx,%ebp xorl 12(%esp),%eax xorl %edi,%ebp xorl 36(%esp),%eax andl %ecx,%ebp xorl 56(%esp),%eax roll $1,%eax xorl %edi,%ebp addl %ebp,%esi movl %ebx,%ebp rorl $2,%ecx movl %eax,4(%esp) roll $5,%ebp leal 1518500249(%eax,%esi,1),%eax movl 8(%esp),%esi addl %ebp,%eax movl %ecx,%ebp xorl 16(%esp),%esi xorl %edx,%ebp xorl 40(%esp),%esi andl %ebx,%ebp xorl 60(%esp),%esi roll $1,%esi xorl %edx,%ebp addl %ebp,%edi movl %eax,%ebp rorl $2,%ebx movl %esi,8(%esp) roll $5,%ebp leal 1518500249(%esi,%edi,1),%esi movl 12(%esp),%edi addl %ebp,%esi movl %ebx,%ebp xorl 20(%esp),%edi xorl %ecx,%ebp xorl 44(%esp),%edi andl %eax,%ebp xorl (%esp),%edi roll $1,%edi xorl %ecx,%ebp addl %ebp,%edx movl %esi,%ebp rorl $2,%eax movl %edi,12(%esp) roll $5,%ebp leal 1518500249(%edi,%edx,1),%edi movl 16(%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl 24(%esp),%edx xorl %eax,%ebp xorl 48(%esp),%edx xorl %ebx,%ebp xorl 4(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,16(%esp) leal 1859775393(%edx,%ecx,1),%edx movl 20(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 28(%esp),%ecx xorl %esi,%ebp xorl 52(%esp),%ecx xorl %eax,%ebp xorl 8(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,20(%esp) leal 1859775393(%ecx,%ebx,1),%ecx movl 24(%esp),%ebx addl %ebp,%ecx movl %edx,%ebp xorl 32(%esp),%ebx xorl %edi,%ebp xorl 56(%esp),%ebx xorl %esi,%ebp xorl 12(%esp),%ebx roll $1,%ebx addl %ebp,%eax rorl $2,%edx movl %ecx,%ebp roll $5,%ebp movl %ebx,24(%esp) leal 1859775393(%ebx,%eax,1),%ebx movl 28(%esp),%eax addl %ebp,%ebx movl %ecx,%ebp xorl 36(%esp),%eax xorl %edx,%ebp xorl 60(%esp),%eax xorl %edi,%ebp xorl 16(%esp),%eax roll $1,%eax addl %ebp,%esi rorl $2,%ecx movl %ebx,%ebp roll $5,%ebp movl %eax,28(%esp) leal 1859775393(%eax,%esi,1),%eax movl 32(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl 40(%esp),%esi xorl %ecx,%ebp xorl (%esp),%esi xorl %edx,%ebp xorl 20(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp movl %esi,32(%esp) leal 1859775393(%esi,%edi,1),%esi movl 36(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 44(%esp),%edi xorl %ebx,%ebp xorl 4(%esp),%edi xorl %ecx,%ebp xorl 
24(%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp movl %edi,36(%esp) leal 1859775393(%edi,%edx,1),%edi movl 40(%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl 48(%esp),%edx xorl %eax,%ebp xorl 8(%esp),%edx xorl %ebx,%ebp xorl 28(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,40(%esp) leal 1859775393(%edx,%ecx,1),%edx movl 44(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 52(%esp),%ecx xorl %esi,%ebp xorl 12(%esp),%ecx xorl %eax,%ebp xorl 32(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,44(%esp) leal 1859775393(%ecx,%ebx,1),%ecx movl 48(%esp),%ebx addl %ebp,%ecx movl %edx,%ebp xorl 56(%esp),%ebx xorl %edi,%ebp xorl 16(%esp),%ebx xorl %esi,%ebp xorl 36(%esp),%ebx roll $1,%ebx addl %ebp,%eax rorl $2,%edx movl %ecx,%ebp roll $5,%ebp movl %ebx,48(%esp) leal 1859775393(%ebx,%eax,1),%ebx movl 52(%esp),%eax addl %ebp,%ebx movl %ecx,%ebp xorl 60(%esp),%eax xorl %edx,%ebp xorl 20(%esp),%eax xorl %edi,%ebp xorl 40(%esp),%eax roll $1,%eax addl %ebp,%esi rorl $2,%ecx movl %ebx,%ebp roll $5,%ebp movl %eax,52(%esp) leal 1859775393(%eax,%esi,1),%eax movl 56(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl (%esp),%esi xorl %ecx,%ebp xorl 24(%esp),%esi xorl %edx,%ebp xorl 44(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp movl %esi,56(%esp) leal 1859775393(%esi,%edi,1),%esi movl 60(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 4(%esp),%edi xorl %ebx,%ebp xorl 28(%esp),%edi xorl %ecx,%ebp xorl 48(%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp movl %edi,60(%esp) leal 1859775393(%edi,%edx,1),%edi movl (%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl 8(%esp),%edx xorl %eax,%ebp xorl 32(%esp),%edx xorl %ebx,%ebp xorl 52(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,(%esp) leal 1859775393(%edx,%ecx,1),%edx movl 4(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 12(%esp),%ecx xorl %esi,%ebp xorl 36(%esp),%ecx xorl %eax,%ebp xorl 56(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,4(%esp) leal 1859775393(%ecx,%ebx,1),%ecx movl 8(%esp),%ebx addl %ebp,%ecx movl %edx,%ebp xorl 16(%esp),%ebx xorl %edi,%ebp xorl 40(%esp),%ebx xorl %esi,%ebp xorl 60(%esp),%ebx roll $1,%ebx addl %ebp,%eax rorl $2,%edx movl %ecx,%ebp roll $5,%ebp movl %ebx,8(%esp) leal 1859775393(%ebx,%eax,1),%ebx movl 12(%esp),%eax addl %ebp,%ebx movl %ecx,%ebp xorl 20(%esp),%eax xorl %edx,%ebp xorl 44(%esp),%eax xorl %edi,%ebp xorl (%esp),%eax roll $1,%eax addl %ebp,%esi rorl $2,%ecx movl %ebx,%ebp roll $5,%ebp movl %eax,12(%esp) leal 1859775393(%eax,%esi,1),%eax movl 16(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl 24(%esp),%esi xorl %ecx,%ebp xorl 48(%esp),%esi xorl %edx,%ebp xorl 4(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp movl %esi,16(%esp) leal 1859775393(%esi,%edi,1),%esi movl 20(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 28(%esp),%edi xorl %ebx,%ebp xorl 52(%esp),%edi xorl %ecx,%ebp xorl 8(%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp movl %edi,20(%esp) leal 1859775393(%edi,%edx,1),%edi movl 24(%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl 32(%esp),%edx xorl %eax,%ebp xorl 56(%esp),%edx xorl %ebx,%ebp xorl 12(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,24(%esp) leal 1859775393(%edx,%ecx,1),%edx movl 28(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 36(%esp),%ecx xorl 
%esi,%ebp xorl 60(%esp),%ecx xorl %eax,%ebp xorl 16(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,28(%esp) leal 1859775393(%ecx,%ebx,1),%ecx movl 32(%esp),%ebx addl %ebp,%ecx movl %edi,%ebp xorl 40(%esp),%ebx xorl %esi,%ebp xorl (%esp),%ebx andl %edx,%ebp xorl 20(%esp),%ebx roll $1,%ebx addl %eax,%ebp rorl $2,%edx movl %ecx,%eax roll $5,%eax movl %ebx,32(%esp) leal 2400959708(%ebx,%ebp,1),%ebx movl %edi,%ebp addl %eax,%ebx andl %esi,%ebp movl 36(%esp),%eax addl %ebp,%ebx movl %edx,%ebp xorl 44(%esp),%eax xorl %edi,%ebp xorl 4(%esp),%eax andl %ecx,%ebp xorl 24(%esp),%eax roll $1,%eax addl %esi,%ebp rorl $2,%ecx movl %ebx,%esi roll $5,%esi movl %eax,36(%esp) leal 2400959708(%eax,%ebp,1),%eax movl %edx,%ebp addl %esi,%eax andl %edi,%ebp movl 40(%esp),%esi addl %ebp,%eax movl %ecx,%ebp xorl 48(%esp),%esi xorl %edx,%ebp xorl 8(%esp),%esi andl %ebx,%ebp xorl 28(%esp),%esi roll $1,%esi addl %edi,%ebp rorl $2,%ebx movl %eax,%edi roll $5,%edi movl %esi,40(%esp) leal 2400959708(%esi,%ebp,1),%esi movl %ecx,%ebp addl %edi,%esi andl %edx,%ebp movl 44(%esp),%edi addl %ebp,%esi movl %ebx,%ebp xorl 52(%esp),%edi xorl %ecx,%ebp xorl 12(%esp),%edi andl %eax,%ebp xorl 32(%esp),%edi roll $1,%edi addl %edx,%ebp rorl $2,%eax movl %esi,%edx roll $5,%edx movl %edi,44(%esp) leal 2400959708(%edi,%ebp,1),%edi movl %ebx,%ebp addl %edx,%edi andl %ecx,%ebp movl 48(%esp),%edx addl %ebp,%edi movl %eax,%ebp xorl 56(%esp),%edx xorl %ebx,%ebp xorl 16(%esp),%edx andl %esi,%ebp xorl 36(%esp),%edx roll $1,%edx addl %ecx,%ebp rorl $2,%esi movl %edi,%ecx roll $5,%ecx movl %edx,48(%esp) leal 2400959708(%edx,%ebp,1),%edx movl %eax,%ebp addl %ecx,%edx andl %ebx,%ebp movl 52(%esp),%ecx addl %ebp,%edx movl %esi,%ebp xorl 60(%esp),%ecx xorl %eax,%ebp xorl 20(%esp),%ecx andl %edi,%ebp xorl 40(%esp),%ecx roll $1,%ecx addl %ebx,%ebp rorl $2,%edi movl %edx,%ebx roll $5,%ebx movl %ecx,52(%esp) leal 2400959708(%ecx,%ebp,1),%ecx movl %esi,%ebp addl %ebx,%ecx andl %eax,%ebp movl 56(%esp),%ebx addl %ebp,%ecx movl %edi,%ebp xorl (%esp),%ebx xorl %esi,%ebp xorl 24(%esp),%ebx andl %edx,%ebp xorl 44(%esp),%ebx roll $1,%ebx addl %eax,%ebp rorl $2,%edx movl %ecx,%eax roll $5,%eax movl %ebx,56(%esp) leal 2400959708(%ebx,%ebp,1),%ebx movl %edi,%ebp addl %eax,%ebx andl %esi,%ebp movl 60(%esp),%eax addl %ebp,%ebx movl %edx,%ebp xorl 4(%esp),%eax xorl %edi,%ebp xorl 28(%esp),%eax andl %ecx,%ebp xorl 48(%esp),%eax roll $1,%eax addl %esi,%ebp rorl $2,%ecx movl %ebx,%esi roll $5,%esi movl %eax,60(%esp) leal 2400959708(%eax,%ebp,1),%eax movl %edx,%ebp addl %esi,%eax andl %edi,%ebp movl (%esp),%esi addl %ebp,%eax movl %ecx,%ebp xorl 8(%esp),%esi xorl %edx,%ebp xorl 32(%esp),%esi andl %ebx,%ebp xorl 52(%esp),%esi roll $1,%esi addl %edi,%ebp rorl $2,%ebx movl %eax,%edi roll $5,%edi movl %esi,(%esp) leal 2400959708(%esi,%ebp,1),%esi movl %ecx,%ebp addl %edi,%esi andl %edx,%ebp movl 4(%esp),%edi addl %ebp,%esi movl %ebx,%ebp xorl 12(%esp),%edi xorl %ecx,%ebp xorl 36(%esp),%edi andl %eax,%ebp xorl 56(%esp),%edi roll $1,%edi addl %edx,%ebp rorl $2,%eax movl %esi,%edx roll $5,%edx movl %edi,4(%esp) leal 2400959708(%edi,%ebp,1),%edi movl %ebx,%ebp addl %edx,%edi andl %ecx,%ebp movl 8(%esp),%edx addl %ebp,%edi movl %eax,%ebp xorl 16(%esp),%edx xorl %ebx,%ebp xorl 40(%esp),%edx andl %esi,%ebp xorl 60(%esp),%edx roll $1,%edx addl %ecx,%ebp rorl $2,%esi movl %edi,%ecx roll $5,%ecx movl %edx,8(%esp) leal 2400959708(%edx,%ebp,1),%edx movl %eax,%ebp addl %ecx,%edx andl %ebx,%ebp movl 12(%esp),%ecx addl %ebp,%edx movl %esi,%ebp 
xorl 20(%esp),%ecx xorl %eax,%ebp xorl 44(%esp),%ecx andl %edi,%ebp xorl (%esp),%ecx roll $1,%ecx addl %ebx,%ebp rorl $2,%edi movl %edx,%ebx roll $5,%ebx movl %ecx,12(%esp) leal 2400959708(%ecx,%ebp,1),%ecx movl %esi,%ebp addl %ebx,%ecx andl %eax,%ebp movl 16(%esp),%ebx addl %ebp,%ecx movl %edi,%ebp xorl 24(%esp),%ebx xorl %esi,%ebp xorl 48(%esp),%ebx andl %edx,%ebp xorl 4(%esp),%ebx roll $1,%ebx addl %eax,%ebp rorl $2,%edx movl %ecx,%eax roll $5,%eax movl %ebx,16(%esp) leal 2400959708(%ebx,%ebp,1),%ebx movl %edi,%ebp addl %eax,%ebx andl %esi,%ebp movl 20(%esp),%eax addl %ebp,%ebx movl %edx,%ebp xorl 28(%esp),%eax xorl %edi,%ebp xorl 52(%esp),%eax andl %ecx,%ebp xorl 8(%esp),%eax roll $1,%eax addl %esi,%ebp rorl $2,%ecx movl %ebx,%esi roll $5,%esi movl %eax,20(%esp) leal 2400959708(%eax,%ebp,1),%eax movl %edx,%ebp addl %esi,%eax andl %edi,%ebp movl 24(%esp),%esi addl %ebp,%eax movl %ecx,%ebp xorl 32(%esp),%esi xorl %edx,%ebp xorl 56(%esp),%esi andl %ebx,%ebp xorl 12(%esp),%esi roll $1,%esi addl %edi,%ebp rorl $2,%ebx movl %eax,%edi roll $5,%edi movl %esi,24(%esp) leal 2400959708(%esi,%ebp,1),%esi movl %ecx,%ebp addl %edi,%esi andl %edx,%ebp movl 28(%esp),%edi addl %ebp,%esi movl %ebx,%ebp xorl 36(%esp),%edi xorl %ecx,%ebp xorl 60(%esp),%edi andl %eax,%ebp xorl 16(%esp),%edi roll $1,%edi addl %edx,%ebp rorl $2,%eax movl %esi,%edx roll $5,%edx movl %edi,28(%esp) leal 2400959708(%edi,%ebp,1),%edi movl %ebx,%ebp addl %edx,%edi andl %ecx,%ebp movl 32(%esp),%edx addl %ebp,%edi movl %eax,%ebp xorl 40(%esp),%edx xorl %ebx,%ebp xorl (%esp),%edx andl %esi,%ebp xorl 20(%esp),%edx roll $1,%edx addl %ecx,%ebp rorl $2,%esi movl %edi,%ecx roll $5,%ecx movl %edx,32(%esp) leal 2400959708(%edx,%ebp,1),%edx movl %eax,%ebp addl %ecx,%edx andl %ebx,%ebp movl 36(%esp),%ecx addl %ebp,%edx movl %esi,%ebp xorl 44(%esp),%ecx xorl %eax,%ebp xorl 4(%esp),%ecx andl %edi,%ebp xorl 24(%esp),%ecx roll $1,%ecx addl %ebx,%ebp rorl $2,%edi movl %edx,%ebx roll $5,%ebx movl %ecx,36(%esp) leal 2400959708(%ecx,%ebp,1),%ecx movl %esi,%ebp addl %ebx,%ecx andl %eax,%ebp movl 40(%esp),%ebx addl %ebp,%ecx movl %edi,%ebp xorl 48(%esp),%ebx xorl %esi,%ebp xorl 8(%esp),%ebx andl %edx,%ebp xorl 28(%esp),%ebx roll $1,%ebx addl %eax,%ebp rorl $2,%edx movl %ecx,%eax roll $5,%eax movl %ebx,40(%esp) leal 2400959708(%ebx,%ebp,1),%ebx movl %edi,%ebp addl %eax,%ebx andl %esi,%ebp movl 44(%esp),%eax addl %ebp,%ebx movl %edx,%ebp xorl 52(%esp),%eax xorl %edi,%ebp xorl 12(%esp),%eax andl %ecx,%ebp xorl 32(%esp),%eax roll $1,%eax addl %esi,%ebp rorl $2,%ecx movl %ebx,%esi roll $5,%esi movl %eax,44(%esp) leal 2400959708(%eax,%ebp,1),%eax movl %edx,%ebp addl %esi,%eax andl %edi,%ebp movl 48(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl 56(%esp),%esi xorl %ecx,%ebp xorl 16(%esp),%esi xorl %edx,%ebp xorl 36(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp movl %esi,48(%esp) leal 3395469782(%esi,%edi,1),%esi movl 52(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 60(%esp),%edi xorl %ebx,%ebp xorl 20(%esp),%edi xorl %ecx,%ebp xorl 40(%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp movl %edi,52(%esp) leal 3395469782(%edi,%edx,1),%edi movl 56(%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl (%esp),%edx xorl %eax,%ebp xorl 24(%esp),%edx xorl %ebx,%ebp xorl 44(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,56(%esp) leal 3395469782(%edx,%ecx,1),%edx movl 60(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 4(%esp),%ecx xorl %esi,%ebp xorl 28(%esp),%ecx xorl 
%eax,%ebp xorl 48(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,60(%esp) leal 3395469782(%ecx,%ebx,1),%ecx movl (%esp),%ebx addl %ebp,%ecx movl %edx,%ebp xorl 8(%esp),%ebx xorl %edi,%ebp xorl 32(%esp),%ebx xorl %esi,%ebp xorl 52(%esp),%ebx roll $1,%ebx addl %ebp,%eax rorl $2,%edx movl %ecx,%ebp roll $5,%ebp movl %ebx,(%esp) leal 3395469782(%ebx,%eax,1),%ebx movl 4(%esp),%eax addl %ebp,%ebx movl %ecx,%ebp xorl 12(%esp),%eax xorl %edx,%ebp xorl 36(%esp),%eax xorl %edi,%ebp xorl 56(%esp),%eax roll $1,%eax addl %ebp,%esi rorl $2,%ecx movl %ebx,%ebp roll $5,%ebp movl %eax,4(%esp) leal 3395469782(%eax,%esi,1),%eax movl 8(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl 16(%esp),%esi xorl %ecx,%ebp xorl 40(%esp),%esi xorl %edx,%ebp xorl 60(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp movl %esi,8(%esp) leal 3395469782(%esi,%edi,1),%esi movl 12(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 20(%esp),%edi xorl %ebx,%ebp xorl 44(%esp),%edi xorl %ecx,%ebp xorl (%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp movl %edi,12(%esp) leal 3395469782(%edi,%edx,1),%edi movl 16(%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl 24(%esp),%edx xorl %eax,%ebp xorl 48(%esp),%edx xorl %ebx,%ebp xorl 4(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,16(%esp) leal 3395469782(%edx,%ecx,1),%edx movl 20(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 28(%esp),%ecx xorl %esi,%ebp xorl 52(%esp),%ecx xorl %eax,%ebp xorl 8(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,20(%esp) leal 3395469782(%ecx,%ebx,1),%ecx movl 24(%esp),%ebx addl %ebp,%ecx movl %edx,%ebp xorl 32(%esp),%ebx xorl %edi,%ebp xorl 56(%esp),%ebx xorl %esi,%ebp xorl 12(%esp),%ebx roll $1,%ebx addl %ebp,%eax rorl $2,%edx movl %ecx,%ebp roll $5,%ebp movl %ebx,24(%esp) leal 3395469782(%ebx,%eax,1),%ebx movl 28(%esp),%eax addl %ebp,%ebx movl %ecx,%ebp xorl 36(%esp),%eax xorl %edx,%ebp xorl 60(%esp),%eax xorl %edi,%ebp xorl 16(%esp),%eax roll $1,%eax addl %ebp,%esi rorl $2,%ecx movl %ebx,%ebp roll $5,%ebp movl %eax,28(%esp) leal 3395469782(%eax,%esi,1),%eax movl 32(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl 40(%esp),%esi xorl %ecx,%ebp xorl (%esp),%esi xorl %edx,%ebp xorl 20(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp movl %esi,32(%esp) leal 3395469782(%esi,%edi,1),%esi movl 36(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 44(%esp),%edi xorl %ebx,%ebp xorl 4(%esp),%edi xorl %ecx,%ebp xorl 24(%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp movl %edi,36(%esp) leal 3395469782(%edi,%edx,1),%edi movl 40(%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl 48(%esp),%edx xorl %eax,%ebp xorl 8(%esp),%edx xorl %ebx,%ebp xorl 28(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,40(%esp) leal 3395469782(%edx,%ecx,1),%edx movl 44(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 52(%esp),%ecx xorl %esi,%ebp xorl 12(%esp),%ecx xorl %eax,%ebp xorl 32(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,44(%esp) leal 3395469782(%ecx,%ebx,1),%ecx movl 48(%esp),%ebx addl %ebp,%ecx movl %edx,%ebp xorl 56(%esp),%ebx xorl %edi,%ebp xorl 16(%esp),%ebx xorl %esi,%ebp xorl 36(%esp),%ebx roll $1,%ebx addl %ebp,%eax rorl $2,%edx movl %ecx,%ebp roll $5,%ebp movl %ebx,48(%esp) leal 3395469782(%ebx,%eax,1),%ebx movl 52(%esp),%eax addl %ebp,%ebx movl %ecx,%ebp xorl 
60(%esp),%eax xorl %edx,%ebp xorl 20(%esp),%eax xorl %edi,%ebp xorl 40(%esp),%eax roll $1,%eax addl %ebp,%esi rorl $2,%ecx movl %ebx,%ebp roll $5,%ebp leal 3395469782(%eax,%esi,1),%eax movl 56(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl (%esp),%esi xorl %ecx,%ebp xorl 24(%esp),%esi xorl %edx,%ebp xorl 44(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp leal 3395469782(%esi,%edi,1),%esi movl 60(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 4(%esp),%edi xorl %ebx,%ebp xorl 28(%esp),%edi xorl %ecx,%ebp xorl 48(%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp leal 3395469782(%edi,%edx,1),%edi addl %ebp,%edi movl 96(%esp),%ebp movl 100(%esp),%edx addl (%ebp),%edi addl 4(%ebp),%esi addl 8(%ebp),%eax addl 12(%ebp),%ebx addl 16(%ebp),%ecx movl %edi,(%ebp) addl $64,%edx movl %esi,4(%ebp) cmpl 104(%esp),%edx movl %eax,8(%ebp) movl %ecx,%edi movl %ebx,12(%ebp) movl %edx,%esi movl %ecx,16(%ebp) jb .L002loop addl $76,%esp popl %edi popl %esi popl %ebx popl %ebp ret .size sha1_block_data_order,.-.L_sha1_block_data_order_begin .type _sha1_block_data_order_shaext,@function .align 16 _sha1_block_data_order_shaext: pushl %ebp pushl %ebx pushl %esi pushl %edi call .L003pic_point .L003pic_point: popl %ebp leal .LK_XX_XX-.L003pic_point(%ebp),%ebp .Lshaext_shortcut: movl 20(%esp),%edi movl %esp,%ebx movl 24(%esp),%esi movl 28(%esp),%ecx subl $32,%esp movdqu (%edi),%xmm0 movd 16(%edi),%xmm1 andl $-32,%esp movdqa 80(%ebp),%xmm3 movdqu (%esi),%xmm4 pshufd $27,%xmm0,%xmm0 movdqu 16(%esi),%xmm5 pshufd $27,%xmm1,%xmm1 movdqu 32(%esi),%xmm6 .byte 102,15,56,0,227 movdqu 48(%esi),%xmm7 .byte 102,15,56,0,235 .byte 102,15,56,0,243 .byte 102,15,56,0,251 jmp .L004loop_shaext .align 16 .L004loop_shaext: decl %ecx leal 64(%esi),%eax movdqa %xmm1,(%esp) paddd %xmm4,%xmm1 cmovnel %eax,%esi movdqa %xmm0,16(%esp) .byte 15,56,201,229 movdqa %xmm0,%xmm2 .byte 15,58,204,193,0 .byte 15,56,200,213 pxor %xmm6,%xmm4 .byte 15,56,201,238 .byte 15,56,202,231 movdqa %xmm0,%xmm1 .byte 15,58,204,194,0 .byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 movdqa %xmm0,%xmm2 .byte 15,58,204,193,0 .byte 15,56,200,215 pxor %xmm4,%xmm6 .byte 15,56,201,252 .byte 15,56,202,245 movdqa %xmm0,%xmm1 .byte 15,58,204,194,0 .byte 15,56,200,204 pxor %xmm5,%xmm7 .byte 15,56,202,254 .byte 15,56,201,229 movdqa %xmm0,%xmm2 .byte 15,58,204,193,0 .byte 15,56,200,213 pxor %xmm6,%xmm4 .byte 15,56,201,238 .byte 15,56,202,231 movdqa %xmm0,%xmm1 .byte 15,58,204,194,1 .byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 movdqa %xmm0,%xmm2 .byte 15,58,204,193,1 .byte 15,56,200,215 pxor %xmm4,%xmm6 .byte 15,56,201,252 .byte 15,56,202,245 movdqa %xmm0,%xmm1 .byte 15,58,204,194,1 .byte 15,56,200,204 pxor %xmm5,%xmm7 .byte 15,56,202,254 .byte 15,56,201,229 movdqa %xmm0,%xmm2 .byte 15,58,204,193,1 .byte 15,56,200,213 pxor %xmm6,%xmm4 .byte 15,56,201,238 .byte 15,56,202,231 movdqa %xmm0,%xmm1 .byte 15,58,204,194,1 .byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 movdqa %xmm0,%xmm2 .byte 15,58,204,193,2 .byte 15,56,200,215 pxor %xmm4,%xmm6 .byte 15,56,201,252 .byte 15,56,202,245 movdqa %xmm0,%xmm1 .byte 15,58,204,194,2 .byte 15,56,200,204 pxor %xmm5,%xmm7 .byte 15,56,202,254 .byte 15,56,201,229 movdqa %xmm0,%xmm2 .byte 15,58,204,193,2 .byte 15,56,200,213 pxor %xmm6,%xmm4 .byte 15,56,201,238 .byte 15,56,202,231 movdqa %xmm0,%xmm1 .byte 15,58,204,194,2 .byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 movdqa 
%xmm0,%xmm2 .byte 15,58,204,193,2 .byte 15,56,200,215 pxor %xmm4,%xmm6 .byte 15,56,201,252 .byte 15,56,202,245 movdqa %xmm0,%xmm1 .byte 15,58,204,194,3 .byte 15,56,200,204 pxor %xmm5,%xmm7 .byte 15,56,202,254 movdqu (%esi),%xmm4 movdqa %xmm0,%xmm2 .byte 15,58,204,193,3 .byte 15,56,200,213 movdqu 16(%esi),%xmm5 .byte 102,15,56,0,227 movdqa %xmm0,%xmm1 .byte 15,58,204,194,3 .byte 15,56,200,206 movdqu 32(%esi),%xmm6 .byte 102,15,56,0,235 movdqa %xmm0,%xmm2 .byte 15,58,204,193,3 .byte 15,56,200,215 movdqu 48(%esi),%xmm7 .byte 102,15,56,0,243 movdqa %xmm0,%xmm1 .byte 15,58,204,194,3 movdqa (%esp),%xmm2 .byte 102,15,56,0,251 .byte 15,56,200,202 paddd 16(%esp),%xmm0 jnz .L004loop_shaext pshufd $27,%xmm0,%xmm0 pshufd $27,%xmm1,%xmm1 movdqu %xmm0,(%edi) movd %xmm1,16(%edi) movl %ebx,%esp popl %edi popl %esi popl %ebx popl %ebp ret .size _sha1_block_data_order_shaext,.-_sha1_block_data_order_shaext .type _sha1_block_data_order_ssse3,@function .align 16 _sha1_block_data_order_ssse3: pushl %ebp pushl %ebx pushl %esi pushl %edi call .L005pic_point .L005pic_point: popl %ebp leal .LK_XX_XX-.L005pic_point(%ebp),%ebp .Lssse3_shortcut: movdqa (%ebp),%xmm7 movdqa 16(%ebp),%xmm0 movdqa 32(%ebp),%xmm1 movdqa 48(%ebp),%xmm2 movdqa 64(%ebp),%xmm6 movl 20(%esp),%edi movl 24(%esp),%ebp movl 28(%esp),%edx movl %esp,%esi subl $208,%esp andl $-64,%esp movdqa %xmm0,112(%esp) movdqa %xmm1,128(%esp) movdqa %xmm2,144(%esp) shll $6,%edx movdqa %xmm7,160(%esp) addl %ebp,%edx movdqa %xmm6,176(%esp) addl $64,%ebp movl %edi,192(%esp) movl %ebp,196(%esp) movl %edx,200(%esp) movl %esi,204(%esp) movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx movl 12(%edi),%edx movl 16(%edi),%edi movl %ebx,%esi movdqu -64(%ebp),%xmm0 movdqu -48(%ebp),%xmm1 movdqu -32(%ebp),%xmm2 movdqu -16(%ebp),%xmm3 .byte 102,15,56,0,198 .byte 102,15,56,0,206 .byte 102,15,56,0,214 movdqa %xmm7,96(%esp) .byte 102,15,56,0,222 paddd %xmm7,%xmm0 paddd %xmm7,%xmm1 paddd %xmm7,%xmm2 movdqa %xmm0,(%esp) psubd %xmm7,%xmm0 movdqa %xmm1,16(%esp) psubd %xmm7,%xmm1 movdqa %xmm2,32(%esp) movl %ecx,%ebp psubd %xmm7,%xmm2 xorl %edx,%ebp pshufd $238,%xmm0,%xmm4 andl %ebp,%esi jmp .L006loop .align 16 .L006loop: rorl $2,%ebx xorl %edx,%esi movl %eax,%ebp punpcklqdq %xmm1,%xmm4 movdqa %xmm3,%xmm6 addl (%esp),%edi xorl %ecx,%ebx paddd %xmm3,%xmm7 movdqa %xmm0,64(%esp) roll $5,%eax addl %esi,%edi psrldq $4,%xmm6 andl %ebx,%ebp xorl %ecx,%ebx pxor %xmm0,%xmm4 addl %eax,%edi rorl $7,%eax pxor %xmm2,%xmm6 xorl %ecx,%ebp movl %edi,%esi addl 4(%esp),%edx pxor %xmm6,%xmm4 xorl %ebx,%eax roll $5,%edi movdqa %xmm7,48(%esp) addl %ebp,%edx andl %eax,%esi movdqa %xmm4,%xmm0 xorl %ebx,%eax addl %edi,%edx rorl $7,%edi movdqa %xmm4,%xmm6 xorl %ebx,%esi pslldq $12,%xmm0 paddd %xmm4,%xmm4 movl %edx,%ebp addl 8(%esp),%ecx psrld $31,%xmm6 xorl %eax,%edi roll $5,%edx movdqa %xmm0,%xmm7 addl %esi,%ecx andl %edi,%ebp xorl %eax,%edi psrld $30,%xmm0 addl %edx,%ecx rorl $7,%edx por %xmm6,%xmm4 xorl %eax,%ebp movl %ecx,%esi addl 12(%esp),%ebx pslld $2,%xmm7 xorl %edi,%edx roll $5,%ecx pxor %xmm0,%xmm4 movdqa 96(%esp),%xmm0 addl %ebp,%ebx andl %edx,%esi pxor %xmm7,%xmm4 pshufd $238,%xmm1,%xmm5 xorl %edi,%edx addl %ecx,%ebx rorl $7,%ecx xorl %edi,%esi movl %ebx,%ebp punpcklqdq %xmm2,%xmm5 movdqa %xmm4,%xmm7 addl 16(%esp),%eax xorl %edx,%ecx paddd %xmm4,%xmm0 movdqa %xmm1,80(%esp) roll $5,%ebx addl %esi,%eax psrldq $4,%xmm7 andl %ecx,%ebp xorl %edx,%ecx pxor %xmm1,%xmm5 addl %ebx,%eax rorl $7,%ebx pxor %xmm3,%xmm7 xorl %edx,%ebp movl %eax,%esi addl 20(%esp),%edi pxor %xmm7,%xmm5 xorl %ecx,%ebx roll 
$5,%eax movdqa %xmm0,(%esp) addl %ebp,%edi andl %ebx,%esi movdqa %xmm5,%xmm1 xorl %ecx,%ebx addl %eax,%edi rorl $7,%eax movdqa %xmm5,%xmm7 xorl %ecx,%esi pslldq $12,%xmm1 paddd %xmm5,%xmm5 movl %edi,%ebp addl 24(%esp),%edx psrld $31,%xmm7 xorl %ebx,%eax roll $5,%edi movdqa %xmm1,%xmm0 addl %esi,%edx andl %eax,%ebp xorl %ebx,%eax psrld $30,%xmm1 addl %edi,%edx rorl $7,%edi por %xmm7,%xmm5 xorl %ebx,%ebp movl %edx,%esi addl 28(%esp),%ecx pslld $2,%xmm0 xorl %eax,%edi roll $5,%edx pxor %xmm1,%xmm5 movdqa 112(%esp),%xmm1 addl %ebp,%ecx andl %edi,%esi pxor %xmm0,%xmm5 pshufd $238,%xmm2,%xmm6 xorl %eax,%edi addl %edx,%ecx rorl $7,%edx xorl %eax,%esi movl %ecx,%ebp punpcklqdq %xmm3,%xmm6 movdqa %xmm5,%xmm0 addl 32(%esp),%ebx xorl %edi,%edx paddd %xmm5,%xmm1 movdqa %xmm2,96(%esp) roll $5,%ecx addl %esi,%ebx psrldq $4,%xmm0 andl %edx,%ebp xorl %edi,%edx pxor %xmm2,%xmm6 addl %ecx,%ebx rorl $7,%ecx pxor %xmm4,%xmm0 xorl %edi,%ebp movl %ebx,%esi addl 36(%esp),%eax pxor %xmm0,%xmm6 xorl %edx,%ecx roll $5,%ebx movdqa %xmm1,16(%esp) addl %ebp,%eax andl %ecx,%esi movdqa %xmm6,%xmm2 xorl %edx,%ecx addl %ebx,%eax rorl $7,%ebx movdqa %xmm6,%xmm0 xorl %edx,%esi pslldq $12,%xmm2 paddd %xmm6,%xmm6 movl %eax,%ebp addl 40(%esp),%edi psrld $31,%xmm0 xorl %ecx,%ebx roll $5,%eax movdqa %xmm2,%xmm1 addl %esi,%edi andl %ebx,%ebp xorl %ecx,%ebx psrld $30,%xmm2 addl %eax,%edi rorl $7,%eax por %xmm0,%xmm6 xorl %ecx,%ebp movdqa 64(%esp),%xmm0 movl %edi,%esi addl 44(%esp),%edx pslld $2,%xmm1 xorl %ebx,%eax roll $5,%edi pxor %xmm2,%xmm6 movdqa 112(%esp),%xmm2 addl %ebp,%edx andl %eax,%esi pxor %xmm1,%xmm6 pshufd $238,%xmm3,%xmm7 xorl %ebx,%eax addl %edi,%edx rorl $7,%edi xorl %ebx,%esi movl %edx,%ebp punpcklqdq %xmm4,%xmm7 movdqa %xmm6,%xmm1 addl 48(%esp),%ecx xorl %eax,%edi paddd %xmm6,%xmm2 movdqa %xmm3,64(%esp) roll $5,%edx addl %esi,%ecx psrldq $4,%xmm1 andl %edi,%ebp xorl %eax,%edi pxor %xmm3,%xmm7 addl %edx,%ecx rorl $7,%edx pxor %xmm5,%xmm1 xorl %eax,%ebp movl %ecx,%esi addl 52(%esp),%ebx pxor %xmm1,%xmm7 xorl %edi,%edx roll $5,%ecx movdqa %xmm2,32(%esp) addl %ebp,%ebx andl %edx,%esi movdqa %xmm7,%xmm3 xorl %edi,%edx addl %ecx,%ebx rorl $7,%ecx movdqa %xmm7,%xmm1 xorl %edi,%esi pslldq $12,%xmm3 paddd %xmm7,%xmm7 movl %ebx,%ebp addl 56(%esp),%eax psrld $31,%xmm1 xorl %edx,%ecx roll $5,%ebx movdqa %xmm3,%xmm2 addl %esi,%eax andl %ecx,%ebp xorl %edx,%ecx psrld $30,%xmm3 addl %ebx,%eax rorl $7,%ebx por %xmm1,%xmm7 xorl %edx,%ebp movdqa 80(%esp),%xmm1 movl %eax,%esi addl 60(%esp),%edi pslld $2,%xmm2 xorl %ecx,%ebx roll $5,%eax pxor %xmm3,%xmm7 movdqa 112(%esp),%xmm3 addl %ebp,%edi andl %ebx,%esi pxor %xmm2,%xmm7 pshufd $238,%xmm6,%xmm2 xorl %ecx,%ebx addl %eax,%edi rorl $7,%eax pxor %xmm4,%xmm0 punpcklqdq %xmm7,%xmm2 xorl %ecx,%esi movl %edi,%ebp addl (%esp),%edx pxor %xmm1,%xmm0 movdqa %xmm4,80(%esp) xorl %ebx,%eax roll $5,%edi movdqa %xmm3,%xmm4 addl %esi,%edx paddd %xmm7,%xmm3 andl %eax,%ebp pxor %xmm2,%xmm0 xorl %ebx,%eax addl %edi,%edx rorl $7,%edi xorl %ebx,%ebp movdqa %xmm0,%xmm2 movdqa %xmm3,48(%esp) movl %edx,%esi addl 4(%esp),%ecx xorl %eax,%edi roll $5,%edx pslld $2,%xmm0 addl %ebp,%ecx andl %edi,%esi psrld $30,%xmm2 xorl %eax,%edi addl %edx,%ecx rorl $7,%edx xorl %eax,%esi movl %ecx,%ebp addl 8(%esp),%ebx xorl %edi,%edx roll $5,%ecx por %xmm2,%xmm0 addl %esi,%ebx andl %edx,%ebp movdqa 96(%esp),%xmm2 xorl %edi,%edx addl %ecx,%ebx addl 12(%esp),%eax xorl %edi,%ebp movl %ebx,%esi pshufd $238,%xmm7,%xmm3 roll $5,%ebx addl %ebp,%eax xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax addl 16(%esp),%edi pxor %xmm5,%xmm1 
punpcklqdq %xmm0,%xmm3 xorl %ecx,%esi movl %eax,%ebp roll $5,%eax pxor %xmm2,%xmm1 movdqa %xmm5,96(%esp) addl %esi,%edi xorl %ecx,%ebp movdqa %xmm4,%xmm5 rorl $7,%ebx paddd %xmm0,%xmm4 addl %eax,%edi pxor %xmm3,%xmm1 addl 20(%esp),%edx xorl %ebx,%ebp movl %edi,%esi roll $5,%edi movdqa %xmm1,%xmm3 movdqa %xmm4,(%esp) addl %ebp,%edx xorl %ebx,%esi rorl $7,%eax addl %edi,%edx pslld $2,%xmm1 addl 24(%esp),%ecx xorl %eax,%esi psrld $30,%xmm3 movl %edx,%ebp roll $5,%edx addl %esi,%ecx xorl %eax,%ebp rorl $7,%edi addl %edx,%ecx por %xmm3,%xmm1 addl 28(%esp),%ebx xorl %edi,%ebp movdqa 64(%esp),%xmm3 movl %ecx,%esi roll $5,%ecx addl %ebp,%ebx xorl %edi,%esi rorl $7,%edx pshufd $238,%xmm0,%xmm4 addl %ecx,%ebx addl 32(%esp),%eax pxor %xmm6,%xmm2 punpcklqdq %xmm1,%xmm4 xorl %edx,%esi movl %ebx,%ebp roll $5,%ebx pxor %xmm3,%xmm2 movdqa %xmm6,64(%esp) addl %esi,%eax xorl %edx,%ebp movdqa 128(%esp),%xmm6 rorl $7,%ecx paddd %xmm1,%xmm5 addl %ebx,%eax pxor %xmm4,%xmm2 addl 36(%esp),%edi xorl %ecx,%ebp movl %eax,%esi roll $5,%eax movdqa %xmm2,%xmm4 movdqa %xmm5,16(%esp) addl %ebp,%edi xorl %ecx,%esi rorl $7,%ebx addl %eax,%edi pslld $2,%xmm2 addl 40(%esp),%edx xorl %ebx,%esi psrld $30,%xmm4 movl %edi,%ebp roll $5,%edi addl %esi,%edx xorl %ebx,%ebp rorl $7,%eax addl %edi,%edx por %xmm4,%xmm2 addl 44(%esp),%ecx xorl %eax,%ebp movdqa 80(%esp),%xmm4 movl %edx,%esi roll $5,%edx addl %ebp,%ecx xorl %eax,%esi rorl $7,%edi pshufd $238,%xmm1,%xmm5 addl %edx,%ecx addl 48(%esp),%ebx pxor %xmm7,%xmm3 punpcklqdq %xmm2,%xmm5 xorl %edi,%esi movl %ecx,%ebp roll $5,%ecx pxor %xmm4,%xmm3 movdqa %xmm7,80(%esp) addl %esi,%ebx xorl %edi,%ebp movdqa %xmm6,%xmm7 rorl $7,%edx paddd %xmm2,%xmm6 addl %ecx,%ebx pxor %xmm5,%xmm3 addl 52(%esp),%eax xorl %edx,%ebp movl %ebx,%esi roll $5,%ebx movdqa %xmm3,%xmm5 movdqa %xmm6,32(%esp) addl %ebp,%eax xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax pslld $2,%xmm3 addl 56(%esp),%edi xorl %ecx,%esi psrld $30,%xmm5 movl %eax,%ebp roll $5,%eax addl %esi,%edi xorl %ecx,%ebp rorl $7,%ebx addl %eax,%edi por %xmm5,%xmm3 addl 60(%esp),%edx xorl %ebx,%ebp movdqa 96(%esp),%xmm5 movl %edi,%esi roll $5,%edi addl %ebp,%edx xorl %ebx,%esi rorl $7,%eax pshufd $238,%xmm2,%xmm6 addl %edi,%edx addl (%esp),%ecx pxor %xmm0,%xmm4 punpcklqdq %xmm3,%xmm6 xorl %eax,%esi movl %edx,%ebp roll $5,%edx pxor %xmm5,%xmm4 movdqa %xmm0,96(%esp) addl %esi,%ecx xorl %eax,%ebp movdqa %xmm7,%xmm0 rorl $7,%edi paddd %xmm3,%xmm7 addl %edx,%ecx pxor %xmm6,%xmm4 addl 4(%esp),%ebx xorl %edi,%ebp movl %ecx,%esi roll $5,%ecx movdqa %xmm4,%xmm6 movdqa %xmm7,48(%esp) addl %ebp,%ebx xorl %edi,%esi rorl $7,%edx addl %ecx,%ebx pslld $2,%xmm4 addl 8(%esp),%eax xorl %edx,%esi psrld $30,%xmm6 movl %ebx,%ebp roll $5,%ebx addl %esi,%eax xorl %edx,%ebp rorl $7,%ecx addl %ebx,%eax por %xmm6,%xmm4 addl 12(%esp),%edi xorl %ecx,%ebp movdqa 64(%esp),%xmm6 movl %eax,%esi roll $5,%eax addl %ebp,%edi xorl %ecx,%esi rorl $7,%ebx pshufd $238,%xmm3,%xmm7 addl %eax,%edi addl 16(%esp),%edx pxor %xmm1,%xmm5 punpcklqdq %xmm4,%xmm7 xorl %ebx,%esi movl %edi,%ebp roll $5,%edi pxor %xmm6,%xmm5 movdqa %xmm1,64(%esp) addl %esi,%edx xorl %ebx,%ebp movdqa %xmm0,%xmm1 rorl $7,%eax paddd %xmm4,%xmm0 addl %edi,%edx pxor %xmm7,%xmm5 addl 20(%esp),%ecx xorl %eax,%ebp movl %edx,%esi roll $5,%edx movdqa %xmm5,%xmm7 movdqa %xmm0,(%esp) addl %ebp,%ecx xorl %eax,%esi rorl $7,%edi addl %edx,%ecx pslld $2,%xmm5 addl 24(%esp),%ebx xorl %edi,%esi psrld $30,%xmm7 movl %ecx,%ebp roll $5,%ecx addl %esi,%ebx xorl %edi,%ebp rorl $7,%edx addl %ecx,%ebx por %xmm7,%xmm5 addl 28(%esp),%eax movdqa 
80(%esp),%xmm7 rorl $7,%ecx movl %ebx,%esi xorl %edx,%ebp roll $5,%ebx pshufd $238,%xmm4,%xmm0 addl %ebp,%eax xorl %ecx,%esi xorl %edx,%ecx addl %ebx,%eax addl 32(%esp),%edi pxor %xmm2,%xmm6 punpcklqdq %xmm5,%xmm0 andl %ecx,%esi xorl %edx,%ecx rorl $7,%ebx pxor %xmm7,%xmm6 movdqa %xmm2,80(%esp) movl %eax,%ebp xorl %ecx,%esi roll $5,%eax movdqa %xmm1,%xmm2 addl %esi,%edi paddd %xmm5,%xmm1 xorl %ebx,%ebp pxor %xmm0,%xmm6 xorl %ecx,%ebx addl %eax,%edi addl 36(%esp),%edx andl %ebx,%ebp movdqa %xmm6,%xmm0 movdqa %xmm1,16(%esp) xorl %ecx,%ebx rorl $7,%eax movl %edi,%esi xorl %ebx,%ebp roll $5,%edi pslld $2,%xmm6 addl %ebp,%edx xorl %eax,%esi psrld $30,%xmm0 xorl %ebx,%eax addl %edi,%edx addl 40(%esp),%ecx andl %eax,%esi xorl %ebx,%eax rorl $7,%edi por %xmm0,%xmm6 movl %edx,%ebp xorl %eax,%esi movdqa 96(%esp),%xmm0 roll $5,%edx addl %esi,%ecx xorl %edi,%ebp xorl %eax,%edi addl %edx,%ecx pshufd $238,%xmm5,%xmm1 addl 44(%esp),%ebx andl %edi,%ebp xorl %eax,%edi rorl $7,%edx movl %ecx,%esi xorl %edi,%ebp roll $5,%ecx addl %ebp,%ebx xorl %edx,%esi xorl %edi,%edx addl %ecx,%ebx addl 48(%esp),%eax pxor %xmm3,%xmm7 punpcklqdq %xmm6,%xmm1 andl %edx,%esi xorl %edi,%edx rorl $7,%ecx pxor %xmm0,%xmm7 movdqa %xmm3,96(%esp) movl %ebx,%ebp xorl %edx,%esi roll $5,%ebx movdqa 144(%esp),%xmm3 addl %esi,%eax paddd %xmm6,%xmm2 xorl %ecx,%ebp pxor %xmm1,%xmm7 xorl %edx,%ecx addl %ebx,%eax addl 52(%esp),%edi andl %ecx,%ebp movdqa %xmm7,%xmm1 movdqa %xmm2,32(%esp) xorl %edx,%ecx rorl $7,%ebx movl %eax,%esi xorl %ecx,%ebp roll $5,%eax pslld $2,%xmm7 addl %ebp,%edi xorl %ebx,%esi psrld $30,%xmm1 xorl %ecx,%ebx addl %eax,%edi addl 56(%esp),%edx andl %ebx,%esi xorl %ecx,%ebx rorl $7,%eax por %xmm1,%xmm7 movl %edi,%ebp xorl %ebx,%esi movdqa 64(%esp),%xmm1 roll $5,%edi addl %esi,%edx xorl %eax,%ebp xorl %ebx,%eax addl %edi,%edx pshufd $238,%xmm6,%xmm2 addl 60(%esp),%ecx andl %eax,%ebp xorl %ebx,%eax rorl $7,%edi movl %edx,%esi xorl %eax,%ebp roll $5,%edx addl %ebp,%ecx xorl %edi,%esi xorl %eax,%edi addl %edx,%ecx addl (%esp),%ebx pxor %xmm4,%xmm0 punpcklqdq %xmm7,%xmm2 andl %edi,%esi xorl %eax,%edi rorl $7,%edx pxor %xmm1,%xmm0 movdqa %xmm4,64(%esp) movl %ecx,%ebp xorl %edi,%esi roll $5,%ecx movdqa %xmm3,%xmm4 addl %esi,%ebx paddd %xmm7,%xmm3 xorl %edx,%ebp pxor %xmm2,%xmm0 xorl %edi,%edx addl %ecx,%ebx addl 4(%esp),%eax andl %edx,%ebp movdqa %xmm0,%xmm2 movdqa %xmm3,48(%esp) xorl %edi,%edx rorl $7,%ecx movl %ebx,%esi xorl %edx,%ebp roll $5,%ebx pslld $2,%xmm0 addl %ebp,%eax xorl %ecx,%esi psrld $30,%xmm2 xorl %edx,%ecx addl %ebx,%eax addl 8(%esp),%edi andl %ecx,%esi xorl %edx,%ecx rorl $7,%ebx por %xmm2,%xmm0 movl %eax,%ebp xorl %ecx,%esi movdqa 80(%esp),%xmm2 roll $5,%eax addl %esi,%edi xorl %ebx,%ebp xorl %ecx,%ebx addl %eax,%edi pshufd $238,%xmm7,%xmm3 addl 12(%esp),%edx andl %ebx,%ebp xorl %ecx,%ebx rorl $7,%eax movl %edi,%esi xorl %ebx,%ebp roll $5,%edi addl %ebp,%edx xorl %eax,%esi xorl %ebx,%eax addl %edi,%edx addl 16(%esp),%ecx pxor %xmm5,%xmm1 punpcklqdq %xmm0,%xmm3 andl %eax,%esi xorl %ebx,%eax rorl $7,%edi pxor %xmm2,%xmm1 movdqa %xmm5,80(%esp) movl %edx,%ebp xorl %eax,%esi roll $5,%edx movdqa %xmm4,%xmm5 addl %esi,%ecx paddd %xmm0,%xmm4 xorl %edi,%ebp pxor %xmm3,%xmm1 xorl %eax,%edi addl %edx,%ecx addl 20(%esp),%ebx andl %edi,%ebp movdqa %xmm1,%xmm3 movdqa %xmm4,(%esp) xorl %eax,%edi rorl $7,%edx movl %ecx,%esi xorl %edi,%ebp roll $5,%ecx pslld $2,%xmm1 addl %ebp,%ebx xorl %edx,%esi psrld $30,%xmm3 xorl %edi,%edx addl %ecx,%ebx addl 24(%esp),%eax andl %edx,%esi xorl %edi,%edx rorl $7,%ecx por %xmm3,%xmm1 movl 
%ebx,%ebp xorl %edx,%esi movdqa 96(%esp),%xmm3 roll $5,%ebx addl %esi,%eax xorl %ecx,%ebp xorl %edx,%ecx addl %ebx,%eax pshufd $238,%xmm0,%xmm4 addl 28(%esp),%edi andl %ecx,%ebp xorl %edx,%ecx rorl $7,%ebx movl %eax,%esi xorl %ecx,%ebp roll $5,%eax addl %ebp,%edi xorl %ebx,%esi xorl %ecx,%ebx addl %eax,%edi addl 32(%esp),%edx pxor %xmm6,%xmm2 punpcklqdq %xmm1,%xmm4 andl %ebx,%esi xorl %ecx,%ebx rorl $7,%eax pxor %xmm3,%xmm2 movdqa %xmm6,96(%esp) movl %edi,%ebp xorl %ebx,%esi roll $5,%edi movdqa %xmm5,%xmm6 addl %esi,%edx paddd %xmm1,%xmm5 xorl %eax,%ebp pxor %xmm4,%xmm2 xorl %ebx,%eax addl %edi,%edx addl 36(%esp),%ecx andl %eax,%ebp movdqa %xmm2,%xmm4 movdqa %xmm5,16(%esp) xorl %ebx,%eax rorl $7,%edi movl %edx,%esi xorl %eax,%ebp roll $5,%edx pslld $2,%xmm2 addl %ebp,%ecx xorl %edi,%esi psrld $30,%xmm4 xorl %eax,%edi addl %edx,%ecx addl 40(%esp),%ebx andl %edi,%esi xorl %eax,%edi rorl $7,%edx por %xmm4,%xmm2 movl %ecx,%ebp xorl %edi,%esi movdqa 64(%esp),%xmm4 roll $5,%ecx addl %esi,%ebx xorl %edx,%ebp xorl %edi,%edx addl %ecx,%ebx pshufd $238,%xmm1,%xmm5 addl 44(%esp),%eax andl %edx,%ebp xorl %edi,%edx rorl $7,%ecx movl %ebx,%esi xorl %edx,%ebp roll $5,%ebx addl %ebp,%eax xorl %edx,%esi addl %ebx,%eax addl 48(%esp),%edi pxor %xmm7,%xmm3 punpcklqdq %xmm2,%xmm5 xorl %ecx,%esi movl %eax,%ebp roll $5,%eax pxor %xmm4,%xmm3 movdqa %xmm7,64(%esp) addl %esi,%edi xorl %ecx,%ebp movdqa %xmm6,%xmm7 rorl $7,%ebx paddd %xmm2,%xmm6 addl %eax,%edi pxor %xmm5,%xmm3 addl 52(%esp),%edx xorl %ebx,%ebp movl %edi,%esi roll $5,%edi movdqa %xmm3,%xmm5 movdqa %xmm6,32(%esp) addl %ebp,%edx xorl %ebx,%esi rorl $7,%eax addl %edi,%edx pslld $2,%xmm3 addl 56(%esp),%ecx xorl %eax,%esi psrld $30,%xmm5 movl %edx,%ebp roll $5,%edx addl %esi,%ecx xorl %eax,%ebp rorl $7,%edi addl %edx,%ecx por %xmm5,%xmm3 addl 60(%esp),%ebx xorl %edi,%ebp movl %ecx,%esi roll $5,%ecx addl %ebp,%ebx xorl %edi,%esi rorl $7,%edx addl %ecx,%ebx addl (%esp),%eax xorl %edx,%esi movl %ebx,%ebp roll $5,%ebx addl %esi,%eax xorl %edx,%ebp rorl $7,%ecx paddd %xmm3,%xmm7 addl %ebx,%eax addl 4(%esp),%edi xorl %ecx,%ebp movl %eax,%esi movdqa %xmm7,48(%esp) roll $5,%eax addl %ebp,%edi xorl %ecx,%esi rorl $7,%ebx addl %eax,%edi addl 8(%esp),%edx xorl %ebx,%esi movl %edi,%ebp roll $5,%edi addl %esi,%edx xorl %ebx,%ebp rorl $7,%eax addl %edi,%edx addl 12(%esp),%ecx xorl %eax,%ebp movl %edx,%esi roll $5,%edx addl %ebp,%ecx xorl %eax,%esi rorl $7,%edi addl %edx,%ecx movl 196(%esp),%ebp cmpl 200(%esp),%ebp je .L007done movdqa 160(%esp),%xmm7 movdqa 176(%esp),%xmm6 movdqu (%ebp),%xmm0 movdqu 16(%ebp),%xmm1 movdqu 32(%ebp),%xmm2 movdqu 48(%ebp),%xmm3 addl $64,%ebp .byte 102,15,56,0,198 movl %ebp,196(%esp) movdqa %xmm7,96(%esp) addl 16(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp roll $5,%ecx addl %esi,%ebx xorl %edi,%ebp rorl $7,%edx .byte 102,15,56,0,206 addl %ecx,%ebx addl 20(%esp),%eax xorl %edx,%ebp movl %ebx,%esi paddd %xmm7,%xmm0 roll $5,%ebx addl %ebp,%eax xorl %edx,%esi rorl $7,%ecx movdqa %xmm0,(%esp) addl %ebx,%eax addl 24(%esp),%edi xorl %ecx,%esi movl %eax,%ebp psubd %xmm7,%xmm0 roll $5,%eax addl %esi,%edi xorl %ecx,%ebp rorl $7,%ebx addl %eax,%edi addl 28(%esp),%edx xorl %ebx,%ebp movl %edi,%esi roll $5,%edi addl %ebp,%edx xorl %ebx,%esi rorl $7,%eax addl %edi,%edx addl 32(%esp),%ecx xorl %eax,%esi movl %edx,%ebp roll $5,%edx addl %esi,%ecx xorl %eax,%ebp rorl $7,%edi .byte 102,15,56,0,214 addl %edx,%ecx addl 36(%esp),%ebx xorl %edi,%ebp movl %ecx,%esi paddd %xmm7,%xmm1 roll $5,%ecx addl %ebp,%ebx xorl %edi,%esi rorl $7,%edx movdqa %xmm1,16(%esp) addl 
%ecx,%ebx addl 40(%esp),%eax xorl %edx,%esi movl %ebx,%ebp psubd %xmm7,%xmm1 roll $5,%ebx addl %esi,%eax xorl %edx,%ebp rorl $7,%ecx addl %ebx,%eax addl 44(%esp),%edi xorl %ecx,%ebp movl %eax,%esi roll $5,%eax addl %ebp,%edi xorl %ecx,%esi rorl $7,%ebx addl %eax,%edi addl 48(%esp),%edx xorl %ebx,%esi movl %edi,%ebp roll $5,%edi addl %esi,%edx xorl %ebx,%ebp rorl $7,%eax .byte 102,15,56,0,222 addl %edi,%edx addl 52(%esp),%ecx xorl %eax,%ebp movl %edx,%esi paddd %xmm7,%xmm2 roll $5,%edx addl %ebp,%ecx xorl %eax,%esi rorl $7,%edi movdqa %xmm2,32(%esp) addl %edx,%ecx addl 56(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp psubd %xmm7,%xmm2 roll $5,%ecx addl %esi,%ebx xorl %edi,%ebp rorl $7,%edx addl %ecx,%ebx addl 60(%esp),%eax xorl %edx,%ebp movl %ebx,%esi roll $5,%ebx addl %ebp,%eax rorl $7,%ecx addl %ebx,%eax movl 192(%esp),%ebp addl (%ebp),%eax addl 4(%ebp),%esi addl 8(%ebp),%ecx movl %eax,(%ebp) addl 12(%ebp),%edx movl %esi,4(%ebp) addl 16(%ebp),%edi movl %ecx,8(%ebp) movl %ecx,%ebx movl %edx,12(%ebp) xorl %edx,%ebx movl %edi,16(%ebp) movl %esi,%ebp pshufd $238,%xmm0,%xmm4 andl %ebx,%esi movl %ebp,%ebx jmp .L006loop .align 16 .L007done: addl 16(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp roll $5,%ecx addl %esi,%ebx xorl %edi,%ebp rorl $7,%edx addl %ecx,%ebx addl 20(%esp),%eax xorl %edx,%ebp movl %ebx,%esi roll $5,%ebx addl %ebp,%eax xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax addl 24(%esp),%edi xorl %ecx,%esi movl %eax,%ebp roll $5,%eax addl %esi,%edi xorl %ecx,%ebp rorl $7,%ebx addl %eax,%edi addl 28(%esp),%edx xorl %ebx,%ebp movl %edi,%esi roll $5,%edi addl %ebp,%edx xorl %ebx,%esi rorl $7,%eax addl %edi,%edx addl 32(%esp),%ecx xorl %eax,%esi movl %edx,%ebp roll $5,%edx addl %esi,%ecx xorl %eax,%ebp rorl $7,%edi addl %edx,%ecx addl 36(%esp),%ebx xorl %edi,%ebp movl %ecx,%esi roll $5,%ecx addl %ebp,%ebx xorl %edi,%esi rorl $7,%edx addl %ecx,%ebx addl 40(%esp),%eax xorl %edx,%esi movl %ebx,%ebp roll $5,%ebx addl %esi,%eax xorl %edx,%ebp rorl $7,%ecx addl %ebx,%eax addl 44(%esp),%edi xorl %ecx,%ebp movl %eax,%esi roll $5,%eax addl %ebp,%edi xorl %ecx,%esi rorl $7,%ebx addl %eax,%edi addl 48(%esp),%edx xorl %ebx,%esi movl %edi,%ebp roll $5,%edi addl %esi,%edx xorl %ebx,%ebp rorl $7,%eax addl %edi,%edx addl 52(%esp),%ecx xorl %eax,%ebp movl %edx,%esi roll $5,%edx addl %ebp,%ecx xorl %eax,%esi rorl $7,%edi addl %edx,%ecx addl 56(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp roll $5,%ecx addl %esi,%ebx xorl %edi,%ebp rorl $7,%edx addl %ecx,%ebx addl 60(%esp),%eax xorl %edx,%ebp movl %ebx,%esi roll $5,%ebx addl %ebp,%eax rorl $7,%ecx addl %ebx,%eax movl 192(%esp),%ebp addl (%ebp),%eax movl 204(%esp),%esp addl 4(%ebp),%esi addl 8(%ebp),%ecx movl %eax,(%ebp) addl 12(%ebp),%edx movl %esi,4(%ebp) addl 16(%ebp),%edi movl %ecx,8(%ebp) movl %edx,12(%ebp) movl %edi,16(%ebp) popl %edi popl %esi popl %ebx popl %ebp ret .size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3 +.type _sha1_block_data_order_avx,@function +.align 16 +_sha1_block_data_order_avx: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .L008pic_point +.L008pic_point: + popl %ebp + leal .LK_XX_XX-.L008pic_point(%ebp),%ebp +.Lavx_shortcut: + vzeroall + vmovdqa (%ebp),%xmm7 + vmovdqa 16(%ebp),%xmm0 + vmovdqa 32(%ebp),%xmm1 + vmovdqa 48(%ebp),%xmm2 + vmovdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + vmovdqa %xmm0,112(%esp) + vmovdqa %xmm1,128(%esp) + vmovdqa %xmm2,144(%esp) + shll $6,%edx + vmovdqa %xmm7,160(%esp) + addl %ebp,%edx + vmovdqa 
%xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + vmovdqu -64(%ebp),%xmm0 + vmovdqu -48(%ebp),%xmm1 + vmovdqu -32(%ebp),%xmm2 + vmovdqu -16(%ebp),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vmovdqa %xmm7,96(%esp) + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm7,%xmm0,%xmm4 + vpaddd %xmm7,%xmm1,%xmm5 + vpaddd %xmm7,%xmm2,%xmm6 + vmovdqa %xmm4,(%esp) + movl %ecx,%ebp + vmovdqa %xmm5,16(%esp) + xorl %edx,%ebp + vmovdqa %xmm6,32(%esp) + andl %ebp,%esi + jmp .L009loop +.align 16 +.L009loop: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%ebp + addl (%esp),%edi + vpaddd %xmm3,%xmm7,%xmm7 + vmovdqa %xmm0,64(%esp) + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%edi + vpxor %xmm2,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vmovdqa %xmm7,48(%esp) + movl %edi,%esi + addl 4(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%edi,%edi + addl %ebp,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm6 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm0 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrld $30,%xmm0,%xmm7 + vpor %xmm6,%xmm4,%xmm4 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + vpslld $2,%xmm0,%xmm0 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vpxor %xmm7,%xmm4,%xmm4 + movl %ecx,%esi + addl 12(%esp),%ebx + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpxor %xmm0,%xmm4,%xmm4 + addl %ebp,%ebx + andl %edx,%esi + vmovdqa 96(%esp),%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%ebp + addl 16(%esp),%eax + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqa %xmm1,80(%esp) + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vmovdqa %xmm0,(%esp) + movl %eax,%esi + addl 20(%esp),%edi + vpxor %xmm7,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %ebp,%edi + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm7 + xorl %ecx,%ebx + addl %eax,%edi + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm1 + vpaddd %xmm5,%xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm0 + vpor %xmm7,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpxor %xmm0,%xmm5,%xmm5 + movl %edx,%esi + addl 28(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpxor %xmm1,%xmm5,%xmm5 + addl %ebp,%ecx + andl %edi,%esi + vmovdqa 112(%esp),%xmm1 + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%ebp + addl 32(%esp),%ebx + vpaddd %xmm5,%xmm1,%xmm1 + vmovdqa %xmm2,96(%esp) + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + vpxor %xmm2,%xmm6,%xmm6 + xorl %edi,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%ecx,%ecx + xorl %edi,%ebp + vmovdqa %xmm1,16(%esp) + movl %ebx,%esi + addl 36(%esp),%eax + vpxor %xmm0,%xmm6,%xmm6 + xorl %edx,%ecx + shldl 
$5,%ebx,%ebx + addl %ebp,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm2 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm1 + vpor %xmm0,%xmm6,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + vmovdqa 64(%esp),%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vpxor %xmm1,%xmm6,%xmm6 + movl %edi,%esi + addl 44(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpxor %xmm2,%xmm6,%xmm6 + addl %ebp,%edx + andl %eax,%esi + vmovdqa 112(%esp),%xmm2 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%ebp + addl 48(%esp),%ecx + vpaddd %xmm6,%xmm2,%xmm2 + vmovdqa %xmm3,64(%esp) + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm1 + addl %esi,%ecx + andl %edi,%ebp + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%edi + addl %edx,%ecx + vpxor %xmm5,%xmm1,%xmm1 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vmovdqa %xmm2,32(%esp) + movl %ecx,%esi + addl 52(%esp),%ebx + vpxor %xmm1,%xmm7,%xmm7 + xorl %edi,%edx + shldl $5,%ecx,%ecx + addl %ebp,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm1 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpslldq $12,%xmm7,%xmm3 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm2 + vpor %xmm1,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + vmovdqa 80(%esp),%xmm1 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vpxor %xmm2,%xmm7,%xmm7 + movl %eax,%esi + addl 60(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpxor %xmm3,%xmm7,%xmm7 + addl %ebp,%edi + andl %ebx,%esi + vmovdqa 112(%esp),%xmm3 + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,80(%esp) + xorl %ebx,%eax + shldl $5,%edi,%edi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + addl %esi,%edx + andl %eax,%ebp + vpxor %xmm2,%xmm0,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + vpor %xmm2,%xmm0,%xmm0 + xorl %edi,%edx + shldl $5,%ecx,%ecx + vmovdqa 96(%esp),%xmm2 + addl %esi,%ebx + andl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm3,%xmm1,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp 
+ shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm3,%xmm1,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + vmovdqa 64(%esp),%xmm3 + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + vmovdqa 128(%esp),%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm4,%xmm2,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vpor %xmm4,%xmm2,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + vmovdqa 80(%esp),%xmm4 + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + vmovdqa 96(%esp),%xmm5 + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpalignr $8,%xmm2,%xmm3,%xmm6 + vpxor %xmm0,%xmm4,%xmm4 + addl (%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + vmovdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + vmovdqa %xmm7,%xmm0 + vpaddd %xmm3,%xmm7,%xmm7 + shrdl $7,%edi,%edi + addl %edx,%ecx + vpxor %xmm6,%xmm4,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm6 + vmovdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm6,%xmm4,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + vmovdqa 64(%esp),%xmm6 + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpalignr $8,%xmm3,%xmm4,%xmm7 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + vpxor %xmm6,%xmm5,%xmm5 + vmovdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + vmovdqa %xmm0,%xmm1 + vpaddd %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + addl %edi,%edx + vpxor %xmm7,%xmm5,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm7 + vmovdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl 
$5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm7,%xmm5,%xmm5 + addl 28(%esp),%eax + vmovdqa 80(%esp),%xmm7 + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + vmovdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + vmovdqa %xmm1,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + shldl $5,%eax,%eax + addl %esi,%edi + vpxor %xmm0,%xmm6,%xmm6 + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + vpsrld $30,%xmm6,%xmm0 + vmovdqa %xmm1,16(%esp) + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + vpor %xmm0,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%edi,%edi + vmovdqa 96(%esp),%xmm0 + movl %edx,%ebp + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm1 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + vmovdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + vmovdqa 144(%esp),%xmm3 + vpaddd %xmm6,%xmm2,%xmm2 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + vpsrld $30,%xmm7,%xmm1 + vmovdqa %xmm2,32(%esp) + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + vpor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vmovdqa 64(%esp),%xmm1 + movl %edi,%ebp + xorl %ebx,%esi + shldl $5,%edi,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + addl (%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm2,%xmm0,%xmm0 + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + vpor %xmm2,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vmovdqa 80(%esp),%xmm2 + movl %eax,%ebp + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl 
%ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%edi,%edi + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm3,%xmm1,%xmm1 + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + vpor %xmm3,%xmm1,%xmm1 + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vmovdqa 96(%esp),%xmm3 + movl %ebx,%ebp + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + vmovdqa %xmm5,%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shldl $5,%edi,%edi + addl %esi,%edx + vpxor %xmm4,%xmm2,%xmm2 + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + vpor %xmm4,%xmm2,%xmm2 + xorl %eax,%edi + shrdl $7,%edx,%edx + vmovdqa 64(%esp),%xmm4 + movl %ecx,%ebp + xorl %edi,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl (%esp),%eax + vpaddd %xmm3,%xmm7,%xmm7 + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm7,48(%esp) + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl 
%esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je .L010done + vmovdqa 160(%esp),%xmm7 + vmovdqa 176(%esp),%xmm6 + vmovdqu (%ebp),%xmm0 + vmovdqu 16(%ebp),%xmm1 + vmovdqu 32(%ebp),%xmm2 + vmovdqu 48(%ebp),%xmm3 + addl $64,%ebp + vpshufb %xmm6,%xmm0,%xmm0 + movl %ebp,196(%esp) + vmovdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpaddd %xmm7,%xmm0,%xmm4 + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,(%esp) + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%ebp + shldl $5,%edx,%edx + vpaddd %xmm7,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vmovdqa %xmm5,16(%esp) + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %edi,%ebp + shldl $5,%edi,%edi + vpaddd %xmm7,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vmovdqa %xmm6,32(%esp) + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,%ebx + movl %ecx,8(%ebp) + xorl %edx,%ebx + movl %edx,12(%ebp) + movl %edi,16(%ebp) + movl %esi,%ebp + andl %ebx,%esi + movl %ebp,%ebx + jmp .L009loop +.align 16 +.L010done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl 
%eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroall + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx .align 64 .LK_XX_XX: .long 1518500249,1518500249,1518500249,1518500249 .long 1859775393,1859775393,1859775393,1859775393 .long 2400959708,2400959708,2400959708,2400959708 .long 3395469782,3395469782,3395469782,3395469782 .long 66051,67438087,134810123,202182159 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 .byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 .byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 #else .text .globl sha1_block_data_order .type sha1_block_data_order,@function .align 16 sha1_block_data_order: .L_sha1_block_data_order_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi call .L000pic_point .L000pic_point: popl %ebp leal OPENSSL_ia32cap_P,%esi leal .LK_XX_XX-.L000pic_point(%ebp),%ebp movl (%esi),%eax movl 4(%esi),%edx testl $512,%edx jz .L001x86 movl 8(%esi),%ecx testl $16777216,%eax jz .L001x86 testl $536870912,%ecx jnz .Lshaext_shortcut + andl $268435456,%edx + andl $1073741824,%eax + orl %edx,%eax + cmpl $1342177280,%eax + je .Lavx_shortcut jmp .Lssse3_shortcut .align 16 .L001x86: movl 20(%esp),%ebp movl 24(%esp),%esi movl 28(%esp),%eax subl $76,%esp shll $6,%eax addl %esi,%eax movl %eax,104(%esp) movl 16(%ebp),%edi jmp .L002loop .align 16 .L002loop: movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edx bswap %eax bswap %ebx bswap %ecx bswap %edx movl %eax,(%esp) movl %ebx,4(%esp) movl %ecx,8(%esp) movl %edx,12(%esp) movl 16(%esi),%eax movl 20(%esi),%ebx movl 24(%esi),%ecx movl 28(%esi),%edx bswap %eax bswap %ebx bswap %ecx bswap %edx movl %eax,16(%esp) movl %ebx,20(%esp) movl %ecx,24(%esp) movl %edx,28(%esp) movl 32(%esi),%eax movl 36(%esi),%ebx movl 40(%esi),%ecx movl 44(%esi),%edx bswap %eax bswap %ebx bswap %ecx bswap %edx movl %eax,32(%esp) movl %ebx,36(%esp) movl %ecx,40(%esp) movl %edx,44(%esp) movl 48(%esi),%eax movl 52(%esi),%ebx movl 56(%esi),%ecx movl 60(%esi),%edx bswap %eax bswap %ebx bswap %ecx bswap %edx movl %eax,48(%esp) movl %ebx,52(%esp) movl 
%ecx,56(%esp) movl %edx,60(%esp) movl %esi,100(%esp) movl (%ebp),%eax movl 4(%ebp),%ebx movl 8(%ebp),%ecx movl 12(%ebp),%edx movl %ecx,%esi movl %eax,%ebp roll $5,%ebp xorl %edx,%esi addl %edi,%ebp movl (%esp),%edi andl %ebx,%esi rorl $2,%ebx xorl %edx,%esi leal 1518500249(%ebp,%edi,1),%ebp addl %esi,%ebp movl %ebx,%edi movl %ebp,%esi roll $5,%ebp xorl %ecx,%edi addl %edx,%ebp movl 4(%esp),%edx andl %eax,%edi rorl $2,%eax xorl %ecx,%edi leal 1518500249(%ebp,%edx,1),%ebp addl %edi,%ebp movl %eax,%edx movl %ebp,%edi roll $5,%ebp xorl %ebx,%edx addl %ecx,%ebp movl 8(%esp),%ecx andl %esi,%edx rorl $2,%esi xorl %ebx,%edx leal 1518500249(%ebp,%ecx,1),%ebp addl %edx,%ebp movl %esi,%ecx movl %ebp,%edx roll $5,%ebp xorl %eax,%ecx addl %ebx,%ebp movl 12(%esp),%ebx andl %edi,%ecx rorl $2,%edi xorl %eax,%ecx leal 1518500249(%ebp,%ebx,1),%ebp addl %ecx,%ebp movl %edi,%ebx movl %ebp,%ecx roll $5,%ebp xorl %esi,%ebx addl %eax,%ebp movl 16(%esp),%eax andl %edx,%ebx rorl $2,%edx xorl %esi,%ebx leal 1518500249(%ebp,%eax,1),%ebp addl %ebx,%ebp movl %edx,%eax movl %ebp,%ebx roll $5,%ebp xorl %edi,%eax addl %esi,%ebp movl 20(%esp),%esi andl %ecx,%eax rorl $2,%ecx xorl %edi,%eax leal 1518500249(%ebp,%esi,1),%ebp addl %eax,%ebp movl %ecx,%esi movl %ebp,%eax roll $5,%ebp xorl %edx,%esi addl %edi,%ebp movl 24(%esp),%edi andl %ebx,%esi rorl $2,%ebx xorl %edx,%esi leal 1518500249(%ebp,%edi,1),%ebp addl %esi,%ebp movl %ebx,%edi movl %ebp,%esi roll $5,%ebp xorl %ecx,%edi addl %edx,%ebp movl 28(%esp),%edx andl %eax,%edi rorl $2,%eax xorl %ecx,%edi leal 1518500249(%ebp,%edx,1),%ebp addl %edi,%ebp movl %eax,%edx movl %ebp,%edi roll $5,%ebp xorl %ebx,%edx addl %ecx,%ebp movl 32(%esp),%ecx andl %esi,%edx rorl $2,%esi xorl %ebx,%edx leal 1518500249(%ebp,%ecx,1),%ebp addl %edx,%ebp movl %esi,%ecx movl %ebp,%edx roll $5,%ebp xorl %eax,%ecx addl %ebx,%ebp movl 36(%esp),%ebx andl %edi,%ecx rorl $2,%edi xorl %eax,%ecx leal 1518500249(%ebp,%ebx,1),%ebp addl %ecx,%ebp movl %edi,%ebx movl %ebp,%ecx roll $5,%ebp xorl %esi,%ebx addl %eax,%ebp movl 40(%esp),%eax andl %edx,%ebx rorl $2,%edx xorl %esi,%ebx leal 1518500249(%ebp,%eax,1),%ebp addl %ebx,%ebp movl %edx,%eax movl %ebp,%ebx roll $5,%ebp xorl %edi,%eax addl %esi,%ebp movl 44(%esp),%esi andl %ecx,%eax rorl $2,%ecx xorl %edi,%eax leal 1518500249(%ebp,%esi,1),%ebp addl %eax,%ebp movl %ecx,%esi movl %ebp,%eax roll $5,%ebp xorl %edx,%esi addl %edi,%ebp movl 48(%esp),%edi andl %ebx,%esi rorl $2,%ebx xorl %edx,%esi leal 1518500249(%ebp,%edi,1),%ebp addl %esi,%ebp movl %ebx,%edi movl %ebp,%esi roll $5,%ebp xorl %ecx,%edi addl %edx,%ebp movl 52(%esp),%edx andl %eax,%edi rorl $2,%eax xorl %ecx,%edi leal 1518500249(%ebp,%edx,1),%ebp addl %edi,%ebp movl %eax,%edx movl %ebp,%edi roll $5,%ebp xorl %ebx,%edx addl %ecx,%ebp movl 56(%esp),%ecx andl %esi,%edx rorl $2,%esi xorl %ebx,%edx leal 1518500249(%ebp,%ecx,1),%ebp addl %edx,%ebp movl %esi,%ecx movl %ebp,%edx roll $5,%ebp xorl %eax,%ecx addl %ebx,%ebp movl 60(%esp),%ebx andl %edi,%ecx rorl $2,%edi xorl %eax,%ecx leal 1518500249(%ebp,%ebx,1),%ebp movl (%esp),%ebx addl %ebp,%ecx movl %edi,%ebp xorl 8(%esp),%ebx xorl %esi,%ebp xorl 32(%esp),%ebx andl %edx,%ebp xorl 52(%esp),%ebx roll $1,%ebx xorl %esi,%ebp addl %ebp,%eax movl %ecx,%ebp rorl $2,%edx movl %ebx,(%esp) roll $5,%ebp leal 1518500249(%ebx,%eax,1),%ebx movl 4(%esp),%eax addl %ebp,%ebx movl %edx,%ebp xorl 12(%esp),%eax xorl %edi,%ebp xorl 36(%esp),%eax andl %ecx,%ebp xorl 56(%esp),%eax roll $1,%eax xorl %edi,%ebp addl %ebp,%esi movl %ebx,%ebp rorl $2,%ecx movl %eax,4(%esp) roll $5,%ebp 
leal 1518500249(%eax,%esi,1),%eax movl 8(%esp),%esi addl %ebp,%eax movl %ecx,%ebp xorl 16(%esp),%esi xorl %edx,%ebp xorl 40(%esp),%esi andl %ebx,%ebp xorl 60(%esp),%esi roll $1,%esi xorl %edx,%ebp addl %ebp,%edi movl %eax,%ebp rorl $2,%ebx movl %esi,8(%esp) roll $5,%ebp leal 1518500249(%esi,%edi,1),%esi movl 12(%esp),%edi addl %ebp,%esi movl %ebx,%ebp xorl 20(%esp),%edi xorl %ecx,%ebp xorl 44(%esp),%edi andl %eax,%ebp xorl (%esp),%edi roll $1,%edi xorl %ecx,%ebp addl %ebp,%edx movl %esi,%ebp rorl $2,%eax movl %edi,12(%esp) roll $5,%ebp leal 1518500249(%edi,%edx,1),%edi movl 16(%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl 24(%esp),%edx xorl %eax,%ebp xorl 48(%esp),%edx xorl %ebx,%ebp xorl 4(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,16(%esp) leal 1859775393(%edx,%ecx,1),%edx movl 20(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 28(%esp),%ecx xorl %esi,%ebp xorl 52(%esp),%ecx xorl %eax,%ebp xorl 8(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,20(%esp) leal 1859775393(%ecx,%ebx,1),%ecx movl 24(%esp),%ebx addl %ebp,%ecx movl %edx,%ebp xorl 32(%esp),%ebx xorl %edi,%ebp xorl 56(%esp),%ebx xorl %esi,%ebp xorl 12(%esp),%ebx roll $1,%ebx addl %ebp,%eax rorl $2,%edx movl %ecx,%ebp roll $5,%ebp movl %ebx,24(%esp) leal 1859775393(%ebx,%eax,1),%ebx movl 28(%esp),%eax addl %ebp,%ebx movl %ecx,%ebp xorl 36(%esp),%eax xorl %edx,%ebp xorl 60(%esp),%eax xorl %edi,%ebp xorl 16(%esp),%eax roll $1,%eax addl %ebp,%esi rorl $2,%ecx movl %ebx,%ebp roll $5,%ebp movl %eax,28(%esp) leal 1859775393(%eax,%esi,1),%eax movl 32(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl 40(%esp),%esi xorl %ecx,%ebp xorl (%esp),%esi xorl %edx,%ebp xorl 20(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp movl %esi,32(%esp) leal 1859775393(%esi,%edi,1),%esi movl 36(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 44(%esp),%edi xorl %ebx,%ebp xorl 4(%esp),%edi xorl %ecx,%ebp xorl 24(%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp movl %edi,36(%esp) leal 1859775393(%edi,%edx,1),%edi movl 40(%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl 48(%esp),%edx xorl %eax,%ebp xorl 8(%esp),%edx xorl %ebx,%ebp xorl 28(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,40(%esp) leal 1859775393(%edx,%ecx,1),%edx movl 44(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 52(%esp),%ecx xorl %esi,%ebp xorl 12(%esp),%ecx xorl %eax,%ebp xorl 32(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,44(%esp) leal 1859775393(%ecx,%ebx,1),%ecx movl 48(%esp),%ebx addl %ebp,%ecx movl %edx,%ebp xorl 56(%esp),%ebx xorl %edi,%ebp xorl 16(%esp),%ebx xorl %esi,%ebp xorl 36(%esp),%ebx roll $1,%ebx addl %ebp,%eax rorl $2,%edx movl %ecx,%ebp roll $5,%ebp movl %ebx,48(%esp) leal 1859775393(%ebx,%eax,1),%ebx movl 52(%esp),%eax addl %ebp,%ebx movl %ecx,%ebp xorl 60(%esp),%eax xorl %edx,%ebp xorl 20(%esp),%eax xorl %edi,%ebp xorl 40(%esp),%eax roll $1,%eax addl %ebp,%esi rorl $2,%ecx movl %ebx,%ebp roll $5,%ebp movl %eax,52(%esp) leal 1859775393(%eax,%esi,1),%eax movl 56(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl (%esp),%esi xorl %ecx,%ebp xorl 24(%esp),%esi xorl %edx,%ebp xorl 44(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp movl %esi,56(%esp) leal 1859775393(%esi,%edi,1),%esi movl 60(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 4(%esp),%edi xorl %ebx,%ebp xorl 28(%esp),%edi xorl %ecx,%ebp xorl 48(%esp),%edi roll $1,%edi 
addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp movl %edi,60(%esp) leal 1859775393(%edi,%edx,1),%edi movl (%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl 8(%esp),%edx xorl %eax,%ebp xorl 32(%esp),%edx xorl %ebx,%ebp xorl 52(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,(%esp) leal 1859775393(%edx,%ecx,1),%edx movl 4(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 12(%esp),%ecx xorl %esi,%ebp xorl 36(%esp),%ecx xorl %eax,%ebp xorl 56(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,4(%esp) leal 1859775393(%ecx,%ebx,1),%ecx movl 8(%esp),%ebx addl %ebp,%ecx movl %edx,%ebp xorl 16(%esp),%ebx xorl %edi,%ebp xorl 40(%esp),%ebx xorl %esi,%ebp xorl 60(%esp),%ebx roll $1,%ebx addl %ebp,%eax rorl $2,%edx movl %ecx,%ebp roll $5,%ebp movl %ebx,8(%esp) leal 1859775393(%ebx,%eax,1),%ebx movl 12(%esp),%eax addl %ebp,%ebx movl %ecx,%ebp xorl 20(%esp),%eax xorl %edx,%ebp xorl 44(%esp),%eax xorl %edi,%ebp xorl (%esp),%eax roll $1,%eax addl %ebp,%esi rorl $2,%ecx movl %ebx,%ebp roll $5,%ebp movl %eax,12(%esp) leal 1859775393(%eax,%esi,1),%eax movl 16(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl 24(%esp),%esi xorl %ecx,%ebp xorl 48(%esp),%esi xorl %edx,%ebp xorl 4(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp movl %esi,16(%esp) leal 1859775393(%esi,%edi,1),%esi movl 20(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 28(%esp),%edi xorl %ebx,%ebp xorl 52(%esp),%edi xorl %ecx,%ebp xorl 8(%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp movl %edi,20(%esp) leal 1859775393(%edi,%edx,1),%edi movl 24(%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl 32(%esp),%edx xorl %eax,%ebp xorl 56(%esp),%edx xorl %ebx,%ebp xorl 12(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,24(%esp) leal 1859775393(%edx,%ecx,1),%edx movl 28(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 36(%esp),%ecx xorl %esi,%ebp xorl 60(%esp),%ecx xorl %eax,%ebp xorl 16(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,28(%esp) leal 1859775393(%ecx,%ebx,1),%ecx movl 32(%esp),%ebx addl %ebp,%ecx movl %edi,%ebp xorl 40(%esp),%ebx xorl %esi,%ebp xorl (%esp),%ebx andl %edx,%ebp xorl 20(%esp),%ebx roll $1,%ebx addl %eax,%ebp rorl $2,%edx movl %ecx,%eax roll $5,%eax movl %ebx,32(%esp) leal 2400959708(%ebx,%ebp,1),%ebx movl %edi,%ebp addl %eax,%ebx andl %esi,%ebp movl 36(%esp),%eax addl %ebp,%ebx movl %edx,%ebp xorl 44(%esp),%eax xorl %edi,%ebp xorl 4(%esp),%eax andl %ecx,%ebp xorl 24(%esp),%eax roll $1,%eax addl %esi,%ebp rorl $2,%ecx movl %ebx,%esi roll $5,%esi movl %eax,36(%esp) leal 2400959708(%eax,%ebp,1),%eax movl %edx,%ebp addl %esi,%eax andl %edi,%ebp movl 40(%esp),%esi addl %ebp,%eax movl %ecx,%ebp xorl 48(%esp),%esi xorl %edx,%ebp xorl 8(%esp),%esi andl %ebx,%ebp xorl 28(%esp),%esi roll $1,%esi addl %edi,%ebp rorl $2,%ebx movl %eax,%edi roll $5,%edi movl %esi,40(%esp) leal 2400959708(%esi,%ebp,1),%esi movl %ecx,%ebp addl %edi,%esi andl %edx,%ebp movl 44(%esp),%edi addl %ebp,%esi movl %ebx,%ebp xorl 52(%esp),%edi xorl %ecx,%ebp xorl 12(%esp),%edi andl %eax,%ebp xorl 32(%esp),%edi roll $1,%edi addl %edx,%ebp rorl $2,%eax movl %esi,%edx roll $5,%edx movl %edi,44(%esp) leal 2400959708(%edi,%ebp,1),%edi movl %ebx,%ebp addl %edx,%edi andl %ecx,%ebp movl 48(%esp),%edx addl %ebp,%edi movl %eax,%ebp xorl 56(%esp),%edx xorl %ebx,%ebp xorl 16(%esp),%edx andl %esi,%ebp xorl 36(%esp),%edx roll $1,%edx addl %ecx,%ebp rorl $2,%esi movl 
%edi,%ecx roll $5,%ecx movl %edx,48(%esp) leal 2400959708(%edx,%ebp,1),%edx movl %eax,%ebp addl %ecx,%edx andl %ebx,%ebp movl 52(%esp),%ecx addl %ebp,%edx movl %esi,%ebp xorl 60(%esp),%ecx xorl %eax,%ebp xorl 20(%esp),%ecx andl %edi,%ebp xorl 40(%esp),%ecx roll $1,%ecx addl %ebx,%ebp rorl $2,%edi movl %edx,%ebx roll $5,%ebx movl %ecx,52(%esp) leal 2400959708(%ecx,%ebp,1),%ecx movl %esi,%ebp addl %ebx,%ecx andl %eax,%ebp movl 56(%esp),%ebx addl %ebp,%ecx movl %edi,%ebp xorl (%esp),%ebx xorl %esi,%ebp xorl 24(%esp),%ebx andl %edx,%ebp xorl 44(%esp),%ebx roll $1,%ebx addl %eax,%ebp rorl $2,%edx movl %ecx,%eax roll $5,%eax movl %ebx,56(%esp) leal 2400959708(%ebx,%ebp,1),%ebx movl %edi,%ebp addl %eax,%ebx andl %esi,%ebp movl 60(%esp),%eax addl %ebp,%ebx movl %edx,%ebp xorl 4(%esp),%eax xorl %edi,%ebp xorl 28(%esp),%eax andl %ecx,%ebp xorl 48(%esp),%eax roll $1,%eax addl %esi,%ebp rorl $2,%ecx movl %ebx,%esi roll $5,%esi movl %eax,60(%esp) leal 2400959708(%eax,%ebp,1),%eax movl %edx,%ebp addl %esi,%eax andl %edi,%ebp movl (%esp),%esi addl %ebp,%eax movl %ecx,%ebp xorl 8(%esp),%esi xorl %edx,%ebp xorl 32(%esp),%esi andl %ebx,%ebp xorl 52(%esp),%esi roll $1,%esi addl %edi,%ebp rorl $2,%ebx movl %eax,%edi roll $5,%edi movl %esi,(%esp) leal 2400959708(%esi,%ebp,1),%esi movl %ecx,%ebp addl %edi,%esi andl %edx,%ebp movl 4(%esp),%edi addl %ebp,%esi movl %ebx,%ebp xorl 12(%esp),%edi xorl %ecx,%ebp xorl 36(%esp),%edi andl %eax,%ebp xorl 56(%esp),%edi roll $1,%edi addl %edx,%ebp rorl $2,%eax movl %esi,%edx roll $5,%edx movl %edi,4(%esp) leal 2400959708(%edi,%ebp,1),%edi movl %ebx,%ebp addl %edx,%edi andl %ecx,%ebp movl 8(%esp),%edx addl %ebp,%edi movl %eax,%ebp xorl 16(%esp),%edx xorl %ebx,%ebp xorl 40(%esp),%edx andl %esi,%ebp xorl 60(%esp),%edx roll $1,%edx addl %ecx,%ebp rorl $2,%esi movl %edi,%ecx roll $5,%ecx movl %edx,8(%esp) leal 2400959708(%edx,%ebp,1),%edx movl %eax,%ebp addl %ecx,%edx andl %ebx,%ebp movl 12(%esp),%ecx addl %ebp,%edx movl %esi,%ebp xorl 20(%esp),%ecx xorl %eax,%ebp xorl 44(%esp),%ecx andl %edi,%ebp xorl (%esp),%ecx roll $1,%ecx addl %ebx,%ebp rorl $2,%edi movl %edx,%ebx roll $5,%ebx movl %ecx,12(%esp) leal 2400959708(%ecx,%ebp,1),%ecx movl %esi,%ebp addl %ebx,%ecx andl %eax,%ebp movl 16(%esp),%ebx addl %ebp,%ecx movl %edi,%ebp xorl 24(%esp),%ebx xorl %esi,%ebp xorl 48(%esp),%ebx andl %edx,%ebp xorl 4(%esp),%ebx roll $1,%ebx addl %eax,%ebp rorl $2,%edx movl %ecx,%eax roll $5,%eax movl %ebx,16(%esp) leal 2400959708(%ebx,%ebp,1),%ebx movl %edi,%ebp addl %eax,%ebx andl %esi,%ebp movl 20(%esp),%eax addl %ebp,%ebx movl %edx,%ebp xorl 28(%esp),%eax xorl %edi,%ebp xorl 52(%esp),%eax andl %ecx,%ebp xorl 8(%esp),%eax roll $1,%eax addl %esi,%ebp rorl $2,%ecx movl %ebx,%esi roll $5,%esi movl %eax,20(%esp) leal 2400959708(%eax,%ebp,1),%eax movl %edx,%ebp addl %esi,%eax andl %edi,%ebp movl 24(%esp),%esi addl %ebp,%eax movl %ecx,%ebp xorl 32(%esp),%esi xorl %edx,%ebp xorl 56(%esp),%esi andl %ebx,%ebp xorl 12(%esp),%esi roll $1,%esi addl %edi,%ebp rorl $2,%ebx movl %eax,%edi roll $5,%edi movl %esi,24(%esp) leal 2400959708(%esi,%ebp,1),%esi movl %ecx,%ebp addl %edi,%esi andl %edx,%ebp movl 28(%esp),%edi addl %ebp,%esi movl %ebx,%ebp xorl 36(%esp),%edi xorl %ecx,%ebp xorl 60(%esp),%edi andl %eax,%ebp xorl 16(%esp),%edi roll $1,%edi addl %edx,%ebp rorl $2,%eax movl %esi,%edx roll $5,%edx movl %edi,28(%esp) leal 2400959708(%edi,%ebp,1),%edi movl %ebx,%ebp addl %edx,%edi andl %ecx,%ebp movl 32(%esp),%edx addl %ebp,%edi movl %eax,%ebp xorl 40(%esp),%edx xorl %ebx,%ebp xorl (%esp),%edx andl %esi,%ebp 
xorl 20(%esp),%edx roll $1,%edx addl %ecx,%ebp rorl $2,%esi movl %edi,%ecx roll $5,%ecx movl %edx,32(%esp) leal 2400959708(%edx,%ebp,1),%edx movl %eax,%ebp addl %ecx,%edx andl %ebx,%ebp movl 36(%esp),%ecx addl %ebp,%edx movl %esi,%ebp xorl 44(%esp),%ecx xorl %eax,%ebp xorl 4(%esp),%ecx andl %edi,%ebp xorl 24(%esp),%ecx roll $1,%ecx addl %ebx,%ebp rorl $2,%edi movl %edx,%ebx roll $5,%ebx movl %ecx,36(%esp) leal 2400959708(%ecx,%ebp,1),%ecx movl %esi,%ebp addl %ebx,%ecx andl %eax,%ebp movl 40(%esp),%ebx addl %ebp,%ecx movl %edi,%ebp xorl 48(%esp),%ebx xorl %esi,%ebp xorl 8(%esp),%ebx andl %edx,%ebp xorl 28(%esp),%ebx roll $1,%ebx addl %eax,%ebp rorl $2,%edx movl %ecx,%eax roll $5,%eax movl %ebx,40(%esp) leal 2400959708(%ebx,%ebp,1),%ebx movl %edi,%ebp addl %eax,%ebx andl %esi,%ebp movl 44(%esp),%eax addl %ebp,%ebx movl %edx,%ebp xorl 52(%esp),%eax xorl %edi,%ebp xorl 12(%esp),%eax andl %ecx,%ebp xorl 32(%esp),%eax roll $1,%eax addl %esi,%ebp rorl $2,%ecx movl %ebx,%esi roll $5,%esi movl %eax,44(%esp) leal 2400959708(%eax,%ebp,1),%eax movl %edx,%ebp addl %esi,%eax andl %edi,%ebp movl 48(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl 56(%esp),%esi xorl %ecx,%ebp xorl 16(%esp),%esi xorl %edx,%ebp xorl 36(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp movl %esi,48(%esp) leal 3395469782(%esi,%edi,1),%esi movl 52(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 60(%esp),%edi xorl %ebx,%ebp xorl 20(%esp),%edi xorl %ecx,%ebp xorl 40(%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp movl %edi,52(%esp) leal 3395469782(%edi,%edx,1),%edi movl 56(%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl (%esp),%edx xorl %eax,%ebp xorl 24(%esp),%edx xorl %ebx,%ebp xorl 44(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,56(%esp) leal 3395469782(%edx,%ecx,1),%edx movl 60(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 4(%esp),%ecx xorl %esi,%ebp xorl 28(%esp),%ecx xorl %eax,%ebp xorl 48(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,60(%esp) leal 3395469782(%ecx,%ebx,1),%ecx movl (%esp),%ebx addl %ebp,%ecx movl %edx,%ebp xorl 8(%esp),%ebx xorl %edi,%ebp xorl 32(%esp),%ebx xorl %esi,%ebp xorl 52(%esp),%ebx roll $1,%ebx addl %ebp,%eax rorl $2,%edx movl %ecx,%ebp roll $5,%ebp movl %ebx,(%esp) leal 3395469782(%ebx,%eax,1),%ebx movl 4(%esp),%eax addl %ebp,%ebx movl %ecx,%ebp xorl 12(%esp),%eax xorl %edx,%ebp xorl 36(%esp),%eax xorl %edi,%ebp xorl 56(%esp),%eax roll $1,%eax addl %ebp,%esi rorl $2,%ecx movl %ebx,%ebp roll $5,%ebp movl %eax,4(%esp) leal 3395469782(%eax,%esi,1),%eax movl 8(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl 16(%esp),%esi xorl %ecx,%ebp xorl 40(%esp),%esi xorl %edx,%ebp xorl 60(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp movl %esi,8(%esp) leal 3395469782(%esi,%edi,1),%esi movl 12(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 20(%esp),%edi xorl %ebx,%ebp xorl 44(%esp),%edi xorl %ecx,%ebp xorl (%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp movl %edi,12(%esp) leal 3395469782(%edi,%edx,1),%edi movl 16(%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl 24(%esp),%edx xorl %eax,%ebp xorl 48(%esp),%edx xorl %ebx,%ebp xorl 4(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,16(%esp) leal 3395469782(%edx,%ecx,1),%edx movl 20(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 28(%esp),%ecx xorl %esi,%ebp xorl 52(%esp),%ecx xorl %eax,%ebp xorl 8(%esp),%ecx roll $1,%ecx addl 
%ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,20(%esp) leal 3395469782(%ecx,%ebx,1),%ecx movl 24(%esp),%ebx addl %ebp,%ecx movl %edx,%ebp xorl 32(%esp),%ebx xorl %edi,%ebp xorl 56(%esp),%ebx xorl %esi,%ebp xorl 12(%esp),%ebx roll $1,%ebx addl %ebp,%eax rorl $2,%edx movl %ecx,%ebp roll $5,%ebp movl %ebx,24(%esp) leal 3395469782(%ebx,%eax,1),%ebx movl 28(%esp),%eax addl %ebp,%ebx movl %ecx,%ebp xorl 36(%esp),%eax xorl %edx,%ebp xorl 60(%esp),%eax xorl %edi,%ebp xorl 16(%esp),%eax roll $1,%eax addl %ebp,%esi rorl $2,%ecx movl %ebx,%ebp roll $5,%ebp movl %eax,28(%esp) leal 3395469782(%eax,%esi,1),%eax movl 32(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl 40(%esp),%esi xorl %ecx,%ebp xorl (%esp),%esi xorl %edx,%ebp xorl 20(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp movl %esi,32(%esp) leal 3395469782(%esi,%edi,1),%esi movl 36(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 44(%esp),%edi xorl %ebx,%ebp xorl 4(%esp),%edi xorl %ecx,%ebp xorl 24(%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp movl %edi,36(%esp) leal 3395469782(%edi,%edx,1),%edi movl 40(%esp),%edx addl %ebp,%edi movl %esi,%ebp xorl 48(%esp),%edx xorl %eax,%ebp xorl 8(%esp),%edx xorl %ebx,%ebp xorl 28(%esp),%edx roll $1,%edx addl %ebp,%ecx rorl $2,%esi movl %edi,%ebp roll $5,%ebp movl %edx,40(%esp) leal 3395469782(%edx,%ecx,1),%edx movl 44(%esp),%ecx addl %ebp,%edx movl %edi,%ebp xorl 52(%esp),%ecx xorl %esi,%ebp xorl 12(%esp),%ecx xorl %eax,%ebp xorl 32(%esp),%ecx roll $1,%ecx addl %ebp,%ebx rorl $2,%edi movl %edx,%ebp roll $5,%ebp movl %ecx,44(%esp) leal 3395469782(%ecx,%ebx,1),%ecx movl 48(%esp),%ebx addl %ebp,%ecx movl %edx,%ebp xorl 56(%esp),%ebx xorl %edi,%ebp xorl 16(%esp),%ebx xorl %esi,%ebp xorl 36(%esp),%ebx roll $1,%ebx addl %ebp,%eax rorl $2,%edx movl %ecx,%ebp roll $5,%ebp movl %ebx,48(%esp) leal 3395469782(%ebx,%eax,1),%ebx movl 52(%esp),%eax addl %ebp,%ebx movl %ecx,%ebp xorl 60(%esp),%eax xorl %edx,%ebp xorl 20(%esp),%eax xorl %edi,%ebp xorl 40(%esp),%eax roll $1,%eax addl %ebp,%esi rorl $2,%ecx movl %ebx,%ebp roll $5,%ebp leal 3395469782(%eax,%esi,1),%eax movl 56(%esp),%esi addl %ebp,%eax movl %ebx,%ebp xorl (%esp),%esi xorl %ecx,%ebp xorl 24(%esp),%esi xorl %edx,%ebp xorl 44(%esp),%esi roll $1,%esi addl %ebp,%edi rorl $2,%ebx movl %eax,%ebp roll $5,%ebp leal 3395469782(%esi,%edi,1),%esi movl 60(%esp),%edi addl %ebp,%esi movl %eax,%ebp xorl 4(%esp),%edi xorl %ebx,%ebp xorl 28(%esp),%edi xorl %ecx,%ebp xorl 48(%esp),%edi roll $1,%edi addl %ebp,%edx rorl $2,%eax movl %esi,%ebp roll $5,%ebp leal 3395469782(%edi,%edx,1),%edi addl %ebp,%edi movl 96(%esp),%ebp movl 100(%esp),%edx addl (%ebp),%edi addl 4(%ebp),%esi addl 8(%ebp),%eax addl 12(%ebp),%ebx addl 16(%ebp),%ecx movl %edi,(%ebp) addl $64,%edx movl %esi,4(%ebp) cmpl 104(%esp),%edx movl %eax,8(%ebp) movl %ecx,%edi movl %ebx,12(%ebp) movl %edx,%esi movl %ecx,16(%ebp) jb .L002loop addl $76,%esp popl %edi popl %esi popl %ebx popl %ebp ret .size sha1_block_data_order,.-.L_sha1_block_data_order_begin .type _sha1_block_data_order_shaext,@function .align 16 _sha1_block_data_order_shaext: pushl %ebp pushl %ebx pushl %esi pushl %edi call .L003pic_point .L003pic_point: popl %ebp leal .LK_XX_XX-.L003pic_point(%ebp),%ebp .Lshaext_shortcut: movl 20(%esp),%edi movl %esp,%ebx movl 24(%esp),%esi movl 28(%esp),%ecx subl $32,%esp movdqu (%edi),%xmm0 movd 16(%edi),%xmm1 andl $-32,%esp movdqa 80(%ebp),%xmm3 movdqu (%esi),%xmm4 pshufd $27,%xmm0,%xmm0 movdqu 16(%esi),%xmm5 pshufd $27,%xmm1,%xmm1 movdqu 
32(%esi),%xmm6 .byte 102,15,56,0,227 movdqu 48(%esi),%xmm7 .byte 102,15,56,0,235 .byte 102,15,56,0,243 .byte 102,15,56,0,251 jmp .L004loop_shaext .align 16 .L004loop_shaext: decl %ecx leal 64(%esi),%eax movdqa %xmm1,(%esp) paddd %xmm4,%xmm1 cmovnel %eax,%esi movdqa %xmm0,16(%esp) .byte 15,56,201,229 movdqa %xmm0,%xmm2 .byte 15,58,204,193,0 .byte 15,56,200,213 pxor %xmm6,%xmm4 .byte 15,56,201,238 .byte 15,56,202,231 movdqa %xmm0,%xmm1 .byte 15,58,204,194,0 .byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 movdqa %xmm0,%xmm2 .byte 15,58,204,193,0 .byte 15,56,200,215 pxor %xmm4,%xmm6 .byte 15,56,201,252 .byte 15,56,202,245 movdqa %xmm0,%xmm1 .byte 15,58,204,194,0 .byte 15,56,200,204 pxor %xmm5,%xmm7 .byte 15,56,202,254 .byte 15,56,201,229 movdqa %xmm0,%xmm2 .byte 15,58,204,193,0 .byte 15,56,200,213 pxor %xmm6,%xmm4 .byte 15,56,201,238 .byte 15,56,202,231 movdqa %xmm0,%xmm1 .byte 15,58,204,194,1 .byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 movdqa %xmm0,%xmm2 .byte 15,58,204,193,1 .byte 15,56,200,215 pxor %xmm4,%xmm6 .byte 15,56,201,252 .byte 15,56,202,245 movdqa %xmm0,%xmm1 .byte 15,58,204,194,1 .byte 15,56,200,204 pxor %xmm5,%xmm7 .byte 15,56,202,254 .byte 15,56,201,229 movdqa %xmm0,%xmm2 .byte 15,58,204,193,1 .byte 15,56,200,213 pxor %xmm6,%xmm4 .byte 15,56,201,238 .byte 15,56,202,231 movdqa %xmm0,%xmm1 .byte 15,58,204,194,1 .byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 movdqa %xmm0,%xmm2 .byte 15,58,204,193,2 .byte 15,56,200,215 pxor %xmm4,%xmm6 .byte 15,56,201,252 .byte 15,56,202,245 movdqa %xmm0,%xmm1 .byte 15,58,204,194,2 .byte 15,56,200,204 pxor %xmm5,%xmm7 .byte 15,56,202,254 .byte 15,56,201,229 movdqa %xmm0,%xmm2 .byte 15,58,204,193,2 .byte 15,56,200,213 pxor %xmm6,%xmm4 .byte 15,56,201,238 .byte 15,56,202,231 movdqa %xmm0,%xmm1 .byte 15,58,204,194,2 .byte 15,56,200,206 pxor %xmm7,%xmm5 .byte 15,56,202,236 .byte 15,56,201,247 movdqa %xmm0,%xmm2 .byte 15,58,204,193,2 .byte 15,56,200,215 pxor %xmm4,%xmm6 .byte 15,56,201,252 .byte 15,56,202,245 movdqa %xmm0,%xmm1 .byte 15,58,204,194,3 .byte 15,56,200,204 pxor %xmm5,%xmm7 .byte 15,56,202,254 movdqu (%esi),%xmm4 movdqa %xmm0,%xmm2 .byte 15,58,204,193,3 .byte 15,56,200,213 movdqu 16(%esi),%xmm5 .byte 102,15,56,0,227 movdqa %xmm0,%xmm1 .byte 15,58,204,194,3 .byte 15,56,200,206 movdqu 32(%esi),%xmm6 .byte 102,15,56,0,235 movdqa %xmm0,%xmm2 .byte 15,58,204,193,3 .byte 15,56,200,215 movdqu 48(%esi),%xmm7 .byte 102,15,56,0,243 movdqa %xmm0,%xmm1 .byte 15,58,204,194,3 movdqa (%esp),%xmm2 .byte 102,15,56,0,251 .byte 15,56,200,202 paddd 16(%esp),%xmm0 jnz .L004loop_shaext pshufd $27,%xmm0,%xmm0 pshufd $27,%xmm1,%xmm1 movdqu %xmm0,(%edi) movd %xmm1,16(%edi) movl %ebx,%esp popl %edi popl %esi popl %ebx popl %ebp ret .size _sha1_block_data_order_shaext,.-_sha1_block_data_order_shaext .type _sha1_block_data_order_ssse3,@function .align 16 _sha1_block_data_order_ssse3: pushl %ebp pushl %ebx pushl %esi pushl %edi call .L005pic_point .L005pic_point: popl %ebp leal .LK_XX_XX-.L005pic_point(%ebp),%ebp .Lssse3_shortcut: movdqa (%ebp),%xmm7 movdqa 16(%ebp),%xmm0 movdqa 32(%ebp),%xmm1 movdqa 48(%ebp),%xmm2 movdqa 64(%ebp),%xmm6 movl 20(%esp),%edi movl 24(%esp),%ebp movl 28(%esp),%edx movl %esp,%esi subl $208,%esp andl $-64,%esp movdqa %xmm0,112(%esp) movdqa %xmm1,128(%esp) movdqa %xmm2,144(%esp) shll $6,%edx movdqa %xmm7,160(%esp) addl %ebp,%edx movdqa %xmm6,176(%esp) addl $64,%ebp movl %edi,192(%esp) movl %ebp,196(%esp) movl %edx,200(%esp) movl %esi,204(%esp) movl 
(%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx movl 12(%edi),%edx movl 16(%edi),%edi movl %ebx,%esi movdqu -64(%ebp),%xmm0 movdqu -48(%ebp),%xmm1 movdqu -32(%ebp),%xmm2 movdqu -16(%ebp),%xmm3 .byte 102,15,56,0,198 .byte 102,15,56,0,206 .byte 102,15,56,0,214 movdqa %xmm7,96(%esp) .byte 102,15,56,0,222 paddd %xmm7,%xmm0 paddd %xmm7,%xmm1 paddd %xmm7,%xmm2 movdqa %xmm0,(%esp) psubd %xmm7,%xmm0 movdqa %xmm1,16(%esp) psubd %xmm7,%xmm1 movdqa %xmm2,32(%esp) movl %ecx,%ebp psubd %xmm7,%xmm2 xorl %edx,%ebp pshufd $238,%xmm0,%xmm4 andl %ebp,%esi jmp .L006loop .align 16 .L006loop: rorl $2,%ebx xorl %edx,%esi movl %eax,%ebp punpcklqdq %xmm1,%xmm4 movdqa %xmm3,%xmm6 addl (%esp),%edi xorl %ecx,%ebx paddd %xmm3,%xmm7 movdqa %xmm0,64(%esp) roll $5,%eax addl %esi,%edi psrldq $4,%xmm6 andl %ebx,%ebp xorl %ecx,%ebx pxor %xmm0,%xmm4 addl %eax,%edi rorl $7,%eax pxor %xmm2,%xmm6 xorl %ecx,%ebp movl %edi,%esi addl 4(%esp),%edx pxor %xmm6,%xmm4 xorl %ebx,%eax roll $5,%edi movdqa %xmm7,48(%esp) addl %ebp,%edx andl %eax,%esi movdqa %xmm4,%xmm0 xorl %ebx,%eax addl %edi,%edx rorl $7,%edi movdqa %xmm4,%xmm6 xorl %ebx,%esi pslldq $12,%xmm0 paddd %xmm4,%xmm4 movl %edx,%ebp addl 8(%esp),%ecx psrld $31,%xmm6 xorl %eax,%edi roll $5,%edx movdqa %xmm0,%xmm7 addl %esi,%ecx andl %edi,%ebp xorl %eax,%edi psrld $30,%xmm0 addl %edx,%ecx rorl $7,%edx por %xmm6,%xmm4 xorl %eax,%ebp movl %ecx,%esi addl 12(%esp),%ebx pslld $2,%xmm7 xorl %edi,%edx roll $5,%ecx pxor %xmm0,%xmm4 movdqa 96(%esp),%xmm0 addl %ebp,%ebx andl %edx,%esi pxor %xmm7,%xmm4 pshufd $238,%xmm1,%xmm5 xorl %edi,%edx addl %ecx,%ebx rorl $7,%ecx xorl %edi,%esi movl %ebx,%ebp punpcklqdq %xmm2,%xmm5 movdqa %xmm4,%xmm7 addl 16(%esp),%eax xorl %edx,%ecx paddd %xmm4,%xmm0 movdqa %xmm1,80(%esp) roll $5,%ebx addl %esi,%eax psrldq $4,%xmm7 andl %ecx,%ebp xorl %edx,%ecx pxor %xmm1,%xmm5 addl %ebx,%eax rorl $7,%ebx pxor %xmm3,%xmm7 xorl %edx,%ebp movl %eax,%esi addl 20(%esp),%edi pxor %xmm7,%xmm5 xorl %ecx,%ebx roll $5,%eax movdqa %xmm0,(%esp) addl %ebp,%edi andl %ebx,%esi movdqa %xmm5,%xmm1 xorl %ecx,%ebx addl %eax,%edi rorl $7,%eax movdqa %xmm5,%xmm7 xorl %ecx,%esi pslldq $12,%xmm1 paddd %xmm5,%xmm5 movl %edi,%ebp addl 24(%esp),%edx psrld $31,%xmm7 xorl %ebx,%eax roll $5,%edi movdqa %xmm1,%xmm0 addl %esi,%edx andl %eax,%ebp xorl %ebx,%eax psrld $30,%xmm1 addl %edi,%edx rorl $7,%edi por %xmm7,%xmm5 xorl %ebx,%ebp movl %edx,%esi addl 28(%esp),%ecx pslld $2,%xmm0 xorl %eax,%edi roll $5,%edx pxor %xmm1,%xmm5 movdqa 112(%esp),%xmm1 addl %ebp,%ecx andl %edi,%esi pxor %xmm0,%xmm5 pshufd $238,%xmm2,%xmm6 xorl %eax,%edi addl %edx,%ecx rorl $7,%edx xorl %eax,%esi movl %ecx,%ebp punpcklqdq %xmm3,%xmm6 movdqa %xmm5,%xmm0 addl 32(%esp),%ebx xorl %edi,%edx paddd %xmm5,%xmm1 movdqa %xmm2,96(%esp) roll $5,%ecx addl %esi,%ebx psrldq $4,%xmm0 andl %edx,%ebp xorl %edi,%edx pxor %xmm2,%xmm6 addl %ecx,%ebx rorl $7,%ecx pxor %xmm4,%xmm0 xorl %edi,%ebp movl %ebx,%esi addl 36(%esp),%eax pxor %xmm0,%xmm6 xorl %edx,%ecx roll $5,%ebx movdqa %xmm1,16(%esp) addl %ebp,%eax andl %ecx,%esi movdqa %xmm6,%xmm2 xorl %edx,%ecx addl %ebx,%eax rorl $7,%ebx movdqa %xmm6,%xmm0 xorl %edx,%esi pslldq $12,%xmm2 paddd %xmm6,%xmm6 movl %eax,%ebp addl 40(%esp),%edi psrld $31,%xmm0 xorl %ecx,%ebx roll $5,%eax movdqa %xmm2,%xmm1 addl %esi,%edi andl %ebx,%ebp xorl %ecx,%ebx psrld $30,%xmm2 addl %eax,%edi rorl $7,%eax por %xmm0,%xmm6 xorl %ecx,%ebp movdqa 64(%esp),%xmm0 movl %edi,%esi addl 44(%esp),%edx pslld $2,%xmm1 xorl %ebx,%eax roll $5,%edi pxor %xmm2,%xmm6 movdqa 112(%esp),%xmm2 addl %ebp,%edx andl %eax,%esi pxor 
%xmm1,%xmm6 pshufd $238,%xmm3,%xmm7 xorl %ebx,%eax addl %edi,%edx rorl $7,%edi xorl %ebx,%esi movl %edx,%ebp punpcklqdq %xmm4,%xmm7 movdqa %xmm6,%xmm1 addl 48(%esp),%ecx xorl %eax,%edi paddd %xmm6,%xmm2 movdqa %xmm3,64(%esp) roll $5,%edx addl %esi,%ecx psrldq $4,%xmm1 andl %edi,%ebp xorl %eax,%edi pxor %xmm3,%xmm7 addl %edx,%ecx rorl $7,%edx pxor %xmm5,%xmm1 xorl %eax,%ebp movl %ecx,%esi addl 52(%esp),%ebx pxor %xmm1,%xmm7 xorl %edi,%edx roll $5,%ecx movdqa %xmm2,32(%esp) addl %ebp,%ebx andl %edx,%esi movdqa %xmm7,%xmm3 xorl %edi,%edx addl %ecx,%ebx rorl $7,%ecx movdqa %xmm7,%xmm1 xorl %edi,%esi pslldq $12,%xmm3 paddd %xmm7,%xmm7 movl %ebx,%ebp addl 56(%esp),%eax psrld $31,%xmm1 xorl %edx,%ecx roll $5,%ebx movdqa %xmm3,%xmm2 addl %esi,%eax andl %ecx,%ebp xorl %edx,%ecx psrld $30,%xmm3 addl %ebx,%eax rorl $7,%ebx por %xmm1,%xmm7 xorl %edx,%ebp movdqa 80(%esp),%xmm1 movl %eax,%esi addl 60(%esp),%edi pslld $2,%xmm2 xorl %ecx,%ebx roll $5,%eax pxor %xmm3,%xmm7 movdqa 112(%esp),%xmm3 addl %ebp,%edi andl %ebx,%esi pxor %xmm2,%xmm7 pshufd $238,%xmm6,%xmm2 xorl %ecx,%ebx addl %eax,%edi rorl $7,%eax pxor %xmm4,%xmm0 punpcklqdq %xmm7,%xmm2 xorl %ecx,%esi movl %edi,%ebp addl (%esp),%edx pxor %xmm1,%xmm0 movdqa %xmm4,80(%esp) xorl %ebx,%eax roll $5,%edi movdqa %xmm3,%xmm4 addl %esi,%edx paddd %xmm7,%xmm3 andl %eax,%ebp pxor %xmm2,%xmm0 xorl %ebx,%eax addl %edi,%edx rorl $7,%edi xorl %ebx,%ebp movdqa %xmm0,%xmm2 movdqa %xmm3,48(%esp) movl %edx,%esi addl 4(%esp),%ecx xorl %eax,%edi roll $5,%edx pslld $2,%xmm0 addl %ebp,%ecx andl %edi,%esi psrld $30,%xmm2 xorl %eax,%edi addl %edx,%ecx rorl $7,%edx xorl %eax,%esi movl %ecx,%ebp addl 8(%esp),%ebx xorl %edi,%edx roll $5,%ecx por %xmm2,%xmm0 addl %esi,%ebx andl %edx,%ebp movdqa 96(%esp),%xmm2 xorl %edi,%edx addl %ecx,%ebx addl 12(%esp),%eax xorl %edi,%ebp movl %ebx,%esi pshufd $238,%xmm7,%xmm3 roll $5,%ebx addl %ebp,%eax xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax addl 16(%esp),%edi pxor %xmm5,%xmm1 punpcklqdq %xmm0,%xmm3 xorl %ecx,%esi movl %eax,%ebp roll $5,%eax pxor %xmm2,%xmm1 movdqa %xmm5,96(%esp) addl %esi,%edi xorl %ecx,%ebp movdqa %xmm4,%xmm5 rorl $7,%ebx paddd %xmm0,%xmm4 addl %eax,%edi pxor %xmm3,%xmm1 addl 20(%esp),%edx xorl %ebx,%ebp movl %edi,%esi roll $5,%edi movdqa %xmm1,%xmm3 movdqa %xmm4,(%esp) addl %ebp,%edx xorl %ebx,%esi rorl $7,%eax addl %edi,%edx pslld $2,%xmm1 addl 24(%esp),%ecx xorl %eax,%esi psrld $30,%xmm3 movl %edx,%ebp roll $5,%edx addl %esi,%ecx xorl %eax,%ebp rorl $7,%edi addl %edx,%ecx por %xmm3,%xmm1 addl 28(%esp),%ebx xorl %edi,%ebp movdqa 64(%esp),%xmm3 movl %ecx,%esi roll $5,%ecx addl %ebp,%ebx xorl %edi,%esi rorl $7,%edx pshufd $238,%xmm0,%xmm4 addl %ecx,%ebx addl 32(%esp),%eax pxor %xmm6,%xmm2 punpcklqdq %xmm1,%xmm4 xorl %edx,%esi movl %ebx,%ebp roll $5,%ebx pxor %xmm3,%xmm2 movdqa %xmm6,64(%esp) addl %esi,%eax xorl %edx,%ebp movdqa 128(%esp),%xmm6 rorl $7,%ecx paddd %xmm1,%xmm5 addl %ebx,%eax pxor %xmm4,%xmm2 addl 36(%esp),%edi xorl %ecx,%ebp movl %eax,%esi roll $5,%eax movdqa %xmm2,%xmm4 movdqa %xmm5,16(%esp) addl %ebp,%edi xorl %ecx,%esi rorl $7,%ebx addl %eax,%edi pslld $2,%xmm2 addl 40(%esp),%edx xorl %ebx,%esi psrld $30,%xmm4 movl %edi,%ebp roll $5,%edi addl %esi,%edx xorl %ebx,%ebp rorl $7,%eax addl %edi,%edx por %xmm4,%xmm2 addl 44(%esp),%ecx xorl %eax,%ebp movdqa 80(%esp),%xmm4 movl %edx,%esi roll $5,%edx addl %ebp,%ecx xorl %eax,%esi rorl $7,%edi pshufd $238,%xmm1,%xmm5 addl %edx,%ecx addl 48(%esp),%ebx pxor %xmm7,%xmm3 punpcklqdq %xmm2,%xmm5 xorl %edi,%esi movl %ecx,%ebp roll $5,%ecx pxor %xmm4,%xmm3 movdqa 
%xmm7,80(%esp) addl %esi,%ebx xorl %edi,%ebp movdqa %xmm6,%xmm7 rorl $7,%edx paddd %xmm2,%xmm6 addl %ecx,%ebx pxor %xmm5,%xmm3 addl 52(%esp),%eax xorl %edx,%ebp movl %ebx,%esi roll $5,%ebx movdqa %xmm3,%xmm5 movdqa %xmm6,32(%esp) addl %ebp,%eax xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax pslld $2,%xmm3 addl 56(%esp),%edi xorl %ecx,%esi psrld $30,%xmm5 movl %eax,%ebp roll $5,%eax addl %esi,%edi xorl %ecx,%ebp rorl $7,%ebx addl %eax,%edi por %xmm5,%xmm3 addl 60(%esp),%edx xorl %ebx,%ebp movdqa 96(%esp),%xmm5 movl %edi,%esi roll $5,%edi addl %ebp,%edx xorl %ebx,%esi rorl $7,%eax pshufd $238,%xmm2,%xmm6 addl %edi,%edx addl (%esp),%ecx pxor %xmm0,%xmm4 punpcklqdq %xmm3,%xmm6 xorl %eax,%esi movl %edx,%ebp roll $5,%edx pxor %xmm5,%xmm4 movdqa %xmm0,96(%esp) addl %esi,%ecx xorl %eax,%ebp movdqa %xmm7,%xmm0 rorl $7,%edi paddd %xmm3,%xmm7 addl %edx,%ecx pxor %xmm6,%xmm4 addl 4(%esp),%ebx xorl %edi,%ebp movl %ecx,%esi roll $5,%ecx movdqa %xmm4,%xmm6 movdqa %xmm7,48(%esp) addl %ebp,%ebx xorl %edi,%esi rorl $7,%edx addl %ecx,%ebx pslld $2,%xmm4 addl 8(%esp),%eax xorl %edx,%esi psrld $30,%xmm6 movl %ebx,%ebp roll $5,%ebx addl %esi,%eax xorl %edx,%ebp rorl $7,%ecx addl %ebx,%eax por %xmm6,%xmm4 addl 12(%esp),%edi xorl %ecx,%ebp movdqa 64(%esp),%xmm6 movl %eax,%esi roll $5,%eax addl %ebp,%edi xorl %ecx,%esi rorl $7,%ebx pshufd $238,%xmm3,%xmm7 addl %eax,%edi addl 16(%esp),%edx pxor %xmm1,%xmm5 punpcklqdq %xmm4,%xmm7 xorl %ebx,%esi movl %edi,%ebp roll $5,%edi pxor %xmm6,%xmm5 movdqa %xmm1,64(%esp) addl %esi,%edx xorl %ebx,%ebp movdqa %xmm0,%xmm1 rorl $7,%eax paddd %xmm4,%xmm0 addl %edi,%edx pxor %xmm7,%xmm5 addl 20(%esp),%ecx xorl %eax,%ebp movl %edx,%esi roll $5,%edx movdqa %xmm5,%xmm7 movdqa %xmm0,(%esp) addl %ebp,%ecx xorl %eax,%esi rorl $7,%edi addl %edx,%ecx pslld $2,%xmm5 addl 24(%esp),%ebx xorl %edi,%esi psrld $30,%xmm7 movl %ecx,%ebp roll $5,%ecx addl %esi,%ebx xorl %edi,%ebp rorl $7,%edx addl %ecx,%ebx por %xmm7,%xmm5 addl 28(%esp),%eax movdqa 80(%esp),%xmm7 rorl $7,%ecx movl %ebx,%esi xorl %edx,%ebp roll $5,%ebx pshufd $238,%xmm4,%xmm0 addl %ebp,%eax xorl %ecx,%esi xorl %edx,%ecx addl %ebx,%eax addl 32(%esp),%edi pxor %xmm2,%xmm6 punpcklqdq %xmm5,%xmm0 andl %ecx,%esi xorl %edx,%ecx rorl $7,%ebx pxor %xmm7,%xmm6 movdqa %xmm2,80(%esp) movl %eax,%ebp xorl %ecx,%esi roll $5,%eax movdqa %xmm1,%xmm2 addl %esi,%edi paddd %xmm5,%xmm1 xorl %ebx,%ebp pxor %xmm0,%xmm6 xorl %ecx,%ebx addl %eax,%edi addl 36(%esp),%edx andl %ebx,%ebp movdqa %xmm6,%xmm0 movdqa %xmm1,16(%esp) xorl %ecx,%ebx rorl $7,%eax movl %edi,%esi xorl %ebx,%ebp roll $5,%edi pslld $2,%xmm6 addl %ebp,%edx xorl %eax,%esi psrld $30,%xmm0 xorl %ebx,%eax addl %edi,%edx addl 40(%esp),%ecx andl %eax,%esi xorl %ebx,%eax rorl $7,%edi por %xmm0,%xmm6 movl %edx,%ebp xorl %eax,%esi movdqa 96(%esp),%xmm0 roll $5,%edx addl %esi,%ecx xorl %edi,%ebp xorl %eax,%edi addl %edx,%ecx pshufd $238,%xmm5,%xmm1 addl 44(%esp),%ebx andl %edi,%ebp xorl %eax,%edi rorl $7,%edx movl %ecx,%esi xorl %edi,%ebp roll $5,%ecx addl %ebp,%ebx xorl %edx,%esi xorl %edi,%edx addl %ecx,%ebx addl 48(%esp),%eax pxor %xmm3,%xmm7 punpcklqdq %xmm6,%xmm1 andl %edx,%esi xorl %edi,%edx rorl $7,%ecx pxor %xmm0,%xmm7 movdqa %xmm3,96(%esp) movl %ebx,%ebp xorl %edx,%esi roll $5,%ebx movdqa 144(%esp),%xmm3 addl %esi,%eax paddd %xmm6,%xmm2 xorl %ecx,%ebp pxor %xmm1,%xmm7 xorl %edx,%ecx addl %ebx,%eax addl 52(%esp),%edi andl %ecx,%ebp movdqa %xmm7,%xmm1 movdqa %xmm2,32(%esp) xorl %edx,%ecx rorl $7,%ebx movl %eax,%esi xorl %ecx,%ebp roll $5,%eax pslld $2,%xmm7 addl %ebp,%edi xorl %ebx,%esi psrld 
$30,%xmm1 xorl %ecx,%ebx addl %eax,%edi addl 56(%esp),%edx andl %ebx,%esi xorl %ecx,%ebx rorl $7,%eax por %xmm1,%xmm7 movl %edi,%ebp xorl %ebx,%esi movdqa 64(%esp),%xmm1 roll $5,%edi addl %esi,%edx xorl %eax,%ebp xorl %ebx,%eax addl %edi,%edx pshufd $238,%xmm6,%xmm2 addl 60(%esp),%ecx andl %eax,%ebp xorl %ebx,%eax rorl $7,%edi movl %edx,%esi xorl %eax,%ebp roll $5,%edx addl %ebp,%ecx xorl %edi,%esi xorl %eax,%edi addl %edx,%ecx addl (%esp),%ebx pxor %xmm4,%xmm0 punpcklqdq %xmm7,%xmm2 andl %edi,%esi xorl %eax,%edi rorl $7,%edx pxor %xmm1,%xmm0 movdqa %xmm4,64(%esp) movl %ecx,%ebp xorl %edi,%esi roll $5,%ecx movdqa %xmm3,%xmm4 addl %esi,%ebx paddd %xmm7,%xmm3 xorl %edx,%ebp pxor %xmm2,%xmm0 xorl %edi,%edx addl %ecx,%ebx addl 4(%esp),%eax andl %edx,%ebp movdqa %xmm0,%xmm2 movdqa %xmm3,48(%esp) xorl %edi,%edx rorl $7,%ecx movl %ebx,%esi xorl %edx,%ebp roll $5,%ebx pslld $2,%xmm0 addl %ebp,%eax xorl %ecx,%esi psrld $30,%xmm2 xorl %edx,%ecx addl %ebx,%eax addl 8(%esp),%edi andl %ecx,%esi xorl %edx,%ecx rorl $7,%ebx por %xmm2,%xmm0 movl %eax,%ebp xorl %ecx,%esi movdqa 80(%esp),%xmm2 roll $5,%eax addl %esi,%edi xorl %ebx,%ebp xorl %ecx,%ebx addl %eax,%edi pshufd $238,%xmm7,%xmm3 addl 12(%esp),%edx andl %ebx,%ebp xorl %ecx,%ebx rorl $7,%eax movl %edi,%esi xorl %ebx,%ebp roll $5,%edi addl %ebp,%edx xorl %eax,%esi xorl %ebx,%eax addl %edi,%edx addl 16(%esp),%ecx pxor %xmm5,%xmm1 punpcklqdq %xmm0,%xmm3 andl %eax,%esi xorl %ebx,%eax rorl $7,%edi pxor %xmm2,%xmm1 movdqa %xmm5,80(%esp) movl %edx,%ebp xorl %eax,%esi roll $5,%edx movdqa %xmm4,%xmm5 addl %esi,%ecx paddd %xmm0,%xmm4 xorl %edi,%ebp pxor %xmm3,%xmm1 xorl %eax,%edi addl %edx,%ecx addl 20(%esp),%ebx andl %edi,%ebp movdqa %xmm1,%xmm3 movdqa %xmm4,(%esp) xorl %eax,%edi rorl $7,%edx movl %ecx,%esi xorl %edi,%ebp roll $5,%ecx pslld $2,%xmm1 addl %ebp,%ebx xorl %edx,%esi psrld $30,%xmm3 xorl %edi,%edx addl %ecx,%ebx addl 24(%esp),%eax andl %edx,%esi xorl %edi,%edx rorl $7,%ecx por %xmm3,%xmm1 movl %ebx,%ebp xorl %edx,%esi movdqa 96(%esp),%xmm3 roll $5,%ebx addl %esi,%eax xorl %ecx,%ebp xorl %edx,%ecx addl %ebx,%eax pshufd $238,%xmm0,%xmm4 addl 28(%esp),%edi andl %ecx,%ebp xorl %edx,%ecx rorl $7,%ebx movl %eax,%esi xorl %ecx,%ebp roll $5,%eax addl %ebp,%edi xorl %ebx,%esi xorl %ecx,%ebx addl %eax,%edi addl 32(%esp),%edx pxor %xmm6,%xmm2 punpcklqdq %xmm1,%xmm4 andl %ebx,%esi xorl %ecx,%ebx rorl $7,%eax pxor %xmm3,%xmm2 movdqa %xmm6,96(%esp) movl %edi,%ebp xorl %ebx,%esi roll $5,%edi movdqa %xmm5,%xmm6 addl %esi,%edx paddd %xmm1,%xmm5 xorl %eax,%ebp pxor %xmm4,%xmm2 xorl %ebx,%eax addl %edi,%edx addl 36(%esp),%ecx andl %eax,%ebp movdqa %xmm2,%xmm4 movdqa %xmm5,16(%esp) xorl %ebx,%eax rorl $7,%edi movl %edx,%esi xorl %eax,%ebp roll $5,%edx pslld $2,%xmm2 addl %ebp,%ecx xorl %edi,%esi psrld $30,%xmm4 xorl %eax,%edi addl %edx,%ecx addl 40(%esp),%ebx andl %edi,%esi xorl %eax,%edi rorl $7,%edx por %xmm4,%xmm2 movl %ecx,%ebp xorl %edi,%esi movdqa 64(%esp),%xmm4 roll $5,%ecx addl %esi,%ebx xorl %edx,%ebp xorl %edi,%edx addl %ecx,%ebx pshufd $238,%xmm1,%xmm5 addl 44(%esp),%eax andl %edx,%ebp xorl %edi,%edx rorl $7,%ecx movl %ebx,%esi xorl %edx,%ebp roll $5,%ebx addl %ebp,%eax xorl %edx,%esi addl %ebx,%eax addl 48(%esp),%edi pxor %xmm7,%xmm3 punpcklqdq %xmm2,%xmm5 xorl %ecx,%esi movl %eax,%ebp roll $5,%eax pxor %xmm4,%xmm3 movdqa %xmm7,64(%esp) addl %esi,%edi xorl %ecx,%ebp movdqa %xmm6,%xmm7 rorl $7,%ebx paddd %xmm2,%xmm6 addl %eax,%edi pxor %xmm5,%xmm3 addl 52(%esp),%edx xorl %ebx,%ebp movl %edi,%esi roll $5,%edi movdqa %xmm3,%xmm5 movdqa %xmm6,32(%esp) addl 
%ebp,%edx xorl %ebx,%esi rorl $7,%eax addl %edi,%edx pslld $2,%xmm3 addl 56(%esp),%ecx xorl %eax,%esi psrld $30,%xmm5 movl %edx,%ebp roll $5,%edx addl %esi,%ecx xorl %eax,%ebp rorl $7,%edi addl %edx,%ecx por %xmm5,%xmm3 addl 60(%esp),%ebx xorl %edi,%ebp movl %ecx,%esi roll $5,%ecx addl %ebp,%ebx xorl %edi,%esi rorl $7,%edx addl %ecx,%ebx addl (%esp),%eax xorl %edx,%esi movl %ebx,%ebp roll $5,%ebx addl %esi,%eax xorl %edx,%ebp rorl $7,%ecx paddd %xmm3,%xmm7 addl %ebx,%eax addl 4(%esp),%edi xorl %ecx,%ebp movl %eax,%esi movdqa %xmm7,48(%esp) roll $5,%eax addl %ebp,%edi xorl %ecx,%esi rorl $7,%ebx addl %eax,%edi addl 8(%esp),%edx xorl %ebx,%esi movl %edi,%ebp roll $5,%edi addl %esi,%edx xorl %ebx,%ebp rorl $7,%eax addl %edi,%edx addl 12(%esp),%ecx xorl %eax,%ebp movl %edx,%esi roll $5,%edx addl %ebp,%ecx xorl %eax,%esi rorl $7,%edi addl %edx,%ecx movl 196(%esp),%ebp cmpl 200(%esp),%ebp je .L007done movdqa 160(%esp),%xmm7 movdqa 176(%esp),%xmm6 movdqu (%ebp),%xmm0 movdqu 16(%ebp),%xmm1 movdqu 32(%ebp),%xmm2 movdqu 48(%ebp),%xmm3 addl $64,%ebp .byte 102,15,56,0,198 movl %ebp,196(%esp) movdqa %xmm7,96(%esp) addl 16(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp roll $5,%ecx addl %esi,%ebx xorl %edi,%ebp rorl $7,%edx .byte 102,15,56,0,206 addl %ecx,%ebx addl 20(%esp),%eax xorl %edx,%ebp movl %ebx,%esi paddd %xmm7,%xmm0 roll $5,%ebx addl %ebp,%eax xorl %edx,%esi rorl $7,%ecx movdqa %xmm0,(%esp) addl %ebx,%eax addl 24(%esp),%edi xorl %ecx,%esi movl %eax,%ebp psubd %xmm7,%xmm0 roll $5,%eax addl %esi,%edi xorl %ecx,%ebp rorl $7,%ebx addl %eax,%edi addl 28(%esp),%edx xorl %ebx,%ebp movl %edi,%esi roll $5,%edi addl %ebp,%edx xorl %ebx,%esi rorl $7,%eax addl %edi,%edx addl 32(%esp),%ecx xorl %eax,%esi movl %edx,%ebp roll $5,%edx addl %esi,%ecx xorl %eax,%ebp rorl $7,%edi .byte 102,15,56,0,214 addl %edx,%ecx addl 36(%esp),%ebx xorl %edi,%ebp movl %ecx,%esi paddd %xmm7,%xmm1 roll $5,%ecx addl %ebp,%ebx xorl %edi,%esi rorl $7,%edx movdqa %xmm1,16(%esp) addl %ecx,%ebx addl 40(%esp),%eax xorl %edx,%esi movl %ebx,%ebp psubd %xmm7,%xmm1 roll $5,%ebx addl %esi,%eax xorl %edx,%ebp rorl $7,%ecx addl %ebx,%eax addl 44(%esp),%edi xorl %ecx,%ebp movl %eax,%esi roll $5,%eax addl %ebp,%edi xorl %ecx,%esi rorl $7,%ebx addl %eax,%edi addl 48(%esp),%edx xorl %ebx,%esi movl %edi,%ebp roll $5,%edi addl %esi,%edx xorl %ebx,%ebp rorl $7,%eax .byte 102,15,56,0,222 addl %edi,%edx addl 52(%esp),%ecx xorl %eax,%ebp movl %edx,%esi paddd %xmm7,%xmm2 roll $5,%edx addl %ebp,%ecx xorl %eax,%esi rorl $7,%edi movdqa %xmm2,32(%esp) addl %edx,%ecx addl 56(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp psubd %xmm7,%xmm2 roll $5,%ecx addl %esi,%ebx xorl %edi,%ebp rorl $7,%edx addl %ecx,%ebx addl 60(%esp),%eax xorl %edx,%ebp movl %ebx,%esi roll $5,%ebx addl %ebp,%eax rorl $7,%ecx addl %ebx,%eax movl 192(%esp),%ebp addl (%ebp),%eax addl 4(%ebp),%esi addl 8(%ebp),%ecx movl %eax,(%ebp) addl 12(%ebp),%edx movl %esi,4(%ebp) addl 16(%ebp),%edi movl %ecx,8(%ebp) movl %ecx,%ebx movl %edx,12(%ebp) xorl %edx,%ebx movl %edi,16(%ebp) movl %esi,%ebp pshufd $238,%xmm0,%xmm4 andl %ebx,%esi movl %ebp,%ebx jmp .L006loop .align 16 .L007done: addl 16(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp roll $5,%ecx addl %esi,%ebx xorl %edi,%ebp rorl $7,%edx addl %ecx,%ebx addl 20(%esp),%eax xorl %edx,%ebp movl %ebx,%esi roll $5,%ebx addl %ebp,%eax xorl %edx,%esi rorl $7,%ecx addl %ebx,%eax addl 24(%esp),%edi xorl %ecx,%esi movl %eax,%ebp roll $5,%eax addl %esi,%edi xorl %ecx,%ebp rorl $7,%ebx addl %eax,%edi addl 28(%esp),%edx xorl %ebx,%ebp movl %edi,%esi roll $5,%edi addl %ebp,%edx 
xorl %ebx,%esi rorl $7,%eax addl %edi,%edx addl 32(%esp),%ecx xorl %eax,%esi movl %edx,%ebp roll $5,%edx addl %esi,%ecx xorl %eax,%ebp rorl $7,%edi addl %edx,%ecx addl 36(%esp),%ebx xorl %edi,%ebp movl %ecx,%esi roll $5,%ecx addl %ebp,%ebx xorl %edi,%esi rorl $7,%edx addl %ecx,%ebx addl 40(%esp),%eax xorl %edx,%esi movl %ebx,%ebp roll $5,%ebx addl %esi,%eax xorl %edx,%ebp rorl $7,%ecx addl %ebx,%eax addl 44(%esp),%edi xorl %ecx,%ebp movl %eax,%esi roll $5,%eax addl %ebp,%edi xorl %ecx,%esi rorl $7,%ebx addl %eax,%edi addl 48(%esp),%edx xorl %ebx,%esi movl %edi,%ebp roll $5,%edi addl %esi,%edx xorl %ebx,%ebp rorl $7,%eax addl %edi,%edx addl 52(%esp),%ecx xorl %eax,%ebp movl %edx,%esi roll $5,%edx addl %ebp,%ecx xorl %eax,%esi rorl $7,%edi addl %edx,%ecx addl 56(%esp),%ebx xorl %edi,%esi movl %ecx,%ebp roll $5,%ecx addl %esi,%ebx xorl %edi,%ebp rorl $7,%edx addl %ecx,%ebx addl 60(%esp),%eax xorl %edx,%ebp movl %ebx,%esi roll $5,%ebx addl %ebp,%eax rorl $7,%ecx addl %ebx,%eax movl 192(%esp),%ebp addl (%ebp),%eax movl 204(%esp),%esp addl 4(%ebp),%esi addl 8(%ebp),%ecx movl %eax,(%ebp) addl 12(%ebp),%edx movl %esi,4(%ebp) addl 16(%ebp),%edi movl %ecx,8(%ebp) movl %edx,12(%ebp) movl %edi,16(%ebp) popl %edi popl %esi popl %ebx popl %ebp ret .size _sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3 +.type _sha1_block_data_order_avx,@function +.align 16 +_sha1_block_data_order_avx: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .L008pic_point +.L008pic_point: + popl %ebp + leal .LK_XX_XX-.L008pic_point(%ebp),%ebp +.Lavx_shortcut: + vzeroall + vmovdqa (%ebp),%xmm7 + vmovdqa 16(%ebp),%xmm0 + vmovdqa 32(%ebp),%xmm1 + vmovdqa 48(%ebp),%xmm2 + vmovdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + vmovdqa %xmm0,112(%esp) + vmovdqa %xmm1,128(%esp) + vmovdqa %xmm2,144(%esp) + shll $6,%edx + vmovdqa %xmm7,160(%esp) + addl %ebp,%edx + vmovdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + vmovdqu -64(%ebp),%xmm0 + vmovdqu -48(%ebp),%xmm1 + vmovdqu -32(%ebp),%xmm2 + vmovdqu -16(%ebp),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vmovdqa %xmm7,96(%esp) + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm7,%xmm0,%xmm4 + vpaddd %xmm7,%xmm1,%xmm5 + vpaddd %xmm7,%xmm2,%xmm6 + vmovdqa %xmm4,(%esp) + movl %ecx,%ebp + vmovdqa %xmm5,16(%esp) + xorl %edx,%ebp + vmovdqa %xmm6,32(%esp) + andl %ebp,%esi + jmp .L009loop +.align 16 +.L009loop: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%ebp + addl (%esp),%edi + vpaddd %xmm3,%xmm7,%xmm7 + vmovdqa %xmm0,64(%esp) + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%edi + vpxor %xmm2,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vmovdqa %xmm7,48(%esp) + movl %edi,%esi + addl 4(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%edi,%edi + addl %ebp,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm6 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm0 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrld $30,%xmm0,%xmm7 + vpor %xmm6,%xmm4,%xmm4 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + 
addl %edx,%ecx + vpslld $2,%xmm0,%xmm0 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vpxor %xmm7,%xmm4,%xmm4 + movl %ecx,%esi + addl 12(%esp),%ebx + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpxor %xmm0,%xmm4,%xmm4 + addl %ebp,%ebx + andl %edx,%esi + vmovdqa 96(%esp),%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%ebp + addl 16(%esp),%eax + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqa %xmm1,80(%esp) + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vmovdqa %xmm0,(%esp) + movl %eax,%esi + addl 20(%esp),%edi + vpxor %xmm7,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %ebp,%edi + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm7 + xorl %ecx,%ebx + addl %eax,%edi + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm1 + vpaddd %xmm5,%xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm0 + vpor %xmm7,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpxor %xmm0,%xmm5,%xmm5 + movl %edx,%esi + addl 28(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpxor %xmm1,%xmm5,%xmm5 + addl %ebp,%ecx + andl %edi,%esi + vmovdqa 112(%esp),%xmm1 + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%ebp + addl 32(%esp),%ebx + vpaddd %xmm5,%xmm1,%xmm1 + vmovdqa %xmm2,96(%esp) + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + vpxor %xmm2,%xmm6,%xmm6 + xorl %edi,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%ecx,%ecx + xorl %edi,%ebp + vmovdqa %xmm1,16(%esp) + movl %ebx,%esi + addl 36(%esp),%eax + vpxor %xmm0,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + addl %ebp,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm2 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm1 + vpor %xmm0,%xmm6,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + vmovdqa 64(%esp),%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vpxor %xmm1,%xmm6,%xmm6 + movl %edi,%esi + addl 44(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpxor %xmm2,%xmm6,%xmm6 + addl %ebp,%edx + andl %eax,%esi + vmovdqa 112(%esp),%xmm2 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%ebp + addl 48(%esp),%ecx + vpaddd %xmm6,%xmm2,%xmm2 + vmovdqa %xmm3,64(%esp) + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm1 + addl %esi,%ecx + andl %edi,%ebp + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%edi + addl %edx,%ecx + vpxor %xmm5,%xmm1,%xmm1 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vmovdqa %xmm2,32(%esp) + movl %ecx,%esi + addl 52(%esp),%ebx + vpxor %xmm1,%xmm7,%xmm7 + xorl %edi,%edx + shldl $5,%ecx,%ecx + addl %ebp,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm1 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpslldq $12,%xmm7,%xmm3 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm2 + vpor %xmm1,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + 
vpslld $2,%xmm3,%xmm3 + vmovdqa 80(%esp),%xmm1 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vpxor %xmm2,%xmm7,%xmm7 + movl %eax,%esi + addl 60(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpxor %xmm3,%xmm7,%xmm7 + addl %ebp,%edi + andl %ebx,%esi + vmovdqa 112(%esp),%xmm3 + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,80(%esp) + xorl %ebx,%eax + shldl $5,%edi,%edi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + addl %esi,%edx + andl %eax,%ebp + vpxor %xmm2,%xmm0,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + vpor %xmm2,%xmm0,%xmm0 + xorl %edi,%edx + shldl $5,%ecx,%ecx + vmovdqa 96(%esp),%xmm2 + addl %esi,%ebx + andl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm3,%xmm1,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm3,%xmm1,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + vmovdqa 64(%esp),%xmm3 + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + vmovdqa 128(%esp),%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm4,%xmm2,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vpor %xmm4,%xmm2,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + vmovdqa 80(%esp),%xmm4 + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm5 + vmovdqa 
%xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + vmovdqa 96(%esp),%xmm5 + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpalignr $8,%xmm2,%xmm3,%xmm6 + vpxor %xmm0,%xmm4,%xmm4 + addl (%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + vmovdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + vmovdqa %xmm7,%xmm0 + vpaddd %xmm3,%xmm7,%xmm7 + shrdl $7,%edi,%edi + addl %edx,%ecx + vpxor %xmm6,%xmm4,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm6 + vmovdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm6,%xmm4,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + vmovdqa 64(%esp),%xmm6 + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpalignr $8,%xmm3,%xmm4,%xmm7 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + vpxor %xmm6,%xmm5,%xmm5 + vmovdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + vmovdqa %xmm0,%xmm1 + vpaddd %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + addl %edi,%edx + vpxor %xmm7,%xmm5,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm7 + vmovdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm7,%xmm5,%xmm5 + addl 28(%esp),%eax + vmovdqa 80(%esp),%xmm7 + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + vmovdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + vmovdqa %xmm1,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + shldl $5,%eax,%eax + addl %esi,%edi + vpxor %xmm0,%xmm6,%xmm6 + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + vpsrld $30,%xmm6,%xmm0 + vmovdqa %xmm1,16(%esp) + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + vpor %xmm0,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%edi,%edi + vmovdqa 96(%esp),%xmm0 + movl %edx,%ebp + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm1 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + vmovdqa 
%xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + vmovdqa 144(%esp),%xmm3 + vpaddd %xmm6,%xmm2,%xmm2 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + vpsrld $30,%xmm7,%xmm1 + vmovdqa %xmm2,32(%esp) + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + vpor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vmovdqa 64(%esp),%xmm1 + movl %edi,%ebp + xorl %ebx,%esi + shldl $5,%edi,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + addl (%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm2,%xmm0,%xmm0 + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + vpor %xmm2,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vmovdqa 80(%esp),%xmm2 + movl %eax,%ebp + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%edi,%edi + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm3,%xmm1,%xmm1 + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + vpor %xmm3,%xmm1,%xmm1 + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vmovdqa 96(%esp),%xmm3 + movl %ebx,%ebp + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + vmovdqa %xmm5,%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shldl $5,%edi,%edi + addl %esi,%edx + vpxor %xmm4,%xmm2,%xmm2 + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + vpsrld 
$30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + vpor %xmm4,%xmm2,%xmm2 + xorl %eax,%edi + shrdl $7,%edx,%edx + vmovdqa 64(%esp),%xmm4 + movl %ecx,%ebp + xorl %edi,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl (%esp),%eax + vpaddd %xmm3,%xmm7,%xmm7 + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm7,48(%esp) + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je .L010done + vmovdqa 160(%esp),%xmm7 + vmovdqa 176(%esp),%xmm6 + vmovdqu (%ebp),%xmm0 + vmovdqu 16(%ebp),%xmm1 + vmovdqu 32(%ebp),%xmm2 + vmovdqu 48(%ebp),%xmm3 + addl $64,%ebp + vpshufb %xmm6,%xmm0,%xmm0 + movl %ebp,196(%esp) + vmovdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpaddd %xmm7,%xmm0,%xmm4 + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,(%esp) + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%ebp + shldl $5,%edx,%edx + vpaddd %xmm7,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vmovdqa %xmm5,16(%esp) + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 
40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %edi,%ebp + shldl $5,%edi,%edi + vpaddd %xmm7,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vmovdqa %xmm6,32(%esp) + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,%ebx + movl %ecx,8(%ebp) + xorl %edx,%ebx + movl %edx,12(%ebp) + movl %edi,16(%ebp) + movl %esi,%ebp + andl %ebx,%esi + movl %ebp,%ebx + jmp .L009loop +.align 16 +.L010done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroall + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx .align 64 .LK_XX_XX: .long 1518500249,1518500249,1518500249,1518500249 .long 1859775393,1859775393,1859775393,1859775393 .long 
2400959708,2400959708,2400959708,2400959708 .long 3395469782,3395469782,3395469782,3395469782 .long 66051,67438087,134810123,202182159 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 .byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 .byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 .byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 .byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 .comm OPENSSL_ia32cap_P,16,4 #endif Index: stable/12/secure/lib/libcrypto/i386/sha256-586.S =================================================================== --- stable/12/secure/lib/libcrypto/i386/sha256-586.S (revision 364962) +++ stable/12/secure/lib/libcrypto/i386/sha256-586.S (revision 364963) @@ -1,9157 +1,13569 @@ /* $FreeBSD$ */ /* Do not modify. This file is auto-generated from sha256-586.pl. */ #ifdef PIC .text .globl sha256_block_data_order .type sha256_block_data_order,@function .align 16 sha256_block_data_order: .L_sha256_block_data_order_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi movl 20(%esp),%esi movl 24(%esp),%edi movl 28(%esp),%eax movl %esp,%ebx call .L000pic_point .L000pic_point: popl %ebp leal .L001K256-.L000pic_point(%ebp),%ebp subl $16,%esp andl $-64,%esp shll $6,%eax addl %edi,%eax movl %esi,(%esp) movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) leal OPENSSL_ia32cap_P-.L001K256(%ebp),%edx movl (%edx),%ecx movl 4(%edx),%ebx testl $1048576,%ecx jnz .L002loop movl 8(%edx),%edx testl $16777216,%ecx jz .L003no_xmm andl $1073741824,%ecx andl $268435968,%ebx testl $536870912,%edx jnz .L004shaext orl %ebx,%ecx andl $1342177280,%ecx cmpl $1342177280,%ecx + je .L005AVX testl $512,%ebx - jnz .L005SSSE3 + jnz .L006SSSE3 .L003no_xmm: subl %edi,%eax cmpl $256,%eax - jae .L006unrolled + jae .L007unrolled jmp .L002loop .align 16 .L002loop: movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx bswap %eax movl 12(%edi),%edx bswap %ebx pushl %eax bswap %ecx pushl %ebx bswap %edx pushl %ecx pushl %edx movl 16(%edi),%eax movl 20(%edi),%ebx movl 24(%edi),%ecx bswap %eax movl 28(%edi),%edx bswap %ebx pushl %eax bswap %ecx pushl %ebx bswap %edx pushl %ecx pushl %edx movl 32(%edi),%eax movl 36(%edi),%ebx movl 40(%edi),%ecx bswap %eax movl 44(%edi),%edx bswap %ebx pushl %eax bswap %ecx pushl %ebx bswap %edx pushl %ecx pushl %edx movl 48(%edi),%eax movl 52(%edi),%ebx movl 56(%edi),%ecx bswap %eax movl 60(%edi),%edx bswap %ebx pushl %eax bswap %ecx pushl %ebx bswap %edx pushl %ecx pushl %edx addl $64,%edi leal -36(%esp),%esp movl %edi,104(%esp) movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edi movl %ebx,8(%esp) xorl %ecx,%ebx movl %ecx,12(%esp) movl %edi,16(%esp) movl %ebx,(%esp) movl 16(%esi),%edx movl 20(%esi),%ebx movl 24(%esi),%ecx movl 28(%esi),%edi movl %ebx,24(%esp) movl %ecx,28(%esp) movl %edi,32(%esp) .align 16 -.L00700_15: +.L00800_15: movl %edx,%ecx movl 24(%esp),%esi rorl $14,%ecx movl 28(%esp),%edi xorl %edx,%ecx xorl %edi,%esi movl 96(%esp),%ebx rorl $5,%ecx andl %edx,%esi movl %edx,20(%esp) xorl %ecx,%edx addl 32(%esp),%ebx xorl %edi,%esi rorl $6,%edx movl %eax,%ecx addl %esi,%ebx rorl $9,%ecx addl %edx,%ebx movl 8(%esp),%edi xorl %eax,%ecx movl %eax,4(%esp) leal -4(%esp),%esp rorl $11,%ecx movl (%ebp),%esi xorl %eax,%ecx movl 20(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %esi,%ebx movl %eax,(%esp) addl %ebx,%edx andl 4(%esp),%eax addl %ecx,%ebx xorl %edi,%eax addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne .L00700_15 + jne .L00800_15 movl 156(%esp),%ecx - jmp .L00816_63 + jmp .L00916_63 .align 16 -.L00816_63: 
+.L00916_63: movl %ecx,%ebx movl 104(%esp),%esi rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 160(%esp),%ebx shrl $10,%edi addl 124(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 24(%esp),%esi rorl $14,%ecx addl %edi,%ebx movl 28(%esp),%edi xorl %edx,%ecx xorl %edi,%esi movl %ebx,96(%esp) rorl $5,%ecx andl %edx,%esi movl %edx,20(%esp) xorl %ecx,%edx addl 32(%esp),%ebx xorl %edi,%esi rorl $6,%edx movl %eax,%ecx addl %esi,%ebx rorl $9,%ecx addl %edx,%ebx movl 8(%esp),%edi xorl %eax,%ecx movl %eax,4(%esp) leal -4(%esp),%esp rorl $11,%ecx movl (%ebp),%esi xorl %eax,%ecx movl 20(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %esi,%ebx movl %eax,(%esp) addl %ebx,%edx andl 4(%esp),%eax addl %ecx,%ebx xorl %edi,%eax movl 156(%esp),%ecx addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne .L00816_63 + jne .L00916_63 movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx addl (%esi),%eax addl 4(%esi),%ebx addl 8(%esi),%edi addl 12(%esi),%ecx movl %eax,(%esi) movl %ebx,4(%esi) movl %edi,8(%esi) movl %ecx,12(%esi) movl 24(%esp),%eax movl 28(%esp),%ebx movl 32(%esp),%ecx movl 360(%esp),%edi addl 16(%esi),%edx addl 20(%esi),%eax addl 24(%esi),%ebx addl 28(%esi),%ecx movl %edx,16(%esi) movl %eax,20(%esi) movl %ebx,24(%esi) movl %ecx,28(%esi) leal 356(%esp),%esp subl $256,%ebp cmpl 8(%esp),%edi jb .L002loop movl 12(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret .align 64 .L001K256: .long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 .long 66051,67438087,134810123,202182159 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 .byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .align 16 -.L006unrolled: +.L007unrolled: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebp movl 8(%esi),%ecx movl 12(%esi),%ebx movl %ebp,4(%esp) xorl %ecx,%ebp movl %ecx,8(%esp) movl %ebx,12(%esp) movl 16(%esi),%edx movl 20(%esi),%ebx movl 24(%esi),%ecx movl 28(%esi),%esi movl %ebx,20(%esp) movl %ecx,24(%esp) movl %esi,28(%esp) - jmp .L009grand_loop + jmp .L010grand_loop .align 16 -.L009grand_loop: +.L010grand_loop: movl (%edi),%ebx movl 4(%edi),%ecx bswap %ebx movl 8(%edi),%esi bswap %ecx movl %ebx,32(%esp) bswap %esi movl %ecx,36(%esp) movl %esi,40(%esp) movl 12(%edi),%ebx movl 16(%edi),%ecx bswap %ebx movl 20(%edi),%esi bswap %ecx movl %ebx,44(%esp) bswap %esi movl %ecx,48(%esp) movl %esi,52(%esp) movl 24(%edi),%ebx movl 28(%edi),%ecx bswap %ebx movl 32(%edi),%esi bswap %ecx movl %ebx,56(%esp) bswap %esi movl %ecx,60(%esp) movl %esi,64(%esp) movl 36(%edi),%ebx movl 40(%edi),%ecx bswap %ebx movl 44(%edi),%esi bswap %ecx movl %ebx,68(%esp) bswap %esi movl %ecx,72(%esp) movl %esi,76(%esp) movl 48(%edi),%ebx movl 52(%edi),%ecx bswap %ebx movl 56(%edi),%esi 
bswap %ecx movl %ebx,80(%esp) bswap %esi movl %ecx,84(%esp) movl %esi,88(%esp) movl 60(%edi),%ebx addl $64,%edi bswap %ebx movl %edi,100(%esp) movl %ebx,92(%esp) movl %edx,%ecx movl 20(%esp),%esi rorl $14,%edx movl 24(%esp),%edi xorl %ecx,%edx movl 32(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1116352408(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl 16(%esp),%ecx rorl $14,%edx movl 20(%esp),%edi xorl %esi,%edx movl 36(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1899447441(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl %edx,%ecx movl 12(%esp),%esi rorl $14,%edx movl 16(%esp),%edi xorl %ecx,%edx movl 40(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3049323471(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl 8(%esp),%ecx rorl $14,%edx movl 12(%esp),%edi xorl %esi,%edx movl 44(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3921009573(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl %edx,%ecx movl 4(%esp),%esi rorl $14,%edx movl 8(%esp),%edi xorl %ecx,%edx movl 48(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 961987163(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl (%esp),%ecx rorl $14,%edx movl 4(%esp),%edi xorl %esi,%edx movl 52(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1508970993(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl %edx,%ecx movl 28(%esp),%esi rorl $14,%edx movl (%esp),%edi xorl %ecx,%edx movl 56(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp 
leal 2453635748(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl 24(%esp),%ecx rorl $14,%edx movl 28(%esp),%edi xorl %esi,%edx movl 60(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2870763221(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl %edx,%ecx movl 20(%esp),%esi rorl $14,%edx movl 24(%esp),%edi xorl %ecx,%edx movl 64(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3624381080(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl 16(%esp),%ecx rorl $14,%edx movl 20(%esp),%edi xorl %esi,%edx movl 68(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 310598401(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl %edx,%ecx movl 12(%esp),%esi rorl $14,%edx movl 16(%esp),%edi xorl %ecx,%edx movl 72(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 607225278(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl 8(%esp),%ecx rorl $14,%edx movl 12(%esp),%edi xorl %esi,%edx movl 76(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1426881987(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl %edx,%ecx movl 4(%esp),%esi rorl $14,%edx movl 8(%esp),%edi xorl %ecx,%edx movl 80(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1925078388(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl (%esp),%ecx rorl $14,%edx movl 4(%esp),%edi xorl %esi,%edx movl 84(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2162078206(%ebx,%edx,1),%edx xorl 
%ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl %edx,%ecx movl 28(%esp),%esi rorl $14,%edx movl (%esp),%edi xorl %ecx,%edx movl 88(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2614888103(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl 24(%esp),%ecx rorl $14,%edx movl 28(%esp),%edi xorl %esi,%edx movl 92(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3248222580(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 36(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 88(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 32(%esp),%ebx shrl $10,%edi addl 68(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 20(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 24(%esp),%edi xorl %ecx,%edx movl %ebx,32(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3835390401(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 40(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl 92(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 36(%esp),%ebx shrl $10,%edi addl 72(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 16(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 20(%esp),%edi xorl %esi,%edx movl %ebx,36(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 4022224774(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 44(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl 32(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 40(%esp),%ebx shrl $10,%edi addl 76(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 12(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 16(%esp),%edi xorl %ecx,%edx movl %ebx,40(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 264347078(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 48(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl 36(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx 
rorl $17,%ecx addl 44(%esp),%ebx shrl $10,%edi addl 80(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 8(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 12(%esp),%edi xorl %esi,%edx movl %ebx,44(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 604807628(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 52(%esp),%ecx rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl 40(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 48(%esp),%ebx shrl $10,%edi addl 84(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 4(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 8(%esp),%edi xorl %ecx,%edx movl %ebx,48(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 770255983(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 56(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl 44(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 52(%esp),%ebx shrl $10,%edi addl 88(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl (%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 4(%esp),%edi xorl %esi,%edx movl %ebx,52(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1249150122(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 60(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl 48(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 56(%esp),%ebx shrl $10,%edi addl 92(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 28(%esp),%esi rorl $14,%edx addl %edi,%ebx movl (%esp),%edi xorl %ecx,%edx movl %ebx,56(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1555081692(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 64(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl 52(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 60(%esp),%ebx shrl $10,%edi addl 32(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 24(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 28(%esp),%edi xorl %esi,%edx movl %ebx,60(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 
1996064986(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 68(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 56(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 64(%esp),%ebx shrl $10,%edi addl 36(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 20(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 24(%esp),%edi xorl %ecx,%edx movl %ebx,64(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2554220882(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 72(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl 60(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 68(%esp),%ebx shrl $10,%edi addl 40(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 16(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 20(%esp),%edi xorl %esi,%edx movl %ebx,68(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2821834349(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 76(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl 64(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 72(%esp),%ebx shrl $10,%edi addl 44(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 12(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 16(%esp),%edi xorl %ecx,%edx movl %ebx,72(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2952996808(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 80(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl 68(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 76(%esp),%ebx shrl $10,%edi addl 48(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 8(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 12(%esp),%edi xorl %esi,%edx movl %ebx,76(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3210313671(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 84(%esp),%ecx rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl 72(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 80(%esp),%ebx shrl $10,%edi addl 52(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 4(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 8(%esp),%edi xorl %ecx,%edx movl %ebx,80(%esp) xorl 
%edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3336571891(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 88(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl 76(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 84(%esp),%ebx shrl $10,%edi addl 56(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl (%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 4(%esp),%edi xorl %esi,%edx movl %ebx,84(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3584528711(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 92(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl 80(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 88(%esp),%ebx shrl $10,%edi addl 60(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 28(%esp),%esi rorl $14,%edx addl %edi,%ebx movl (%esp),%edi xorl %ecx,%edx movl %ebx,88(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 113926993(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 32(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl 84(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 92(%esp),%ebx shrl $10,%edi addl 64(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 24(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 28(%esp),%edi xorl %esi,%edx movl %ebx,92(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 338241895(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 36(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 88(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 32(%esp),%ebx shrl $10,%edi addl 68(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 20(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 24(%esp),%edi xorl %ecx,%edx movl %ebx,32(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 666307205(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 40(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl 92(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl 
%ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 36(%esp),%ebx shrl $10,%edi addl 72(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 16(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 20(%esp),%edi xorl %esi,%edx movl %ebx,36(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 773529912(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 44(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl 32(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 40(%esp),%ebx shrl $10,%edi addl 76(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 12(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 16(%esp),%edi xorl %ecx,%edx movl %ebx,40(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1294757372(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 48(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl 36(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 44(%esp),%ebx shrl $10,%edi addl 80(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 8(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 12(%esp),%edi xorl %esi,%edx movl %ebx,44(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1396182291(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 52(%esp),%ecx rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl 40(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 48(%esp),%ebx shrl $10,%edi addl 84(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 4(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 8(%esp),%edi xorl %ecx,%edx movl %ebx,48(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1695183700(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 56(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl 44(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 52(%esp),%ebx shrl $10,%edi addl 88(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl (%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 4(%esp),%edi xorl %esi,%edx movl %ebx,52(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) 
xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1986661051(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 60(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl 48(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 56(%esp),%ebx shrl $10,%edi addl 92(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 28(%esp),%esi rorl $14,%edx addl %edi,%ebx movl (%esp),%edi xorl %ecx,%edx movl %ebx,56(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2177026350(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 64(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl 52(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 60(%esp),%ebx shrl $10,%edi addl 32(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 24(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 28(%esp),%edi xorl %esi,%edx movl %ebx,60(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2456956037(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 68(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 56(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 64(%esp),%ebx shrl $10,%edi addl 36(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 20(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 24(%esp),%edi xorl %ecx,%edx movl %ebx,64(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2730485921(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 72(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl 60(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 68(%esp),%ebx shrl $10,%edi addl 40(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 16(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 20(%esp),%edi xorl %esi,%edx movl %ebx,68(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2820302411(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 76(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl 64(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 72(%esp),%ebx shrl $10,%edi addl 44(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 12(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 
16(%esp),%edi xorl %ecx,%edx movl %ebx,72(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3259730800(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 80(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl 68(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 76(%esp),%ebx shrl $10,%edi addl 48(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 8(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 12(%esp),%edi xorl %esi,%edx movl %ebx,76(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3345764771(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 84(%esp),%ecx rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl 72(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 80(%esp),%ebx shrl $10,%edi addl 52(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 4(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 8(%esp),%edi xorl %ecx,%edx movl %ebx,80(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3516065817(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 88(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl 76(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 84(%esp),%ebx shrl $10,%edi addl 56(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl (%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 4(%esp),%edi xorl %esi,%edx movl %ebx,84(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3600352804(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 92(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl 80(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 88(%esp),%ebx shrl $10,%edi addl 60(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 28(%esp),%esi rorl $14,%edx addl %edi,%ebx movl (%esp),%edi xorl %ecx,%edx movl %ebx,88(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 4094571909(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 32(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl 84(%esp),%ecx movl 
%esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 92(%esp),%ebx shrl $10,%edi addl 64(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 24(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 28(%esp),%edi xorl %esi,%edx movl %ebx,92(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 275423344(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 36(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 88(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 32(%esp),%ebx shrl $10,%edi addl 68(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 20(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 24(%esp),%edi xorl %ecx,%edx movl %ebx,32(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 430227734(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 40(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl 92(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 36(%esp),%ebx shrl $10,%edi addl 72(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 16(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 20(%esp),%edi xorl %esi,%edx movl %ebx,36(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 506948616(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 44(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl 32(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 40(%esp),%ebx shrl $10,%edi addl 76(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 12(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 16(%esp),%edi xorl %ecx,%edx movl %ebx,40(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 659060556(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 48(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl 36(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 44(%esp),%ebx shrl $10,%edi addl 80(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 8(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 12(%esp),%edi xorl %esi,%edx movl %ebx,44(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl 
%ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 883997877(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 52(%esp),%ecx rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl 40(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 48(%esp),%ebx shrl $10,%edi addl 84(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 4(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 8(%esp),%edi xorl %ecx,%edx movl %ebx,48(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 958139571(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 56(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl 44(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 52(%esp),%ebx shrl $10,%edi addl 88(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl (%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 4(%esp),%edi xorl %esi,%edx movl %ebx,52(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1322822218(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 60(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl 48(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 56(%esp),%ebx shrl $10,%edi addl 92(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 28(%esp),%esi rorl $14,%edx addl %edi,%ebx movl (%esp),%edi xorl %ecx,%edx movl %ebx,56(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1537002063(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 64(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl 52(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 60(%esp),%ebx shrl $10,%edi addl 32(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 24(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 28(%esp),%edi xorl %esi,%edx movl %ebx,60(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1747873779(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 68(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 56(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 64(%esp),%ebx shrl $10,%edi addl 36(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 
20(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 24(%esp),%edi xorl %ecx,%edx movl %ebx,64(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1955562222(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 72(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl 60(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 68(%esp),%ebx shrl $10,%edi addl 40(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 16(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 20(%esp),%edi xorl %esi,%edx movl %ebx,68(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2024104815(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 76(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl 64(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 72(%esp),%ebx shrl $10,%edi addl 44(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 12(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 16(%esp),%edi xorl %ecx,%edx movl %ebx,72(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2227730452(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 80(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl 68(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 76(%esp),%ebx shrl $10,%edi addl 48(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 8(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 12(%esp),%edi xorl %esi,%edx movl %ebx,76(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2361852424(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 84(%esp),%ecx rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl 72(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 80(%esp),%ebx shrl $10,%edi addl 52(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 4(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 8(%esp),%edi xorl %ecx,%edx movl %ebx,80(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2428436474(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 88(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 
28(%esp),%edx addl %ecx,%ebp movl 76(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 84(%esp),%ebx shrl $10,%edi addl 56(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl (%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 4(%esp),%edi xorl %esi,%edx movl %ebx,84(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2756734187(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 92(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl 80(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 88(%esp),%ebx shrl $10,%edi addl 60(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 28(%esp),%esi rorl $14,%edx addl %edi,%ebx movl (%esp),%edi xorl %ecx,%edx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3204031479(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 32(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl 84(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 92(%esp),%ebx shrl $10,%edi addl 64(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 24(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 28(%esp),%edi xorl %esi,%edx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3329325298(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 96(%esp),%esi xorl %edi,%ebp movl 12(%esp),%ecx addl (%esi),%eax addl 4(%esi),%ebp addl 8(%esi),%edi addl 12(%esi),%ecx movl %eax,(%esi) movl %ebp,4(%esi) movl %edi,8(%esi) movl %ecx,12(%esi) movl %ebp,4(%esp) xorl %edi,%ebp movl %edi,8(%esp) movl %ecx,12(%esp) movl 20(%esp),%edi movl 24(%esp),%ebx movl 28(%esp),%ecx addl 16(%esi),%edx addl 20(%esi),%edi addl 24(%esi),%ebx addl 28(%esi),%ecx movl %edx,16(%esi) movl %edi,20(%esi) movl %ebx,24(%esi) movl %ecx,28(%esi) movl %edi,20(%esp) movl 100(%esp),%edi movl %ebx,24(%esp) movl %ecx,28(%esp) cmpl 104(%esp),%edi - jb .L009grand_loop + jb .L010grand_loop movl 108(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret .align 32 .L004shaext: subl $32,%esp movdqu (%esi),%xmm1 leal 128(%ebp),%ebp movdqu 16(%esi),%xmm2 movdqa 128(%ebp),%xmm7 pshufd $27,%xmm1,%xmm0 pshufd $177,%xmm1,%xmm1 pshufd $27,%xmm2,%xmm2 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 - jmp .L010loop_shaext + jmp .L011loop_shaext .align 16 -.L010loop_shaext: +.L011loop_shaext: movdqu (%edi),%xmm3 movdqu 16(%edi),%xmm4 movdqu 32(%edi),%xmm5 .byte 102,15,56,0,223 movdqu 48(%edi),%xmm6 movdqa %xmm2,16(%esp) movdqa -128(%ebp),%xmm0 paddd %xmm3,%xmm0 .byte 102,15,56,0,231 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 nop movdqa %xmm1,(%esp) .byte 
15,56,203,202 movdqa -112(%ebp),%xmm0 paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 leal 64(%edi),%edi .byte 15,56,204,220 .byte 15,56,203,202 movdqa -96(%ebp),%xmm0 paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa -80(%ebp),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa -64(%ebp),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa -48(%ebp),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa -32(%ebp),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa -16(%ebp),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa (%ebp),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 16(%ebp),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 32(%ebp),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 48(%ebp),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 64(%ebp),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 80(%ebp),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 paddd %xmm7,%xmm6 movdqa 96(%ebp),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 .byte 15,56,205,245 movdqa 128(%ebp),%xmm7 .byte 15,56,203,202 movdqa 112(%ebp),%xmm0 paddd %xmm6,%xmm0 nop .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 cmpl %edi,%eax nop .byte 15,56,203,202 paddd 16(%esp),%xmm2 paddd (%esp),%xmm1 - jnz .L010loop_shaext + jnz .L011loop_shaext pshufd $177,%xmm2,%xmm2 pshufd $27,%xmm1,%xmm7 pshufd $177,%xmm1,%xmm1 punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 movl 44(%esp),%esp movdqu %xmm1,(%esi) movdqu %xmm2,16(%esi) popl %edi popl %esi popl %ebx popl %ebp ret .align 32 -.L005SSSE3: +.L006SSSE3: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edi movl %ebx,4(%esp) xorl %ecx,%ebx movl %ecx,8(%esp) movl %edi,12(%esp) movl 16(%esi),%edx movl 20(%esi),%edi movl 24(%esi),%ecx movl 28(%esi),%esi movl %edi,20(%esp) movl 
100(%esp),%edi movl %ecx,24(%esp) movl %esi,28(%esp) movdqa 256(%ebp),%xmm7 - jmp .L011grand_ssse3 + jmp .L012grand_ssse3 .align 16 -.L011grand_ssse3: +.L012grand_ssse3: movdqu (%edi),%xmm0 movdqu 16(%edi),%xmm1 movdqu 32(%edi),%xmm2 movdqu 48(%edi),%xmm3 addl $64,%edi .byte 102,15,56,0,199 movl %edi,100(%esp) .byte 102,15,56,0,207 movdqa (%ebp),%xmm4 .byte 102,15,56,0,215 movdqa 16(%ebp),%xmm5 paddd %xmm0,%xmm4 .byte 102,15,56,0,223 movdqa 32(%ebp),%xmm6 paddd %xmm1,%xmm5 movdqa 48(%ebp),%xmm7 movdqa %xmm4,32(%esp) paddd %xmm2,%xmm6 movdqa %xmm5,48(%esp) paddd %xmm3,%xmm7 movdqa %xmm6,64(%esp) movdqa %xmm7,80(%esp) - jmp .L012ssse3_00_47 + jmp .L013ssse3_00_47 .align 16 -.L012ssse3_00_47: +.L013ssse3_00_47: addl $64,%ebp movl %edx,%ecx movdqa %xmm1,%xmm4 rorl $14,%edx movl 20(%esp),%esi movdqa %xmm3,%xmm7 xorl %ecx,%edx movl 24(%esp),%edi .byte 102,15,58,15,224,4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi .byte 102,15,58,15,250,4 movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi movdqa %xmm4,%xmm5 rorl $6,%edx movl %eax,%ecx movdqa %xmm4,%xmm6 addl %edi,%edx movl 4(%esp),%edi psrld $3,%xmm4 movl %eax,%esi rorl $9,%ecx paddd %xmm7,%xmm0 movl %eax,(%esp) xorl %eax,%ecx psrld $7,%xmm6 xorl %edi,%eax addl 28(%esp),%edx rorl $11,%ecx andl %eax,%ebx pshufd $250,%xmm3,%xmm7 xorl %esi,%ecx addl 32(%esp),%edx pslld $14,%xmm5 xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm4 addl %edx,%ebx addl 12(%esp),%edx psrld $11,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm5,%xmm4 movl 16(%esp),%esi xorl %ecx,%edx pslld $11,%xmm5 movl 20(%esp),%edi xorl %edi,%esi rorl $5,%edx pxor %xmm6,%xmm4 andl %ecx,%esi movl %ecx,12(%esp) movdqa %xmm7,%xmm6 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx pxor %xmm5,%xmm4 movl %ebx,%ecx addl %edi,%edx psrld $10,%xmm7 movl (%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm4,%xmm0 movl %ebx,28(%esp) xorl %ebx,%ecx psrlq $17,%xmm6 xorl %edi,%ebx addl 24(%esp),%edx rorl $11,%ecx pxor %xmm6,%xmm7 andl %ebx,%eax xorl %esi,%ecx psrlq $2,%xmm6 addl 36(%esp),%edx xorl %edi,%eax rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%eax addl 8(%esp),%edx pshufd $128,%xmm7,%xmm7 addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 12(%esp),%esi xorl %ecx,%edx movl 16(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi psrldq $8,%xmm7 movl %ecx,8(%esp) xorl %ecx,%edx xorl %esi,%edi paddd %xmm7,%xmm0 rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 28(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,24(%esp) pshufd $80,%xmm0,%xmm7 xorl %eax,%ecx xorl %edi,%eax addl 20(%esp),%edx movdqa %xmm7,%xmm6 rorl $11,%ecx psrld $10,%xmm7 andl %eax,%ebx psrlq $17,%xmm6 xorl %esi,%ecx addl 40(%esp),%edx xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%ebx addl 4(%esp),%edx psrlq $2,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm6,%xmm7 movl 8(%esp),%esi xorl %ecx,%edx movl 12(%esp),%edi pshufd $8,%xmm7,%xmm7 xorl %edi,%esi rorl $5,%edx movdqa (%ebp),%xmm6 andl %ecx,%esi movl %ecx,4(%esp) pslldq $8,%xmm7 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 24(%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm7,%xmm0 movl %ebx,20(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 16(%esp),%edx paddd %xmm0,%xmm6 rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 44(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl (%esp),%edx addl %ecx,%eax movdqa %xmm6,32(%esp) movl %edx,%ecx movdqa %xmm2,%xmm4 rorl $14,%edx movl 4(%esp),%esi movdqa %xmm0,%xmm7 xorl %ecx,%edx movl 8(%esp),%edi .byte 102,15,58,15,225,4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi .byte 102,15,58,15,251,4 movl 
%ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi movdqa %xmm4,%xmm5 rorl $6,%edx movl %eax,%ecx movdqa %xmm4,%xmm6 addl %edi,%edx movl 20(%esp),%edi psrld $3,%xmm4 movl %eax,%esi rorl $9,%ecx paddd %xmm7,%xmm1 movl %eax,16(%esp) xorl %eax,%ecx psrld $7,%xmm6 xorl %edi,%eax addl 12(%esp),%edx rorl $11,%ecx andl %eax,%ebx pshufd $250,%xmm0,%xmm7 xorl %esi,%ecx addl 48(%esp),%edx pslld $14,%xmm5 xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm4 addl %edx,%ebx addl 28(%esp),%edx psrld $11,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm5,%xmm4 movl (%esp),%esi xorl %ecx,%edx pslld $11,%xmm5 movl 4(%esp),%edi xorl %edi,%esi rorl $5,%edx pxor %xmm6,%xmm4 andl %ecx,%esi movl %ecx,28(%esp) movdqa %xmm7,%xmm6 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx pxor %xmm5,%xmm4 movl %ebx,%ecx addl %edi,%edx psrld $10,%xmm7 movl 16(%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm4,%xmm1 movl %ebx,12(%esp) xorl %ebx,%ecx psrlq $17,%xmm6 xorl %edi,%ebx addl 8(%esp),%edx rorl $11,%ecx pxor %xmm6,%xmm7 andl %ebx,%eax xorl %esi,%ecx psrlq $2,%xmm6 addl 52(%esp),%edx xorl %edi,%eax rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%eax addl 24(%esp),%edx pshufd $128,%xmm7,%xmm7 addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 28(%esp),%esi xorl %ecx,%edx movl (%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi psrldq $8,%xmm7 movl %ecx,24(%esp) xorl %ecx,%edx xorl %esi,%edi paddd %xmm7,%xmm1 rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 12(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,8(%esp) pshufd $80,%xmm1,%xmm7 xorl %eax,%ecx xorl %edi,%eax addl 4(%esp),%edx movdqa %xmm7,%xmm6 rorl $11,%ecx psrld $10,%xmm7 andl %eax,%ebx psrlq $17,%xmm6 xorl %esi,%ecx addl 56(%esp),%edx xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%ebx addl 20(%esp),%edx psrlq $2,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm6,%xmm7 movl 24(%esp),%esi xorl %ecx,%edx movl 28(%esp),%edi pshufd $8,%xmm7,%xmm7 xorl %edi,%esi rorl $5,%edx movdqa 16(%ebp),%xmm6 andl %ecx,%esi movl %ecx,20(%esp) pslldq $8,%xmm7 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 8(%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm7,%xmm1 movl %ebx,4(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl (%esp),%edx paddd %xmm1,%xmm6 rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 60(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 16(%esp),%edx addl %ecx,%eax movdqa %xmm6,48(%esp) movl %edx,%ecx movdqa %xmm3,%xmm4 rorl $14,%edx movl 20(%esp),%esi movdqa %xmm1,%xmm7 xorl %ecx,%edx movl 24(%esp),%edi .byte 102,15,58,15,226,4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi .byte 102,15,58,15,248,4 movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi movdqa %xmm4,%xmm5 rorl $6,%edx movl %eax,%ecx movdqa %xmm4,%xmm6 addl %edi,%edx movl 4(%esp),%edi psrld $3,%xmm4 movl %eax,%esi rorl $9,%ecx paddd %xmm7,%xmm2 movl %eax,(%esp) xorl %eax,%ecx psrld $7,%xmm6 xorl %edi,%eax addl 28(%esp),%edx rorl $11,%ecx andl %eax,%ebx pshufd $250,%xmm1,%xmm7 xorl %esi,%ecx addl 64(%esp),%edx pslld $14,%xmm5 xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm4 addl %edx,%ebx addl 12(%esp),%edx psrld $11,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm5,%xmm4 movl 16(%esp),%esi xorl %ecx,%edx pslld $11,%xmm5 movl 20(%esp),%edi xorl %edi,%esi rorl $5,%edx pxor %xmm6,%xmm4 andl %ecx,%esi movl %ecx,12(%esp) movdqa %xmm7,%xmm6 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx pxor %xmm5,%xmm4 movl %ebx,%ecx addl %edi,%edx psrld $10,%xmm7 movl (%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm4,%xmm2 movl %ebx,28(%esp) xorl %ebx,%ecx psrlq $17,%xmm6 xorl %edi,%ebx addl 
24(%esp),%edx rorl $11,%ecx pxor %xmm6,%xmm7 andl %ebx,%eax xorl %esi,%ecx psrlq $2,%xmm6 addl 68(%esp),%edx xorl %edi,%eax rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%eax addl 8(%esp),%edx pshufd $128,%xmm7,%xmm7 addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 12(%esp),%esi xorl %ecx,%edx movl 16(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi psrldq $8,%xmm7 movl %ecx,8(%esp) xorl %ecx,%edx xorl %esi,%edi paddd %xmm7,%xmm2 rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 28(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,24(%esp) pshufd $80,%xmm2,%xmm7 xorl %eax,%ecx xorl %edi,%eax addl 20(%esp),%edx movdqa %xmm7,%xmm6 rorl $11,%ecx psrld $10,%xmm7 andl %eax,%ebx psrlq $17,%xmm6 xorl %esi,%ecx addl 72(%esp),%edx xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%ebx addl 4(%esp),%edx psrlq $2,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm6,%xmm7 movl 8(%esp),%esi xorl %ecx,%edx movl 12(%esp),%edi pshufd $8,%xmm7,%xmm7 xorl %edi,%esi rorl $5,%edx movdqa 32(%ebp),%xmm6 andl %ecx,%esi movl %ecx,4(%esp) pslldq $8,%xmm7 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 24(%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm7,%xmm2 movl %ebx,20(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 16(%esp),%edx paddd %xmm2,%xmm6 rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 76(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl (%esp),%edx addl %ecx,%eax movdqa %xmm6,64(%esp) movl %edx,%ecx movdqa %xmm0,%xmm4 rorl $14,%edx movl 4(%esp),%esi movdqa %xmm2,%xmm7 xorl %ecx,%edx movl 8(%esp),%edi .byte 102,15,58,15,227,4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi .byte 102,15,58,15,249,4 movl %ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi movdqa %xmm4,%xmm5 rorl $6,%edx movl %eax,%ecx movdqa %xmm4,%xmm6 addl %edi,%edx movl 20(%esp),%edi psrld $3,%xmm4 movl %eax,%esi rorl $9,%ecx paddd %xmm7,%xmm3 movl %eax,16(%esp) xorl %eax,%ecx psrld $7,%xmm6 xorl %edi,%eax addl 12(%esp),%edx rorl $11,%ecx andl %eax,%ebx pshufd $250,%xmm2,%xmm7 xorl %esi,%ecx addl 80(%esp),%edx pslld $14,%xmm5 xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm4 addl %edx,%ebx addl 28(%esp),%edx psrld $11,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm5,%xmm4 movl (%esp),%esi xorl %ecx,%edx pslld $11,%xmm5 movl 4(%esp),%edi xorl %edi,%esi rorl $5,%edx pxor %xmm6,%xmm4 andl %ecx,%esi movl %ecx,28(%esp) movdqa %xmm7,%xmm6 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx pxor %xmm5,%xmm4 movl %ebx,%ecx addl %edi,%edx psrld $10,%xmm7 movl 16(%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm4,%xmm3 movl %ebx,12(%esp) xorl %ebx,%ecx psrlq $17,%xmm6 xorl %edi,%ebx addl 8(%esp),%edx rorl $11,%ecx pxor %xmm6,%xmm7 andl %ebx,%eax xorl %esi,%ecx psrlq $2,%xmm6 addl 84(%esp),%edx xorl %edi,%eax rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%eax addl 24(%esp),%edx pshufd $128,%xmm7,%xmm7 addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 28(%esp),%esi xorl %ecx,%edx movl (%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi psrldq $8,%xmm7 movl %ecx,24(%esp) xorl %ecx,%edx xorl %esi,%edi paddd %xmm7,%xmm3 rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 12(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,8(%esp) pshufd $80,%xmm3,%xmm7 xorl %eax,%ecx xorl %edi,%eax addl 4(%esp),%edx movdqa %xmm7,%xmm6 rorl $11,%ecx psrld $10,%xmm7 andl %eax,%ebx psrlq $17,%xmm6 xorl %esi,%ecx addl 88(%esp),%edx xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%ebx addl 20(%esp),%edx psrlq $2,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm6,%xmm7 movl 24(%esp),%esi xorl %ecx,%edx movl 28(%esp),%edi pshufd 
$8,%xmm7,%xmm7 xorl %edi,%esi rorl $5,%edx movdqa 48(%ebp),%xmm6 andl %ecx,%esi movl %ecx,20(%esp) pslldq $8,%xmm7 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 8(%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm7,%xmm3 movl %ebx,4(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl (%esp),%edx paddd %xmm3,%xmm6 rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 92(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 16(%esp),%edx addl %ecx,%eax movdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne .L012ssse3_00_47 + jne .L013ssse3_00_47 movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi xorl %ecx,%edx movl 24(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 4(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,(%esp) xorl %eax,%ecx xorl %edi,%eax addl 28(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 32(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 12(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl 16(%esp),%esi xorl %ecx,%edx movl 20(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,12(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl (%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,28(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 24(%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 36(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 8(%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 12(%esp),%esi xorl %ecx,%edx movl 16(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 28(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,24(%esp) xorl %eax,%ecx xorl %edi,%eax addl 20(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 40(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 4(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl 8(%esp),%esi xorl %ecx,%edx movl 12(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,4(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 24(%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,20(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 16(%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 44(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl (%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 4(%esp),%esi xorl %ecx,%edx movl 8(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 20(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,16(%esp) xorl %eax,%ecx xorl %edi,%eax addl 12(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 48(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 28(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl (%esp),%esi xorl %ecx,%edx movl 4(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,28(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 16(%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,12(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 8(%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 52(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 24(%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 28(%esp),%esi xorl %ecx,%edx movl (%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx xorl %esi,%edi rorl 
$6,%edx movl %eax,%ecx addl %edi,%edx movl 12(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,8(%esp) xorl %eax,%ecx xorl %edi,%eax addl 4(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 56(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 20(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl 24(%esp),%esi xorl %ecx,%edx movl 28(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,20(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 8(%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,4(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl (%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 60(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 16(%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi xorl %ecx,%edx movl 24(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 4(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,(%esp) xorl %eax,%ecx xorl %edi,%eax addl 28(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 64(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 12(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl 16(%esp),%esi xorl %ecx,%edx movl 20(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,12(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl (%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,28(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 24(%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 68(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 8(%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 12(%esp),%esi xorl %ecx,%edx movl 16(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 28(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,24(%esp) xorl %eax,%ecx xorl %edi,%eax addl 20(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 72(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 4(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl 8(%esp),%esi xorl %ecx,%edx movl 12(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,4(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 24(%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,20(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 16(%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 76(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl (%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 4(%esp),%esi xorl %ecx,%edx movl 8(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 20(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,16(%esp) xorl %eax,%ecx xorl %edi,%eax addl 12(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 80(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 28(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl (%esp),%esi xorl %ecx,%edx movl 4(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,28(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 16(%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,12(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 8(%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 84(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 
24(%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 28(%esp),%esi xorl %ecx,%edx movl (%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 12(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,8(%esp) xorl %eax,%ecx xorl %edi,%eax addl 4(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 88(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 20(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl 24(%esp),%esi xorl %ecx,%edx movl 28(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,20(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 8(%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,4(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl (%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 92(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 16(%esp),%edx addl %ecx,%eax movl 96(%esp),%esi xorl %edi,%ebx movl 12(%esp),%ecx addl (%esi),%eax addl 4(%esi),%ebx addl 8(%esi),%edi addl 12(%esi),%ecx movl %eax,(%esi) movl %ebx,4(%esi) movl %edi,8(%esi) movl %ecx,12(%esi) movl %ebx,4(%esp) xorl %edi,%ebx movl %edi,8(%esp) movl %ecx,12(%esp) movl 20(%esp),%edi movl 24(%esp),%ecx addl 16(%esi),%edx addl 20(%esi),%edi addl 24(%esi),%ecx movl %edx,16(%esi) movl %edi,20(%esi) movl %edi,20(%esp) movl 28(%esp),%edi movl %ecx,24(%esi) addl 28(%esi),%edi movl %ecx,24(%esp) movl %edi,28(%esi) movl %edi,28(%esp) movl 100(%esp),%edi movdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - jb .L011grand_ssse3 + jb .L012grand_ssse3 movl 108(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret +.align 32 +.L005AVX: + andl $264,%edx + cmpl $264,%edx + je .L014AVX_BMI + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L015grand_avx +.align 32 +.L015grand_avx: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L016avx_00_47 +.align 16 +.L016avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl 
%ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd 
%xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + 
movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + 
shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L016avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl 
%eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + 
shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl 
%edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L015grand_avx + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L014AVX_BMI: + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L017grand_avx_bmi +.align 32 +.L017grand_avx_bmi: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L018avx_bmi_00_47 +.align 16 +.L018avx_bmi_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 28(%esp),%edx + andl %eax,%ebx + addl 32(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 24(%esp),%edx + andl %ebx,%eax + addl 36(%esp),%edx + vpaddd %xmm4,%xmm0,%xmm0 + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm0,%xmm0 + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm0,%xmm7 + addl 20(%esp),%edx + andl %eax,%ebx + addl 40(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq 
$19,%xmm7,%xmm7 + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + addl 16(%esp),%edx + andl %ebx,%eax + addl 44(%esp),%edx + vpaddd (%ebp),%xmm0,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 12(%esp),%edx + andl %eax,%ebx + addl 48(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 8(%esp),%edx + andl %ebx,%eax + addl 52(%esp),%edx + vpaddd %xmm4,%xmm1,%xmm1 + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm1,%xmm1 + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm1,%xmm7 + addl 4(%esp),%edx + andl %eax,%ebx + addl 56(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + addl (%esp),%edx + andl %ebx,%eax + addl 60(%esp),%edx + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + 
vpalignr $4,%xmm0,%xmm1,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 28(%esp),%edx + andl %eax,%ebx + addl 64(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 24(%esp),%edx + andl %ebx,%eax + addl 68(%esp),%edx + vpaddd %xmm4,%xmm2,%xmm2 + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm2,%xmm2 + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm2,%xmm7 + addl 20(%esp),%edx + andl %eax,%ebx + addl 72(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + addl 16(%esp),%edx + andl %ebx,%eax + addl 76(%esp),%edx + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 12(%esp),%edx + andl %eax,%ebx + addl 80(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx 
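For orientation in the .L018avx_bmi_00_47 hunk above: the rorxl $6/$11/$25 and rorxl $2/$13/$22 triples are the Sigma1/Sigma0 rotations of the standard SHA-256 round, andl/andnl assemble the Ch term, and the round constants have already been folded into the message words by the vpaddd/vmovdqa spills to 32(%esp)..80(%esp), so each round only adds one stack word. Below is a minimal C sketch of the scalar round those sequences encode; rotr32, sha256_round and the k_plus_w parameter are illustrative names of mine, not anything taken from this file.

#include <stdint.h>

/* One SHA-256 compression round, as a plain-C reference for the rorx/andn
 * sequences above.  Sketch only: s[] holds the working variables a..h. */
static uint32_t rotr32(uint32_t x, unsigned n)
{
    return (x >> n) | (x << (32 - n));
}

void sha256_round(uint32_t s[8], uint32_t k_plus_w)   /* k_plus_w = K[t] + W[t] */
{
    uint32_t a = s[0], b = s[1], c = s[2], d = s[3];
    uint32_t e = s[4], f = s[5], g = s[6], h = s[7];

    uint32_t S1  = rotr32(e, 6) ^ rotr32(e, 11) ^ rotr32(e, 25); /* rorxl $6/$11/$25 */
    uint32_t ch  = (e & f) ^ (~e & g);                           /* andl + andnl     */
    uint32_t T1  = h + S1 + ch + k_plus_w;                       /* addl 32(%esp)... */
    uint32_t S0  = rotr32(a, 2) ^ rotr32(a, 13) ^ rotr32(a, 22); /* rorxl $2/$13/$22 */
    uint32_t maj = (a & b) ^ (a & c) ^ (b & c);
    uint32_t T2  = S0 + maj;

    s[7] = g; s[6] = f; s[5] = e; s[4] = d + T1;
    s[3] = c; s[2] = b; s[1] = a; s[0] = T1 + T2;
}

rorx writes a non-destructive destination and leaves the flags alone, which is what allows this path to drop much of the movl/xorl shuffling seen in the plain AVX rounds.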
+ rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 8(%esp),%edx + andl %ebx,%eax + addl 84(%esp),%edx + vpaddd %xmm4,%xmm3,%xmm3 + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm3,%xmm3 + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm3,%xmm7 + addl 4(%esp),%edx + andl %eax,%ebx + addl 88(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + addl (%esp),%edx + andl %ebx,%eax + addl 92(%esp),%edx + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L018avx_bmi_00_47 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + andl %eax,%ebx + addl 32(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + andl %ebx,%eax + addl 36(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 
20(%esp),%edx + andl %eax,%ebx + addl 40(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + andl %ebx,%eax + addl 44(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + andl %eax,%ebx + addl 48(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + andl %ebx,%eax + addl 52(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + andl %eax,%ebx + addl 56(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl (%esp),%edx + andl %ebx,%eax + addl 60(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + andl %eax,%ebx + addl 64(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + andl 
%ebx,%eax + addl 68(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + andl %eax,%ebx + addl 72(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + andl %ebx,%eax + addl 76(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + andl %eax,%ebx + addl 80(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + andl %ebx,%eax + addl 84(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + andl %eax,%ebx + addl 88(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl (%esp),%edx + andl %ebx,%eax + addl 92(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl 
%edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L017grand_avx_bmi + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret .size sha256_block_data_order,.-.L_sha256_block_data_order_begin .comm OPENSSL_ia32cap_P,16,4 #else .text .globl sha256_block_data_order .type sha256_block_data_order,@function .align 16 sha256_block_data_order: .L_sha256_block_data_order_begin: pushl %ebp pushl %ebx pushl %esi pushl %edi movl 20(%esp),%esi movl 24(%esp),%edi movl 28(%esp),%eax movl %esp,%ebx call .L000pic_point .L000pic_point: popl %ebp leal .L001K256-.L000pic_point(%ebp),%ebp subl $16,%esp andl $-64,%esp shll $6,%eax addl %edi,%eax movl %esi,(%esp) movl %edi,4(%esp) movl %eax,8(%esp) movl %ebx,12(%esp) leal OPENSSL_ia32cap_P,%edx movl (%edx),%ecx movl 4(%edx),%ebx testl $1048576,%ecx jnz .L002loop movl 8(%edx),%edx testl $16777216,%ecx jz .L003no_xmm andl $1073741824,%ecx andl $268435968,%ebx testl $536870912,%edx jnz .L004shaext orl %ebx,%ecx andl $1342177280,%ecx cmpl $1342177280,%ecx + je .L005AVX testl $512,%ebx - jnz .L005SSSE3 + jnz .L006SSSE3 .L003no_xmm: subl %edi,%eax cmpl $256,%eax - jae .L006unrolled + jae .L007unrolled jmp .L002loop .align 16 .L002loop: movl (%edi),%eax movl 4(%edi),%ebx movl 8(%edi),%ecx bswap %eax movl 12(%edi),%edx bswap %ebx pushl %eax bswap %ecx pushl %ebx bswap %edx pushl %ecx pushl %edx movl 16(%edi),%eax movl 20(%edi),%ebx movl 24(%edi),%ecx bswap %eax movl 28(%edi),%edx bswap %ebx pushl %eax bswap %ecx pushl %ebx bswap %edx pushl %ecx pushl %edx movl 32(%edi),%eax movl 36(%edi),%ebx movl 40(%edi),%ecx bswap %eax movl 44(%edi),%edx bswap %ebx pushl %eax bswap %ecx pushl %ebx bswap %edx pushl %ecx pushl %edx movl 48(%edi),%eax movl 52(%edi),%ebx movl 56(%edi),%ecx bswap %eax movl 60(%edi),%edx bswap %ebx pushl %eax bswap %ecx pushl %ebx bswap %edx pushl %ecx pushl %edx addl $64,%edi leal -36(%esp),%esp movl %edi,104(%esp) movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edi movl %ebx,8(%esp) xorl %ecx,%ebx movl %ecx,12(%esp) movl %edi,16(%esp) movl %ebx,(%esp) movl 16(%esi),%edx movl 20(%esi),%ebx movl 24(%esi),%ecx movl 28(%esi),%edi movl %ebx,24(%esp) movl %ecx,28(%esp) movl %edi,32(%esp) .align 16 -.L00700_15: +.L00800_15: movl %edx,%ecx movl 24(%esp),%esi rorl $14,%ecx movl 28(%esp),%edi xorl %edx,%ecx xorl %edi,%esi movl 96(%esp),%ebx rorl $5,%ecx andl %edx,%esi movl %edx,20(%esp) xorl %ecx,%edx addl 32(%esp),%ebx xorl %edi,%esi rorl $6,%edx movl %eax,%ecx addl %esi,%ebx rorl $9,%ecx addl %edx,%ebx movl 8(%esp),%edi xorl %eax,%ecx movl %eax,4(%esp) leal -4(%esp),%esp rorl $11,%ecx movl (%ebp),%esi xorl %eax,%ecx movl 20(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %esi,%ebx movl %eax,(%esp) addl %ebx,%edx andl 4(%esp),%eax addl %ecx,%ebx xorl %edi,%eax addl $4,%ebp addl %ebx,%eax cmpl $3248222580,%esi - jne .L00700_15 + jne .L00800_15 movl 156(%esp),%ecx - jmp .L00816_63 + jmp .L00916_63 .align 16 -.L00816_63: +.L00916_63: movl %ecx,%ebx movl 104(%esp),%esi rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 160(%esp),%ebx shrl $10,%edi addl 124(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 24(%esp),%esi rorl $14,%ecx addl %edi,%ebx movl 28(%esp),%edi xorl %edx,%ecx xorl %edi,%esi movl %ebx,96(%esp) rorl $5,%ecx 
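The scalar .L00916_63 loop here is the SHA-256 message-schedule expansion: the rorl $11 + rorl $7 / shrl $3 chain computes sigma0 = rotr7 ^ rotr18 ^ shr3 (two rotations folded through one xor), and the rorl $2 + rorl $17 / shrl $10 chain computes sigma1 = rotr17 ^ rotr19 ^ shr10, with the schedule words held on the stack. A hedged C sketch of the recurrence one iteration produces; rotr32 and sha256_schedule are illustrative names, not taken from this file.

#include <stdint.h>

/* W[t] for 16 <= t < 64, matching the rorl/shrl pairs in the loop above. */
static uint32_t rotr32(uint32_t x, unsigned n)
{
    return (x >> n) | (x << (32 - n));
}

uint32_t sha256_schedule(const uint32_t W[], unsigned t)
{
    uint32_t s0 = rotr32(W[t-15], 7)  ^ rotr32(W[t-15], 18) ^ (W[t-15] >> 3);
    uint32_t s1 = rotr32(W[t-2], 17)  ^ rotr32(W[t-2], 19)  ^ (W[t-2] >> 10);
    return W[t-16] + s0 + W[t-7] + s1;
}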
andl %edx,%esi movl %edx,20(%esp) xorl %ecx,%edx addl 32(%esp),%ebx xorl %edi,%esi rorl $6,%edx movl %eax,%ecx addl %esi,%ebx rorl $9,%ecx addl %edx,%ebx movl 8(%esp),%edi xorl %eax,%ecx movl %eax,4(%esp) leal -4(%esp),%esp rorl $11,%ecx movl (%ebp),%esi xorl %eax,%ecx movl 20(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %esi,%ebx movl %eax,(%esp) addl %ebx,%edx andl 4(%esp),%eax addl %ecx,%ebx xorl %edi,%eax movl 156(%esp),%ecx addl $4,%ebp addl %ebx,%eax cmpl $3329325298,%esi - jne .L00816_63 + jne .L00916_63 movl 356(%esp),%esi movl 8(%esp),%ebx movl 16(%esp),%ecx addl (%esi),%eax addl 4(%esi),%ebx addl 8(%esi),%edi addl 12(%esi),%ecx movl %eax,(%esi) movl %ebx,4(%esi) movl %edi,8(%esi) movl %ecx,12(%esi) movl 24(%esp),%eax movl 28(%esp),%ebx movl 32(%esp),%ecx movl 360(%esp),%edi addl 16(%esi),%edx addl 20(%esi),%eax addl 24(%esi),%ebx addl 28(%esi),%ecx movl %edx,16(%esi) movl %eax,20(%esi) movl %ebx,24(%esi) movl %ecx,28(%esi) leal 356(%esp),%esp subl $256,%ebp cmpl 8(%esp),%edi jb .L002loop movl 12(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret .align 64 .L001K256: .long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 .long 66051,67438087,134810123,202182159 .byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 .byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 .byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 .byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 .byte 62,0 .align 16 -.L006unrolled: +.L007unrolled: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebp movl 8(%esi),%ecx movl 12(%esi),%ebx movl %ebp,4(%esp) xorl %ecx,%ebp movl %ecx,8(%esp) movl %ebx,12(%esp) movl 16(%esi),%edx movl 20(%esi),%ebx movl 24(%esi),%ecx movl 28(%esi),%esi movl %ebx,20(%esp) movl %ecx,24(%esp) movl %esi,28(%esp) - jmp .L009grand_loop + jmp .L010grand_loop .align 16 -.L009grand_loop: +.L010grand_loop: movl (%edi),%ebx movl 4(%edi),%ecx bswap %ebx movl 8(%edi),%esi bswap %ecx movl %ebx,32(%esp) bswap %esi movl %ecx,36(%esp) movl %esi,40(%esp) movl 12(%edi),%ebx movl 16(%edi),%ecx bswap %ebx movl 20(%edi),%esi bswap %ecx movl %ebx,44(%esp) bswap %esi movl %ecx,48(%esp) movl %esi,52(%esp) movl 24(%edi),%ebx movl 28(%edi),%ecx bswap %ebx movl 32(%edi),%esi bswap %ecx movl %ebx,56(%esp) bswap %esi movl %ecx,60(%esp) movl %esi,64(%esp) movl 36(%edi),%ebx movl 40(%edi),%ecx bswap %ebx movl 44(%edi),%esi bswap %ecx movl %ebx,68(%esp) bswap %esi movl %ecx,72(%esp) movl %esi,76(%esp) movl 48(%edi),%ebx movl 52(%edi),%ecx bswap %ebx movl 56(%edi),%esi bswap %ecx movl %ebx,80(%esp) bswap %esi movl %ecx,84(%esp) movl %esi,88(%esp) movl 60(%edi),%ebx addl $64,%edi bswap %ebx movl %edi,100(%esp) movl %ebx,92(%esp) movl %edx,%ecx movl 20(%esp),%esi rorl $14,%edx movl 24(%esp),%edi xorl %ecx,%edx movl 32(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx 
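Past the .L001K256 table, the unrolled .L010grand_loop path that begins in this hunk byte-swaps the sixteen input words onto the stack, runs the 64 rounds with each constant folded in via leal K(%ebx,%edx,1), and adds the working variables back into the hash state before advancing to the next block. A compact C sketch of just that per-block framing; load_be32, sha256_blocks and the compress64 callback are placeholder names of mine, and the 64-round body itself is not repeated here.

#include <stddef.h>
#include <stdint.h>

static uint32_t load_be32(const uint8_t *p)            /* the bswap loads above */
{
    return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
           ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

void sha256_blocks(uint32_t state[8], const uint8_t *in, size_t nblocks,
                   void (*compress64)(uint32_t s[8], const uint32_t W16[16]))
{
    while (nblocks--) {
        uint32_t W16[16], s[8];
        for (int t = 0; t < 16; t++)
            W16[t] = load_be32(in + 4 * t);
        for (int i = 0; i < 8; i++)
            s[i] = state[i];
        compress64(s, W16);                 /* 64 rounds, K256[t] folded into each */
        for (int i = 0; i < 8; i++)
            state[i] += s[i];               /* feed-forward into the hash state */
        in += 64;
    }
}

Passing the round body as a callback keeps the sketch independent of which of the code paths in this file (unrolled, SSSE3, AVX, AVX+BMI) actually supplies it.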
movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1116352408(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl 16(%esp),%ecx rorl $14,%edx movl 20(%esp),%edi xorl %esi,%edx movl 36(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1899447441(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl %edx,%ecx movl 12(%esp),%esi rorl $14,%edx movl 16(%esp),%edi xorl %ecx,%edx movl 40(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3049323471(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl 8(%esp),%ecx rorl $14,%edx movl 12(%esp),%edi xorl %esi,%edx movl 44(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3921009573(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl %edx,%ecx movl 4(%esp),%esi rorl $14,%edx movl 8(%esp),%edi xorl %ecx,%edx movl 48(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 961987163(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl (%esp),%ecx rorl $14,%edx movl 4(%esp),%edi xorl %esi,%edx movl 52(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1508970993(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl %edx,%ecx movl 28(%esp),%esi rorl $14,%edx movl (%esp),%edi xorl %ecx,%edx movl 56(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2453635748(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl 24(%esp),%ecx rorl $14,%edx movl 28(%esp),%edi xorl %esi,%edx movl 60(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl 
$9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2870763221(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl %edx,%ecx movl 20(%esp),%esi rorl $14,%edx movl 24(%esp),%edi xorl %ecx,%edx movl 64(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3624381080(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl 16(%esp),%ecx rorl $14,%edx movl 20(%esp),%edi xorl %esi,%edx movl 68(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 310598401(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl %edx,%ecx movl 12(%esp),%esi rorl $14,%edx movl 16(%esp),%edi xorl %ecx,%edx movl 72(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 607225278(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl 8(%esp),%ecx rorl $14,%edx movl 12(%esp),%edi xorl %esi,%edx movl 76(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1426881987(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl %edx,%ecx movl 4(%esp),%esi rorl $14,%edx movl 8(%esp),%edi xorl %ecx,%edx movl 80(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1925078388(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl (%esp),%ecx rorl $14,%edx movl 4(%esp),%edi xorl %esi,%edx movl 84(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2162078206(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl %edx,%ecx movl 28(%esp),%esi rorl $14,%edx movl (%esp),%edi xorl %ecx,%edx movl 88(%esp),%ebx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 
12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2614888103(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl %edx,%esi movl 24(%esp),%ecx rorl $14,%edx movl 28(%esp),%edi xorl %esi,%edx movl 92(%esp),%ebx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3248222580(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 36(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 88(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 32(%esp),%ebx shrl $10,%edi addl 68(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 20(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 24(%esp),%edi xorl %ecx,%edx movl %ebx,32(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3835390401(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 40(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl 92(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 36(%esp),%ebx shrl $10,%edi addl 72(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 16(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 20(%esp),%edi xorl %esi,%edx movl %ebx,36(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 4022224774(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 44(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl 32(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 40(%esp),%ebx shrl $10,%edi addl 76(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 12(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 16(%esp),%edi xorl %ecx,%edx movl %ebx,40(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 264347078(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 48(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl 36(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 44(%esp),%ebx shrl $10,%edi addl 80(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 8(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 12(%esp),%edi xorl %esi,%edx movl %ebx,44(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl 
%ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 604807628(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 52(%esp),%ecx rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl 40(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 48(%esp),%ebx shrl $10,%edi addl 84(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 4(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 8(%esp),%edi xorl %ecx,%edx movl %ebx,48(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 770255983(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 56(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl 44(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 52(%esp),%ebx shrl $10,%edi addl 88(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl (%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 4(%esp),%edi xorl %esi,%edx movl %ebx,52(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1249150122(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 60(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl 48(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 56(%esp),%ebx shrl $10,%edi addl 92(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 28(%esp),%esi rorl $14,%edx addl %edi,%ebx movl (%esp),%edi xorl %ecx,%edx movl %ebx,56(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1555081692(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 64(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl 52(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 60(%esp),%ebx shrl $10,%edi addl 32(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 24(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 28(%esp),%edi xorl %esi,%edx movl %ebx,60(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1996064986(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 68(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 56(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 64(%esp),%ebx shrl $10,%edi addl 36(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 
20(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 24(%esp),%edi xorl %ecx,%edx movl %ebx,64(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2554220882(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 72(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl 60(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 68(%esp),%ebx shrl $10,%edi addl 40(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 16(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 20(%esp),%edi xorl %esi,%edx movl %ebx,68(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2821834349(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 76(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl 64(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 72(%esp),%ebx shrl $10,%edi addl 44(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 12(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 16(%esp),%edi xorl %ecx,%edx movl %ebx,72(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2952996808(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 80(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl 68(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 76(%esp),%ebx shrl $10,%edi addl 48(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 8(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 12(%esp),%edi xorl %esi,%edx movl %ebx,76(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3210313671(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 84(%esp),%ecx rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl 72(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 80(%esp),%ebx shrl $10,%edi addl 52(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 4(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 8(%esp),%edi xorl %ecx,%edx movl %ebx,80(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3336571891(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 88(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 
28(%esp),%edx addl %ecx,%ebp movl 76(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 84(%esp),%ebx shrl $10,%edi addl 56(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl (%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 4(%esp),%edi xorl %esi,%edx movl %ebx,84(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3584528711(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 92(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl 80(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 88(%esp),%ebx shrl $10,%edi addl 60(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 28(%esp),%esi rorl $14,%edx addl %edi,%ebx movl (%esp),%edi xorl %ecx,%edx movl %ebx,88(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 113926993(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 32(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl 84(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 92(%esp),%ebx shrl $10,%edi addl 64(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 24(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 28(%esp),%edi xorl %esi,%edx movl %ebx,92(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 338241895(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 36(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 88(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 32(%esp),%ebx shrl $10,%edi addl 68(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 20(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 24(%esp),%edi xorl %ecx,%edx movl %ebx,32(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 666307205(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 40(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl 92(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 36(%esp),%ebx shrl $10,%edi addl 72(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 16(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 20(%esp),%edi xorl %esi,%edx movl %ebx,36(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx 
movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 773529912(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 44(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl 32(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 40(%esp),%ebx shrl $10,%edi addl 76(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 12(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 16(%esp),%edi xorl %ecx,%edx movl %ebx,40(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1294757372(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 48(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl 36(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 44(%esp),%ebx shrl $10,%edi addl 80(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 8(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 12(%esp),%edi xorl %esi,%edx movl %ebx,44(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1396182291(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 52(%esp),%ecx rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl 40(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 48(%esp),%ebx shrl $10,%edi addl 84(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 4(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 8(%esp),%edi xorl %ecx,%edx movl %ebx,48(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1695183700(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 56(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl 44(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 52(%esp),%ebx shrl $10,%edi addl 88(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl (%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 4(%esp),%edi xorl %esi,%edx movl %ebx,52(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1986661051(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 60(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl 48(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 56(%esp),%ebx shrl $10,%edi addl 
92(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 28(%esp),%esi rorl $14,%edx addl %edi,%ebx movl (%esp),%edi xorl %ecx,%edx movl %ebx,56(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2177026350(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 64(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl 52(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 60(%esp),%ebx shrl $10,%edi addl 32(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 24(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 28(%esp),%edi xorl %esi,%edx movl %ebx,60(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2456956037(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 68(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 56(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 64(%esp),%ebx shrl $10,%edi addl 36(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 20(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 24(%esp),%edi xorl %ecx,%edx movl %ebx,64(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2730485921(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 72(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl 60(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 68(%esp),%ebx shrl $10,%edi addl 40(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 16(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 20(%esp),%edi xorl %esi,%edx movl %ebx,68(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2820302411(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 76(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl 64(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 72(%esp),%ebx shrl $10,%edi addl 44(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 12(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 16(%esp),%edi xorl %ecx,%edx movl %ebx,72(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3259730800(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 
80(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl 68(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 76(%esp),%ebx shrl $10,%edi addl 48(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 8(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 12(%esp),%edi xorl %esi,%edx movl %ebx,76(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3345764771(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 84(%esp),%ecx rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl 72(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 80(%esp),%ebx shrl $10,%edi addl 52(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 4(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 8(%esp),%edi xorl %ecx,%edx movl %ebx,80(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3516065817(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 88(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl 76(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 84(%esp),%ebx shrl $10,%edi addl 56(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl (%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 4(%esp),%edi xorl %esi,%edx movl %ebx,84(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3600352804(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 92(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl 80(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 88(%esp),%ebx shrl $10,%edi addl 60(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 28(%esp),%esi rorl $14,%edx addl %edi,%ebx movl (%esp),%edi xorl %ecx,%edx movl %ebx,88(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 4094571909(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 32(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl 84(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 92(%esp),%ebx shrl $10,%edi addl 64(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 24(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 28(%esp),%edi xorl %esi,%edx movl %ebx,92(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx 
addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 275423344(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 36(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 88(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 32(%esp),%ebx shrl $10,%edi addl 68(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 20(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 24(%esp),%edi xorl %ecx,%edx movl %ebx,32(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 430227734(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 40(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl 92(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 36(%esp),%ebx shrl $10,%edi addl 72(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 16(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 20(%esp),%edi xorl %esi,%edx movl %ebx,36(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 506948616(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 44(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl 32(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 40(%esp),%ebx shrl $10,%edi addl 76(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 12(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 16(%esp),%edi xorl %ecx,%edx movl %ebx,40(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 659060556(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 48(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl 36(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 44(%esp),%ebx shrl $10,%edi addl 80(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 8(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 12(%esp),%edi xorl %esi,%edx movl %ebx,44(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 883997877(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 52(%esp),%ecx rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl 40(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl 
$17,%esi addl 48(%esp),%ebx shrl $10,%edi addl 84(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 4(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 8(%esp),%edi xorl %ecx,%edx movl %ebx,48(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 958139571(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 56(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl 44(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 52(%esp),%ebx shrl $10,%edi addl 88(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl (%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 4(%esp),%edi xorl %esi,%edx movl %ebx,52(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1322822218(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 60(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl 48(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 56(%esp),%ebx shrl $10,%edi addl 92(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 28(%esp),%esi rorl $14,%edx addl %edi,%ebx movl (%esp),%edi xorl %ecx,%edx movl %ebx,56(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 1537002063(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 64(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl 52(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 60(%esp),%ebx shrl $10,%edi addl 32(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 24(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 28(%esp),%edi xorl %esi,%edx movl %ebx,60(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 1747873779(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 68(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 56(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 64(%esp),%ebx shrl $10,%edi addl 36(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 20(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 24(%esp),%edi xorl %ecx,%edx movl %ebx,64(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx addl 28(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 4(%esp),%edi xorl %eax,%ecx movl %eax,(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 
1955562222(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 72(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 12(%esp),%edx addl %ecx,%ebp movl 60(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 68(%esp),%ebx shrl $10,%edi addl 40(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 16(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 20(%esp),%edi xorl %esi,%edx movl %ebx,68(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,12(%esp) xorl %esi,%edx addl 24(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl (%esp),%edi xorl %ebp,%esi movl %ebp,28(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2024104815(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 76(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 8(%esp),%edx addl %esi,%eax movl 64(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 72(%esp),%ebx shrl $10,%edi addl 44(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 12(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 16(%esp),%edi xorl %ecx,%edx movl %ebx,72(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx addl 20(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 28(%esp),%edi xorl %eax,%ecx movl %eax,24(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2227730452(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 80(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 4(%esp),%edx addl %ecx,%ebp movl 68(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 76(%esp),%ebx shrl $10,%edi addl 48(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 8(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 12(%esp),%edi xorl %esi,%edx movl %ebx,76(%esp) xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,4(%esp) xorl %esi,%edx addl 16(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 24(%esp),%edi xorl %ebp,%esi movl %ebp,20(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2361852424(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 84(%esp),%ecx rorl $2,%esi addl %edx,%eax addl (%esp),%edx addl %esi,%eax movl 72(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 80(%esp),%ebx shrl $10,%edi addl 52(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 4(%esp),%esi rorl $14,%edx addl %edi,%ebx movl 8(%esp),%edi xorl %ecx,%edx movl %ebx,80(%esp) xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx addl 12(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 20(%esp),%edi xorl %eax,%ecx movl %eax,16(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 2428436474(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 88(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 28(%esp),%edx addl %ecx,%ebp movl 76(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 84(%esp),%ebx shrl $10,%edi addl 56(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl (%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 4(%esp),%edi xorl %esi,%edx movl %ebx,84(%esp) xorl %edi,%ecx 
rorl $5,%edx andl %esi,%ecx movl %esi,28(%esp) xorl %esi,%edx addl 8(%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 16(%esp),%edi xorl %ebp,%esi movl %ebp,12(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 2756734187(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax movl 92(%esp),%ecx rorl $2,%esi addl %edx,%eax addl 24(%esp),%edx addl %esi,%eax movl 80(%esp),%esi movl %ecx,%ebx rorl $11,%ecx movl %esi,%edi rorl $2,%esi xorl %ebx,%ecx shrl $3,%ebx rorl $7,%ecx xorl %edi,%esi xorl %ecx,%ebx rorl $17,%esi addl 88(%esp),%ebx shrl $10,%edi addl 60(%esp),%ebx movl %edx,%ecx xorl %esi,%edi movl 28(%esp),%esi rorl $14,%edx addl %edi,%ebx movl (%esp),%edi xorl %ecx,%edx xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx addl 4(%esp),%ebx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%ebx rorl $9,%ecx movl %eax,%esi movl 12(%esp),%edi xorl %eax,%ecx movl %eax,8(%esp) xorl %edi,%eax rorl $11,%ecx andl %eax,%ebp leal 3204031479(%ebx,%edx,1),%edx xorl %esi,%ecx xorl %edi,%ebp movl 32(%esp),%esi rorl $2,%ecx addl %edx,%ebp addl 20(%esp),%edx addl %ecx,%ebp movl 84(%esp),%ecx movl %esi,%ebx rorl $11,%esi movl %ecx,%edi rorl $2,%ecx xorl %ebx,%esi shrl $3,%ebx rorl $7,%esi xorl %edi,%ecx xorl %esi,%ebx rorl $17,%ecx addl 92(%esp),%ebx shrl $10,%edi addl 64(%esp),%ebx movl %edx,%esi xorl %ecx,%edi movl 24(%esp),%ecx rorl $14,%edx addl %edi,%ebx movl 28(%esp),%edi xorl %esi,%edx xorl %edi,%ecx rorl $5,%edx andl %esi,%ecx movl %esi,20(%esp) xorl %esi,%edx addl (%esp),%ebx xorl %ecx,%edi rorl $6,%edx movl %ebp,%esi addl %edi,%ebx rorl $9,%esi movl %ebp,%ecx movl 8(%esp),%edi xorl %ebp,%esi movl %ebp,4(%esp) xorl %edi,%ebp rorl $11,%esi andl %ebp,%eax leal 3329325298(%ebx,%edx,1),%edx xorl %ecx,%esi xorl %edi,%eax rorl $2,%esi addl %edx,%eax addl 16(%esp),%edx addl %esi,%eax movl 96(%esp),%esi xorl %edi,%ebp movl 12(%esp),%ecx addl (%esi),%eax addl 4(%esi),%ebp addl 8(%esi),%edi addl 12(%esi),%ecx movl %eax,(%esi) movl %ebp,4(%esi) movl %edi,8(%esi) movl %ecx,12(%esi) movl %ebp,4(%esp) xorl %edi,%ebp movl %edi,8(%esp) movl %ecx,12(%esp) movl 20(%esp),%edi movl 24(%esp),%ebx movl 28(%esp),%ecx addl 16(%esi),%edx addl 20(%esi),%edi addl 24(%esi),%ebx addl 28(%esi),%ecx movl %edx,16(%esi) movl %edi,20(%esi) movl %ebx,24(%esi) movl %ecx,28(%esi) movl %edi,20(%esp) movl 100(%esp),%edi movl %ebx,24(%esp) movl %ecx,28(%esp) cmpl 104(%esp),%edi - jb .L009grand_loop + jb .L010grand_loop movl 108(%esp),%esp popl %edi popl %esi popl %ebx popl %ebp ret .align 32 .L004shaext: subl $32,%esp movdqu (%esi),%xmm1 leal 128(%ebp),%ebp movdqu 16(%esi),%xmm2 movdqa 128(%ebp),%xmm7 pshufd $27,%xmm1,%xmm0 pshufd $177,%xmm1,%xmm1 pshufd $27,%xmm2,%xmm2 .byte 102,15,58,15,202,8 punpcklqdq %xmm0,%xmm2 - jmp .L010loop_shaext + jmp .L011loop_shaext .align 16 -.L010loop_shaext: +.L011loop_shaext: movdqu (%edi),%xmm3 movdqu 16(%edi),%xmm4 movdqu 32(%edi),%xmm5 .byte 102,15,56,0,223 movdqu 48(%edi),%xmm6 movdqa %xmm2,16(%esp) movdqa -128(%ebp),%xmm0 paddd %xmm3,%xmm0 .byte 102,15,56,0,231 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 nop movdqa %xmm1,(%esp) .byte 15,56,203,202 movdqa -112(%ebp),%xmm0 paddd %xmm4,%xmm0 .byte 102,15,56,0,239 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 leal 64(%edi),%edi .byte 15,56,204,220 .byte 15,56,203,202 movdqa -96(%ebp),%xmm0 paddd %xmm5,%xmm0 .byte 102,15,56,0,247 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 
movdqa -80(%ebp),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa -64(%ebp),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa -48(%ebp),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa -32(%ebp),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa -16(%ebp),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa (%ebp),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 16(%ebp),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 nop paddd %xmm7,%xmm6 .byte 15,56,204,220 .byte 15,56,203,202 movdqa 32(%ebp),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,205,245 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm6,%xmm7 .byte 102,15,58,15,253,4 nop paddd %xmm7,%xmm3 .byte 15,56,204,229 .byte 15,56,203,202 movdqa 48(%ebp),%xmm0 paddd %xmm6,%xmm0 .byte 15,56,205,222 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm3,%xmm7 .byte 102,15,58,15,254,4 nop paddd %xmm7,%xmm4 .byte 15,56,204,238 .byte 15,56,203,202 movdqa 64(%ebp),%xmm0 paddd %xmm3,%xmm0 .byte 15,56,205,227 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm4,%xmm7 .byte 102,15,58,15,251,4 nop paddd %xmm7,%xmm5 .byte 15,56,204,243 .byte 15,56,203,202 movdqa 80(%ebp),%xmm0 paddd %xmm4,%xmm0 .byte 15,56,205,236 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 movdqa %xmm5,%xmm7 .byte 102,15,58,15,252,4 .byte 15,56,203,202 paddd %xmm7,%xmm6 movdqa 96(%ebp),%xmm0 paddd %xmm5,%xmm0 .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 .byte 15,56,205,245 movdqa 128(%ebp),%xmm7 .byte 15,56,203,202 movdqa 112(%ebp),%xmm0 paddd %xmm6,%xmm0 nop .byte 15,56,203,209 pshufd $14,%xmm0,%xmm0 cmpl %edi,%eax nop .byte 15,56,203,202 paddd 16(%esp),%xmm2 paddd (%esp),%xmm1 - jnz .L010loop_shaext + jnz .L011loop_shaext pshufd $177,%xmm2,%xmm2 pshufd $27,%xmm1,%xmm7 pshufd $177,%xmm1,%xmm1 punpckhqdq %xmm2,%xmm1 .byte 102,15,58,15,215,8 movl 44(%esp),%esp movdqu %xmm1,(%esi) movdqu %xmm2,16(%esi) popl %edi popl %esi popl %ebx popl %ebp ret .align 32 -.L005SSSE3: +.L006SSSE3: leal -96(%esp),%esp movl (%esi),%eax movl 4(%esi),%ebx movl 8(%esi),%ecx movl 12(%esi),%edi movl %ebx,4(%esp) xorl %ecx,%ebx movl %ecx,8(%esp) movl %edi,12(%esp) movl 16(%esi),%edx movl 20(%esi),%edi movl 24(%esi),%ecx movl 28(%esi),%esi movl %edi,20(%esp) movl 100(%esp),%edi movl %ecx,24(%esp) movl %esi,28(%esp) movdqa 256(%ebp),%xmm7 - jmp .L011grand_ssse3 + jmp .L012grand_ssse3 .align 16 -.L011grand_ssse3: +.L012grand_ssse3: movdqu (%edi),%xmm0 movdqu 16(%edi),%xmm1 movdqu 32(%edi),%xmm2 movdqu 48(%edi),%xmm3 addl $64,%edi .byte 102,15,56,0,199 movl %edi,100(%esp) .byte 102,15,56,0,207 movdqa (%ebp),%xmm4 .byte 102,15,56,0,215 movdqa 
16(%ebp),%xmm5 paddd %xmm0,%xmm4 .byte 102,15,56,0,223 movdqa 32(%ebp),%xmm6 paddd %xmm1,%xmm5 movdqa 48(%ebp),%xmm7 movdqa %xmm4,32(%esp) paddd %xmm2,%xmm6 movdqa %xmm5,48(%esp) paddd %xmm3,%xmm7 movdqa %xmm6,64(%esp) movdqa %xmm7,80(%esp) - jmp .L012ssse3_00_47 + jmp .L013ssse3_00_47 .align 16 -.L012ssse3_00_47: +.L013ssse3_00_47: addl $64,%ebp movl %edx,%ecx movdqa %xmm1,%xmm4 rorl $14,%edx movl 20(%esp),%esi movdqa %xmm3,%xmm7 xorl %ecx,%edx movl 24(%esp),%edi .byte 102,15,58,15,224,4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi .byte 102,15,58,15,250,4 movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi movdqa %xmm4,%xmm5 rorl $6,%edx movl %eax,%ecx movdqa %xmm4,%xmm6 addl %edi,%edx movl 4(%esp),%edi psrld $3,%xmm4 movl %eax,%esi rorl $9,%ecx paddd %xmm7,%xmm0 movl %eax,(%esp) xorl %eax,%ecx psrld $7,%xmm6 xorl %edi,%eax addl 28(%esp),%edx rorl $11,%ecx andl %eax,%ebx pshufd $250,%xmm3,%xmm7 xorl %esi,%ecx addl 32(%esp),%edx pslld $14,%xmm5 xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm4 addl %edx,%ebx addl 12(%esp),%edx psrld $11,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm5,%xmm4 movl 16(%esp),%esi xorl %ecx,%edx pslld $11,%xmm5 movl 20(%esp),%edi xorl %edi,%esi rorl $5,%edx pxor %xmm6,%xmm4 andl %ecx,%esi movl %ecx,12(%esp) movdqa %xmm7,%xmm6 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx pxor %xmm5,%xmm4 movl %ebx,%ecx addl %edi,%edx psrld $10,%xmm7 movl (%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm4,%xmm0 movl %ebx,28(%esp) xorl %ebx,%ecx psrlq $17,%xmm6 xorl %edi,%ebx addl 24(%esp),%edx rorl $11,%ecx pxor %xmm6,%xmm7 andl %ebx,%eax xorl %esi,%ecx psrlq $2,%xmm6 addl 36(%esp),%edx xorl %edi,%eax rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%eax addl 8(%esp),%edx pshufd $128,%xmm7,%xmm7 addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 12(%esp),%esi xorl %ecx,%edx movl 16(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi psrldq $8,%xmm7 movl %ecx,8(%esp) xorl %ecx,%edx xorl %esi,%edi paddd %xmm7,%xmm0 rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 28(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,24(%esp) pshufd $80,%xmm0,%xmm7 xorl %eax,%ecx xorl %edi,%eax addl 20(%esp),%edx movdqa %xmm7,%xmm6 rorl $11,%ecx psrld $10,%xmm7 andl %eax,%ebx psrlq $17,%xmm6 xorl %esi,%ecx addl 40(%esp),%edx xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%ebx addl 4(%esp),%edx psrlq $2,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm6,%xmm7 movl 8(%esp),%esi xorl %ecx,%edx movl 12(%esp),%edi pshufd $8,%xmm7,%xmm7 xorl %edi,%esi rorl $5,%edx movdqa (%ebp),%xmm6 andl %ecx,%esi movl %ecx,4(%esp) pslldq $8,%xmm7 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 24(%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm7,%xmm0 movl %ebx,20(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 16(%esp),%edx paddd %xmm0,%xmm6 rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 44(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl (%esp),%edx addl %ecx,%eax movdqa %xmm6,32(%esp) movl %edx,%ecx movdqa %xmm2,%xmm4 rorl $14,%edx movl 4(%esp),%esi movdqa %xmm0,%xmm7 xorl %ecx,%edx movl 8(%esp),%edi .byte 102,15,58,15,225,4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi .byte 102,15,58,15,251,4 movl %ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi movdqa %xmm4,%xmm5 rorl $6,%edx movl %eax,%ecx movdqa %xmm4,%xmm6 addl %edi,%edx movl 20(%esp),%edi psrld $3,%xmm4 movl %eax,%esi rorl $9,%ecx paddd %xmm7,%xmm1 movl %eax,16(%esp) xorl %eax,%ecx psrld $7,%xmm6 xorl %edi,%eax addl 12(%esp),%edx rorl $11,%ecx andl %eax,%ebx pshufd $250,%xmm0,%xmm7 xorl %esi,%ecx addl 48(%esp),%edx pslld 
$14,%xmm5 xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm4 addl %edx,%ebx addl 28(%esp),%edx psrld $11,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm5,%xmm4 movl (%esp),%esi xorl %ecx,%edx pslld $11,%xmm5 movl 4(%esp),%edi xorl %edi,%esi rorl $5,%edx pxor %xmm6,%xmm4 andl %ecx,%esi movl %ecx,28(%esp) movdqa %xmm7,%xmm6 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx pxor %xmm5,%xmm4 movl %ebx,%ecx addl %edi,%edx psrld $10,%xmm7 movl 16(%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm4,%xmm1 movl %ebx,12(%esp) xorl %ebx,%ecx psrlq $17,%xmm6 xorl %edi,%ebx addl 8(%esp),%edx rorl $11,%ecx pxor %xmm6,%xmm7 andl %ebx,%eax xorl %esi,%ecx psrlq $2,%xmm6 addl 52(%esp),%edx xorl %edi,%eax rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%eax addl 24(%esp),%edx pshufd $128,%xmm7,%xmm7 addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 28(%esp),%esi xorl %ecx,%edx movl (%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi psrldq $8,%xmm7 movl %ecx,24(%esp) xorl %ecx,%edx xorl %esi,%edi paddd %xmm7,%xmm1 rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 12(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,8(%esp) pshufd $80,%xmm1,%xmm7 xorl %eax,%ecx xorl %edi,%eax addl 4(%esp),%edx movdqa %xmm7,%xmm6 rorl $11,%ecx psrld $10,%xmm7 andl %eax,%ebx psrlq $17,%xmm6 xorl %esi,%ecx addl 56(%esp),%edx xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%ebx addl 20(%esp),%edx psrlq $2,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm6,%xmm7 movl 24(%esp),%esi xorl %ecx,%edx movl 28(%esp),%edi pshufd $8,%xmm7,%xmm7 xorl %edi,%esi rorl $5,%edx movdqa 16(%ebp),%xmm6 andl %ecx,%esi movl %ecx,20(%esp) pslldq $8,%xmm7 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 8(%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm7,%xmm1 movl %ebx,4(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl (%esp),%edx paddd %xmm1,%xmm6 rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 60(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 16(%esp),%edx addl %ecx,%eax movdqa %xmm6,48(%esp) movl %edx,%ecx movdqa %xmm3,%xmm4 rorl $14,%edx movl 20(%esp),%esi movdqa %xmm1,%xmm7 xorl %ecx,%edx movl 24(%esp),%edi .byte 102,15,58,15,226,4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi .byte 102,15,58,15,248,4 movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi movdqa %xmm4,%xmm5 rorl $6,%edx movl %eax,%ecx movdqa %xmm4,%xmm6 addl %edi,%edx movl 4(%esp),%edi psrld $3,%xmm4 movl %eax,%esi rorl $9,%ecx paddd %xmm7,%xmm2 movl %eax,(%esp) xorl %eax,%ecx psrld $7,%xmm6 xorl %edi,%eax addl 28(%esp),%edx rorl $11,%ecx andl %eax,%ebx pshufd $250,%xmm1,%xmm7 xorl %esi,%ecx addl 64(%esp),%edx pslld $14,%xmm5 xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm4 addl %edx,%ebx addl 12(%esp),%edx psrld $11,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm5,%xmm4 movl 16(%esp),%esi xorl %ecx,%edx pslld $11,%xmm5 movl 20(%esp),%edi xorl %edi,%esi rorl $5,%edx pxor %xmm6,%xmm4 andl %ecx,%esi movl %ecx,12(%esp) movdqa %xmm7,%xmm6 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx pxor %xmm5,%xmm4 movl %ebx,%ecx addl %edi,%edx psrld $10,%xmm7 movl (%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm4,%xmm2 movl %ebx,28(%esp) xorl %ebx,%ecx psrlq $17,%xmm6 xorl %edi,%ebx addl 24(%esp),%edx rorl $11,%ecx pxor %xmm6,%xmm7 andl %ebx,%eax xorl %esi,%ecx psrlq $2,%xmm6 addl 68(%esp),%edx xorl %edi,%eax rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%eax addl 8(%esp),%edx pshufd $128,%xmm7,%xmm7 addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 12(%esp),%esi xorl %ecx,%edx movl 16(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi psrldq $8,%xmm7 movl 
%ecx,8(%esp) xorl %ecx,%edx xorl %esi,%edi paddd %xmm7,%xmm2 rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 28(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,24(%esp) pshufd $80,%xmm2,%xmm7 xorl %eax,%ecx xorl %edi,%eax addl 20(%esp),%edx movdqa %xmm7,%xmm6 rorl $11,%ecx psrld $10,%xmm7 andl %eax,%ebx psrlq $17,%xmm6 xorl %esi,%ecx addl 72(%esp),%edx xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%ebx addl 4(%esp),%edx psrlq $2,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm6,%xmm7 movl 8(%esp),%esi xorl %ecx,%edx movl 12(%esp),%edi pshufd $8,%xmm7,%xmm7 xorl %edi,%esi rorl $5,%edx movdqa 32(%ebp),%xmm6 andl %ecx,%esi movl %ecx,4(%esp) pslldq $8,%xmm7 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 24(%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm7,%xmm2 movl %ebx,20(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 16(%esp),%edx paddd %xmm2,%xmm6 rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 76(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl (%esp),%edx addl %ecx,%eax movdqa %xmm6,64(%esp) movl %edx,%ecx movdqa %xmm0,%xmm4 rorl $14,%edx movl 4(%esp),%esi movdqa %xmm2,%xmm7 xorl %ecx,%edx movl 8(%esp),%edi .byte 102,15,58,15,227,4 xorl %edi,%esi rorl $5,%edx andl %ecx,%esi .byte 102,15,58,15,249,4 movl %ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi movdqa %xmm4,%xmm5 rorl $6,%edx movl %eax,%ecx movdqa %xmm4,%xmm6 addl %edi,%edx movl 20(%esp),%edi psrld $3,%xmm4 movl %eax,%esi rorl $9,%ecx paddd %xmm7,%xmm3 movl %eax,16(%esp) xorl %eax,%ecx psrld $7,%xmm6 xorl %edi,%eax addl 12(%esp),%edx rorl $11,%ecx andl %eax,%ebx pshufd $250,%xmm2,%xmm7 xorl %esi,%ecx addl 80(%esp),%edx pslld $14,%xmm5 xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm4 addl %edx,%ebx addl 28(%esp),%edx psrld $11,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm5,%xmm4 movl (%esp),%esi xorl %ecx,%edx pslld $11,%xmm5 movl 4(%esp),%edi xorl %edi,%esi rorl $5,%edx pxor %xmm6,%xmm4 andl %ecx,%esi movl %ecx,28(%esp) movdqa %xmm7,%xmm6 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx pxor %xmm5,%xmm4 movl %ebx,%ecx addl %edi,%edx psrld $10,%xmm7 movl 16(%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm4,%xmm3 movl %ebx,12(%esp) xorl %ebx,%ecx psrlq $17,%xmm6 xorl %edi,%ebx addl 8(%esp),%edx rorl $11,%ecx pxor %xmm6,%xmm7 andl %ebx,%eax xorl %esi,%ecx psrlq $2,%xmm6 addl 84(%esp),%edx xorl %edi,%eax rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%eax addl 24(%esp),%edx pshufd $128,%xmm7,%xmm7 addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 28(%esp),%esi xorl %ecx,%edx movl (%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi psrldq $8,%xmm7 movl %ecx,24(%esp) xorl %ecx,%edx xorl %esi,%edi paddd %xmm7,%xmm3 rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 12(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,8(%esp) pshufd $80,%xmm3,%xmm7 xorl %eax,%ecx xorl %edi,%eax addl 4(%esp),%edx movdqa %xmm7,%xmm6 rorl $11,%ecx psrld $10,%xmm7 andl %eax,%ebx psrlq $17,%xmm6 xorl %esi,%ecx addl 88(%esp),%edx xorl %edi,%ebx rorl $2,%ecx pxor %xmm6,%xmm7 addl %edx,%ebx addl 20(%esp),%edx psrlq $2,%xmm6 addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx pxor %xmm6,%xmm7 movl 24(%esp),%esi xorl %ecx,%edx movl 28(%esp),%edi pshufd $8,%xmm7,%xmm7 xorl %edi,%esi rorl $5,%edx movdqa 48(%ebp),%xmm6 andl %ecx,%esi movl %ecx,20(%esp) pslldq $8,%xmm7 xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 8(%esp),%edi movl %ebx,%esi rorl $9,%ecx paddd %xmm7,%xmm3 movl %ebx,4(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl (%esp),%edx paddd %xmm3,%xmm6 rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx 
addl 92(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 16(%esp),%edx addl %ecx,%eax movdqa %xmm6,80(%esp) cmpl $66051,64(%ebp) - jne .L012ssse3_00_47 + jne .L013ssse3_00_47 movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi xorl %ecx,%edx movl 24(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 4(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,(%esp) xorl %eax,%ecx xorl %edi,%eax addl 28(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 32(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 12(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl 16(%esp),%esi xorl %ecx,%edx movl 20(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,12(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl (%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,28(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 24(%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 36(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 8(%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 12(%esp),%esi xorl %ecx,%edx movl 16(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 28(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,24(%esp) xorl %eax,%ecx xorl %edi,%eax addl 20(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 40(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 4(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl 8(%esp),%esi xorl %ecx,%edx movl 12(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,4(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 24(%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,20(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 16(%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 44(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl (%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 4(%esp),%esi xorl %ecx,%edx movl 8(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 20(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,16(%esp) xorl %eax,%ecx xorl %edi,%eax addl 12(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 48(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 28(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl (%esp),%esi xorl %ecx,%edx movl 4(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,28(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 16(%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,12(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 8(%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 52(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 24(%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 28(%esp),%esi xorl %ecx,%edx movl (%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 12(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,8(%esp) xorl %eax,%ecx xorl %edi,%eax addl 4(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 56(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 20(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl 24(%esp),%esi xorl %ecx,%edx movl 28(%esp),%edi xorl 
%edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,20(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 8(%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,4(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl (%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 60(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 16(%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 20(%esp),%esi xorl %ecx,%edx movl 24(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,16(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 4(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,(%esp) xorl %eax,%ecx xorl %edi,%eax addl 28(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 64(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 12(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl 16(%esp),%esi xorl %ecx,%edx movl 20(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,12(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl (%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,28(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 24(%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 68(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 8(%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 12(%esp),%esi xorl %ecx,%edx movl 16(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,8(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 28(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,24(%esp) xorl %eax,%ecx xorl %edi,%eax addl 20(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 72(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 4(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl 8(%esp),%esi xorl %ecx,%edx movl 12(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,4(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 24(%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,20(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 16(%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 76(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl (%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 4(%esp),%esi xorl %ecx,%edx movl 8(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 20(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,16(%esp) xorl %eax,%ecx xorl %edi,%eax addl 12(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl %esi,%ecx addl 80(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 28(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl (%esp),%esi xorl %ecx,%edx movl 4(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,28(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 16(%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,12(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl 8(%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 84(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 24(%esp),%edx addl %ecx,%eax movl %edx,%ecx rorl $14,%edx movl 28(%esp),%esi xorl %ecx,%edx movl (%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,24(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %eax,%ecx addl %edi,%edx movl 12(%esp),%edi movl %eax,%esi rorl $9,%ecx movl %eax,8(%esp) xorl %eax,%ecx xorl %edi,%eax addl 4(%esp),%edx rorl $11,%ecx andl %eax,%ebx xorl 
%esi,%ecx addl 88(%esp),%edx xorl %edi,%ebx rorl $2,%ecx addl %edx,%ebx addl 20(%esp),%edx addl %ecx,%ebx movl %edx,%ecx rorl $14,%edx movl 24(%esp),%esi xorl %ecx,%edx movl 28(%esp),%edi xorl %edi,%esi rorl $5,%edx andl %ecx,%esi movl %ecx,20(%esp) xorl %ecx,%edx xorl %esi,%edi rorl $6,%edx movl %ebx,%ecx addl %edi,%edx movl 8(%esp),%edi movl %ebx,%esi rorl $9,%ecx movl %ebx,4(%esp) xorl %ebx,%ecx xorl %edi,%ebx addl (%esp),%edx rorl $11,%ecx andl %ebx,%eax xorl %esi,%ecx addl 92(%esp),%edx xorl %edi,%eax rorl $2,%ecx addl %edx,%eax addl 16(%esp),%edx addl %ecx,%eax movl 96(%esp),%esi xorl %edi,%ebx movl 12(%esp),%ecx addl (%esi),%eax addl 4(%esi),%ebx addl 8(%esi),%edi addl 12(%esi),%ecx movl %eax,(%esi) movl %ebx,4(%esi) movl %edi,8(%esi) movl %ecx,12(%esi) movl %ebx,4(%esp) xorl %edi,%ebx movl %edi,8(%esp) movl %ecx,12(%esp) movl 20(%esp),%edi movl 24(%esp),%ecx addl 16(%esi),%edx addl 20(%esi),%edi addl 24(%esi),%ecx movl %edx,16(%esi) movl %edi,20(%esi) movl %edi,20(%esp) movl 28(%esp),%edi movl %ecx,24(%esi) addl 28(%esi),%edi movl %ecx,24(%esp) movl %edi,28(%esi) movl %edi,28(%esp) movl 100(%esp),%edi movdqa 64(%ebp),%xmm7 subl $192,%ebp cmpl 104(%esp),%edi - jb .L011grand_ssse3 + jb .L012grand_ssse3 movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L005AVX: + andl $264,%edx + cmpl $264,%edx + je .L014AVX_BMI + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L015grand_avx +.align 32 +.L015grand_avx: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L016avx_00_47 +.align 16 +.L016avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl 
%edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl 
%ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor 
%xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + 
movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L016avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl 
%ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + 
movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L015grand_avx + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 32 +.L014AVX_BMI: + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + 
movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp .L017grand_avx_bmi +.align 32 +.L017grand_avx_bmi: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L018avx_bmi_00_47 +.align 16 +.L018avx_bmi_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 28(%esp),%edx + andl %eax,%ebx + addl 32(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 24(%esp),%edx + andl %ebx,%eax + addl 36(%esp),%edx + vpaddd %xmm4,%xmm0,%xmm0 + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm0,%xmm0 + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm0,%xmm7 + addl 20(%esp),%edx + andl %eax,%ebx + addl 40(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + addl 16(%esp),%edx + andl 
%ebx,%eax + addl 44(%esp),%edx + vpaddd (%ebp),%xmm0,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 12(%esp),%edx + andl %eax,%ebx + addl 48(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 8(%esp),%edx + andl %ebx,%eax + addl 52(%esp),%edx + vpaddd %xmm4,%xmm1,%xmm1 + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm1,%xmm1 + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm1,%xmm7 + addl 4(%esp),%edx + andl %eax,%ebx + addl 56(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + addl (%esp),%edx + andl %ebx,%eax + addl 60(%esp),%edx + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + vpalignr $4,%xmm0,%xmm1,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 4(%esp),%edi + xorl 
%esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 28(%esp),%edx + andl %eax,%ebx + addl 64(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 24(%esp),%edx + andl %ebx,%eax + addl 68(%esp),%edx + vpaddd %xmm4,%xmm2,%xmm2 + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm2,%xmm2 + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm2,%xmm7 + addl 20(%esp),%edx + andl %eax,%ebx + addl 72(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + addl 16(%esp),%edx + andl %ebx,%eax + addl 76(%esp),%edx + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm7 + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + vpsrld $7,%xmm4,%xmm6 + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrld $3,%xmm4,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpslld $14,%xmm4,%xmm5 + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpxor %xmm6,%xmm7,%xmm4 + addl 12(%esp),%edx + andl %eax,%ebx + addl 80(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + vpsrld $11,%xmm6,%xmm6 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm4,%xmm4 + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpslld $11,%xmm5,%xmm5 + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpsrld $10,%xmm7,%xmm6 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpxor 
%xmm5,%xmm4,%xmm4 + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpsrlq $17,%xmm7,%xmm5 + addl 8(%esp),%edx + andl %ebx,%eax + addl 84(%esp),%edx + vpaddd %xmm4,%xmm3,%xmm3 + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + vpxor %xmm5,%xmm6,%xmm6 + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpsrlq $19,%xmm7,%xmm7 + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + vpshufd $132,%xmm6,%xmm7 + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + vpsrldq $8,%xmm7,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + vpaddd %xmm7,%xmm3,%xmm3 + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + vpshufd $80,%xmm3,%xmm7 + addl 4(%esp),%edx + andl %eax,%ebx + addl 88(%esp),%edx + vpsrld $10,%xmm7,%xmm6 + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + vpsrlq $17,%xmm7,%xmm5 + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + vpxor %xmm5,%xmm6,%xmm6 + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + vpsrlq $19,%xmm7,%xmm7 + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + vpshufd $232,%xmm6,%xmm7 + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + vpslldq $8,%xmm7,%xmm7 + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + addl (%esp),%edx + andl %ebx,%eax + addl 92(%esp),%edx + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L018avx_bmi_00_47 + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + andl %eax,%ebx + addl 32(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + andl %ebx,%eax + addl 36(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + andl %eax,%ebx + addl 40(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + 
rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + andl %ebx,%eax + addl 44(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + andl %eax,%ebx + addl 48(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + andl %ebx,%eax + addl 52(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + andl %eax,%ebx + addl 56(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl (%esp),%edx + andl %ebx,%eax + addl 60(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,16(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 24(%esp),%edx,%esi + xorl %edi,%ecx + andl 20(%esp),%edx + movl %eax,(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 4(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + andl %eax,%ebx + addl 64(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 12(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,12(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 20(%esp),%edx,%esi + xorl %edi,%ecx + andl 16(%esp),%edx + movl %ebx,28(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl (%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + andl %ebx,%eax + addl 68(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 8(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,8(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 16(%esp),%edx,%esi + xorl %edi,%ecx + andl 12(%esp),%edx + movl %eax,24(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + 
xorl %edi,%esi + movl 28(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + andl %eax,%ebx + addl 72(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 4(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,4(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 12(%esp),%edx,%esi + xorl %edi,%ecx + andl 8(%esp),%edx + movl %ebx,20(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 24(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + andl %ebx,%eax + addl 76(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl (%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 8(%esp),%edx,%esi + xorl %edi,%ecx + andl 4(%esp),%edx + movl %eax,16(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 20(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + andl %eax,%ebx + addl 80(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 28(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,28(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 4(%esp),%edx,%esi + xorl %edi,%ecx + andl (%esp),%edx + movl %ebx,12(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 16(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + andl %ebx,%eax + addl 84(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 24(%esp),%edx + leal (%eax,%ecx,1),%eax + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,24(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl (%esp),%edx,%esi + xorl %edi,%ecx + andl 28(%esp),%edx + movl %eax,8(%esp) + orl %esi,%edx + rorxl $2,%eax,%edi + rorxl $13,%eax,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%eax,%ecx + xorl %edi,%esi + movl 12(%esp),%edi + xorl %esi,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + andl %eax,%ebx + addl 88(%esp),%edx + xorl %edi,%ebx + addl %edx,%ecx + addl 20(%esp),%edx + leal (%ebx,%ecx,1),%ebx + rorxl $6,%edx,%ecx + rorxl $11,%edx,%esi + movl %edx,20(%esp) + rorxl $25,%edx,%edi + xorl %esi,%ecx + andnl 28(%esp),%edx,%esi + xorl %edi,%ecx + andl 24(%esp),%edx + movl %ebx,4(%esp) + orl %esi,%edx + rorxl $2,%ebx,%edi + rorxl $13,%ebx,%esi + leal (%edx,%ecx,1),%edx + rorxl $22,%ebx,%ecx + xorl %edi,%esi + movl 8(%esp),%edi + xorl %esi,%ecx + xorl %edi,%ebx + addl (%esp),%edx + andl %ebx,%eax + addl 92(%esp),%edx + xorl %edi,%eax + addl %edx,%ecx + addl 16(%esp),%edx + leal (%eax,%ecx,1),%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L017grand_avx_bmi + movl 108(%esp),%esp + vzeroall popl %edi popl %esi popl %ebx popl %ebp ret .size 
sha256_block_data_order,.-.L_sha256_block_data_order_begin
.comm OPENSSL_ia32cap_P,16,4
#endif
Index: stable/12
===================================================================
--- stable/12 (revision 364962)
+++ stable/12 (revision 364963)

Property changes on: stable/12
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r364822-364823